Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
4b0c7af
Fix handling of empty batches in SumReduceDimsOp
volkhin May 19, 2018
f4bb4bc
Deferrable async_scheduling finishRun fix
May 19, 2018
3a8615e
Simplify exception handling in async_scheduling
May 19, 2018
25504dd
[C2]worker_coordinator_memorize_worker_ids
stephenyan1231 May 19, 2018
58410cd
Add unit test for nets with no type set
May 19, 2018
6b17efa
Ignore total length argument in sympolic_pad_packed_sequence
ahhegazy May 19, 2018
c39f84f
Add support for MKLDNN to async_scheduling
May 19, 2018
0f77800
[AuFL][ensemble] support branch output for prediction
May 19, 2018
ea1c0b4
Fix a bug in add_loss in layer_model_helper
chocjy May 19, 2018
7f5efa2
Support lradaption for adam
May 19, 2018
14ec584
Perf tweaks for async_scheduling
May 19, 2018
9529dd9
add quantization to SparseSimdAdagradOp
May 19, 2018
2da8dc7
[sr] [codemod] Change all SR callsites to use new API
madzoox May 19, 2018
b168a14
Back out "Fix handling of empty batches in SumReduceDimsOp"
pjh5 May 19, 2018
ed1df7f
Add the flow to support operator benchmark
lly-zero-one May 19, 2018
ad4ae12
[tum][gpu] Connect DPM trainer with flow and unit tests
xianjiec May 19, 2018
e3eb8dc
w/o normalized lradaption for adam dense only
May 19, 2018
d318e34
[fb] Use SharedPromise in DeferrableAsyncSchedulingNet
May 19, 2018
93da60a
[tum] implement cuda sparseLengthsMean and LengthsMean
May 19, 2018
5f3a2e3
Adding an optional parameter to allow use of protobufs in InferShapes…
mnaumovfb May 19, 2018
d0f3931
Move feature_to_index to FeatureSpec.feature_to_index
May 19, 2018
0db5ca8
[Caffe2] Rename bytes_moved to bytes_written
May 19, 2018
438bdd8
[c2] fix ReduceFrontSumOp for empty case by setting 0
xianjiec May 19, 2018
e84fdd8
[Caffe2] [Int8] Improve Intel CPU performance
May 19, 2018
6aab2df
[Easy] Improve PrependDim op logging
chocjy May 19, 2018
a71340a
DBFileReader expand db_path using os.path.expanduser(..)
xush6528 May 19, 2018
31a0bb3
[Caffe2] Add bytes_read to cost structure
May 19, 2018
b01fcc5
Fix sleef on aarch64 for hhvm
pjh5 May 19, 2018
3eff8a3
Merge remote-tracking branch 'origin/master' into update-from-facebook
bddppq May 19, 2018
893ec8d
Remove duplicated part in caffe2/ideep/operators/conv_op.cc
bddppq May 19, 2018
0f3f88e
Rename test helper function test_adagrad_sparse_helper to adagrad_spa…
bddppq May 19, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions aten/src/ATen/cpu/vec256/vec256_float.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include <sleef.h>
#endif

#include <iostream>

This comment was marked as off-topic.

This comment was marked as off-topic.

This comment was marked as off-topic.


namespace at {
namespace vec256 {
namespace {
Expand Down
97 changes: 97 additions & 0 deletions binaries/bench_gen/bench_gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse

from caffe2.python.model_helper import ModelHelper
from caffe2.python.predictor import mobile_exporter
from caffe2.python import workspace, brew


def parse_kwarg(kwarg_str):
    """Parse one ``key=value`` command-line token into a ``(key, value)`` pair.

    The value is converted to ``int`` when possible, otherwise to ``float``,
    and is left as a string when neither conversion succeeds.

    Only the first ``=`` separates the key from the value, so values that
    themselves contain ``=`` are kept intact.

    Raises:
        ValueError: if ``kwarg_str`` contains no ``=`` at all.
    """
    # maxsplit=1: everything after the first '=' belongs to the value.
    key, value = kwarg_str.split('=', 1)
    try:
        value = int(value)
    except ValueError:
        try:
            value = float(value)
        except ValueError:
            # Neither int nor float: keep the raw string.
            pass
    return key, value


def main(args):
    """Build a benchmark net from the parsed arguments and serialize it.

    Adds ``args.iters`` instances of the brew helper named by ``args.operator``
    to a fresh model, runs the parameter-initialization net once, exports
    init/predict nets through ``mobile_exporter.Export`` and writes them to
    ``args.init_net`` and ``args.predict_net``.
    """
    # Operator keyword arguments: "order" defaults to NCHW, user kwargs override.
    op_kwargs = {"order": "NCHW"}
    op_kwargs.update(dict(args.kwargs))

    model = ModelHelper(name=args.benchmark_name)

    # The operator name is looked up on brew, so it must be a brew helper name.
    op_type = args.operator
    src_base = args.input_name
    dst_base = args.output_name

    for step in range(int(args.iters)):
        # When chaining, every step after the first reads the blob written by
        # the previous step (same base name, suffixed with the step index).
        src_suffix = str(step) if step > 0 and args.chain else ''
        src_blob = src_base + src_suffix
        dst_blob = dst_base + str(step + 1)
        getattr(brew, op_type)(model, src_blob, dst_blob, **op_kwargs)
        if args.chain:
            # Swap the base names so the next step consumes this step's output.
            src_base, dst_base = dst_base, src_base

    workspace.RunNetOnce(model.param_init_net)

    init_net, predict_net = mobile_exporter.Export(
        workspace, model.net, model.params
    )

    if args.debug:
        for title, net in (("init_net:", init_net),
                           ("predict_net:", predict_net)):
            print(title)
            for op in net.op:
                print(" ", op.type, op.input, "-->", op.output)

    # Serialize both nets as binary protobufs (predict net first, then init).
    for path, net in ((args.predict_net, predict_net),
                      (args.init_net, init_net)):
        with open(path, 'wb') as f:
            f.write(net.SerializeToString())


# Command-line entry point: declare the options, parse them and hand the
# resulting namespace to main().
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Utility to generate Caffe2 benchmark models.")
    parser.add_argument("operator", help="Caffe2 operator to benchmark.")
    # NOTE(review): --blob and --context are accepted here but main() does not
    # read args.blob or args.context.
    parser.add_argument("-b", "--blob",
                        help="Instantiate a blob --blob name=dim1,dim2,dim3",
                        action='append')
    parser.add_argument("--context", help="Context to run on.", default="CPU")
    # Each --kwargs token is converted to a (key, value) pair by parse_kwarg.
    parser.add_argument("--kwargs", help="kwargs to pass to operator.",
                        nargs="*", type=parse_kwarg, default=[])
    parser.add_argument("--init_net", help="Output initialization net.",
                        default="init_net.pb")
    parser.add_argument("--predict_net", help="Output prediction net.",
                        default="predict_net.pb")
    parser.add_argument("--benchmark_name",
                        help="Name of the benchmark network",
                        default="benchmark")
    parser.add_argument("--input_name", help="Name of the input blob.",
                        default="data")
    parser.add_argument("--output_name", help="Name of the output blob.",
                        default="output")
    # type=int makes argparse reject non-integer values with a usage error
    # instead of failing later inside main(); main() still calls int() on it.
    parser.add_argument("--iters",
                        help="Number of iterations to run the operator.",
                        type=int, default=1)
    parser.add_argument("-d", "--debug", help="Print debug information.",
                        action='store_true')
    parser.add_argument("-c", "--chain",
                        help="Chain ops together (create data dependencies)",
                        action='store_true')
    args = parser.parse_args()
    main(args)
16 changes: 10 additions & 6 deletions binaries/benchmark_helper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,16 @@ void setDeviceType(caffe2::NetDef* net_def, caffe2::DeviceType& run_dev) {

void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) {
if (backend != "builtin") {
string engine = backend == "nnpack" ? "NNPACK"
: backend == "eigen" ? "EIGEN"
: backend == "mkl"
? "MKLDNN"
: backend == "cuda" ? "CUDA"
: backend == "default" ? "" : "NONE";
string engine = backend == "nnpack"
? "NNPACK"
: backend == "eigen" ? "EIGEN"
: backend == "mkl" ? "MKLDNN"
: backend == "cuda"
? "CUDA"
: backend == "dnnlowp" ? "DNNLOWP"
: backend == "dnnlowp_16"
? "DNNLOWP_16"
: backend == "default" ? "" : "NONE";
CAFFE_ENFORCE(engine != "NONE", "Backend is not supported");
for (int i = 0; i < net_def->op_size(); i++) {
caffe2::OperatorDef* op_def = net_def->mutable_op(i);
Expand Down
32 changes: 29 additions & 3 deletions caffe2/core/net_async_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ CAFFE2_DEFINE_bool(
true,
"Select next non-busy stream");

CAFFE2_DEFINE_bool(
caffe2_net_async_use_single_pool,
false,
"Use single pool for all devices");

namespace caffe2 {

thread_local std::vector<int> AsyncNetBase::stream_counters_;
Expand Down Expand Up @@ -109,7 +114,12 @@ std::shared_ptr<TaskThreadPool> AsyncNetBase::pool_getter(

std::shared_ptr<TaskThreadPool> AsyncNetBase::pool(
const DeviceOption& device_option) {
if (device_option.device_type() == CPU) {
if (FLAGS_caffe2_net_async_use_single_pool) {
return pool_getter(cpu_pools_, CPU, -1, num_workers_);
}
if (device_option.device_type() == CPU ||
device_option.device_type() == MKLDNN ||
device_option.device_type() == IDEEP) {
auto numa_node_id = device_option.numa_node_id();
CAFFE_ENFORCE(
numa_node_id >= -1 &&
Expand Down Expand Up @@ -141,8 +151,8 @@ int AsyncNetBase::stream(int task_id) {
do {
stream_id = stream_counters_[gpu_id]++;
stream_counters_[gpu_id] %= FLAGS_caffe2_streams_per_gpu;
} while (!isStreamFree(task_id, stream_id) &&
FLAGS_caffe2_net_async_check_stream_status);
} while (FLAGS_caffe2_net_async_check_stream_status &&
!isStreamFree(task_id, stream_id));
}
return stream_id;
}
Expand Down Expand Up @@ -226,6 +236,16 @@ void AsyncNetBase::asyncWait(
first_op->WaitEvents(events, stream_id);
}

// Resets the event of every operator returned by GetOperators() and, when
// CAFFE2_USE_EXCEPTION_PTR is defined, clears caught_exception_ while holding
// exception_mutex_.
void AsyncNetBase::reset() {
  for (auto& net_op : GetOperators()) {
    net_op->ResetEvent();
  }
#ifdef CAFFE2_USE_EXCEPTION_PTR
  {
    // Scoped guard: the mutex is released as soon as the pointer is cleared.
    std::lock_guard<std::mutex> guard(exception_mutex_);
    caught_exception_ = nullptr;
  }
#endif // CAFFE2_USE_EXCEPTION_PTR
}

void AsyncNetBase::storeExceptionPtr() {
#ifdef CAFFE2_USE_EXCEPTION_PTR
std::unique_lock<std::mutex> exception_lock(exception_mutex_);
Expand All @@ -236,6 +256,12 @@ void AsyncNetBase::storeExceptionPtr() {
}

void AsyncNetBase::run(int task_id, int stream_id) {
// Optionally insert async wait ops,
// skip when using --caffe2_net_async_finish_chain -
// all parents are guaranteed to be finished
if (!FLAGS_caffe2_net_async_finish_chain) {
asyncWait(task_id, stream_id, parents(task_id));
}
std::string err_msg;
for (auto& op_id : chains_[task_id]) {
auto& op = operators_[op_id];
Expand Down
10 changes: 10 additions & 0 deletions caffe2/core/net_async_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@
#include "caffe2/utils/proto_utils.h"
#include "caffe2/utils/thread_pool.h"

CAFFE2_DECLARE_int(caffe2_streams_per_gpu);
CAFFE2_DECLARE_bool(caffe2_net_async_finish_chain);
CAFFE2_DECLARE_int(caffe2_net_async_max_gpus);
CAFFE2_DECLARE_int(caffe2_net_async_max_numa_nodes);
CAFFE2_DECLARE_int(caffe2_net_async_cpu_pool_size);
CAFFE2_DECLARE_bool(caffe2_net_async_check_stream_status);
CAFFE2_DECLARE_bool(caffe2_net_async_use_single_pool);

namespace caffe2 {

class AsyncNetExecutorHelper;
Expand Down Expand Up @@ -63,6 +71,8 @@ class AsyncNetBase : public NetBase {

bool isStreamFree(int task_id, int stream_id) const;

virtual void reset();

// Operator/task graph
std::vector<OperatorBase*> operators_;
std::vector<dag_utils::OperatorNode> operator_nodes_;
Expand Down
5 changes: 0 additions & 5 deletions caffe2/core/net_async_polling.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,6 @@ void AsyncPollingNet::schedule(int task_id) {
task_timers_[task_id]->MicroSeconds());
}

// Non-blocking wait, setups scheduling of dependent async computations;
// canSchedule ensures that there's no busy wait,
// for CUDA events we need to insert CUDA event synchronization to ensure
// that async CUDA computations are executed in correct order
asyncWait(task_id, stream_id, parents(task_id));
try {
if (FLAGS_caffe2_dag_net_collect_stats) {
Timer run_time;
Expand Down
87 changes: 40 additions & 47 deletions caffe2/core/net_async_scheduling.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ AsyncSchedulingNet::AsyncSchedulingNet(
}

void AsyncSchedulingNet::reset() {
AsyncNetBase::reset();

processed_tasks_num_ = 0;
cleanup_ = false;
success_ = true;

for (auto task_id = 0; task_id < tasksNum(); ++task_id) {
Expand All @@ -37,8 +38,10 @@ void AsyncSchedulingNet::schedule(int task_id) {
const auto& device_option = event(task_id).GetDeviceOption();
pool(device_option)->run([this, task_id]() {
if (success_) {
int stream_id = stream(task_id);
asyncWait(task_id, stream_id, parents(task_id));
int stream_id = 0;
if (FLAGS_caffe2_streams_per_gpu > 1) {
stream_id = stream(task_id);
}
try {
run(task_id, stream_id);
} catch (const std::exception& e) {
Expand All @@ -51,9 +54,14 @@ void AsyncSchedulingNet::schedule(int task_id) {
for (auto child_id : children(task_id)) {
int parent_count = updateParentCount(child_id);
if (parent_count == 0) {
if (!success_ || cleanup_ ||
FLAGS_caffe2_net_async_always_schedule_child ||
canSchedule(child_id)) {
// Schedule a child if:
// - there is failure, we skip an op execution and finish the job
// - forced scheduling through --caffe2_net_async_always_schedule_child
// - --caffe2_net_async_finish_chain is set, in this case parents are
// guaranteed to be finished
// - in all other cases, check parents with canSchedule
if (!success_ || FLAGS_caffe2_net_async_always_schedule_child ||
FLAGS_caffe2_net_async_finish_chain || canSchedule(child_id)) {
schedule(child_id);
} else {
const auto& device_option = event(child_id).GetDeviceOption();
Expand All @@ -64,37 +72,8 @@ void AsyncSchedulingNet::schedule(int task_id) {
}
}

if (success_) {
if (task_count == tasksNum()) {
// All tasks are finished, polling thread is sleeping;
// only one thread enters here
finalizeEvents();
finishRun();
return;
}
} else {
// Before setting running_ to false and notifying waiters we need to
// 1. Ensure that only one thread does the cleanup
// 2. Ensure that all other pending tasks in workers and polling threads
// are finished and
// 3. Ensure that all tasks that were not scheduled have their events set
{
std::unique_lock<std::mutex> cleanup_lock(cleanup_mutex_);
if (cleanup_) {
return;
}
cleanup_ = true;
}

// Errors are not recoverable and happen in exceptional cases,
// ok to busy wait
while (processed_tasks_num_ != tasksNum()) {
}

// Make sure all events are set, wait for scheduled events
if (task_count == tasksNum()) {
finalizeEvents();

// Notify observers and waiters
finishRun();
}
});
Expand All @@ -110,7 +89,7 @@ void AsyncSchedulingNet::pollAndSchedule(int task_id) {
// - parents are ready
// - we failed / cleanup started (no ops will run)

if (can_schedule || cleanup_ || !success_ || parent_failed) {
if (can_schedule || !success_ || parent_failed) {
schedule(task_id);
} else {
const auto& device_option = event(task_id).GetDeviceOption();
Expand All @@ -128,24 +107,38 @@ int AsyncSchedulingNet::updateParentCount(int child_id) {
}

void AsyncSchedulingNet::finishRun() {
{
std::unique_lock<std::mutex> lock(running_mutex_);
running_ = false;
}

// notify observers and waiters
StopAllObservers();
running_ = false;
running_cv_.notify_all();
}

bool AsyncSchedulingNet::DoRunAsync() {
std::unique_lock<std::mutex> lock(running_mutex_);
CAFFE_ENFORCE(!running_, "Concurrent RunAsync calls");
running_ = true;
reset();
bool AsyncSchedulingNet::RunAsync() {
try {
std::unique_lock<std::mutex> lock(running_mutex_);
if (running_) {
LOG(ERROR) << "Detected concurrent runs";
return false;
}
running_ = true;
reset();

StartAllObservers();
StartAllObservers();

for (auto task_id = 0; task_id < tasksNum(); ++task_id) {
if (parents(task_id).empty()) {
schedule(task_id);
for (auto task_id = 0; task_id < tasksNum(); ++task_id) {
if (parents(task_id).empty()) {
schedule(task_id);
}
}
} catch (const std::exception& e) {
LOG(ERROR) << "Exception while starting an async run: " << e.what();
finalizeEvents();
finishRun();
return false;
}

if (tasksNum() == 0) {
Expand Down
7 changes: 2 additions & 5 deletions caffe2/core/net_async_scheduling.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ class AsyncSchedulingNet : public AsyncNetBase {
void Wait() override;

protected:
bool DoRunAsync() override;
bool RunAsync() override;

void pollAndSchedule(int task_id);
void schedule(int task_id);
void reset();
void reset() override;
virtual void finishRun();
int updateParentCount(int child_id);

Expand All @@ -28,9 +28,6 @@ class AsyncSchedulingNet : public AsyncNetBase {
std::atomic<bool> running_;
std::atomic<bool> success_;

std::mutex cleanup_mutex_;
std::atomic<bool> cleanup_;

std::atomic<int> processed_tasks_num_;

DISABLE_COPY_AND_ASSIGN(AsyncSchedulingNet);
Expand Down
Loading