pytorch · bddppq · May 20, 2018 · May 19, 2018 · May 19, 2018 · May 19, 2018
diff --git a/aten/src/ATen/cpu/vec256/vec256_float.h b/aten/src/ATen/cpu/vec256/vec256_float.h
@@ -6,6 +6,8 @@
 #include <sleef.h>
 #endif
 
+#include <iostream>
+
 namespace at {
 namespace vec256 {
 namespace {

diff --git a/binaries/bench_gen/bench_gen.py b/binaries/bench_gen/bench_gen.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+
+from caffe2.python.model_helper import ModelHelper
+from caffe2.python.predictor import mobile_exporter
+from caffe2.python import workspace, brew
+
+
+def parse_kwarg(kwarg_str):
+    """Split 'key=value'; value becomes int, else float, else stays a str."""
+    # Split on the first '=' only, so the value itself may contain '='.
+    key, value = kwarg_str.split('=', 1)
+    for convert in (int, float):
+        try:
+            return key, convert(value)
+        except ValueError:
+            continue
+    return key, value
+
+
+def main(args):
+    """Build a benchmark net and write its exported init/predict nets.
+
+    The brew helper named by args.operator is called int(args.iters) times.
+    """
+    # Default argument; entries from --kwargs override it.
+    op_kwargs = {"order": "NCHW"}
+    for key, value in args.kwargs:
+        op_kwargs[key] = value
+
+    model = ModelHelper(name=args.benchmark_name)
+
+    # With --chain the two base names trade roles after every op, so each op
+    # reads the blob the previous op wrote; otherwise all ops read one input.
+    src, dst = args.input_name, args.output_name
+    for step in range(int(args.iters)):
+        suffix = str(step) if (args.chain and step > 0) else ''
+        getattr(brew, args.operator)(
+            model, src + suffix, dst + str(step + 1), **op_kwargs)
+        if args.chain:
+            src, dst = dst, src
+
+    workspace.RunNetOnce(model.param_init_net)
+    init_net, predict_net = mobile_exporter.Export(
+        workspace, model.net, model.params)
+
+    if args.debug:
+        for title, net in (("init_net:", init_net),
+                           ("predict_net:", predict_net)):
+            print(title)
+            for op in net.op:
+                print(" ", op.type, op.input, "-->", op.output)
+
+    # Write the predict net first, then the init net, both in binary mode.
+    for path, net in ((args.predict_net, predict_net),
+                      (args.init_net, init_net)):
+        with open(path, 'wb') as f:
+            f.write(net.SerializeToString())
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the options, then build and export.
+    parser = argparse.ArgumentParser(
+        description="Utility to generate Caffe2 benchmark models.")
+    parser.add_argument("operator", help="Caffe2 operator to benchmark.")
+    # NOTE: --blob and --context are accepted but main() never reads them.
+    parser.add_argument("-b", "--blob",
+                        help="Instantiate a blob --blob name=dim1,dim2,dim3",
+                        action='append')
+    parser.add_argument("--context", help="Context to run on.", default="CPU")
+    parser.add_argument("--kwargs", help="kwargs to pass to operator.",
+                        nargs="*", type=parse_kwarg, default=[])
+    parser.add_argument("--init_net", help="Output initialization net.",
+                        default="init_net.pb")
+    parser.add_argument("--predict_net", help="Output prediction net.",
+                        default="predict_net.pb")
+    parser.add_argument("--benchmark_name", default="benchmark",
+                        help="Name of the benchmark network")
+    parser.add_argument("--input_name", help="Name of the input blob.",
+                        default="data")
+    parser.add_argument("--output_name", help="Name of the output blob.",
+                        default="output")
+    # type=int makes argparse reject a non-integer count with a usage error.
+    parser.add_argument("--iters", type=int, default=1,
+                        help="Number of iterations to run the operator.")
+    parser.add_argument("-d", "--debug", help="Print debug information.",
+                        action='store_true')
+    parser.add_argument("-c", "--chain",
+                        help="Chain ops together (create data dependencies)",
+                        action='store_true')
+    main(parser.parse_args())
diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc
@@ -69,12 +69,16 @@ void setDeviceType(caffe2::NetDef* net_def, caffe2::DeviceType& run_dev) {
 
 void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) {
   if (backend != "builtin") {
-    string engine = backend == "nnpack" ? "NNPACK"
-                                        : backend == "eigen" ? "EIGEN"
-                                                             : backend == "mkl"
-                ? "MKLDNN"
-                : backend == "cuda" ? "CUDA"
-                                    : backend == "default" ? "" : "NONE";
+    string engine = backend == "nnpack"
+        ? "NNPACK"
+        : backend == "eigen" ? "EIGEN"
+                             : backend == "mkl" ? "MKLDNN"
+                                                : backend == "cuda"
+                    ? "CUDA"
+                    : backend == "dnnlowp" ? "DNNLOWP"
+                                           : backend == "dnnlowp_16"
+                            ? "DNNLOWP_16"
+                            : backend == "default" ? "" : "NONE";
     CAFFE_ENFORCE(engine != "NONE", "Backend is not supported");
     for (int i = 0; i < net_def->op_size(); i++) {
       caffe2::OperatorDef* op_def = net_def->mutable_op(i);

diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc
@@ -36,6 +36,11 @@ CAFFE2_DEFINE_bool(
     true,
     "Select next non-busy stream");
 
+CAFFE2_DEFINE_bool(
+    caffe2_net_async_use_single_pool,
+    false,
+    "Use single pool for all devices");
+
 namespace caffe2 {
 
 thread_local std::vector<int> AsyncNetBase::stream_counters_;
@@ -109,7 +114,12 @@ std::shared_ptr<TaskThreadPool> AsyncNetBase::pool_getter(
 
 std::shared_ptr<TaskThreadPool> AsyncNetBase::pool(
     const DeviceOption& device_option) {
-  if (device_option.device_type() == CPU) {
+  if (FLAGS_caffe2_net_async_use_single_pool) {
+    return pool_getter(cpu_pools_, CPU, -1, num_workers_);
+  }
+  if (device_option.device_type() == CPU ||
+      device_option.device_type() == MKLDNN ||
+      device_option.device_type() == IDEEP) {
     auto numa_node_id = device_option.numa_node_id();
     CAFFE_ENFORCE(
         numa_node_id >= -1 &&
@@ -141,8 +151,8 @@ int AsyncNetBase::stream(int task_id) {
     do {
       stream_id = stream_counters_[gpu_id]++;
       stream_counters_[gpu_id] %= FLAGS_caffe2_streams_per_gpu;
-    } while (!isStreamFree(task_id, stream_id) &&
-             FLAGS_caffe2_net_async_check_stream_status);
+    } while (FLAGS_caffe2_net_async_check_stream_status &&
+             !isStreamFree(task_id, stream_id));
   }
   return stream_id;
 }
@@ -226,6 +236,16 @@ void AsyncNetBase::asyncWait(
   first_op->WaitEvents(events, stream_id);
 }
 
+void AsyncNetBase::reset() { // AsyncSchedulingNet::reset() chains to this
+  for (auto& op : GetOperators()) {
+    op->ResetEvent(); // every operator of the net gets its event reset
+  }
+#ifdef CAFFE2_USE_EXCEPTION_PTR
+  std::unique_lock<std::mutex> exception_lock(exception_mutex_);
+  caught_exception_ = nullptr; // drop any stored exception, under the lock
+#endif // CAFFE2_USE_EXCEPTION_PTR
+}
+
 void AsyncNetBase::storeExceptionPtr() {
 #ifdef CAFFE2_USE_EXCEPTION_PTR
   std::unique_lock<std::mutex> exception_lock(exception_mutex_);
@@ -236,6 +256,12 @@ void AsyncNetBase::storeExceptionPtr() {
 }
 
 void AsyncNetBase::run(int task_id, int stream_id) {
+  // Optionally insert async wait ops,
+  // skip when using --caffe2_net_async_finish_chain -
+  // all parents are guaranteed to be finished
+  if (!FLAGS_caffe2_net_async_finish_chain) {
+    asyncWait(task_id, stream_id, parents(task_id));
+  }
   std::string err_msg;
   for (auto& op_id : chains_[task_id]) {
     auto& op = operators_[op_id];

diff --git a/caffe2/core/net_async_base.h b/caffe2/core/net_async_base.h
@@ -13,6 +13,14 @@
 #include "caffe2/utils/proto_utils.h"
 #include "caffe2/utils/thread_pool.h"
 
+CAFFE2_DECLARE_int(caffe2_streams_per_gpu);
+CAFFE2_DECLARE_bool(caffe2_net_async_finish_chain);
+CAFFE2_DECLARE_int(caffe2_net_async_max_gpus);
+CAFFE2_DECLARE_int(caffe2_net_async_max_numa_nodes);
+CAFFE2_DECLARE_int(caffe2_net_async_cpu_pool_size);
+CAFFE2_DECLARE_bool(caffe2_net_async_check_stream_status);
+CAFFE2_DECLARE_bool(caffe2_net_async_use_single_pool);
+
 namespace caffe2 {
 
 class AsyncNetExecutorHelper;
@@ -63,6 +71,8 @@ class AsyncNetBase : public NetBase {
 
   bool isStreamFree(int task_id, int stream_id) const;
 
+  virtual void reset();
+
   // Operator/task graph
   std::vector<OperatorBase*> operators_;
   std::vector<dag_utils::OperatorNode> operator_nodes_;

diff --git a/caffe2/core/net_async_polling.cc b/caffe2/core/net_async_polling.cc
@@ -64,11 +64,6 @@ void AsyncPollingNet::schedule(int task_id) {
           task_timers_[task_id]->MicroSeconds());
     }
 
-    // Non-blocking wait, setups scheduling of dependent async computations;
-    // canSchedule ensures that there's no busy wait,
-    // for CUDA events we need to insert CUDA event synchronization to ensure
-    // that async CUDA computations are executed in correct order
-    asyncWait(task_id, stream_id, parents(task_id));
     try {
       if (FLAGS_caffe2_dag_net_collect_stats) {
         Timer run_time;

diff --git a/caffe2/core/net_async_scheduling.cc b/caffe2/core/net_async_scheduling.cc
@@ -15,8 +15,9 @@ AsyncSchedulingNet::AsyncSchedulingNet(
 }
 
 void AsyncSchedulingNet::reset() {
+  AsyncNetBase::reset();
+
   processed_tasks_num_ = 0;
-  cleanup_ = false;
   success_ = true;
 
   for (auto task_id = 0; task_id < tasksNum(); ++task_id) {
@@ -37,8 +38,10 @@ void AsyncSchedulingNet::schedule(int task_id) {
   const auto& device_option = event(task_id).GetDeviceOption();
   pool(device_option)->run([this, task_id]() {
     if (success_) {
-      int stream_id = stream(task_id);
-      asyncWait(task_id, stream_id, parents(task_id));
+      int stream_id = 0;
+      if (FLAGS_caffe2_streams_per_gpu > 1) {
+        stream_id = stream(task_id);
+      }
       try {
         run(task_id, stream_id);
       } catch (const std::exception& e) {
@@ -51,9 +54,14 @@ void AsyncSchedulingNet::schedule(int task_id) {
     for (auto child_id : children(task_id)) {
       int parent_count = updateParentCount(child_id);
       if (parent_count == 0) {
-        if (!success_ || cleanup_ ||
-            FLAGS_caffe2_net_async_always_schedule_child ||
-            canSchedule(child_id)) {
+        // Schedule a child if:
+        // - there was a failure: we skip op execution and finish the job
+        // - forced scheduling through --caffe2_net_async_always_schedule_child
+        // - --caffe2_net_async_finish_chain is set, in this case parents are
+        //   guaranteed to be finished
+        // - in all other cases, check parents with canSchedule
+        if (!success_ || FLAGS_caffe2_net_async_always_schedule_child ||
+            FLAGS_caffe2_net_async_finish_chain || canSchedule(child_id)) {
           schedule(child_id);
         } else {
           const auto& device_option = event(child_id).GetDeviceOption();
@@ -64,37 +72,8 @@ void AsyncSchedulingNet::schedule(int task_id) {
       }
     }
 
-    if (success_) {
-      if (task_count == tasksNum()) {
-        // All tasks are finished, polling thread is sleeping;
-        // only one thread enters here
-        finalizeEvents();
-        finishRun();
-        return;
-      }
-    } else {
-      // Before setting running_ to false and notifying waiters we need to
-      // 1. Ensure that only one thread does the cleanup
-      // 2. Ensure that all other pending tasks in workers and polling threads
-      //    are finished and
-      // 3. Ensure that all tasks that were not scheduled have their events set
-      {
-        std::unique_lock<std::mutex> cleanup_lock(cleanup_mutex_);
-        if (cleanup_) {
-          return;
-        }
-        cleanup_ = true;
-      }
-
-      // Errors are not recoverable and happen in exceptional cases,
-      // ok to busy wait
-      while (processed_tasks_num_ != tasksNum()) {
-      }
-
-      // Make sure all events are set, wait for scheduled events
+    if (task_count == tasksNum()) {
       finalizeEvents();
-
-      // Notify observers and waiters
       finishRun();
     }
   });
@@ -110,7 +89,7 @@ void AsyncSchedulingNet::pollAndSchedule(int task_id) {
   //  - parents are ready
   //  - we failed / cleanup started (no ops will run)
 
-  if (can_schedule || cleanup_ || !success_ || parent_failed) {
+  if (can_schedule || !success_ || parent_failed) {
     schedule(task_id);
   } else {
     const auto& device_option = event(task_id).GetDeviceOption();
@@ -128,24 +107,38 @@ int AsyncSchedulingNet::updateParentCount(int child_id) {
 }
 
 void AsyncSchedulingNet::finishRun() {
+  {
+    std::unique_lock<std::mutex> lock(running_mutex_); // RunAsync() takes this mutex too
+    running_ = false;
+  } // lock released here, before observers are stopped and waiters notified
+
   // notify observers and waiters
   StopAllObservers();
-  running_ = false;
   running_cv_.notify_all();
 }
 
-bool AsyncSchedulingNet::DoRunAsync() {
-  std::unique_lock<std::mutex> lock(running_mutex_);
-  CAFFE_ENFORCE(!running_, "Concurrent RunAsync calls");
-  running_ = true;
-  reset();
+bool AsyncSchedulingNet::RunAsync() {
+  try {
+    std::unique_lock<std::mutex> lock(running_mutex_);
+    if (running_) {
+      LOG(ERROR) << "Detected concurrent runs";
+      return false;
+    }
+    running_ = true;
+    reset();
 
-  StartAllObservers();
+    StartAllObservers();
 
-  for (auto task_id = 0; task_id < tasksNum(); ++task_id) {
-    if (parents(task_id).empty()) {
-      schedule(task_id);
+    for (auto task_id = 0; task_id < tasksNum(); ++task_id) {
+      if (parents(task_id).empty()) {
+        schedule(task_id);
+      }
     }
+  } catch (const std::exception& e) {
+    LOG(ERROR) << "Exception while starting an async run: " << e.what();
+    finalizeEvents();
+    finishRun();
+    return false;
   }
 
   if (tasksNum() == 0) {

diff --git a/caffe2/core/net_async_scheduling.h b/caffe2/core/net_async_scheduling.h
@@ -15,11 +15,11 @@ class AsyncSchedulingNet : public AsyncNetBase {
   void Wait() override;
 
  protected:
-  bool DoRunAsync() override;
+  bool RunAsync() override;
 
   void pollAndSchedule(int task_id);
   void schedule(int task_id);
-  void reset();
+  void reset() override;
   virtual void finishRun();
   int updateParentCount(int child_id);
 
@@ -28,9 +28,6 @@ class AsyncSchedulingNet : public AsyncNetBase {
   std::atomic<bool> running_;
   std::atomic<bool> success_;
 
-  std::mutex cleanup_mutex_;
-  std::atomic<bool> cleanup_;
-
   std::atomic<int> processed_tasks_num_;
 
   DISABLE_COPY_AND_ASSIGN(AsyncSchedulingNet);