
Commit baf88b6

Draft for sub-tasks 1 and 2 of DDP Communication Hook
Pull Request resolved: #40848
Draft for sub-tasks 1 and 2 of [39272](#39272)
ghstack-source-id: 107207660
Differential Revision: [D22328310](https://our.internmc.facebook.com/intern/diff/D22328310/)
1 parent 300a3aa commit baf88b6
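
For orientation, the sketch below shows how the hook API introduced in this draft is meant to be exercised from Python. It mirrors the new test_ddp_comm_hook_future_passing test further down; MyModule, store, rank, and world_size are placeholders for whatever setup the caller already has.

import torch
import torch.distributed as c10d
from torch.nn.parallel import DistributedDataParallel

def simple_hook(state, bucket):
    # A DDP communication hook takes an opaque `state` object plus a GradBucket
    # and returns a future that the reducer waits on instead of running its
    # built-in allreduce on that bucket.
    fut = torch.futures.Future()
    fut.set_result(bucket.get_tensors())
    return fut

# Placeholder wiring, as done in the test below:
# process_group = c10d.ProcessGroupGloo(store, rank, world_size)
# model = DistributedDataParallel(MyModule(), process_group=process_group)
# model.reducer.register_comm_hook(None, simple_hook)  # state is None here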

File tree: 6 files changed, +192 −3 lines

  test/distributed/test_c10d.py
  torch/csrc/distributed/c10d/comm.cpp
  torch/csrc/distributed/c10d/comm.h
  torch/csrc/distributed/c10d/init.cpp
  torch/csrc/distributed/c10d/reducer.cpp
  torch/csrc/distributed/c10d/reducer.h

test/distributed/test_c10d.py

Lines changed: 84 additions & 0 deletions
@@ -2989,6 +2989,54 @@ def test_param_layout_mismatch_error(self):
         with self.assertRaisesRegex(RuntimeError, ".* appears not to match strides of the same param in process 0"):
             m_ddp = DistributedDataParallel(m, device_ids=[dev0], process_group=process_group)
 
+    @requires_gloo()
+    def test_ddp_comm_hook_future_passing(self):
+        """
+        This unit test verifies whether the Future object is passed properly.
+        The callback function creates a Future object and sets a value to it.
+        """
+        class test_ddp_comm_hook(nn.Module):
+            def __init__(self):
+                super(test_ddp_comm_hook, self).__init__()
+                self.t0 = Task()
+
+            def forward(self, x, rank):
+                return self.t0(x + rank)
+
+        def run_and_verify_grad(model):
+            # Run forward
+            output = model(8, self.rank)
+
+            # The grads of all parameters should be None at this point.
+            [self.assertIsNone(p.grad) for p in model.parameters()]
+
+            # Run backward
+            output.mean().backward()
+
+            # Now locally unused parameter should have grad updated on all ranks.
+            [self.assertEqual(p.grad, torch.ones(2, 2)) for p in model.parameters()]
+
+        def simple_hook(state, bucket):
+            fut = torch.futures.Future()
+            fut.set_result([torch.ones(4)])
+
+            def fut_then(fut):
+                # bucket.set_tensors(fut.wait())
+                for bt, ft in zip(bucket.get_tensors(), fut.wait()):
+                    bt.copy_(ft)
+            return fut.then(fut_then)
+
+        store = c10d.FileStore(self.file_name, self.world_size)
+        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
+
+        # Test on CPU
+        cpu_model = DistributedDataParallel(
+            test_ddp_comm_hook().cpu(),
+            process_group=process_group
+        )
+        cpu_model.reducer.register_comm_hook(None, simple_hook)
+        run_and_verify_grad(cpu_model)
+
 
 class ReducerModule(nn.Module):
     def __init__(self):
@@ -3125,6 +3173,42 @@ def test_forward_backward_optimizer(self):
             output.backward()
             optimizer.step()
 
+    def test_ddp_comm_hook_register_just_once(self):
+        """
+        DDP communication hook can only be registered once. This test validates whether
+        the error is thrown properly when register_comm_hook is called more than once.
+        """
+        model = self._create_mixed_precision_model()
+        reducer = self._create_reducer_for_models([model], find_unused_parameters=True)
+
+        def dummy_hook(state, bucket):
+            fut = torch.futures.Future()
+            fut.set_result(bucket.get_tensors())
+            return fut.then()
+        reducer.register_comm_hook(None, dummy_hook)
+        try:
+            reducer.register_comm_hook(None, dummy_hook)
+        except Exception as e:
+            if "register_comm_hook can only be called once" in str(e):
+                return
+            else:
+                raise e
+
+    def test_ddp_comm_hook_callable(self):
+        """
+        The Python hook must be callable. This unit test checks whether this condition
+        is properly checked inside reducer.
+        """
+        model = self._create_mixed_precision_model()
+        reducer = self._create_reducer_for_models([model], find_unused_parameters=True)
+        try:
+            reducer.register_comm_hook(state=None, hook=1)
+        except Exception as e:
+            if "comm_hook must be callable" in str(e):
+                return
+            else:
+                raise e
+
 
 class ComputeBucketAssignmentTest(TestCase):
     def test_single_limit_single_dtype(self):

torch/csrc/distributed/c10d/comm.cpp

Lines changed: 22 additions & 0 deletions
@@ -4,6 +4,7 @@
 
 #include <ATen/core/functional.h>
 #include <torch/csrc/distributed/c10d/reducer.h>
+#include <torch/csrc/jit/python/pybind_utils.h>
 #include <torch/csrc/utils/tensor_flatten.h>
 
 namespace c10d {
@@ -79,4 +80,25 @@ void broadcast_coalesced(
   }
 }
 
+GradBucket::GradBucket(std::vector<at::Tensor>& tensors) : tensors_(tensors){};
+
+std::vector<at::Tensor> GradBucket::get_tensors() {
+  return tensors_;
+};
+
+void GradBucket::set_tensors(std::vector<at::Tensor>& tensors) {
+  tensors_ = tensors;
+}
+
+PythonCommHook::PythonCommHook(py::object state, py::object hook)
+    : state_(std::move(state)), hook_(std::move(hook)){};
+c10::intrusive_ptr<torch::jit::Future> PythonCommHook::operate(
+    const GradBucket& bucket) {
+  py::gil_scoped_acquire acquire;
+
+  c10::intrusive_ptr<torch::jit::Future> fut;
+  return hook_(state_, bucket)
+      .cast<std::shared_ptr<torch::jit::PythonFutureWrapper>>()
+      ->fut;
+};
 } // namespace c10d
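
Since PythonCommHook::operate casts whatever the Python hook returns into a torch::jit::PythonFutureWrapper, the hook has to hand back a torch.futures.Future; Future.then(...) also returns a Future, which is why the test above can return fut.then(fut_then). A minimal sketch of that contract:

import torch

def minimal_hook(state, bucket):
    # The return value must be a torch.futures.Future so the C++ side can
    # unwrap it; completing it immediately makes the reducer's later wait()
    # on this bucket trivial.
    fut = torch.futures.Future()
    fut.set_result(bucket.get_tensors())
    return fut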

torch/csrc/distributed/c10d/comm.h

Lines changed: 28 additions & 0 deletions
@@ -4,6 +4,7 @@
 
 #include <ATen/ATen.h>
 #include <c10d/ProcessGroup.hpp>
+#include <torch/csrc/utils/pybind.h>
 
 namespace c10d {
 
@@ -13,4 +14,31 @@ void broadcast_coalesced(
     at::TensorList tensors,
     size_t buffer_size);
 
+class GradBucket {
+ public:
+  explicit GradBucket(std::vector<at::Tensor>& tensors);
+  std::vector<at::Tensor> get_tensors();
+  void set_tensors(std::vector<at::Tensor>& tensors);
+
+ private:
+  std::vector<at::Tensor> tensors_;
+};
+
+struct CommHookInterface {
+ public:
+  virtual c10::intrusive_ptr<torch::jit::Future> operate(
+      const GradBucket& bucket) = 0;
+};
+
+class TORCH_API PythonCommHook : public CommHookInterface {
+ public:
+  PythonCommHook(py::object state, py::object hook);
+
+  c10::intrusive_ptr<torch::jit::Future> operate(
+      const GradBucket& bucket) override;
+
+ private:
+  py::object state_;
+  py::object hook_;
+};
 } // namespace c10d

torch/csrc/distributed/c10d/init.cpp

Lines changed: 18 additions & 0 deletions
@@ -115,6 +115,18 @@ PyObject* c10d_init(PyObject* _unused) {
 
   auto module = py::handle(c10d_module).cast<py::module>();
 
+  shared_ptr_class_<::c10d::GradBucket>(module, "GradBucket")
+      .def(py::init<std::vector<Tensor>&>(), py::arg("tensors"))
+      .def(
+          "get_tensors",
+          &::c10d::GradBucket::get_tensors,
+          py::call_guard<py::gil_scoped_release>())
+      .def(
+          "set_tensors",
+          &::c10d::GradBucket::set_tensors,
+          py::arg("tensors"),
+          py::call_guard<py::gil_scoped_release>());
+
   shared_ptr_class_<::c10d::Reducer>(module, "Reducer")
       .def(
           py::init<
@@ -131,6 +143,12 @@ PyObject* c10d_init(PyObject* _unused) {
           py::arg("bucket_bytes_cap") = ::c10d::kDefaultBucketBytesCap,
           py::arg("find_unused_parameters") = false,
           py::call_guard<py::gil_scoped_release>())
+      .def(
+          "register_comm_hook",
+          &::c10d::Reducer::register_comm_hook,
+          py::arg("state"),
+          py::arg("hook"),
+          py::call_guard<py::gil_scoped_release>())
       .def(
           "initialize_buckets",
           &::c10d::Reducer::initialize_buckets,
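
The bindings above expose GradBucket (with get_tensors/set_tensors) and Reducer.register_comm_hook to Python. A hypothetical snippet poking at the GradBucket binding directly, assuming it surfaces next to the other c10d classes in the module the tests import as c10d:

import torch
import torch.distributed as c10d  # assumption: GradBucket lands beside FileStore, Reducer, etc.

tensors = [torch.zeros(2, 2), torch.ones(4)]
bucket = c10d.GradBucket(tensors)                        # py::init<std::vector<Tensor>&>, arg "tensors"
print(bucket.get_tensors())                              # the wrapped tensor list
bucket.set_tensors([torch.ones(2, 2), torch.zeros(4)])   # replace the bucket's tensors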

torch/csrc/distributed/c10d/reducer.cpp

Lines changed: 31 additions & 3 deletions
@@ -14,6 +14,7 @@
 #include <torch/csrc/distributed/c10d/comm.h>
 #include <torch/csrc/utils/hash.h>
 #include <torch/csrc/utils/memory.h>
+#include <torch/csrc/utils/pybind.h>
 
 namespace c10d {
 namespace {
@@ -161,6 +162,8 @@ Reducer::Reducer(
       }
     }
   }
+
+  comm_hook_.reset();
 }
 
 // Note [Skip allreducing local_used_maps_dev]
@@ -575,7 +578,11 @@ void Reducer::mark_bucket_ready(size_t bucket_index) {
       //
       tensors.push_back(replica.contents);
     }
-    bucket.work = process_group_->allreduce(tensors);
+    if (comm_hook_ == nullptr) {
+      bucket.work = process_group_->allreduce(tensors);
+    } else {
+      bucket.future_work = comm_hook_->operate(GradBucket(tensors));
+    }
   }
 }
 
@@ -924,8 +931,13 @@ void Reducer::finalize_backward() {
 
   // Wait for asynchronous reduction to complete and unflatten contents.
   for (auto& bucket : buckets_) {
-    TORCH_INTERNAL_ASSERT(bucket.work);
-    bucket.work->wait();
+    if (comm_hook_ == nullptr) {
+      TORCH_INTERNAL_ASSERT(bucket.work);
+      bucket.work->wait();
+    } else {
+      TORCH_INTERNAL_ASSERT(bucket.future_work);
+      bucket.future_work->wait();
+    }
     if (!bucket.expect_sparse_gradient) {
       // We don't need to finalize the sparse bucket since the sparse grad and
       // the bucket essentially point to the same storage. As a result, once
@@ -1079,6 +1091,22 @@ std::vector<std::vector<size_t>> Reducer::rebuildBuckets() {
   return rebuilt_bucket_indices;
 }
 
+void Reducer::register_comm_hook(py::object state, py::object comm_hook) {
+  TORCH_CHECK(
+      py::isinstance<py::function>(comm_hook), "comm_hook must be callable.");
+
+  Reducer::register_comm_hook_internal(
+      std::make_unique<PythonCommHook>(std::move(state), std::move(comm_hook)));
+}
+
+void Reducer::register_comm_hook_internal(
+    std::unique_ptr<CommHookInterface> iface) {
+  TORCH_CHECK(
+      comm_hook_ == nullptr, "register_comm_hook can only be called once.");
+
+  comm_hook_ = std::move(iface);
+}
+
 namespace {
 
 // Tensors may be coalesced into buckets. Buckets must contain tensors of
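
With a hook registered, mark_bucket_ready takes the comm_hook_ branch instead of calling process_group_->allreduce, and finalize_backward waits on bucket.future_work rather than bucket.work. The sketch below, built only from pieces visible in this diff, also shows that the state object handed to register_comm_hook is forwarded by PythonCommHook to every hook invocation:

import torch

class HookState:
    # Arbitrary Python object passed as `state` to register_comm_hook.
    def __init__(self):
        self.buckets_seen = 0

def counting_hook(state, bucket):
    # `state` is the HookState above; `bucket` is the GradBucket for this call.
    state.buckets_seen += 1
    fut = torch.futures.Future()
    fut.set_result(bucket.get_tensors())
    return fut

# e.g., with a Reducer instance as constructed in the ReducerTest cases above:
# state = HookState()
# reducer.register_comm_hook(state, counting_hook)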

torch/csrc/distributed/c10d/reducer.h

Lines changed: 9 additions & 0 deletions
@@ -11,6 +11,8 @@
 #include <torch/csrc/autograd/function.h>
 #include <torch/csrc/autograd/variable.h>
 #include <torch/csrc/distributed/autograd/context/context.h>
+#include <torch/csrc/distributed/c10d/comm.h>
+#include <torch/csrc/utils/pybind.h>
 
 namespace c10d {
 
@@ -53,6 +55,8 @@ class Reducer {
     return backward_stats_;
   }
 
+  void register_comm_hook(py::object state, py::object comm_hook);
+
  protected:
   // Forward declaration.
   struct Bucket;
@@ -99,6 +103,9 @@ class Reducer {
   // Work handle for allreduce on local_used_maps_
   std::shared_ptr<c10d::ProcessGroup::Work> local_used_work_;
 
+  std::unique_ptr<CommHookInterface> comm_hook_;
+  void register_comm_hook_internal(std::unique_ptr<CommHookInterface> iface);
+
   void verify_replicas_within_process();
 
   void verify_replica0_across_processes();
@@ -197,6 +204,8 @@ class Reducer {
     // Keep work handle around when this set of buckets is being reduced.
     std::shared_ptr<c10d::ProcessGroup::Work> work;
 
+    c10::intrusive_ptr<torch::jit::Future> future_work;
+
     // If this bucket should expect a single sparse gradient.
     // Implies: replicas[i].variables.size() == 1.
    bool expect_sparse_gradient = false;
