
Commit 6e5f736

H-Huang authored and pytorchmergebot committed
[15/N] Add allreduce_coalesced custom op with CPU/CUDA implementations (#88846)
Differential Revision: [D41227740](https://our.internmc.facebook.com/intern/diff/D41227740)

Pull Request resolved: #88846

Approved by: https://github.com/kwen2501
1 parent ae2c668 commit 6e5f736

File tree

7 files changed: +102 −3 lines changed
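Before the per-file diffs, here is a minimal sketch of the user-facing call that this commit reroutes through the new custom op. The script is illustrative only (it is not part of the commit) and assumes a gloo rendezvous via environment variables, e.g. launched with `torchrun --nproc_per_node=2`:

```python
import torch
import torch.distributed as dist

def main():
    # gloo keeps the sketch CPU-only; the commit adds a CUDA path for NCCL too.
    dist.init_process_group("gloo")
    world_size = dist.get_world_size()

    # Several tensors reduced in one coalesced call rather than one
    # all_reduce round trip per tensor.
    tensors = [torch.ones(10, 10), torch.ones(5)]
    dist.all_reduce_coalesced(tensors, dist.ReduceOp.SUM)

    # Each tensor now holds the elementwise sum across all ranks.
    for t in tensors:
        assert torch.equal(t, torch.ones_like(t) * world_size)

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
```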

test/distributed/test_c10d_common.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -1503,6 +1503,21 @@ def _test_collectives(self, backend):
                 with self.subTest(collective=collective, args=args):
                     self._call_collective_with_varying_tensors(backend, collective, *args)
 
+    def _test_allreduce_coalesced(self, backend):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            backend,
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        # TODO: this will be updated in the future to not be backend specific
+        device = "cuda" if backend == "nccl" else "cpu"
+        tensors = [torch.ones(10, 10, device=torch.device(device))]
+        dist.all_reduce_coalesced(tensors, dist.ReduceOp.SUM)
+        for tensor in tensors:
+            self.assertEqual(tensor, torch.ones(10, 10) * self.world_size)
+
 class CompilerTest(MultiProcessTestCase):
     def setUp(self):
         super(CompilerTest, self).setUp()
```
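For reference, the semantics the test above asserts can be stated as a plain loop: a coalesced allreduce must produce the same values as reducing each tensor individually, with the batching being purely an efficiency concern. A hypothetical helper (not from this commit) that spells that out:

```python
import torch.distributed as dist

def allreduce_coalesced_reference(tensors, op=dist.ReduceOp.SUM):
    """Semantically equivalent loop: one collective per tensor.

    all_reduce_coalesced produces the same values but issues a single
    batched operation instead of len(tensors) separate ones.
    """
    for t in tensors:
        dist.all_reduce(t, op=op)
```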

test/distributed/test_c10d_gloo.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -2363,6 +2363,10 @@ class GlooProcessGroupWithDispatchedCollectivesTests(test_c10d_common.ProcessGro
     def test_collectives(self):
         self._test_collectives(backend="gloo")
 
+    @requires_gloo()
+    def test_allreduce_coalesced(self):
+        self._test_allreduce_coalesced(backend="gloo")
+
 class CompilerTest(test_c10d_common.CompilerTest):
 
     @property
```

test/distributed/test_c10d_nccl.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -2953,6 +2953,11 @@ class NcclProcessGroupWithDispatchedCollectivesTests(test_c10d_common.ProcessGro
     def test_collectives(self):
         self._test_collectives(backend="nccl")
 
+    @requires_nccl()
+    @skip_if_lt_x_gpu(1)
+    def test_allreduce_coalesced(self):
+        self._test_allreduce_coalesced(backend="nccl")
+
 if __name__ == "__main__":
     assert (
         not torch.cuda._initialized
```

torch/csrc/distributed/c10d/Ops.cpp

Lines changed: 36 additions & 0 deletions
```diff
@@ -40,6 +40,19 @@ std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> allreduce_(
       std::move(tensor_vec), work);
 }
 
+c10::intrusive_ptr<Work> allreduce_coalesced_(
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const c10::intrusive_ptr<ReduceOp>& reduce_op,
+    int64_t timeout) {
+  auto tensor_vec = tensors.vec();
+  AllreduceCoalescedOptions opts = AllreduceCoalescedOptions{};
+  opts.reduceOp = *reduce_op.get();
+  opts.timeout = std::chrono::milliseconds(timeout);
+
+  return process_group->allreduce_coalesced(tensor_vec, opts);
+}
+
 c10::intrusive_ptr<Work> reduce_(
     at::TensorList tensors,
     const c10::intrusive_ptr<ProcessGroup>& process_group,
@@ -177,6 +190,10 @@ TORCH_LIBRARY(c10d, m) {
   m.def(
       "allreduce_",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, allreduce_));
+  m.def(
+      "allreduce_coalesced_",
+      dispatch(
+          c10::DispatchKey::CompositeExplicitAutograd, allreduce_coalesced_));
   m.def(
       "allgather_",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, allgather_));
@@ -249,6 +266,25 @@ c10::intrusive_ptr<Work> allreduce(
       opts.timeout.count()));
 }
 
+c10::intrusive_ptr<Work> allreduce_coalesced(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    at::TensorList tensors,
+    const AllreduceCoalescedOptions& opts) {
+  static auto op = c10::Dispatcher::singleton()
+                       .findSchemaOrThrow("c10d::allreduce_coalesced_", "")
+                       .typed<c10::intrusive_ptr<::c10d::Work>(
+                           at::TensorList,
+                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
+                           const c10::intrusive_ptr<::c10d::ReduceOp>&,
+                           int64_t)>();
+
+  return op.call(
+      tensors,
+      process_group,
+      c10::make_intrusive<ReduceOp>(opts.reduceOp),
+      opts.timeout.count());
+}
+
 c10::intrusive_ptr<Work> allgather(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const std::vector<std::vector<at::Tensor>>& output_tensors,
```
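Once this `m.def` registration has run (importing `torch.distributed` loads the c10d library), the schema should be discoverable from Python through `torch.ops`. A small sketch, stated as an assumption about dispatcher behavior rather than anything this commit adds:

```python
import torch
import torch.distributed  # loading c10d registers the ops above

# The overload packet for the schema defined with
# m.def("allreduce_coalesced_", ...).
op = torch.ops.c10d.allreduce_coalesced_
print(op)
```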

torch/csrc/distributed/c10d/Ops.hpp

Lines changed: 5 additions & 0 deletions
```diff
@@ -21,6 +21,11 @@ TORCH_API c10::intrusive_ptr<Work> allreduce(
     at::TensorList tensors,
     const AllreduceOptions& opts = {});
 
+TORCH_API c10::intrusive_ptr<Work> allreduce_coalesced(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    at::TensorList tensors,
+    const AllreduceCoalescedOptions& opts = {});
+
 TORCH_API c10::intrusive_ptr<Work> allgather(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const std::vector<std::vector<at::Tensor>>& output_tensors,
```

torch/csrc/distributed/c10d/OpsImpl.cpp

Lines changed: 34 additions & 0 deletions
```diff
@@ -149,6 +149,32 @@ std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> allreduce_cuda_(
       std::move(tensor_vec), work);
 }
 
+c10::intrusive_ptr<Work> allreduce_coalesced_cpu_(
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const c10::intrusive_ptr<ReduceOp>& reduce_op,
+    int64_t timeout) {
+  auto tensor_vec = tensors.vec();
+  AllreduceCoalescedOptions opts = AllreduceCoalescedOptions{};
+  opts.reduceOp = *reduce_op.get();
+  opts.timeout = std::chrono::milliseconds(timeout);
+
+  return process_group->allreduce_coalesced(tensor_vec, opts);
+}
+
+c10::intrusive_ptr<Work> allreduce_coalesced_cuda_(
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const c10::intrusive_ptr<ReduceOp>& reduce_op,
+    int64_t timeout) {
+  auto tensor_vec = tensors.vec();
+  AllreduceCoalescedOptions opts = AllreduceCoalescedOptions{};
+  opts.reduceOp = *reduce_op.get();
+  opts.timeout = std::chrono::milliseconds(timeout);
+
+  return process_group->allreduce_coalesced(tensor_vec, opts);
+}
+
 std::tuple<std::vector<std::vector<at::Tensor>>, c10::intrusive_ptr<Work>>
 allgather_cpu_(
     const std::vector<std::vector<at::Tensor>>& output_tensors,
@@ -367,6 +393,14 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("allreduce_", allreduce_cuda_);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("allreduce_coalesced_", allreduce_coalesced_cpu_);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("allreduce_coalesced_", allreduce_coalesced_cuda_);
+}
+
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
   m.impl("allgather_", allgather_cpu_);
 }
```
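The pattern above, one `TORCH_LIBRARY` schema plus one `TORCH_LIBRARY_IMPL` kernel per dispatch key, has a Python analogue in `torch.library`. A rough sketch under a made-up `mylib` namespace with a trivial op (assumptions: the namespace and the `scale` op are illustrative, not part of c10d):

```python
import torch
from torch.library import Library

# Analogue of TORCH_LIBRARY(c10d, m) { m.def(...); }: define the schema once.
lib = Library("mylib", "DEF")  # "mylib" is an illustrative namespace
lib.define("scale(Tensor x, float alpha) -> Tensor")

# Analogue of TORCH_LIBRARY_IMPL(c10d, CPU/CUDA, m): one kernel per backend.
def scale_cpu(x, alpha):
    return x * alpha

def scale_cuda(x, alpha):
    return x * alpha

lib.impl("scale", scale_cpu, "CPU")
lib.impl("scale", scale_cuda, "CUDA")

# The dispatcher picks the kernel from the input tensor's device.
y = torch.ops.mylib.scale(torch.ones(3), 2.0)
```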

torch/csrc/distributed/c10d/init.cpp

Lines changed: 3 additions & 3 deletions
```diff
@@ -1134,10 +1134,10 @@ that adds a prefix to each key inserted to the store.
 
           .def(
               "allreduce_coalesced",
-              [](::c10d::ProcessGroup& self,
-                 std::vector<at::Tensor>& xs,
+              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
+                 const std::vector<at::Tensor>& xs,
                  ::c10d::AllreduceCoalescedOptions opts) {
-                return self.allreduce_coalesced(xs, opts);
+                return ::c10d::ops::allreduce_coalesced(self, xs, opts);
               },
               py::arg("tensors"),
               py::arg("opts") = ::c10d::AllreduceCoalescedOptions(),
```
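With the binding rerouted, the `ProcessGroup.allreduce_coalesced` method and `dist.all_reduce_coalesced` funnel into the same dispatcher op. A short sketch of the method-level call, assuming an initialized default group (e.g. under `torchrun`):

```python
import torch
import torch.distributed as dist

dist.init_process_group("gloo")  # assumes env:// rendezvous (e.g. torchrun)
pg = dist.group.WORLD            # default ProcessGroup bound above

tensors = [torch.ones(4), torch.ones(2, 2)]
work = pg.allreduce_coalesced(tensors)  # default AllreduceCoalescedOptions
work.wait()                             # block until the reduction completes
```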
