[6/N] [Dispatchable Collectives] Update recv with CPU / CUDA implementations

H-Huang · H-Huang · commit 23a5fd0987d8 · 2022-08-22T16:10:23.000-07:00
ghstack-source-id: 6c91a29
Pull Request resolved: #83876
diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
@@ -1358,6 +1358,7 @@ def _test_collectives(self, backend):
         )
         collectives_and_args = [
             (dist.send, self.rank),
+            (dist.recv,),
             (dist.broadcast, self.rank),
             (dist.all_reduce,)
         ]
diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp
@@ -93,16 +93,6 @@ c10::intrusive_ptr<Work> barrier(
       BarrierOptions{device_ids, std::chrono::milliseconds(timeout)});
 }
 
-c10::intrusive_ptr<Work> recv_(
-    at::TensorList tensors,
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    int64_t srcRank,
-    int64_t tag) {
-  auto tensor_vec = tensors.vec();
-  return process_group->recv(
-      tensor_vec, static_cast<int>(srcRank), static_cast<int>(tag));
-}
-
 TORCH_LIBRARY(c10d, m) {
   // The following ProcessGroup and Work definitions are more like declarations.
   // They don't expose the details of the two classes into TorchScript.
@@ -113,6 +103,8 @@ TORCH_LIBRARY(c10d, m) {
   // __torch_dispatch__.
   m.def(
       "send(Tensor[] tensors, __torch__.torch.classes.c10d.ProcessGroup process_group, int dstRank, int tag) -> __torch__.torch.classes.c10d.Work");
+  m.def(
+      "recv_(Tensor[] tensors, __torch__.torch.classes.c10d.ProcessGroup process_group, int srcRank, int tag) -> __torch__.torch.classes.c10d.Work");
   m.def(
       "broadcast_(Tensor[] tensors, __torch__.torch.classes.c10d.ProcessGroup process_group, int root_rank, int root_tensor, int timeout) -> __torch__.torch.classes.c10d.Work");
   m.def(
@@ -138,7 +130,6 @@ TORCH_LIBRARY(c10d, m) {
   m.def(
       "barrier",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, barrier));
-  m.def("recv_", dispatch(c10::DispatchKey::CompositeExplicitAutograd, recv_));
 }
 } // namespace
 
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -27,6 +27,26 @@ c10::intrusive_ptr<Work> send_cuda(
       tensor_vec, static_cast<int>(dstRank), static_cast<int>(tag));
 }
 
+c10::intrusive_ptr<Work> recv_cpu_(
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t srcRank,
+    int64_t tag) {
+  auto tensor_vec = tensors.vec();
+  return process_group->recv(
+      tensor_vec, static_cast<int>(srcRank), static_cast<int>(tag));
+}
+
+c10::intrusive_ptr<Work> recv_cuda_(
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t srcRank,
+    int64_t tag) {
+  auto tensor_vec = tensors.vec();
+  return process_group->recv(
+      tensor_vec, static_cast<int>(srcRank), static_cast<int>(tag));
+}
+
 c10::intrusive_ptr<Work> broadcast_cpu_(
     at::TensorList tensors,
     const c10::intrusive_ptr<ProcessGroup>& process_group,
@@ -89,6 +109,14 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("send", send_cuda);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("recv_", recv_cpu_);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("recv_", recv_cuda_);
+}
+
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
   m.impl("broadcast_", broadcast_cpu_);
 }
diff --git a/torch/csrc/distributed/c10d/OpsImpl.hpp b/torch/csrc/distributed/c10d/OpsImpl.hpp
@@ -19,6 +19,18 @@ c10::intrusive_ptr<Work> send_cuda(
     int64_t dstRank,
     int64_t tag);
 
+c10::intrusive_ptr<Work> recv_cpu_(
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t srcRank,
+    int64_t tag);
+
+c10::intrusive_ptr<Work> recv_cuda_(
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t srcRank,
+    int64_t tag);
+
 c10::intrusive_ptr<Work> broadcast_cpu_(
     at::TensorList tensors,
     const c10::intrusive_ptr<ProcessGroup>& process_group,

Original file line number	Diff line number	Diff line change
`@@ -1358,6 +1358,7 @@ def _test_collectives(self, backend):`
`1358`	`1358`	`)`
`1359`	`1359`	`collectives_and_args = [`
`1360`	`1360`	`(dist.send, self.rank),`
	`1361`	`+ (dist.recv,),`
`1361`	`1362`	`(dist.broadcast, self.rank),`
`1362`	`1363`	`(dist.all_reduce,)`
`1363`	`1364`	`]`