[6/N] [Dispatchable Collectives] Update recv with CPU / CUDA implementations (#83876)

H-Huang · pytorchmergebot · commit d39e9c1e9087 · 2022-10-04T20:30:21.000Z
* Changes: updates the recv collective with CPU and CUDA implementations. Context: #86225. Differential Revision: [D40044552](https://our.internmc.facebook.com/intern/diff/D40044552). Pull Request resolved: #83876. Approved by: https://github.com/kwen2501
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -29,6 +29,26 @@ c10::intrusive_ptr<Work> send_cuda(
       tensor_vec, static_cast<int>(dstRank), static_cast<int>(tag));
 }
 
+c10::intrusive_ptr<Work> recv_cpu_( // implementation bound to the c10d "recv_" op for the CPU dispatch key
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t srcRank, // source rank handed to recv(); narrowed to int below
+    int64_t tag) { // message tag handed to recv(); narrowed to int below
+  auto tensor_vec = tensors.vec(); // recv() is given a local vector built from the TensorList
+  return process_group->recv( // forwards to the process group and returns the Work handle it produces
+      tensor_vec, static_cast<int>(srcRank), static_cast<int>(tag)); // NOTE(review): casts are unchecked; assumes both values fit in int
+}
+
+c10::intrusive_ptr<Work> recv_cuda_( // implementation bound to the c10d "recv_" op for the CUDA dispatch key
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t srcRank, // source rank handed to recv(); narrowed to int below
+    int64_t tag) { // message tag handed to recv(); narrowed to int below
+  auto tensor_vec = tensors.vec(); // recv() is given a local vector built from the TensorList
+  return process_group->recv( // same forwarding body as recv_cpu_; only the dispatch-key registration differs
+      tensor_vec, static_cast<int>(srcRank), static_cast<int>(tag)); // NOTE(review): casts are unchecked; assumes both values fit in int
+}
+
 std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> broadcast_cpu_(
     at::TensorList tensors,
     const c10::intrusive_ptr<ProcessGroup>& process_group,
@@ -105,6 +125,14 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("send", send_cuda);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) { // registration block for the c10d library under the CPU key
+  m.impl("recv_", recv_cpu_); // binds the "recv_" op to recv_cpu_ for this key
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) { // registration block for the c10d library under the CUDA key
+  m.impl("recv_", recv_cuda_); // binds the "recv_" op to recv_cuda_ for this key
+}
+
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
   m.impl("broadcast_", broadcast_cpu_);
 }