pytorch · H-Huang · Oct 6, 2022 · Oct 7, 2022 · Oct 31, 2022 · Nov 9, 2022
diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
@@ -1466,7 +1466,7 @@ def _call_collective_with_varying_tensors(self, backend, collective, *args):
         # multi tensor collectives
         if collective == dist.barrier:
             collective()
-        elif collective == dist.all_gather:
+        elif collective in (dist.all_gather, dist.gather):
             collective([tensor], tensor, *args)
         elif collective == dist.scatter:
             collective(tensor, [tensor], *args)

diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -219,6 +219,30 @@ reduce_scatter_cuda_(
       output_tensors, work);
 }
 
+c10::intrusive_ptr<Work> gather_cpu_(
+    const std::vector<std::vector<at::Tensor>>& output_tensors,
+    const std::vector<at::Tensor>& input_tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t root_rank,
+    int64_t timeout) {
+  return process_group->gather(
+      const_cast<std::vector<std::vector<at::Tensor>>&>(output_tensors),
+      const_cast<std::vector<at::Tensor>&>(input_tensors),
+      GatherOptions{root_rank, std::chrono::milliseconds(timeout)});
+}
+
+c10::intrusive_ptr<Work> gather_cuda_(
+    const std::vector<std::vector<at::Tensor>>& output_tensors,
+    const std::vector<at::Tensor>& input_tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t root_rank,
+    int64_t timeout) {
+  return process_group->gather(
+      const_cast<std::vector<std::vector<at::Tensor>>&>(output_tensors),
+      const_cast<std::vector<at::Tensor>&>(input_tensors),
+      GatherOptions{root_rank, std::chrono::milliseconds(timeout)});
+}
+
 std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> scatter_cpu_(
     const std::vector<at::Tensor>& output_tensors,
     const std::vector<std::vector<at::Tensor>>& input_tensors,
@@ -359,6 +383,14 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("reduce_scatter_", reduce_scatter_cuda_);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("gather_", gather_cpu_);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("gather_", gather_cuda_);
+}
+
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
   m.impl("scatter_", scatter_cpu_);
 }