Skip to content

Commit 649e89a

Browse files
committed
Allow Process Group to support multiple backends and move PG specific implementations to backend class
ghstack-source-id: 58d6d12 Pull Request resolved: #88330
1 parent fddac50 commit 649e89a

20 files changed

+678
-199
lines changed

test/forward_backward_compatibility/check_forward_backward_compatibility.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -285,8 +285,7 @@
285285
("aten::_flash_scaled_dot_product_attention", datetime.date(2022, 11, 1)),
286286
("aten::_scaled_dot_product_attention", datetime.date(2022, 11, 1)),
287287
# Distributed c10d ops are all going to be updated
288-
("c10d::.*", datetime.date(2022, 10, 31)),
289-
("c10d::allgather_", datetime.date(2022, 10, 1)),
288+
("c10d::.*", datetime.date(2022, 12, 31)),
290289
("aten::to_padded_tensor", datetime.date(2022, 10, 1)),
291290
("aten::nested_to_padded_tensor", datetime.date(2022, 10, 1)),
292291
("aten::nested_tensor", datetime.date(2022, 10, 15)),

torch/csrc/distributed/c10d/Backend.hpp

Lines changed: 20 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
#include <ATen/ATen.h>
1111
#include <c10/macros/Macros.h>
1212

13-
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
1413
#include <torch/csrc/distributed/c10d/Work.hpp>
1514
#include <torch/csrc/distributed/c10d/Types.hpp>
1615
#include <torch/csrc/distributed/c10d/Utils.hpp>
@@ -22,28 +21,28 @@ constexpr auto kDefaultTimeout =
2221

2322
namespace c10d {
2423

25-
// Options is a base struct that defines the basic options
26-
// when constructing a Backend. Each Backend subclass should
27-
// extend this struct and define its options if it wants to provide more
28-
// config options (beyond basic ones defined here) to end user.
29-
struct TORCH_API Options : torch::CustomClassHolder {
30-
explicit Options(
31-
std::string backend,
32-
std::chrono::milliseconds timeout = kDefaultTimeout)
33-
: timeout(timeout), backend(backend) {}
34-
virtual ~Options() = default;
35-
36-
std::chrono::milliseconds timeout;
37-
38-
// backend name
39-
const std::string backend;
40-
};
41-
4224
class TORCH_API Backend : public torch::CustomClassHolder {
4325
public:
4426
explicit Backend(int rank, int size);
4527
virtual ~Backend() = 0;
4628

29+
int getRank() const {
30+
return rank_;
31+
}
32+
33+
int getSize() const {
34+
return size_;
35+
}
36+
37+
virtual void startCoalescing() {
38+
// no-op for backends that have not implemented startCoalescing
39+
}
40+
41+
virtual void endCoalescing(
42+
std::vector<c10::intrusive_ptr<Work>>& /* reqs */) {
43+
// no-op for backends that have not implemented endCoalescing
44+
}
45+
4746
// Subclasses must override this method to return the backend name
4847
virtual const std::string getBackendName() const {
4948
TORCH_INTERNAL_ASSERT(false, "getBackendName is not implemented.");
@@ -255,14 +254,6 @@ class TORCH_API Backend : public torch::CustomClassHolder {
255254
c10::str("Backend ", getBackendName(), "does not support barrier"));
256255
}
257256

258-
int getRank() const {
259-
return rank_;
260-
}
261-
262-
int getSize() const {
263-
return size_;
264-
}
265-
266257
protected:
267258
// Implementations of this interface need to call this to setup
268259
// appropriate logging etc.
@@ -272,6 +263,9 @@ class TORCH_API Backend : public torch::CustomClassHolder {
272263
c10::optional<c10d::SequenceNum> sequenceNum_ = c10::nullopt;
273264
const int rank_;
274265
const int size_;
266+
// Debug level setting. It is parsed once when ProcessGroup is constructed and
267+
// remains the same across use of this process group.
268+
DebugLevel dist_debug_level_;
275269
};
276270

277271
} // namespace c10d

torch/csrc/distributed/c10d/Ops.cpp

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,13 @@ c10::intrusive_ptr<Work> reduce_(
6060
std::tuple<std::vector<std::vector<at::Tensor>>, c10::intrusive_ptr<Work>>
6161
allgather_(
6262
const std::vector<std::vector<at::Tensor>>& output_tensors,
63-
const std::vector<at::Tensor>& input_tensors,
63+
at::TensorList input_tensors,
6464
const c10::intrusive_ptr<ProcessGroup>& process_group,
6565
int64_t timeout) {
66+
auto input_tensors_vec = input_tensors.vec();
6667
auto work = process_group->allgather(
6768
const_cast<std::vector<std::vector<at::Tensor>>&>(output_tensors),
68-
const_cast<std::vector<at::Tensor>&>(input_tensors),
69+
input_tensors_vec,
6970
AllgatherOptions{std::chrono::milliseconds(timeout)});
7071

7172
// Copy output tensors (not storage) so that this can be used in a functional
@@ -132,6 +133,7 @@ c10::intrusive_ptr<Work> alltoall_(
132133
}
133134

134135
c10::intrusive_ptr<Work> barrier(
136+
at::Tensor /* unused */,
135137
const c10::intrusive_ptr<ProcessGroup>& process_group,
136138
const std::vector<int64_t>& device_ids,
137139
int64_t timeout) {
@@ -252,17 +254,18 @@ c10::intrusive_ptr<Work> allreduce(
252254
c10::intrusive_ptr<Work> allgather(
253255
const c10::intrusive_ptr<ProcessGroup>& process_group,
254256
const std::vector<std::vector<at::Tensor>>& output_tensors,
255-
const std::vector<at::Tensor>& input_tensors,
257+
at::TensorList input_tensors,
256258
const AllgatherOptions& opts) {
257259
static auto op = c10::Dispatcher::singleton()
258260
.findSchemaOrThrow("c10d::allgather_", "")
259261
.typed<std::tuple<
260262
std::vector<std::vector<at::Tensor>>,
261263
c10::intrusive_ptr<Work>>(
262264
const std::vector<std::vector<at::Tensor>>&,
263-
const std::vector<at::Tensor>&,
265+
at::TensorList,
264266
const c10::intrusive_ptr<::c10d::ProcessGroup>&,
265267
int64_t)>();
268+
266269
return std::get<1>(op.call(
267270
output_tensors, input_tensors, process_group, opts.timeout.count()));
268271
}
@@ -376,10 +379,22 @@ c10::intrusive_ptr<Work> barrier(
376379
static auto op = c10::Dispatcher::singleton()
377380
.findSchemaOrThrow("c10d::barrier", "")
378381
.typed<c10::intrusive_ptr<::c10d::Work>(
382+
at::Tensor,
379383
const c10::intrusive_ptr<::c10d::ProcessGroup>&,
380384
const std::vector<int64_t>&,
381385
int64_t)>();
382-
return op.call(process_group, opts.device_ids, opts.timeout.count());
386+
387+
// Default to using cpu implementation
388+
at::Tensor tensor = at::empty({0}, at::TensorOptions().device(at::kCPU));
389+
// if opts.device_ids or backend is nccl are specified then use cuda
390+
// implementation
391+
// TODO: getBackendName() is always "NOT DEFINED"
392+
if (opts.device_ids.size() > 0 || process_group->getBackendName() == "nccl") {
393+
// set cuda tensor
394+
tensor = at::empty(
395+
{0}, at::TensorOptions().device(at::kCUDA, opts.device_ids[0]));
396+
}
397+
return op.call(tensor, process_group, opts.device_ids, opts.timeout.count());
383398
}
384399

385400
c10::intrusive_ptr<Work> send(

torch/csrc/distributed/c10d/Ops.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ TORCH_API c10::intrusive_ptr<Work> allreduce(
2424
TORCH_API c10::intrusive_ptr<Work> allgather(
2525
const c10::intrusive_ptr<ProcessGroup>& process_group,
2626
const std::vector<std::vector<at::Tensor>>& output_tensors,
27-
const std::vector<at::Tensor>& input_tensors,
27+
at::TensorList input_tensors,
2828
const AllgatherOptions& opts = {});
2929

3030
TORCH_API c10::intrusive_ptr<Work> reduce_scatter(

0 commit comments

Comments
 (0)