
Commit b80ffd4

mrshenli authored and facebook-github-bot committed
Revert D20781624: Add NCCL Alltoall to PT NCCL process group
Test Plan: revert-hammer
Differential Revision: D20781624 (b87f0e5)
Original commit changeset: 109436583ff6
fbshipit-source-id: 03f6ee4d56baea93a1cf795d26dd92b7d6d1df28
1 parent ec68329 commit b80ffd4

File tree: 9 files changed (+83, -409 lines)


test/distributed/test_distributed.py

Lines changed: 8 additions & 140 deletions
@@ -1503,46 +1503,24 @@ def test_all_gather_coalesced_with_empty(self):
         self._barrier()

     # AllToAll
-    def _test_all_to_all_single_equal_split_helper(
-        self,
-        group,
-        group_id,
-        rank,
-        cuda=False,
-        rank_to_GPU=None,
-    ):
+    def _test_all_to_all_single_equal_split_helper(self, group, group_id, rank):
         if group_id is not None:
             size = len(group)
             in_tensor = torch.ones([size, size]) * rank
             expected_tensor = torch.cat([torch.ones([1, size]) * i for i in group])
             out_tensor = torch.ones([size, size]) * -1
-            if cuda:
-                in_tensor = in_tensor.cuda(rank_to_GPU[rank][0])
-                expected_tensor = expected_tensor.cuda(rank_to_GPU[rank][0])
-                out_tensor = out_tensor.cuda(rank_to_GPU[rank][0])
             dist.all_to_all_single(out_tensor, in_tensor, group=group_id)
             self.assertEqual(out_tensor, expected_tensor)
         self._barrier()

-    def _test_all_to_all_single_unequal_split_helper(
-        self,
-        group,
-        group_id,
-        rank,
-        cuda=False,
-        rank_to_GPU=None,
-    ):
+    def _test_all_to_all_single_unequal_split_helper(self, group, group_id, rank):
         if group_id is not None:
             size = len(group)
             in_splits = [i + 1 for i in group]
             out_splits = [rank + 1 for _ in group]
             in_tensor = torch.ones([sum(in_splits), size]) * rank
             out_tensor = torch.ones([(rank + 1) * size, size])
             expected_tensor = torch.cat([torch.ones([rank + 1, size]) * i for i in group])
-            if cuda:
-                in_tensor = in_tensor.cuda(rank_to_GPU[rank][0])
-                expected_tensor = expected_tensor.cuda(rank_to_GPU[rank][0])
-                out_tensor = out_tensor.cuda(rank_to_GPU[rank][0])
             dist.all_to_all_single(
                 out_tensor, in_tensor, out_splits, in_splits, group=group_id)
             self.assertEqual(out_tensor, expected_tensor)
@@ -1562,159 +1540,49 @@ def _test_all_to_all_helper(self, group, group_id, rank):
             self.assertEqual(t1, t2)
         self._barrier()

-    @unittest.skipIf(
-        BACKEND != "mpi", "Only MPI supports CPU all_to_all_single"
-    )
+    @unittest.skipIf(BACKEND != "mpi", "Only MPI supports all_to_all_single")
     def test_all_to_all_single_equal_split(self):
         group, group_id, rank = self._init_global_test()
         self._test_all_to_all_single_equal_split_helper(group, group_id, rank)

-    @unittest.skipIf(
-        BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
-    )
-    @skip_if_no_gpu
-    @skip_if_rocm
-    def test_all_to_all_single_equal_split_cuda(self):
-        group, group_id, rank = self._init_global_test()
-        rank_to_GPU = self._init_multigpu_helper()
-        self._test_all_to_all_single_equal_split_helper(
-            group,
-            group_id,
-            rank,
-            True,
-            rank_to_GPU,
-        )
-
-    @unittest.skipIf(
-        BACKEND != "mpi", "Only MPI supports CPU all_to_all_single"
-    )
+    @unittest.skipIf(BACKEND != "mpi", "Only MPI supports all_to_all_single")
     def test_all_to_all_single_unequal_split(self):
         group, group_id, rank = self._init_global_test()
         self._test_all_to_all_single_unequal_split_helper(group, group_id, rank)

-    @unittest.skipIf(
-        BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
-    )
-    @skip_if_no_gpu
-    @skip_if_rocm
-    def test_all_to_all_single_unequal_split_cuda(self):
-        group, group_id, rank = self._init_global_test()
-        rank_to_GPU = self._init_multigpu_helper()
-        self._test_all_to_all_single_unequal_split_helper(
-            group,
-            group_id,
-            rank,
-            True,
-            rank_to_GPU,
-        )
-
     @unittest.skipIf(BACKEND != "mpi", "Only MPI supports all_to_all")
     def test_all_to_all(self):
         group, group_id, rank = self._init_global_test()
         self._test_all_to_all_helper(group, group_id, rank)

-    @unittest.skipIf(
-        BACKEND != "mpi", "Only MPI supports CPU all_to_all_single"
-    )
+    @unittest.skipIf(BACKEND != "mpi", "Only MPI supports all_to_all_single")
     @skip_if_small_worldsize
     def test_all_to_all_single_equal_split_group(self):
         group, group_id, rank = self._init_group_test()
         self._test_all_to_all_single_equal_split_helper(group, group_id, rank)

-    @unittest.skipIf(
-        BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
-    )
-    @skip_if_no_gpu
-    @skip_if_rocm
-    @skip_if_small_worldsize
-    def test_all_to_all_single_equal_split_group_cuda(self):
-        group, group_id, rank = self._init_group_test()
-        rank_to_GPU = self._init_multigpu_helper()
-        self._test_all_to_all_single_equal_split_helper(
-            group,
-            group_id,
-            rank,
-            True,
-            rank_to_GPU,
-        )
-
-    @unittest.skipIf(
-        BACKEND != "mpi", "Only MPI supports CPU all_to_all_single"
-    )
+    @unittest.skipIf(BACKEND != "mpi", "Only MPI supports all_to_all_single")
     @skip_if_small_worldsize
     def test_all_to_all_single_unequal_split_group(self):
         group, group_id, rank = self._init_group_test()
         self._test_all_to_all_single_unequal_split_helper(group, group_id, rank)

-    @unittest.skipIf(
-        BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
-    )
-    @skip_if_no_gpu
-    @skip_if_rocm
-    @skip_if_small_worldsize
-    def test_all_to_all_single_unequal_split_group_cuda(self):
-        group, group_id, rank = self._init_global_test()
-        rank_to_GPU = self._init_multigpu_helper()
-        self._test_all_to_all_single_unequal_split_helper(
-            group,
-            group_id,
-            rank,
-            True,
-            rank_to_GPU,
-        )
-
     @unittest.skipIf(BACKEND != "mpi", "Only MPI supports all_to_all")
     @skip_if_small_worldsize
     def test_all_to_all_group(self):
         group, group_id, rank = self._init_group_test()
         self._test_all_to_all_helper(group, group_id, rank)

-    @unittest.skipIf(
-        BACKEND != "mpi", "Only MPI supports CPU all_to_all_single"
-    )
+    @unittest.skipIf(BACKEND != "mpi", "Only MPI supports all_to_all_single")
     def test_all_to_all_single_equal_split_full_group(self):
         group, group_id, rank = self._init_full_group_test()
         self._test_all_to_all_single_equal_split_helper(group, group_id, rank)

-    @unittest.skipIf(
-        BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
-    )
-    @skip_if_no_gpu
-    @skip_if_rocm
-    def test_all_to_all_single_equal_split_full_group_cuda(self):
-        group, group_id, rank = self._init_full_group_test()
-        rank_to_GPU = self._init_multigpu_helper()
-        self._test_all_to_all_single_equal_split_helper(
-            group,
-            group_id,
-            rank,
-            True,
-            rank_to_GPU,
-        )
-
-    @unittest.skipIf(
-        BACKEND != "mpi", "Only MPI supports CPU all_to_all_single"
-    )
+    @unittest.skipIf(BACKEND != "mpi", "Only MPI supports all_to_all_single")
     def test_all_to_all_single_unequal_split_full_group(self):
         group, group_id, rank = self._init_full_group_test()
         self._test_all_to_all_single_unequal_split_helper(group, group_id, rank)

-    @unittest.skipIf(
-        BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
-    )
-    @skip_if_no_gpu
-    @skip_if_rocm
-    def test_all_to_all_single_unequal_split_full_group_cuda(self):
-        group, group_id, rank = self._init_full_group_test()
-        rank_to_GPU = self._init_multigpu_helper()
-        self._test_all_to_all_single_unequal_split_helper(
-            group,
-            group_id,
-            rank,
-            True,
-            rank_to_GPU,
-        )
-
     @unittest.skipIf(BACKEND != "mpi", "Only MPI supports all_to_all")
     def test_all_to_all_full_group(self):
         group, group_id, rank = self._init_full_group_test()
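
For orientation, below is a minimal, illustrative sketch (not part of this commit) of the all_to_all_single call pattern that the surviving MPI-only tests above exercise; the standalone function name demo_all_to_all_single and the assumption of an already-initialized process group are inventions of the example.

# Illustrative sketch only: equal- and unequal-split all_to_all_single,
# mirroring the helpers kept in the diff above. Assumes a process group
# with `size` ranks has already been initialized (e.g. the "mpi" backend).
import torch
import torch.distributed as dist

def demo_all_to_all_single(rank, size):
    # Equal split: each rank contributes `size` rows; output row i is
    # the one-row block sent by rank i.
    in_tensor = torch.ones([size, size]) * rank
    out_tensor = torch.ones([size, size]) * -1
    dist.all_to_all_single(out_tensor, in_tensor)

    # Unequal split: this rank sends (j + 1) rows to rank j and
    # receives (rank + 1) rows from every peer.
    in_splits = [j + 1 for j in range(size)]
    out_splits = [rank + 1 for _ in range(size)]
    in_tensor = torch.ones([sum(in_splits), size]) * rank
    out_tensor = torch.ones([(rank + 1) * size, size])
    dist.all_to_all_single(out_tensor, in_tensor, out_splits, in_splits)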

torch/lib/c10d/NCCLUtils.hpp

Lines changed: 0 additions & 9 deletions
@@ -17,15 +17,6 @@
 #define ENABLE_NCCL_ERROR_CHECKING
 #endif

-// P2P is enabled only for NCCL versions 2.7+ since ncclSend()
-// and ncclRecv() are not supported in earlier versions.
-#if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && defined(NCCL_MINOR) && \
-    (NCCL_MINOR >= 7)
-#define ENABLE_NCCL_P2P_SUPPORT
-#elif defined(NCCL_MAJOR) && (NCCL_MAJOR >= 3)
-#define ENABLE_NCCL_P2P_SUPPORT
-#endif
-
 // Macro to throw on a non-successful NCCL return value.
 #define C10D_NCCL_CHECK(cmd) \
   do { \
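
The block removed above gated ENABLE_NCCL_P2P_SUPPORT on NCCL 2.7 or newer at compile time, since ncclSend()/ncclRecv() only exist from that release. A plain-Python restatement of the same predicate, purely for illustration (the version numbers are passed in rather than queried from any real API):

# Illustrative only: the version predicate the removed preprocessor
# block expressed. P2P (ncclSend/ncclRecv) requires NCCL 2.7 or newer.
def nccl_p2p_supported(major: int, minor: int) -> bool:
    return (major == 2 and minor >= 7) or major >= 3

assert nccl_p2p_supported(2, 7)
assert not nccl_p2p_supported(2, 6)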

torch/lib/c10d/ProcessGroup.cpp

Lines changed: 0 additions & 66 deletions
@@ -92,70 +92,4 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroup::allgather_coalesced(
       "no support for allgather_coalesced in this process group");
 }

-void ProcessGroup::checkSplitSizes(
-    const std::vector<int64_t>& split_sizes,
-    const at::Tensor& tensor,
-    int group_size) {
-  if (split_sizes.size() == 0) {
-    TORCH_CHECK(
-        tensor.size(0) % group_size == 0,
-        "Tensor's dim 0 does not divide equally across group size");
-  } else {
-    TORCH_CHECK(
-        split_sizes.size() == group_size,
-        "Number of tensor splits not equal to group size");
-    int sum = std::accumulate(split_sizes.begin(), split_sizes.end(), 0);
-    TORCH_CHECK(
-        sum == tensor.size(0), "Split sizes doesn't match total dim 0 size");
-  }
-}
-
-int64_t ProcessGroup::computeLengthsAndOffsets(
-    const std::vector<int64_t>& split_sizes,
-    const at::Tensor& tensor,
-    std::vector<int>* lengths,
-    std::vector<int>* offsets) {
-  int64_t group_size = lengths->size();
-  bool equal_splits = false;
-  int64_t dim0_size = tensor.size(0);
-  int64_t row_size = (dim0_size ? tensor.numel() / dim0_size : 1);
-  int64_t split_size = 0;
-  int64_t offset = 0;
-
-  if (split_sizes.size() == 0) {
-    equal_splits = true;
-    split_size = tensor.size(0) / group_size;
-  }
-  for (int i = 0; i < group_size; i++) {
-    int64_t length = row_size * (equal_splits ? split_size : split_sizes[i]);
-    TORCH_INTERNAL_ASSERT(
-        length <= std::numeric_limits<int>::max() &&
-            offset <= std::numeric_limits<int>::max(),
-        "Length or offset larger than INT_MAX not supported");
-    (*lengths)[i] = length;
-    (*offsets)[i] = offset;
-    offset += length;
-  }
-  return offset;
-}
-
-int64_t ProcessGroup::computeLengthsAndOffsets(
-    const std::vector<at::Tensor>& tensors,
-    std::vector<int>* lengths,
-    std::vector<int>* offsets) {
-  int64_t group_size = lengths->size();
-  int64_t offset = 0;
-  for (int i = 0; i < group_size; i++) {
-    int64_t length = tensors[i].numel();
-    TORCH_INTERNAL_ASSERT(
-        length <= std::numeric_limits<int>::max() &&
-            offset <= std::numeric_limits<int>::max(),
-        "Length or offset larger than INT_MAX not supported");
-    (*lengths)[i] = length;
-    (*offsets)[i] = offset;
-    offset += length;
-  }
-  return offset;
-}
-
 } // namespace c10d
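
The reverted computeLengthsAndOffsets helper turned per-rank splits of a tensor's dim 0 into flat element counts and offsets, the form an alltoallv-style NCCL call consumes. A rough Python equivalent of the split-size overload, illustrative only (the function name is invented here, and the INT_MAX guard of the C++ version is omitted):

# Illustrative re-statement of the reverted C++ helper: given per-rank
# row splits of a tensor, produce per-rank element counts and offsets.
# An empty split list falls back to an equal split, as in the C++ code.
def compute_lengths_and_offsets(split_sizes, tensor, group_size):
    dim0 = tensor.size(0)
    row_size = tensor.numel() // dim0 if dim0 else 1
    if not split_sizes:
        split_sizes = [dim0 // group_size] * group_size
    lengths, offsets, offset = [], [], 0
    for rows in split_sizes:
        length = rows * row_size      # elements destined for this rank
        lengths.append(length)
        offsets.append(offset)
        offset += length
    return lengths, offsets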

torch/lib/c10d/ProcessGroup.hpp

Lines changed: 0 additions & 16 deletions
@@ -204,22 +204,6 @@ class ProcessGroup {
       const BarrierOptions& opts = BarrierOptions()) = 0;

  protected:
-  void checkSplitSizes(
-      const std::vector<int64_t>& split_sizes,
-      const at::Tensor& tensor,
-      int group_size);
-
-  int64_t computeLengthsAndOffsets(
-      const std::vector<int64_t>& split_sizes,
-      const at::Tensor& tensor,
-      std::vector<int>* lengths,
-      std::vector<int>* offsets);
-
-  int64_t computeLengthsAndOffsets(
-      const std::vector<at::Tensor>& tensors,
-      std::vector<int>* lengths,
-      std::vector<int>* offsets);
-
   const int rank_;
   const int size_;
 };
