
Commit 635cced

Addresses review
1 parent 1f553e0 commit 635cced

File tree: 6 files changed, +14 -42 lines changed

test/distributed/test_c10d_nccl.py

Lines changed: 0 additions & 15 deletions
@@ -1292,21 +1292,6 @@ def _test_fp16(self, gradient_as_bucket_view=False):
     def test_fp16(self):
         self._test_fp16()
 
-    @requires_nccl()
-    @requires_nccl_version((2, 17), "Need NCCL 2.17+ for configuring NCCL communicators")
-    @skip_if_lt_x_gpu(2)
-    def test_ddp_default_cga(self):
-        nccl_debug_file = tempfile.NamedTemporaryFile()
-        os.environ["NCCL_DEBUG"] = "INFO"
-        os.environ["NCCL_DEBUG_FILE"] = nccl_debug_file.name
-
-        self._test_fp16()
-
-        # Tests if default CGA for DDP is 2
-        nccl_debug_file_content = nccl_debug_file.read()
-        cga_cluster_size = re.search(rb'CGA cluster.*(\d+)|$', nccl_debug_file_content).group(1)
-        self.assertEqual(int(cga_cluster_size), 2)
-
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
     def test_fp16_grad_is_view(self):
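
Note: for reference, a minimal standalone sketch (not part of this commit) of the check the deleted test_ddp_default_cga performed: redirect NCCL's debug log to a file, run a collective, then parse the reported CGA cluster size. The helper name read_cga_cluster_size and the run_collectives callback are illustrative; the environment variables and regex are taken from the removed test and assume an NCCL 2.17+ build that emits a "CGA cluster" line.

import os
import re
import tempfile

def read_cga_cluster_size(run_collectives):
    # Capture NCCL's INFO-level log in a temporary file (must be set before the
    # NCCL communicator is initialized).
    nccl_debug_file = tempfile.NamedTemporaryFile()
    os.environ["NCCL_DEBUG"] = "INFO"
    os.environ["NCCL_DEBUG_FILE"] = nccl_debug_file.name

    # Any collective that initializes the NCCL communicator, e.g. a DDP forward/backward.
    run_collectives()

    # Parse the CGA cluster size out of the debug log; returns None if the line is absent.
    content = nccl_debug_file.read()
    match = re.search(rb"CGA cluster.*(\d+)|$", content)
    return int(match.group(1)) if match.group(1) is not None else None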

torch/csrc/distributed/c10d/NCCLUtils.hpp

Lines changed: 10 additions & 13 deletions
@@ -53,9 +53,9 @@
 #endif
 
 #if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && defined(NCCL_MINOR) && (NCCL_MINOR >= 17)
-#define ENABLE_NCCL_RANK_CONFIG
+#define NCCL_HAS_COMM_CTA_CGA
 #elif defined(NCCL_MAJOR) && (NCCL_MAJOR >= 3)
-#define ENABLE_NCCL_RANK_CONFIG
+#define NCCL_HAS_COMM_CTA_CGA
 #endif
 
 // Macro to throw on a non-successful NCCL return value.
@@ -185,31 +185,28 @@ class NCCLComm {
       int rank,
       ncclUniqueId commId) {
     auto comm = std::make_shared<NCCLComm>();
-#ifndef NCCL_HAS_COMM_NONBLOCKING
     C10D_NCCL_CHECK(
         ncclCommInitRank(&(comm->ncclComm_), numRanks, commId, rank), c10::nullopt);
-#else
-    ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
-    if (nccl_use_nonblocking()) {
-      config.blocking = 0;
-    }
-    C10D_NCCL_CHECK_TIMEOUT(
-        ncclCommInitRankConfig(&(comm->ncclComm_), numRanks, commId, rank, &config), comm->ncclComm_, c10::nullopt);
-#endif
     comm->ncclId_ = commId;
     comm->rank_ = rank;
     return comm;
   }
 
-#ifdef ENABLE_NCCL_RANK_CONFIG
+#ifdef NCCL_HAS_COMM_NONBLOCKING
   static std::shared_ptr<NCCLComm> create(
       int numRanks,
       int rank,
       ncclUniqueId commId,
       ncclConfig_t& config) {
     auto comm = std::make_shared<NCCLComm>();
-    C10D_NCCL_CHECK(
+    if (nccl_use_nonblocking()) {
+      config.blocking = 0;
+      C10D_NCCL_CHECK_TIMEOUT(
+          ncclCommInitRankConfig(&(comm->ncclComm_), numRanks, commId, rank, &config), comm->ncclComm_, c10::nullopt);
+    } else {
+      C10D_NCCL_CHECK(
         ncclCommInitRankConfig(&(comm->ncclComm_), numRanks, commId, rank, &config), c10::nullopt);
+    }
     comm->ncclId_ = commId;
     comm->rank_ = rank;
     return comm;
torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

Lines changed: 1 addition & 1 deletion
@@ -1150,7 +1150,7 @@ std::vector<std::shared_ptr<NCCLComm>>& ProcessGroupNCCL::getNCCLComm(
     int deviceIndex = devices[i].index();
 
     gpuGuard.set_index(deviceIndex);
-#ifdef ENABLE_NCCL_RANK_CONFIG
+#ifdef NCCL_HAS_COMM_NONBLOCKING
     ncclComms[i] = NCCLComm::create(numRanks, rank, ncclID, options_->config);
 #else
     ncclComms[i] = NCCLComm::create(numRanks, rank, ncclID);

torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp

Lines changed: 1 addition & 1 deletion
@@ -280,7 +280,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
     // Schedule NCCL operations on high priority CUDA streams
     bool is_high_priority_stream;
 
-#ifdef ENABLE_NCCL_RANK_CONFIG
+#ifdef NCCL_HAS_COMM_NONBLOCKING
     // Configure ranks
     ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 #endif

torch/csrc/distributed/c10d/init.cpp

Lines changed: 2 additions & 2 deletions
@@ -1977,7 +1977,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
       .def_property_readonly(
           "is_ucc_available", &::c10d::ProcessGroupNCCL::isUCCAvailable);
 
-#ifdef ENABLE_NCCL_RANK_CONFIG
+#ifdef NCCL_HAS_COMM_CTA_CGA
   py::class_<ncclConfig_t>(
       processGroupNCCL,
       "NCCLConfig",
@@ -2027,7 +2027,7 @@ Example::
         >>> dist.init_process_group("nccl", pg_options=nccl_options)
       )")
       .def(py::init<bool>(), py::arg("is_high_priority_stream") = false)
-#ifdef ENABLE_NCCL_RANK_CONFIG
+#ifdef NCCL_HAS_COMM_CTA_CGA
       .def_readwrite(
           "is_high_priority_stream",
          &::c10d::ProcessGroupNCCL::Options::is_high_priority_stream)
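
Note: a usage sketch (not part of this commit) of the Options/NCCLConfig bindings gated above by NCCL_HAS_COMM_CTA_CGA, assuming PyTorch is built against NCCL 2.17+ so the config attribute is exposed; the config.cga_cluster_size attribute name is taken from the code removed in torch/nn/parallel/distributed.py below.

import torch.distributed as dist

# Build NCCL options and tune the communicator config before creating the process group.
nccl_options = dist.ProcessGroupNCCL.Options()
nccl_options.is_high_priority_stream = False
nccl_options.config.cga_cluster_size = 2  # type: ignore[attr-defined]

# Pass the options through to the NCCL backend, as in the docstring example above.
dist.init_process_group("nccl", pg_options=nccl_options)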

torch/nn/parallel/distributed.py

Lines changed: 0 additions & 10 deletions
@@ -749,16 +749,6 @@ def __init__(
         else:
             self.process_group = process_group
 
-        if dist.get_backend(
-            self.process_group
-        ) == "nccl" and torch.cuda.nccl.version() >= (2, 17):
-            # Note: NVIDIA recommends using CGA Cluster Size of 2 when using DDP.
-            default_cga = dist.ProcessGroupNCCL.Options().config.cga_cluster_size  # type: ignore[attr-defined]
-            default_pg_nccl = self.process_group._get_backend(torch.device("cuda"))
-            current_cga = default_pg_nccl.options.config.cga_cluster_size
-            if current_cga == default_cga:
-                default_pg_nccl.options.config.cga_cluster_size = 2
-
         self.static_graph = False
         self.dim = dim
         self.module = module
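
Note: with the block above removed, DDP no longer bumps the CGA cluster size to 2 on NCCL 2.17+. Below is a hedged sketch (not part of this commit) of how that behavior could be applied manually on an existing process group; the _get_backend/options/config attribute chain is copied from the deleted code and relies on private APIs.

import torch
import torch.distributed as dist

def set_nccl_cga_cluster_size(process_group, cluster_size=2):
    # Only meaningful for the NCCL backend with NCCL 2.17+ (same guard as the removed code).
    if dist.get_backend(process_group) == "nccl" and torch.cuda.nccl.version() >= (2, 17):
        # NVIDIA recommends a CGA cluster size of 2 when using DDP (per the removed comment).
        pg_nccl = process_group._get_backend(torch.device("cuda"))
        pg_nccl.options.config.cga_cluster_size = cluster_size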

0 commit comments

Comments
 (0)