
Commit 9af59ca

[v1.7 patch] Disallow creation of ProcessGroupNCCL without GPUs. (#45642)
Summary:

Note: This PR was merged into master at b5a2f04 after the 1.7 branch cut (see original PR: #45642). This PR merges it into the 1.7 branch.

---- Original Commit Description Follows ----

Pull Request resolved: #45642

Prior to #45181, initializing a NCCL process group would succeed even if no GPUs were present. However, now that init_process_group calls `barrier()`, this fails. More generally, the problem was that ProcessGroupNCCL could be initialized without any GPUs, and calling a method such as `barrier()` would then crash the process, since we compute `% numGPUs`, resulting in a division by zero.

ghstack-source-id: 113490343

Test Plan: waitforbuildbot

Reviewed By: osalpekar

Differential Revision: D24038839

fbshipit-source-id: a1f1db52cabcfb83e06c1a11ae9744afbf03f8dc
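The snippet below is a minimal sketch of the behavior this patch enforces, not part of the commit itself. It assumes a PyTorch build with NCCL support running on a machine with no visible GPUs; the temporary FileStore and world size of 1 are illustrative, mirroring the new test.

import tempfile

import torch
import torch.distributed as c10d

if torch.cuda.device_count() == 0:
    f = tempfile.NamedTemporaryFile(delete=False)
    store = c10d.FileStore(f.name, 1)  # rendezvous store, world_size=1
    try:
        # Before this patch: construction succeeded, and a later barrier()
        # computed `% numGPUs`, dividing by zero and crashing the process.
        c10d.ProcessGroupNCCL(store, 0, 1)  # rank=0, world_size=1
    except RuntimeError as e:
        # After this patch: a clear error is raised at construction time.
        print(e)  # "ProcessGroupNCCL is only supported with GPUs, no GPUs found!"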
1 parent 653d766 commit 9af59ca

3 files changed, +52 -7 lines changed

test/distributed/test_c10d.py

Lines changed: 39 additions & 7 deletions
@@ -29,7 +29,8 @@
 from torch.testing._internal.common_distributed import MultiProcessTestCase, \
     requires_gloo, requires_nccl, requires_nccl_version, \
     skip_if_not_multigpu, skip_if_lt_x_gpu, get_timeout, skip_if_rocm, \
-    simple_sparse_reduce_tests, skip_if_win32, create_device
+    skip_if_rocm_single_process, simple_sparse_reduce_tests, skip_if_win32, \
+    create_device
 
 from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, \
     retry_on_connect_failures, ADDRESS_IN_USE, CONNECT_TIMEOUT, TEST_WITH_TSAN
@@ -1594,13 +1595,30 @@ def create(num, prefix):
         self.assertEqual(torch.full([10, 10], float(self.world_size)), tensor)
         del pg
 
+class ProcessGroupNCCLNoGPUTest(TestCase):
+    MAIN_PROCESS_RANK = 0
+
+    def setUp(self):
+        self.rank = self.MAIN_PROCESS_RANK
+        self.world_size = 1
+        self.file = tempfile.NamedTemporaryFile(delete=False)
+        self.num_gpus = torch.cuda.device_count()
+        if self.num_gpus > 0:
+            raise unittest.SkipTest("GPUs are available, skipping test")
+
+    def tearDown(self):
+        pass
+
+    @requires_nccl()
+    @skip_if_rocm_single_process
+    def test_init_no_gpus(self):
+        store = c10d.FileStore(self.file.name, self.world_size)
+        with self.assertRaisesRegex(
+                RuntimeError,
+                "ProcessGroupNCCL is only supported with GPUs, no GPUs found!"):
+            c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+
 
-@requires_nccl()
-@unittest.skipIf(
-    TEST_WITH_TSAN,
-    "TSAN is not fork-safe since we're forking in a multi-threaded environment",
-)
-@skip_if_rocm
 class ProcessGroupNCCLTest(TestCase):
     MAIN_PROCESS_RANK = 0
 
@@ -1615,6 +1633,8 @@ def setUp(self):
     def tearDown(self):
         pass
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_empty_tensors(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
@@ -1639,6 +1659,8 @@ def test_empty_tensors(self):
         pg.reduce_scatter(ys, xs).wait()
         self.assertEqual(0, ys[0].numel())
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_broadcast_ops(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
@@ -1661,6 +1683,8 @@ def broadcast(xs, rootRank, rootTensor):
             for i in range(self.num_gpus):
                 self.assertEqual(tensors[i], tensors[rt])
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_allreduce_ops(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
@@ -1722,6 +1746,8 @@ def allreduce(tensors, op):
             with self.assertRaisesRegex(RuntimeError, "Cannot use " + str(op) + " with NCCL"):
                 allreduce(tensors, op)
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_reduce_ops(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
@@ -1752,6 +1778,8 @@ def reduce(xs, rootRank, rootTensor, op=None):
            with self.assertRaisesRegex(RuntimeError, "Cannot use " + str(op) + " with NCCL"):
                reduce(tensors, self.rank, rt, op)
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_allgather_ops(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
@@ -1777,6 +1805,8 @@ def allgather(output_ts, input_ts):
            for s_idx, t in enumerate(device_ts):
                self.assertEqual(torch.tensor([s_idx]), t)
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_reduce_scatter_ops(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
@@ -1854,6 +1884,8 @@ def reduce_scatter(outputs, input_lists, op):
            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
            self.assertEqualIgnoreType(expected, output[i])
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_barrier(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

torch/lib/c10d/ProcessGroupNCCL.cpp

Lines changed: 2 additions & 0 deletions
@@ -450,6 +450,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
       opTimeout_(options.opTimeout),
       futureNCCLCallbackStreams_(c10::cuda::device_count()),
       isHighPriorityStream_(options.isHighPriorityStream) {
+  TORCH_CHECK(at::cuda::getNumGPUs() != 0,
+    "ProcessGroupNCCL is only supported with GPUs, no GPUs found!");
   try {
     parseNcclBlockingWait();
   } catch (std::exception& e) {
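A failed TORCH_CHECK surfaces in Python as a RuntimeError carrying the message above, which is exactly what the new test_init_no_gpus test asserts with assertRaisesRegex.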

torch/testing/_internal/common_distributed.py

Lines changed: 11 additions & 0 deletions
@@ -130,6 +130,17 @@ def requires_mpi():
         "c10d was not compiled with the MPI backend",
     )
 
+def skip_if_rocm_single_process(func):
+    """Skips a test for ROCm in a single process environment"""
+    func.skip_if_rocm = True
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        if not TEST_WITH_ROCM:
+            return func(*args, **kwargs)
+        raise unittest.SkipTest("Test skipped for ROCm")
+
+    return wrapper
 
 def skip_if_rocm(func):
     """Skips a test for ROCm"""
