
Commit 9af59ca

[v1.7 patch] Disallow creation of ProcessGroupNCCL without GPUs. (#45642)
Summary:

Note: This PR was merged into master at b5a2f04 after the 1.7 branch cut (see original PR: #45642). This PR merges it into the 1.7 branch.

---- Original Commit Description Follows ----

Pull Request resolved: #45642

Prior to #45181, initializing a NCCL process group would succeed even if no GPUs were present. However, now that init_process_group calls `barrier()`, this fails. More generally, the problem was that ProcessGroupNCCL could be initialized without any GPUs, and calling a method such as `barrier()` would then crash the process, since we compute `% numGPUs`, resulting in a division by zero.

ghstack-source-id: 113490343

Test Plan: waitforbuildbot

Reviewed By: osalpekar

Differential Revision: D24038839

fbshipit-source-id: a1f1db52cabcfb83e06c1a11ae9744afbf03f8dc
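The snippet below is a minimal sketch of the behavior this patch enforces, not part of the commit itself. It assumes a PyTorch build with NCCL support running on a machine with no visible GPUs; the temporary FileStore and world size of 1 are illustrative, mirroring the new test.

import tempfile

import torch
import torch.distributed as c10d

if torch.cuda.device_count() == 0:
    f = tempfile.NamedTemporaryFile(delete=False)
    store = c10d.FileStore(f.name, 1)  # rendezvous store, world_size=1
    try:
        # Before this patch: construction succeeded, and a later barrier()
        # computed `% numGPUs`, dividing by zero and crashing the process.
        c10d.ProcessGroupNCCL(store, 0, 1)  # rank=0, world_size=1
    except RuntimeError as e:
        # After this patch: a clear error is raised at construction time.
        print(e)  # "ProcessGroupNCCL is only supported with GPUs, no GPUs found!"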
1 parent 653d766 commit 9af59ca

3 files changed, +52 -7 lines changed

test/distributed/test_c10d.py

Lines changed: 39 additions & 7 deletions
@@ -29,7 +29,8 @@
 from torch.testing._internal.common_distributed import MultiProcessTestCase, \
     requires_gloo, requires_nccl, requires_nccl_version, \
     skip_if_not_multigpu, skip_if_lt_x_gpu, get_timeout, skip_if_rocm, \
-    simple_sparse_reduce_tests, skip_if_win32, create_device
+    skip_if_rocm_single_process, simple_sparse_reduce_tests, skip_if_win32, \
+    create_device
 
 from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, \
     retry_on_connect_failures, ADDRESS_IN_USE, CONNECT_TIMEOUT, TEST_WITH_TSAN
@@ -1594,13 +1595,30 @@ def create(num, prefix):
         self.assertEqual(torch.full([10, 10], float(self.world_size)), tensor)
         del pg
 
+class ProcessGroupNCCLNoGPUTest(TestCase):
+    MAIN_PROCESS_RANK = 0
+
+    def setUp(self):
+        self.rank = self.MAIN_PROCESS_RANK
+        self.world_size = 1
+        self.file = tempfile.NamedTemporaryFile(delete=False)
+        self.num_gpus = torch.cuda.device_count()
+        if self.num_gpus > 0:
+            raise unittest.SkipTest("GPUs are available, skipping test")
+
+    def tearDown(self):
+        pass
+
+    @requires_nccl()
+    @skip_if_rocm_single_process
+    def test_init_no_gpus(self):
+        store = c10d.FileStore(self.file.name, self.world_size)
+        with self.assertRaisesRegex(
+                RuntimeError,
+                "ProcessGroupNCCL is only supported with GPUs, no GPUs found!"):
+            c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+
 
-@requires_nccl()
-@unittest.skipIf(
-    TEST_WITH_TSAN,
-    "TSAN is not fork-safe since we're forking in a multi-threaded environment",
-)
-@skip_if_rocm
 class ProcessGroupNCCLTest(TestCase):
     MAIN_PROCESS_RANK = 0
 
@@ -1615,6 +1633,8 @@ def setUp(self):
     def tearDown(self):
         pass
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_empty_tensors(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
@@ -1639,6 +1659,8 @@ def test_empty_tensors(self):
         pg.reduce_scatter(ys, xs).wait()
         self.assertEqual(0, ys[0].numel())
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_broadcast_ops(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
@@ -1661,6 +1683,8 @@ def broadcast(xs, rootRank, rootTensor):
             for i in range(self.num_gpus):
                 self.assertEqual(tensors[i], tensors[rt])
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_allreduce_ops(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
@@ -1722,6 +1746,8 @@ def allreduce(tensors, op):
             with self.assertRaisesRegex(RuntimeError, "Cannot use " + str(op) + " with NCCL"):
                 allreduce(tensors, op)
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_reduce_ops(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
@@ -1752,6 +1778,8 @@ def reduce(xs, rootRank, rootTensor, op=None):
            with self.assertRaisesRegex(RuntimeError, "Cannot use " + str(op) + " with NCCL"):
                reduce(tensors, self.rank, rt, op)
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_allgather_ops(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
@@ -1777,6 +1805,8 @@ def allgather(output_ts, input_ts):
            for s_idx, t in enumerate(device_ts):
                self.assertEqual(torch.tensor([s_idx]), t)
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_reduce_scatter_ops(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
@@ -1854,6 +1884,8 @@ def reduce_scatter(outputs, input_lists, op):
            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
            self.assertEqualIgnoreType(expected, output[i])
 
+    @requires_nccl()
+    @skip_if_rocm_single_process
     def test_barrier(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

torch/lib/c10d/ProcessGroupNCCL.cpp

Lines changed: 2 additions & 0 deletions
@@ -450,6 +450,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
       opTimeout_(options.opTimeout),
       futureNCCLCallbackStreams_(c10::cuda::device_count()),
       isHighPriorityStream_(options.isHighPriorityStream) {
+  TORCH_CHECK(at::cuda::getNumGPUs() != 0,
+    "ProcessGroupNCCL is only supported with GPUs, no GPUs found!");
   try {
     parseNcclBlockingWait();
   } catch (std::exception& e) {
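A failed TORCH_CHECK surfaces in Python as a RuntimeError carrying the message above, which is exactly what the new test_init_no_gpus test asserts with assertRaisesRegex.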

torch/testing/_internal/common_distributed.py

Lines changed: 11 additions & 0 deletions
@@ -130,6 +130,17 @@ def requires_mpi():
         "c10d was not compiled with the MPI backend",
     )
 
+def skip_if_rocm_single_process(func):
+    """Skips a test for ROCm in a single process environment"""
+    func.skip_if_rocm = True
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        if not TEST_WITH_ROCM:
+            return func(*args, **kwargs)
+        raise unittest.SkipTest("Test skipped for ROCm")
+
+    return wrapper
 
 def skip_if_rocm(func):
     """Skips a test for ROCm"""
