1 change: 1 addition & 0 deletions .jenkins/pytorch/multigpu-test.sh
@@ -29,4 +29,5 @@ fi
 
 time python test/run_test.py --verbose -i distributed
 time python test/run_test.py --verbose -i c10d
+time python test/run_test.py --verbose -i c10d_spawn
 assert_git_not_dirty
17 changes: 9 additions & 8 deletions c10/cuda/CUDACachingAllocator.cpp
@@ -383,15 +383,16 @@ struct THCCachingAllocator
     if (ptr) {
       std::lock_guard<std::recursive_mutex> lock(mutex);
       Block* block = find_allocated_block(ptr);
-      if (!block) {
-        AT_ERROR("invalid device pointer: ", ptr);
-      }
-      if (stream.stream() == block->stream) {
-        // ignore uses on the allocation stream, since those don't require any
-        // special synchronization
-        return;
+      // block could be nullptr in some cases, e.g., tensor loaded from blob, or
+      // shared from another process, or not pointing to a CUDA tensor.
+      if (block) {
+        if (stream.stream() == block->stream) {
+          // ignore uses on the allocation stream, since those don't require any
+          // special synchronization
+          return;
+        }
+        block->stream_uses.insert(stream);
       }
-      block->stream_uses.insert(stream);
     }
   }

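Note on the allocator change above: a CUDA tensor that arrives from another process (or is built from a blob) is not backed by this process's caching allocator, so find_allocated_block() returns nullptr; the old code then raised "invalid device pointer", while the patched recordStream() simply skips stream tracking for such pointers. Below is a minimal, illustrative Python sketch of one such scenario, not code from this PR: the producer/consumer structure and names are mine, it assumes a CUDA-capable machine, and it reaches the same allocator path through Tensor.record_stream().

import torch
import torch.multiprocessing as mp


def consumer(q):
    # The tensor arrives via CUDA IPC, so its storage was never allocated by
    # this process's caching allocator and has no Block entry there.
    t = q.get()
    s = torch.cuda.Stream()
    t.record_stream(s)  # went through the removed AT_ERROR path before this patch


if __name__ == '__main__':
    ctx = mp.get_context('spawn')
    q = ctx.Queue()
    p = ctx.Process(target=consumer, args=(q,))
    p.start()
    x = torch.ones(2, 2, device='cuda')  # producer keeps x alive until join
    q.put(x)
    p.join()
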
1 change: 1 addition & 0 deletions test/run_test.py
@@ -21,6 +21,7 @@
     'autograd',
     'cpp_extensions',
     'c10d',
+    'c10d_spawn',
     'cuda',
     'cuda_primary_ctx',
     'dataloader',
192 changes: 192 additions & 0 deletions test/test_c10d_spawn.py
@@ -0,0 +1,192 @@
import sys
import tempfile
import unittest

import torch
import torch.distributed as c10d
import torch.multiprocessing as mp

from common_cuda import TEST_MULTIGPU
from common_utils import TestCase, load_tests, run_tests
from common_utils import NO_MULTIPROCESSING_SPAWN

# load_tests from common_utils is used to automatically filter tests for
# sharding on sandcastle. This line silences flake warnings
load_tests = load_tests

if not c10d.is_available():
    print('c10d not available, skipping tests')
    sys.exit(0)


if NO_MULTIPROCESSING_SPAWN:
    print('spawn not available, skipping tests')
    sys.exit(0)


NO_NCCL = not hasattr(c10d, "ProcessGroupNCCL")


class ProcessGroupShareTensorTest(TestCase):

    world_size = 2

    @classmethod
    def opts(cls, threads=2):
        opts = c10d.ProcessGroupGloo.Options()
        opts.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
        opts.timeout = 5.0
        opts.threads = threads
        return opts

    @classmethod
    def _init_pg_gloo(cls, rank, filename, world_size):
        store = c10d.FileStore(filename, world_size)
        return c10d.ProcessGroupGloo(
            store, rank, world_size, ProcessGroupShareTensorTest.opts())

    @classmethod
    def _init_pg_nccl(cls, rank, filename, world_size):
        store = c10d.FileStore(filename, world_size)
        return c10d.ProcessGroupNCCL(store, rank, world_size)

    def _test_multiprocess(self, f, shared_tensors, init_pg, n_output):
        ws = self.world_size
        # file store will delete the test file on destruction
        file = tempfile.NamedTemporaryFile(delete=False)
        ctx = mp.get_context('spawn')
        c2p = ctx.Queue(2)
        p2c = ctx.Queue(2)
        ps = []
        for i in range(ws):
            p = ctx.Process(
                target=f,
                args=(i, file.name, shared_tensors, ws, init_pg, c2p, p2c))

            p.start()
            ps.append(p)

        for _ in range(ws * n_output):
            pid, expected, result = c2p.get()
            self.assertEqual(
                expected,
                result,
                (
                    "Expect rank {} to broadcast result {} but got {}."
                ).format(pid, expected, result)
            )

        for _ in range(ws):
            p2c.put(0)

        for p in ps:
            p.join(2)

    # Why classmethod? multiprocessing cannot pickle TestCase subclass when in
    # spawn mode. See https://bugs.python.org/issue33884.
[Review comment] ezyang (Contributor), Jun 7, 2019:
FWIW, test_multiprocessing.py does this by just having the test runner methods as honest to goodness top-level functions. This is just an informational comment, since a class method is just as good.
    @classmethod
    def _test_broadcast_process(
            cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c):
        pg = init_pg(rank, filename, world_size)
        xs = [shared_tensors[rank]]
        pg.broadcast(xs).wait()
        c2p.put((rank, torch.zeros(2, 2), xs[0].to("cpu")))
        p2c.get()

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    def test_shared_broadcast_gloo(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_broadcast_process,
            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_gloo,
            1)

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    @unittest.skipIf(NO_NCCL, "NCCL needed")
    def test_shared_broadcast_nccl(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_broadcast_process,
            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_nccl,
            1)

    @classmethod
    def _test_allreduce_process(
            cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c):
        pg = init_pg(rank, filename, world_size)
        xs = [shared_tensors[rank]]
        pg.allreduce(xs, op=c10d.ReduceOp.SUM).wait()
        c2p.put((rank, torch.ones(2, 2) * 2, xs[0].to("cpu")))
        p2c.get()

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    def test_shared_allreduce_gloo(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_allreduce_process,
            [torch.ones(2, 2).to(i) for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_gloo,
            1)

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    @unittest.skipIf(NO_NCCL, "NCCL needed")
    def test_shared_allreduce_nccl(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_allreduce_process,
            [torch.ones(2, 2).to(i) for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_nccl,
            1)

    @classmethod
    def _test_reduce_process(
            cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c):
        pg = init_pg(rank, filename, world_size)
        x = shared_tensors[rank]
        pg.reduce(x, root=0, op=c10d.ReduceOp.SUM).wait()
        if rank == 0:
            c2p.put((rank, torch.ones(2, 2) * 2, x.to("cpu")))
        else:
            c2p.put((rank, torch.ones(2, 2), x.to("cpu")))
        p2c.get()

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    @unittest.skipIf(NO_NCCL, "NCCL needed")
    def test_shared_reduce_nccl(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_reduce_process,
            [torch.ones(2, 2).to(i) for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_nccl,
            1)

    @classmethod
    def _test_allgather_process(
            cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c):
        pg = init_pg(rank, filename, world_size)
        xs = [shared_tensors[rank]]
        ys = [[torch.zeros_like(xs[0]) for i in range(world_size)]]
        pg.allgather(ys, xs).wait()
        for i in range(world_size):
            c2p.put((rank, torch.ones(2, 2) * i, ys[0][i].to("cpu")))

        p2c.get()

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    def test_shared_allgather_gloo(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_allgather_process,
            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_gloo,
            self.world_size)

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    @unittest.skipIf(NO_NCCL, "NCCL needed")
    def test_shared_allgather_nccl(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_allgather_process,
            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_nccl,
            self.world_size)


if __name__ == '__main__':
    run_tests()