
Commit 25d1496

mrshenli authored and facebook-github-bot committed
Fix Process Group for tensors shared across processes (#21449)
Summary: Ops on a Process Group (pg) instance will hit an error when input/output tensors are created in a different process, because pg calls `recordStream` on `CUDACachingAllocator`, which only knows about tensors created within the same process. The proposed solution is to add a `suppressError` arg (suggestions for better names?) to `recordStream`. See comments in code for arguments. CC pichuang1984

Pull Request resolved: #21449
Differential Revision: D15689736
Pulled By: mrshenli
fbshipit-source-id: e7fc81b167868f8666536067eaa7ae2c8584d88e
1 parent 50ee1f3 commit 25d1496
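For context, here is a minimal sketch of the pattern this commit unblocks, adapted from the new test_c10d_spawn.py below (the two-GPU requirement and the Gloo options mirror that test; the surrounding harness is illustrative, not part of the commit): the parent process allocates CUDA tensors, spawn-started workers receive them over CUDA IPC, and each worker's collective triggers `recordStream` on memory its local allocator never allocated.

# Illustrative sketch (not part of the commit), adapted from test_c10d_spawn.py.
# Assumes at least 2 GPUs and a c10d build with the Gloo backend.
import tempfile

import torch
import torch.distributed as c10d
import torch.multiprocessing as mp


def worker(rank, filename, shared_tensors, world_size):
    store = c10d.FileStore(filename, world_size)
    opts = c10d.ProcessGroupGloo.Options()
    opts.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
    opts.timeout = 5.0
    pg = c10d.ProcessGroupGloo(store, rank, world_size, opts)
    # shared_tensors[rank] was allocated by the parent process, so this
    # worker's CUDACachingAllocator has no block for its device pointer.
    # Before this fix, the recordStream call inside the collective raised
    # "invalid device pointer"; after it, the unknown pointer is ignored.
    pg.broadcast([shared_tensors[rank]]).wait()


if __name__ == '__main__':
    world_size = 2
    file = tempfile.NamedTemporaryFile(delete=False)
    tensors = [torch.ones(2, 2).to(i) * i for i in range(world_size)]
    mp.spawn(worker, args=(file.name, tensors, world_size), nprocs=world_size)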

File tree

4 files changed: +203 -8 lines changed


.jenkins/pytorch/multigpu-test.sh

Lines changed: 1 addition & 0 deletions
@@ -29,4 +29,5 @@ fi
 
 time python test/run_test.py --verbose -i distributed
 time python test/run_test.py --verbose -i c10d
+time python test/run_test.py --verbose -i c10d_spawn
 assert_git_not_dirty

c10/cuda/CUDACachingAllocator.cpp

Lines changed: 9 additions & 8 deletions
@@ -383,15 +383,16 @@ struct THCCachingAllocator
     if (ptr) {
       std::lock_guard<std::recursive_mutex> lock(mutex);
       Block* block = find_allocated_block(ptr);
-      if (!block) {
-        AT_ERROR("invalid device pointer: ", ptr);
-      }
-      if (stream.stream() == block->stream) {
-        // ignore uses on the allocation stream, since those don't require any
-        // special synchronization
-        return;
+      // block could be nullptr in some cases, e.g., tensor loaded from blob, or
+      // shared from another process, or not pointing to a CUDA tensor.
+      if (block) {
+        if (stream.stream() == block->stream) {
+          // ignore uses on the allocation stream, since those don't require any
+          // special synchronization
+          return;
+        }
+        block->stream_uses.insert(stream);
       }
-      block->stream_uses.insert(stream);
     }
   }
 
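In Python terms, the relaxed `recordStream` means that recording a stream on a tensor whose memory the local caching allocator never allocated (for example, one received from another process over CUDA IPC) is now silently skipped rather than raising "invalid device pointer". A hedged sketch of that isolated call path, assuming `torch.Tensor.record_stream` forwards to `CUDACachingAllocator::recordStream` (the spawn harness here is illustrative, not from the commit):

# Illustrative sketch (not part of the commit); assumes at least 1 GPU.
import torch
import torch.multiprocessing as mp


def child(rank, shared):
    # `shared` was allocated by the parent and rebuilt here via CUDA IPC, so
    # the child's CUDACachingAllocator has no block for its pointer.
    # Pre-fix: AT_ERROR("invalid device pointer: ...").  Post-fix: no-op.
    shared.record_stream(torch.cuda.current_stream(shared.device))


if __name__ == '__main__':
    t = torch.ones(2, 2, device='cuda:0')
    mp.spawn(child, args=(t,), nprocs=1)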

test/run_test.py

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@
     'autograd',
     'cpp_extensions',
     'c10d',
+    'c10d_spawn',
     'cuda',
     'cuda_primary_ctx',
     'dataloader',

test/test_c10d_spawn.py

Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
+import sys
+import tempfile
+import unittest
+
+import torch
+import torch.distributed as c10d
+import torch.multiprocessing as mp
+
+from common_cuda import TEST_MULTIGPU
+from common_utils import TestCase, load_tests, run_tests
+from common_utils import NO_MULTIPROCESSING_SPAWN
+
+# load_tests from common_utils is used to automatically filter tests for
+# sharding on sandcastle. This line silences flake warnings
+load_tests = load_tests
+
+if not c10d.is_available():
+    print('c10d not available, skipping tests')
+    sys.exit(0)
+
+
+if NO_MULTIPROCESSING_SPAWN:
+    print('spawn not available, skipping tests')
+    sys.exit(0)
+
+
+NO_NCCL = not hasattr(c10d, "ProcessGroupNCCL")
+
+
+class ProcessGroupShareTensorTest(TestCase):
+
+    world_size = 2
+
+    @classmethod
+    def opts(cls, threads=2):
+        opts = c10d.ProcessGroupGloo.Options()
+        opts.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
+        opts.timeout = 5.0
+        opts.threads = threads
+        return opts
+
+    @classmethod
+    def _init_pg_gloo(cls, rank, filename, world_size):
+        store = c10d.FileStore(filename, world_size)
+        return c10d.ProcessGroupGloo(
+            store, rank, world_size, ProcessGroupShareTensorTest.opts())
+
+    @classmethod
+    def _init_pg_nccl(cls, rank, filename, world_size):
+        store = c10d.FileStore(filename, world_size)
+        return c10d.ProcessGroupNCCL(store, rank, world_size)
+
+    def _test_multiprocess(self, f, shared_tensors, init_pg, n_output):
+        ws = self.world_size
+        # file store will delete the test file on destruction
+        file = tempfile.NamedTemporaryFile(delete=False)
+        ctx = mp.get_context('spawn')
+        c2p = ctx.Queue(2)
+        p2c = ctx.Queue(2)
+        ps = []
+        for i in range(ws):
+            p = ctx.Process(
+                target=f,
+                args=(i, file.name, shared_tensors, ws, init_pg, c2p, p2c))
+
+            p.start()
+            ps.append(p)
+
+        for _ in range(ws * n_output):
+            pid, expected, result = c2p.get()
+            self.assertEqual(
+                expected,
+                result,
+                (
+                    "Expect rank {} to broadcast result {} but got {}."
+                ).format(pid, expected, result)
+            )
+
+        for _ in range(ws):
+            p2c.put(0)
+
+        for p in ps:
+            p.join(2)
+
+    # Why classmethod? multiprocessing cannot pickle TestCase subclass when in
+    # spawn mode. See https://bugs.python.org/issue33884.
+    @classmethod
+    def _test_broadcast_process(
+            cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c):
+        pg = init_pg(rank, filename, world_size)
+        xs = [shared_tensors[rank]]
+        pg.broadcast(xs).wait()
+        c2p.put((rank, torch.zeros(2, 2), xs[0].to("cpu")))
+        p2c.get()
+
+    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
+    def test_shared_broadcast_gloo(self):
+        self._test_multiprocess(
+            ProcessGroupShareTensorTest._test_broadcast_process,
+            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
+            ProcessGroupShareTensorTest._init_pg_gloo,
+            1)
+
+
+    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
+    @unittest.skipIf(NO_NCCL, "NCCL needed")
+    def test_shared_broadcast_nccl(self):
+        self._test_multiprocess(
+            ProcessGroupShareTensorTest._test_broadcast_process,
+            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
+            ProcessGroupShareTensorTest._init_pg_nccl,
+            1)
+
+    @classmethod
+    def _test_allreduce_process(
+            cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c):
+        pg = init_pg(rank, filename, world_size)
+        xs = [shared_tensors[rank]]
+        pg.allreduce(xs, op=c10d.ReduceOp.SUM).wait()
+        c2p.put((rank, torch.ones(2, 2) * 2, xs[0].to("cpu")))
+        p2c.get()
+
+    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
+    def test_shared_allreduce_gloo(self):
+        self._test_multiprocess(
+            ProcessGroupShareTensorTest._test_allreduce_process,
+            [torch.ones(2, 2).to(i) for i in range(self.world_size)],
+            ProcessGroupShareTensorTest._init_pg_gloo,
+            1)
+
+    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
+    @unittest.skipIf(NO_NCCL, "NCCL needed")
+    def test_shared_allreduce_nccl(self):
+        self._test_multiprocess(
+            ProcessGroupShareTensorTest._test_allreduce_process,
+            [torch.ones(2, 2).to(i) for i in range(self.world_size)],
+            ProcessGroupShareTensorTest._init_pg_nccl,
+            1)
+
+    @classmethod
+    def _test_reduce_process(
+            cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c):
+        pg = init_pg(rank, filename, world_size)
+        x = shared_tensors[rank]
+        pg.reduce(x, root=0, op=c10d.ReduceOp.SUM).wait()
+        if rank == 0:
+            c2p.put((rank, torch.ones(2, 2) * 2, x.to("cpu")))
+        else:
+            c2p.put((rank, torch.ones(2, 2), x.to("cpu")))
+        p2c.get()
+
+    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
+    @unittest.skipIf(NO_NCCL, "NCCL needed")
+    def test_shared_reduce_nccl(self):
+        self._test_multiprocess(
+            ProcessGroupShareTensorTest._test_reduce_process,
+            [torch.ones(2, 2).to(i) for i in range(self.world_size)],
+            ProcessGroupShareTensorTest._init_pg_nccl,
+            1)
+
+    @classmethod
+    def _test_allgather_process(
+            cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c):
+        pg = init_pg(rank, filename, world_size)
+        xs = [shared_tensors[rank]]
+        ys = [[torch.zeros_like(xs[0]) for i in range(world_size)]]
+        pg.allgather(ys, xs).wait()
+        for i in range(world_size):
+            c2p.put((rank, torch.ones(2, 2) * i, ys[0][i].to("cpu")))
+
+        p2c.get()
+
+    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
+    def test_shared_allgather_gloo(self):
+        self._test_multiprocess(
+            ProcessGroupShareTensorTest._test_allgather_process,
+            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
+            ProcessGroupShareTensorTest._init_pg_gloo,
+            self.world_size)
+
+    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
+    @unittest.skipIf(NO_NCCL, "NCCL needed")
+    def test_shared_allgather_nccl(self):
+        self._test_multiprocess(
+            ProcessGroupShareTensorTest._test_allgather_process,
+            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
+            ProcessGroupShareTensorTest._init_pg_nccl,
+            self.world_size)
+
+
+if __name__ == '__main__':
+    run_tests()
