
Commit d23f4c1

Fix Process Group for tensors shared across processes
1 parent 8215f44

File tree: 5 files changed, +79 -19 lines


c10/cuda/CUDACachingAllocator.cpp
Lines changed: 21 additions & 10 deletions

@@ -376,22 +376,33 @@ struct THCCachingAllocator
     cacheInfoAux(small_blocks, dev_id, total, largest);
   }
 
-  void recordStream(void* ptr, cuda::CUDAStream stream)
+  void recordStream(void* ptr, cuda::CUDAStream stream, bool suppressError=false)
   {
     // Empty tensor's storage().data() might be a null ptr. As there is no
     // blocks associated with those tensors, it is fine to do nothing here.
     if (ptr) {
       std::lock_guard<std::recursive_mutex> lock(mutex);
       Block* block = find_allocated_block(ptr);
       if (!block) {
-        AT_ERROR("invalid device pointer: ", ptr);
-      }
-      if (stream.stream() == block->stream) {
-        // ignore uses on the allocation stream, since those don't require any
-        // special synchronization
-        return;
+        // In some cases (e.g., a tensor loaded from a blob, or shared by
+        // another process), this CUDACachingAllocator does not know about the
+        // ptr, and the caller of this function might not have enough context
+        // to check where the tensor originated. One option is to expose a new
+        // API from CUDACachingAllocator to check whether it knows about the
+        // ptr, but that would force other use cases to unnecessarily do two
+        // map lookups (one check + one recordStream). Hence, we provide a
+        // suppressError argument to avoid the error and the double lookup.
+        if (!suppressError) {
+          AT_ERROR("invalid device pointer: ", ptr);
+        }
+      } else {
+        if (stream.stream() == block->stream) {
+          // ignore uses on the allocation stream, since those don't require any
+          // special synchronization
+          return;
+        }
+        block->stream_uses.insert(stream);
       }
-      block->stream_uses.insert(stream);
     }
   }
 
@@ -651,9 +662,9 @@ void* getBaseAllocation(void *ptr, size_t *size)
   return caching_allocator.getBaseAllocation(ptr, size);
 }
 
-void recordStream(void *ptr, cuda::CUDAStream stream)
+void recordStream(void *ptr, cuda::CUDAStream stream, bool suppressError)
 {
-  caching_allocator.recordStream(ptr, stream);
+  caching_allocator.recordStream(ptr, stream, suppressError);
 }
 
 std::mutex* getFreeMutex()
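
For illustration only, a minimal caller-side sketch of the new suppressError argument (the helper name recordMaybeForeign and the owns() check it contrasts with are hypothetical, not part of this commit): passing true keeps the single map lookup while letting recordStream ignore a pointer this allocator never allocated, such as CUDA storage shared in from another process.

// Hypothetical caller-side sketch; not part of the diff above.
#include <ATen/ATen.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAStream.h>

void recordMaybeForeign(const at::Tensor& t, c10::cuda::CUDAStream stream) {
  // Rejected alternative (two map lookups): first ask the allocator whether
  // it knows the pointer, then record the stream, e.g. with a hypothetical
  // CUDACachingAllocator::owns(t.storage().data()) check.
  //
  // Chosen design (one lookup): ask recordStream to ignore unknown pointers,
  // e.g. storage that was allocated by a different process and mapped here.
  c10::cuda::CUDACachingAllocator::recordStream(
      t.storage().data(), stream, /*suppressError=*/true);
}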

c10/cuda/CUDACachingAllocator.h
Lines changed: 1 addition & 1 deletion

@@ -46,7 +46,7 @@ C10_CUDA_API Allocator* get();
 C10_CUDA_API void emptyCache();
 C10_CUDA_API void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock);
 C10_CUDA_API void* getBaseAllocation(void *ptr, size_t *size);
-C10_CUDA_API void recordStream(void *ptr, CUDAStream stream);
+C10_CUDA_API void recordStream(void *ptr, CUDAStream stream, bool suppressError=false);
 C10_CUDA_API uint64_t currentMemoryAllocated(int device);
 C10_CUDA_API uint64_t maxMemoryAllocated(int device);
 C10_CUDA_API void resetMaxMemoryAllocated(int device);

test/test_c10d.py
Lines changed: 50 additions & 1 deletion

@@ -20,9 +20,10 @@
 import torch.nn.functional as F
 import torch.distributed as c10d
 import torch.distributed as dist
+import torch.multiprocessing as mp
 from torch.nn.parallel import DistributedDataParallel
 
-from common_utils import TestCase, load_tests, run_tests
+from common_utils import TestCase, load_tests, run_tests, PY3
 from common_utils import retry_on_address_already_in_use_error
 
 # load_tests from common_utils is used to automatically filter tests for
@@ -1606,6 +1607,54 @@ def allreduce(tensors):
                     tensors_list[i - 2][j])
 
 
+class ProcessGroupShareTensorTest(TestCase):
+
+    @property
+    def world_size(self):
+        return 2
+
+    def opts(threads=2):
+        opts = c10d.ProcessGroupGloo.Options()
+        opts.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
+        opts.timeout = 5.0
+        opts.threads = threads
+        return opts
+
+    def _test_allreduce_gloo_process(rank, filename, shared_tensors, world_size):
+        store = c10d.FileStore(filename, world_size)
+        pg = c10d.ProcessGroupGloo(
+            store, rank, world_size, ProcessGroupShareTensorTest.opts())
+        xs = [shared_tensors[rank]]
+        pg.allreduce(xs, op=c10d.ReduceOp.SUM).wait()
+        xs[0].to('cpu').allclose(torch.ones(2, 2))
+
+    @unittest.skipIf(not PY3, "Python 3 needed")
+    @skip_if_not_multigpu
+    def test_allreduce_gloo(self):
+        file = tempfile.NamedTemporaryFile(delete=False)
+        shared_tensors = [torch.ones(2, 2).to(i).share_memory_() for i in range(2)]
+        mp.spawn(ProcessGroupShareTensorTest._test_allreduce_gloo_process,
+                 args=(file.name, shared_tensors, self.world_size),
+                 nprocs=self.world_size,
+                 join=True)
+
+    def _test_allreduce_nccl_process(rank, filename, shared_tensors, world_size):
+        store = c10d.FileStore(filename, world_size)
+        pg = c10d.ProcessGroupNCCL(store, rank, world_size)
+        xs = [shared_tensors[rank]]
+        pg.allreduce(xs, op=c10d.ReduceOp.SUM).wait()
+        xs[0].to('cpu').allclose(torch.ones(2, 2))
+
+    @unittest.skipIf(not PY3, "Python 3 needed")
+    @skip_if_not_multigpu
+    def test_allreduce_nccl(self):
+        file = tempfile.NamedTemporaryFile(delete=False)
+        shared_tensors = [torch.ones(2, 2).to(i).share_memory_() for i in range(2)]
+        mp.spawn(ProcessGroupShareTensorTest._test_allreduce_nccl_process,
+                 args=(file.name, shared_tensors, self.world_size),
+                 nprocs=self.world_size,
+                 join=True)
+
 class Net(nn.Module):
     def __init__(self):
         super(Net, self).__init__()

torch/lib/c10d/ProcessGroupGloo.cpp
Lines changed: 2 additions & 2 deletions

@@ -161,7 +161,7 @@ void initializeStreamsEvents(
     // `tensors` are created on a different stream. Hence, they must record
     // new streams in this Work to prevent being freed before the Work finishes.
     c10::cuda::CUDACachingAllocator::recordStream(
-        tensors[i].storage().data(), streams[i]);
+        tensors[i].storage().data(), streams[i], true);
   }
 }
 
@@ -205,7 +205,7 @@ void initializeStreamsEvents(
       // new streams in this Work to prevent being freed before the Work
      // finishes.
       c10::cuda::CUDACachingAllocator::recordStream(
-          tensor.storage().data(), streams[i]);
+          tensor.storage().data(), streams[i], true);
     }
   }
 }

torch/lib/c10d/ProcessGroupNCCL.cpp
Lines changed: 5 additions & 5 deletions

@@ -414,7 +414,7 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupNCCL::collective(
     //
     // See [Sync Streams].
     c10::cuda::CUDACachingAllocator::recordStream(
-        inputs[i].storage().data(), ncclStream);
+        inputs[i].storage().data(), ncclStream, true);
 
     C10D_NCCL_CHECK(fn(
         inputs[i],
@@ -529,7 +529,7 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupNCCL::allgather(
     [&] (at::Tensor& input, at::Tensor& output,
         ncclComm_t comm, at::cuda::CUDAStream& stream) {
       c10::cuda::CUDACachingAllocator::recordStream(
-        output.storage().data(), stream
+        output.storage().data(), stream, true
       );
       return ncclAllGather(
           input.data_ptr(),
@@ -548,7 +548,7 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupNCCL::allgather(
       for (size_t j = 0; j < outputTensors[0].size(); ++j) {
         // See [Sync Streams].
         c10::cuda::CUDACachingAllocator::recordStream(
-            outputTensors[i][j].storage().data(), ncclStreams[i]);
+            outputTensors[i][j].storage().data(), ncclStreams[i], true);
 
         outputTensors[i][j].copy_(outputFlattened[i][j], true);
       }
@@ -572,7 +572,7 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupNCCL::reduce_scatter(
     [&] (at::Tensor& input, at::Tensor& output,
         ncclComm_t comm, at::cuda::CUDAStream& stream) {
       c10::cuda::CUDACachingAllocator::recordStream(
-        output.storage().data(), stream
+        output.storage().data(), stream, true
       );
       return ncclReduceScatter(
          input.data_ptr(),
@@ -591,7 +591,7 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupNCCL::reduce_scatter(
      for (size_t j = 0; j < inputTensors[0].size(); ++j) {
        // See [Sync Streams].
        c10::cuda::CUDACachingAllocator::recordStream(
-           inputTensors[i][j].storage().data(), ncclStreams[i]);
+           inputTensors[i][j].storage().data(), ncclStreams[i], true);
 
        inputFlattened[i][j].copy_(inputTensors[i][j], true);
      }
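
A condensed sketch of the [Sync Streams] pattern followed by the call sites above (the helper name launchOnSideStream and its body are illustrative, not PyTorch API): before enqueueing asynchronous work on a side stream, the process group records that stream with the caching allocator so the tensor's block is not reclaimed or reused while the collective may still be reading or writing it, and it now passes true because a tensor shared from another process may be backed by storage this allocator never allocated.

// Illustrative sketch of the pattern; not part of the diff above.
#include <ATen/ATen.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAStream.h>

void launchOnSideStream(at::Tensor& input, c10::cuda::CUDAStream sideStream) {
  // Mark input's block as also in use on sideStream, so the caching
  // allocator will not hand its memory to a new allocation until the work
  // queued on sideStream has completed. The third argument (suppressError)
  // keeps this a no-op for storage owned by another process.
  c10::cuda::CUDACachingAllocator::recordStream(
      input.storage().data(), sideStream, /*suppressError=*/true);

  // ... enqueue the actual collective kernel on sideStream here ...
}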
