
Commit 39d4121

mrshenli authored and facebook-github-bot committed
Fix ProcessGroupGloo allgather for tensors with shared storage (#21490)
Summary:
Fix #20421

`ProcessGroupGloo` only requires input/output tensors to be contiguous. Contiguous tensors might not start from the beginning of the underlying storage, e.g., `chunk(..., dim=0)[1]`. The current implementation passes the `tensor.storage().data()` ptr to the gloo buffer. This leads to wrong results if the tensor has a non-zero storage offset. The proposed solution is to use `tensor.data_ptr()` instead. Let's see if this breaks any tests.

cc qijianan777

Pull Request resolved: #21490

Differential Revision: D15768907

Pulled By: mrshenli

fbshipit-source-id: 9d7d1e9baf0461b31187c7d21a4a53b1fbb07397
1 parent ad73ea2 commit 39d4121
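
For context only (not part of this commit), here is a minimal Python sketch of the situation the summary describes: a chunk(..., dim=0) view can be contiguous while still starting at a non-zero storage offset, so the storage base pointer is not the right address to hand to gloo.

    import torch

    full = torch.arange(4).reshape(2, 2)        # shared storage for both chunks
    second = torch.chunk(full, 2, dim=0)[1]     # contiguous view into the same storage

    assert second.is_contiguous()               # contiguity check passes...
    assert second.storage_offset() == 2         # ...but the view starts 2 elements in

    # The old code handed gloo the storage base address; the fix hands it
    # data_ptr(), which already accounts for the storage offset.
    assert second.data_ptr() != second.storage().data_ptr()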

File tree

2 files changed: +28 -3 lines changed


test/test_c10d_spawn.py

Lines changed: 21 additions & 1 deletion
@@ -72,7 +72,7 @@ def _test_multiprocess(self, f, shared_tensors, init_pg, n_output):
                 expected,
                 result,
                 (
-                    "Expect rank {} to broadcast result {} but got {}."
+                    "Expect rank {} to receive tensor {} but got {}."
                 ).format(pid, expected, result)
             )
 
@@ -187,6 +187,26 @@ def test_shared_allgather_nccl(self):
             ProcessGroupShareTensorTest._init_pg_nccl,
             self.world_size)
 
+    @classmethod
+    def _test_allgather_chunk_process(
+            cls, rank, filename, shared_tensor, world_size, init_pg, c2p, p2c):
+        pg = init_pg(rank, filename, world_size)
+        chunks = torch.chunk(shared_tensor, world_size, dim=0)
+        x = chunks[rank]
+        ys = [torch.zeros_like(x) for _ in range(world_size)]
+        pg.allgather(ys, x).wait()
+        c2p.put((rank, chunks[0].to("cpu"), ys[0].to("cpu")))
+        c2p.put((rank, chunks[1].to("cpu"), ys[1].to("cpu")))
+        p2c.get()
+
+    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
+    def test_shared_allgather_chunk_gloo(self):
+        self._test_multiprocess(
+            ProcessGroupShareTensorTest._test_allgather_chunk_process,
+            torch.tensor(range(4)).reshape(2, 2),
+            ProcessGroupShareTensorTest._init_pg_gloo,
+            self.world_size)
+
 
 if __name__ == '__main__':
     run_tests()

torch/lib/c10d/Utils.hpp

Lines changed: 7 additions & 2 deletions
@@ -272,8 +272,13 @@ inline std::vector<int> getDevices(const std::vector<at::Tensor>& tensors) {
 
 template <typename T>
 inline T* getDataPointer(const at::Tensor& tensor) {
-  // NB: This does NOT respect storage_offset from the tensor
-  return static_cast<T*>(tensor.storage().data());
+  // This method is only used in ProcessGroupGloo for now. Call sites must make
+  // sure that the input tensor is contiguous. It is OK if the tensor does not
+  // start from the beginning of the storage. For example, it could come from
+  // chunk(..., dim=0)[1]. Hence, we need to use data_ptr() instead of
+  // tensor.storage().data()
+  // NB: not using tensor.data<T>() because tensor is not aware of gloo::TYPE
+  return static_cast<T*>(tensor.data_ptr());
 }
 
 template <typename T>
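
As a hedged sanity check (not part of the diff), the relationship the new return value relies on can be observed from Python: `data_ptr()` already folds the storage offset into the address, which is what a contiguous-but-offset view needs.

    import torch

    t = torch.arange(4).reshape(2, 2)
    view = torch.chunk(t, 2, dim=0)[1]

    # data_ptr() = storage base address + storage_offset * element size,
    # whereas the old storage().data() call returned only the base address.
    assert view.data_ptr() == (
        view.storage().data_ptr()
        + view.storage_offset() * view.element_size())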
