Commit a7ec889

pietern authored and facebook-github-bot committed
Add sparse tensor allreduce (#22036)
Summary:
Pull Request resolved: #22036

Implemented only on ProcessGroupGloo, as an allgather of metadata (sparse_dim, dense_dim, and nnz), followed by an allgather of indices, followed by an allgather of values. Once these operations have finished, all ranks locally compute a reduction over these sparse tensors. Works for both CPU and CUDA tensors.

This surfaced a problem with the existing assumption that collectives only modify the tensors passed at the call site: for sparse tensors we don't know the dimensions of the output tensors before the collective runs. To deal with this unknown, this commit adds a `result` function to the `c10d::ProcessGroup::Work` class that returns a vector of tensors. It is a bit odd that the result has to be retrieved through this function only for operations on sparse tensors. To make this uniform across tensor layouts, a follow-up commit can make the results of all in-place operations accessible through this function as well. This doesn't break any existing contracts, but does have the potential to add interface ambiguity.

This is a resubmission of #19146.

Reviewed By: mrshenli

Differential Revision: D15926384

fbshipit-source-id: b6ee5d81606bfa8ed63c3d63a9e307613491e0ae
1 parent 313960d commit a7ec889
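To illustrate the scheme described in the summary, here is a small single-process sketch. It is not part of the commit: the per-rank lists stand in for the results of the three allgathers, and the helper name local_sparse_allreduce is hypothetical. It only shows how the metadata/indices/values exchanges compose into a local sum; the real implementation lives in ProcessGroupGloo.

import torch

def local_sparse_allreduce(gathered_metadata, gathered_indices, gathered_values, size):
    # Hypothetical helper mirroring the summary: after allgathering metadata
    # (sparse_dim, dense_dim, nnz), indices, and values, every rank rebuilds
    # its peers' sparse tensors and reduces them locally with a sum.
    total = torch.sparse_coo_tensor(
        torch.empty((len(size), 0), dtype=torch.long), torch.empty(0), size)
    for (sparse_dim, dense_dim, nnz), indices, values in zip(
            gathered_metadata, gathered_indices, gathered_values):
        # In the real collective the metadata is what lets each rank size the
        # buffers for the indices/values allgathers; the local sketch does not
        # need it beyond showing what gets exchanged.
        total = total + torch.sparse_coo_tensor(indices, values, size)
    return total.coalesce()

# Simulate world_size == 2 without a process group; the gathered lists play
# the role of the allgather outputs.
inputs = [
    torch.sparse_coo_tensor([[0]], [1.0], size=(4,)),          # "rank 0"
    torch.sparse_coo_tensor([[0, 2]], [1.0, 1.0], size=(4,)),  # "rank 1"
]
metadata = [(t.sparse_dim(), t.dense_dim(), t._nnz()) for t in inputs]
indices = [t._indices() for t in inputs]
values = [t._values() for t in inputs]
print(local_sparse_allreduce(metadata, indices, values, (4,)).to_dense())
# tensor([2., 0., 1., 0.])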

File tree

6 files changed, +474 -9 lines changed

test/test_c10d.py

Lines changed: 92 additions & 1 deletion
@@ -11,7 +11,7 @@
 from datetime import timedelta

 from itertools import groupby
-from functools import wraps
+from functools import partial, reduce, wraps
 from collections import namedtuple

 import torch
@@ -157,6 +157,49 @@ def simple_multi_input_reduce_tests(rank, world_size):
     ]


+def simple_sparse_reduce_tests(rank, world_size, num_inputs=1):
+    """
+    Generate a number of basic test cases for sparse reduction.
+    These cover tensors with a varying number of sparse dimensions and a varying
+    number of dense dimensions. The only reduction operation we support is sum.
+    """
+    def generate(rank, world_size, sparse_dims=1, dense_dims=0):
+        # First sparse dimension is [0..rank].
+        # Subsequent dimensions are always 0, so we know there is
+        # a non-empty intersection between any two sparse tensors.
+        indices = [range(rank + 1)]
+        shape = [world_size] + [2 for _ in range(dense_dims)]
+        for _ in range(sparse_dims - 1):
+            indices.append([0] * (rank + 1))
+            shape.append(world_size)
+        values = torch.ones([rank + 1] + [2 for _ in range(dense_dims)])
+        return torch.sparse_coo_tensor(indices, values, shape)
+
+    def compute_sum(fn, world_size):
+        return reduce(lambda a, b: a + b, [fn(rank, world_size) for rank in range(world_size)])
+
+    return [
+        (
+            [
+                fn(num_inputs * rank + i, num_inputs * world_size)
+                for i in range(num_inputs)
+            ],
+            [
+                compute_sum(fn, num_inputs * world_size)
+                for i in range(num_inputs)
+            ],
+        )
+        for fn in [
+            partial(generate, sparse_dims=1),
+            partial(generate, sparse_dims=2),
+            partial(generate, sparse_dims=3),
+            partial(generate, dense_dims=1),
+            partial(generate, dense_dims=2),
+            partial(generate, dense_dims=3),
+        ]
+    ]
+
+
 class StoreTestBase(object):
     def _create_store(self, i):
         raise RuntimeError("not implemented")
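For a concrete sense of the cases this helper produces, the snippet below (illustrative, not part of the commit) builds the sparse_dims=1, dense_dims=0 inputs for an assumed two-rank world exactly the way generate does, and sums them the way compute_sum does:

import torch

world_size = 2
# Rank r contributes ones at indices 0..r of a length-world_size vector, so
# index 0 is shared by every rank (the non-empty intersection noted above).
rank0 = torch.sparse_coo_tensor([[0]], torch.ones(1), (world_size,))
rank1 = torch.sparse_coo_tensor([[0, 1]], torch.ones(2), (world_size,))

expected = (rank0 + rank1).coalesce()
print(expected.to_dense())  # tensor([2., 1.])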
@@ -788,6 +831,54 @@ def test_allreduce_stress_cuda(self):
         inputs = [torch.Tensor([i + self.rank]).cuda() for i in range(1000)]
         self._test_allreduce_stress(inputs)

+    def test_sparse_allreduce_checks(self):
+        store = c10d.FileStore(self.file.name, self.world_size)
+        pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())
+
+        t1 = torch.zeros([1])
+        t2 = torch.sparse_coo_tensor([[0]], [1], size=(2,))
+        t3 = torch.sparse_coo_tensor([[0]], [1], size=(4,))
+
+        with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"):
+            opts = c10d.AllreduceOptions()
+            pg.allreduce([], opts)
+
+        with self.assertRaisesRegex(ValueError, "invalid tensor layout"):
+            opts = c10d.AllreduceOptions()
+            pg.allreduce([t1, t2], opts)
+
+        with self.assertRaisesRegex(ValueError, "invalid tensor size"):
+            opts = c10d.AllreduceOptions()
+            pg.allreduce([t2, t3], opts)
+
+        # Sparse allreduce only works with c10d.ReduceOp.SUM.
+        for op in [c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX]:
+            with self.assertRaisesRegex(ValueError, "unsupported reduction operation"):
+                opts = c10d.AllreduceOptions()
+                opts.reduceOp = op
+                pg.allreduce([t3], opts)
+
+    def _test_sparse_allreduce_basics(self, fn):
+        store = c10d.FileStore(self.file.name, self.world_size)
+        pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())
+
+        for num_inputs_per_rank in [1, 2]:
+            tests = simple_sparse_reduce_tests(
+                self.rank,
+                self.world_size,
+                num_inputs=num_inputs_per_rank)
+            for (inputs, outputs) in tests:
+                work = pg.allreduce([fn(input) for input in inputs])
+                work.wait()
+                self.assertEqual(work.result(), outputs)
+
+    def test_sparse_allreduce_basics(self):
+        self._test_sparse_allreduce_basics(lambda t: t)
+
+    @skip_if_not_multigpu
+    def test_sparse_allreduce_basics_cuda(self):
+        self._test_sparse_allreduce_basics(lambda t: t.clone().cuda())
+
     def test_scatter_checks(self):
         store = c10d.FileStore(self.file.name, self.world_size)
         pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())
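The basics tests above read the reduced tensors back through work.result() rather than from the input tensors. The summary explains why: the size of the output's indices and values is not known before the collective runs. A standalone snippet (illustrative only; two hypothetical ranks) makes the point:

import torch

rank0_input = torch.sparse_coo_tensor([[0]], [1.0], size=(4,))          # nnz == 1
rank1_input = torch.sparse_coo_tensor([[1, 3]], [1.0, 1.0], size=(4,))  # nnz == 2

reduced = (rank0_input + rank1_input).coalesce()
print(rank0_input._nnz(), rank1_input._nnz(), reduced._nnz())  # 1 2 3
# Neither rank can size the reduced tensor's indices/values buffers before
# communicating, so the result is exposed via Work::result() instead of being
# written into the tensors passed at the call site.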

torch/csrc/distributed/c10d/init.cpp

Lines changed: 9 additions & 0 deletions
@@ -477,6 +477,15 @@ They are used in specifying strategies for reduction collectives, e.g.,
       .def("is_success", &::c10d::ProcessGroup::Work::isSuccess)
       .def("exception", &::c10d::ProcessGroup::Work::exception)
       .def("source_rank", &::c10d::ProcessGroup::Work::sourceRank)
+      .def(
+          "result",
+          [](::c10d::ProcessGroup::Work& work) -> std::vector<at::Tensor> {
+            auto tensors = work.result();
+            for (auto& tensor : tensors) {
+              tensor = autograd::make_variable(tensor);
+            }
+            return tensors;
+          })
       .def("synchronize", &::c10d::ProcessGroup::Work::synchronize)
       .def(
           "wait",

torch/lib/c10d/ProcessGroup.cpp

Lines changed: 4 additions & 0 deletions
@@ -27,6 +27,10 @@ int ProcessGroup::Work::sourceRank() const {
       "that correspond to a recv or recv-from-any call.");
 }

+std::vector<at::Tensor> ProcessGroup::Work::result() const {
+  throw std::runtime_error("result() not implemented.");
+}
+
 void ProcessGroup::Work::synchronize() {}

 void ProcessGroup::Work::wait() {

torch/lib/c10d/ProcessGroup.hpp

Lines changed: 3 additions & 0 deletions
@@ -52,6 +52,9 @@ class ProcessGroup {
   // Returns source rank if this objects represents a recv-from-any.
   virtual int sourceRank() const;

+  // Returns result tensors, if applicable.
+  virtual std::vector<at::Tensor> result() const;
+
   // Ensures that operations on the output tensors that are invoked
   // after this function returns are correctly sequenced after the
   // asynchronous completion of this work.
