pytorch · pietern · Apr 18, 2019 · Jun 19, 2019 · Jun 19, 2019 · Jun 19, 2019
diff --git a/test/test_c10d.py b/test/test_c10d.py
@@ -2646,6 +2646,46 @@ def forward(self, x):
             loss = criterion(output, target)
             loss.backward()
 
+    def test_sparse_gradients(self):
+        """Sparse EmbeddingBag gradients under DDP must match a single-process run."""
+        store = c10d.FileStore(self.file.name, self.world_size)
+        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
+
+        class SparseGradientModule(nn.Module):
+            def __init__(self):
+                super(SparseGradientModule, self).__init__()
+                self.embedding = nn.EmbeddingBag(10, 10, sparse=True)  # weight grad is sparse
+
+            def forward(self, x):
+                return F.softmax(self.embedding(x), dim=1)
+
+        # Ensure initialized weights and inputs are identical across processes
+        torch.manual_seed(1337)
+
+        vanilla_model = SparseGradientModule()
+        replica = copy.deepcopy(vanilla_model)
+        ddp_model = DistributedDataParallel(replica, process_group=process_group)
+
+        mult = 2  # samples handled by each rank
+        batch_size = mult * self.world_size
+        criterion = nn.CrossEntropyLoss()
+        inputs = torch.randint(0, 10, [batch_size, 2])
+        target = torch.randint(0, 10, [batch_size])
+
+        # Run with entire batch against single process version
+        criterion(vanilla_model(inputs), target).backward()
+
+        # Run with this rank's slice of the batch against multi process version
+        partial_input = inputs.split(mult)[self.rank]
+        partial_target = target.split(mult)[self.rank]
+        criterion(ddp_model(partial_input), partial_target).backward()
+
+        # Check that the gradients are sparse (asserted explicitly) and identical
+        vanilla_grad = next(vanilla_model.parameters()).grad
+        ddp_grad = next(ddp_model.parameters()).grad
+        self.assertTrue(vanilla_grad.is_sparse and ddp_grad.is_sparse)
+        self.assertEqual(vanilla_grad, ddp_grad)
+
 
 class ReducerModule(nn.Module):
     def __init__(self):

diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
@@ -47,6 +47,7 @@ PyObject* c10d_init(PyObject* _unused) {
   shared_ptr_class_<::c10d::Reducer>(module, "Reducer")
       .def(py::init<
            std::vector<std::vector<torch::autograd::Variable>>,
+           std::vector<std::vector<bool>>,
            std::vector<std::vector<size_t>>,
            std::shared_ptr<::c10d::ProcessGroup>>())
       .def(
@@ -543,6 +544,7 @@ They are used in specifying strategies for reduction collectives, e.g.,
       &::c10d::compute_bucket_assignment_by_size,
       py::arg("tensors"),
       py::arg("bucket_size"),
+      py::arg("expect_sparse_gradient"),
       py::call_guard<py::gil_scoped_release>());
 
   module.def(