Commit 4ee9dd5

V3: Initial commit
Differential Revision: D15113272
Differential Version: 80846101
1 parent 5a83a74 commit 4ee9dd5

File tree

2 files changed: 83 additions, 28 deletions


test/test_c10d.py

Lines changed: 51 additions & 0 deletions
@@ -2257,6 +2257,57 @@ def forward(self, x):
         loss2 = criterion(output2, target)
         loss2.backward()
 
+    @skip_if_not_nccl
+    @skip_if_not_multigpu
+    def test_no_used_parameters(self):
+        """
+        Note: this test can be sped up by only running it on a CPU module
+        once DistributedDataParallel supports them.
+        """
+        store = c10d.FileStore(self.file.name, self.world_size)
+        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+
+        class NoUsedParameters(nn.Module):
+            def __init__(self):
+                super(NoUsedParameters, self).__init__()
+
+                # Make sure this module has some parameters, only to then decide
+                # to never use them from the `forward` function.
+                self.fc1 = nn.Linear(2, 10, bias=False)
+                self.fc2 = nn.Linear(10, 4, bias=False)
+                self.fc3 = nn.Linear(4, 4, bias=False)
+                self.relu = nn.ReLU()
+
+            def forward(self, x):
+                return x * 0.0
+
+        device_id = gpus_for_rank(self.world_size)[self.rank][0]
+        model = DistributedDataParallel(
+            NoUsedParameters().float().to(device_id),
+            device_ids=[device_id],
+            process_group=process_group,
+        )
+
+        batch_size = 4
+        input = torch.rand([batch_size, 2], dtype=torch.float)
+
+        # After initialization, no parameter has their gradient set.
+        for p in model.parameters():
+            self.assertTrue(p.requires_grad)
+            self.assertIsNone(p.grad)
+
+        # Run `forward` function.
+        model(input)
+
+        # Because none of the parameters were used, we expect reduction for
+        # all parameters will be executed right when initializing the reducer.
+        # Once `forward` returns, all the parameter's gradients must be set.
+        for p in model.parameters():
+            self.assertTrue(p.requires_grad)
+            self.assertIsNotNone(p.grad)
+            self.assertTrue(torch.is_tensor(p.grad))
+            self.assertEqual(p.size(), p.grad.size())
+
 
 class ReducerModule(nn.Module):
     def __init__(self):
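
The new test wires DistributedDataParallel directly to a hand-built c10d FileStore and ProcessGroupNCCL instead of going through torch.distributed.init_process_group. The sketch below mirrors that wiring as a standalone script for anyone who wants to poke at the behavior outside the test harness; it is not part of this commit, and the RANK/WORLD_SIZE environment variables, store path, and one-GPU-per-rank layout are assumptions.

import os
import tempfile

import torch
import torch.distributed as c10d
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel

# Assumed launch convention: one process per GPU, with RANK and WORLD_SIZE
# provided by whatever starts the processes.
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])

# Same rendezvous as the test: a FileStore shared by all ranks, with a NCCL
# process group built directly on top of it (the store path is arbitrary).
store = c10d.FileStore(os.path.join(tempfile.gettempdir(), "ddp_store"), world_size)
process_group = c10d.ProcessGroupNCCL(store, rank, world_size)


# A module whose forward never touches its parameters, as in the test.
class NoUsedParameters(nn.Module):
    def __init__(self):
        super(NoUsedParameters, self).__init__()
        self.fc1 = nn.Linear(2, 10, bias=False)

    def forward(self, x):
        return x * 0.0


model = DistributedDataParallel(
    NoUsedParameters().float().to(rank),
    device_ids=[rank],
    process_group=process_group,
)

# Per the assertions in the test, a single forward pass is enough for every
# parameter to have its gradient set, because the reducer can reduce all of
# the (unused) parameters right away.
model(torch.rand(4, 2))
assert all(p.grad is not None for p in model.parameters())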

torch/csrc/distributed/c10d/reducer.cpp

Lines changed: 32 additions & 28 deletions
@@ -228,12 +228,16 @@ void Reducer::mark_variable_ready(
     }
   }
 
-  // Queue function to finalize once the final bucket was marked ready.
+  // Run finalizer function once the final bucket was marked ready.
   if (next_bucket_ == buckets_.size()) {
-    // Autograd callbacks can only be registered while the engine is running.
-    AT_ASSERT(called_from_autograd);
-    torch::autograd::Engine::get_default_engine().queue_callback(
-        [=] { this->finalize_backward(); });
+    if (called_from_autograd) {
+      torch::autograd::Engine::get_default_engine().queue_callback([=] {
+        std::lock_guard<std::mutex> lock(this->mutex_);
+        this->finalize_backward();
+      });
+    } else {
+      finalize_backward();
+    }
   }
 }
@@ -375,6 +379,28 @@ void Reducer::prepare_for_backward(
   std::unordered_set<torch::autograd::Function*> seen;
   std::vector<torch::autograd::Function*> queue;
 
+  // Check that any prior reduction has finished.
+  // The variable `expect_autograd_hooks` is true until gradients for all
+  // parameters have been received and all buckets are ready.
+  if (expect_autograd_hooks_) {
+    AT_ERROR(
+        "Expected to have finished reduction in the prior iteration before ",
+        "starting a new one. ",
+        "",
+        "This error indicates that your module has parameters that were ",
+        "not used in producing its output (the return value of `forward`). ",
+        "",
+        "You can enable unused parameter detection by passing the keyword "
+        "argument `find_unused_parameters=True` to ",
+        "`torch.nn.parallel.DistributedDataParallel`. ",
+        "",
+        "If you already have this argument set, then the distributed data ",
+        "parallel module wasn't able to locate the output tensors in the ",
+        "return value of your module's `forward` function. ",
+        "Please include the structure of the return value of `forward` of ",
+        "your module when reporting this issue (e.g. list, dict, iterable).");
+  }
+
   // Reset accounting.
   has_marked_unused_parameters_ = true;
   expect_autograd_hooks_ = true;
@@ -433,34 +459,12 @@ void Reducer::prepare_for_backward(
 }
 
 void Reducer::finalize_backward() {
-  std::lock_guard<std::mutex> lock(mutex_);
-
   // No longer expect autograd hooks to fire after this function returns.
   AT_ASSERT(expect_autograd_hooks_);
   expect_autograd_hooks_ = false;
 
   // Check that all buckets were completed and had their work kicked off.
-  if (next_bucket_ < buckets_.size()) {
-    // If the reducer marked unused parameters and we STILL didn't get
-    // gradients for all module parameters, something is seriously wrong.
-    AT_ASSERT(!has_marked_unused_parameters_);
-    AT_ERROR(
-        "Expected to have gradients for all module parameters upon returning ",
-        "from the call to `torch.autograd.backward`. ",
-        "",
-        "This error indicates that your module has parameters that were ",
-        "not used in producing its output (the return value of `forward`). ",
-        "",
-        "You can enable unused parameter detection by passing the keyword "
-        "argument `find_unused_parameters=True` to ",
-        "`torch.nn.parallel.DistributedDataParallel`. ",
-        "",
-        "If you already have this argument set, then the distributed data ",
-        "parallel module wasn't able to locate the output tensors in the ",
-        "return value of your module's `forward` function. ",
-        "Please include the structure of the return value of `forward` of ",
-        "your module when reporting this issue (e.g. list, dict, iterable).");
-  }
+  AT_ASSERT(next_bucket_ == buckets_.size());
 
   // Wait for asynchronous reduction to complete and unflatten contents.
   for (auto& bucket : buckets_) {
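
The error message added to prepare_for_backward tells users to pass find_unused_parameters=True to torch.nn.parallel.DistributedDataParallel. As a rough illustration of the situation it describes: the module, sizes, and training loop below are made up for this sketch, and a process group is assumed to already exist (for example the one built in the sketch above, or a default group from torch.distributed.init_process_group).

import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel


# Illustrative module: one parameter is used by forward, the other never is.
class PartiallyUsed(nn.Module):
    def __init__(self):
        super(PartiallyUsed, self).__init__()
        self.used = nn.Linear(2, 2, bias=False)
        self.unused = nn.Linear(2, 2, bias=False)  # never touched by forward

    def forward(self, x):
        return self.used(x)


device_id = torch.cuda.current_device()
model = DistributedDataParallel(
    PartiallyUsed().to(device_id),
    device_ids=[device_id],
    process_group=process_group,  # assumed to exist, e.g. from the sketch above
    # Without this flag, no gradient ever arrives for `self.unused`, the
    # buckets never finish reducing, and the second forward pass hits the
    # "Expected to have finished reduction in the prior iteration" error
    # introduced by this commit.
    find_unused_parameters=True,
)

for _ in range(2):
    out = model(torch.rand(4, 2))
    out.sum().backward()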
