
Commit 216c384

Delay reduction of unused parameters until first autograd hook is called
Reduction of gradients for unused parameters should happen as soon as possible, because they can block reduction of gradients for used parameters. Previously, this happened immediately when `prepare_for_backward` was called and found parameters that didn't contribute to the output. As a result, if a model had unused parameters and you wanted to discard its output (i.e. not call `backward` on some loss), reduction of the gradients of those unused parameters was kicked off anyway, and you would see an error the next time you called `forward`.

This commit changes that approach slightly: reduction of the gradients of unused parameters is delayed until the first autograd hook is called. This means you can now discard the model output regardless of whether the model has unused parameters. This is a prerequisite for making the `find_unused_parameters` argument to DDP default to `True`.
1 parent 1d705b4 commit 216c384
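
For context, the behavior this enables looks roughly like the sketch below, adapted from the tests added in this commit. The `Net` module and the surrounding setup are illustrative only; a default process group is assumed to have been initialized elsewhere (e.g. via `torch.distributed.init_process_group`).

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel

# Illustrative module: fc3 never participates in forward(), so its
# parameters are "unused" from DDP's point of view.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(2, 10, bias=False)
        self.fc2 = nn.Linear(10, 4, bias=False)
        self.fc3 = nn.Linear(4, 4, bias=False)  # unused

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return F.softmax(x, dim=1)

# Assumes a process group has already been initialized.
model = DistributedDataParallel(Net(), find_unused_parameters=True)

input = torch.rand(4, 2)
for _ in range(4):
    output = model(input)
    # Discard the output without calling backward(). Before this commit,
    # reduction of fc3's gradients was kicked off from prepare_for_backward
    # and the next forward() would raise an error; after this commit, nothing
    # is reduced until an autograd hook fires, so this loop is fine.
    del output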

File tree

3 files changed: +115 −61 lines changed

  test/test_c10d.py
  torch/csrc/distributed/c10d/reducer.cpp
  torch/csrc/distributed/c10d/reducer.h

test/test_c10d.py

Lines changed: 49 additions & 9 deletions
@@ -2592,15 +2592,13 @@ def step_model(model, input, target):
             torch.manual_seed(1337 + iteration)
             input = input[torch.randperm(global_batch_size)]
 
-    @skip_if_not_nccl
-    @skip_if_not_multigpu
     def test_ignored_output(self):
         """
-        Note: this test can be sped up by only running it on a CPU module
-        once DistributedDataParallel supports them.
+        Test that the output of a model can be ignored and that there is no
+        implicit requirement that `backward` gets called.
         """
         store = c10d.FileStore(self.file.name, self.world_size)
-        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
 
         class IgnoredOutput(nn.Module):
             def __init__(self):
@@ -2614,17 +2612,59 @@ def forward(self, x):
                 x = self.relu(self.fc2(x))
                 return F.softmax(x, dim=1)
 
-        device_id = gpus_for_rank(self.world_size)[self.rank][0]
         model = DistributedDataParallel(
-            IgnoredOutput().float().to(device_id),
-            device_ids=[device_id],
+            IgnoredOutput().float(),
             process_group=process_group,
         )
 
         batch_size = 4
         criterion = nn.CrossEntropyLoss()
         input = torch.rand([batch_size, 2], dtype=torch.float)
-        target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(device_id)
+        target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)])
+
+        # Run a few iterations where we ignore the output.
+        for _ in range(4):
+            output = model(input)
+            del output
+
+        # Run a few iterations where we use the output.
+        for _ in range(4):
+            output = model(input)
+            loss = criterion(output, target)
+            loss.backward()
+
+    def test_ignored_output_with_unused_parameters(self):
+        """
+        Test that the output of a model can be ignored and that there is no
+        implicit requirement that `backward` gets called, if not all model
+        parameters participated in computing the model output.
+        """
+        store = c10d.FileStore(self.file.name, self.world_size)
+        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
+
+        class IgnoredOutputWithUnusedParameters(nn.Module):
+            def __init__(self):
+                super(IgnoredOutputWithUnusedParameters, self).__init__()
+                self.fc1 = nn.Linear(2, 10, bias=False)
+                self.fc2 = nn.Linear(10, 4, bias=False)
+                self.fc3 = nn.Linear(4, 4, bias=False)
+                self.relu = nn.ReLU()
+
+            def forward(self, x):
+                x = self.relu(self.fc1(x))
+                x = self.relu(self.fc2(x))
+                return F.softmax(x, dim=1)
+
+        model = DistributedDataParallel(
+            IgnoredOutputWithUnusedParameters().float(),
+            process_group=process_group,
+            find_unused_parameters=True,
+        )
+
+        batch_size = 4
+        criterion = nn.CrossEntropyLoss()
+        input = torch.rand([batch_size, 2], dtype=torch.float)
+        target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)])
 
         # Run a few iterations where we ignore the output.
         for _ in range(4):

torch/csrc/distributed/c10d/reducer.cpp

Lines changed: 51 additions & 44 deletions
@@ -48,8 +48,8 @@ Reducer::Reducer(
       expect_sparse_gradients_(std::move(expect_sparse_gradients)),
       expect_autograd_hooks_(false),
       require_finalize_(false),
-      has_marked_unused_parameters_(false),
       next_bucket_(0),
+      has_marked_unused_parameters_(false),
       backward_stats_base_(0) {
   AT_ASSERTM(replicas_.size() >= 1, "Expected at least one model replica.");
   AT_ASSERTM(replicas_[0].size() >= 1, "Expected at least one parameter.");
@@ -118,6 +118,10 @@ Reducer::Reducer(
       for (size_t variable_index = 0; variable_index < variable_count;
            variable_index++) {
         auto& variable = replicas_[replica_index][variable_index];
+        const auto index = VariableIndex{
+            replica_index : replica_index,
+            variable_index : variable_index,
+        };
 
         // The gradient accumulator function is lazily initialized once.
         // Therefore we can use its presence in the autograd graph as
@@ -126,21 +130,14 @@ Reducer::Reducer(
 
         // Hook to execute after the gradient accumulator has executed.
         hooks_.emplace_back(
-            grad_accumulator->add_post_hook(
-                torch::make_unique<LambdaPostHook>([=] {
-                  std::lock_guard<std::mutex> lock(this->mutex_);
-                  this->mark_variable_ready(
-                      replica_index,
-                      variable_index,
-                      /* called_from_autograd= */ true);
-                })),
+            grad_accumulator->add_post_hook(torch::make_unique<LambdaPostHook>(
+                [=] { this->autograd_hook(index); })),
             grad_accumulator);
 
         // Map raw function pointer to replica index and parameter index.
         // This is used later on when the autograd graph is traversed
         // to check for parameters for which no gradient is computed.
-        func_[grad_accumulator.get()] =
-            std::make_tuple(replica_index, variable_index);
+        func_[grad_accumulator.get()] = index;
 
         // The gradient accumulator is stored as weak_ptr in the autograd
         // metadata of the variable, so we have to keep it alive here for
@@ -177,9 +174,9 @@ Reducer::~Reducer() noexcept(false) {
   }
 }
 
-void Reducer::mark_variable_ready_dense(
-    size_t replica_index,
-    size_t variable_index) {
+void Reducer::mark_variable_ready_dense(VariableIndex index) {
+  const auto replica_index = index.replica_index;
+  const auto variable_index = index.variable_index;
   const auto& bucket_index = variable_locators_[variable_index];
   auto& bucket = buckets_[bucket_index.bucket_index];
   auto& replica = bucket.replicas[replica_index];
@@ -214,9 +211,9 @@ void Reducer::mark_variable_ready_dense(
   }
 }
 
-void Reducer::mark_variable_ready_sparse(
-    size_t replica_index,
-    size_t variable_index) {
+void Reducer::mark_variable_ready_sparse(VariableIndex index) {
+  const auto replica_index = index.replica_index;
+  const auto variable_index = index.variable_index;
   const auto& bucket_index = variable_locators_[variable_index];
   auto& bucket = buckets_[bucket_index.bucket_index];
   auto& replica = bucket.replicas[replica_index];
@@ -235,22 +232,37 @@ void Reducer::mark_variable_ready_sparse(
   replica.contents = grad;
 }
 
-// Called when the gradient for the specified variable is ready.
-// It can be called from two places:
-// - By an autograd thread after executing a gradient accumulator function.
-// - By the `Reducer::prepare_for_backward` function if the variable doesn't
-//   show up in the autograd graph (and it wouldn't be called by autograd).
-void Reducer::mark_variable_ready(
-    size_t replica_index,
-    size_t variable_index,
-    bool called_from_autograd) {
+// The function `autograd_hook` is called after the gradient for a
+// model parameter has been accumulated into its gradient tensor.
+// This function is only to be called from the autograd thread.
+void Reducer::autograd_hook(VariableIndex index) {
+  std::lock_guard<std::mutex> lock(this->mutex_);
+
   // Ignore if we don't expect to be called.
   // This may be the case if the user wants to accumulate gradients
   // for number of iterations before reducing them.
   if (!expect_autograd_hooks_) {
     return;
   }
 
+  // If there are model parameters that went unused when computing the model
+  // output, they won't be part of the autograd graph, and won't receive
+  // gradients. These parameters are discovered in the `prepare_for_backward`
+  // function and their indexes stored in the `unused_parameters_` vector.
+  if (!has_marked_unused_parameters_ && !unused_parameters_.empty()) {
+    has_marked_unused_parameters_ = true;
+    for (const auto& unused_index : unused_parameters_) {
+      mark_variable_ready(unused_index);
+    }
+  }
+
+  // Finally mark variable for which this function was originally called.
+  mark_variable_ready(index);
+}
+
+void Reducer::mark_variable_ready(VariableIndex index) {
+  const auto replica_index = index.replica_index;
+  const auto variable_index = index.variable_index;
   AT_ASSERTM(replica_index < replicas_.size(), "Out of range replica index.");
   AT_ASSERTM(
       variable_index < variable_locators_.size(),
@@ -293,9 +305,9 @@ void Reducer::mark_variable_ready(
   }
 
   if (bucket.expect_sparse_gradient) {
-    mark_variable_ready_sparse(replica_index, variable_index);
+    mark_variable_ready_sparse(index);
   } else {
-    mark_variable_ready_dense(replica_index, variable_index);
+    mark_variable_ready_dense(index);
  }
 
   // TODO(@pietern): Make this work for both CPU/CUDA tensors.
@@ -316,14 +328,10 @@ void Reducer::mark_variable_ready(
 
   // Run finalizer function once the final bucket was marked ready.
   if (next_bucket_ == buckets_.size()) {
-    if (called_from_autograd) {
-      torch::autograd::Engine::get_default_engine().queue_callback([=] {
-        std::lock_guard<std::mutex> lock(this->mutex_);
-        this->finalize_backward();
-      });
-    } else {
-      finalize_backward();
-    }
+    torch::autograd::Engine::get_default_engine().queue_callback([=] {
+      std::lock_guard<std::mutex> lock(this->mutex_);
+      this->finalize_backward();
+    });
   }
 }
 
@@ -489,8 +497,8 @@ void Reducer::prepare_for_backward(
   std::vector<torch::autograd::Function*> queue;
 
   // Check that any prior reduction has finished.
-  // The variable `expect_autograd_hooks` is true until gradients for all
-  // parameters have been received and all buckets are ready.
+  // The variable `require_finalize_` is true until all gradients
+  // have been computed and reduction of all buckets has been kicked off.
   if (require_finalize_) {
     AT_ERROR(
        "Expected to have finished reduction in the prior iteration before ",
@@ -513,7 +521,6 @@
   }
 
   // Reset accounting.
-  has_marked_unused_parameters_ = true;
   expect_autograd_hooks_ = true;
   next_bucket_ = 0;
   backward_stats_base_ = current_time_in_nanos();
@@ -524,11 +531,14 @@
     bucket.pending = bucket.replicas.size();
   }
 
+  // Reset unused parameter accounting.
+  has_marked_unused_parameters_ = false;
+  unused_parameters_.clear();
+
   // If no outputs are specified, we assume that autograd hooks for ALL
   // variables will be called, and we don't have to search the autograd graph
   // for presence of these hooks.
   if (outputs.empty()) {
-    has_marked_unused_parameters_ = false;
     return;
   }
 
@@ -562,10 +572,7 @@
      continue;
    }
 
-    size_t replica_index;
-    size_t variable_index;
-    std::tie(replica_index, variable_index) = it.second;
-    mark_variable_ready(replica_index, variable_index);
+    unused_parameters_.push_back(it.second);
  }
}

torch/csrc/distributed/c10d/reducer.h

Lines changed: 15 additions & 8 deletions
@@ -51,30 +51,37 @@ class Reducer {
   // Forward declaration.
   struct Bucket;
 
+  // Locates a specific variable by replica index and variable index.
+  struct VariableIndex {
+    size_t replica_index;
+    size_t variable_index;
+  };
+
   std::mutex mutex_;
   std::vector<std::vector<torch::autograd::Variable>> replicas_;
   std::shared_ptr<c10d::ProcessGroup> process_group_;
   std::vector<std::vector<bool>> expect_sparse_gradients_;
 
   std::vector<std::vector<std::shared_ptr<torch::autograd::Function>>>
       grad_accumulators_;
-  std::unordered_map<torch::autograd::Function*, std::tuple<int, int>> func_;
+  std::unordered_map<torch::autograd::Function*, VariableIndex> func_;
   std::vector<std::pair<uintptr_t, std::shared_ptr<torch::autograd::Function>>>
       hooks_;
 
   bool expect_autograd_hooks_;
   bool require_finalize_;
-  bool has_marked_unused_parameters_;
   size_t next_bucket_;
 
-  void mark_variable_ready_dense(size_t replica_index, size_t variable_index);
+  bool has_marked_unused_parameters_;
+  std::vector<VariableIndex> unused_parameters_;
+
+  void mark_variable_ready_dense(VariableIndex index);
+
+  void mark_variable_ready_sparse(VariableIndex index);
 
-  void mark_variable_ready_sparse(size_t replica_index, size_t variable_index);
+  void mark_variable_ready(VariableIndex index);
 
-  void mark_variable_ready(
-      size_t replica_index,
-      size_t variable_index,
-      bool called_from_autograd = false);
+  void autograd_hook(VariableIndex index);
 
   void mark_bucket_ready(size_t bucket_index);

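Taken together, the changes above boil down to a small amount of deferral logic. The following is a minimal Python sketch of that control flow, for illustration only; the names mirror the C++ members in reducer.cpp and reducer.h, but this is not the actual implementation.

class ReducerSketch:
    """Illustrative model of the deferral logic introduced in this commit."""

    def __init__(self):
        self.expect_autograd_hooks = False
        self.has_marked_unused_parameters = False
        self.unused_parameters = []

    def prepare_for_backward(self, unused_parameter_indices):
        # Remember which parameters did not participate in the output,
        # but no longer kick off their reduction from here.
        self.expect_autograd_hooks = True
        self.has_marked_unused_parameters = False
        self.unused_parameters = list(unused_parameter_indices)

    def autograd_hook(self, index):
        # Called when a parameter's gradient has been accumulated.
        if not self.expect_autograd_hooks:
            return
        # On the first hook of a backward pass, flush the unused parameters
        # so their buckets don't block reduction of the used ones.
        if not self.has_marked_unused_parameters and self.unused_parameters:
            self.has_marked_unused_parameters = True
            for unused_index in self.unused_parameters:
                self.mark_variable_ready(unused_index)
        self.mark_variable_ready(index)

    def mark_variable_ready(self, index):
        print("variable", index, "is ready for reduction")

If `backward` is never called, no autograd hook fires, nothing is marked ready, and no reduction is started; that is what makes it safe to discard the model output.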