
Commit d11b7fb

colesbury authored and ezyang committed
Don't modify requires_grad when running DataParallel in no_grad mode (#5880)
Previously, running DataParallel in no_grad mode changed the requires_grad property of the network's parameters to False. The issue is that Broadcast returns aliases of the inputs for the source device, and in no_grad mode it detached these inputs in place. Fixes #5851
1 parent 24fca0e commit d11b7fb
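
For context, a minimal reproduction sketch of the reported behavior (not part of the commit; it assumes a machine with at least two CUDA devices). Before this change, the final assertion would fail because the no_grad forward pass flipped the parameters' requires_grad to False:

import torch
import torch.nn as nn

# Wrap a small module in DataParallel and run a forward pass with grad disabled.
model = nn.DataParallel(nn.Linear(10, 5).cuda(), device_ids=[0, 1])
x = torch.randn(4, 10, device='cuda')

with torch.no_grad():
    model(x)

# Before this fix, Broadcast detached the source-device aliases in place,
# so the parameters themselves lost requires_grad; with the fix they keep it.
assert all(p.requires_grad for p in model.parameters())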

3 files changed: +36 −3 lines changed

test/test_autograd.py

Lines changed: 17 additions & 0 deletions
@@ -1527,6 +1527,23 @@ def test_no_grad_modifies_version(self):
         self.assertRaisesRegex(RuntimeError, 'modified by an inplace operation',
                                lambda: z.backward())
 
+    def test_no_grad_input(self):
+        class MyFunction(Function):
+            @staticmethod
+            def forward(self, x):
+                return x
+
+            @staticmethod
+            def backward(self, grad_output):
+                return grad_output
+
+        x = torch.randn(5, requires_grad=True)
+        with torch.no_grad():
+            y = MyFunction.apply(x)
+
+        self.assertTrue(x.requires_grad)
+        self.assertIsNone(y.grad_fn)
+
     def test_backward_copy(self):
         # This tests checks backward engine for a very subtle bug that appreared
         # in one of the initial versions of autograd. Gradients tensors were
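
As a standalone sketch of what this test pins down (PassThrough is a hypothetical Function mirroring MyFunction above): the output of a pass-through Function applied under no_grad is an alias of the input's storage, so detaching it in place used to clobber the input's requires_grad.

import torch
from torch.autograd import Function

class PassThrough(Function):
    # Hypothetical Function whose output aliases its input -- the case this commit fixes.
    @staticmethod
    def forward(ctx, x):
        return x

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output

x = torch.randn(5, requires_grad=True)
with torch.no_grad():
    y = PassThrough.apply(x)

print(y.data_ptr() == x.data_ptr())  # True: y shares x's storage
print(x.requires_grad)               # True: the input is left untouched
print(y.grad_fn)                     # None: no graph was recorded under no_grad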

test/test_nn.py

Lines changed: 9 additions & 0 deletions
@@ -1905,6 +1905,15 @@ def test_broadcast_not_requiring_grad(self):
             input_var = variables[output_idx % len(variables)]
             self.assertEqual(input_var.requires_grad, broadcasted_var.requires_grad)
 
+    @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
+    def test_broadcast_no_grad(self):
+        x = torch.randn(1, 2, dtype=torch.cuda.float32, requires_grad=True)
+        with torch.no_grad():
+            broadcasted = Broadcast.apply((0, 1), x)
+        self.assertTrue(x.requires_grad)
+        for output in broadcasted:
+            self.assertFalse(output.requires_grad)
+
     @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
     def test_replicate(self):
         module = nn.Linear(10, 5).float().cuda()

torch/csrc/autograd/python_function.cpp

Lines changed: 10 additions & 3 deletions
@@ -373,15 +373,22 @@ static void _wrap_outputs(THPFunction *self,
   auto set_history = [&](Variable& var, uint32_t output_nr, bool is_input, bool is_modified,
                          bool is_differentiable) {
     if (!is_differentiable) {
-      if (!var.requires_grad()) return;
+      if (!var.requires_grad()) {
+        return;
+      }
       // NB: we don't support returning non-differentiable views that could require grad
-      // (this could happen if someone were to return an input to the function).
       if (var.is_view()) {
         throw std::runtime_error("Returning Variables sharing storage with other Variables "
                                  "that require grad is not supported in Python functions. "
                                  "Please submit a feature request if you hit this error.");
       }
-      var.detach_();
+      // Return detached aliases of inputs, instead of changing their requires_grad
+      // property.
+      if (is_input) {
+        var = var.detach();
+      } else {
+        var.detach_();
+      }
     } else if (is_modified) {
       if (var.is_leaf() && var.requires_grad()) {
         throw std::runtime_error("a leaf Variable that requires grad has been used in an in-place operation.");
