
Commit c238ee3

apaszke authored and soumith committed
Fix issues with lazy grad initialization (#912)
1 parent f17cfe4 commit c238ee3
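
This commit addresses "lazy grad initialization": in the Variable-era API used here, a parameter's .grad attribute is not allocated up front but is only materialized by the first backward pass, so code that touches p.grad must tolerate it being None. The following minimal sketch is not part of the commit; it only assumes the torch, torch.nn, and torch.autograd.Variable APIs that appear in the diffs below, and illustrates the behavior the changes are built around.

    import torch
    import torch.nn as nn
    from torch.autograd import Variable

    module = nn.Linear(5, 5)

    # Gradients are initialized lazily: before any backward pass the
    # parameter's .grad attribute is still None.
    assert module.weight.grad is None

    # A backward pass materializes the gradient.
    x = Variable(torch.randn(2, 5), requires_grad=True)
    module(x).sum().backward()
    assert module.weight.grad is not None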

File tree

3 files changed: +17 -5 lines changed

test/test_nn.py
torch/nn/modules/module.py
torch/nn/utils/clip_grad.py

test/test_nn.py

Lines changed: 15 additions & 3 deletions
@@ -357,19 +357,31 @@ def bw_hook(module, grad_input, grad_output):
         self.assertEqual(input.grad.data, expected_grad)
 
     def test_zero_grad(self):
+        i = Variable(torch.randn(2, 5), requires_grad=True)
         module = nn.Linear(5, 5)
         for p in module.parameters():
             p.requires_grad = False
         module.zero_grad()
 
         module.weight.requires_grad = True
-        module.weight._grad = Variable(module.weight.data.clone().fill_(1))
+        module.zero_grad()
+        self.assertIsNone(module.weight.grad)  # uninitialized grad
+
+        module(i).sum().backward()
+        self.assertIsNotNone(module.weight.grad)
+        self.assertGreater(module.weight.grad.data.abs().sum(), 0)
         module.zero_grad()
         self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
 
         module.bias.requires_grad = True
-        module.weight._grad = Variable(module.weight.data.clone().fill_(1))
-        module.bias._grad = Variable(module.bias.data.clone().fill_(1))
+        module.zero_grad()
+        self.assertIsNotNone(module.weight.grad)
+        self.assertIsNone(module.bias.grad)
+        module(i).sum().backward()
+        self.assertIsNotNone(module.weight.grad)
+        self.assertIsNotNone(module.bias.grad)
+        self.assertGreater(module.weight.grad.data.abs().sum(), 0)
+        self.assertGreater(module.bias.grad.data.abs().sum(), 0)
         module.zero_grad()
         self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
         self.assertEqual(module.bias.grad.data, module.bias.data.clone().zero_())

torch/nn/modules/module.py

Lines changed: 1 addition & 1 deletion
@@ -377,7 +377,7 @@ def eval(self):
     def zero_grad(self):
         """Sets gradients of all model parameters to zero."""
         for p in self.parameters():
-            if p.requires_grad:
+            if p.grad is not None:
                 p.grad.data.zero_()
 
     def share_memory(self):
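
With lazy initialization, a parameter can have requires_grad=True while its .grad is still None (no backward has run yet), so the old check on p.requires_grad would reach p.grad.data and fail. Checking p.grad is not None skips gradients that were never materialized. A hedged usage sketch, assuming only the nn.Linear module already used in the tests above:

    import torch
    import torch.nn as nn

    model = nn.Linear(5, 5)
    model.zero_grad()         # safe even though no backward pass has run yet
    print(model.weight.grad)  # None -- the gradient was never materialized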

torch/nn/utils/clip_grad.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ def clip_grad_norm(parameters, max_norm, norm_type=2):
         max_norm (float or int): max norm of the gradients
         norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm.
     """
-    parameters = list(parameters)
+    parameters = list(filter(lambda p: p.grad is not None, parameters))
     max_norm = float(max_norm)
     norm_type = float(norm_type)
     if norm_type == float('inf'):
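
The same consideration applies here: clip_grad_norm previously assumed every parameter in the iterable carried a gradient, which no longer holds when some parameters never took part in a backward pass. Filtering out parameters whose .grad is None computes the norm over only the gradients that actually exist. A hedged usage sketch, assuming clip_grad_norm is importable from torch.nn.utils as in this source tree:

    import torch
    import torch.nn as nn
    from torch.autograd import Variable
    from torch.nn.utils import clip_grad_norm

    used = nn.Linear(5, 5)
    unused = nn.Linear(5, 5)  # never backpropagated, so its .grad stays None

    used(Variable(torch.randn(2, 5))).sum().backward()

    # Parameters whose .grad is None are now skipped instead of raising.
    clip_grad_norm(list(used.parameters()) + list(unused.parameters()), max_norm=1.0)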
