@@ -196,7 +196,8 @@ def _forward_criterion(self, criterion, input, target):
     def _backward_criterion(self, criterion, input, target):
         input_tuple = input if isinstance(input, tuple) else (input,)
         for i in input_tuple:
-            i.grad.data.zero_()
+            if i.grad is not None:
+                i.grad.data.zero_()
         args = input_tuple + (target,)
         criterion(*args).backward()
         if isinstance(input, tuple):
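Note on the pattern above, which recurs throughout this file: with lazily allocated gradients, `.grad` stays `None` until the first backward pass touches it, so every unconditional `.grad.data.zero_()` needs a guard. A minimal sketch of the behavior being accommodated, assuming the Variable-era API this diff targets:

```python
import torch
import torch.nn as nn
from torch.autograd import Variable

linear = nn.Linear(3, 2)
# No backward pass has run yet, so no gradient buffer has been allocated.
assert linear.weight.grad is None

out = linear(Variable(torch.randn(4, 3)))
out.sum().backward()
# Only now is .grad a real Variable that can safely be zeroed in place.
assert linear.weight.grad is not None
linear.weight.grad.data.zero_()
```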
@@ -206,18 +207,24 @@ def _backward_criterion(self, criterion, input, target):
 
     def _zero_grad_parameters(self, module):
         if hasattr(module, 'weight') and module.weight is not None:
-            module.weight.grad.data.zero_()
+            if module.weight.grad is not None:
+                module.weight.grad.data.zero_()
         if hasattr(module, 'bias') and module.bias is not None:
-            module.bias.grad.data.zero_()
+            if module.bias.grad is not None:
+                module.bias.grad.data.zero_()
 
     def _get_parameters(self, module):
         params = []
         d_params = []
         if hasattr(module, 'weight') and module.weight is not None:
             params += [module.weight.data]
+            if module.weight.grad is None:
+                module.weight._grad = Variable(module.weight.data.clone().zero_())
             d_params += [module.weight.grad.data]
         if hasattr(module, 'bias') and module.bias is not None:
             params += [module.bias.data]
+            if module.bias.grad is None:
+                module.bias._grad = Variable(module.bias.data.clone().zero_())
             d_params += [module.bias.grad.data]
         return params, d_params
 
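`_get_parameters` cannot simply skip a missing gradient, because callers read `params` and `d_params` in lockstep, so it materializes a zero gradient by assigning to `._grad`. A hedged sketch of that materialize-on-demand idea, using a hypothetical `ensure_grad` helper that is not part of this diff:

```python
import torch
import torch.nn as nn
from torch.autograd import Variable

def ensure_grad(param):
    # Hypothetical helper: attach a zero-filled gradient Variable of the
    # same shape if none has been allocated yet, then return its data.
    if param.grad is None:
        param._grad = Variable(param.data.clone().zero_())
    return param.grad.data

m = nn.Linear(4, 4)
d_weight = ensure_grad(m.weight)  # usable even before any backward pass
assert d_weight.size() == m.weight.data.size()
```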
@@ -356,13 +363,13 @@ def test_zero_grad(self):
         module.zero_grad()
 
         module.weight.requires_grad = True
-        module.weight.grad.data.fill_(1)
+        module.weight._grad = Variable(module.weight.data.clone().fill_(1))
         module.zero_grad()
         self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
 
         module.bias.requires_grad = True
-        module.weight.grad.data.fill_(1)
-        module.bias.grad.data.fill_(1)
+        module.weight._grad = Variable(module.weight.data.clone().fill_(1))
+        module.bias._grad = Variable(module.bias.data.clone().fill_(1))
         module.zero_grad()
         self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
         self.assertEqual(module.bias.grad.data, module.bias.data.clone().zero_())
@@ -586,7 +593,7 @@ def compare_scaling(grads):
         grads = torch.range(1, 100), torch.ones(10).div(1000)
         for norm_type in [0.5, 1.5, 2, 4, 'inf']:
             for p, g in zip(l.parameters(), grads):
-                p.grad.data.copy_(g)
+                p._grad = Variable(g.clone())
             norm_before = compute_norm(norm_type)
             clip_grad_norm(l.parameters(), max_norm, norm_type=norm_type)
             norm_after = compute_norm(norm_type)
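Replacing `p.grad.data.copy_(g)` with `p._grad = Variable(g.clone())` avoids copying into a gradient buffer that may not exist yet. A small usage sketch of the same idea, assuming `clip_grad_norm` from `torch.nn.utils` and the legacy Variable API:

```python
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm

l = nn.Linear(10, 10)
# Seed gradients by assignment rather than in-place copy, so this works
# even when .grad has never been allocated.
for p in l.parameters():
    p._grad = Variable(p.data.clone().fill_(1))

clip_grad_norm(l.parameters(), max_norm=1.0, norm_type=2)
total_norm = sum(p.grad.data.norm(2) ** 2 for p in l.parameters()) ** 0.5
assert total_norm <= 1.0 + 1e-6
```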
@@ -1167,7 +1174,8 @@ def pad(tensor, length):
         self.assertEqual(unpacked_len, lengths)
 
         # check grad
-        padded.grad.data.zero_()
+        if padded.grad is not None:
+            padded.grad.data.zero_()
         grad_output = unpacked.data.clone().normal_()
         unpacked.backward(grad_output)
         if batch_first:
@@ -1185,13 +1193,15 @@ def pad(var, length):
 
         lengths = [10, 10, 6, 2, 2, 1, 1]
         max_length = lengths[0]
-        x = Variable(torch.randn(max_length, len(lengths), 3), requires_grad=True)
+        x_leaf = Variable(torch.randn(max_length, len(lengths), 3), requires_grad=True)
         lstm = nn.LSTM(3, 4, bidirectional=True, num_layers=2)
         lstm2 = deepcopy(lstm)
         if cuda:
-            x = x.cuda()
+            x = x_leaf.cuda()
             lstm.cuda()
             lstm2.cuda()
+        else:
+            x = x_leaf
 
         # Compute sequences separately
         seq_outs = []
@@ -1216,11 +1226,11 @@ def pad(var, length):
 
         # Check backward
         seq_out.sum().backward()
-        grad_x = x.grad.data.clone()
-        x.grad.data.zero_()
+        grad_x = x_leaf.grad.data.clone()
+        x_leaf.grad.data.zero_()
         unpacked.sum().backward()
 
-        self.assertEqual(x.grad.data, grad_x)
+        self.assertEqual(x_leaf.grad.data, grad_x)
         for p1, p2 in zip(lstm.parameters(), lstm2.parameters()):
             self.assertEqual(p1.grad, p2.grad)
 
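The rename to `x_leaf` matters because `x.cuda()` returns a new, non-leaf Variable, and gradients accumulate only on leaves; the assertions therefore have to read `x_leaf.grad`, not `x.grad`. A small sketch of that distinction (using a cheap `* 2` in place of `.cuda()` so it runs without a GPU, assuming the legacy Variable API):

```python
import torch
from torch.autograd import Variable

x_leaf = Variable(torch.randn(5, 3), requires_grad=True)
y = x_leaf * 2          # stand-in for x_leaf.cuda(): a derived, non-leaf Variable
y.sum().backward()

assert x_leaf.grad is not None   # the gradient lands on the leaf
assert y.grad is None            # the derived Variable keeps no .grad
```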
@@ -1576,11 +1586,12 @@ def test_noncontig_conv_grad(self):
         grad = torch.randn(2, 2, 5, 10, 10).cuda()[:, 1]
         assert not grad.is_contiguous()
         output.backward(grad, retain_variables=True)
-        result = output.grad.data.clone()
-        output.grad.data.zero_()
+        self.assertIsNotNone(input.grad)
+        result = input.grad.data.clone()
+        input.grad.data.zero_()
 
         output.backward(grad.contiguous())
-        self.assertEqual(result, output.grad.data)
+        self.assertEqual(result, input.grad.data)
 
     def test_pixel_shuffle(self):
         batch_size = random.randint(1, 3)
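The old assertions in `test_noncontig_conv_grad` read `output.grad`, which is never populated because `output` is a non-leaf result of the convolution; the gradient of interest accumulates on `input`. A hedged sketch of the corrected comparison, assuming the legacy `retain_variables` keyword (later renamed `retain_graph`):

```python
import torch
import torch.nn as nn
from torch.autograd import Variable

conv = nn.Conv1d(2, 2, 3)
input = Variable(torch.randn(1, 2, 8), requires_grad=True)
output = conv(input)
grad = torch.randn(output.size())

# The first backward keeps the graph alive so a second pass is possible.
output.backward(grad, retain_variables=True)
first = input.grad.data.clone()
input.grad.data.zero_()
output.backward(grad)
assert torch.equal(first, input.grad.data)
```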
@@ -1613,7 +1624,8 @@ def test_batchnorm_eval(self):
         grad1 = data.grad.data.clone()
 
         # 2nd pass
-        data.grad.data.zero_()
+        if data.grad is not None:
+            data.grad.data.zero_()
 
         res2 = module(data)
         res2.backward(grad)