add tests, copy diagonal code to backward for double differentiability

t-vi · t-vi · commit 31aad4f0773f · 2018-04-19T09:03:03.000+02:00
diff --git a/test/test_torch.py b/test/test_torch.py
@@ -1928,6 +1928,21 @@ def test_diagonal_multidim(self):
         expected = xn.diagonal(0, -2, -1)
         self.assertEqual(expected.shape, result.shape)
         self.assertTrue(np.allclose(expected, result.numpy()))
+        # test non-contiguous
+        xp = x.permute(1, 2, 3, 0)
+        result = torch.diagonal(xp, 0, -2, -1)
+        expected = xp.numpy().diagonal(0, -2, -1)
+        self.assertEqual(expected.shape, result.shape)
+        self.assertTrue(np.allclose(expected, result.numpy()))
+        # test that the gradient returned by backward requires grad;
+        # we do this because diagonal_backward uses in-place
+        # operations and gradgradcheck does not catch whether
+        # they work as expected
+        a = torch.randn(5, 6, requires_grad=True)
+        b = torch.diagonal(a)**2
+        c = b.sum()
+        d, = torch.autograd.grad(c,a, retain_graph=True, create_graph=True)
+        self.assertTrue(d.requires_grad)
 
     @staticmethod
     def _test_diagflat(self, dtype, device):
diff --git a/tools/autograd/templates/Functions.cpp b/tools/autograd/templates/Functions.cpp
@@ -717,9 +717,36 @@ Tensor diag_backward(const Tensor & grad, IntList input_sizes, int64_t diagonal)
   return grad_input;
 }
 
-Tensor diagonal_backward(const Tensor & grad, IntList input_sizes, int64_t offset, int64_t dim1, int64_t dim2) {
+Tensor diagonal_backward(const Tensor & grad, IntList input_sizes, int64_t offset, int64_t dim1_, int64_t dim2_) {
   auto grad_input = at::zeros(grad.type(), input_sizes);
-  auto diag = at::diagonal(grad_input, offset, dim1, dim2);
+  // The code from here down to the assignment of `diag` duplicates the
+  // diagonal implementation in aten/src/ATen/native/TensorShape.cpp.
+  // It is equivalent to
+  //        auto diag = grad_input.diagonal(offset, dim1, dim2);
+  // but when diagonal is called directly the output is not twice
+  // differentiable, whereas the as_strided view built below is.
+  int64_t nDims = input_sizes.size();
+  int64_t dim1 = at::maybe_wrap_dim(dim1_, nDims);
+  int64_t dim2 = at::maybe_wrap_dim(dim2_, nDims);
+  int64_t diag_size;
+  int64_t storage_offset = grad_input.storage_offset();
+  if (offset >= 0) {
+    diag_size = std::min(grad_input.size(dim1), grad_input.size(dim2)-offset);
+    storage_offset += offset * grad_input.stride(dim2);
+  } else {
+    diag_size = std::min(grad_input.size(dim1)+offset, grad_input.size(dim2));
+    storage_offset -= offset * grad_input.stride(dim1);
+  }
+  auto sizes = std::vector<int64_t>(grad_input.sizes());
+  auto strides = std::vector<int64_t>(grad_input.strides());
+  sizes.erase(sizes.begin() + std::max(dim1, dim2));
+  strides.erase(strides.begin() + std::max(dim1, dim2));
+  sizes.erase(sizes.begin() + std::min(dim1, dim2));
+  strides.erase(strides.begin() + std::min(dim1, dim2));
+  sizes.push_back(diag_size);
+  strides.push_back(grad_input.stride(dim1)+grad_input.stride(dim2));
+  auto diag = grad_input.as_strided(sizes, strides, storage_offset);
+
   diag.copy_(grad);
   return grad_input;
 }