41 changes: 36 additions & 5 deletions aten/src/ATen/native/TensorShape.cpp
@@ -44,11 +44,42 @@ Tensor diagflat(const Tensor& self, int64_t offset) {
return self.contiguous().view(-1).diag(offset);
}

Tensor diagonal(const Tensor& self, int64_t offset) {
if (self.dim() != 2) {
throw std::runtime_error("diagonal expects a 2-dimensional tensor");
}
return self.diag(offset);
Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_) {
int64_t nDims = self.dim();
int64_t dim1 = maybe_wrap_dim(dim1_, nDims);
int64_t dim2 = maybe_wrap_dim(dim2_, nDims);
AT_ASSERT(dim1 != dim2, "diagonal dimensions cannot be identical %zd, %zd", dim1_, dim2_);
int64_t diag_size;
int64_t storage_offset = self.storage_offset();
// compute storage offset and size for the diagonal
// for positive values of offset (above the main diagonal)
// "leftmost columns" (along dim2) are dropped
// for negative values of offset (below the main diagonal)
// "topmost rows" (along dim1) are dropped.
// Note that we invert +/- in the second case to absorb the
// negative sign of the offset.
if (offset >= 0) {
diag_size = std::min(self.size(dim1), self.size(dim2)-offset);
storage_offset += offset * self.stride(dim2);
} else {
diag_size = std::min(self.size(dim1)+offset, self.size(dim2));
storage_offset -= offset * self.stride(dim1);
}
AT_ASSERT(diag_size > 0, "invalid diagonal offset %zd", offset); // the diagonal offset was too large in magnitude

// construct new size and stride: we drop dim1 and dim2 (erasing the larger index first so the smaller index stays valid)
// the new ("joint") dimension is appended to the end of the shape / stride to match numpy semantics
auto sizes = std::vector<int64_t>(self.sizes());
auto strides = std::vector<int64_t>(self.strides());
sizes.erase(sizes.begin() + std::max(dim1, dim2));
strides.erase(strides.begin() + std::max(dim1, dim2));
sizes.erase(sizes.begin() + std::min(dim1, dim2));
strides.erase(strides.begin() + std::min(dim1, dim2));
sizes.push_back(diag_size);
strides.push_back(self.stride(dim1)+self.stride(dim2));

// return view with new parameters
return self.as_strided(sizes, strides, storage_offset);
}
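
For reference, the same view construction can be sketched in Python with as_strided. This is a sketch only: it assumes the dims are valid after wrapping, and the helper name diagonal_view is illustrative rather than part of the patch.

import torch

def diagonal_view(t, offset=0, dim1=0, dim2=1):
    dim1, dim2 = dim1 % t.dim(), dim2 % t.dim()        # wrap negative dims
    sizes, strides = list(t.size()), list(t.stride())
    storage_offset = t.storage_offset()
    if offset >= 0:
        # above the main diagonal: drop the leftmost columns along dim2
        diag_size = min(sizes[dim1], sizes[dim2] - offset)
        storage_offset += offset * strides[dim2]
    else:
        # below the main diagonal: drop the topmost rows along dim1
        diag_size = min(sizes[dim1] + offset, sizes[dim2])
        storage_offset -= offset * strides[dim1]
    # drop dim1/dim2 (larger index first) and append the joint diagonal dim
    for d in sorted((dim1, dim2), reverse=True):
        del sizes[d], strides[d]
    sizes.append(diag_size)
    strides.append(t.stride(dim1) + t.stride(dim2))
    return t.as_strided(sizes, strides, storage_offset)

x = torch.randn(4, 5, 6)
print(torch.equal(diagonal_view(x, 1, 0, 2), torch.diagonal(x, 1, 0, 2)))  # True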

Tensor expand(const Tensor& self, IntList size) {
3 changes: 1 addition & 2 deletions aten/src/ATen/native/native_functions.yaml
@@ -277,8 +277,7 @@
- func: diagflat(Tensor self, int64_t offset=0) -> Tensor
variants: function

- func: diagonal(Tensor self, int64_t offset=0) -> Tensor
variants: function
- func: diagonal(Tensor self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) -> Tensor

- func: dot(Tensor self, Tensor tensor) -> Tensor
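
Since dim1 and dim2 default to 0 and 1, the previous 2-D behavior of diagonal is preserved; a quick sanity sketch, not part of the patch:

import torch

x = torch.randn(3, 3)
# with the defaults, diagonal matches diag on a 2-D input
print(torch.equal(torch.diagonal(x, 1), torch.diag(x, 1)))  # True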

24 changes: 24 additions & 0 deletions test/test_autograd.py
@@ -2116,6 +2116,18 @@ def test_mul_out_result_requires_grad(self):
# we should throw an exception if the output requires grad
self.assertRaisesRegex(RuntimeError, 'out=', lambda: torch.mul(a, b, out=x))

def test_diagonal_derivative_requires_grad(self):
# test that the backward requires grad
# we do this because diagonal_backward uses in-place
# operations and gradgradcheck does not catch whether
# they work as expected (it will succeed even if
# the gradient has requires_grad == False)
a = torch.randn(5, 6, requires_grad=True)
b = torch.diagonal(a)**2
c = b.sum()
d, = torch.autograd.grad(c, a, retain_graph=True, create_graph=True)
self.assertTrue(d.requires_grad)
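
For reference, gradgradcheck can still be run against the new op; it verifies the second derivative numerically but, as noted in the comment above, it cannot detect a gradient that lost requires_grad. A sketch, assuming double-precision inputs:

import torch
from torch.autograd import gradgradcheck

x = torch.randn(5, 6, dtype=torch.double, requires_grad=True)
print(gradgradcheck(lambda t: torch.diagonal(t, 1), (x,)))  # True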


def index_variable(shape, max_indices):
if not isinstance(shape, tuple):
@@ -2630,6 +2642,18 @@ class dont_convert(tuple):
('diag', (M,), NO_ARGS, '1d'),
('diag', (M, M), (1,), '2d_1'),
('diag', (M, M), (2,), '2d_2'),
('diagonal', (M, M), NO_ARGS, '2d'),
('diagonal', (3, 5), NO_ARGS, '2d_wide'),
('diagonal', (3, 5), (2,), '2d_wide_pos'),
('diagonal', (3, 5), (-2,), '2d_wide_neg'),
('diagonal', (5, 3), NO_ARGS, '2d_tall'),
('diagonal', (5, 3), (2,), '2d_tall_pos'),
('diagonal', (5, 3), (-2,), '2d_tall_neg'),
('diagonal', (M, M), (1,), '2d_1'),
('diagonal', (M, M), (2,), '2d_2'),
('diagonal', (M, M, M), (1, 1, 2), '3d_1'),
('diagonal', (M, M, M), (2, 0, 1), '3d_2'),
('diagonal', (M, M, M), (-2, 0, 1), '3d_3'),
('tril', (M, M), NO_ARGS),
('tril', (M, M), (2,), 'idx'),
('triu', (M, M), NO_ARGS),
19 changes: 19 additions & 0 deletions test/test_torch.py
@@ -1909,6 +1909,25 @@ def _test_diagonal(self, dtype, device):
def test_diagonal(self):
self._test_diagonal(self, dtype=torch.float32, device='cpu')

@unittest.skipIf(not TEST_NUMPY, 'Numpy not found')
def test_diagonal_multidim(self):
x = torch.randn(10, 11, 12, 13)

xn = x.numpy()
for args in [(2, 2, 3),
(2,),
(-2, 1, 2),
(0, -2, -1)]:
result = torch.diagonal(x, *args)
expected = xn.diagonal(*args)
self.assertEqual(expected.shape, result.shape)
self.assertTrue(np.allclose(expected, result.numpy()))
# test non-contiguous
xp = x.permute(1, 2, 3, 0)
result = torch.diagonal(xp, 0, -2, -1)
expected = xp.numpy().diagonal(0, -2, -1)
self.assertEqual(expected.shape, result.shape)
self.assertTrue(np.allclose(expected, result.numpy()))

@staticmethod
def _test_diagflat(self, dtype, device):
# Basic sanity test
3 changes: 3 additions & 0 deletions tools/autograd/derivatives.yaml
@@ -200,6 +200,9 @@
- name: diag(Tensor self, int64_t diagonal)
self: diag_backward(grad, self.sizes(), diagonal)

- name: diagonal(Tensor self, int64_t offset, int64_t dim1, int64_t dim2)
self: diagonal_backward(grad, self.sizes(), offset, dim1, dim2)

- name: dist(Tensor self, Tensor other, Scalar p)
self: norm_backward(grad, self - other, p, result)
other: -norm_backward(grad, self - other, p, result)
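
The new diagonal entry above can be exercised end to end with gradcheck; a sketch assuming double-precision inputs, not part of the patch:

import torch
from torch.autograd import gradcheck

x = torch.randn(4, 5, 6, dtype=torch.double, requires_grad=True)
# numerically checks diagonal_backward against finite differences
print(gradcheck(lambda t: torch.diagonal(t, 1, 0, 2), (x,)))  # True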
2 changes: 1 addition & 1 deletion tools/autograd/gen_autograd.py
@@ -19,7 +19,7 @@
deprecated_path = os.path.join(os.path.dirname(__file__), 'deprecated.yaml')

VIEW_FUNCTIONS = {
'alias', 'as_strided', 'expand', 'narrow', 'permute', 'select', 'slice',
'alias', 'as_strided', 'diagonal', 'expand', 'narrow', 'permute', 'select', 'slice',
'squeeze', 't', 'transpose', 'unfold', 'unsqueeze', 'view',
}
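
Registering 'diagonal' in VIEW_FUNCTIONS means autograd treats its output as sharing storage with the input. A small sketch of the user-visible consequence (assumed behavior of the new view):

import torch

x = torch.zeros(3, 3)
d = torch.diagonal(x)   # a view into x, not a copy
d.fill_(1.0)            # writes through to the base tensor
print(x)                # x is now the 3x3 identity matrix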

7 changes: 7 additions & 0 deletions tools/autograd/templates/Functions.cpp
@@ -717,6 +717,13 @@ Tensor diag_backward(const Tensor & grad, IntList input_sizes, int64_t diagonal)
return grad_input;
}

Tensor diagonal_backward(const Tensor & grad, IntList input_sizes, int64_t offset, int64_t dim1, int64_t dim2) {
auto grad_input = at::zeros(grad.type(), input_sizes);
auto diag = grad_input.diagonal(offset, dim1, dim2);
diag.copy_(grad);
return grad_input;
}
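
A Python rendering of the same backward, for reference; diagonal_backward here is a hypothetical helper mirroring the C++ above, not part of the patch:

import torch

def diagonal_backward(grad, input_sizes, offset, dim1, dim2):
    # scatter grad back into a zero tensor of the original shape,
    # writing through the same diagonal view
    grad_input = torch.zeros(input_sizes, dtype=grad.dtype)
    grad_input.diagonal(offset, dim1, dim2).copy_(grad)
    return grad_input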

Tensor mse_loss_double_backward(const Tensor & grad, const Tensor & input, bool size_average, bool reduce) {
auto grad_input = 2 * grad;
if (size_average && reduce) {
25 changes: 22 additions & 3 deletions torch/_torch_docs.py
@@ -1267,9 +1267,11 @@

add_docstr(torch.diagonal,
r"""
diagonal(input, offset=0) -> Tensor
diagonal(input, offset=0, dim1=0, dim2=1) -> Tensor
Returns a 1-D tensor with the diagonal elements of :attr:`input`.
Returns a partial view of :attr:`input` with its diagonal elements
with respect to :attr:`dim1` and :attr:`dim2` appended as a dimension
at the end of the shape.
The argument :attr:`offset` controls which diagonal to consider:
@@ -1278,9 +1280,15 @@
- If :attr:`offset` < 0, it is below the main diagonal.
Args:
input (Tensor): the input tensor. Must be 2-dimensional.
input (Tensor): the input tensor. Must be at least 2-dimensional.
offset (int, optional): which diagonal to consider. Default: 0
(main diagonal).
dim1 (int, optional): first dimension with respect to which to
take diagonal. Default: 0.
dim2 (int, optional): second dimension with respect to which to
take diagonal. Default: 1.
.. note:: To take a batch diagonal, pass in dim1=-2, dim2=-1.
Examples::
@@ -1305,6 +1313,17 @@
-0.2239
[torch.FloatTensor of size 2]
>>> x = torch.randn(2, 5, 4, 2)
>>> torch.diagonal(x, offset=-1, dim1=1, dim2=2)
(0 ,.,.) =

-0.6806 -0.0281 -0.6595 -0.4199
0.8741 -0.1793 -0.6997 0.6265
(1 ,.,.) =
0.6182 1.3069 1.6503 1.7627
-0.2122 -0.2250 0.0990 -2.6433
[torch.FloatTensor of size (2,2,4)]
""")

add_docstr(torch.dist,