
Commit fc62b95

svd bwd
1 parent c903f37 commit fc62b95

File tree: 8 files changed, +123 −28 lines

aten/src/ATen/native/NativeFunctions.cpp

Lines changed: 1 addition & 1 deletion
@@ -286,7 +286,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> _det_with_svd(const Tensor& self) {
   // check symmetric
   bool symmetric = self.equal(self.transpose(0, 1));
 
-  auto svd = self.svd(false);
+  auto svd = self.svd(true);
   auto sigma = std::get<1>(svd);
   auto u = std::get<0>(svd);
   auto v = std::get<2>(svd);

test/test_autograd.py

Lines changed: 25 additions & 7 deletions
@@ -1915,6 +1915,13 @@ def random_symmetric_matrix(l):
     return A.mm(A.transpose(0, 1))
 
 
+def random_fullrank_matrix_distinct_singular_value(l):
+    A = torch.randn(l, l)
+    u, _, v = A.svd()
+    s = torch.arange(1, l + 1).mul_(1.0 / (l + 1))
+    return u.mm(torch.diag(s)).mm(v.transpose(0, 1))
+
+
 class dont_convert(tuple):
     pass
 
@@ -2187,6 +2194,8 @@ class dont_convert(tuple):
     ('det', lambda: random_square_matrix_of_rank(S, S - 2), (), 'dim2_null', (), [skipIfNoLapack]),
     ('det', lambda: random_square_matrix_of_rank(S, 1), (), 'rank1', (), [skipIfNoLapack]),
     ('det', lambda: random_square_matrix_of_rank(S, 2), (), 'rank2', (), [skipIfNoLapack]),
+    ('det', lambda: random_fullrank_matrix_distinct_singular_value(S), (), 'distinct_postive_s', (), [skipIfNoLapack]),
+    ('svd', lambda: random_fullrank_matrix_distinct_singular_value(S), (), '', (), [skipIfNoLapack]),
     ('gesv', (S, S), ((S, S),), '', (), [skipIfNoLapack]),
     ('potrf', _make_cov(S), (True,), '', (), [skipIfNoLapack]),
     ('eq', (S, S, S), ((S, S, S),)),
@@ -2363,7 +2372,17 @@ def maybe_non_contig(tensor):
     'potrf'
 }
 EXCLUDE_GRADGRADCHECK = {
-    'det'
+    'svd'
+}
+EXCLUDE_GRADGRADCHECK_BY_TEST_NAME = {
+    # Some of the following det ones pass because random matrix has full rank
+    # with high probability. But we can't rely on this. So only test gradgrad on
+    # test_det_distinct_postive_s.
+    'test_det',
+    'test_det_symmetric',
+    'test_det_dim2_null',
+    'test_det_rank1',
+    'test_det_rank2'
 }
 
 
@@ -2417,10 +2436,10 @@ def gradgradcheck_method_precision_override(test_name):
     return override
 
 
-def run_grad_and_gradgrad_checks(test_case, test_name, apply_method, output_variable,
+def run_grad_and_gradgrad_checks(test_case, name, test_name, apply_method, output_variable,
                                  input_variables, run_gradgradcheck=True):
     test_case.assertTrue(gradcheck(apply_method, input_variables, eps=1e-6, atol=PRECISION))
-    if not run_gradgradcheck:
+    if name in EXCLUDE_GRADGRADCHECK or test_name in EXCLUDE_GRADGRADCHECK_BY_TEST_NAME:
         return
     grad_y = generate_gradoutput(output_variable, non_contiguous=True)
     gradgradcheck_precision_override = gradgradcheck_method_precision_override(test_name)
@@ -2442,7 +2461,7 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks,
     test_case.assertEqual(unpack_variables(output_variable), output_tensor)
 
     if run_grad_checks:
-        run_grad_and_gradgrad_checks(test_case, test_name, apply_fn,
+        run_grad_and_gradgrad_checks(test_case, name, test_name, apply_fn,
                                      output_variable, f_args_variable)
 
     self_variable = f_args_variable[0]
@@ -2486,10 +2505,9 @@ def check(name):
         # TODO: check that both have changed after adding all inplace ops
 
         if not is_inplace and name not in EXCLUDE_GRADCHECK:
-            run_grad_and_gradgrad_checks(self, test_name,
+            run_grad_and_gradgrad_checks(self, name, test_name,
                                          lambda *inputs: getattr(inputs[0], name)(*inputs[1:]),
-                                         output_variable, (self_variable,) + args_variable,
-                                         name not in EXCLUDE_GRADGRADCHECK)
+                                         output_variable, (self_variable,) + args_variable)
 
         # functional interface tests
         if hasattr(torch, name) and name not in EXCLUDE_FUNCTIONAL:
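
For readers tracing why gradgradcheck is restricted here, a quick illustrative check (not part of the commit) of what the helper added at the top of this file's diff produces: it rebuilds a matrix from random singular vectors with prescribed singular values i/(l+1), which are distinct and strictly positive, exactly the regime in which the new SVD backward is well defined.

import torch

# Illustrative sketch only, not part of the commit: reconstruct a matrix with
# prescribed singular values and confirm they come back distinct and positive.
l = 5
A = torch.randn(l, l)
u, _, v = A.svd()
s = torch.arange(1, l + 1).mul_(1.0 / (l + 1))   # 1/6, 2/6, ..., 5/6
M = u.mm(torch.diag(s)).mm(v.transpose(0, 1))
print(M.svd()[1])   # roughly [0.83, 0.67, 0.50, 0.33, 0.17], all distinct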

tools/autograd/derivatives.yaml

Lines changed: 1 addition & 1 deletion
@@ -553,7 +553,7 @@
   self: sum_backward(grad, self.sizes(), dim, keepdim)
 
 - name: svd(Tensor self, bool some)
-  self: not_implemented("svd")
+  self: svd_backward(grads, self, some, res1, res2, res3)
 
 - name: symeig(Tensor self, bool eigenvectors, bool upper)
   self: not_implemented("symeig")

tools/autograd/gen_python_functions.py

Lines changed: 3 additions & 1 deletion
@@ -54,7 +54,9 @@
 # to add an appropriate wrap() overload in torch/csrc/autograd/utils/wrap_outputs.h.
 SUPPORTED_RETURN_TYPES = {
     'Tensor', 'std::tuple<Tensor,Tensor>',
-    'std::tuple<Tensor,Tensor,Tensor>', 'std::vector<Tensor>',
+    'std::tuple<Tensor,Tensor,Tensor>',
+    'std::tuple<Tensor,Tensor,Tensor,Tensor>',
+    'std::vector<Tensor>',
     'Scalar', 'bool', 'int64_t', 'void*'
 }
 

tools/autograd/templates/Functions.cpp

Lines changed: 71 additions & 13 deletions
@@ -1,5 +1,6 @@
 #include "Functions.h"
 #include <ATen/WrapDimUtils.h>
+#include <iostream>
 
 // define constants like M_PI and C keywords for MSVC
 #ifdef _MSC_VER
@@ -502,28 +503,85 @@ std::tuple<Tensor, Tensor, Tensor> prelu_double_backward(
   }
 }
 
+// https://j-towns.github.io/papers/svd-derivative.pdf
+Tensor svd_backward(const std::vector<torch::autograd::Variable> &grads, const Tensor& self,
+                    bool some, const Tensor& raw_u, const Tensor& sigma, const Tensor& raw_v) {
+  auto m = self.size(0);
+  auto n = self.size(1);
+  auto k = sigma.size(0);
+
+  Tensor u, v;
+  if (!some) {
+    // ignore the free subspace
+    u = raw_u.narrow(1, 0, k);
+    v = raw_v.narrow(1, 0, k);
+  } else {
+    u = raw_u;
+    v = raw_v;
+  }
+
+  auto gu = grads[0];
+  auto gsigma = grads[1];
+  auto gv = grads[2];
+  auto im = self.type().eye(m);
+  auto in = self.type().eye(n);
+  auto ut = u.t();
+  auto vt = v.t();
+  auto sigma_mat = sigma.diag();
+  auto sigma_mat_inv = sigma.pow(-1).diag();
+  auto sigma_expanded_sq = sigma.pow(2).expand_as(sigma_mat);
+  auto F = (sigma_expanded_sq - sigma_expanded_sq.t()).pow(-1);
+  auto& long_type = sigma.type().toScalarType(at::kLong);
+  auto diag_indices = long_type.arange(0, F.numel(), k + 1);
+  F.view({-1}).index_fill_(0, diag_indices, 0);
+
+  Tensor u_term, sigma_term, v_term;
+
+  if (gu.defined()) {
+    u_term = u.mm(F.mul(ut.mm(gu) - gu.t().mm(u))).mm(sigma_mat);
+    if (m > k) {
+      u_term = u_term + (im - u.mm(ut)).mm(gu).mm(sigma_mat_inv);
+    }
+    u_term = u_term.mm(vt);
+  } else {
+    u_term = self.type().zeros({1}).expand_as(self);
+  }
+
+  if (gsigma.defined()) {
+    sigma_term = u.mm(gsigma.diag()).mm(vt);
+  } else {
+    sigma_term = self.type().zeros({1}).expand_as(self);
+  }
+
+  if (gv.defined()) {
+    auto gvt = gv.t();
+    v_term = sigma_mat.mm(F.mul(vt.mm(gv) - gvt.mm(v))).mm(vt);
+    if (n > k) {
+      v_term = v_term + sigma_mat_inv.mm(gvt.mm(in - v.mm(vt)));
+    }
+    v_term = u.mm(v_term);
+  } else {
+    v_term = self.type().zeros({1}).expand_as(self);
+  }
+
+  return u_term + sigma_term + v_term;
+}
+
 // Formula:
 // d det / d A_ij = \sum_k (\prod_{l neq k} Sigma_l) U_ik V_jk
 // that is, if det != 0
 // d det / d A = U * (Sigma / det) * V^T
 Tensor _det_with_svd_backward(const std::vector<torch::autograd::Variable> &grads, const Tensor& self,
                               const Tensor& det, const Tensor& u, const Tensor& sigma, const Tensor& v) {
+  std::vector<torch::autograd::Variable> svd_grads(grads.begin() + 1, grads.end());
+  auto svd_term = svd_backward(svd_grads, self, true, u, sigma, v);
+
   auto det_grad = grads[0];
-  // If any gradient is defined on svd, then it must be in a double backward
-  // because the svd results are not exposed to users. That is, it can only come
-  // from auto-differentiating this method:
-  //   dA = _det_with_svd_backward(d det, A, [det, u, s, v]=_det_with_svd(A)),
-  //   getting ddu, dds, ddv, and calling this method again to accumulate ddA.
-  for (size_t i = 1; i < 4; i++) {
-    if (grads[i].defined()) {
-      throw std::runtime_error("Double backward through det is not supported.");
-    }
-  }
   auto size = self.size(0);
   auto null_dim = size - sigma.nonzero().size(0);
   if (null_dim >= 2) {
     // \prod_{l neq k} Sigma_l is zero every where
-    return zeros_like(self);
+    return svd_term;
   }
   if (null_dim == 1) {
     // only last sigma is 0
@@ -532,10 +590,10 @@ Tensor _det_with_svd_backward(const std::vector<torch::autograd::Variable> &grads,
     auto scale = sigma.narrow(0, 0, size - 1).prod();
     auto last_u = u.narrow(1, size - 1, 1);
     auto last_v = v.narrow(1, size - 1, 1);
-    return last_u.mm(last_v.transpose(0, 1)).mul_(scale.mul_(det_grad));
+    return svd_term + last_u.mm(last_v.transpose(0, 1)).mul_(scale.mul_(det_grad));
   }
   // no zero singular values
-  return u.mm(sigma.pow(-1).mul_(det.mul(det_grad)).diag()).mm(v.transpose(0, 1));
+  return svd_term + u.mm(sigma.pow(-1).mul_(det.mul(det_grad)).diag()).mm(v.transpose(0, 1));
 }
 
 }
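
For reference, the quantity svd_backward assembles above can be written as a single expression. The following is a sketch in the notation of Townsend's note linked in the code, assuming a thin SVD A = U Σ Vᵀ with k = min(m, n) distinct, nonzero singular values, and writing Ū, Σ̄, V̄ for the incoming gradients gu, gsigma, gv:

% Sketch of the gradient assembled by svd_backward (assumptions as stated above).
F_{ij} =
  \begin{cases}
    \dfrac{1}{\sigma_j^2 - \sigma_i^2} & i \neq j \\
    0 & i = j
  \end{cases}

\bar{A} = U \Big[ \big(F \circ (U^\top \bar{U} - \bar{U}^\top U)\big)\,\Sigma
                + \operatorname{diag}(\bar{\Sigma})
                + \Sigma\,\big(F \circ (V^\top \bar{V} - \bar{V}^\top V)\big) \Big] V^\top
        + (I_m - U U^\top)\,\bar{U}\,\Sigma^{-1} V^\top
        + U\,\Sigma^{-1} \bar{V}^\top (I_n - V V^\top)

The two trailing terms correspond to the m > k and n > k branches in the code and vanish for square inputs; undefined incoming gradients are treated as zero contributions.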

tools/jit/templates/aten_dispatch.cpp

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ void pack_list(list_of_retainable & outputs, std::tuple<Tensor, Tensor, Tensor> v) {
   outputs.push_back(toRetainableSteal(std::move(std::get<1>(v))));
   outputs.push_back(toRetainableSteal(std::move(std::get<2>(v))));
 }
-void pack_list(std::vector<Tensor> & outputs, std::tuple<Tensor, Tensor, Tensor, Tensor> v) {
+void pack_list(list_of_retainable & outputs, std::tuple<Tensor, Tensor, Tensor, Tensor> v) {
   outputs.push_back(toRetainableSteal(std::move(std::get<0>(v))));
   outputs.push_back(toRetainableSteal(std::move(std::get<1>(v))));
   outputs.push_back(toRetainableSteal(std::move(std::get<2>(v))));

torch/_torch_docs.py

Lines changed: 14 additions & 3 deletions
@@ -4276,18 +4276,29 @@
 `U, S, V = torch.svd(A)` returns the singular value decomposition of a
 real matrix `A` of size `(n x m)` such that :math:`A = USV'*`.
 
-`U` is of shape `n x n`
+`U` is of shape `n x min(n, m)`
 
-`S` is of shape `n x m`
+`S` is a diagonal square matrix of shape `min(n, m) x min(n, m)`, represented as
+a vector of shape `(min(n, m),)` containing its diagonal entries.
 
-`V` is of shape `m x m`.
+`V` is of shape `m x min(n, m)`.
 
 :attr:`some` represents the number of singular values to be computed.
 If `some=True`, it computes some and `some=False` computes all.
 
 .. note:: Irrespective of the original strides, the returned matrix `U`
           will be transposed, i.e. with strides `(1, n)` instead of `(n, 1)`.
 
+.. note:: Extra care needs to be taken when backward through `U` and `V`
+          outputs. Such operation is really only stable when :attr:`input` is
+          full rank with all distinct singular values. Otherwise, `NaN` can
+          appear as the gradients are not properly defined. Also, when
+          :attr:`some` = `False`, the gradients on `U[:, min(n, m):]` and
+          `V[:, min(n, m):]` will be ignored as those vectors can be arbitrary
+          bases of the subspaces.
+
+.. note:: Double backward through :meth:`~torch.svd` is not supported currently.
+
 Args:
     input (Tensor): the input 2D Tensor
     some (bool, optional): controls the number of singular values to be computed
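
A minimal usage sketch of the behavior the updated docstring describes, assuming this patched build (Variable-based autograd with a differentiable svd method); the shapes and the loss are illustrative only:

import torch
from torch.autograd import Variable

# Illustrative sketch only: backward through svd in the regime the note calls
# stable, i.e. a full-rank input with distinct singular values.
A = Variable(torch.randn(5, 3), requires_grad=True)
u, s, v = A.svd()                         # u: 5 x 3, s: 3, v: 3 x 3 (some=True default)
(u.sum() + s.sum() + v.sum()).backward()
print(A.grad.size())                      # torch.Size([5, 3])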

torch/functional.py

Lines changed: 7 additions & 1 deletion
@@ -248,7 +248,13 @@ def maybeSqueeze(tensor):
 
 
 def det(var):
-    """Calculates determinant of a 2D square Variable
+    """Calculates determinant of a 2D square Variable.
+
+    .. note::
+        Backward through `det` internally uses SVD results. So double backward
+        through `det` will need to backward through :meth:`~Tensor.svd`. This
+        can be unstable in certain cases. Please see :meth:`~torch.svd` for
+        details.
 
     Arguments:
         var (Variable): The input 2D square Variable.
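
To make the note concrete, here is a hedged sketch of double backward through det, assuming torch.det and torch.autograd.grad with create_graph are available in this build, and using a full-rank input with distinct singular values (the regime the new test case exercises):

import torch
from torch.autograd import Variable, grad

# Illustrative sketch only: the second backward goes through svd_backward,
# so the input should be full rank with distinct singular values.
A = Variable(torch.randn(4, 4), requires_grad=True)
d = torch.det(A)
dA, = grad(d, A, create_graph=True)   # first derivative, d det / d A
ddA, = grad(dA.sum(), A)              # double backward via the SVD results
print(ddA.size())                     # torch.Size([4, 4])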
