pytorch · vishwakftw · May 18, 2019 · May 19, 2019 · May 20, 2019 · May 22, 2019
diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap
@@ -2431,25 +2431,6 @@
     ${THTensor}_pstrf(res1_, res2_, self_, (upper) ? "U" : "L", tol_);
     res2 -= 1;  // LAPACK returns 1-indexed pivots
 ]]
-[[
-  name: _th_qr
-  cname: qr
-  types:
-    - Float
-    - Double
-  backends:
-    - CPU
-    - CUDA
-  variants:
-    - function
-  return: argument 0,1
-  arguments:
-    - arg: THTensor* res1
-      output: True
-    - arg: THTensor* res2
-      output: True
-    - THTensor* self
-]]
 [[
   name: _th_geqrf
   cname: geqrf

diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h
@@ -713,7 +713,7 @@ class CAFFE2_API Tensor {
   std::tuple<Tensor,Tensor> solve(const Tensor & A) const;
   Tensor cholesky_inverse(bool upper=false) const;
   std::tuple<Tensor,Tensor> pstrf(bool upper=true, Scalar tol=-1) const;
-  std::tuple<Tensor,Tensor> qr() const;
+  std::tuple<Tensor,Tensor> qr(bool some=true) const;
   std::tuple<Tensor,Tensor> geqrf() const;
   Tensor orgqr(const Tensor & input2) const;
   Tensor ormqr(const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const;

diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h
@@ -1194,8 +1194,8 @@ inline Tensor Tensor::cholesky_inverse(bool upper) const {
 inline std::tuple<Tensor,Tensor> Tensor::pstrf(bool upper, Scalar tol) const {
     return dispatch_type().pstrf(*this, upper, tol);
 }
-inline std::tuple<Tensor,Tensor> Tensor::qr() const {
-    return dispatch_type().qr(*this);
+inline std::tuple<Tensor,Tensor> Tensor::qr(bool some) const {
+    return dispatch_type().qr(*this, some);
 }
 inline std::tuple<Tensor,Tensor> Tensor::geqrf() const {
     return dispatch_type().geqrf(*this);

diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h
@@ -523,7 +523,7 @@ struct CAFFE2_API Type {
   virtual std::tuple<Tensor,Tensor> solve(const Tensor & self, const Tensor & A) const = 0;
   virtual Tensor cholesky_inverse(const Tensor & self, bool upper) const = 0;
   virtual std::tuple<Tensor,Tensor> pstrf(const Tensor & self, bool upper, Scalar tol) const = 0;
-  virtual std::tuple<Tensor,Tensor> qr(const Tensor & self) const = 0;
+  virtual std::tuple<Tensor,Tensor> qr(const Tensor & self, bool some) const = 0;
   virtual std::tuple<Tensor,Tensor> geqrf(const Tensor & self) const = 0;
   virtual Tensor orgqr(const Tensor & self, const Tensor & input2) const = 0;
   virtual Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) const = 0;

diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp
@@ -38,6 +38,14 @@ extern "C" void spotrf_(char *uplo, int *n, float *a, int *lda, int *info);
 // trtrs
 extern "C" void dtrtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info);
 extern "C" void strtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info);
+
+// geqrf
+extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info);
+extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info);
+
+// orgqr
+extern "C" void dorgqr_(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info);
+extern "C" void sorgqr_(int *m, int *n, int *k, float *a, int *lda, float *tau, float *work, int *lwork, int *info);
 #endif
 
 namespace at {
@@ -75,6 +83,16 @@ void lapackTriangularSolve(char uplo, char trans, char diag, int n, int nrhs, sc
   AT_ERROR("triangular_solve only takes float or double Tensors");
 }
 
+template<class scalar_t>
+void lapackGeqrf(int m, int n, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info) {
+  AT_ERROR("geqrf only takes float or double Tensors");
+}
+
+template<class scalar_t>
+void lapackOrgqr(int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info) {
+  AT_ERROR("orgqr only takes float or double Tensors");
+}
+
 #ifdef USE_LAPACK
 template<> void lapackSolve<double>(int n, int nrhs, double *a, int lda, int *ipiv, double *b, int ldb, int *info) {
   dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info);
@@ -123,6 +141,22 @@ template<> void lapackTriangularSolve<double>(char uplo, char trans, char diag,
 template<> void lapackTriangularSolve<float>(char uplo, char trans, char diag, int n, int nrhs, float *a, int lda, float *b, int ldb, int *info) {
   strtrs_(&uplo, &trans, &diag, &n, &nrhs, a, &lda, b, &ldb, info);
 }
+
+template<> void lapackGeqrf<double>(int m, int n, double *a, int lda, double *tau, double *work, int lwork, int *info) {
+  dgeqrf_(&m, &n, a, &lda, tau, work, &lwork, info);
+}
+
+template<> void lapackGeqrf<float>(int m, int n, float *a, int lda, float *tau, float *work, int lwork, int *info) {
+  sgeqrf_(&m, &n, a, &lda, tau, work, &lwork, info);
+}
+
+template<> void lapackOrgqr<double>(int m, int n, int k, double *a, int lda, double *tau, double *work, int lwork, int *info) {
+  dorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info);
+}
+
+template<> void lapackOrgqr<float>(int m, int n, int k, float *a, int lda, float *tau, float *work, int lwork, int *info) {
+  sorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info);
+}
 #endif
 
 // Below of the definitions of the functions operating on a batch that are going to be dispatched
@@ -648,4 +682,155 @@ std::tuple<Tensor&, Tensor&> triangular_solve_out(Tensor& result, Tensor& clone_
   return std::tuple<Tensor&, Tensor&>(result, clone_A);
 }
 
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ qr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template<typename scalar_t>
+static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,
+                        std::vector<int64_t>& infos) {
+#ifndef USE_LAPACK
+  AT_ERROR("qr: LAPACK library not found in compilation");
+#else
+  auto self_data = self.data<scalar_t>();
+  auto tau_data = tau.data<scalar_t>();
+  auto self_matrix_stride = matrixStride(self);
+  auto tau_stride = tau.size(-1);
+  auto batch_size = batchCount(self);
+
+  int info;
+  // Run once, first to get the optimum work size.
+  // Since we deal with batches of matrices with the same dimensions, doing this outside
+  // the loop saves (batch_size - 1) workspace queries which would provide the same result
+  // and (batch_size - 1) calls to allocate and deallocate workspace using at::empty()
+  int lwork = -1;
+  scalar_t wkopt;
+  lapackGeqrf<scalar_t>(m, n, self_data, m, tau_data, &wkopt, lwork, &info);
+  lwork = static_cast<int>(wkopt);
+  Tensor work = at::empty({lwork}, self.options());
+
+  for (int64_t i = 0; i < batch_size; i++) {
+    scalar_t* self_working_ptr = &self_data[i * self_matrix_stride];
+    scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
+
+    // now compute the actual R and TAU
+    lapackGeqrf<scalar_t>(m, n, self_working_ptr, m, tau_working_ptr, work.data<scalar_t>(), lwork, &info);
+    infos[i] = info;
+    if (info != 0) {
+      return;
+    }
+  }
+#endif
+}
+
+template<typename scalar_t>
+static void apply_orgqr(Tensor& self, const Tensor& tau, int64_t m, int64_t n_columns,
+                        int64_t k, std::vector<int64_t>& infos) {
+#ifndef USE_LAPACK
+  AT_ERROR("qr: LAPACK library not found in compilation");
+#else
+  auto self_data = self.data<scalar_t>();
+  auto tau_data = tau.data<scalar_t>();
+  auto self_matrix_stride = matrixStride(self);
+  auto tau_stride = tau.size(-1);
+  auto batch_size = batchCount(self);
+
+  int info;
+  // Run once, first to get the optimum work size.
+  // Since we deal with batches of matrices with the same dimensions, doing this outside
+  // the loop saves (batch_size - 1) workspace queries which would provide the same result
+  // and (batch_size - 1) calls to allocate and deallocate workspace using at::empty()
+  int lwork = -1;
+  scalar_t wkopt;
+  lapackOrgqr<scalar_t>(m, n_columns, k, self_data, m, tau_data, &wkopt, lwork, &info);
+  lwork = static_cast<int>(wkopt);
+  Tensor work = at::empty({lwork}, self.options());
+
+  for (int64_t i = 0; i < batch_size; i++) {
+    scalar_t* self_working_ptr = &self_data[i * self_matrix_stride];
+    scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
+
+    // now compute the actual Q
+    lapackOrgqr<scalar_t>(m, n_columns, k, self_working_ptr, m, tau_working_ptr, work.data<scalar_t>(), lwork, &info);
+    infos[i] = info;
+    if (info != 0) {
+      return;
+    }
+  }
+#endif
+}
+
+std::tuple<Tensor, Tensor> _qr_helper_cpu(const Tensor& self, bool some) {
+  std::vector<int64_t> infos(batchCount(self), 0);
+  int64_t m = self.size(-2), n = self.size(-1);
+
+  // Setup inputs for apply_geqrf
+  auto self_sizes = self.sizes().vec();
+  self_sizes.pop_back();
+  self_sizes[self.dim() - 2] = std::min(m, n);
+  auto tau_working_copy = at::empty(self_sizes, self.options());
+  Tensor q_working_copy;
+
+  // Setup input geometry for apply_orgqr
+  std::vector<int64_t> q_sizes, q_strides;
+  int64_t n_columns_q;
+  Tensor R;
+  std::tie(q_sizes, q_strides, n_columns_q) = _compute_geometry_for_Q(self, some);
+
+  // If there are no elements, then we simply return a pair of tensors of required dimensions
+  if (self.numel() == 0) {
+    // Fix the number of columns of q appropriately
+    q_sizes[self.dim() - 1] = n_columns_q;
+    q_working_copy = at::eye(q_sizes[self.dim() - 2], q_sizes[self.dim() - 1], self.options());
+    q_working_copy = q_working_copy.expand_as(q_working_copy);
+
+    // We repurpose the same q_sizes for R
+    // Fix the number of rows and columns of q_working_copy appropriately
+    q_sizes[self.dim() - 1] = n;
+    q_sizes[self.dim() - 2] = n_columns_q;
+    R = at::empty(q_sizes, self.options());
+    return std::make_tuple(q_working_copy, R);
+  }
+
+  // First perform GEQRF for R and TAU (the elementary reflectors)
+  // We will need to generate R from the upper triangular matrix from the
+  // matrix input to GEQRF.
+  q_working_copy = at::empty_strided(q_sizes, q_strides, self.options());
+  q_working_copy.narrow(-1, 0, n).copy_(self);
+
+  AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "qr_cpu", [&]{
+    apply_geqrf<scalar_t>(q_working_copy, tau_working_copy, m, n, infos);
+  });
+  if (self.dim() > 2) {
+    batchCheckErrors(infos, "qr_cpu");
+  } else {
+    singleCheckErrors(infos[0], "qr_cpu");
+  }
+
+  R = q_working_copy.slice(-2, 0, n_columns_q).slice(-1, 0, n).triu();
+
+  // Next perform ORGQR for Q using the results (both raw R and TAU) from GEQRF
+  AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "qr_cpu", [&]{
+    apply_orgqr<scalar_t>(q_working_copy, tau_working_copy, m, n_columns_q, std::min(m, n), infos);
+  });
+  if (self.dim() > 2) {
+    batchCheckErrors(infos, "qr_cpu");
+  } else {
+    singleCheckErrors(infos[0], "qr_cpu");
+  }
+  return std::make_tuple(q_working_copy.narrow_copy(-1, 0, n_columns_q), R);
+}
+
+std::tuple<Tensor,Tensor> qr(const Tensor& self, bool some) {
+  TORCH_CHECK(self.dim() >= 2,
+              "self should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
+  return at::_qr_helper(self, some);
+}
+
+std::tuple<Tensor&,Tensor&> qr_out(Tensor& Q, Tensor& R, const Tensor& self, bool some) {
+  Tensor Q_tmp, R_tmp;
+  std::tie(Q_tmp, R_tmp) = at::_qr_helper(self, some);
+  Q.resize_as_(Q_tmp).copy_(Q_tmp);
+  R.resize_as_(R_tmp).copy_(R_tmp);
+  return std::tuple<Tensor&, Tensor&>(Q, R);
+}
+
 }}  // namespace at::native
diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h
@@ -1,5 +1,6 @@
 #include <ATen/ATen.h>
 #include <ATen/ExpandUtils.h>
+#include <ATen/TensorUtils.h>
 #include <limits>
 #include <sstream>
 
@@ -187,4 +188,30 @@ static inline std::tuple<Tensor,Tensor> _linear_solve_broadcast_args(const Tenso
   return std::make_tuple(arg1_broadcasted, arg2_broadcasted);
 }
 
+// Function to compute sizes, strides and the extra columns for the Q matrix in the QR Decomposition
+static inline std::tuple<std::vector<int64_t>,
+                         std::vector<int64_t>,
+                         int64_t> _compute_geometry_for_Q(const Tensor& input, bool some) {
+  int64_t m = input.size(-2), n = input.size(-1);
+  int64_t n_columns_q;
+
+  // We need to compute the required size of Q based on the `some` option
+  auto q_sizes = input.sizes().vec();
+  if (!some && m > n) {
+    q_sizes[input.dim() - 1] = m;
+    n_columns_q = m;
+  } else {
+    q_sizes[input.dim() - 1] = n;
+    n_columns_q = std::min(m, n);
+  }
+  auto q_strides = at::detail::defaultStrides(q_sizes);
+
+  // Q should be a column-major or a batch of column-major matrices
+  // ... x m x n will have strides: ...., n, 1
+  // We require: ...., 1, m
+  q_strides[input.dim() - 1] = m;
+  q_strides[input.dim() - 2] = 1;
+  return std::make_tuple(q_sizes, q_strides, n_columns_q);
+}
+
 }}  // namespace at::native