Closed
Changes from all commits (38 commits)
1090e20
Moved Gesv to ATen
animesht Jul 24, 2018
2144c29
Removed TH code, added CUDA support
animesht Jul 24, 2018
f64b620
Fixed CUDA support, removed gesv from THC, fixed autograd+tests
animesht Jul 25, 2018
e9cd831
Used options().dtype(kInt)
animesht Jul 26, 2018
333b230
addressed comments - simplified gesv_single
animesht Jul 26, 2018
152f0cb
Fixed autograd
animesht Jul 26, 2018
e20d906
Addressed comments
animesht Jul 27, 2018
83ac5f0
Fixed copying for output Tensors
animesht Jul 27, 2018
b0f5845
Use contiguous Tensor while copying B
animesht Jul 27, 2018
578193e
Fixed view/contiguous stuff again...
animesht Jul 27, 2018
3eb3450
Make A contiguous before copy
animesht Jul 28, 2018
62ffda1
Handle Tensor reuse
animesht Jul 29, 2018
5560826
Finally fixed the last issue with torch.gesv(b, A, out=(b,A))
animesht Jul 30, 2018
ff342bb
Addressed comments
animesht Jul 31, 2018
a4ae861
Use data_ptr() instead of address for comparison
animesht Jul 31, 2018
8c88229
Revert to address check, add missing case
animesht Aug 2, 2018
a43cc74
Use at::optional, out-of-place transpose
animesht Aug 3, 2018
b68c6f1
Fixed re-entrant safeness
animesht Aug 4, 2018
fdd2bd2
Revert error check, simplify temps, try to fix MSVC/windows build
animesht Aug 6, 2018
e441662
Merge branch 'master' of https://github.com/animesht/pytorch
animesht Aug 6, 2018
2a3644b
Fixes for MSVC and re-entrant safeness
animesht Aug 7, 2018
91de09f
Simplified logic, added notes
animesht Aug 8, 2018
bfb323c
Addressed comments
animesht Aug 9, 2018
854647e
Addressed comments
animesht Aug 9, 2018
36dd265
Fix error message
animesht Aug 10, 2018
419cdc8
renamed vars, pass infos by reference
animesht Aug 13, 2018
72c6afc
Fix formatting issues with %lld format specifier
animesht Aug 13, 2018
5ea2e77
Revert pass by reference
animesht Aug 13, 2018
0838375
Refactored
animesht Aug 17, 2018
f0e0739
Fixed typo
animesht Aug 17, 2018
c1bd0a0
Fix indent
animesht Aug 17, 2018
a01c96e
fix jenkins
animesht Aug 18, 2018
dac0811
fix jenkins
animesht Aug 18, 2018
c980e34
Addressed comments
animesht Aug 23, 2018
1bb8d22
Addressed comments
animesht Aug 23, 2018
5341fee
fixed typo
animesht Aug 23, 2018
5973c4e
Addressed comments
animesht Aug 28, 2018
6c573ec
Added tests
animesht Sep 6, 2018
21 changes: 0 additions & 21 deletions aten/src/ATen/Declarations.cwrap
@@ -2938,27 +2938,6 @@
- THTensor* tensor1
- THTensor* tensor2
]]
[[
name: _gesv_single
cname: gesv
types:
- Float
- Double
backends:
- CPU
- CUDA
variants:
- method
- function
return: argument 0,1
arguments:
- arg: THTensor* solution
output: True
- arg: THTensor* lu
output: True
- THTensor* self
- THTensor* A
]]
[[
name: gels
types:
60 changes: 48 additions & 12 deletions aten/src/ATen/native/Gesv.cpp
@@ -46,7 +46,7 @@ template<> void lapackGesv<double>(
template <typename scalar_t>
static void applyGesv(Tensor& b, Tensor& A, std::vector<int64_t> infos) {
#ifndef USE_LAPACK
AT_ERROR("gesv: LAPACK library not found in compilation");
AT_ERROR("gesv : Lapack library not found in compile time");
#endif
auto A_data = A.data<scalar_t>();
auto b_data = b.data<scalar_t>();
@@ -57,7 +57,7 @@ static void applyGesv(Tensor& b, Tensor& A, std::vector<int64_t> infos) {
auto n = A.size(-2);
auto nrhs = b.size(-1);

auto ipiv = at::empty({n}, b.type().toScalarType(kInt));
auto ipiv = at::empty({n}, b.options().dtype(kInt));

for (int64_t i = 0; i < batch_size; i++) {
int info;
@@ -72,6 +72,37 @@ static void applyGesv(Tensor& b, Tensor& A, std::vector<int64_t> infos) {
}
}

std::tuple<Tensor&,Tensor&> _gesv_single_out_cpu(
Tensor& sol, Tensor& lu,
const Tensor& self, const Tensor& A) {
#ifndef USE_LAPACK
AT_ERROR("gesv : Lapack library not found in compile time");
#endif
int info = 0;
Tensor temp_sol;
Tensor temp_lu;
auto& A_tensor = prepareTensorsForLapack(A, lu, temp_lu);
auto& b_tensor = prepareTensorsForLapack(self, sol, temp_sol);

AT_DISPATCH_FLOATING_TYPES(self.type(), "gesv", [&]{
const int64_t n = sol.size(0);
const int64_t nrhs = sol.size(1);
auto A_ptr = A_tensor.data<scalar_t>();
auto b_ptr = b_tensor.data<scalar_t>();
auto ipiv = at::empty({n}, sol.options().dtype(kInt));
lapackGesv<scalar_t>(n, nrhs, A_ptr, n, ipiv.data<int>(), b_ptr, n, &info);
});
checkErrors({info});

if (temp_sol.defined()) {
sol.copy_(temp_sol);
}
if (temp_lu.defined()) {
lu.copy_(temp_lu);
}
return std::tuple<Tensor&, Tensor&>(sol, lu);
}

std::tuple<Tensor,Tensor> _gesv_helper_cpu(const Tensor& self, const Tensor& A) {
std::vector<int64_t> infos(batchCount(A), 0);
auto A_working_copy = cloneBatchedColumnMajor(A);
@@ -83,17 +114,21 @@ std::tuple<Tensor,Tensor> _gesv_helper_cpu(const Tensor& self, const Tensor& A)
return std::tuple<Tensor,Tensor>(b_working_copy, A_working_copy);
}

std::tuple<Tensor,Tensor> _gesv_single(const Tensor& self, const Tensor& A) {
auto sol = self.type().tensor();
auto lu = self.type().tensor();
return self.type()._gesv_single_out(sol, lu, self, A);
}

// Supports arbitrary batch dimensions for self and A
std::tuple<Tensor,Tensor> gesv(const Tensor& self, const Tensor& A) {
if (self.dim() <= 2 && A.dim() <= 2) {
// TODO: #7102: It's not necessary to have gesv (single) bindings for both
// TH and ATen. We should remove the TH gesv bindings, especially
// since the lapackGesv function is already in ATen.
bool batched = !(self.dim() <= 2 && A.dim() <= 2);
checkInputs(self, A, batched);

if (!batched) {
return at::_gesv_single(self, A);
}

checkInputs(self, A);

// broadcast the batch dimensions of self and A.
IntList self_batch_sizes(self.sizes().data(), self.ndimension() - 2);
IntList A_batch_sizes(A.sizes().data(), A.ndimension() - 2);
@@ -114,13 +149,14 @@ std::tuple<Tensor,Tensor> gesv(const Tensor& self, const Tensor& A) {
}

std::tuple<Tensor&,Tensor&> gesv_out(
Tensor& solution, Tensor& lu, const Tensor& self, const Tensor& A) {
Tensor& sol, Tensor& lu, const Tensor& self, const Tensor& A) {
if (self.dim() > 2 || A.dim() > 2) {
AT_ERROR("torch.gesv() with the `out` keyword does not support batching. "
"b.dim() (%lld) and A.dim() (%lld) must both be 2.",
(long long)self.dim(), (long long)A.dim());
"b.dim() (", self.dim(), ") and A.dim() (", A.dim(),
") must both be 2.");
}
return at::_gesv_single_out(solution, lu, self, A);

return self.type()._gesv_single_out(sol, lu, self, A);
}

}} // namespace at::native
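
For reference (not part of the diff): a minimal usage sketch of the dispatch implemented above. Shapes and tolerance are illustrative assumptions; 2-D inputs go through at::_gesv_single, while inputs with batch dimensions take the broadcasting batched path. torch.gesv was the user-facing binding at the time of this PR (later superseded by torch.solve).

    import torch

    A = torch.randn(3, 3)
    b = torch.randn(3, 1)

    # 2-D inputs: single-matrix path (at::_gesv_single)
    x, lu = torch.gesv(b, A)
    print(torch.allclose(A.mm(x), b, atol=1e-5))  # expected True for well-conditioned A

    # inputs with batch dimensions: batched path; batch dims of b and A are broadcast
    A_batch = torch.randn(4, 3, 3)
    b_batch = torch.randn(4, 3, 1)
    x_batch, lu_batch = torch.gesv(b_batch, A_batch)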
94 changes: 80 additions & 14 deletions aten/src/ATen/native/Gesv.h
@@ -1,30 +1,96 @@
#include <utility>
#include "ATen/ATen.h"

namespace at { namespace native {

static inline void checkInputs(const Tensor& self, const Tensor& A) {
if (A.size(-1) != A.size(-2)) {
AT_ERROR("A must be batches of square matrices, "
"but they are %lld by %lld matrices",
(long long)A.size(-1), (long long)A.size(-2));
static inline bool isTransposeContiguous(Tensor& self) {
return self.dim() == 2 &&
self.stride(0) == 1 &&
self.stride(1) == self.size(0);
}

/* gesv takes (self, A) and returns (sol, lu).
* (i) output tensors (sol, lu) may be same as input tensors (self, A)
* (ii) for 2D matrices, .t_() represents their column-major format
*
* Before passing pointers to Lapack, we need to ensure that these pointers
* represent Fortran-contiguous tensors in column-major format
*
* Cases:
* 1) `out` has correct shape but elements do not form a contiguous
* chunk of memory. Since shape is correct, we don't resize_ it. Instead, we
* clone the input tensor into a buffer, use the buffer for Lapack and finally
* copy the buffer to the output tensor.
*
* 2) out.t() is contiguous:
* (i) &in == &out: use out.data() as is. Do nothing
* (ii) &in != &out: copy in.t() to out.t()
* 3) out.t() is not contiguous:
* - resize_ should fix contiguity/size issues
* (i) &in == &out: copy in.t().clone() to out (same tensor)
* (ii) &in != &out: copy in.t() to out
*/
static inline Tensor& prepareTensorsForLapack(
const Tensor& in, Tensor& out, Tensor& temp) {
int64_t x = in.size(0);
int64_t y = (in.dim() == 1) ? 1 : in.size(1);
bool out_tc = isTransposeContiguous(out);
bool out_correct_shape =
out.dim() == 2 && out.size(0) == x && out.size(1) == y;

// view potential 1D `in` as 2D
auto in_t = in.view({x, y}).t_();

if (!out_tc && !out.is_contiguous() && out_correct_shape) {
temp = in_t.clone().t_();
} else if (out_tc && &in != &out) {
out.t().resize_({y, x}).copy_(in_t);
} else if (!out_tc) {
out.resize_({y, x});
if (&in == &out) {
out.copy_(in_t.clone()).t_();
} else {
out.copy_(in_t).t_();
}
}
if (A.size(-1) != self.size(-2)) {
AT_ERROR("Incompatible matrix sizes for matmul: each A "
"matrix is %llu by %lld but each b matrix is %lld by %lld.",
(long long)A.size(-1), (long long)A.size(-1),
(long long)self.size(-2), (long long)self.size(-1));
// return ref to usable tensor for Lapack
return temp.defined() ? temp : out;
}

static inline void checkInputs(const Tensor& self, const Tensor& A, bool batched) {
if (batched) {
if (A.size(-1) != A.size(-2)) {
AT_ERROR("A must be batches of square matrices, "
"but they are ", A.size(-1), " by ", A.size(-2), " matrices");
} else if (A.size(-1) != self.size(-2)) {
AT_ERROR("incompatible matrix sizes for matmul: each a "
"matrix is ", A.size(-1), " by ", A.size(-1),
" but each b matrix is ", self.size(-2), " by ", self.size(-1));
}
} else {
if (A.dim() != 2) {
AT_ERROR("A should have 2 dimensions, but has ", A.dim());
} else if (self.dim() != 1 && self.dim() != 2) {
AT_ERROR("B should have 1 or 2 dimensions, but has ", self.dim());
} else if (A.size(0) != A.size(1)) {
AT_ERROR("A must be a square matrix, but is ",
A.size(0), " by ", A.size(1));
} else if (A.size(0) != self.size(0)) {
AT_ERROR("A,B size incompatible - A has ", A.size(0),
" rows, B has ", self.size(0), " cols");
}
}
}

static inline void checkErrors(std::vector<int64_t> infos) {
for (size_t i = 0; i < infos.size(); i++) {
auto info = infos[i];
if (info < 0) {
AT_ERROR("gesv: For batch %lld: Argument %lld has illegal value",
(long long)i, -info);
AT_ERROR("gesv: For batch ", i, ": Argument ",
-info, " has illegal value");
} else if (info > 0) {
AT_ERROR("gesv: For batch %lld: U(%lld,%lld) is zero, singular U.",
(long long)i, info, info);
AT_ERROR("gesv: For batch ", i, ": U(", info, ",", info,
") is zero, singular U.");
}
}
}
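
For reference (not part of the diff): a rough sketch of the out-tensor situations that prepareTensorsForLapack above handles, including the input-reuse pattern torch.gesv(b, A, out=(b, A)) mentioned in the commit history. Shapes are illustrative assumptions.

    import torch

    A = torch.randn(3, 3)
    b = torch.randn(3, 2)

    # preallocated outputs: the solution and the LU factors are written into them
    sol = torch.empty(3, 2)
    lu = torch.empty(3, 3)
    torch.gesv(b, A, out=(sol, lu))

    # reusing the inputs as outputs (the &in == &out cases documented above);
    # afterwards b holds the solution and A holds the LU factorization
    torch.gesv(b, A, out=(b, A))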
54 changes: 54 additions & 0 deletions aten/src/ATen/native/cuda/Gesv.cu
@@ -19,6 +19,28 @@ namespace at {
namespace native {

#ifdef USE_MAGMA

template<class scalar_t>
void magmaGesv(
int64_t n, int64_t nrhs, scalar_t* A_data, int64_t lda,
int* ipiv, scalar_t* B_data, int64_t ldb, int* info) {
AT_ERROR("magma: gesv only takes float or double Tensors");
}

template<>
void magmaGesv<float>(
int64_t n, int64_t nrhs, float* A_data, int64_t lda,
int* ipiv, float* B_data, int64_t ldb, int* info) {
magma_sgesv_gpu(n, nrhs, A_data, lda, ipiv, B_data, ldb, info);
}

template<>
void magmaGesv<double>(
int64_t n, int64_t nrhs, double* A_data, int64_t lda,
int* ipiv, double* B_data, int64_t ldb, int* info) {
magma_dgesv_gpu(n, nrhs, A_data, lda, ipiv, B_data, ldb, info);
}

template<class scalar_t>
void magmaGesvBatched(
magma_int_t n, magma_int_t nrhs, scalar_t** dA_array, magma_int_t ldda,
@@ -138,6 +160,38 @@ std::tuple<Tensor,Tensor> _gesv_helper_cuda(const Tensor& self, const Tensor& A)
return std::tuple<Tensor,Tensor>(b_working_copy, A_working_copy);
}

std::tuple<Tensor&,Tensor&> _gesv_single_out_cuda(Tensor& sol, Tensor& lu,
const Tensor& self, const Tensor& A) {
#ifndef USE_MAGMA
AT_ERROR("gesv: MAGMA library not found in "
"compilation. Please rebuild with MAGMA.");
#else
int info = 0;
int* ipiv;
Tensor temp_sol;
Tensor temp_lu;
auto& A_tensor = prepareTensorsForLapack(A, lu, temp_lu);
auto& b_tensor = prepareTensorsForLapack(self, sol, temp_sol);

AT_DISPATCH_FLOATING_TYPES(self.type(), "gesv", [&]{
const int64_t n = sol.size(0);
const int64_t nrhs = sol.size(1);
auto A_ptr = A_tensor.data<scalar_t>();
auto b_ptr = b_tensor.data<scalar_t>();
ALLOCATE_ARRAY(ipiv, int, n, sol);
magmaGesv<scalar_t>(n, nrhs, A_ptr, n, ipiv, b_ptr, n, &info);
});
checkErrors({info});

if (temp_sol.defined()) {
sol.copy_(temp_sol);
}
if (temp_lu.defined()) {
lu.copy_(temp_lu);
}
return std::tuple<Tensor&,Tensor&>(sol, lu);
#endif
}
}} // namespace at::native

#undef ALLOCATE_ARRAY
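
For reference (not part of the diff): the CUDA path mirrors the CPU one but calls MAGMA. A small sketch, assuming a CUDA device and a MAGMA-enabled build (otherwise the AT_ERROR above is raised):

    import torch

    if torch.cuda.is_available():
        A = torch.randn(3, 3, device='cuda')
        b = torch.randn(3, 1, device='cuda')
        # 2-D CUDA inputs dispatch to _gesv_single_out_cuda (MAGMA)
        x, lu = torch.gesv(b, A)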
8 changes: 8 additions & 0 deletions aten/src/ATen/native/native_functions.yaml
@@ -720,6 +720,14 @@
CPU: _gesv_helper_cpu
CUDA: _gesv_helper_cuda

- func: _gesv_single(Tensor self, Tensor A) -> (Tensor, Tensor)

- func: _gesv_single_out(Tensor solution, Tensor lu, Tensor self, Tensor A) -> (Tensor, Tensor)
variants: function
dispatch:
CPU: _gesv_single_out_cpu
CUDA: _gesv_single_out_cuda

- func: group_norm(Tensor input, int64_t num_groups, Tensor? weight={}, Tensor? bias={}, double eps=1e-5, bool cudnn_enabled=True) -> Tensor
variants: function

17 changes: 0 additions & 17 deletions aten/src/TH/generic/THLapack.cpp
@@ -3,8 +3,6 @@
#else


TH_EXTERNC void dgesv_(int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info);
TH_EXTERNC void sgesv_(int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info);
TH_EXTERNC void dtrtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info);
TH_EXTERNC void strtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info);
TH_EXTERNC void dgels_(char *trans, int *m, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, double *work, int *lwork, int *info);
@@ -37,21 +35,6 @@ TH_EXTERNC void spstrf_(char *uplo, int *n, float *a, int *lda, int *piv, int *r
TH_EXTERNC void dpstrf_(char *uplo, int *n, double *a, int *lda, int *piv, int *rank, double *tol, double *work, int *info);


/* Compute the solution to a real system of linear equations A * X = B */
void THLapack_(gesv)(int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int* info)
{
#ifdef USE_LAPACK
#if defined(TH_REAL_IS_DOUBLE)
dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info);
#else
sgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info);
#endif
#else
THError("gesv : Lapack library not found in compile time\n");
#endif
return;
}

/* Solve a triangular system of the form A * X = B or A^T * X = B */
void THLapack_(trtrs)(char uplo, char trans, char diag, int n, int nrhs, real *a, int lda, real *b, int ldb, int* info)
{
2 changes: 0 additions & 2 deletions aten/src/TH/generic/THLapack.h
@@ -2,8 +2,6 @@
#define TH_GENERIC_FILE "generic/THLapack.h"
#else

/* AX=B */
TH_API void THLapack_(gesv)(int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int* info);
/* Solve a triangular system of the form A * X = B or A^T * X = B */
TH_API void THLapack_(trtrs)(char uplo, char trans, char diag, int n, int nrhs, real *a, int lda, real *b, int ldb, int* info);
/* ||AX-B|| */