pytorch
diff --git a/‎aten/src/ATen/Declarations.cwrap‎
Lines changed: 0 additions & 27 deletions b/‎aten/src/ATen/Declarations.cwrap‎
Lines changed: 0 additions & 27 deletions
diff --git a/‎aten/src/ATen/native/BatchLinearAlgebra.cpp‎
Lines changed: 100 additions & 0 deletions b/‎aten/src/ATen/native/BatchLinearAlgebra.cpp‎
Lines changed: 100 additions & 0 deletions
diff --git a/‎aten/src/ATen/native/LinearAlgebraUtils.h‎
Lines changed: 14 additions & 3 deletions b/‎aten/src/ATen/native/LinearAlgebraUtils.h‎
Lines changed: 14 additions & 3 deletions
diff --git a/‎aten/src/ATen/native/cuda/BatchLinearAlgebra.cu‎
Lines changed: 112 additions & 1 deletion b/‎aten/src/ATen/native/cuda/BatchLinearAlgebra.cu‎
Lines changed: 112 additions & 1 deletion
diff --git a/‎aten/src/ATen/native/native_functions.yaml‎
Lines changed: 5 additions & 5 deletions b/‎aten/src/ATen/native/native_functions.yaml‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎aten/src/TH/generic/THLapack.cpp‎
Lines changed: 0 additions & 17 deletions b/‎aten/src/TH/generic/THLapack.cpp‎
Lines changed: 0 additions & 17 deletions
diff --git a/‎aten/src/TH/generic/THLapack.h‎
Lines changed: 0 additions & 2 deletions b/‎aten/src/TH/generic/THLapack.h‎
Lines changed: 0 additions & 2 deletions
@@ -2323,33 +2323,6 @@
     - THTensor* self
     - THTensor* A
 ]]
-[[
-  name: _th_symeig
-  cname: syev
-  types:
-    - Float
-    - Double
-  backends:
-    - CPU
-    - CUDA
-  variants:
-    - function
-  return: argument 0,1
-  arguments:
-    - arg: THTensor* res1
-      output: True
-    - arg: THTensor* res2
-      output: True
-    - THTensor* self
-    - arg: bool eigenvectors
-      if_true: V
-      if_false: N
-      default: N
-    - arg: bool upper
-      if_true: U
-      if_false: L
-      default: U
-]]
 [[
   name: _th_eig
   cname: geev
 
@@ -46,6 +46,10 @@ extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, float *w
 // orgqr
 extern "C" void dorgqr_(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info);
 extern "C" void sorgqr_(int *m, int *n, int *k, float *a, int *lda, float *tau, float *work, int *lwork, int *info);
+
+// syev
+extern "C" void dsyev_(char *jobz, char *uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *info);
+extern "C" void ssyev_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *info);
 #endif
 
 namespace at {
@@ -93,6 +97,11 @@ void lapackOrgqr(int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scala
   AT_ERROR("orgqr only takes float or double Tensors");
 }
 
+template<class scalar_t>
+void lapackSymeig(char jobz, char uplo, int n, scalar_t *a, int lda, scalar_t *w, scalar_t *work, int lwork, int *info) {
+  AT_ERROR("symeig only takes float or double Tensors");
+}
+
 #ifdef USE_LAPACK
 template<> void lapackSolve<double>(int n, int nrhs, double *a, int lda, int *ipiv, double *b, int ldb, int *info) {
   dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info);
@@ -157,6 +166,14 @@ template<> void lapackOrgqr<double>(int m, int n, int k, double *a, int lda, dou
 template<> void lapackOrgqr<float>(int m, int n, int k, float *a, int lda, float *tau, float *work, int lwork, int *info) {
   sorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info);
 }
+
+template<> void lapackSymeig<double>(char jobz, char uplo, int n, double *a, int lda, double *w, double *work, int lwork, int *info) {
+  dsyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info);
+}
+
+template<> void lapackSymeig<float>(char jobz, char uplo, int n, float *a, int lda, float *w, float *work, int lwork, int *info) {
+  ssyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info);
+}
 #endif
 
 // Below of the definitions of the functions operating on a batch that are going to be dispatched
@@ -833,4 +850,87 @@ std::tuple<Tensor&,Tensor&> qr_out(Tensor& Q, Tensor& R, const Tensor& self, boo
   return std::tuple<Tensor&, Tensor&>(Q, R);
 }
 
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ symeig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template <typename scalar_t>
+static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool upper, std::vector<int64_t>& infos) {
+#ifndef USE_LAPACK
+  AT_ERROR("symeig: LAPACK library not found in compilation");
+#else
+  auto self_data = self.data<scalar_t>();
+  auto eigvals_data = eigvals.data<scalar_t>();
+  auto self_matrix_stride = matrixStride(self);
+  auto eigvals_stride = eigvals.size(-1);
+  auto batch_size = batchCount(self);
+  auto n = self.size(-1);
+
+  char uplo = upper ? 'U' : 'L';
+  char jobz = eigenvectors ? 'V' : 'N';
+
+  int info;
+  // Run once, first to get the optimum work size.
+  // Since we deal with batches of matrices with the same dimensions, doing this outside
+  // the loop saves (batch_size - 1) workspace queries which would provide the same result
+  // and (batch_size - 1) calls to allocate and deallocate workspace using at::empty()
+  int lwork = -1;
+  scalar_t wkopt;
+  lapackSymeig<scalar_t>(jobz, uplo, n, self_data, n, eigvals_data, &wkopt, lwork, &info);
+  lwork = static_cast<int>(wkopt);
+  Tensor work = at::empty({lwork}, self.options());
+
+  for (int64_t i = 0; i < batch_size; i++) {
+    scalar_t* self_working_ptr = &self_data[i * self_matrix_stride];
+    scalar_t* eigvals_working_ptr = &eigvals_data[i * eigvals_stride];
+
+    // now compute the eigenvalues and the eigenvectors (optionally)
+    lapackSymeig<scalar_t>(jobz, uplo, n, self_working_ptr, n, eigvals_working_ptr, work.data<scalar_t>(), lwork, &info);
+    infos[i] = info;
+    if (info != 0) {
+      return;
+    }
+  }
+#endif
+}
+
+std::tuple<Tensor, Tensor> _symeig_helper_cpu(const Tensor& self, bool eigenvectors, bool upper) {
+  std::vector<int64_t> infos(batchCount(self), 0);
+
+  auto self_sizes = self.sizes().vec();
+  self_sizes.pop_back();
+  auto eigvals = at::empty(self_sizes, self.options());
+
+  if (self.numel() == 0) {
+    return std::tuple<Tensor, Tensor>(eigvals, at::empty_like(self));
+  }
+
+  auto self_working_copy = cloneBatchedColumnMajor(self);
+  AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "symeig_cpu", [&]{
+    apply_symeig<scalar_t>(self_working_copy, eigvals, eigenvectors, upper, infos);
+  });
+
+  if (!eigenvectors) {
+    self_working_copy.zero_();
+  }
+  if (self.dim() > 2) {
+    batchCheckErrors(infos, "symeig_cpu");
+  } else {
+    singleCheckErrors(infos[0], "symeig_cpu");
+  }
+  return std::tuple<Tensor, Tensor>(eigvals, self_working_copy);
+}
+
+std::tuple<Tensor, Tensor> symeig(const Tensor& self, bool eigenvectors, bool upper) {
+  squareCheckInputs(self);
+  return at::_symeig_helper(self, eigenvectors, upper);
+}
+
+std::tuple<Tensor&, Tensor&> symeig_out(Tensor& vals, Tensor& vecs, const Tensor& self, bool eigenvectors, bool upper) {
+  squareCheckInputs(self);
+  Tensor vals_tmp, vecs_tmp;
+  std::tie(vals_tmp, vecs_tmp) = at::_symeig_helper(self, eigenvectors, upper);
+  vals.resize_as_(vals_tmp).copy_(vals_tmp);
+  vecs.resize_as_(vecs_tmp).copy_(vecs_tmp);
+  return std::tuple<Tensor&, Tensor&>(vals, vecs);
+}
+
 }}  // namespace at::native
@@ -3,6 +3,7 @@
 #include <ATen/TensorUtils.h>
 #include <limits>
 #include <sstream>
+#include <cstring>
 
 namespace at { namespace native {
 
@@ -110,7 +111,7 @@ static inline void linearSolveCheckInputs(const Tensor& self, const Tensor& A) {
            " but each b matrix is ", self.size(-2), " by ", self.size(-1));
 }
 
-// Validates input shapes for operations on batches of square matrices (inverse, cholesky, lu)
+// Validates input shapes for operations on batches of square matrices (inverse, cholesky, lu, symeig)
 static inline void squareCheckInputs(const Tensor& self) {
   TORCH_CHECK(self.size(-1) == self.size(-2),
            "A must be batches of square matrices, "
@@ -145,7 +146,12 @@ static inline void batchCheckErrors(const Tensor& infos, const char* name) {
     if (info < 0) {
       AT_ERROR(name, ": For batch ", i, ": Argument ", -info, " has illegal value");
     } else if (info > 0) {
-      AT_ERROR(name, ": For batch ", i, ": U(", info, ",", info, ") is zero, singular U.");
+      if (strstr(name, "symeig")) {
+        AT_ERROR(name, ": For batch ", i, ": the algorithm failed to converge; ", info,
+                 " off-diagonal elements of an intermediate tridiagonal form did not converge to zero.")
+      } else {
+        AT_ERROR(name, ": For batch ", i, ": U(", info, ",", info, ") is zero, singular U.");
+      }
     }
   }
 }
@@ -158,7 +164,12 @@ static inline void singleCheckErrors(int64_t info, const char* name) {
   if (info < 0) {
     AT_ERROR(name, ": Argument ", -info, " has illegal value");
   } else if (info > 0) {
-    AT_ERROR(name, ": U(", info, ",", info, ") is zero, singular U.");
+    if (strstr(name, "symeig")) {
+      AT_ERROR(name, ": the algorithm failed to converge; ", info,
+               " off-diagonal elements of an intermediate tridiagonal form did not converge to zero.")
+    } else {
+      AT_ERROR(name, ": U(", info, ",", info, ") is zero, singular U.");
+    }
   }
 }
 
 
@@ -143,7 +143,15 @@ template<class scalar_t>
 void magmaOrgqr(
     magma_int_t m, magma_int_t n, magma_int_t k, scalar_t* dA,
     magma_int_t ldda, scalar_t* tau, scalar_t* dT, magma_int_t nb, magma_int_t* info) {
-  AT_ERROR("orgqr only takes float or doule Tensors");
+  AT_ERROR("orgqr only takes float or double Tensors");
+}
+
+template<class scalar_t>
+void magmaSymeig(
+    magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, scalar_t* dA, magma_int_t ldda,
+    scalar_t* w, scalar_t* wA, magma_int_t ldwa, scalar_t* work, magma_int_t lwork,
+    magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) {
+  AT_ERROR("symeig only takes float or double Tensors");
 }
 
 template<>
@@ -405,6 +413,22 @@ void magmaOrgqr<float>(
     float* tau, float* dT, magma_int_t nb, magma_int_t* info) {
   magma_sorgqr_gpu(m, n, k, dA, ldda, tau, dT, nb, info);
 }
+
+template<>
+void magmaSymeig<double>(
+    magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, double* dA, magma_int_t ldda,
+    double* w, double* wA, magma_int_t ldwa, double* work, magma_int_t lwork,
+    magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) {
+  magma_dsyevd_gpu(jobz, uplo, n, dA, ldda, w, wA, ldwa, work, lwork, iwork, liwork, info);
+}
+
+template<>
+void magmaSymeig<float>(
+    magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, float* dA, magma_int_t ldda,
+    float* w, float* wA, magma_int_t ldwa, float* work, magma_int_t lwork,
+    magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) {
+  magma_ssyevd_gpu(jobz, uplo, n, dA, ldda, w, wA, ldwa, work, lwork, iwork, liwork, info);
+}
 #endif
 
 #define ALLOCATE_ARRAY(name, type, size, dummy_tensor) \
@@ -1123,6 +1147,93 @@ std::tuple<Tensor,Tensor> _qr_helper_cuda(const Tensor& self, bool some) {
                          r_working_copy.narrow_copy(-2, 0, n_columns_q).triu_());
 }
 
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ symeig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template <typename scalar_t>
+static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool upper, std::vector<int64_t>& infos) {
+#ifndef USE_MAGMA
+AT_ERROR("symeig: MAGMA library not found in "
+    "compilation. Please rebuild with MAGMA.");
+#else
+  auto self_data = self.data<scalar_t>();
+  auto eigvals_data = eigvals.data<scalar_t>();
+  auto self_matrix_stride = matrixStride(self);
+  auto eigvals_stride = eigvals.size(-1);
+  int64_t batch_size = batchCount(self);
+  magma_int_t n = magma_int_cast(self.size(-1), "n");
+
+  magma_uplo_t uplo = upper ? MagmaUpper : MagmaLower;
+  magma_vec_t jobz = eigenvectors ? MagmaVec : MagmaNoVec;
+
+  scalar_t* wA;
+  ALLOCATE_ARRAY(wA, scalar_t, n * n, self);
+
+  magma_int_t info;
+  // Run once, first to get the optimum work sizes.
+  // Since we deal with batches of matrices with the same dimensions, doing this outside
+  // the loop saves (batch_size - 1) workspace queries which would provide the same result
+  // and (batch_size - 1) calls to allocate and deallocate workspace using at::empty()
+  magma_int_t lwork = -1;
+  scalar_t wkopt;
+  magma_int_t liwork = -1;
+  magma_int_t iwkopt;
+  magmaSymeig<scalar_t>(jobz, uplo, n, self_data, n, eigvals_data, wA, n, &wkopt, lwork, &iwkopt, liwork, &info);
+  
+  scalar_t* work;
+  magma_int_t* iwork;
+  lwork = magma_int_cast(wkopt, "work_size");
+  liwork = magma_int_cast(iwkopt, "iwork_size");
+  ALLOCATE_ARRAY(work, scalar_t, lwork, self);
+  ALLOCATE_ARRAY(iwork, magma_int_t, liwork, self);
+
+  for (int64_t i = 0; i < batch_size; i++) {
+    scalar_t* self_working_ptr = &self_data[i * self_matrix_stride];
+    scalar_t* eigvals_working_ptr = &eigvals_data[i * eigvals_stride];
+    magmaSymeig<scalar_t>(jobz, uplo, n, self_working_ptr, n, eigvals_working_ptr,
+                          wA, n, work, lwork, iwork, liwork, &info);
+    infos[i] = info;
+    if (info != 0) {
+      return;
+    }
+  }
+#endif
+}
+
+std::tuple<Tensor, Tensor> _symeig_helper_cuda(const Tensor& self, bool eigenvectors, bool upper) {
+  std::vector<int64_t> infos(batchCount(self), 0);
+
+  auto self_sizes = self.sizes().vec();
+  self_sizes.pop_back();
+
+  // We create temporary tensors on the CPU, because tensors on the GPU
+  // cause segfault when passed to magmaSymeig. The data is later
+  // moved to the appropriate device.
+  // In the case where self.numel() == 0, we just return an empty tensor of
+  // dimensions on the CUDA (to avoid the unnecessary "to(at::kCUDA)")
+  auto eigvals_working_copy = self.numel() == 0
+                              ? at::empty(self_sizes, self.options())
+                              : at::empty(self_sizes, self.options().device(at::kCPU));
+
+  if (self.numel() == 0) {
+    return std::tuple<Tensor, Tensor>(eigvals_working_copy, at::empty_like(self));
+  }
+
+  auto self_working_copy = cloneBatchedColumnMajor(self);
+  AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "symeig_cuda", [&]{
+    apply_symeig<scalar_t>(self_working_copy, eigvals_working_copy, eigenvectors, upper, infos);
+  });
+
+  if (!eigenvectors) {
+    self_working_copy.zero_();
+  }
+  if (self.dim() > 2) {
+    batchCheckErrors(infos, "symeig_cuda");
+  } else {
+    singleCheckErrors(infos[0], "symeig_cuda");
+  }
+  return std::tuple<Tensor, Tensor>(eigvals_working_copy.to(self.device()), self_working_copy);
+}
+
 }}  // namespace at::native
 
 #undef ALLOCATE_ARRAY
@@ -3603,15 +3603,15 @@
     CUDA: _triangular_solve_helper_cuda
 
 - func: symeig(Tensor self, bool eigenvectors=False, bool upper=True, *, Tensor(a!) e, Tensor(b!) V) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
-  dispatch:
-    CPU: legacy::cpu::_th_symeig_out
-    CUDA: legacy::cuda::_th_symeig_out
 
 - func: symeig(Tensor self, bool eigenvectors=False, bool upper=True) -> (Tensor eigenvalues, Tensor eigenvectors)
   variants: method, function
+
+- func: _symeig_helper(Tensor self, bool eigenvectors, bool upper) -> (Tensor, Tensor)
+  variants: function
   dispatch:
-    CPU: legacy::cpu::_th_symeig
-    CUDA: legacy::cuda::_th_symeig
+    CPU: _symeig_helper_cpu
+    CUDA: _symeig_helper_cuda
 
 - func: eig(Tensor self, bool eigenvectors=False, *, Tensor(a!) e, Tensor(b!) v) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
   dispatch:
 
@@ -5,8 +5,6 @@
 
 TH_EXTERNC void dgels_(char *trans, int *m, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, double *work, int *lwork, int *info);
 TH_EXTERNC void sgels_(char *trans, int *m, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, float *work, int *lwork, int *info);
-TH_EXTERNC void dsyev_(char *jobz, char *uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *info);
-TH_EXTERNC void ssyev_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *info);
 TH_EXTERNC void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, double *wr, double *wi, double* vl, int *ldvl, double *vr, int *ldvr, double *work, int *lwork, int *info);
 TH_EXTERNC void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, float *wr, float *wi, float* vl, int *ldvl, float *vr, int *ldvr, float *work, int *lwork, int *info);
 TH_EXTERNC void dgesdd_(char *jobz, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *iwork, int *info);
@@ -40,21 +38,6 @@ void THLapack_(gels)(char trans, int m, int n, int nrhs, scalar_t *a, int lda, s
 #endif
 }
 
-/* Compute all eigenvalues and, optionally, eigenvectors of a real symmetric
-matrix A */
-void THLapack_(syev)(char jobz, char uplo, int n, scalar_t *a, int lda, scalar_t *w, scalar_t *work, int lwork, int *info)
-{
-#ifdef USE_LAPACK
-#if defined(TH_REAL_IS_DOUBLE)
-  dsyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info);
-#else
-  ssyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info);
-#endif
-#else
-  THError("syev : Lapack library not found in compile time\n");
-#endif
-}
-
 /* Compute for an N-by-N real nonsymmetric matrix A, the eigenvalues and,
 optionally, the left and/or right eigenvectors */
 void THLapack_(geev)(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *wr, scalar_t *wi, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, int *info)
 
@@ -4,8 +4,6 @@
 
 /* ||AX-B|| */
 TH_API void THLapack_(gels)(char trans, int m, int n, int nrhs, scalar_t *a, int lda, scalar_t *b, int ldb, scalar_t *work, int lwork, int *info);
-/* Eigenvals */
-TH_API void THLapack_(syev)(char jobz, char uplo, int n, scalar_t *a, int lda, scalar_t *w, scalar_t *work, int lwork, int *info);
 /* Non-sym eigenvals */
 TH_API void THLapack_(geev)(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *wr, scalar_t *wi, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, int *info);
 /* svd */