
Commit c857acf

Update on "[Distributed] getNumKeys API to c10d TCPStore"
This PR adds a getNumKeys API to TCPStore, which returns the number of keys currently in the store. This API will be useful for applications related to debug logging in ProcessGroupNCCL going forward. This PR also adds C++ tests for the API; Python tests are added in #45223. We will build on this functionality in the future by implementing the API for FileStore, HashStore, RedisStore, and ZeusStore.

Differential Revision: [D22985085](https://our.internmc.facebook.com/intern/diff/D22985085/)

**NOTE FOR REVIEWERS**: This PR has internal Facebook-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D22985085/)!

[ghstack-poisoned]
2 parents 9da03d9 + 00e704e commit c857acf
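For context, here is a minimal sketch of how the new key-count query might be exercised from Python. It assumes the C++ getNumKeys call is surfaced on the `TCPStore` binding as `num_keys()`, as in later PyTorch releases; the actual Python tests live in #45223, and the host/port values below are illustrative.

```python
# Minimal sketch (assumes the num_keys() Python binding for getNumKeys).
from datetime import timedelta

from torch.distributed import TCPStore

# Single-process "master" store, just for illustration.
store = TCPStore("127.0.0.1", 29500, 1, True, timedelta(seconds=30))

store.set("first_key", "first_value")
store.set("second_key", "second_value")

# Returns the number of keys currently held by the store. The store may also
# track internal bookkeeping keys (e.g. the rendezvous init key), so the
# value is not necessarily exactly the number of set() calls made here.
print(store.num_keys())
```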

File tree

177 files changed, +11515 −4853 lines changed


aten/src/ATen/LegacyTHFunctionsCPU.cpp

Lines changed: 0 additions & 255 deletions
Large diffs are not rendered by default.

aten/src/ATen/LegacyTHFunctionsCPU.h

Lines changed: 0 additions & 3 deletions
@@ -39,9 +39,6 @@ Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
 Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max);
 Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max);
 Tensor _th_trace(const Tensor & self);
-Tensor & _th_addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
-Tensor _th_addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
-Tensor & _th_addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
 std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
 std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A);
 std::tuple<Tensor &,Tensor &> _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors);

aten/src/ATen/core/boxing/KernelFunction.cpp

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, Stack*) {
 void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, Stack*) {
   TORCH_INTERNAL_ASSERT(0,
     op.operator_name(), " has kernels registered to both Math and a backend mapped to AutogradOther. "
+    "This makes the backend kernel unreachable (see Note [Ambiguity in AutogradOther kernel]). "
     "If it's intended to override Math kernel behavior, please open an issue to request a dedicated "
     "Autograd dispatch key for the backend.");
 }

aten/src/ATen/core/dispatch/OperatorEntry.cpp

Lines changed: 8 additions & 5 deletions
@@ -157,10 +157,9 @@ const KernelFunction& OperatorEntry::computeDispatchTableEntry(const c10::Dispat
 }

 bool OperatorEntry::hasKernelForDispatchKeySet(DispatchKeySet ks) const {
-  for (auto k : ks) {
-    if (kernels_.find(k) != kernels_.end()) {
-      return true;
-    }
+  TORCH_INTERNAL_ASSERT(kernels_.find(DispatchKey::Undefined) == kernels_.end());
+  for (auto& kv : kernels_) {
+    if (ks.has(kv.first)) return true;
   }
   return false;
 }
@@ -196,6 +195,9 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
   // In the past we directly call into backends(filled with catchAll) after BackendSelect.
   // Now that we first call Autograd backend keys after BackendSelect, we should fill those
   // with catchAll as well.
+  // The implementation of (2.1) & (2.3) relies on the invariant that for a given backend,
+  // `computeDispatchTableEntryWithDebug()` will be called for that backend's autograd key after the
+  // backend key. See Note [Refresh Runtime Autograd entries in dispatchTable_]
   // (3) Use fallthrough kernel that are registered as fallback.
   // (4) Use catchAll kernel if available
   // Alias Key Precedence:
@@ -272,7 +274,8 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp
   for (auto k : c10::getRuntimeDispatchKeySet(dispatch_key)) {
     updateDispatchTableEntry_(dispatcher, k);
   }
-  // Registering to backend key might affect computed entry at its Autograd backend key due to 2.2.
+  // Note [Refresh Runtime Autograd entries in dispatchTable_]
+  // Registering to backend key might affect computed entry at its Autograd backend key due to (2.1) & (2.3).
   DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key);
   updateDispatchTableEntry_(dispatcher, autograd_key);
 }
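The hasKernelForDispatchKeySet change above reverses the direction of the membership test: instead of iterating the query key set and probing kernels_, it walks the (typically small) table of registered kernels and asks whether each registered key is contained in the query set, after asserting that Undefined never appears as a registered key. A rough Python sketch of the same logic, using plain sets to stand in for DispatchKeySet and kernels_ (illustrative names only, not the actual C++ API):

```python
# Conceptual sketch only: plain Python sets stand in for DispatchKeySet and
# for OperatorEntry::kernels_; this is not the actual dispatcher code.

def has_kernel_for_dispatch_key_set(registered_kernels, query_keys):
    # Mirrors the TORCH_INTERNAL_ASSERT: Undefined must never be registered.
    assert "Undefined" not in registered_kernels
    # New direction: iterate the registered kernels and test membership in
    # the query set, rather than iterating the query set and probing the map.
    return any(key in query_keys for key in registered_kernels)

# An op with CPU and AutogradCPU kernels, queried with two different key sets:
registered = {"CPU", "AutogradCPU"}
print(has_kernel_for_dispatch_key_set(registered, {"CUDA", "AutogradCUDA"}))  # False
print(has_kernel_for_dispatch_key_set(registered, {"CPU", "BackendSelect"}))  # True
```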

aten/src/ATen/core/interned_strings.h

Lines changed: 2 additions & 0 deletions
@@ -59,6 +59,8 @@ namespace c10 {
   _(prim, Store) \
   _(prim, AutogradZero) \
   _(prim, AutogradAnyNonZero) \
+  _(prim, AutogradAllNonZero) \
+  _(prim, AutogradAllZero) \
   _(prim, Starred) \
   _(prim, TupleConstruct) \
   _(prim, TupleUnpack) \

aten/src/ATen/cuda/CUDABlas.cpp

Lines changed: 0 additions & 40 deletions
@@ -498,46 +498,6 @@ void gemv<at::BFloat16>(CUDABLAS_GEMV_ARGTYPES(at::BFloat16)) {
 }
 #endif

-namespace {
-template<typename scalar_t>
-cublasStatus_t cublasGer(const cublasHandle_t &handle, int64_t m, int64_t n, scalar_t *alpha, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy, scalar_t *a, int64_t lda) {
-  TORCH_CHECK(false, "cublas ger is defined only for float and double");
-  return {};
-}
-template<>
-cublasStatus_t cublasGer<float>(const cublasHandle_t &handle, int64_t m, int64_t n, float *alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda) {
-  return cublasSger(handle, m, n, alpha, x, incx, y, incy, a, lda);
-}
-template<>
-cublasStatus_t cublasGer<double>(const cublasHandle_t &handle, int64_t m, int64_t n, double *alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda) {
-  return cublasDger(handle, m, n, alpha, x, incx, y, incy, a, lda);
-}
-} // anonymous namespace
-
-template<typename scalar_t>
-void ger(int64_t m, int64_t n, scalar_t alpha, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy, scalar_t *a, int64_t lda)
-{
-  _cublasAdjustLdLevel2(m, n, &lda);
-  TORCH_CHECK((m <= INT_MAX) &&
-              (n <= INT_MAX) &&
-              (lda <= INT_MAX) &&
-              (incx <= INT_MAX) &&
-              (incy <= INT_MAX),
-              "cublasSger/cublasDger only supports m, n, lda, incx, incy with "
-              "the bound [val] <= %d", INT_MAX);
-  int i_m = (int)m;
-  int i_n = (int)n;
-  int i_lda = (int)lda;
-  int i_incx = (int)incx;
-  int i_incy = (int)incy;
-
-  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
-  TORCH_CUDABLAS_CHECK(cublasGer<scalar_t>(
-    handle, i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda));
-}
-template void ger<float>(int64_t m, int64_t n, float alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda);
-template void ger<double>(int64_t m, int64_t n, double alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda);
-
 /* LEVEL 1 BLAS FUNCTIONS */

 template <>

aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ namespace at { namespace cuda {
   _(nvrtcGetProgramLog) \
   _(nvrtcGetLoweredName) \
   _(cuModuleLoadData) \
+  _(cuModuleLoadDataEx) \
   _(cuModuleGetFunction) \
   _(cuOccupancyMaxActiveBlocksPerMultiprocessor) \
   _(cuGetErrorString) \

aten/src/ATen/native/LinearAlgebra.cpp

Lines changed: 37 additions & 26 deletions
@@ -143,50 +143,61 @@ static void check_1d(const Tensor& t, const char* arg, const char* fn) {
 }

 Tensor addr(const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) {
-  check_1d(vec1, "vec1", "addr");
-  check_1d(vec2, "vec2", "addr");
-  Tensor b_self;
-  std::tie(b_self) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr");
-  return at::_addr(b_self, vec1, vec2, beta, alpha);
+  TORCH_WARN(
+    "torch.addr is deprecated and may be removed in a future PyTorch release. "
+    "This function can be implemented using torch.outer as "
+    "alpha * torch.outer(vec1, vec2) + beta * input when beta is not zero, "
+    "alpha * torch.outer(vec1, vec2) when beta is zero.");
+
+  Tensor outer_result = at::outer(vec1, vec2) * alpha;
+  if (beta.to<double>() == 0.0) {
+    return outer_result;
+  }
+  return outer_result + (self * beta);
 }

 Tensor& addr_(Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) {
-  check_1d(vec1, "vec1", "addr");
-  check_1d(vec2, "vec2", "addr");
-  return at::_addr_(self, vec1, vec2, beta, alpha);
+  return at::addr_out(self, self, vec1, vec2, beta, alpha);
 }

 Tensor& addr_out(Tensor &result, const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) {
-  check_1d(vec1, "vec1", "addr");
-  check_1d(vec2, "vec2", "addr");
-  Tensor b_self;
-  std::tie(b_self) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr_out");
-  return at::_addr_out(result, b_self, vec1, vec2, beta, alpha);
+  auto addr_result = at::addr(self, vec1, vec2, beta, alpha);
+  // Validates safe casting
+  const auto result_dtype = addr_result.scalar_type();
+  TORCH_CHECK(canCast(result_dtype, result.scalar_type()),
+              "result type ", result_dtype,
+              " can't be cast to the desired output type ", result.scalar_type());
+
+  at::native::resize_output(result, addr_result.sizes().vec());
+  result.copy_(addr_result);
+  return result;
 }

+// torch.ger, alias for torch.outer
 Tensor& ger_out(Tensor &result, const Tensor& self, const Tensor& vec2) {
-  check_1d(self, "self", "ger");
-  check_1d(vec2, "vec2", "ger");
-  if (result.dim() != 2 || result.size(0) != self.size(0) || result.size(1) != vec2.size(0)) {
-    result.resize_({ self.size(0), vec2.size(0) });
-  }
-  // resize_ does the "broadcasting", don't need to broadcast again.
-  return at::_addr_out(result, result, self, vec2, Scalar(0), Scalar(1));
+  TORCH_WARN("torch.ger is deprecated and will be removed in a future PyTorch release. "
+             "Use torch.outer instead.");
+  return at::outer_out(result, self, vec2);
 }

 Tensor ger(const Tensor& self, const Tensor& vec2) {
-  Tensor result = at::empty({0}, self.options());
-  at::ger_out(result, self, vec2);
-  return result;
+  return self.outer(vec2);
 }

-// torch.outer, alias for torch.ger
 Tensor& outer_out(Tensor &result, const Tensor& self, const Tensor& vec2) {
-  return at::ger_out(result, self, vec2);
+  check_1d(self, "self", "outer");
+  check_1d(vec2, "vec2", "outer");
+
+  // torch.outer is implemented as a composite op using reshape and mul
+  at::mul_out(result, self.reshape({self.size(0), 1}), vec2);
+  return result;
 }

 Tensor outer(const Tensor& self, const Tensor& vec2) {
-  return self.ger(vec2);
+  check_1d(self, "self", "outer");
+  check_1d(vec2, "vec2", "outer");
+
+  return self.reshape({self.size(0), 1}) * vec2;
 }

 static void addmm_impl_cpu_(
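The rewritten addr above is a composite of torch.outer, and ger becomes a deprecated alias for outer. A small Python sketch of the relationships the new implementation is meant to preserve (tensor values here are illustrative, and torch.ger additionally emits a deprecation warning):

```python
import torch

vec1 = torch.tensor([1.0, 2.0, 3.0])
vec2 = torch.tensor([10.0, 20.0])
inp = torch.ones(3, 2)
alpha, beta = 2.0, 0.5

# addr now computes alpha * outer(vec1, vec2) + beta * input
# (the beta * input term is skipped entirely when beta is zero).
expected = alpha * torch.outer(vec1, vec2) + beta * inp
actual = torch.addr(inp, vec1, vec2, beta=beta, alpha=alpha)
print(torch.allclose(actual, expected))  # True

# outer itself reduces to a broadcasted multiply of a column against a row.
print(torch.equal(torch.outer(vec1, vec2), vec1.reshape(-1, 1) * vec2))  # True

# ger forwards to outer (and warns that it is deprecated).
print(torch.equal(torch.ger(vec1, vec2), torch.outer(vec1, vec2)))  # True
```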

aten/src/ATen/native/cuda/LinearAlgebra.cu

Lines changed: 0 additions & 114 deletions
@@ -178,120 +178,6 @@ Tensor& addmm__cuda(Tensor& self, const Tensor& mat1, const Tensor& mat2,
   return self;
 }

-template<typename scalar_t>
-void addr_impl_ger_cuda(Tensor &out, const Tensor &self,
-                        const Tensor& vec1, const Tensor& vec2,
-                        scalar_t alpha, scalar_t beta) {
-  static_assert(std::is_same<scalar_t, float>::value ||
-                std::is_same<scalar_t, double>::value,
-                "addr_impl_ger_cuda: only float and double are supported");
-  if (&out != &self) {
-    at::native::resize_as_(out, self);
-    at::native::copy_(out, self);
-  }
-  if (beta == 0.0) {
-    at::native::zero_(out);
-  }
-  if (beta != 1.0) {
-    at::native::mul_(out, beta);
-  }
-  if (out.stride(0) == 1) {
-    at::cuda::blas::ger<scalar_t>(
-      vec1.size(0), vec2.size(0), alpha,
-      vec1.data_ptr<scalar_t>(), vec1.stride(0),
-      vec2.data_ptr<scalar_t>(), vec2.stride(0),
-      out.data_ptr<scalar_t>(), out.stride(1)
-    );
-  } else if (out.stride(1) == 1) {
-    at::cuda::blas::ger<scalar_t>(
-      vec2.size(0), vec1.size(0), alpha,
-      vec2.data_ptr<scalar_t>(), vec2.stride(0),
-      vec1.data_ptr<scalar_t>(), vec1.stride(0),
-      out.data_ptr<scalar_t>(), out.stride(0)
-    );
-  } else {
-    Tensor cr = out.clone();
-    at::cuda::blas::ger<scalar_t>(
-      vec2.size(0), vec1.size(0), alpha,
-      vec2.data_ptr<scalar_t>(), vec2.stride(0),
-      vec1.data_ptr<scalar_t>(), vec1.stride(0),
-      out.data_ptr<scalar_t>(), out.stride(0)
-    );
-    out.set_(cr);
-  }
-}
-
-template<typename scalar_t>
-void addr_impl_cuda(Tensor &out, const Tensor &self,
-                    const Tensor& vec1, const Tensor& vec2,
-                    scalar_t alpha, scalar_t beta) {
-  // currently no Hger/SgerEx in Cublas.
-  Tensor vec2T = vec2.reshape({1, vec2.size(0)});
-  Tensor vec1M = vec1.reshape({vec1.size(0), 1});
-  addmm_out_cuda(out, self, vec1M, vec2T, beta, alpha);
-}
-template<>
-void addr_impl_cuda<float>(Tensor &out, const Tensor &self,
-                           const Tensor& vec1, const Tensor& vec2,
-                           float alpha, float beta) {
-  addr_impl_ger_cuda<float>(out, self, vec1, vec2, alpha, beta);
-}
-template<>
-void addr_impl_cuda<double>(Tensor &out, const Tensor &self,
-                            const Tensor& vec1, const Tensor& vec2,
-                            double alpha, double beta) {
-  addr_impl_ger_cuda<double>(out, self, vec1, vec2, alpha, beta);
-}
-
-Tensor& addr_out_cuda(Tensor &out, const Tensor& self,
-                      const Tensor& vec1, const Tensor& vec2,
-                      Scalar beta, Scalar alpha) {
-  TORCH_CHECK(vec1.dim() == 1 && vec2.dim() == 1,
-              "vec1 and vec2 should be 1-dimensional vectors. Got dimensions ",
-              vec1.dim(), " and ", vec2.dim());
-
-  Tensor self_;
-  if (&out != &self) {
-    std::tie(self_) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr");
-  } else {
-    self_ = self;
-  }
-
-  TORCH_CHECK(out.device() == self_.device() &&
-              out.device() == vec1.device() &&
-              out.device() == vec2.device(),
-              "Expected all tensors to be on the same device. Found: ",
-              out.device(), ", ", self_.device(), ", ",
-              vec1.device(), " and ", vec2.device());
-  TORCH_CHECK(self_.dim() == 2,
-              "2D tensor expected, got ", self_.dim(), "D tensor for input");
-  TORCH_CHECK(self_.size(0) == vec1.size(0) && self_.size(1) == vec2.size(0),
-              "size mismatch",
-              ", input: ", self_.sizes(),
-              ", v1: ", vec1.sizes(),
-              ", v2: ", vec2.sizes());
-  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, self_.scalar_type(), "addr_out_cuda", [&] {
-    addr_impl_cuda<scalar_t>(out, self_, vec1, vec2,
-                             alpha.to<scalar_t>(), beta.to<scalar_t>());
-  });
-  return out;
-}
-
-Tensor& addr__cuda(Tensor& self,
-                   const Tensor& vec1, const Tensor& vec2,
-                   Scalar beta, Scalar alpha) {
-  addr_out_cuda(self, self, vec1, vec2, beta, alpha);
-  return self;
-}
-
-Tensor addr_cuda(const Tensor& self,
-                 const Tensor& vec1, const Tensor& vec2,
-                 Scalar beta, Scalar alpha) {
-  Tensor out = at::empty({0}, self.options());
-  addr_out_cuda(out, self, vec1, vec2, beta, alpha);
-  return out;
-}
-
 Tensor& addbmm_out_cuda(Tensor& out, const Tensor& self,
                         const Tensor& batch1, const Tensor& batch2,
                         Scalar beta, Scalar alpha) {

0 commit comments