39 commits
0c6f207  linalg.cuda_prefer_cusolver flag py/c++ bindings (xwang233, Nov 8, 2021)
e135ef1  global flag override heuristics (xwang233, Nov 8, 2021)
400771a  doc and warning (xwang233, Nov 8, 2021)
abbf65d  format (xwang233, Nov 8, 2021)
fdf9705  Merge remote-tracking branch 'upstream/master' into linalg-runtime-sw… (xwang233, Nov 10, 2021)
b9465a5  LinalgBackend py/cpp bindings (xwang233, Nov 11, 2021)
185d534  remove cuda_prefer_cusolver (xwang233, Nov 11, 2021)
fecbf34  heuristic and preferred_backend (xwang233, Nov 11, 2021)
9071e33  Merge remote-tracking branch 'upstream/master' into linalg-runtime-sw… (xwang233, Nov 11, 2021)
c8af8a8  doc (xwang233, Nov 11, 2021)
4937c18  lint (xwang233, Nov 11, 2021)
1e48f00  mypy (xwang233, Nov 11, 2021)
03aef5e  flake8 (xwang233, Nov 11, 2021)
eac8a5a  warning (xwang233, Nov 11, 2021)
2a5a233  clang format (xwang233, Nov 11, 2021)
badba2d  typo (xwang233, Nov 11, 2021)
c9344b8  add a test (xwang233, Nov 11, 2021)
96b8896  skipCUDAIfRocm (xwang233, Nov 11, 2021)
88ff857  override and binding (xwang233, Nov 11, 2021)
7334ace  doc coverage fix (xwang233, Nov 11, 2021)
210444f  Merge remote-tracking branch 'upstream/master' into linalg-runtime-sw… (xwang233, Nov 11, 2021)
1e8e0f8  remove rocm skip (xwang233, Nov 11, 2021)
569a6b1  doc and warning message updates (xwang233, Nov 15, 2021)
6ed5cdc  Merge remote-tracking branch 'upstream/master' into linalg-runtime-sw… (xwang233, Nov 15, 2021)
6fc8349  Merge remote-tracking branch 'upstream/master' into linalg-runtime-sw… (xwang233, Nov 17, 2021)
9d06197  move c10/core/LinalgBackend.h to ATen (xwang233, Nov 17, 2021)
6b432f7  remove jit changes (xwang233, Nov 17, 2021)
4eaf203  undef linalg macro (xwang233, Nov 17, 2021)
4faf9f0  revert python_arg_parser changes (xwang233, Nov 17, 2021)
4114dd2  change py bindings to torch.backends.cuda.linalg.preferred_library (xwang233, Nov 18, 2021)
e3b532e  Merge remote-tracking branch 'upstream/master' into linalg-runtime-sw… (xwang233, Nov 18, 2021)
c557133  use pybind11 for py/c++ bindings (xwang233, Nov 19, 2021)
00c7f0f  Merge remote-tracking branch 'upstream/master' into linalg-runtime-sw… (xwang233, Nov 19, 2021)
a372948  lint (xwang233, Nov 19, 2021)
6310698  mypy (xwang233, Nov 19, 2021)
7446e49  wrap test with a try-finally (xwang233, Nov 22, 2021)
d6d29dc  Merge remote-tracking branch 'upstream/master' into linalg-runtime-sw… (xwang233, Nov 22, 2021)
70ebdbe  Merge remote-tracking branch 'upstream/master' into linalg-runtime-sw… (xwang233, Dec 2, 2021)
3a81c30  change warning message (xwang233, Dec 2, 2021)
15 changes: 15 additions & 0 deletions aten/src/ATen/Context.cpp
@@ -147,6 +147,21 @@ void Context::setAllowTF32CuBLAS(bool b) {
allow_tf32_cublas = b;
}

at::LinalgBackend Context::linalgPreferredBackend() const {
return linalg_preferred_backend;
}

void Context::setLinalgPreferredBackend(at::LinalgBackend b) {
linalg_preferred_backend = b;
if (b != at::LinalgBackend::Default) {
TORCH_WARN_ONCE(
"torch.backends.cuda.preferred_linalg_library is an experimental feature. "
"If you see any error or unexpected behavior when this flag is set "
"please file an issue on GitHub."
);
}
}

bool Context::allowFP16ReductionCuBLAS() const {
return allow_fp16_reduction_cublas;
}
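For reference, a minimal Python-level sketch of how this setter is expected to surface; the function name torch.backends.cuda.preferred_linalg_library comes from the docs change at the end of this PR, and the exact warning plumbing is an assumption:

import warnings

import torch

# Sketch only: assumes a CUDA build of PyTorch with this PR's binding.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # The first non-default assignment should trigger the TORCH_WARN_ONCE above.
    torch.backends.cuda.preferred_linalg_library("cusolver")
    # TORCH_WARN_ONCE fires once per process, so this should stay quiet.
    torch.backends.cuda.preferred_linalg_library("magma")

print(len(caught))  # expected: 1 experimental-feature warning
torch.backends.cuda.preferred_linalg_library("default")  # restore the heuristic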
5 changes: 5 additions & 0 deletions aten/src/ATen/Context.h
@@ -3,6 +3,7 @@
#include <ATen/core/ATenGeneral.h>
#include <ATen/core/Generator.h>
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/LinalgBackend.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/core/DeprecatedTypeProperties.h>
#include <ATen/detail/CUDAHooksInterface.h>
@@ -128,6 +129,9 @@ class TORCH_API Context {
bool deterministicCuDNN() const;
void setDeterministicCuDNN(bool);

at::LinalgBackend linalgPreferredBackend() const;
void setLinalgPreferredBackend(at::LinalgBackend);

// Note [Enabling Deterministic Operations]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Operations in PyTorch that normally act nondeterministically, but have an alternate
@@ -249,6 +253,7 @@ class TORCH_API Context {
bool allow_tf32_cublas = true;
bool allow_fp16_reduction_cublas = true;
bool enabled_mkldnn = true;
at::LinalgBackend linalg_preferred_backend = at::LinalgBackend::Default;
#ifdef C10_MOBILE
bool release_original_weights = true;
#else
31 changes: 31 additions & 0 deletions aten/src/ATen/LinalgBackend.h
@@ -0,0 +1,31 @@
#pragma once

#include <c10/util/Exception.h>

#include <ostream>
#include <string>

namespace at {

enum class LinalgBackend : int8_t { Default, Cusolver, Magma };

inline std::string LinalgBackendToString(at::LinalgBackend backend) {
switch (backend) {
case LinalgBackend::Default:
return "at::LinalgBackend::Default";
case LinalgBackend::Cusolver:
return "at::LinalgBackend::Cusolver";
case LinalgBackend::Magma:
return "at::LinalgBackend::Magma";
default:
TORCH_CHECK(false, "Unknown linalg backend");
}
}

inline std::ostream& operator<<(
std::ostream& stream,
at::LinalgBackend backend) {
return stream << LinalgBackendToString(backend);
}

} // namespace at
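At the Python layer these three enum values are addressed by name; a hedged sketch of round-tripping them through the binding added later in this PR (the lowercase names and the repr of the returned enum are assumptions):

import torch

# Sketch: assumes the pybind11 binding accepts the names below and that
# the zero-argument call returns the currently selected enum value.
for name in ("cusolver", "magma", "default"):
    torch.backends.cuda.preferred_linalg_library(name)
    print(name, "->", torch.backends.cuda.preferred_linalg_library())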
182 changes: 140 additions & 42 deletions aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp
@@ -1471,10 +1471,18 @@ Tensor _inverse_helper_cuda_legacy(const Tensor& self) {

Tensor _inverse_helper_cuda(const Tensor& self) {
#ifdef USE_CUSOLVER
if ((self.dim() == 2) || (/* self.dim() > 2 && */ batchCount(self) <= 2) || !use_magma_) {
return _inverse_helper_cuda_lib(self); // cusolver or cublas
} else {
return _inverse_helper_cuda_legacy(self); // magma-cuda
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Cusolver:
return _inverse_helper_cuda_lib(self); // cusolver or cublas
case at::LinalgBackend::Magma:
return _inverse_helper_cuda_legacy(self); // magma-cuda
default:
if (batchCount(self) <= 2 || !use_magma_) {
return _inverse_helper_cuda_lib(self); // cusolver or cublas
} else {
return _inverse_helper_cuda_legacy(self); // magma-cuda
}
}
#else
return _inverse_helper_cuda_legacy(self); // magma-cuda
@@ -1503,10 +1511,18 @@ Tensor& _linalg_inv_out_helper_cuda(Tensor &result, Tensor& infos_lu, Tensor& in
// This function calculates the inverse matrix in-place
// result should be in column major order and contain matrices to invert
#ifdef USE_CUSOLVER
if ((result.dim() == 2) || (/* result.dim() > 2 && */ batchCount(result) <= 2) || !use_magma_) {
return _linalg_inv_out_helper_cuda_lib(result, infos_lu, infos_getri); // cusolver or cublas
} else {
return _linalg_inv_out_helper_cuda_legacy(result, infos_lu, infos_getri); // magma-cuda
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Cusolver:
return _linalg_inv_out_helper_cuda_lib(result, infos_lu, infos_getri); // cusolver or cublas
case at::LinalgBackend::Magma:
return _linalg_inv_out_helper_cuda_legacy(result, infos_lu, infos_getri); // magma-cuda
default:
if (batchCount(result) <= 2 || !use_magma_) {
return _linalg_inv_out_helper_cuda_lib(result, infos_lu, infos_getri); // cusolver or cublas
} else {
return _linalg_inv_out_helper_cuda_legacy(result, infos_lu, infos_getri); // magma-cuda
}
}
#else
return _linalg_inv_out_helper_cuda_legacy(result, infos_lu, infos_getri); // magma-cuda
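The practical effect of the two new cases above can be cross-checked from Python; a sketch, assuming a CUDA build with both cuSOLVER and MAGMA compiled in:

import torch

# Well-conditioned batched input so both libraries should succeed.
a = torch.randn(4, 64, 64, device="cuda")
a = a @ a.transpose(-2, -1) + 64 * torch.eye(64, device="cuda")

results = {}
for backend in ("cusolver", "magma"):
    torch.backends.cuda.preferred_linalg_library(backend)
    results[backend] = torch.linalg.inv(a)
torch.backends.cuda.preferred_linalg_library("default")

# The two libraries should agree up to numerical tolerance.
assert torch.allclose(results["cusolver"], results["magma"], atol=1e-4)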
@@ -1600,10 +1616,18 @@ Tensor _cholesky_solve_helper_cuda_magma(const Tensor& self, const Tensor& A, bo
// Batched cholesky_solve is dispatched to magma.
Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upper) {
#ifdef USE_CUSOLVER
if (batchCount(self) == 1 || !use_magma_) {
return _cholesky_solve_helper_cuda_cusolver(self, A, upper);
} else {
return _cholesky_solve_helper_cuda_magma(self, A, upper);
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Cusolver:
return _cholesky_solve_helper_cuda_cusolver(self, A, upper);
case at::LinalgBackend::Magma:
return _cholesky_solve_helper_cuda_magma(self, A, upper);
default:
if (batchCount(self) == 1 || !use_magma_) {
return _cholesky_solve_helper_cuda_cusolver(self, A, upper);
} else {
return _cholesky_solve_helper_cuda_magma(self, A, upper);
}
}
#else
return _cholesky_solve_helper_cuda_magma(self, A, upper);
@@ -1706,10 +1730,20 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)

static void cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) {
#ifdef USE_CUSOLVER
if (batchCount(input) == 1 || !use_magma_ || use_cusolver_potrf_batched_) {
cholesky_helper_cusolver(input, upper, info);
} else {
cholesky_helper_magma(input, upper, info);
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Cusolver:
cholesky_helper_cusolver(input, upper, info);
break;
case at::LinalgBackend::Magma:
cholesky_helper_magma(input, upper, info);
break;
default:
if (batchCount(input) == 1 || !use_magma_ || use_cusolver_potrf_batched_) {
cholesky_helper_cusolver(input, upper, info);
} else {
cholesky_helper_magma(input, upper, info);
}
}
#else
cholesky_helper_magma(input, upper, info);
@@ -1777,10 +1811,19 @@ Tensor& cholesky_inverse_kernel_impl(Tensor &result, Tensor& infos, bool upper)
// result should be in column major order and contain matrices to invert
// the content of result is overwritten by 'apply_cholesky_inverse'
#ifdef USE_CUSOLVER
if (batchCount(result) == 1 || !use_magma_) {
return cholesky_inverse_kernel_impl_cusolver(result, infos, upper);
} else {
return cholesky_inverse_kernel_impl_magma(result, infos, upper);
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Cusolver:
return cholesky_inverse_kernel_impl_cusolver(result, infos, upper);
case at::LinalgBackend::Magma:
return cholesky_inverse_kernel_impl_magma(result, infos, upper);
default:
if (batchCount(result) == 1 ||
!use_magma_) {
return cholesky_inverse_kernel_impl_cusolver(result, infos, upper);
} else {
return cholesky_inverse_kernel_impl_magma(result, infos, upper);
}
}
#else
return cholesky_inverse_kernel_impl_magma(result, infos, upper);
@@ -1944,20 +1987,39 @@ static void lu_batched_magma(const Tensor& input, const Tensor& pivots, const Te
static void apply_lu(const Tensor& input, const Tensor& pivots, const Tensor& infos, bool compute_pivots) {
int64_t batch_size = batchCount(input);
#ifdef USE_CUSOLVER
// Use a heuristic to determine that cusolver is faster than MAGMA for the following sizes.
auto m = input.size(-2);
// exclude complex128 since nan_to_num_ does not work with it.
if ((batch_size == 1 || (batch_size <= 8 && m <= 16) || !use_magma_ ) && !input.is_complex()) {
lu_looped_cusolver(input, pivots, infos, compute_pivots);
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Cusolver:
lu_looped_cusolver(input, pivots, infos, compute_pivots);
break;
case at::LinalgBackend::Magma:
if (batch_size == 1) {
lu_looped_magma(input, pivots, infos, compute_pivots);
} else {
lu_batched_magma(input, pivots, infos, compute_pivots);
}
break;
default:
// Use a heuristic to determine that cusolver is faster than MAGMA for the following sizes.
auto m = input.size(-2);
// exclude complex128 since nan_to_num_ does not work with it.
if ((batch_size == 1 ||
(batch_size <= 8 && m <= 16) ||
!use_magma_)
&& !input.is_complex()) {
lu_looped_cusolver(input, pivots, infos, compute_pivots);
} else {
lu_batched_magma(input, pivots, infos, compute_pivots);
}
}
#else
if (batch_size == 1) {
lu_looped_magma(input, pivots, infos, compute_pivots);
} else {
lu_batched_magma(input, pivots, infos, compute_pivots);
}
#endif // USE_CUSOLVER
}

REGISTER_CUDA_DISPATCH(lu_stub, &apply_lu);
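The default branch keeps the size heuristic (cuSOLVER for batch_size <= 8 with m <= 16, batched MAGMA otherwise); a rough sketch for probing that crossover on a given device via torch.lu, which dispatches through lu_stub above (timings and the exact crossover are device-dependent):

import time

import torch

def avg_lu_time(x, backend, iters=50):
    # Force one library, then time LU factorization with synchronized walls.
    torch.backends.cuda.preferred_linalg_library(backend)
    torch.lu(x)  # warmup / lazy library init
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        torch.lu(x)
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters

small = torch.randn(8, 16, 16, device="cuda")    # inside the cuSOLVER region
large = torch.randn(256, 64, 64, device="cuda")  # default prefers batched MAGMA

for x in (small, large):
    print(tuple(x.shape), {b: avg_lu_time(x, b) for b in ("cusolver", "magma")})
torch.backends.cuda.preferred_linalg_library("default")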
@@ -2064,12 +2126,12 @@ Tensor& orgqr_kernel_impl(Tensor& result, const Tensor& tau) {
// See discussions in https://github.com/pytorch/pytorch/pull/51348 for comparison of cuSOLVER-MAGMA
// and Windows failure.
// For reference here is the MAGMA-based implementation: https://gist.github.com/IvanYashchuk/2db50002c9d3c1462ff769e6410ad983
#if defined(USE_CUSOLVER)
return orgqr_helper_cusolver(result, tau); // cusolver
#else
TORCH_CHECK(false, "Calling torch.orgqr on a CUDA tensor requires compiling ",
"PyTorch with cuSOLVER. Please use PyTorch built with cuSOLVER support.");
#endif
}

REGISTER_CUDA_DISPATCH(orgqr_stub, &orgqr_kernel_impl);
@@ -2136,7 +2198,14 @@ void geqrf_magma(const Tensor& input, const Tensor& tau) {
// This is a backend library dispatching helper function for calling looped batch implementation
void geqrf_looped(const Tensor& input, const Tensor& tau) {
#if defined(USE_CUSOLVER)
return geqrf_cusolver(input, tau);
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Magma:
return geqrf_magma(input, tau);
case at::LinalgBackend::Cusolver:
default:
return geqrf_cusolver(input, tau);
}
#else
return geqrf_magma(input, tau);
#endif
@@ -2273,9 +2342,16 @@ std::tuple<Tensor, Tensor> linalg_qr_helper_magma(const Tensor& self, c10::strin

std::tuple<Tensor, Tensor> _linalg_qr_helper_cuda(const Tensor& input, c10::string_view mode) {
#if defined(USE_CUSOLVER)
// _linalg_qr_helper_default is a generic function that is implemented using
// geqrf_stub and orgqr_stub. It dispatches to cuSOLVER for CUDA inputs if USE_CUSOLVER is defined
return _linalg_qr_helper_default(input, mode);
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Magma:
return linalg_qr_helper_magma(input, mode);
case at::LinalgBackend::Cusolver:
default:
// _linalg_qr_helper_default is a generic function that is implemented using
// geqrf_stub and orgqr_stub. It dispatches to cuSOLVER for CUDA inputs if USE_CUSOLVER is defined
return _linalg_qr_helper_default(input, mode);
}
#else
return linalg_qr_helper_magma(input, mode);
#endif
@@ -2432,7 +2508,15 @@ void linalg_eigh_magma(const Tensor& eigenvalues, const Tensor& eigenvectors, co

void linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, const Tensor& infos, bool upper, bool compute_eigenvectors) {
#if defined(USE_CUSOLVER)
linalg_eigh_cusolver(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors);
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Magma:
linalg_eigh_magma(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors);
break;
case at::LinalgBackend::Cusolver:
default:
linalg_eigh_cusolver(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors);
}
#else
linalg_eigh_magma(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors);
#endif
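The PR's test wraps backend switches in a try-finally (see the "wrap test with a try-finally" commit); the same pattern in miniature, sketched here for eigh, assuming the getter's return value is accepted back by the setter:

import torch

a = torch.randn(32, 32, device="cuda")
a = a + a.transpose(-2, -1)  # symmetric input, as eigh requires

prev = torch.backends.cuda.preferred_linalg_library()
try:
    torch.backends.cuda.preferred_linalg_library("magma")
    w_magma, _ = torch.linalg.eigh(a)
    torch.backends.cuda.preferred_linalg_library("cusolver")
    w_cusolver, _ = torch.linalg.eigh(a)
    # Eigenvalues should agree across libraries; eigenvectors only up to sign.
    assert torch.allclose(w_magma, w_cusolver, atol=1e-5)
finally:
    # Restore unconditionally so a failure cannot leak state into later tests.
    torch.backends.cuda.preferred_linalg_library(prev)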
@@ -2731,7 +2815,14 @@ std::tuple<Tensor, Tensor, Tensor> _svd_helper_cuda_legacy(const Tensor& self, b

std::tuple<Tensor, Tensor, Tensor> _svd_helper_cuda(const Tensor& self, bool some, bool compute_uv) {
#ifdef USE_CUSOLVER
return _svd_helper_cuda_lib(self, some, compute_uv);
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Magma:
return _svd_helper_cuda_legacy(self, some, compute_uv);
case at::LinalgBackend::Cusolver:
default:
return _svd_helper_cuda_lib(self, some, compute_uv);
}
#else
return _svd_helper_cuda_legacy(self, some, compute_uv);
#endif
@@ -3046,10 +3137,17 @@ void linalg_lstsq_gels(const Tensor& A, const Tensor& B, const Tensor& /*infos*/

void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) {
#if defined(USE_CUSOLVER)
// linalg_lstsq_gels is a generic function that is implemented using
// geqrf_stub, ormqr_stub, and triangular_solve_stub
// It dispatches to cuSOLVER for CUDA inputs if USE_CUSOLVER is defined
return linalg_lstsq_gels(a, b, infos);
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Magma:
return gels_magma(a, b, infos);
case at::LinalgBackend::Cusolver:
default:
// linalg_lstsq_gels is a generic function that is implemented using
// geqrf_stub, ormqr_stub, and triangular_solve_stub
// It dispatches to cuSOLVER for CUDA inputs if USE_CUSOLVER is defined
return linalg_lstsq_gels(a, b, infos);
}
#else
return gels_magma(a, b, infos);
#endif
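Least squares is covered by the same override; a sketch, assuming torch.linalg.lstsq on CUDA routes through the gels path shown above for full-rank problems:

import torch

a = torch.randn(128, 16, device="cuda")
b = torch.randn(128, 4, device="cuda")

torch.backends.cuda.preferred_linalg_library("magma")
x_magma = torch.linalg.lstsq(a, b).solution
torch.backends.cuda.preferred_linalg_library("cusolver")
x_cusolver = torch.linalg.lstsq(a, b).solution
torch.backends.cuda.preferred_linalg_library("default")

# Both routes solve the same full-rank problem; solutions should match closely.
assert torch.allclose(x_magma, x_cusolver, atol=1e-4)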
2 changes: 2 additions & 0 deletions docs/source/backends.rst
@@ -45,6 +45,8 @@ torch.backends.cuda

Clears the cuFFT plan cache.

.. autofunction:: torch.backends.cuda.preferred_linalg_library


torch.backends.cudnn
^^^^^^^^^^^^^^^^^^^^