Commit 241f8dc

Revert "Remove outdated CUDA 11 conditions (#154313)"
This reverts commit 3936e61. #154313 was reverted on behalf of https://github.com/izaitsevfb because it breaks internal builds (see the comment on #154313).
1 parent 6be8295 · commit 241f8dc

20 files changed, +76 −46 lines

aten/src/ATen/cuda/CUDADataType.h

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type)
      return CUDA_R_64I;
    case c10::ScalarType::BFloat16:
      return CUDA_R_16BF;
-#if defined(CUDA_VERSION) || (defined(USE_ROCM) && ROCM_VERSION >= 60300)
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11080) || (defined(USE_ROCM) && ROCM_VERSION >= 60300)
    case c10::ScalarType::Float8_e4m3fn:
      return CUDA_R_8F_E4M3;
    case c10::ScalarType::Float8_e5m2:

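The reverted change had relied on defined(CUDA_VERSION) alone, which holds for every CUDA toolkit; the restored guard brings back an explicit version floor. A minimal sketch of the pattern being restored (assuming the usual encoding CUDA_VERSION = major * 1000 + minor * 10, so CUDA 11.8 becomes 11080):

#include <cuda.h>  // toolkit header that defines CUDA_VERSION

#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
// FP8 enumerators such as CUDA_R_8F_E4M3 only exist in the CUDA headers
// from toolkit 11.8 onward, so the Float8 switch cases may be compiled.
#else
// On older toolkits the Float8 cases must be excluded entirely,
// otherwise the translation unit fails to compile.
#endif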
aten/src/ATen/cuda/CUDAGraph.cpp

Lines changed: 3 additions & 3 deletions
@@ -139,7 +139,7 @@ void CUDAGraph::capture_end() {
  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597
  // cudaGraphInstantiateWithFlags
  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233
-#if (defined(CUDA_VERSION) || (defined(USE_ROCM) && ROCM_VERSION >= 60200))
+#if ((defined(CUDA_VERSION) && CUDA_VERSION >= 11040) || (defined(USE_ROCM) && ROCM_VERSION >= 60200))
  int version = 0;
  AT_CUDA_CHECK(cudaDriverGetVersion(&version));
  if (version < 11040) {

@@ -154,7 +154,7 @@ void CUDAGraph::capture_end() {
#endif
  //Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory.
  //It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch.
-#if (defined(CUDA_VERSION) || (defined(USE_ROCM) && ROCM_VERSION >= 60200))
+#if ((defined(CUDA_VERSION) && CUDA_VERSION >= 11040) || (defined(USE_ROCM) && ROCM_VERSION >= 60200))
  } else {
    AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
                                                graph_,

@@ -216,7 +216,7 @@ void CUDAGraph::enable_debug_mode() {
}

void CUDAGraph::debug_dump(const std::string& debug_path) {
-#if defined(CUDA_VERSION) || defined(USE_ROCM)
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11030)|| defined(USE_ROCM)
  if (_cuda_graphs_debug) {
    TORCH_WARN("DEBUG: calling debug_dump()");
    if (has_graph_) {

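The first restored guard in capture_end() is a compile-time check against the toolkit headers; the installed driver is still probed separately at runtime with cudaDriverGetVersion before taking the cudaGraphInstantiateWithFlags path. A minimal sketch of that two-level pattern (a hypothetical standalone helper, not the ATen code itself):

#include <cuda_runtime_api.h>

// True only when both the build-time toolkit and the installed driver are
// new enough for cudaGraphInstantiateWithFlags (introduced around CUDA 11.4).
bool graph_instantiate_flags_available() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11040
  int version = 0;
  // Compiling against new headers does not guarantee a new driver,
  // so query the driver version at runtime as well.
  if (cudaDriverGetVersion(&version) != cudaSuccess) {
    return false;
  }
  return version >= 11040;
#else
  return false;  // toolkit too old to even declare the API
#endif
}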
aten/src/ATen/cuda/Exceptions.h

Lines changed: 5 additions & 1 deletion
@@ -117,11 +117,15 @@ constexpr const char* _cusolver_backend_suggestion = \
    "linear algebra operators with other supported backends. " \
    "See https://pytorch.org/docs/stable/backends.html#torch.backends.cuda.preferred_linalg_library";

+// When cuda < 11.5, cusolver raises CUSOLVER_STATUS_EXECUTION_FAILED when input contains nan.
// When cuda >= 11.5, cusolver normally finishes execution and sets info array indicating convergence issue.
#define TORCH_CUSOLVER_CHECK(EXPR) \
  do { \
    cusolverStatus_t __err = EXPR; \
-    if (__err == CUSOLVER_STATUS_INVALID_VALUE) { \
+    if ((CUDA_VERSION < 11500 && \
+         __err == CUSOLVER_STATUS_EXECUTION_FAILED) || \
+        (CUDA_VERSION >= 11500 && \
+         __err == CUSOLVER_STATUS_INVALID_VALUE)) { \
      TORCH_CHECK_LINALG( \
        false, \
        "cusolver error: ", \

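In use, the macro wraps any expression returning a cusolverStatus_t; the restored branch only changes which status code is treated as the nan-related failure described in the comments above. An illustrative call site (not taken from this diff):

#include <cusolverDn.h>

cusolverDnHandle_t handle = nullptr;
// Any cuSOLVER call can be wrapped. On CUDA < 11.5 a nan-induced failure in a
// later factorization surfaces as CUSOLVER_STATUS_EXECUTION_FAILED, on >= 11.5
// as CUSOLVER_STATUS_INVALID_VALUE; the macro turns the version-appropriate
// status into a TORCH_CHECK_LINALG error.
TORCH_CUSOLVER_CHECK(cusolverDnCreate(&handle));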
aten/src/ATen/cuda/cub.cuh

Lines changed: 1 addition & 1 deletion
@@ -291,7 +291,7 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
#endif
}

-# if defined(CUDA_VERSION) || defined(USE_ROCM)
+# if (defined(CUDA_VERSION) && CUDA_VERSION > 11040) || defined(USE_ROCM)

template<typename T>
struct BlockPrefixCallbackOp

aten/src/ATen/cuda/detail/LazyNVRTC.cpp

Lines changed: 2 additions & 0 deletions
@@ -146,8 +146,10 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *)
NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *)
NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *)
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *)
NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *)
+#endif
NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *)
_STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult)
NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*)

aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ namespace at::cuda {
  AT_FORALL_NVRTC_BASE(_)
#endif

-#if defined(CUDA_VERSION)
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
#define AT_FORALL_NVRTC(_) \
  AT_FORALL_NVRTC_EXTENDED(_) \
  _(nvrtcGetCUBINSize) \

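Both NVRTC edits fence the same pair of entry points: nvrtcGetCUBINSize and nvrtcGetCUBIN were only added in CUDA 11.1, so any direct use needs the same guard. A minimal sketch (hypothetical helper, assuming prog has already been compiled):

#include <cuda.h>
#include <nvrtc.h>
#include <vector>

// Fetch the compiled CUBIN when the toolkit supports it; callers fall back to PTX.
std::vector<char> get_cubin_or_empty(nvrtcProgram prog) {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
  size_t size = 0;
  if (nvrtcGetCUBINSize(prog, &size) != NVRTC_SUCCESS || size == 0) {
    return {};
  }
  std::vector<char> cubin(size);
  if (nvrtcGetCUBIN(prog, cubin.data()) != NVRTC_SUCCESS) {
    return {};
  }
  return cubin;
#else
  (void)prog;
  return {};  // pre-11.1 toolkits expose only PTX output
#endif
}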
aten/src/ATen/native/cuda/Blas.cpp

Lines changed: 12 additions & 3 deletions
@@ -359,7 +359,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;
  c10::MaybeOwned<Tensor> self_;
  if (&result != &self) {
-#if defined(CUDA_VERSION) || defined(USE_ROCM)
+#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM)
    // Strangely, if mat2 has only 1 row or column, we get
    // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
    // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]

@@ -495,6 +495,15 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  }
#else
  auto activation_epilogue = activation_to_gemm_and_blas_arg(activation);
+#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11080))
+  // GELU is not supported (and does not compile!) prior
+  // to CUDA 11.4. Have observed accuracy issues with
+  // GELU epilogue in 11.4; disabling the GELU epilogue
+  // path for CUDA version < 11.8.
+  if (activation == Activation::GELU)
+    activation_epilogue = cuda::blas::GEMMAndBiasActivationEpilogue::None;
+#endif
+
  bool okay = true;
  if (is_float_output_with_half_input) {
    AT_DISPATCH_REDUCED_FLOATING_TYPES(

@@ -637,7 +646,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  // gating activation_to_gemm_and_blas_arg above; here we are manually
  // performing a post-GELU because we weren't able to use the GELU
  // epilogue above.
-#if !defined(CUDA_VERSION) && !defined(USE_ROCM)
+#if !(defined(CUDA_VERSION) && CUDA_VERSION >= 11080) && !defined(USE_ROCM)
  if (useLtInterface && activation == Activation::GELU) {
    at::gelu_(const_cast<Tensor&>(*args.result), "tanh");
  }

@@ -1008,7 +1017,7 @@ Tensor& _int_mm_out_cuda(const Tensor& self, const Tensor& mat2, Tensor& result)

  TORCH_CHECK(result.is_contiguous(), "Expected result to be contiguous.");

-#if defined(CUDA_VERSION) || defined(USE_ROCM)
+#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11070)) || defined(USE_ROCM)
  cublasCommonArgs args(self, mat2, result);

  at::cuda::blas::int8_gemm(

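The third hunk restores the fallback used when the cuBLASLt GELU epilogue is unavailable: run the plain bias GEMM first, then apply tanh-approximated GELU as a separate kernel. Expressed at the tensor level, roughly (an illustrative sketch, not the ATen implementation):

#include <ATen/ATen.h>

// Fused epilogue unavailable: addmm first, then an explicit in-place GELU
// using the same "tanh" approximation the fallback in the diff applies.
at::Tensor gemm_then_gelu(const at::Tensor& bias,
                          const at::Tensor& mat1,
                          const at::Tensor& mat2) {
  at::Tensor out = at::addmm(bias, mat1, mat2);
  at::gelu_(out, "tanh");
  return out;
}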
aten/src/ATen/native/cuda/MixedDtypesLinear.cu

Lines changed: 4 additions & 4 deletions
@@ -2,7 +2,7 @@
#include <ATen/core/Tensor.h>
#include <ATen/cuda/CUDAUtils.h>

-#if defined(USE_ROCM) || defined(_MSC_VER)
+#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
// Doesn't work on ROCm or Windows yet
// TODO: Add compiler warning? Add PyTorch config flag?
#else

@@ -20,7 +20,7 @@
#include <ATen/native/cuda/cutlass_extensions/gemm/threadblock/default_mma.h>
#endif

-#if defined(USE_ROCM) || defined(_MSC_VER)
+#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
// Doesn't work on ROCm or Windows yet
#else
#define CUTLASS_STATUS_CHECK(status) \

@@ -32,7 +32,7 @@

namespace at::native {

-#if defined(USE_ROCM) || defined(_MSC_VER)
+#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
// Doesn't work on ROCm or Windows yet or old compiler
#else
template<typename ElementInputA, typename ElementInputB, typename EpilogueTag>

@@ -198,7 +198,7 @@ _mixed_dtypes_linear(const Tensor& input, const Tensor& weight,
                     const Tensor& scale,
                     const std::optional<Tensor>& bias_opt,
                     const std::optional<std::string_view> activation_opt) {
-#if defined(USE_ROCM) || defined(_MSC_VER)
+#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
  TORCH_CHECK(false, "_mixed_dtypes_linear: not compiled for this platform");
  return Tensor{};
#else

aten/src/ATen/native/cuda/Nonzero.cu

Lines changed: 1 addition & 1 deletion
@@ -300,7 +300,7 @@ void nonzero_static_cuda_out_impl(
    int64_t size,
    int64_t fill_value,
    Tensor& out) {
-#if defined(CUDA_VERSION) || defined(USE_ROCM)
+# if (defined(CUDA_VERSION) && CUDA_VERSION > 11040) || defined(USE_ROCM)

  Tensor self_contiguous_ = self.contiguous();
  // see comment in nonzero_cuda_out_impl on reqs for out

aten/src/ATen/native/cuda/PointwiseOpsKernel.cu

Lines changed: 15 additions & 6 deletions
@@ -17,7 +17,7 @@ void addcmul_cuda_scalar_tensor2_kernel(
    const Scalar& value
);

-#if AT_USE_JITERATOR()
+#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
constexpr char addcmul_name[] = "addcmul";
#endif
void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {

@@ -37,7 +37,10 @@ void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {

  auto dtype = iter.common_dtype();
  if (at::isComplexType(dtype)) {
-#if AT_USE_JITERATOR()
+    // When using Jiterator, addcmul and addcdiv kernels get stuck during a
+    // promotion test on CUDA 11.3, so only enable that from CUDA 11.5:
+    // https://github.com/pytorch/pytorch/pull/74234#issuecomment-1100932209
+#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
    AT_DISPATCH_COMPLEX_TYPES(dtype, "addcmul_cuda", [&]() {
      auto alpha = value.to<scalar_t>();
      static const auto addcmul_string = jiterator_stringify(

@@ -90,14 +93,17 @@ void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {
  }
}

-#if AT_USE_JITERATOR()
+#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
constexpr char addcmul_scalar_tensor2_name[] = "addcmul_scalar_tensor2";
#endif
void addcmul_cuda_scalar_tensor2_kernel(TensorIteratorBase& iter, const Scalar& scalar_tensor2, const Scalar& value) {
  auto dtype = iter.common_dtype();

  if (at::isComplexType(dtype)) {
-#if AT_USE_JITERATOR()
+    // When using Jiterator, addcmul and addcdiv kernels get stuck during a
+    // promotion test on CUDA 11.3, so only enable that from CUDA 11.5:
+    // https://github.com/pytorch/pytorch/pull/74234#issuecomment-1100932209
+#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
    AT_DISPATCH_COMPLEX_TYPES(dtype, "addcmul_cuda", [&]() {
      auto c = scalar_tensor2.to<scalar_t>();
      auto alpha = value.to<scalar_t>();

@@ -139,14 +145,17 @@ void addcmul_cuda_scalar_tensor2_kernel(TensorIteratorBase& iter, const Scalar&
  }
}

-#if AT_USE_JITERATOR()
+#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
// return a + alpha * (b / static_cast<accscalar_t>(c));
constexpr char addcdiv_name[] = "addcdiv";
#endif
void addcdiv_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {
  auto dtype = iter.common_dtype();
  if (at::isComplexType(dtype)) {
-#if AT_USE_JITERATOR()
+    // When using Jiterator, addcmul and addcdiv kernels get stuck during a
+    // promotion test on CUDA 11.3, so only enable that from CUDA 11.5:
+    // https://github.com/pytorch/pytorch/pull/74234#issuecomment-1100932209
+#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
    AT_DISPATCH_COMPLEX_TYPES(dtype, "addcdiv_cuda", [&]() {
      auto alpha = value.to<scalar_t>();
      static const auto addcdiv_string =
