
Commit 0c14c90

Update on "[Gloo] Support work-level timeouts in ProcessGroupGloo"
Add work-level timeouts to ProcessGroupGloo. This uses the timeout support in the `waitSend` and `waitRecv` functions of Gloo's `unbound_buffer` construct.

Differential Revision: [D22173763](https://our.internmc.facebook.com/intern/diff/D22173763/)

[ghstack-poisoned]
2 parents ff7f83f + 1177de2 commit 0c14c90
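For orientation, here is a minimal Python sketch of what a work-level timeout enables; it is an illustration, not code from this commit. It assumes two already-launched ranks with the usual `env://` rendezvous variables and that `Work.wait()` accepts the optional `timeout` argument this stack adds; the tensor shape and the 5-second value are arbitrary.

```python
# Hypothetical usage sketch (not part of this commit): exercising a per-work
# timeout on the Gloo backend. Assumes two ranks already launched with the
# usual env:// rendezvous variables, and that Work.wait() accepts the optional
# timeout added by this stack.
from datetime import timedelta

import torch
import torch.distributed as dist

dist.init_process_group(backend="gloo")

tensor = torch.ones(4)
if dist.get_rank() == 0:
    work = dist.isend(tensor, dst=1)   # returns a Work handle
else:
    work = dist.irecv(tensor, src=0)

# Wait on this specific work object. If the peer never posts the matching
# send/recv within 5 seconds, Gloo's unbound_buffer waitSend/waitRecv timeout
# fires and wait() raises instead of blocking forever.
work.wait(timeout=timedelta(seconds=5))

dist.destroy_process_group()
```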


56 files changed: 1,215 additions, 328 deletions

.jenkins/pytorch/test.sh

Lines changed: 1 addition & 1 deletion
@@ -150,7 +150,7 @@ test_python_nn() {
 }
 
 test_python_ge_config_profiling() {
-  time python test/run_test.py --include test_jit_cuda_fuser_profiling test_jit_profiling test_jit_fuser_te --verbose --determine-from="$DETERMINE_FROM"
+  time python test/run_test.py --include test_jit_cuda_fuser_profiling test_jit_profiling test_jit_fuser_te test_tensorexpr --verbose --determine-from="$DETERMINE_FROM"
   assert_git_not_dirty
 }
 

aten/src/ATen/Context.cpp

Lines changed: 0 additions & 8 deletions
@@ -86,14 +86,6 @@ void Context::setBenchmarkCuDNN(bool b) {
   benchmark_cudnn = b;
 }
 
-bool Context::allowTF32CuBLAS() const {
-  return allow_tf32_cublas;
-}
-
-void Context::setAllowTF32CuBLAS(bool b) {
-  allow_tf32_cublas = b;
-}
-
 bool Context::hasMKL() const {
 #if AT_MKL_ENABLED()
   return true;

aten/src/ATen/Context.h

Lines changed: 0 additions & 3 deletions
@@ -109,8 +109,6 @@ class CAFFE2_API Context {
   bool deterministic() const;
   void setDeterministic(bool);
   void alertNotDeterministic(c10::string_view const& caller);
-  bool allowTF32CuBLAS() const;
-  void setAllowTF32CuBLAS(bool);
   at::QEngine qEngine() const;
   void setQEngine(at::QEngine e);
   const std::vector<at::QEngine>& supportedQEngines() const;
@@ -138,7 +136,6 @@
   bool deterministic_cudnn = false;
   bool _deterministic = false;
   bool benchmark_cudnn = false;
-  bool allow_tf32_cublas = true;
   bool enabled_mkldnn = true;
 #ifdef C10_MOBILE
   bool release_original_weights = true;

aten/src/ATen/cuda/CUDABlas.cpp

Lines changed: 0 additions & 8 deletions
@@ -233,11 +233,7 @@ void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
 #else
   cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
   if (prop->major >= 5) {
-#if defined(CUDA_VERSION) && CUDA_VERSION < 11000
-    // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH
-    // manually to be able to use tensor cores for FP16. On CUDA 11, this is no longer required.
     TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
-#endif // CUDA_VERSION < 11000
     TORCH_CUDABLAS_CHECK(cublasGemmEx(
         handle,
         opa,
@@ -258,11 +254,7 @@ void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
         ldc,
         CUDA_R_32F,
         CUBLAS_GEMM_DFALT_TENSOR_OP));
-#if defined(CUDA_VERSION) && CUDA_VERSION < 11000
-    // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH
-    // manually to be able to use tensor cores for FP16. On CUDA 11, this is no longer required.
     TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
-#endif // CUDA_VERSION < 11000
   } else {
     TORCH_CUDABLAS_CHECK(cublasSgemmEx(
         handle,

aten/src/ATen/cuda/CublasHandlePool.cpp

Lines changed: 0 additions & 10 deletions
@@ -41,16 +41,6 @@ cublasHandle_t getCurrentCUDABlasHandle() {
   auto handle = myPoolWindow->reserve(device);
   auto stream = c10::cuda::getCurrentCUDAStream();
   TORCH_CUDABLAS_CHECK(cublasSetStream(handle, stream));
-#if CUDA_VERSION >= 11000
-  // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup
-  // FP32 data type calculations based on the value of the allow_tf32 flag.
-  // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH.
-  if (at::globalContext().allowTF32CuBLAS()) {
-    TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH));
-  } else {
-    TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
-  }
-#endif
   return handle;
 }
 

aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu

Lines changed: 5 additions & 1 deletion
@@ -643,7 +643,7 @@ namespace {
     const dim3 block(block_x, block_y, block_z);
     int kernel_stride_C = cuda::ATenCeilDiv(sizeC, block_x * 4);
     int kernel_size_C = cuda::ATenCeilDiv(sizeC, block_x * kernel_stride_C);
-    
+
     // Do NOT clip grid_x, striding on Batch dimension is not in the kernel,
     // although it could be easily implemented given current kernel.
     int grid_x = sizeB*kernel_stride_C;
@@ -757,6 +757,8 @@
     const Tensor& gradOutput,
     const Tensor& input)
   {
+    // Nondeterministic because of atomicAdd usage
+    globalContext().alertNotDeterministic("adaptive_avg_pool2d_backward_out_cuda");
     gradInput.resize_as_(input);
     adaptive_avg_pool2d_backward_out_cuda_template(
       gradInput, gradOutput, input);
@@ -767,6 +769,8 @@
     const Tensor& gradOutput,
     const Tensor& input)
   {
+    // Nondeterministic because of atomicAdd usage
+    globalContext().alertNotDeterministic("adaptive_avg_pool2d_backward_cuda");
     auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
     adaptive_avg_pool2d_backward_out_cuda_template(
       gradInput, gradOutput, input);

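The pooling changes in this commit all follow the pattern above: the CUDA backward paths rely on `atomicAdd`, so they now call `alertNotDeterministic` before launching. Below is a rough Python sketch of how such an alert is expected to surface, assuming an experimental `torch.set_deterministic` toggle wired to `Context::setDeterministic()`; the toggle's name and the exact error text are assumptions, not shown in this diff.

```python
# Hypothetical sketch (not from this commit): how the alertNotDeterministic()
# calls added to the pooling backward kernels are expected to surface.
# Assumes an experimental torch.set_deterministic toggle wired to
# Context::setDeterministic(); the name and error text are assumptions.
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8, device="cuda", requires_grad=True)
y = F.adaptive_avg_pool2d(x, output_size=(2, 2))

torch.set_deterministic(True)  # ask PyTorch to reject nondeterministic ops

try:
    # adaptive_avg_pool2d's CUDA backward uses atomicAdd, so with the flag
    # set it should raise rather than silently return nondeterministic grads.
    y.sum().backward()
except RuntimeError as err:
    print("nondeterministic op rejected:", err)
```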
aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu

Lines changed: 4 additions & 0 deletions
@@ -507,13 +507,17 @@ Tensor& adaptive_avg_pool3d_backward_out_cuda(
     Tensor& gradInput,
     const Tensor& gradOutput_,
     const Tensor& input) {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("adaptive_avg_pool3d_backward_out_cuda");
   adaptive_avg_pool3d_backward_out_cuda_template(gradInput, gradOutput_, input);
   return gradInput;
 }
 
 Tensor adaptive_avg_pool3d_backward_cuda(
     const Tensor& gradOutput_,
     const Tensor& input) {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("adaptive_avg_pool3d_backward_cuda");
   auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   adaptive_avg_pool3d_backward_out_cuda_template(gradInput, gradOutput_, input);
   return gradInput;

aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu

Lines changed: 4 additions & 0 deletions
@@ -451,6 +451,8 @@ Tensor& adaptive_max_pool2d_backward_out_cuda(
   const Tensor& input,
   const Tensor& indices)
 {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("adaptive_max_pool2d_backward_out_cuda");
   adaptive_max_pool2d_backward_out_cuda_template(
     gradInput,
     gradOutput_,
@@ -464,6 +466,8 @@ Tensor adaptive_max_pool2d_backward_cuda(
   const Tensor& input,
   const Tensor& indices)
 {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("adaptive_max_pool2d_backward_cuda");
   auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   adaptive_max_pool2d_backward_out_cuda_template(
     gradInput,

aten/src/ATen/native/cuda/AveragePool3d.cu

Lines changed: 6 additions & 2 deletions
@@ -447,7 +447,7 @@ void avg_pool3d_out_cuda_template(
         break;
       }
 
-    AT_CUDA_CHECK(cudaGetLastError());
+      AT_CUDA_CHECK(cudaGetLastError());
 
       totalZ -= 65535;
       offsetZ += 65535;
@@ -585,7 +585,7 @@ void avg_pool3d_backward_out_cuda_template(
           1.0f/divide_factor,
           offsetZ);
 
-    AT_CUDA_CHECK(cudaGetLastError());
+      AT_CUDA_CHECK(cudaGetLastError());
 
       totalZ -= 65535;
       offsetZ += 65535;
@@ -700,6 +700,8 @@ Tensor& avg_pool3d_backward_out_cuda(
   bool count_include_pad,
   c10::optional<int64_t> divisor_override)
 {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("avg_pool3d_backward_out_cuda");
   avg_pool3d_backward_out_cuda_template(
     gradInput,
     gradOutput_,
@@ -723,6 +725,8 @@ Tensor avg_pool3d_backward_cuda(
   bool count_include_pad,
   c10::optional<int64_t> divisor_override)
 {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("avg_pool3d_backward_cuda");
   auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   avg_pool3d_backward_out_cuda_template(
     gradInput,

aten/src/ATen/native/cuda/DilatedMaxPool3d.cu

Lines changed: 6 additions & 2 deletions
@@ -113,7 +113,7 @@ void max_pool3d_with_indices_out_frame(
       dilationT, dilationH, dilationW,
       offsetZ);
 
-    AT_CUDA_CHECK(cudaGetLastError());
+      AT_CUDA_CHECK(cudaGetLastError());
 
       totalZ -= 65535;
       offsetZ += 65535;
@@ -179,7 +179,7 @@ void max_pool3d_with_indices_backward_out_frame(
       dilationT, dilationH, dilationW,
       offsetZ);
 
-    AT_CUDA_CHECK(cudaGetLastError());
+      AT_CUDA_CHECK(cudaGetLastError());
 
       totalZ -= 65535;
      offsetZ += 65535;
@@ -468,6 +468,8 @@ Tensor& max_pool3d_with_indices_backward_out_cuda(
   bool ceil_mode,
   const Tensor& indices)
 {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("max_pool3d_with_indices_backward_out_cuda");
   max_pool3d_with_indices_backward_out_cuda_template(
     gradInput,
     gradOutput,
@@ -491,6 +493,8 @@ Tensor max_pool3d_with_indices_backward_cuda(
   bool ceil_mode,
   const Tensor& indices)
 {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("max_pool3d_with_indices_backward_cuda");
   auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   max_pool3d_with_indices_backward_out_cuda_template(
     gradInput,
