
Commit 0c14c90

Update on "[Gloo] Support work-level timeouts in ProcessGroupGloo"
Add work-level timeouts to ProcessGroupGloo. This uses the timeout support in the `waitSend` and `waitRecv` functions of Gloo's `unbound_buffer` construct.

Differential Revision: [D22173763](https://our.internmc.facebook.com/intern/diff/D22173763/)

[ghstack-poisoned]
2 parents ff7f83f + 1177de2 commit 0c14c90
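For orientation, here is a minimal Python sketch of what a work-level timeout enables; it is an illustration, not code from this commit. It assumes two already-launched ranks with the usual `env://` rendezvous variables and that `Work.wait()` accepts the optional `timeout` argument this stack adds; the tensor shape and the 5-second value are arbitrary.

```python
# Hypothetical usage sketch (not part of this commit): exercising a per-work
# timeout on the Gloo backend. Assumes two ranks already launched with the
# usual env:// rendezvous variables, and that Work.wait() accepts the optional
# timeout added by this stack.
from datetime import timedelta

import torch
import torch.distributed as dist

dist.init_process_group(backend="gloo")

tensor = torch.ones(4)
if dist.get_rank() == 0:
    work = dist.isend(tensor, dst=1)   # returns a Work handle
else:
    work = dist.irecv(tensor, src=0)

# Wait on this specific work object. If the peer never posts the matching
# send/recv within 5 seconds, Gloo's unbound_buffer waitSend/waitRecv timeout
# fires and wait() raises instead of blocking forever.
work.wait(timeout=timedelta(seconds=5))

dist.destroy_process_group()
```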


56 files changed: 1,215 additions, 328 deletions

.jenkins/pytorch/test.sh

Lines changed: 1 addition & 1 deletion
@@ -150,7 +150,7 @@ test_python_nn() {
 }
 
 test_python_ge_config_profiling() {
-  time python test/run_test.py --include test_jit_cuda_fuser_profiling test_jit_profiling test_jit_fuser_te --verbose --determine-from="$DETERMINE_FROM"
+  time python test/run_test.py --include test_jit_cuda_fuser_profiling test_jit_profiling test_jit_fuser_te test_tensorexpr --verbose --determine-from="$DETERMINE_FROM"
   assert_git_not_dirty
 }
 

aten/src/ATen/Context.cpp

Lines changed: 0 additions & 8 deletions
@@ -86,14 +86,6 @@ void Context::setBenchmarkCuDNN(bool b) {
   benchmark_cudnn = b;
 }
 
-bool Context::allowTF32CuBLAS() const {
-  return allow_tf32_cublas;
-}
-
-void Context::setAllowTF32CuBLAS(bool b) {
-  allow_tf32_cublas = b;
-}
-
 bool Context::hasMKL() const {
 #if AT_MKL_ENABLED()
   return true;

aten/src/ATen/Context.h

Lines changed: 0 additions & 3 deletions
@@ -109,8 +109,6 @@ class CAFFE2_API Context {
   bool deterministic() const;
   void setDeterministic(bool);
   void alertNotDeterministic(c10::string_view const& caller);
-  bool allowTF32CuBLAS() const;
-  void setAllowTF32CuBLAS(bool);
   at::QEngine qEngine() const;
   void setQEngine(at::QEngine e);
   const std::vector<at::QEngine>& supportedQEngines() const;
@@ -138,7 +136,6 @@
   bool deterministic_cudnn = false;
   bool _deterministic = false;
   bool benchmark_cudnn = false;
-  bool allow_tf32_cublas = true;
   bool enabled_mkldnn = true;
 #ifdef C10_MOBILE
   bool release_original_weights = true;

aten/src/ATen/cuda/CUDABlas.cpp

Lines changed: 0 additions & 8 deletions
@@ -233,11 +233,7 @@ void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
 #else
   cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
   if (prop->major >= 5) {
-#if defined(CUDA_VERSION) && CUDA_VERSION < 11000
-    // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH
-    // manually to be able to use tensor cores for FP16. On CUDA 11, this is no longer required.
     TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
-#endif // CUDA_VERSION < 11000
     TORCH_CUDABLAS_CHECK(cublasGemmEx(
         handle,
         opa,
@@ -258,11 +254,7 @@ void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
         ldc,
         CUDA_R_32F,
         CUBLAS_GEMM_DFALT_TENSOR_OP));
-#if defined(CUDA_VERSION) && CUDA_VERSION < 11000
-    // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH
-    // manually to be able to use tensor cores for FP16. On CUDA 11, this is no longer required.
     TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
-#endif // CUDA_VERSION < 11000
   } else {
     TORCH_CUDABLAS_CHECK(cublasSgemmEx(
         handle,

aten/src/ATen/cuda/CublasHandlePool.cpp

Lines changed: 0 additions & 10 deletions
@@ -41,16 +41,6 @@ cublasHandle_t getCurrentCUDABlasHandle() {
   auto handle = myPoolWindow->reserve(device);
   auto stream = c10::cuda::getCurrentCUDAStream();
   TORCH_CUDABLAS_CHECK(cublasSetStream(handle, stream));
-#if CUDA_VERSION >= 11000
-  // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup
-  // FP32 data type calculations based on the value of the allow_tf32 flag.
-  // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH.
-  if (at::globalContext().allowTF32CuBLAS()) {
-    TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH));
-  } else {
-    TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
-  }
-#endif
   return handle;
 }
 

aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu

Lines changed: 5 additions & 1 deletion
@@ -643,7 +643,7 @@ namespace {
     const dim3 block(block_x, block_y, block_z);
     int kernel_stride_C = cuda::ATenCeilDiv(sizeC, block_x * 4);
     int kernel_size_C = cuda::ATenCeilDiv(sizeC, block_x * kernel_stride_C);
-    
+
     // Do NOT clip grid_x, striding on Batch dimension is not in the kernel,
     // although it could be easily implemented given current kernel.
     int grid_x = sizeB*kernel_stride_C;
@@ -757,6 +757,8 @@
     const Tensor& gradOutput,
     const Tensor& input)
   {
+    // Nondeterministic because of atomicAdd usage
+    globalContext().alertNotDeterministic("adaptive_avg_pool2d_backward_out_cuda");
     gradInput.resize_as_(input);
     adaptive_avg_pool2d_backward_out_cuda_template(
       gradInput, gradOutput, input);
@@ -767,6 +769,8 @@
     const Tensor& gradOutput,
     const Tensor& input)
   {
+    // Nondeterministic because of atomicAdd usage
+    globalContext().alertNotDeterministic("adaptive_avg_pool2d_backward_cuda");
     auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
     adaptive_avg_pool2d_backward_out_cuda_template(
       gradInput, gradOutput, input);

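The pooling changes in this commit all follow the pattern above: the CUDA backward paths rely on `atomicAdd`, so they now call `alertNotDeterministic` before launching. Below is a rough Python sketch of how such an alert is expected to surface, assuming an experimental `torch.set_deterministic` toggle wired to `Context::setDeterministic()`; the toggle's name and the exact error text are assumptions, not shown in this diff.

```python
# Hypothetical sketch (not from this commit): how the alertNotDeterministic()
# calls added to the pooling backward kernels are expected to surface.
# Assumes an experimental torch.set_deterministic toggle wired to
# Context::setDeterministic(); the name and error text are assumptions.
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8, device="cuda", requires_grad=True)
y = F.adaptive_avg_pool2d(x, output_size=(2, 2))

torch.set_deterministic(True)  # ask PyTorch to reject nondeterministic ops

try:
    # adaptive_avg_pool2d's CUDA backward uses atomicAdd, so with the flag
    # set it should raise rather than silently return nondeterministic grads.
    y.sum().backward()
except RuntimeError as err:
    print("nondeterministic op rejected:", err)
```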
aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu

Lines changed: 4 additions & 0 deletions
@@ -507,13 +507,17 @@ Tensor& adaptive_avg_pool3d_backward_out_cuda(
     Tensor& gradInput,
     const Tensor& gradOutput_,
     const Tensor& input) {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("adaptive_avg_pool3d_backward_out_cuda");
   adaptive_avg_pool3d_backward_out_cuda_template(gradInput, gradOutput_, input);
   return gradInput;
 }
 
 Tensor adaptive_avg_pool3d_backward_cuda(
     const Tensor& gradOutput_,
     const Tensor& input) {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("adaptive_avg_pool3d_backward_cuda");
   auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   adaptive_avg_pool3d_backward_out_cuda_template(gradInput, gradOutput_, input);
   return gradInput;

aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu

Lines changed: 4 additions & 0 deletions
@@ -451,6 +451,8 @@ Tensor& adaptive_max_pool2d_backward_out_cuda(
   const Tensor& input,
   const Tensor& indices)
 {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("adaptive_max_pool2d_backward_out_cuda");
   adaptive_max_pool2d_backward_out_cuda_template(
     gradInput,
     gradOutput_,
@@ -464,6 +466,8 @@ Tensor adaptive_max_pool2d_backward_cuda(
   const Tensor& input,
   const Tensor& indices)
 {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("adaptive_max_pool2d_backward_cuda");
   auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   adaptive_max_pool2d_backward_out_cuda_template(
     gradInput,

aten/src/ATen/native/cuda/AveragePool3d.cu

Lines changed: 6 additions & 2 deletions
@@ -447,7 +447,7 @@ void avg_pool3d_out_cuda_template(
         break;
       }
 
-    AT_CUDA_CHECK(cudaGetLastError());
+      AT_CUDA_CHECK(cudaGetLastError());
 
       totalZ -= 65535;
       offsetZ += 65535;
@@ -585,7 +585,7 @@ void avg_pool3d_backward_out_cuda_template(
           1.0f/divide_factor,
           offsetZ);
 
-    AT_CUDA_CHECK(cudaGetLastError());
+      AT_CUDA_CHECK(cudaGetLastError());
 
       totalZ -= 65535;
       offsetZ += 65535;
@@ -700,6 +700,8 @@ Tensor& avg_pool3d_backward_out_cuda(
   bool count_include_pad,
   c10::optional<int64_t> divisor_override)
 {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("avg_pool3d_backward_out_cuda");
   avg_pool3d_backward_out_cuda_template(
     gradInput,
     gradOutput_,
@@ -723,6 +725,8 @@ Tensor avg_pool3d_backward_cuda(
   bool count_include_pad,
   c10::optional<int64_t> divisor_override)
 {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("avg_pool3d_backward_cuda");
   auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   avg_pool3d_backward_out_cuda_template(
     gradInput,

aten/src/ATen/native/cuda/DilatedMaxPool3d.cu

Lines changed: 6 additions & 2 deletions
@@ -113,7 +113,7 @@ void max_pool3d_with_indices_out_frame(
       dilationT, dilationH, dilationW,
       offsetZ);
 
-    AT_CUDA_CHECK(cudaGetLastError());
+      AT_CUDA_CHECK(cudaGetLastError());
 
       totalZ -= 65535;
       offsetZ += 65535;
@@ -179,7 +179,7 @@ void max_pool3d_with_indices_backward_out_frame(
       dilationT, dilationH, dilationW,
       offsetZ);
 
-    AT_CUDA_CHECK(cudaGetLastError());
+      AT_CUDA_CHECK(cudaGetLastError());
 
       totalZ -= 65535;
      offsetZ += 65535;
@@ -468,6 +468,8 @@ Tensor& max_pool3d_with_indices_backward_out_cuda(
   bool ceil_mode,
   const Tensor& indices)
 {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("max_pool3d_with_indices_backward_out_cuda");
   max_pool3d_with_indices_backward_out_cuda_template(
     gradInput,
     gradOutput,
@@ -491,6 +493,8 @@ Tensor max_pool3d_with_indices_backward_cuda(
   bool ceil_mode,
   const Tensor& indices)
 {
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("max_pool3d_with_indices_backward_cuda");
   auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   max_pool3d_with_indices_backward_out_cuda_template(
     gradInput,
