
Commit ebb2dcd

Update on "[ContextParallel] add process-time based Round-Robin load-balance to CP"
**Summary**

The load-balancing problem can be modeled as the [identical-machines scheduling](https://en.wikipedia.org/wiki/Identical-machines_scheduling) problem. We already provided an easy-to-extend interface in #161062 for implementing load balancing; in this PR we start by adding a Round-Robin solution as an example, along with a verification. This can easily be adapted to other solutions such as Shortest-processing-time-first / Longest-processing-time-first, with extra padding added for collectives.

- Added a new `_LoadBalancer` implementation, `_PTRRLoadBalancer`, designed for `flex_attention()`. This load-balance strategy analyzes the `BlockMask` sparsity info and performs Round-Robin assignment (unlike traditional Round-Robin, which goes in circular order, we go in zig-zag order; a minimal illustration follows below).
- Made `_context_parallel_buffers` and `context_parallel_unshard` handle a batched load-balance index (previously they could only handle a non-batched load-balance index), as in `create_cp_block_mask`.

**Test**

`pytest test/distributed/tensor/test_attention.py`

cc H-Huang awgu wanchaol fegin fduwjj wz337 wconstab d4l3k pragupta ezyang msaroufim dcci

[ghstack-poisoned]
2 parents d63a34c + 8e3f28a commit ebb2dcd
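
To make the zig-zag ordering in the summary concrete, here is a minimal, self-contained sketch (hypothetical helper names; this is not the `_PTRRLoadBalancer` code added by the PR). Chunks are walked across ranks left-to-right, then right-to-left, so the cheap and expensive ends of a causal `BlockMask` tend to land on the same rank:

```python
# Illustrative only: zig-zag Round-Robin assignment of sequence chunks to
# context-parallel ranks. `chunk_costs` stands in for a per-chunk processing
# time estimate (e.g. the number of non-masked blocks in the BlockMask).
from typing import List


def zigzag_round_robin(chunk_costs: List[int], world_size: int) -> List[List[int]]:
    """Return, for each rank, the list of chunk indices assigned to it."""
    assignment: List[List[int]] = [[] for _ in range(world_size)]
    for i, _cost in enumerate(chunk_costs):
        round_idx, pos = divmod(i, world_size)
        # Even rounds go 0, 1, ..., world_size-1; odd rounds reverse direction.
        rank = pos if round_idx % 2 == 0 else world_size - 1 - pos
        assignment[rank].append(i)
    return assignment


if __name__ == "__main__":
    # With a causal mask, later chunks attend to more keys, so costs grow.
    costs = [1, 2, 3, 4, 5, 6, 7, 8]
    print(zigzag_round_robin(costs, world_size=4))
    # [[0, 7], [1, 6], [2, 5], [3, 4]] -- each rank's total cost is 9.
```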

File tree

269 files changed (+5589 / -1987 lines)


.ci/manywheel/build_cuda.sh

Lines changed: 5 additions & 2 deletions

@@ -187,19 +187,22 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
         export USE_CUFILE=0
     else
         DEPS_LIST+=(
-            "/usr/local/cuda/lib64/libnvToolsExt.so.1"
             "/usr/local/cuda/lib64/libcublas.so.12"
             "/usr/local/cuda/lib64/libcublasLt.so.12"
             "/usr/local/cuda/lib64/libcudart.so.12"
             "/usr/local/cuda/lib64/libnvrtc.so.12"
             "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12")
         DEPS_SONAME+=(
-            "libnvToolsExt.so.1"
             "libcublas.so.12"
             "libcublasLt.so.12"
             "libcudart.so.12"
             "libnvrtc.so.12"
             "libcupti.so.12")
+
+        if [[ $CUDA_VERSION != 12.9* ]]; then
+            DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
+            DEPS_SONAME+=("libnvToolsExt.so.1")
+        fi
     fi
 else
     echo "Using nvidia libs from pypi."

.github/ci_commit_pins/audio.txt

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-8ad2aa5d354d1bf432339113860185d5a5d1abbd
+1b013f5b5a87a1882eb143c26d79d091150d6a37

.github/ci_commit_pins/vision.txt

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-f5c6c2ec6490455e86f67b2a25c10390d60a27f7
+faffd5cf673615583da6517275e361cb3dbc77e6

aten/src/ATen/CMakeLists.txt

Lines changed: 52 additions & 51 deletions

@@ -256,6 +256,7 @@ endif()
 IF(USE_FBGEMM_GENAI)
   set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
   set(FBGEMM_GENAI_SRCS ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
+
   if(USE_CUDA)
     # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
     # If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
@@ -292,58 +293,64 @@ IF(USE_FBGEMM_GENAI)
       "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
     )

-    target_include_directories(fbgemm_genai PUBLIC
+    target_include_directories(fbgemm_genai PRIVATE
       ${FBGEMM_THIRD_PARTY}/cutlass/include
       ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
       ${fbgemm_genai_mx8mx8bf16_grouped}
       ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
       ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
     )
-  else()
-    if(USE_ROCM)
-      # Only include the kernels we want to build to avoid increasing binary size.
-      file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
-        "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
-        "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
-      set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
-
-      # Add additional HIPCC compiler flags for performance
-      set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-        -mllvm
-        -amdgpu-coerce-illegal-types=1
-        -mllvm
-        -enable-post-misched=0
-        -mllvm
-        -greedy-reverse-local-assignment=1
-        -fhip-new-launch-api)
-
-      # Only compile for gfx942 for now.
-      # This is rather hacky, I could not figure out a clean solution :(
-      set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
-      string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
-      if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
-        list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
-      endif()
-      set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
-
-      hip_add_library(
-        fbgemm_genai STATIC
-        ${fbgemm_genai_native_rocm_hip}
-        HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
-      set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
-      set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
-      target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
-
-      target_include_directories(fbgemm_genai PUBLIC
-        # FBGEMM version of Composable Kernel is used due to some customizations
-        ${FBGEMM_THIRD_PARTY}/composable_kernel/include
-        ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
-        ${FBGEMM_THIRD_PARTY}/cutlass/include
-        ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
-        ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
-        ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
-      )
+
+    # Add FBGEMM_GENAI include directories for torch_ops.h
+    list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
+    list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
+  elseif(USE_ROCM)
+    # Only include the kernels we want to build to avoid increasing binary size.
+    file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
+      "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
+      "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
+    set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
+
+    # Add additional HIPCC compiler flags for performance
+    set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
+      -mllvm
+      -amdgpu-coerce-illegal-types=1
+      -mllvm
+      -enable-post-misched=0
+      -mllvm
+      -greedy-reverse-local-assignment=1
+      -fhip-new-launch-api)
+
+    # Only compile for gfx942 for now.
+    # This is rather hacky, I could not figure out a clean solution :(
+    set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
+    string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
+    if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
+      list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
     endif()
+    set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
+
+    hip_add_library(
+      fbgemm_genai STATIC
+      ${fbgemm_genai_native_rocm_hip}
+      HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
+    set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
+    set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
+
+    target_include_directories(fbgemm_genai PRIVATE
+      # FBGEMM version of Composable Kernel is used due to some customizations
+      ${FBGEMM_THIRD_PARTY}/composable_kernel/include
+      ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
+      ${FBGEMM_THIRD_PARTY}/cutlass/include
+      ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
+      ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
+      ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
+    )
+
+    # Add FBGEMM_GENAI include directories for torch_ops.h
+    list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
+    list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
   endif()
 endif()

@@ -692,12 +699,6 @@ if(USE_CUDA AND NOT USE_ROCM)
   list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
   list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include)

-  # Add FBGEMM_GENAI include directories for torch_ops.h
-  if(USE_FBGEMM_GENAI)
-    list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
-    list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
-  endif()
-
   if($ENV{ATEN_STATIC_CUDA})
     if(CUDA_VERSION VERSION_LESS_EQUAL 12.9)
       list(APPEND ATen_CUDA_DEPENDENCY_LIBS

aten/src/ATen/cuda/tunable/GemmCommon.h

Lines changed: 27 additions & 25 deletions

@@ -13,6 +13,7 @@
 #include <c10/core/ScalarType.h>

 #include <ATen/cuda/tunable/TunableOp.h>
+#include <ATen/cuda/tunable/Tunable.h>
 #include <ATen/cuda/CUDABlas.h>
 #include <ATen/cuda/Exceptions.h>
 #include <c10/util/StringUtil.h>
@@ -150,6 +151,7 @@ inline std::string ScalarTypeToBLASType(c10::ScalarType scalar_type) {
     BLASType = "unknown";
   }
   return BLASType;
+
 }

 // Similar to Compute Type in GemmRocblas.h
@@ -244,33 +246,25 @@ inline std::string to_string_epilogue(const at::cuda::blas::GEMMAndBiasActivatio

 namespace detail {

-static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size) {
+static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size, const NumericalCheckConfig& config) {
+
+  if (!config.enabled) {
+    return true; // skip when disabled
+  }
+
   auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA);
-  // comparison done as 1D tensor
   at::Tensor ref = at::from_blob(c, {size}, options);
   at::Tensor oth = at::from_blob(other_c, {size}, options);
   at::Tensor ref_float = ref.to(at::kFloat);
   at::Tensor oth_float = oth.to(at::kFloat);
-  std::vector<double> atols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5};
-  std::vector<double> rtols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5};
-  double last_succeed_atol = 1;
-  double last_succeed_rtol = 1;
-  for (auto& atol : atols) {
-    for (auto& rtol : rtols) {
-      if (at::allclose(ref_float, oth_float, rtol, atol)) {
-        last_succeed_atol = atol;
-        last_succeed_rtol = rtol;
-      }
-    }
-  }
-  if (last_succeed_atol == 1) {
-    return false;
-  }
-  else {
-    TUNABLE_LOG3("├──verify numerics: atol=", last_succeed_atol, ", rtol=", last_succeed_rtol);
-  }

-  return true;
+  const bool ok = at::allclose(ref_float, oth_float, config.rtol, config.atol);
+  if (ok) {
+    TUNABLE_LOG3("├──verify numerics: PASSED with atol=", config.atol, ", rtol=", config.rtol);
+  } else {
+    TUNABLE_LOG3("├──verify numerics: FAILED with atol=", config.atol, ", rtol=", config.rtol);
+  }
+  return ok;
 }

 }
@@ -355,8 +349,10 @@ struct GemmParams : OpParams {
   }

   TuningStatus NumericalCheck(GemmParams<T> *other) {
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
     auto c_dtype = c10::CppTypeToScalarType<T>::value;
-    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL;
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL;
   }

   char transa{};
@@ -449,8 +445,10 @@ struct GemmAndBiasParams : OpParams {
   }

   TuningStatus NumericalCheck(GemmAndBiasParams<T> *other) {
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
     auto c_dtype = c10::CppTypeToScalarType<T>::value;
-    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL;
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL;
   }

   char transa{};
@@ -546,8 +544,10 @@ struct GemmStridedBatchedParams : OpParams {
   }

   TuningStatus NumericalCheck(GemmStridedBatchedParams<T> *other) {
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
     auto c_dtype = c10::CppTypeToScalarType<C_Dtype>::value;
-    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL;
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL;
   }

   char transa{};
@@ -663,7 +663,9 @@ struct ScaledGemmParams : OpParams {
   }

   TuningStatus NumericalCheck(ScaledGemmParams<T> *other) {
-    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL;
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL;
   }

   char transa{};
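
The net effect of the GemmCommon.h change: instead of sweeping a grid of atol/rtol values and reporting the tightest pair that passed, NumericalCheck now performs a single allclose comparison at the configured tolerances, and is skipped entirely when checking is disabled. A rough Python analogue, for illustration only (the actual implementation is the C++ shown in the diff above):

```python
# Rough Python analogue of the new detail::NumericalCheck, assuming a simple
# (enabled, atol, rtol) configuration; not the code in this commit.
import torch


def numerical_check(ref: torch.Tensor, other: torch.Tensor,
                    enabled: bool = True,
                    atol: float = 1e-5, rtol: float = 1e-5) -> bool:
    if not enabled:
        return True  # skip when disabled, mirroring config.enabled
    # Comparison is done in float32 over the flattened output buffers.
    ref_f = ref.flatten().to(torch.float32)
    oth_f = other.flatten().to(torch.float32)
    return torch.allclose(ref_f, oth_f, rtol=rtol, atol=atol)
```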

aten/src/ATen/cuda/tunable/README.md

Lines changed: 2 additions & 3 deletions

@@ -145,7 +145,7 @@ programmatically since the settings become fixed. Use the C++ or Python APIs ins
 | PYTORCH_TUNABLEOP_VERBOSE | Default is 0. Set to 1 to enable basic logging. 2 for basic tuning status. 3 for full trace. |
 | PYTORCH_TUNABLEOP_VERBOSE_FILENAME | Default is "err" for stderr. Set to "out" for stdout or a filename for capturing verbose logging. |
 | PYTORCH_TUNABLEOP_FILENAME | Default is 'tunableop_results.csv'. |
-| PYTORCH_TUNABLEOP_NUMERICAL_CHECK | Default is 0. Set to 1 to enable. |
+| PYTORCH_TUNABLEOP_NUMERICAL_CHECK | Default is off. Set 'atol_rtol' to enable, for example "1e-5_1e-5". |
 | PYTORCH_TUNABLEOP_ROCBLAS_ENABLED | Default is 1. Set to 0 to disable rocblas being considered during tuning. |
 | PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED | Default is 1. Set to 0 to disable hipblaslt being considered during tuning. |
 | PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS | Default is 30. Unit is milliseconds. |
@@ -173,10 +173,9 @@ All python APIs exist in the `torch.cuda.tunable` module.
 | get_max_tuning_iterations() -> int | |
 | set_filename(filename: str, insert_device_ordinal: bool = False) -> None | |
 | get_filename() -> str | |
+| set_numerical_check_tolerances(enable: bool, atol: float, rtol: float) -> None | Enable or disable numerical checking; atol and rtol default to 1e-5.
 | get_results() -> Tuple[str, str, str, float] | |
 | get_validators() -> Tuple[str, str] | |
-| write_file_on_exit(val: bool) -> None | Default is True. |
-| write_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). |
 | read_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). |
 | tune_gemm_in_file(filename: str) -> None | read an untuned file and tune GEMMs in it. |
 | mgpu_tune_gemm_in_file(filename_pattern: str, num_gpus: int) -> None: -> None | read one or more untuned files and tune all unique GEMMs on one or more GPUs. |
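
Putting the two README rows together, numerical checking is now configured with an explicit tolerance pair rather than a 0/1 switch. A minimal usage sketch, assuming only the signatures documented above (the exact defaults and call sites are not part of this diff):

```python
# Sketch: enabling TunableOp numerical checking with explicit tolerances.
import os

# Option 1: environment variable, encoded as "<atol>_<rtol>".
os.environ["PYTORCH_TUNABLEOP_NUMERICAL_CHECK"] = "1e-5_1e-5"

import torch

# Option 2: Python API added in the table above (enable, atol, rtol).
torch.cuda.tunable.set_numerical_check_tolerances(True, 1e-4, 1e-4)
```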
