 #include <cuda_bf16.h>
 #endif
 
+// ROCm 6.3 is planned to have these functions, but until then here they are.
+#if defined(USE_ROCM) && ROCM_VERSION >= 60201
+#include <hip/hip_bf16.h>
+#include <hip/hip_fp16.h>
+
+__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
+#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
+    __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
+  typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
+  static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));
+  union {
+    __hip_bfloat162_raw bf162_raw;
+    vec_short2 vs2;
+  } u{static_cast<__hip_bfloat162_raw>(value)};
+  u.vs2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, u.vs2);
+  return static_cast<__hip_bfloat162>(u.bf162_raw);
+#else
+  static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw));
+  union u_hold {
+    __hip_bfloat162_raw h2r;
+    unsigned int u32;
+  };
+  u_hold old_val, new_val;
+  old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  do {
+    new_val.h2r = __hadd2(old_val.h2r, value);
+  } while (!__hip_atomic_compare_exchange_strong(
+      (unsigned int*)address, &old_val.u32, new_val.u32,
+      __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
+  return old_val.h2r;
+#endif
+}
+
+__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) {
+#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
+    __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16)
+  // The api expects an ext_vector_type of half
+  typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162;
+  static_assert(sizeof(vec_fp162) == sizeof(__half2_raw));
+  union {
+    __half2_raw h2r;
+    vec_fp162 fp16;
+  } u{static_cast<__half2_raw>(value)};
+  u.fp16 = __builtin_amdgcn_flat_atomic_fadd_v2f16((vec_fp162*)address, u.fp16);
+  return static_cast<__half2>(u.h2r);
+#else
+  static_assert(sizeof(__half2_raw) == sizeof(unsigned int));
+  union u_hold {
+    __half2_raw h2r;
+    unsigned int u32;
+  };
+  u_hold old_val, new_val;
+  old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  do {
+    new_val.h2r = __hadd2(old_val.h2r, value);
+  } while (!__hip_atomic_compare_exchange_strong(
+      (unsigned int*)address, &old_val.u32, new_val.u32,
+      __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
+  return old_val.h2r;
+#endif
+}
+#define ATOMICADD preview_unsafeAtomicAdd
+#define NATIVE_ZERO_BF16 __float2bfloat16(0.0f)
+#else
+#define ATOMICADD atomicAdd
+#define NATIVE_ZERO_BF16 __int2bfloat16_rz(0)
+#endif
+
 namespace at::native {
 
 __device__ __forceinline__ size_t
@@ -47,7 +115,7 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
     const index_t numel,
     scalar_t value) {
 #if ( \
-    (defined(USE_ROCM)) || \
+    (defined(USE_ROCM) && ROCM_VERSION < 60201) || \
     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
   gpuAtomicAddNoReturn(
       reinterpret_cast<at::Half*>(tensor) + index,
@@ -61,17 +129,22 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
     __half2 value2;
     value2.x = static_cast<__half>(value);
     value2.y = __int2half_rz(0);
-    atomicAdd(reinterpret_cast<__half2*>(target_addr), value2);
+    ATOMICADD(reinterpret_cast<__half2*>(target_addr), value2);
 
   } else if (!low_byte && index > 0) {
     __half2 value2;
     value2.x = __int2half_rz(0);
     value2.y = static_cast<__half>(value);
-    atomicAdd(reinterpret_cast<__half2*>(target_addr - 1), value2);
+    ATOMICADD(reinterpret_cast<__half2*>(target_addr - 1), value2);
 
   } else {
+#ifdef USE_ROCM
+    gpuAtomicAddNoReturn(
+        reinterpret_cast<at::Half*>(tensor) + index, static_cast<at::Half>(value));
+#else
     atomicAdd(
         reinterpret_cast<__half*>(tensor) + index, static_cast<__half>(value));
+#endif
   }
 #endif
 }
@@ -87,7 +160,7 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
     const index_t numel,
     scalar_t value) {
 #if ( \
-    (defined(USE_ROCM)) || \
+    (defined(USE_ROCM) && ROCM_VERSION < 60201) || \
     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))
   gpuAtomicAddNoReturn(
       reinterpret_cast<at::BFloat16*>(tensor) + index,
@@ -100,18 +173,23 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
   if (low_byte && index < (numel - 1)) {
     __nv_bfloat162 value2;
     value2.x = *reinterpret_cast<__nv_bfloat16*>(&value);
-    value2.y = __int2bfloat16_rz(0);
-    atomicAdd(reinterpret_cast<__nv_bfloat162*>(target_addr), value2);
+    value2.y = NATIVE_ZERO_BF16;
+    ATOMICADD(reinterpret_cast<__nv_bfloat162*>(target_addr), value2);
 
   } else if (!low_byte && index > 0) {
     __nv_bfloat162 value2;
-    value2.x = __int2bfloat16_rz(0);
+    value2.x = NATIVE_ZERO_BF16;
     value2.y = *reinterpret_cast<__nv_bfloat16*>(&value);
-    atomicAdd(reinterpret_cast<__nv_bfloat162*>(target_addr - 1), value2);
+    ATOMICADD(reinterpret_cast<__nv_bfloat162*>(target_addr - 1), value2);
 
   } else {
+#ifdef USE_ROCM
+    gpuAtomicAddNoReturn(
+        reinterpret_cast<at::BFloat16*>(tensor) + index, static_cast<at::BFloat16>(value));
+#else
     atomicAdd(
         reinterpret_cast<__nv_bfloat16*>(tensor) + index, *reinterpret_cast<__nv_bfloat16*>(&value));
+#endif
   }
 #endif
 }
@@ -144,4 +222,7 @@ __device__ __forceinline__ void fastAtomicAdd(
   }
 }
 
+#undef ATOMICADD
+#undef NATIVE_ZERO_BF16
+
 } // namespace at::native
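
Note (not part of the patch): when ROCM_VERSION >= 60201, ATOMICADD resolves to preview_unsafeAtomicAdd, so the packed __half2 / __nv_bfloat162 additions in fastSpecializedAtomicAdd use either the gfx940/941/942 flat_atomic_fadd builtins or the 32-bit compare-and-swap fallback. The sketch below is a hypothetical caller for illustration only; it assumes fastAtomicAdd keeps its existing (tensor, index, numel, value, fast_atomics) signature from this header, and the kernel and its names are not part of this change.

#include <ATen/native/cuda/KernelUtils.cuh>

// Illustrative scatter-add kernel (hypothetical, for context only): each
// thread adds one source element into out[idx[i]]. With fast_atomics=true,
// Half/BFloat16 values can take the packed two-element path guarded by
// ATOMICADD above.
template <typename scalar_t, typename index_t>
__global__ void scatter_add_kernel(
    scalar_t* out,
    const scalar_t* src,
    const index_t* idx,
    index_t n,
    index_t out_numel) {
  const index_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    at::native::fastAtomicAdd(out, idx[i], out_numel, src[i], /*fast_atomics=*/true);
  }
}

On ROCm older than 6.2.1, and on CUDA architectures below 700 (Half) or 800 (BFloat16), the same call degrades to the gpuAtomicAddNoReturn branch guarded at the top of each fastSpecializedAtomicAdd specialization.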