
Commit fd6dac2

Update
[ghstack-poisoned]

2 parents: 05f09d9 + 8b38c0d

27 files changed: +1825 −623 lines

.ci/pytorch/test.sh

Lines changed: 2 additions & 2 deletions
@@ -376,7 +376,7 @@ test_inductor_cpp_wrapper_abi_compatible() {
 
   echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
   PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper
-  python test/run_test.py --include inductor/test_cuda_cpp_wrapper inductor/test_cpu_repro
+  python test/run_test.py --include inductor/test_cuda_cpp_wrapper inductor/test_cpu_repro inductor/test_extension_backend
 
   TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
     --training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
@@ -403,7 +403,7 @@ pr_time_benchmarks() {
   PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks source benchmarks/dynamo/pr_time_benchmarks/benchmark_runner.sh "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "benchmarks/dynamo/pr_time_benchmarks/benchmarks"
   echo "benchmark results on current PR: "
   cat "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv"
-
+  PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks python benchmarks/dynamo/pr_time_benchmarks/check_results.py "benchmarks/dynamo/pr_time_benchmarks/expected_results.csv" "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv"
 }
 
 if [[ "${TEST_CONFIG}" == *pr_time_benchmarks* ]]; then

.lintrunner.toml

Lines changed: 0 additions & 2 deletions
@@ -1254,7 +1254,6 @@ exclude_patterns = [
     'torch/fx/experimental/refinement_types.py',
     'torch/fx/experimental/rewriter.py',
     'torch/fx/experimental/schema_type_annotation.py',
-    'torch/fx/experimental/symbolic_shapes.py',
     'torch/fx/experimental/unification/__init__.py',
     'torch/fx/experimental/unification/core.py',
     'torch/fx/experimental/unification/dispatch.py',
@@ -1270,7 +1269,6 @@ exclude_patterns = [
     'torch/fx/experimental/unification/utils.py',
     'torch/fx/experimental/unification/variable.py',
     'torch/fx/experimental/unify_refinements.py',
-    'torch/fx/experimental/validator.py',
     'torch/fx/graph.py',
     'torch/fx/graph_module.py',
     'torch/fx/interpreter.py',

aten/src/ATen/native/cuda/KernelUtils.cuh

Lines changed: 89 additions & 8 deletions
@@ -5,6 +5,74 @@
 #include <cuda_bf16.h>
 #endif
 
+// ROCm 6.3 is planned to have these functions, but until then here they are.
+#if defined(USE_ROCM) && ROCM_VERSION >= 60201
+#include <hip/hip_bf16.h>
+#include <hip/hip_fp16.h>
+
+__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
+#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
+    __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
+  typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
+  static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));
+  union {
+    __hip_bfloat162_raw bf162_raw;
+    vec_short2 vs2;
+  } u{static_cast<__hip_bfloat162_raw>(value)};
+  u.vs2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, u.vs2);
+  return static_cast<__hip_bfloat162>(u.bf162_raw);
+#else
+  static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw));
+  union u_hold {
+    __hip_bfloat162_raw h2r;
+    unsigned int u32;
+  };
+  u_hold old_val, new_val;
+  old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  do {
+    new_val.h2r = __hadd2(old_val.h2r, value);
+  } while (!__hip_atomic_compare_exchange_strong(
+      (unsigned int*)address, &old_val.u32, new_val.u32,
+      __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
+  return old_val.h2r;
+#endif
+}
+
+__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) {
+#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
+    __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16)
+  // The api expects an ext_vector_type of half
+  typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162;
+  static_assert(sizeof(vec_fp162) == sizeof(__half2_raw));
+  union {
+    __half2_raw h2r;
+    vec_fp162 fp16;
+  } u{static_cast<__half2_raw>(value)};
+  u.fp16 = __builtin_amdgcn_flat_atomic_fadd_v2f16((vec_fp162*)address, u.fp16);
+  return static_cast<__half2>(u.h2r);
+#else
+  static_assert(sizeof(__half2_raw) == sizeof(unsigned int));
+  union u_hold {
+    __half2_raw h2r;
+    unsigned int u32;
+  };
+  u_hold old_val, new_val;
+  old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  do {
+    new_val.h2r = __hadd2(old_val.h2r, value);
+  } while (!__hip_atomic_compare_exchange_strong(
+      (unsigned int*)address, &old_val.u32, new_val.u32,
+      __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
+  return old_val.h2r;
+#endif
+}
+#define ATOMICADD preview_unsafeAtomicAdd
+#define NATIVE_ZERO_BF16 __float2bfloat16(0.0f)
+#else
+#define ATOMICADD atomicAdd
+#define NATIVE_ZERO_BF16 __int2bfloat16_rz(0)
+#endif
+
 namespace at::native {
 
 __device__ __forceinline__ size_t
@@ -47,7 +115,7 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
     const index_t numel,
     scalar_t value) {
 #if ( \
-    (defined(USE_ROCM)) || \
+    (defined(USE_ROCM) && ROCM_VERSION < 60201) || \
     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
   gpuAtomicAddNoReturn(
       reinterpret_cast<at::Half*>(tensor) + index,
@@ -61,17 +129,22 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
     __half2 value2;
     value2.x = static_cast<__half>(value);
     value2.y = __int2half_rz(0);
-    atomicAdd(reinterpret_cast<__half2*>(target_addr), value2);
+    ATOMICADD(reinterpret_cast<__half2*>(target_addr), value2);
 
   } else if (!low_byte && index > 0) {
     __half2 value2;
     value2.x = __int2half_rz(0);
    value2.y = static_cast<__half>(value);
-    atomicAdd(reinterpret_cast<__half2*>(target_addr - 1), value2);
+    ATOMICADD(reinterpret_cast<__half2*>(target_addr - 1), value2);
 
   } else {
+#ifdef USE_ROCM
+    gpuAtomicAddNoReturn(
+        reinterpret_cast<at::Half*>(tensor) + index, static_cast<at::Half>(value));
+#else
     atomicAdd(
         reinterpret_cast<__half*>(tensor) + index, static_cast<__half>(value));
+#endif
   }
 #endif
 }
@@ -87,7 +160,7 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
     const index_t numel,
     scalar_t value) {
 #if ( \
-    (defined(USE_ROCM)) || \
+    (defined(USE_ROCM) && ROCM_VERSION < 60201) || \
     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))
   gpuAtomicAddNoReturn(
       reinterpret_cast<at::BFloat16*>(tensor) + index,
@@ -100,18 +173,23 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
   if (low_byte && index < (numel - 1)) {
     __nv_bfloat162 value2;
     value2.x = *reinterpret_cast<__nv_bfloat16*>(&value);
-    value2.y = __int2bfloat16_rz(0);
-    atomicAdd(reinterpret_cast<__nv_bfloat162*>(target_addr), value2);
+    value2.y = NATIVE_ZERO_BF16;
+    ATOMICADD(reinterpret_cast<__nv_bfloat162*>(target_addr), value2);
 
   } else if (!low_byte && index > 0) {
     __nv_bfloat162 value2;
-    value2.x = __int2bfloat16_rz(0);
+    value2.x = NATIVE_ZERO_BF16;
     value2.y = *reinterpret_cast<__nv_bfloat16*>(&value);
-    atomicAdd(reinterpret_cast<__nv_bfloat162*>(target_addr - 1), value2);
+    ATOMICADD(reinterpret_cast<__nv_bfloat162*>(target_addr - 1), value2);
 
   } else {
+#ifdef USE_ROCM
+    gpuAtomicAddNoReturn(
+        reinterpret_cast<at::BFloat16*>(tensor) + index, static_cast<at::BFloat16>(value));
+#else
     atomicAdd(
         reinterpret_cast<__nv_bfloat16*>(tensor) + index, *reinterpret_cast<__nv_bfloat16*>(&value));
+#endif
   }
 #endif
 }
@@ -144,4 +222,7 @@ __device__ __forceinline__ void fastAtomicAdd(
   }
 }
 
+#undef ATOMICADD
+#undef NATIVE_ZERO_BF16
+
 } // namespace at::native

benchmarks/dynamo/pr_time_benchmarks/benchmark_base.py

Lines changed: 4 additions & 3 deletions
@@ -54,11 +54,12 @@
 
 
 class BenchmarkBase(ABC):
-    # measure total number of instruction spent in _work.
+    # Measure total number of instruction spent in _work.
+    # Garbage collection is NOT disabled during _work().
     _enable_instruction_count = False
 
-    # measure total number of instruction spent in convert_frame.compile_inner
-    # TODO is there other parts we need to add ?
+    # Measure total number of instruction spent in convert_frame.compile_inner
+    # Garbage collection is disabled during _work() to avoid noise.
    _enable_compile_time_instruction_count = False
 
     # number of iterations used to run when collecting instruction_count or compile_time_instruction_count.
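
The two flags above are opt-in switches on BenchmarkBase subclasses. A minimal sketch of how a benchmark might opt into compile-time instruction counting, assuming the import resolves via the PYTHONPATH set in .ci/pytorch/test.sh above; the subclass name and its _work body are hypothetical, and BenchmarkBase may require further abstract methods that are elided here:

# Hedged sketch: hypothetical BenchmarkBase subclass. BenchmarkBase and
# the flag name come from the diff above; everything else is illustrative.
from benchmark_base import BenchmarkBase


class AddLoopLikeBenchmark(BenchmarkBase):
    # Count instructions spent in convert_frame.compile_inner;
    # GC is disabled during _work() to avoid noise (see comments above).
    _enable_compile_time_instruction_count = True

    def _work(self):
        ...  # the compile-and-run workload being measured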
benchmarks/dynamo/pr_time_benchmarks/check_results.py

Lines changed: 141 additions & 0 deletions

@@ -0,0 +1,141 @@
+import csv
+import json
+import sys
+from dataclasses import dataclass
+
+import torch._logging.scribe as scribe
+
+
+@dataclass
+class ExpectedFileEntry:
+    benchmark_name: str
+    metric_name: str
+    expected_value: int
+    noise_margin: float
+
+
+@dataclass
+class ResultFileEntry:
+    benchmark_name: str
+    metric_name: str
+    actual_value: int
+
+
+def main():
+    # The expected file holds the results that we are comparing against.
+    # It has the following format:
+    # benchmark_name, metric name, expected value, noise margin (as percentage)
+    # Example:
+    # add_loop_eager,compile_time_instruction_count,283178305, 0.01 (1% noise margin)
+    expected_file_path = sys.argv[1]
+
+    # The result file holds the results of the current run. It has the following format:
+    # benchmark_name, metric name, actual value
+    # Example:
+    # add_loop_eager,compile_time_instruction_count,283178305
+    result_file_path = sys.argv[2]
+
+    # Read expected data file.
+    expected_data: dict[tuple[str, str], ExpectedFileEntry] = {}
+
+    with open(expected_file_path) as f:
+        reader = csv.reader(f)
+        for row in reader:
+            entry = ExpectedFileEntry(
+                benchmark_name=row[0].strip(),
+                metric_name=row[1].strip(),
+                expected_value=int(row[2]),
+                noise_margin=float(row[3]),
+            )
+            key = (entry.benchmark_name, entry.metric_name)
+            assert key not in expected_data, f"Duplicate entry for {key}"
+            expected_data[key] = entry
+
+    # Read result data file.
+    result_data: dict[tuple[str, str], ResultFileEntry] = {}
+
+    with open(result_file_path) as f:
+        reader = csv.reader(f)
+        for row in reader:
+            entry = ResultFileEntry(
+                benchmark_name=row[0].strip(),
+                metric_name=row[1].strip(),
+                actual_value=int(row[2]),
+            )
+
+            key = (entry.benchmark_name, entry.metric_name)
+            assert key not in result_data, f"Duplicate entry for {key}"
+            result_data[key] = entry
+
+    fail = False
+    for key, entry in expected_data.items():
+        if key not in result_data:
+            print(f"Missing entry for {key} in result file")
+            sys.exit(1)
+
+        low = entry.expected_value - entry.expected_value * entry.noise_margin
+        high = entry.expected_value + entry.expected_value * entry.noise_margin
+        result = result_data[key].actual_value
+
+        def log(event_name):
+            scribe.open_source_signpost(
+                subsystem="pr_time_benchmarks",
+                name=event_name,
+                parameters=json.dumps(
+                    {
+                        "benchmark_name": entry.benchmark_name,
+                        "metric_name": entry.metric_name,
+                        "actual_value": result,
+                        "expected_value": entry.expected_value,
+                        "noise_margin": entry.noise_margin,
+                    }
+                ),
+            )
+
+        if result > high:
+            fail = True
+            ratio = float(result - entry.expected_value) * 100 / entry.expected_value
+            print(
+                f"REGRESSION: benchmark {key} failed, actual result {result} "
+                f"is {ratio:.2f}% higher than expected {entry.expected_value} ±{entry.noise_margin*100:.2f}%. "
+                f"If this is an expected regression, please update the expected results."
+            )
+
+            log("fail_regression")
+
+        if result < low:
+            fail = True
+            ratio = float(entry.expected_value - result) * 100 / entry.expected_value
+
+            print(
+                f"WIN: benchmark {key} failed, actual result {result} is {ratio:.2f}% lower than "
+                f"expected {entry.expected_value} ±{entry.noise_margin*100:.2f}%. "
+                f"Please update the expected results."
+            )
+
+            log("fail_win")
+
+    # Log all benchmarks that do not have a regression test enabled for them.
+    for key, entry in result_data.items():
+        if key not in expected_data:
+            print(
+                f"MISSING REGRESSION TEST: benchmark {key} does not have a regression test enabled for it"
+            )
+            scribe.open_source_signpost(
+                subsystem="pr_time_benchmarks",
+                name="missing_regression_test",
+                parameters=json.dumps(
+                    {
+                        "benchmark_name": entry.benchmark_name,
+                        "metric_name": entry.metric_name,
+                    }
+                ),
+            )
+    if fail:
+        sys.exit(1)
+    else:
+        print("All benchmarks passed")
+
+
+if __name__ == "__main__":
+    main()
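
The pass/fail logic above reduces to a symmetric tolerance band around each expected value. A minimal standalone sketch of that band check, reusing the example row from the script's comments (expected value 283178305, 1% noise margin); the actual value here is made up for illustration:

# Band check as in check_results.py above; actual_value is hypothetical.
expected_value = 283178305
noise_margin = 0.01  # 1%
actual_value = 290_000_000  # pretend current-run measurement

low = expected_value - expected_value * noise_margin   # 280,346,521.95
high = expected_value + expected_value * noise_margin  # 286,010,088.05

if actual_value > high:
    print("REGRESSION")  # above the band: check_results.py exits 1
elif actual_value < low:
    print("WIN")         # below the band: also a failure, to force an update
else:
    print("PASS")        # inside the band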

benchmarks/dynamo/pr_time_benchmarks/expected_results.csv

Whitespace-only changes.
Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+a, instruction count, 110, 0.01
+b, memory, 100, 0.1
+c, something, 100, 0.1
Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+a, instruction count, 90
+b, memory, 200
+c, something, 107
+d, missing-test, 10
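
Taken together, these two fixtures (the four-column expected file and the three-column result file) exercise every branch of check_results.py: a (90 against 110 ±1%) lands below its band and reports a WIN, b (200 against 100 ±10%) lands above its band and reports a REGRESSION, c (107 against 100 ±10%) passes, and d has no expected entry, so it reports a missing regression test. A sketch of driving the script on them; the fixture paths are hypothetical, since their real location is not shown above:

# Hedged sketch: run check_results.py on the fixtures above.
# Only the script path appears earlier in this diff (.ci/pytorch/test.sh);
# expected_test.csv and result_test.csv are stand-in fixture paths.
import subprocess
import sys

proc = subprocess.run(
    [
        sys.executable,
        "benchmarks/dynamo/pr_time_benchmarks/check_results.py",
        "expected_test.csv",
        "result_test.csv",
    ],
    capture_output=True,
    text=True,
)
print(proc.stdout)
# With these fixtures: WIN for a, REGRESSION for b, pass for c,
# MISSING REGRESSION TEST for d, and a nonzero exit code overall.
print("exit code:", proc.returncode)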

test/distributed/elastic/agent/server/test/api_test.py

Lines changed: 2 additions & 2 deletions
@@ -547,7 +547,7 @@ def test_assign_worker_ranks(self):
         )
 
     def test_assign_worker_ranks_indentical(self):
-        os.environ["TORCH_SKIP_STORE_BARRIER"] = "1"
+        os.environ["TORCH_ELASTIC_WORKER_IDENTICAL"] = "1"
         role_infos = [
             _RoleInstanceInfo("trainer", 0, 4),
             _RoleInstanceInfo("trainer", 1, 4),
@@ -597,7 +597,7 @@ def test_assign_worker_ranks_indentical(self):
             ],
         ],
     )
-        os.environ["TORCH_SKIP_STORE_BARRIER"] = "0"
+        os.environ["TORCH_ELASTIC_WORKER_IDENTICAL"] = "0"
 
     def test_get_event(self):
         spec = self._get_worker_spec(max_restarts=1)
