Commit ad96097

Update on "[reland][inductor] Add an AOT compilation mode for Inductor CPP backend"

Summary: This is a reland of #94822

cc soumith voznesenskym yanboliang penguinwu anijain2305 EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10

[ghstack-poisoned]

2 parents 6e45a1b + 0498e22 commit ad96097

144 files changed, +2184 -2677 lines

.ci/docker/common/install_cache.sh
Lines changed: 5 additions & 8 deletions

@@ -36,14 +36,11 @@ if [ -n "$ROCM_VERSION" ]; then
   curl --retry 3 http://repo.radeon.com/misc/.sccache_amd/sccache -o /opt/cache/bin/sccache
 else
   ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
-  case "$ID" in
-    ubuntu)
-      install_ubuntu
-      ;;
-    *)
-      install_binary
-      ;;
-  esac
+  # TODO: Install the pre-built binary from S3 as building from source
+  # https://github.com/pytorch/sccache has started failing mysteriously
+  # in which sccache server couldn't start with the following error:
+  # sccache: error: Invalid argument (os error 22)
+  install_binary
 fi
 chmod a+x /opt/cache/bin/sccache
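
For context, install_binary (defined earlier in install_cache.sh) downloads a pre-built sccache rather than compiling it. A minimal sketch of the assumed shape of that helper (the real body lives elsewhere in this script and may differ):

    function install_binary() {
      # Assumption: fetch a pre-built sccache binary from S3 instead of
      # building https://github.com/pytorch/sccache from source.
      echo "Downloading sccache binary from S3 repo"
      curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /opt/cache/bin/sccache
    }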

.ci/pytorch/common_utils.sh
Lines changed: 8 additions & 0 deletions

@@ -149,6 +149,14 @@ function clone_pytorch_xla() {
   fi
 }

+function install_matplotlib() {
+  pip_install matplotlib
+}
+
+function install_tabulate() {
+  pip_install tabulate
+}
+
 function setup_torchdeploy_deps(){
   conda install -y -n "py_${ANACONDA_PYTHON_VERSION}" "libpython-static=${ANACONDA_PYTHON_VERSION}"
   local CC
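
The two new helpers lean on the pip_install wrapper already defined in common_utils.sh; a minimal sketch of its assumed shape (the real definition may add retries or extra flags):

    function pip_install() {
      # Assumption: thin wrapper that quiets pip's progress bar in CI logs.
      pip install --progress-bar off "$@"
    }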

.ci/pytorch/test.sh
Lines changed: 81 additions & 186 deletions

@@ -255,6 +255,29 @@ test_inductor() {
   python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo --verbose
 }

+# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
+# For example 'dynamic_aot_eager_torchbench' TEST_CONFIG means we run
+# the benchmark script with '--dynamic-shapes --backend aot_eager --device cuda'
+# The matrix of test options is specified in .github/workflows/periodic.yml
+# and .github/workflows/inductor.yml
+DYNAMO_BENCHMARK_FLAGS=()
+
+if [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
+  DYNAMO_BENCHMARK_FLAGS+=(--backend aot_eager)
+elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
+  DYNAMO_BENCHMARK_FLAGS+=(--inductor)
+fi
+
+if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
+  DYNAMO_BENCHMARK_FLAGS+=(--dynamic-shapes)
+fi
+
+if [[ "${TEST_CONFIG}" == *cpu_accuracy* ]]; then
+  DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
+else
+  DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
+fi
+
 test_single_dynamo_benchmark() {
   # Usage: test_single_dynamo_benchmark inductor_inference huggingface 0 --args-for-script
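
The comment block above documents how substrings of TEST_CONFIG compose into flags; a standalone sketch of that mapping for one sample value (hypothetical driver script, not part of the commit):

    #!/bin/bash
    # Compose benchmark flags the same way test.sh does.
    TEST_CONFIG="dynamic_aot_eager_torchbench"
    FLAGS=()
    if [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
      FLAGS+=(--backend aot_eager)
    elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
      FLAGS+=(--inductor)
    fi
    if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
      FLAGS+=(--dynamic-shapes)
    fi
    if [[ "${TEST_CONFIG}" == *cpu_accuracy* ]]; then
      FLAGS+=(--device cpu)
    else
      FLAGS+=(--device cuda)
    fi
    echo "${FLAGS[@]}"  # prints: --backend aot_eager --dynamic-shapes --device cuda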

@@ -277,143 +300,66 @@ test_single_dynamo_benchmark() {
     partition_flags=( --total-partitions 2 --partition-id "$shard_id" )
   fi

-  # Feel free to remove --device cuda if you ever decide to need to
-  # test CPU as well in CI
-  python "benchmarks/dynamo/$suite.py" \
-    --ci --accuracy --timing --explain \
-    "$@" "${partition_flags[@]}" \
-    --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
-  python benchmarks/dynamo/check_csv.py \
-    -f "$TEST_REPORTS_DIR/${name}_${suite}.csv"
-}
-
-test_aot_eager_benchmark() {
-  # Usage: test_dynamo_benchmark huggingface 0
-
-  local exit_status=0
-
-  # Check inference with --float32
-  test_single_dynamo_benchmark "aot_eager_inference" "$@" --backend aot_eager --device cuda || exit_status=$?
-
-  # Check training with --amp
-  test_single_dynamo_benchmark "aot_eager_training" "$@" --backend aot_eager --device cuda --training --amp || exit_status=$?
-
-  if [[ $exit_status -ne 0 ]]; then
-    echo "Some benchmarks failed; scroll up for details"
+  if [[ "${TEST_CONFIG}" == *perf* ]]; then
+    # MKL_THREADING_LAYER=GNU to mitigate https://github.com/pytorch/pytorch/issues/37377
+    MKL_THREADING_LAYER=GNU python benchmarks/dynamo/runner.py --suites="$suite" \
+      --base-sha="$BASE_SHA" --output-dir="$TEST_REPORTS_DIR" "${partition_flags[@]}" \
+      --no-graphs --no-update-archive --no-gh-comment
+  else
+    python "benchmarks/dynamo/$suite.py" \
+      --ci --accuracy --timing --explain \
+      "${DYNAMO_BENCHMARK_FLAGS[@]}" \
+      "$@" "${partition_flags[@]}" \
+      --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
+    python benchmarks/dynamo/check_csv.py \
+      -f "$TEST_REPORTS_DIR/${name}_${suite}.csv"
   fi
-  return $exit_status
 }

-test_inductor_benchmark() {
+test_dynamo_benchmark() {
   # Usage: test_dynamo_benchmark huggingface 0

-  local device="$1"
+  local suite="$1"
+  shift
+  local shard_id="$1"
   shift

-  if [[ $device == "cpu" ]]; then
-    # TODO: Add training and dynamic shape test
-    test_single_dynamo_benchmark "inductor_inference" "$@" --inductor --float32 --device cpu
+  if [[ "${TEST_CONFIG}" == *perf* ]]; then
+    # Performance test training only, for float32 and amp
+    test_single_dynamo_benchmark "amp" "$suite" "$shard_id" --training --dtypes=amp "$@"
+    test_single_dynamo_benchmark "float32" "$suite" "$shard_id" --training --dtypes=float32 "$@"
   else
     # Check inference with --float32
-    test_single_dynamo_benchmark "inductor_inference" "$@" --inductor --device cuda
-
-    # Check training with --amp
-    test_single_dynamo_benchmark "inductor_training" "$@" --inductor --training --amp --device cuda
+    test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --float32 "$@"

-    # Check inference with --dynamic-shapes
-    test_single_dynamo_benchmark "dynamic_inductor-inference" "$@" --inductor --dynamic-shapes --device cuda
+    if [[ "${TEST_CONFIG}" != *cpu_accuracy* && "${TEST_CONFIG}" != *dynamic* ]]; then
+      # Check training with --amp
+      test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
+    fi
   fi
 }
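
With this refactor a single entry point covers both accuracy and perf runs; for example, an accuracy-mode TEST_CONFIG such as inductor_timm reduces to the following (illustrative trace of the calls made, assuming shard 1):

    test_dynamo_benchmark timm_models "0"
    # which in turn runs:
    #   test_single_dynamo_benchmark "inference" timm_models 0 --float32
    #   test_single_dynamo_benchmark "training"  timm_models 0 --training --amp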

-test_inductor_benchmark_perf() {
-  # Use test-reports directory under test folder will allow the CI to automatically pick up
-  # the test reports and upload them to S3. Need to use full path here otherwise the script
-  # will bark about file not found later on
+test_inductor_torchbench_smoketest_perf() {
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  PARTITION_FLAGS=""
-  if [[ -n "$NUM_TEST_SHARDS" && -n "$2" ]]; then
-    PARTITION_FLAGS="--total-partitions 2 --partition-id $2"
-  fi
   mkdir -p "$TEST_REPORTS_DIR"
-  # Check training with --amp
-  # Not checking accuracy for perf test for now
-  # shellcheck disable=SC2086
-  if [[ "$1" == *smoketest* ]]; then
-    python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
-      --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
-      --output "$TEST_REPORTS_DIR"/inductor_training_$1.csv
-    # the reference speedup value is hardcoded in check_hf_bert_perf_csv.py
-    # this value needs to be actively maintained to make this check useful
-    python benchmarks/dynamo/check_hf_bert_perf_csv.py -f "$TEST_REPORTS_DIR"/inductor_training_$1.csv
-
-    # Check memory compression ratio for a few models
-    for test in hf_Albert timm_efficientdet timm_vision_transformer; do
-      python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
-        --disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
-        --only $test --output "$TEST_REPORTS_DIR"/inductor_training_$1_$test.csv
-      cat "$TEST_REPORTS_DIR"/inductor_training_$1_$test.csv
-      python benchmarks/dynamo/check_memory_compression_ratio.py --actual \
-        "$TEST_REPORTS_DIR"/inductor_training_$1_$test.csv \
-        --expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
-    done
-  else
-    python benchmarks/dynamo/$1.py --ci --training --performance --disable-cudagraphs\
-      --device cuda --inductor --amp $PARTITION_FLAGS --output "$TEST_REPORTS_DIR"/inductor_training_$1.csv
-  fi
-}
-
-# No sharding for the periodic job, we don't care if latency is bad
-test_aot_eager_all() {
-  local exit_status=0
-  PYTHONPATH=$(pwd)/torchbench test_aot_eager_benchmark torchbench "" "$@" || exit_status=$?
-  test_aot_eager_benchmark huggingface "" "$@" || exit_status=$?
-  test_aot_eager_benchmark timm_models "" "$@" || exit_status=$?
-  if [[ $exit_status -ne 0 ]]; then
-    echo "Some benchmarks failed; scroll up for details"
-  fi
-  return $exit_status
-}
-
-test_inductor_huggingface() {
-  local device=$1
-  shift
-  test_inductor_benchmark "$device" huggingface ""
-}
-
-test_inductor_huggingface_perf() {
-  test_inductor_benchmark_perf huggingface
-}
-
-test_inductor_timm_shard() {
-  if [[ -z "$NUM_TEST_SHARDS" ]]; then
-    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
-    exit 1
-  fi
-  local device=$1
-  shift
-  test_inductor_benchmark "$device" timm_models "$1"
-}
-
-test_inductor_timm_perf_shard() {
-  if [[ -z "$NUM_TEST_SHARDS" ]]; then
-    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
-    exit 1
-  fi
-  test_inductor_benchmark_perf timm_models "$1"
-}
-
-test_inductor_torchbench() {
-  local device=$1
-  shift
-  PYTHONPATH=$(pwd)/torchbench test_inductor_benchmark "$device" torchbench ""
-}
-
-test_inductor_torchbench_perf() {
-  PYTHONPATH=$(pwd)/torchbench test_inductor_benchmark_perf torchbench
-}
-
-test_inductor_torchbench_smoketest_perf(){
-  PYTHONPATH=$(pwd)/torchbench test_inductor_benchmark_perf smoketest
+  python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
+    --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
+    --output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
+  # the reference speedup value is hardcoded in check_hf_bert_perf_csv.py
+  # this value needs to be actively maintained to make this check useful
+  python benchmarks/dynamo/check_hf_bert_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
+
+  # Check memory compression ratio for a few models
+  for test in hf_Albert timm_efficientdet timm_vision_transformer; do
+    python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
+      --disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
+      --only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
+    cat "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
+    python benchmarks/dynamo/check_memory_compression_ratio.py --actual \
+      "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \
+      --expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
+  done
 }

 test_python_gloo_with_tls() {
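
The hf_Bert speedup check in the smoketest above can be reproduced outside CI roughly as follows (illustrative; assumes a torchbench checkout under the repo root and writes to an arbitrary /tmp path):

    PYTHONPATH=$(pwd)/torchbench python benchmarks/dynamo/torchbench.py \
      --device cuda --performance --backend inductor --float16 --training \
      --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
      --only hf_Bert --output /tmp/inductor_training_smoketest.csv
    python benchmarks/dynamo/check_hf_bert_perf_csv.py -f /tmp/inductor_training_smoketest.csv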
@@ -842,6 +788,12 @@ test_executorch() {
   assert_git_not_dirty
 }

+# TODO: Include this in the Docker image
+if [[ "${TEST_CONFIG}" == *_perf* ]]; then
+  install_matplotlib
+  install_tabulate
+fi
+
 if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* || "${BUILD_ENVIRONMENT}" == *-tsan* ]]; then
   (cd test && python -c "import torch; print(torch.__config__.show())")
   (cd test && python -c "import torch; print(torch.__config__.parallel_info())")

@@ -878,81 +830,24 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHAR
 elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
   install_torchvision
   test_dynamo_shard 2
-elif [[ "${TEST_CONFIG}" == *aot_eager_all* ]]; then
-  install_torchtext
-  install_torchvision
-  checkout_install_torchbench
-  install_huggingface
-  install_timm
-  if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
-    # NB: This code path is currently dead because dynamic shapes takes
-    # too long to run unsharded
-    test_aot_eager_all --dynamic-shapes
-  else
-    test_aot_eager_all
-  fi
-elif [[ "${TEST_CONFIG}" == *aot_eager_huggingface* ]]; then
-  install_torchvision
-  install_huggingface
-  if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
-    test_aot_eager_benchmark huggingface "" --dynamic-shapes
-  else
-    test_aot_eager_benchmark huggingface ""
-  fi
-elif [[ "${TEST_CONFIG}" == *aot_eager_timm* && $NUM_TEST_SHARDS -gt 1 ]]; then
-  install_torchvision
-  install_timm
-  id=$((SHARD_NUMBER-1))
-  if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
-    test_aot_eager_benchmark timm_models "$id" --dynamic-shapes
-  else
-    test_aot_eager_benchmark timm_models "$id"
-  fi
-elif [[ "${TEST_CONFIG}" == *aot_eager_torchbench* ]]; then
-  install_torchtext
-  install_torchvision
-  checkout_install_torchbench
-  if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
-    PYTHONPATH=$(pwd)/torchbench test_aot_eager_benchmark torchbench "" --dynamic-shapes
-  else
-    PYTHONPATH=$(pwd)/torchbench test_aot_eager_benchmark torchbench ""
-  fi
-elif [[ "${TEST_CONFIG}" == *inductor_huggingface* ]]; then
+elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
   install_torchvision
   install_huggingface
-  if [[ "${TEST_CONFIG}" == *inductor_huggingface_perf* ]]; then
-    test_inductor_huggingface_perf
-  elif [[ "${TEST_CONFIG}" == *inductor_huggingface_cpu_accuracy* ]]; then
-    test_inductor_huggingface cpu
-  else
-    test_inductor_huggingface cuda
-  fi
-elif [[ "${TEST_CONFIG}" == *inductor_timm* && $NUM_TEST_SHARDS -gt 1 ]]; then
+  test_dynamo_benchmark huggingface ""
+elif [[ "${TEST_CONFIG}" == *timm* ]]; then
   install_torchvision
   install_timm
   id=$((SHARD_NUMBER-1))
-  if [[ "${TEST_CONFIG}" == *inductor_timm_perf* && $NUM_TEST_SHARDS -gt 1 ]]; then
-    test_inductor_timm_perf_shard $id
-  elif [[ "${TEST_CONFIG}" == *inductor_timm_cpu_accuracy* && $NUM_TEST_SHARDS -gt 1 ]]; then
-    test_inductor_timm_shard cpu $id
-  else
-    test_inductor_timm_shard cuda $id
-  fi
-elif [[ "${TEST_CONFIG}" == *inductor_torchbench* ]]; then
+  test_dynamo_benchmark timm_models "$id"
+elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
   install_torchtext
   install_torchvision
-  if [[ "${TEST_CONFIG}" == *inductor_torchbench_perf* ]]; then
-    checkout_install_torchbench
-    test_inductor_torchbench_perf
-  elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_accuracy* ]]; then
-    checkout_install_torchbench
-    test_inductor_torchbench cpu
-  elif [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
+  if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
     checkout_install_torchbench hf_Bert hf_Albert timm_efficientdet timm_vision_transformer
-    test_inductor_torchbench_smoketest_perf
+    PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
   else
     checkout_install_torchbench
-    test_inductor_torchbench cuda
+    PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench ""
   fi
 elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
   install_torchvision
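
Net effect of this dispatch cleanup: the old per-backend branches (aot_eager_*, inductor_*) collapse into suite-level branches, with backend and device decided by the DYNAMO_BENCHMARK_FLAGS parsing at the top of the script. For example, TEST_CONFIG=inductor_huggingface now takes the generic *huggingface* branch (illustrative trace):

    install_torchvision
    install_huggingface
    test_dynamo_benchmark huggingface ""
    # with DYNAMO_BENCHMARK_FLAGS=(--inductor --device cuda) from the global parsing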

.github/ci_commit_pins/xla.txt
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-f9963f6c2d34b9662f93e5518adb15949be05f65
+015ebcba441dbd5dd21dc02ef12af2c29791a7f0
