Skip to content

Commit b9b4f05

Browse files
csarofeen authored and facebook-github-bot committed
[nvFuser] Working towards reductions, codegen improvements (#40864)
Summary: Basic reduction fusion is now working, and the code generator has been improved to approach the performance of eager-mode reductions. Pointwise-reduction fusions are coming soon, implemented in a way that should prevent the possibility of hitting regressions. Work is also under way on performant softmax kernels in the code generator, which may be our next fusion target.
Pull Request resolved: #40864
Reviewed By: ngimel
Differential Revision: D22392877
Pulled By: soumith
fbshipit-source-id: 457448a807d628b1035f6d90bc0abe8a87bf8447
1 parent e026d91 commit b9b4f05

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

75 files changed

+6684
-2977
lines changed

.jenkins/pytorch/macos-test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ test_python_all() {
6363
# Increase default limit on open file handles from 256 to 1024
6464
ulimit -n 1024
6565

66-
python test/run_test.py --verbose --exclude test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --determine-from="$DETERMINE_FROM"
66+
python test/run_test.py --verbose --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --determine-from="$DETERMINE_FROM"
6767

6868
assert_git_not_dirty
6969
}

.jenkins/pytorch/test.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,17 +150,17 @@ test_python_nn() {
150150
}
151151

152152
test_python_ge_config_profiling() {
153-
time python test/run_test.py --include test_jit_profiling test_jit_fuser_te --verbose --determine-from="$DETERMINE_FROM"
153+
time python test/run_test.py --include test_jit_cuda_fuser_profiling test_jit_profiling test_jit_fuser_te --verbose --determine-from="$DETERMINE_FROM"
154154
assert_git_not_dirty
155155
}
156156

157157
test_python_ge_config_legacy() {
158-
time python test/run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="$DETERMINE_FROM"
158+
time python test/run_test.py --include test_jit_cuda_fuser_legacy test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="$DETERMINE_FROM"
159159
assert_git_not_dirty
160160
}
161161

162162
test_python_all_except_nn_and_cpp_extensions() {
163-
time python test/run_test.py --exclude test_nn test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --verbose --determine-from="$DETERMINE_FROM"
163+
time python test/run_test.py --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_nn test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --verbose --determine-from="$DETERMINE_FROM"
164164
assert_git_not_dirty
165165
}
166166

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat
2-
cd test && python run_test.py --exclude test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --verbose --determine-from="%1" && cd ..
2+
cd test && python run_test.py --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --verbose --determine-from="%1" && cd ..
33
if ERRORLEVEL 1 exit /b 1

caffe2/CMakeLists.txt

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -448,12 +448,14 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
448448
${TORCH_SRC_DIR}/csrc/autograd/functions/comm.cpp
449449
${TORCH_SRC_DIR}/csrc/cuda/comm.cpp
450450
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/arith.cpp
451+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/compute_at.cpp
451452
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/dispatch.cpp
452453
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/expr_evaluator.cpp
453454
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/fusion.cpp
454455
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/graph_fuser.cpp
455456
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/index_compute.cpp
456457
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_base_nodes.cpp
458+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_cloner.cpp
457459
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_graphviz.cpp
458460
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_nodes.cpp
459461
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_iostream.cpp
@@ -463,13 +465,16 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
463465
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/manager.cpp
464466
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/shape_inference.cpp
465467
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/mutator.cpp
468+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_index.cpp
466469
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_loops.cpp
470+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_thread_predicate.cpp
471+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_unroll.cpp
467472
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_utils.cpp
473+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_validation.cpp
468474
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower2device.cpp
469475
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/parser.cpp
470476
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/partition.cpp
471477
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/predicate_compute.cpp
472-
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/tensor_meta.cpp
473478
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/tensor_view.cpp
474479
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/transform_iter.cpp
475480
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/transform_replay.cpp

0 commit comments

Comments
 (0)