
Commit 1adeed2

bertmaher authored and facebook-github-bot committed
Speed up CUDA kernel launch when block/thread extents are statically known (#42899)
Summary: Pull Request resolved: #42899

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D23078708

Pulled By: bertmaher

fbshipit-source-id: 237404b47a31672d7145d70996868a3b9b97924e
1 parent f373cda commit 1adeed2

1 file changed: +9 -0 lines changed


torch/csrc/jit/tensorexpr/cuda_codegen.cpp

Lines changed: 9 additions & 0 deletions
@@ -868,15 +868,24 @@ void CudaCodeGen::call(const std::vector<CallArg>& args) {
 
   std::vector<int> gpu_block_extents_v(3, 1);
   std::vector<int> gpu_thread_extents_v(3, 1);
+
   // evaluate all the block/thread extents into values
   // TODO: eventually, codegen these calculations and make them part of the
   // module.
   for (size_t i = 0; i < gpu_block_extents.size(); i++) {
+    if (gpu_block_extents[i]->isConstant()) {
+      gpu_block_extents_v[i] = immediateAs<int>(gpu_block_extents[i]);
+      continue;
+    }
     ExprEval<SimpleIREvaluator> eval(
         ExprHandle(gpu_block_extents[i]), buffer_args());
     gpu_block_extents_v[i] = eval.value<int>(args);
   }
   for (size_t i = 0; i < gpu_thread_extents.size(); i++) {
+    if (gpu_thread_extents[i]->isConstant()) {
+      gpu_thread_extents_v[i] = immediateAs<int>(gpu_thread_extents[i]);
+      continue;
+    }
     ExprEval<SimpleIREvaluator> eval(
         ExprHandle(gpu_thread_extents[i]), buffer_args());
     gpu_thread_extents_v[i] = eval.value<int>(args);
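
The new fast path skips constructing and running SimpleIREvaluator at launch time when an extent is already an immediate constant, which is the common case for statically shaped kernels. As a rough illustration of the pattern only, here is a minimal standalone C++ sketch; the ExtentExpr struct and resolveExtents function below are hypothetical stand-ins, not the actual TensorExpr classes.

// Minimal standalone sketch of the constant fast path (hypothetical types).
// If an extent is statically known, read its immediate value directly;
// otherwise fall back to evaluating the expression at launch time.
#include <functional>
#include <iostream>
#include <vector>

// Stand-in for an IR expression describing one launch extent.
struct ExtentExpr {
  bool is_constant;               // true if the extent is statically known
  int immediate;                  // constant value, valid when is_constant
  std::function<int()> evaluate;  // models interpreter-style evaluation
};

std::vector<int> resolveExtents(const std::vector<ExtentExpr>& extents) {
  std::vector<int> values(3, 1);  // extents default to 1, as in the diff above
  for (size_t i = 0; i < extents.size(); i++) {
    if (extents[i].is_constant) {
      values[i] = extents[i].immediate;  // fast path: no interpreter call
      continue;
    }
    values[i] = extents[i].evaluate();   // slow path: evaluate the expression
  }
  return values;
}

int main() {
  std::vector<ExtentExpr> block_extents = {
      {true, 128, nullptr},               // statically known extent
      {false, 0, [] { return 4 * 16; }},  // extent that must be evaluated
      {true, 1, nullptr},
  };
  for (int v : resolveExtents(block_extents)) std::cout << v << ' ';
  std::cout << '\n';  // prints: 128 64 1
}

In the real codegen this check runs once per launch over at most three block and three thread extents, so the saving comes from avoiding the ExprEval setup and evaluation for the fully static case.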
