
Commit 1adeed2

bertmaher authored and facebook-github-bot committed
Speed up CUDA kernel launch when block/thread extents are statically known (#42899)
Summary: Pull Request resolved: #42899

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D23078708

Pulled By: bertmaher

fbshipit-source-id: 237404b47a31672d7145d70996868a3b9b97924e
1 parent f373cda commit 1adeed2

1 file changed: +9 -0 lines changed


torch/csrc/jit/tensorexpr/cuda_codegen.cpp

Lines changed: 9 additions & 0 deletions
@@ -868,15 +868,24 @@ void CudaCodeGen::call(const std::vector<CallArg>& args) {
 
   std::vector<int> gpu_block_extents_v(3, 1);
   std::vector<int> gpu_thread_extents_v(3, 1);
+
   // evaluate all the block/thread extents into values
   // TODO: eventually, codegen these calculations and make them part of the
   // module.
   for (size_t i = 0; i < gpu_block_extents.size(); i++) {
+    if (gpu_block_extents[i]->isConstant()) {
+      gpu_block_extents_v[i] = immediateAs<int>(gpu_block_extents[i]);
+      continue;
+    }
     ExprEval<SimpleIREvaluator> eval(
         ExprHandle(gpu_block_extents[i]), buffer_args());
     gpu_block_extents_v[i] = eval.value<int>(args);
   }
   for (size_t i = 0; i < gpu_thread_extents.size(); i++) {
+    if (gpu_thread_extents[i]->isConstant()) {
+      gpu_thread_extents_v[i] = immediateAs<int>(gpu_thread_extents[i]);
+      continue;
+    }
     ExprEval<SimpleIREvaluator> eval(
         ExprHandle(gpu_thread_extents[i]), buffer_args());
     gpu_thread_extents_v[i] = eval.value<int>(args);
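
The new fast path skips constructing and running SimpleIREvaluator at launch time when an extent is already an immediate constant, which is the common case for statically shaped kernels. As a rough illustration of the pattern only, here is a minimal standalone C++ sketch; the ExtentExpr struct and resolveExtents function below are hypothetical stand-ins, not the actual TensorExpr classes.

// Minimal standalone sketch of the constant fast path (hypothetical types).
// If an extent is statically known, read its immediate value directly;
// otherwise fall back to evaluating the expression at launch time.
#include <functional>
#include <iostream>
#include <vector>

// Stand-in for an IR expression describing one launch extent.
struct ExtentExpr {
  bool is_constant;               // true if the extent is statically known
  int immediate;                  // constant value, valid when is_constant
  std::function<int()> evaluate;  // models interpreter-style evaluation
};

std::vector<int> resolveExtents(const std::vector<ExtentExpr>& extents) {
  std::vector<int> values(3, 1);  // extents default to 1, as in the diff above
  for (size_t i = 0; i < extents.size(); i++) {
    if (extents[i].is_constant) {
      values[i] = extents[i].immediate;  // fast path: no interpreter call
      continue;
    }
    values[i] = extents[i].evaluate();   // slow path: evaluate the expression
  }
  return values;
}

int main() {
  std::vector<ExtentExpr> block_extents = {
      {true, 128, nullptr},               // statically known extent
      {false, 0, [] { return 4 * 16; }},  // extent that must be evaluated
      {true, 1, nullptr},
  };
  for (int v : resolveExtents(block_extents)) std::cout << v << ' ';
  std::cout << '\n';  // prints: 128 64 1
}

In the real codegen this check runs once per launch over at most three block and three thread extents, so the saving comes from avoiding the ExprEval setup and evaluation for the fully static case.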
