Commit aa91a65

nickgg authored and facebook-github-bot committed
[TensorExpr] Fix propagation of loop options when splitting loops (#40035)
Summary: Fix a bug in SplitWithTail and SplitWithMask where loop_options such as Cuda block/thread bindings are overwritten by the split. This PR fixes the bug by propagating the loop options to the outer loop, which for axis bindings should be equivalent.

Pull Request resolved: #40035
Reviewed By: ZolotukhinM
Differential Revision: D22080263
Pulled By: nickgg
fbshipit-source-id: b8a9583fd90f69319fc4bb4db644e91f6ffa8e67
1 parent 9c7ca89 commit aa91a65
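
To make the bug concrete, here is a minimal sketch of the failure mode, written against the tensorexpr API exercised in the diffs below. The setup mirrors the new tests; it is an illustration, not code from the commit:

// Minimal sketch (mirrors the new tests below; not code from this commit).
KernelScope kernel_scope;
Buffer a_buf("a", kFloat, {21});
Tensor* tensor = Compute("f", {{21, "m"}}, [&](const ExprHandle& m) {
  return a_buf(m) + 1.0f;
});
LoopNest l({tensor});
auto loops = NodeFinder<For>::find(l.root_stmt());

// Bind the loop axis to a GPU block index, then split the loop.
l.setGPUBlockIndex(loops[0], LoopOptions::IDX_Y);
For *outer, *inner, *tail;
l.splitWithTail(loops[0], 4, &outer, &inner, &tail);

// Before this commit, the split rebuilt both loops with default
// loop_options, silently dropping the IDX_Y binding. After the fix,
// the outer loop inherits it:
ASSERT_TRUE(outer->loop_options().is_gpu_block_index());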

File tree:
  test/cpp/tensorexpr/test_loopnest.cpp
  test/cpp/tensorexpr/tests.h
  torch/csrc/jit/tensorexpr/loopnest.cpp

3 files changed: +59 -2

test/cpp/tensorexpr/test_loopnest.cpp
Lines changed: 53 additions & 0 deletions

@@ -265,6 +265,59 @@ void testExprSplitWithMask01() {
   ExpectAllNear(c_v, c_ref, 1e-5);
 }
 
+void testSplitWithTailWithLoopOptions() {
+  KernelScope kernel_scope;
+  const int M = 21;
+  Buffer a_buf("a", kFloat, {M});
+  Buffer b_buf("b", kFloat, {M});
+  Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
+    return a_buf(m) + b_buf(m) + 1.0f;
+  });
+  For *outer, *inner, *tail;
+
+  LoopNest l({tensor});
+  auto loops = NodeFinder<For>::find(l.root_stmt());
+  ASSERT_GT(loops.size(), 0);
+  l.setGPUBlockIndex(loops[0], LoopOptions::IDX_Y);
+  l.splitWithTail(loops[0], 4, &outer, &inner, &tail);
+  ASSERT_NE(outer, nullptr);
+  ASSERT_NE(inner, nullptr);
+  ASSERT_NE(tail, nullptr);
+
+  // Outer loop carries loop axis bindings.
+  ASSERT_TRUE(outer->loop_options().is_gpu_block_index());
+  ASSERT_EQ(outer->loop_options().gpu_block_index(), LoopOptions::IDX_Y);
+
+  // Inner loop has none.
+  ASSERT_TRUE(inner->loop_options().isDefault());
+
+  // Tail loop has none.
+  ASSERT_TRUE(tail->loop_options().isDefault());
+}
+
+void testSplitWithMaskWithLoopOptions() {
+  KernelScope kernel_scope;
+  const int M = 21;
+  Buffer a_buf("a", kFloat, {M});
+  Buffer b_buf("b", kFloat, {M});
+  Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
+    return a_buf(m) + b_buf(m) + 1.0f;
+  });
+  For *outer, *inner;
+
+  LoopNest l({tensor});
+  auto loops = NodeFinder<For>::find(l.root_stmt());
+  l.setGPUBlockIndex(loops[0], LoopOptions::IDX_Y);
+  l.splitWithMask(loops[0], 4, &outer, &inner);
+
+  // Outer loop carries loop axis bindings.
+  ASSERT_TRUE(outer->loop_options().is_gpu_block_index());
+  ASSERT_EQ(outer->loop_options().gpu_block_index(), LoopOptions::IDX_Y);
+
+  // Inner loop has none.
+  ASSERT_TRUE(inner->loop_options().isDefault());
+}
+
 void testScheduleBroadcastAddBuffer() {
   KernelScope kernel_scope;
   const int M = 4;

test/cpp/tensorexpr/tests.h
Lines changed: 2 additions & 0 deletions

@@ -41,6 +41,8 @@ namespace jit {
   _(ExprSplitWithTail) \
   _(ExprSplitWithTailNone) \
   _(ExprSplitWithMask01) \
+  _(SplitWithTailWithLoopOptions) \
+  _(SplitWithMaskWithLoopOptions) \
   _(ScheduleBroadcastAddBuffer) \
   _(ScheduleFunctionCall01) \
   _(ScheduleInlineFunc01) \
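
For context, each _(Name) entry in tests.h is consumed by an X-macro list that expands into per-test declarations and registrations, which is why adding a test needs only the single _(Name) \ line above. A generic sketch of the pattern (the macro names here are illustrative, not copied from the file):

// Illustrative X-macro pattern; the actual macro names in tests.h may differ.
#define FORALL_TESTS(_)           \
  _(SplitWithTailWithLoopOptions) \
  _(SplitWithMaskWithLoopOptions)

// Each entry expands into a declaration of the matching test function,
// e.g. void testSplitWithTailWithLoopOptions();
#define DECLARE_TEST(name) void test##name();
FORALL_TESTS(DECLARE_TEST)
#undef DECLARE_TEST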

torch/csrc/jit/tensorexpr/loopnest.cpp
Lines changed: 4 additions & 2 deletions

@@ -946,7 +946,8 @@ void LoopNest::splitWithTail(
       Substitute(Stmt::clone(f->body()), {{f->var(), combined_index1}});
 
   *inner = new For(i_inner, new IntImm(0), factor_expr, body_inner);
-  *outer = new For(i_outer, new IntImm(0), split_count, *inner);
+  *outer =
+      new For(i_outer, new IntImm(0), split_count, *inner, f->loop_options());
 
   // TODO: cleanup API for adding/removing statements
   p->replace_stmt(f, *outer);

@@ -1020,7 +1021,8 @@ void LoopNest::splitWithMask(For* f, int factor, For** outer, For** inner) {
   body_inner = Substitute(body_inner, {{f->var(), combined_index}});
 
   *inner = new For(i_inner, new IntImm(0), factor_expr, body_inner);
-  *outer = new For(i_outer, new IntImm(0), split_count, *inner);
+  *outer =
+      new For(i_outer, new IntImm(0), split_count, *inner, f->loop_options());
 
   // TODO: cleanup API for adding/removing statements
   p->replace_stmt(f, *outer);
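
Propagating the options to the outer loop only is sound for axis bindings: the outer loop still walks the whole original iteration space in chunks of factor, while the tail (splitWithTail) or the masked remainder (splitWithMask) keeps default options. A standalone sketch of that index arithmetic, assuming M = 21 and factor = 4 as in the new tests (illustration only, not part of the commit):

#include <cassert>

// Why moving the axis binding to the outer loop preserves coverage:
// after the split, the original index is recovered as
// m = m_outer * factor + m_inner, so m_outer (which carries the GPU
// binding) together with m_inner still visits every m exactly once.
int main() {
  const int M = 21;                    // original loop extent
  const int factor = 4;                // split factor
  const int split_count = M / factor;  // 5 full outer iterations
  const int tail_size = M % factor;    // 1 leftover iteration

  bool visited[M] = {};
  for (int m_outer = 0; m_outer < split_count; m_outer++)  // binding lives here
    for (int m_inner = 0; m_inner < factor; m_inner++)
      visited[m_outer * factor + m_inner] = true;
  for (int m_tail = 0; m_tail < tail_size; m_tail++)  // tail: default options
    visited[split_count * factor + m_tail] = true;

  for (int m = 0; m < M; m++)
    assert(visited[m]);  // every original index covered
  return 0;
}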
