Skip to content

Commit ce71f97

Browse files
Mikhail Zolotukhin authored and bertmaher committed
[TensorExpr] Fuser: try merging adjacent fusion groups.
ghstack-source-id: 6673ea6 Pull Request resolved: #43671
1 parent aedce77 commit ce71f97

File tree

4 files changed

+60
-3
lines changed

4 files changed

+60
-3
lines changed

test/cpp/tensorexpr/test_te_fuser_pass.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,5 +287,29 @@ void testFuserPass_Multidevice() {
287287
testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
288288
}
289289
}
290+
291+
void testFuserPass_MergeGroups() {
292+
WithCPUFuser cf;
293+
KernelScope kernel_scope;
294+
const auto graph_string = R"IR(
295+
graph(%a : Float(128:1, device=cpu),
296+
%b : Float(128:1, device=cpu)):
297+
%x : Float(128:1, device=cpu) = aten::mul(%a, %a)
298+
%y : Float(128:1, device=cpu) = aten::mul(%b, %b)
299+
return (%x, %y))IR";
300+
auto g = std::make_shared<Graph>();
301+
torch::jit::parseIR(graph_string, g.get());
302+
303+
g->lint();
304+
FuseTensorExprs(g, /* min_group_size= */ 1);
305+
306+
// The %x and %y computations are completely independent and yet we should put
307+
// them into a single fusion group rather than having two separate ones.
308+
testing::FileCheck()
309+
.check("= prim::TensorExprGroup_")
310+
->check_not("= prim::TensorExprGroup_")
311+
->run(*g);
312+
}
313+
290314
} // namespace jit
291315
} // namespace torch

test/cpp/tensorexpr/tests.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,7 @@ namespace jit {
264264
_(FuserPass_UnfusibleDevice) \
265265
_(FuserPass_UnknownShapes) \
266266
_(FuserPass_Multidevice) \
267+
_(FuserPass_MergeGroups) \
267268
_(TrainBasic)
268269

269270
#define TH_FORALL_TENSOREXPR_TESTS_LLVM(_) \

test/test_jit_fuser_te.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -774,6 +774,8 @@ def fn(x, y):
774774
ge(*inputs_cuda0)
775775
ge(*inputs_cuda1)
776776

777+
# TODO: we're currently not checking 'device' in the type info when pulling
778+
# nodes into a fusion group. We should fix that and re-enable this test.
777779
@unittest.skipIf(not RUN_CUDA, "fuser requires CUDA")
778780
@unittest.skipIf(not RUN_CUDA_MULTI_GPU, "needs non-zero device")
779781
def test_kernel_cache_multi_gpu(self):

torch/csrc/jit/passes/tensorexpr_fuser.cpp

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,7 @@ class TensorExprFuser {
428428
void createFusionGroups(Block* block) {
429429
std::vector<Node*> fusion_groups;
430430
auto reverse_iter = block->nodes().reverse();
431+
Node* prev_fusion_group = nullptr;
431432
for (auto it = reverse_iter.begin(); it != reverse_iter.end();) {
432433
Node* n = *it;
433434
GRAPH_DEBUG("Considering node:", *n)
@@ -450,11 +451,40 @@ class TensorExprFuser {
450451
}
451452

452453
Node* fusion_group = createFusionGroup(n);
453-
fusion_groups.push_back(fusion_group);
454-
it = fusion_group->reverseIterator();
454+
debugDumpFusionGroup("Fusion group constructed: ", fusion_group);
455+
456+
// Try merging the just created fusion group into the previous one.
457+
// If it did not work, then put the previous fusion group into
458+
// fusion_groups vector - we will not touch it anymore in this loop.
459+
// If merging succeeded, save the merged group as the "previous" fusion
460+
// group so that we can try to merge the next one into it.
461+
if (prev_fusion_group) {
462+
debugDumpFusionGroup(
463+
"Trying to merge into the previous fusion group: ",
464+
prev_fusion_group);
465+
if (canMerge(prev_fusion_group, fusion_group)) {
466+
prev_fusion_group = tryMerge(prev_fusion_group, fusion_group);
467+
debugDumpFusionGroup(
468+
"Successfully merged into the previous fusion group: ",
469+
prev_fusion_group);
470+
} else {
471+
GRAPH_DEBUG("Cannot merge into the previous fusion group");
472+
fusion_groups.push_back(prev_fusion_group);
473+
prev_fusion_group = fusion_group;
474+
}
475+
} else {
476+
prev_fusion_group = fusion_group;
477+
}
478+
it = prev_fusion_group->reverseIterator();
455479
it++;
456480
}
457481

482+
// We were adding groups into the vector lagging by one - catch up with
483+
// adding the last one
484+
if (prev_fusion_group) {
485+
fusion_groups.push_back(prev_fusion_group);
486+
}
487+
458488
for (Node* n : fusion_groups) {
459489
inlineIfTooSmall(n);
460490
}
@@ -617,7 +647,7 @@ class TensorExprFuser {
617647
REQ(consumer->owningBlock() == producer->owningBlock());
618648

619649
// Symbolic checks
620-
REQ(canHandle(producer));
650+
REQ(canHandle(producer) || producer->kind() == prim::TensorExprGroup);
621651
TORCH_INTERNAL_ASSERT(
622652
consumer->kind() == prim::TensorExprGroup || canHandle(consumer));
623653

0 commit comments

Comments
 (0)