
Commit 58700c5

respect aten planned overlap in inductor
ghstack-source-id: fa58952
Pull Request resolved: #164569

1 parent: 0d92d52

File tree: 4 files changed, +143 -1 lines changed


test/distributed/test_aten_comm_compute_reordering.py

Lines changed: 90 additions & 1 deletion
@@ -12,7 +12,7 @@
 import torch.distributed._functional_collectives as _functional_collectives
 from torch._C import FileCheck
 from torch._dynamo.utils import counters, same
-from torch._inductor.utils import run_and_get_triton_code
+from torch._inductor.utils import run_and_get_code, run_and_get_triton_code
 from torch.testing._internal.common_distributed import (
     _dynamo_dist_per_rank_init,
     at_least_x_gpu,

@@ -67,6 +67,8 @@ def get_patches():
         "reorder_for_compute_comm_overlap_passes": [],
         "compile_threads": 1,
         "force_disable_caches": True,
+        # Messes up existing test strings
+        "test_configs.aten_fx_overlap_insert_overlap_deps": False,
     }


@@ -358,6 +360,8 @@ def get_bucket_patches(compute_multiplier=1.0):
         "reorder_for_compute_comm_overlap_passes": [],
         "compile_threads": 1,
         "force_disable_caches": True,
+        # messes up test strings
+        "test_configs.aten_fx_overlap_insert_overlap_deps": False,
     }


@@ -750,6 +754,91 @@ def func(a, b, c, *, ranks):
             correct = func(a, b, c, ranks=ranks)
             self.assertTrue(same(out, correct))

+    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @torch._inductor.config.patch(get_bucket_patches(2.0))
+    def test_bucketing_split_for_overlap_blocking_deps_inductor(self):
+        """Test that 4 independent all-gathers split into 2+2 buckets for better overlap with compute."""
+
+        # check that ordering is preserved in inductor
+
+        def func(a, b, c, d, *, ranks):
+            # All 4 all-gathers are independent - COULD be bucketed together
+            ag1 = _functional_collectives.all_gather_tensor(a, 0, ranks)
+            ag2 = _functional_collectives.all_gather_tensor(b, 0, ranks)
+            ag3 = _functional_collectives.all_gather_tensor(c[:4], 0, ranks)
+            ag4 = _functional_collectives.all_gather_tensor(d[:4], 0, ranks)
+
+            # First compute - can hide ag1 and ag2
+            e = a * 5  # Use a to avoid fusion
+            mm1 = torch.matmul(e, e.T)
+
+            # Force ag1/ag2 to complete before mm2 (but ag3/ag4 can still be deferred)
+            # Use first 8x8 elements to match mm1's shape
+            intermediate = ag1[:8, :8] + ag2[:8, :8]
+
+            # Second compute - depends on ag1/ag2 through intermediate, can hide ag3/ag4
+            mm2 = torch.matmul(mm1 + intermediate, c[:8])
+
+            # Use all results
+            result = (
+                ag1.sum() * 1.1
+                + ag2.sum() * 1.2
+                + ag3.sum() * 1.3
+                + ag4.sum() * 1.4
+                + mm1.sum()
+                + mm2.sum()
+            )
+            return result
+
+        li = []
+        apply = functools.partial(apply_reordering_and_get_graph, out_li=li)
+        with (
+            _dynamo_dist_per_rank_init(
+                self.rank,
+                self.world_size,
+                self.backend(device_type),
+                fake_pg=not at_least_x_gpu(2),
+            ),
+            torch._inductor.config.patch(
+                "test_configs.aten_fx_overlap_insert_overlap_deps", True
+            ),
+            torch._inductor.config.patch(post_grad_custom_post_pass=apply),
+        ):
+            a = torch.ones(8, 8, dtype=torch.float, device=device_type)
+            b = torch.ones(8, 8, dtype=torch.float, device=device_type) * 2
+            c = torch.ones(8, 8, dtype=torch.float, device=device_type) * 3
+            d = torch.ones(8, 8, dtype=torch.float, device=device_type) * 4
+            ranks = list(range(self.world_size))
+
+            func_c = functools.partial(func, ranks=ranks)
+            compiled = torch.compile(func_c)
+            test_out, (code,) = run_and_get_code(compiled, a, b, c, d)
+
+            # Check that right deps are added
+            f = FileCheck()
+            for _ in range(2):
+                f.check("control_deps_op").check_same("all_gather").check_same(
+                    "subgraph_mm"
+                )
+            f.check("control_deps_op").check_same("mm").check_same("subgraph_wait")
+            f.run(li[0])
+
+            f = FileCheck()
+            f.check("def call").check(
+                "torch.ops._c10d_functional.all_gather_into_tensor"
+            )
+            f.check_count(".mm(", 1, exactly=True)
+            f.check_count(".wait(", 1, exactly=True)
+            f.check_count(
+                "torch.ops._c10d_functional.all_gather_into_tensor_", 1, exactly=True
+            )
+            f.check_count(".mm(", 1, exactly=True)
+            f.check_count(".wait(", 1, exactly=True)
+            f.run(code)
+
+            correct = func(a, b, c, d, ranks=ranks)
+            self.assertTrue(same(test_out, correct))
+

 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests

torch/_inductor/config.py

Lines changed: 3 additions & 0 deletions
@@ -2025,6 +2025,9 @@ class test_configs:
     # to be migrated when ready for use
     aten_fx_overlap_scheduling = False

+    # insert ordering deps for overlap
+    aten_fx_overlap_insert_overlap_deps = True
+
     # to be migrated when ready for use
     aten_fx_overlap_preserving_bucketing = False
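
For context, the new flag lives under torch._inductor.config.test_configs and defaults to True. A minimal sketch of toggling it per-compile, mirroring how the test above patches it; the toy function and input shapes here are illustrative and not part of this commit:

import torch

def toy(x):
    # Placeholder workload; any compiled function goes here.
    return torch.matmul(x, x.T)

# Disable the ordering-dep insertion just for this compile, the same way the
# get_patches()/get_bucket_patches() entries added in the test file do.
with torch._inductor.config.patch(
    "test_configs.aten_fx_overlap_insert_overlap_deps", False
):
    compiled = torch.compile(toy)
    out = compiled(torch.randn(8, 8))

The test-suite patches set it to False only because the inserted deps would otherwise break existing FileCheck strings.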

torch/_inductor/fx_passes/overlap_preserving_bucketer.py

Lines changed: 20 additions & 0 deletions
@@ -86,6 +86,10 @@ def bucket_collectives(self) -> None:
         from torch._dynamo.graph_deduplication import _stable_topological_sort

         _stable_topological_sort(self.graph, additional_deps)
+
+        # After topological sort, preserve dependencies using effect tokens
+        self._preserve_dependencies_with_tokens(additional_deps)
+
         self.graph.lint()

     def _find_buckets(

@@ -254,3 +258,19 @@ def _apply_bucket(
             overlap_deps[new_wait].add(info.hiding_node)

         return overlap_deps
+
+    def _preserve_dependencies_with_tokens(
+        self, additional_deps: dict[fx.Node, OrderedSet[fx.Node]]
+    ) -> None:
+        """
+        Preserve dependencies using effect tokens and with_effects higher-order op.
+
+        Uses the standalone token_dependencies utility for consistent behavior
+        across different overlap scheduling approaches.
+        """
+        from torch._inductor.fx_passes.control_dependencies import (
+            preserve_node_ordering,
+        )
+
+        if torch._inductor.config.test_configs.aten_fx_overlap_insert_overlap_deps:
+            preserve_node_ordering(self.graph, additional_deps)
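
For reference, the mapping handed to preserve_node_ordering keys each fx.Node on the set of nodes it must stay ordered after; it is the same dict[fx.Node, OrderedSet[fx.Node]] shape already fed to _stable_topological_sort in bucket_collectives. A schematic sketch of that call shape on a toy traced graph; the function g, the node names, and the OrderedSet import location are assumptions for illustration, and the final call requires this commit's control_dependencies pass:

import torch
from torch import fx
from torch.utils._ordered_set import OrderedSet  # assumed location of OrderedSet

def g(x):
    a = torch.relu(x)
    b = torch.sigmoid(x)
    return a + b

gm = fx.symbolic_trace(g)
nodes = {n.name: n for n in gm.graph.nodes}

# "sigmoid must stay ordered after relu": same shape of mapping the bucketer
# builds in _apply_bucket and hands to the ordering machinery.
additional_deps = {nodes["sigmoid"]: OrderedSet([nodes["relu"]])}

# With this commit applied (and the test_configs flag left at its default True),
# the bucketer would then call:
# from torch._inductor.fx_passes.control_dependencies import preserve_node_ordering
# preserve_node_ordering(gm.graph, additional_deps)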

torch/_inductor/fx_passes/overlap_scheduling.py

Lines changed: 30 additions & 0 deletions
@@ -378,10 +378,40 @@ def run(self) -> torch.fx.GraphModule:
             self._handle_other(node)

         self._reorder_graph()
+
         if torch._inductor.config.test_configs.aten_fx_overlap_preserving_bucketing:
             self._bucket_collectives()
+        elif torch._inductor.config.test_configs.aten_fx_overlap_insert_overlap_deps:
+            # If not bucketing, add effect tokens to preserve hiding dependencies
+            self._add_effect_tokens_for_overlap()
+
         return self.gm

+    def _add_effect_tokens_for_overlap(self) -> None:
+        """
+        Add effect tokens to preserve hiding dependency relationships when not bucketing.
+
+        This ensures that communication-compute overlap is preserved through effect tokens
+        when overlap preserving bucketing is not enabled.
+        """
+        from torch._inductor.fx_passes.control_dependencies import (
+            preserve_node_ordering,
+        )
+
+        # Collect hiding dependencies: hiding_node -> collective_start, wait -> hiding_node
+        additional_deps: dict[fx.Node, OrderedSet[fx.Node]] = defaultdict(OrderedSet)
+
+        for start_node, info in self.collective_info.items():
+            if info.hiding_node and not info.is_exposed:
+                # Compute depends on collective start (compute must wait for collective to start)
+                additional_deps[info.hiding_node].add(start_node)
+                # Wait depends on compute (wait must wait for compute to finish)
+                additional_deps[info.wait_node].add(info.hiding_node)
+
+        # Apply effect tokens to preserve these dependencies
+        if additional_deps:
+            preserve_node_ordering(self.graph, additional_deps)
+
     def _handle_other(self, node: fx.Node) -> None:
         self._schedule(node)
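
Put concretely, for each collective whose latency is hidden (hiding_node set and not exposed), the pass records two ordering edges: the hiding compute after the collective start, and the wait after the hiding compute. A standalone sketch of that bookkeeping, using plain strings in place of fx.Nodes and a made-up HiddenCollective record instead of the real collective_info entries:

from collections import defaultdict
from dataclasses import dataclass
from typing import Optional

@dataclass
class HiddenCollective:
    # Stand-ins for the fx.Nodes tracked per collective in collective_info.
    start_node: str
    wait_node: str
    hiding_node: Optional[str]
    is_exposed: bool

def collect_overlap_deps(collectives):
    deps = defaultdict(set)
    for info in collectives:
        if info.hiding_node and not info.is_exposed:
            # The hiding compute must come after the collective kicks off ...
            deps[info.hiding_node].add(info.start_node)
            # ... and the wait must come after the hiding compute finishes.
            deps[info.wait_node].add(info.hiding_node)
    return deps

# Example: one all-gather hidden behind a matmul.
print(dict(collect_overlap_deps(
    [HiddenCollective("ag1_start", "ag1_wait", "mm1", is_exposed=False)]
)))
# {'mm1': {'ag1_start'}, 'ag1_wait': {'mm1'}}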
