
Commit 415e641

eellison authored and pytorchmergebot committed
Limit path search within range (#164581)
When checking whether two nodes are dependent, limit the path search to the range bounded by their node indices.

Pull Request resolved: #164581
Approved by: https://github.com/ezyang
ghstack dependencies: #164568, #164569
1 parent 11f5f65 commit 415e641
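
The optimization in brief: the scheduler numbers nodes in topological order, so every dependency edge points from a higher index back to a lower one, and (as long as the extra dependencies also respect schedule order) any dependency path between two nodes stays inside the index window spanned by its endpoints. The search can therefore skip any node whose index falls outside that window. A minimal standalone sketch of the idea, with illustrative names (`deps`, `node_to_idx`) rather than the PR's own code:

# Sketch of bounded backward reachability, assuming nodes are numbered in
# topological order (every dependency has a smaller index than its user).
def has_path_bounded(deps, node_to_idx, source, target, bounds):
    min_idx, max_idx = bounds
    stack, seen = [target], {target}
    while stack:
        current = stack.pop()
        for dep in deps[current]:
            # A path from source to target cannot leave the index window,
            # so anything outside it can be skipped outright.
            if not (min_idx <= node_to_idx[dep] <= max_idx):
                continue
            if dep == source:
                return True
            if dep not in seen:
                seen.add(dep)
                stack.append(dep)
    return False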

File tree

4 files changed: +155 −49

- test/inductor/test_augmented_graph_helper.py
- torch/_inductor/augmented_graph_helper.py
- torch/_inductor/fx_passes/overlap_preserving_bucketer.py
- torch/_inductor/fx_passes/overlap_scheduling.py


test/inductor/test_augmented_graph_helper.py

Lines changed: 29 additions & 0 deletions
@@ -339,6 +339,35 @@ def test_multiple_merge_unmerge(self):
         self.assertEqual(self.tracker.merge_sets[nodes[0]], {nodes[0]})
         self.assertEqual(len(self.tracker.merge_sets[nodes[1]]), 1)
 
+    def test_has_path_with_bounded_search(self):
+        """Test that bounded search correctly respects search range bounds."""
+        # Create a simple linear chain: x -> A -> B -> C -> D
+        graph = fx.Graph()
+        x = graph.placeholder("x")
+        a = graph.call_function(torch.neg, args=(x,), name="A")
+        b = graph.call_function(torch.abs, args=(a,), name="B")
+        c = graph.call_function(torch.relu, args=(b,), name="C")
+        d = graph.call_function(torch.sigmoid, args=(c,), name="D")
+        graph.output(d)
+
+        node_to_idx = {node: idx for idx, node in enumerate(graph.nodes)}
+        tracker = AugmentedGraphHelper(graph, node_to_idx=node_to_idx)
+
+        # Path exists from A to D: A -> B -> C -> D
+        self.assertTrue(tracker.has_path(a, d))
+
+        # Test with correct bounds: include all nodes in the path
+        a_idx = node_to_idx[a]
+        d_idx = node_to_idx[d]
+        # Bounds that include the full path should find it
+        self.assertTrue(tracker.has_path(a, d, bounded_search_range=(a_idx, d_idx)))
+
+        # Test with incorrect bounds: exclude critical intermediate nodes
+        c_idx = node_to_idx[c]
+        # Bounds that exclude A and B (only allowing C and D) should NOT find the path
+        # because the search can't reach back to A
+        self.assertFalse(tracker.has_path(a, d, bounded_search_range=(c_idx, d_idx)))
+
 
 if __name__ == "__main__":
     from torch._inductor.test_case import run_tests
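
Worth noting about the last assertion: `has_path` walks backward from `target` via `get_merged_deps`, so with bounds `(c_idx, d_idx)` the search starting at D can reach C but must skip B (whose index lies below `c_idx`), and therefore never reaches A.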

torch/_inductor/augmented_graph_helper.py

Lines changed: 25 additions & 12 deletions
@@ -1,4 +1,5 @@
 from collections import defaultdict
+from typing import Optional
 
 import torch
 import torch.fx as fx
@@ -9,18 +10,20 @@ class AugmentedGraphHelper:
     """
     Graph helper that augments the original graph with additional
     dependencies and uses, plus tracks node equivalences for coalescing.
-
     TODO: if this becomes too large of compile time, consider binding
     graphcycles.cc
     """
 
-    def __init__(self, graph: fx.Graph):
+    def __init__(
+        self, graph: fx.Graph, node_to_idx: Optional[dict[fx.Node, int]] = None
+    ):
         # Each node starts in its own singleton set
         self.graph = graph
         self.merge_sets = {node: OrderedSet([node]) for node in graph.nodes}
-
         # Extra dependencies: node depends on dep (dep must come before node)
         self.extra_deps: dict[fx.Node, OrderedSet[fx.Node]] = defaultdict(OrderedSet)
+        # Optional node to index mapping for bounded searches
+        self.node_to_idx = node_to_idx
 
     def add_extra_dep(self, *, n: fx.Node, dep: fx.Node) -> None:
         """Add extra dependency: node depends on dep."""
@@ -33,25 +36,20 @@ def merge_to_set(self, existing_node: fx.Node, new_node: fx.Node) -> None:
         existing_set = self.merge_sets[existing_node]
         new_set = self.merge_sets[new_node]
         assert len(new_set) == 1
-
         # Add all nodes from new_set to existing_set
         existing_set.update(new_set)
-
         # Update all nodes from new_set to point to existing_set
         for node in new_set:
             self.merge_sets[node] = existing_set
 
     def unmerge_node(self, node: fx.Node) -> None:
         """Remove a node from its merge set, making it singleton."""
         old_set = self.merge_sets[node]
-
         # If already singleton, nothing to do
         if len(old_set) == 1:
             return
-
         # Remove from old set
         old_set.remove(node)
-
         # Make node singleton
         self.merge_sets[node] = OrderedSet([node])
 
@@ -63,22 +61,29 @@ def get_merged_deps(self, node: fx.Node) -> OrderedSet[fx.Node]:
         2. Extra deps of node and its merge equivalents
         """
         deps: OrderedSet[fx.Node] = OrderedSet()
-
         # For each node in the merge set
         for merged_node in self.merge_sets[node]:
             # Add direct dependencies from all_input_nodes
             deps.update(merged_node.all_input_nodes)
             # Add extra dependencies
             deps.update(self.extra_deps[merged_node])
-
         return deps
 
     def has_cycle(self) -> bool:
         merged_deps = {n: self.get_merged_deps(n) for n in self.graph.nodes}
         return torch._dynamo.graph_deduplication._has_cycle(self.graph, merged_deps)
 
-    def has_path(self, source: fx.Node, target: fx.Node) -> bool:
-        """Check if there's a path from source to target."""
+    def has_path(
+        self,
+        source: fx.Node,
+        target: fx.Node,
+        bounded_search_range: Optional[tuple[int, int]] = None,
+    ) -> bool:
+        """
+        Check if there's a path from source to target.
+
+        If bounds are provided, only search nodes whose idx falls within the range.
+        """
        # we should not be checking path from node to itself
        assert self.merge_sets[source] is not self.merge_sets[target]
@@ -92,6 +97,14 @@ def has_path(self, source: fx.Node, target: fx.Node) -> bool:
 
            # Get all dependencies
            for dep in self.get_merged_deps(current):
+                # If using bounds, skip nodes outside the range
+                if bounded_search_range is not None and self.node_to_idx is not None:
+                    min_idx, max_idx = bounded_search_range
+                    dep_idx = self.node_to_idx.get(dep)
+                    assert dep_idx is not None
+                    if dep_idx < min_idx or dep_idx > max_idx:
+                        continue
+
                # Check if we reached source or its equivalent
                if dep in self.merge_sets[source]:
                    return True
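
One caveat the test above makes explicit: the bounds act as a hard filter, not merely an optimization, so if the supplied range fails to cover some node on the only path, `has_path` returns False. Callers are responsible for passing a window wide enough to cover every possible path; the bucketer below guarantees this by taking the union of the two endpoints' start/wait index ranges over the topologically ordered schedule.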

torch/_inductor/fx_passes/overlap_preserving_bucketer.py

Lines changed: 101 additions & 27 deletions
@@ -1,4 +1,5 @@
 from collections import defaultdict
+from dataclasses import dataclass, field
 from typing import Optional
 
 import torch
@@ -11,10 +12,58 @@
     is_reduce_scatter_tensor as is_reduce_scatter,
     is_wait_tensor,
 )
-from torch._inductor.fx_passes.overlap_scheduling import CollBucket, CollectiveInfo
+from torch._inductor.fx_passes.overlap_scheduling import CollectiveInfo
 from torch.utils._ordered_set import OrderedSet
 
 
+@dataclass(slots=True)
+class CollBucket:
+    """Track information about a bucket of collectives."""
+
+    collectives: list[fx.Node] = field(
+        default_factory=list
+    )  # Original collective starts
+    total_bytes: int = 0
+    min_start_idx: Optional[int] = None  # Minimum index of collective starts
+    max_wait_idx: Optional[int] = None  # Maximum index of collective waits
+
+    bucketed_start: Optional[fx.Node] = None  # After bucketing
+    bucketed_wait: Optional[fx.Node] = None  # After bucketing
+
+    def add_collective(
+        self,
+        coll_info: CollectiveInfo,
+        node_idx: dict[fx.Node, int],
+    ) -> None:
+        """
+        Add a collective to this bucket and update bucket metadata.
+
+        This handles all updates needed when adding a collective:
+        - Appends to collectives list
+        - Updates total bytes
+        - Updates min_start_idx and max_wait_idx
+        """
+        collective = coll_info.start_node
+
+        # Add to bucket
+        self.collectives.append(collective)
+        self.total_bytes += coll_info.size_bytes
+
+        # Update min start index
+        start_idx = node_idx[collective]
+        if self.min_start_idx is None:
+            self.min_start_idx = start_idx
+        else:
+            self.min_start_idx = min(self.min_start_idx, start_idx)
+
+        # Update max wait index
+        wait_idx = node_idx[coll_info.wait_node]
+        if self.max_wait_idx is None:
+            self.max_wait_idx = wait_idx
+        else:
+            self.max_wait_idx = max(self.max_wait_idx, wait_idx)
+
+
 def bucket_key(node: torch.fx.Node) -> Optional[object]:
     if is_all_gather(node):
         return _ag_group_key(node)
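
To get a feel for the new bookkeeping, a small sketch; `FakeCollInfo` is a stand-in defined only for this example (the real `CollectiveInfo` lives in overlap_scheduling.py and carries more fields), and the node names and indices are invented:

from dataclasses import dataclass

# Stand-in carrying only the fields add_collective reads.
@dataclass
class FakeCollInfo:
    start_node: object
    wait_node: object
    size_bytes: int

node_idx = {"ag0": 0, "w0": 4, "ag1": 2, "w1": 7}
bucket = CollBucket()
bucket.add_collective(FakeCollInfo("ag0", "w0", 1024), node_idx)
bucket.add_collective(FakeCollInfo("ag1", "w1", 2048), node_idx)
# The bucket's index window now spans both collectives' start/wait ranges,
# which is exactly what the bounded path searches consume.
assert (bucket.min_start_idx, bucket.max_wait_idx) == (0, 7)
assert bucket.total_bytes == 3072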
@@ -44,20 +93,19 @@ def __init__(
         self.scheduled = scheduled
         self.max_bucket_memory_gb = max_bucket_memory_gb
         self.node_idx = {n: i for i, n in enumerate(scheduled)}
+        self.aug_graph = AugmentedGraphHelper(self.graph, node_to_idx=self.node_idx)
 
     def bucket_collectives(self) -> None:
         """Main entry point for bucketing collectives."""
 
-        aug_graph = AugmentedGraphHelper(self.graph)
-
         # Add extra dependencies for hidden collectives
         # For each hidden collective, add: compute -> start and wait -> compute
         for start_node, info in self.collective_info.items():
             if info.hiding_node and not info.is_exposed:
                 # Add edge: hiding_compute depends on start (start must come before compute)
-                aug_graph.add_extra_dep(n=info.hiding_node, dep=start_node)
+                self.aug_graph.add_extra_dep(n=info.hiding_node, dep=start_node)
                 # Add edge: wait depends on hiding_compute (compute must come before wait)
-                aug_graph.add_extra_dep(n=info.wait_node, dep=info.hiding_node)
+                self.aug_graph.add_extra_dep(n=info.wait_node, dep=info.hiding_node)
 
         # Group collectives by bucket key (type, group, etc.)
         grouped_collectives: dict[object, OrderedSet[fx.Node]] = defaultdict(OrderedSet)
@@ -68,7 +116,7 @@ def bucket_collectives(self) -> None:
 
         all_buckets: list[CollBucket] = []
         for collective_group in grouped_collectives.values():
-            buckets = self._find_buckets(collective_group, aug_graph)
+            buckets = self._find_buckets(collective_group)
             all_buckets.extend(buckets)
 
         # Collect all extra dependencies to preserve after bucketing
@@ -95,7 +143,6 @@ def bucket_collectives(self) -> None:
     def _find_buckets(
         self,
         collective_group: OrderedSet[fx.Node],
-        aug_graph: AugmentedGraphHelper,
     ) -> list[CollBucket]:
         """Find valid buckets within a group of similar collectives."""
 
@@ -108,13 +155,10 @@ def _find_buckets(
                 continue
 
             # Initialize bucket with first collective
-            bucket_info = CollBucket(
-                collectives=[start_node],
-                total_bytes=self.collective_info[start_node].size_bytes,
-            )
+            bucket_info = CollBucket()
+            bucket_info.add_collective(self.collective_info[start_node], self.node_idx)
             processed.add(start_node)
 
-            # TODO - limit within range
             for candidate in collective_group:
                 if candidate in processed:
                     continue
@@ -123,9 +167,10 @@ def _find_buckets(
                 if bucket_info.total_bytes + candidate_bytes > max_bucket_bytes:
                     continue
 
-                if self._can_add_to_bucket(bucket_info, candidate, aug_graph):
-                    bucket_info.collectives.append(candidate)
-                    bucket_info.total_bytes += candidate_bytes
+                if self._can_add_to_bucket(bucket_info, candidate):
+                    bucket_info.add_collective(
+                        self.collective_info[candidate], self.node_idx
+                    )
                     processed.add(candidate)
 
             if len(bucket_info.collectives) > 1:
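Routing every insertion through `add_collective` (instead of appending and summing inline, as before) keeps `min_start_idx` and `max_wait_idx` up to date, which the bounded path queries below rely on; it also retires the old `# TODO - limit within range`.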
@@ -137,11 +182,30 @@ def _ancestor_dep(self, n1: fx.Node, n2: fx.Node) -> bool:
         """Check if there's an ancestor relationship between two nodes."""
         return n1 in self.node_ancestors[n2] or n2 in self.node_ancestors[n1]
 
+    def _has_path(
+        self,
+        source: fx.Node,
+        source_bounds: tuple[int, int],
+        target: fx.Node,
+        target_bounds: tuple[int, int],
+    ) -> bool:
+        """Check if there's a path from source to target with bounded search."""
+
+        search_range = (
+            min(source_bounds[0], target_bounds[0]),
+            max(source_bounds[1], target_bounds[1]),
+        )
+
+        return self.aug_graph.has_path(
+            source,
+            target,
+            bounded_search_range=search_range,
+        )
+
     def _can_add_to_bucket(
         self,
         bucket_info: CollBucket,
         candidate: fx.Node,
-        aug_graph: AugmentedGraphHelper,
     ) -> bool:
         """
         Check if candidate can be added to bucket without interfering
@@ -174,29 +238,39 @@ def _can_add_to_bucket(
         # Check if there's a path between any existing start and candidate start.
         # Because the collectives have already been merged, we can just start from one
         # of them.
-        # TODO: we have a range of possible idxs of the merged node, and idx of new node.
-        # we should not do path search beyond that range
         existing_coll = bucket_info.collectives[0]
-        if aug_graph.has_path(existing_coll, candidate):
+
+        # Calculate bounds for path search
+        candidate_idx = self.node_idx[candidate]
+        candidate_wait_idx = self.node_idx[candidate_wait]
+
+        bucket_min_idx = bucket_info.min_start_idx
+        bucket_max_idx = bucket_info.max_wait_idx
+        assert bucket_min_idx is not None and bucket_max_idx is not None
+        existing_bounds = (bucket_min_idx, bucket_max_idx)
+        candidate_bounds = (candidate_idx, candidate_wait_idx)
+
+        if self._has_path(existing_coll, existing_bounds, candidate, candidate_bounds):
             return False
-        if aug_graph.has_path(candidate, existing_coll):
+        if self._has_path(candidate, candidate_bounds, existing_coll, existing_bounds):
             return False
 
         # Safe to merge starts - do the merge
-        aug_graph.merge_to_set(existing_coll, candidate)
+        self.aug_graph.merge_to_set(existing_coll, candidate)
 
         # Step 3: Check and merge waits
         existing_wait = self.collective_info[existing_coll].wait_node
-        candidate_wait = candidate_info.wait_node
-        # TODO - as above, limit search by idx
-        if aug_graph.has_path(existing_wait, candidate_wait) or aug_graph.has_path(
-            candidate_wait, existing_wait
+
+        if self._has_path(
+            existing_wait, existing_bounds, candidate_wait, candidate_bounds
+        ) or self._has_path(
+            candidate_wait, candidate_bounds, existing_wait, existing_bounds
         ):
             # Unmerge the start we just merged
-            aug_graph.unmerge_node(candidate)
+            self.aug_graph.unmerge_node(candidate)
             return False
 
-        aug_graph.merge_to_set(existing_wait, candidate_wait)
+        self.aug_graph.merge_to_set(existing_wait, candidate_wait)
         return True
 
     def _apply_bucket(
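
Note the design choice in `_has_path`: the search window is the union of the two intervals, since a dependency chain between the two collectives may pass through nodes anywhere in either span. With, say, `existing_bounds = (10, 50)` and `candidate_bounds = (30, 70)` (illustrative numbers), the bounded search covers indices 10 through 70.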

torch/_inductor/fx_passes/overlap_scheduling.py

Lines changed: 0 additions & 10 deletions
@@ -157,16 +157,6 @@ def is_exposed(self) -> bool:
         return self.exposed_time_ms != 0
 
 
-@dataclass
-class CollBucket:
-    """Track information about a bucket of collectives."""
-
-    collectives: list[fx.Node]  # Original collective starts
-    bucketed_start: Optional[fx.Node] = None  # After bucketing
-    bucketed_wait: Optional[fx.Node] = None  # After bucketing
-    total_bytes: int = 0
-
-
 class OverlapScheduler:
     """
     Scheduler that reorders operations to maximize compute-collective overlap.
