Update on "respect aten planned overlap in inductor"

eellison · eellison · commit 3ee7d6c54401 · 2025-10-03T16:18:33.000-07:00
Now that we have a HOP to add implicit deps, use those deps for comm/compute overlap.

cc H-Huang awgu wanchaol fegin fduwjj wz337 wconstab d4l3k pragupta msaroufim dcci voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov coconutruben

[ghstack-poisoned]
diff --git a/test/distributed/test_aten_comm_compute_reordering.py b/test/distributed/test_aten_comm_compute_reordering.py
@@ -817,23 +817,17 @@ def func(a, b, c, d, *, ranks):
             # Check that right deps are added
             f = FileCheck()
             for _ in range(2):
-                f.check("control_deps_op").check_same("all_gather").check_same(
+                f.check("control_deps").check_same("all_gather").check_same(
                     "subgraph_mm"
                 )
-                f.check("control_deps_op").check_same("mm").check_same("subgraph_wait")
+                f.check("control_deps").check_same("mm").check_same("subgraph_wait")
             f.run(li[0])
 
             f = FileCheck()
-            f.check("def call").check(
-                "torch.ops._c10d_functional.all_gather_into_tensor"
-            )
-            f.check_count(".mm(", 1, exactly=True)
-            f.check_count(".wait(", 1, exactly=True)
-            f.check_count(
-                "torch.ops._c10d_functional.all_gather_into_tensor_", 1, exactly=True
-            )
-            f.check_count(".mm(", 1, exactly=True)
-            f.check_count(".wait(", 1, exactly=True)
+            for _ in range(2):
+                f.check_count("all_gather_into_tensor_out.default(", 1, exactly=True)
+                f.check_count("extern_kernels.mm(", 1, exactly=True)
+                f.check_count("wait_tensor.default(", 1, exactly=True)
             f.run(code)
 
             correct = func(a, b, c, d, ranks=ranks)
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
@@ -7255,29 +7255,34 @@ def control_deps_op_lowering(additional_deps, subgraph_fn, *args):
 
     output = None
 
+    operation_len = len(V.graph.operations)
     assert len(subgraph_fn.graph_module.graph.find_nodes(op="placeholder")) == len(args)
     for i, node in enumerate(subgraph_fn.graph_module.graph.nodes):
         if node.op == "placeholder":
+            assert node not in V.graph.env
             V.graph.env[node] = args[i]
             continue
         elif node.op == "output":
             args, kwargs = V.graph.fetch_args_kwargs_from_env(node)
             output = torch.fx.Interpreter.output(V.graph, node, args, kwargs)
         else:
+            assert node not in V.graph.env
             V.graph.env[node] = V.graph.run_node(node)
 
     assert output is not None and additional_deps
-    output_list = output if isinstance(output, (list, tuple)) else [output]
 
-    for out in output_list:
-        if not isinstance(out, IRNode):
-            continue
-
-        # need to realize in order to add the dep
-        out.realize()
-        out_name = out.get_name()
+    # Some operators, like wait_tensor, just return their input,
+    # so it's more robust to add the dep to the operation itself.
+    # Otherwise you can end up with a cycle:
+    #   a = coll
+    #   b = control_deps(a, mm, ...)
+    #   c = control_deps(b, wait, ...)
+    # If c == a, then you have a cycle.
+    for op in V.graph.operations[operation_len:]:
         for dep_name in dep_names:
-            V.graph.additional_buffer_deps[out_name].add(dep_name)
+            op_name = op.operation_name
+            assert op_name is not None
+            V.graph.additional_buffer_deps[op_name].add(dep_name)
 
     return output
 
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
@@ -2681,9 +2681,9 @@ def add_user(
                             )
                             add_user(other_name, node, is_weak=True)
 
-                for add_dep in V.graph.additional_buffer_deps[buf.get_name()]:
-                    add_user(add_dep, node, is_weak=True)
-                    node.add_fake_dep(WeakDep(add_dep, node.get_name()))
+            for add_dep in V.graph.additional_buffer_deps[node.get_name()]:
+                add_user(add_dep, node, is_weak=True)
+                node.add_fake_dep(WeakDep(add_dep, node.get_name()))
 
             # add normal non-mutation dependencies
             for read in node.read_writes.reads: