
Commit 6abca6a

kwen2501 authored and pytorchmergebot committed
[export][unflatten] More strictly respect scope when removing inputs (#127607)
Code snippet from TorchTitan (LLaMa):

```python
for layer in self.layers.values():
    h = layer(h, self.freqs_cis)
```

`self.freqs_cis` is a buffer of the root module (`self`). It is also an explicit arg in the call signature of the original `layer` modules. If scope is not respected (`freqs_cis`'s scope corresponds only to the root), `_sink_params` can remove `freqs_cis` from `layer`'s call signature, resulting in a runtime error.

There are two fixes in this PR:

1. We filter `inputs_to_state` down to the entries corresponding to the current scope, using existing code that does prefix matching.
2. We delay the removal of param inputs from `call_module` nodes' `args` until the `_sink_params` call on that submodule returns. `_sink_params` now returns information on which inputs were actually removed by the submodule, which is more accurate than just doing:

```python
for node in call_module_nodes:
    node.args = tuple(filter(lambda n: n.name not in inputs_to_state, node.args))
```

Before the PR:

![Screenshot 2024-05-31 at 1 40 24 AM](https://github.com/pytorch/pytorch/assets/6676466/a2e06b18-44d5-40ca-b242-0edab45075b7)

After the PR:

![Screenshot 2024-05-31 at 1 43 41 AM](https://github.com/pytorch/pytorch/assets/6676466/b72afb94-cdfa-420d-b88b-29a92bf2a0c0)

Pull Request resolved: #127607
Approved by: https://github.com/pianpwk
1 parent e216df4 · commit 6abca6a
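
To make the scope rule in fix 1 concrete, here is a minimal, self-contained sketch (hypothetical helper names, not the actual `torch.export.unflatten` code) of the prefix matching the PR relies on: a fully qualified state name such as `layers.0.weight` belongs to a module's scope only if the scope is a prefix of it, so a root buffer like `const` is never treated as sinkable inside `layers.0`.

```python
# Illustrative sketch of the scope/prefix-matching idea from fix 1.
# in_scope, filter_inputs_to_state, and the example FQNs are hypothetical,
# not the real torch.export.unflatten implementation.
from typing import Dict, List


def in_scope(state_fqn: str, scope: List[str]) -> bool:
    # A state FQN like "layers.0.weight" belongs to scope ["layers", "0"]
    # because the scope is a prefix of the split FQN.
    return state_fqn.split(".")[: len(scope)] == scope


def filter_inputs_to_state(
    inputs_to_state: Dict[str, List[str]], scope: List[str]
) -> Dict[str, List[str]]:
    # Keep only placeholder -> state mappings that live under `scope`.
    return {
        name: fqns
        for name, fqns in inputs_to_state.items()
        if any(in_scope(fqn, scope) for fqn in fqns)
    }


if __name__ == "__main__":
    inputs_to_state = {
        "const": ["const"],             # root buffer, like freqs_cis in TorchTitan
        "weight": ["layers.0.weight"],  # parameter owned by the first layer
    }
    print(filter_inputs_to_state(inputs_to_state, ["layers", "0"]))
    # -> {'weight': ['layers.0.weight']}: 'const' stays a placeholder in the
    # submodule, so the submodule's call signature is left intact.
```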

File tree

2 files changed (+106, -32 lines)


test/export/test_unflatten.py

Lines changed: 22 additions & 0 deletions
@@ -747,6 +747,28 @@ def forward(self, x):
         unep = unflatten(ep)
         self.assertTrue(torch.allclose(unep(*inps), m(*inps)))
 
+    def test_attr_as_submod_input(self):
+        class layer(torch.nn.Module):
+            def forward(self, x, const) -> torch.Tensor:
+                return x + const
+
+        class M(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.register_buffer("const", torch.ones(4, 8))
+                self.layers = torch.nn.ModuleList([layer() for _ in range(2)])
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                for layer in self.layers:
+                    x = layer(x, self.const)
+                return x
+
+        mod = M()
+        x = torch.randn(4, 8)
+        ep = export(mod, (x,))
+        unflattened = unflatten(ep)
+        torch.testing.assert_close(unflattened(x), mod(x))
+
 
 if __name__ == "__main__":
     run_tests()

torch/export/unflatten.py

Lines changed: 84 additions & 32 deletions
@@ -337,16 +337,35 @@ def add_to_consts_map(obj_id, node_name, target_name):
             inputs_to_state[n] = targets
 
         _sink_params(self, inputs_to_state, [])
-        # Check all input nodes has been processed.
-        for name, module in self.named_modules():
-            if not hasattr(module, "graph"):
-                continue
-            for node in module.graph.nodes:
-                if node.op != "placeholder":
-                    continue
-                assert (
-                    node.name not in inputs_to_state
-                ), f"{node.name} was not sunk into the module {name} which has the graph: {module.graph}"
+
+        # Helper function to check that input nodes of `module` have been processed.
+        def check_module_inputs(module, scope):
+            if hasattr(module, "graph"):
+                for node in module.graph.nodes:
+                    # sink_params() should turn placeholders into get_attr nodes
+                    # for attributes that are within scope of the current
+                    # module. We allow attributes to remain as placeholders if
+                    # they are inputs in the original module signature, meaning
+                    # they are a parent module's attribute, and therefore out of
+                    # scope of the current module.
+                    if (
+                        node.op == "placeholder"
+                        and node.name in inputs_to_state
+                        and any(
+                            fqn.split(".")[: len(scope)] == scope
+                            for fqn in inputs_to_state[node.name]
+                        )  # matching scope to avoid a wrong assert
+                    ):
+                        raise AssertionError(
+                            f"{node.name} was not sunk into the module {scope} which has the graph: {module.graph}"
+                        )
+            # Recursively check the submodules.
+            for name, submod in module.named_children():
+                scope.append(name)
+                check_module_inputs(submod, scope)
+
+        # Recursively check that all input nodes have been processed.
+        check_module_inputs(self, [])
 
         # Cache so we don't have to compute this every time.
         # NOTE: this needs to be kept in sync with the placeholders in
@@ -1010,14 +1029,23 @@ def _sink_params(
     scope: tracks where we are in the module hierarchy, so that we can emit the
     right `getattr(self, "foo.bar")` calls, etc.
     """
+    # This dict records inputs removed by child modules.
+    # Maps the module object id to the list of placeholder node names
+    # in the child module that were removed.
+    module_id_to_inputs_removed: Dict[int, List[str]] = defaultdict(list)
+
     # We need to use _modules here instead of named_children(), because we
     # explicitly want duplicate modules to show up in the traversal.
     for name, submodule in module._modules.items():
-        _sink_params(cast(torch.nn.Module, submodule), inputs_to_state, scope + [name])
+        submod_id_to_inputs_removed = _sink_params(
+            cast(torch.nn.Module, submodule), inputs_to_state, scope + [name]
+        )
+        for k, v in submod_id_to_inputs_removed.items():
+            module_id_to_inputs_removed[k].extend(v)
 
     if not hasattr(module, "graph"):
         # Not all modules have graphs defined, if they are empty modules with no operations (like ParameterList)
-        return
+        return module_id_to_inputs_removed
 
     graph = module.graph
     inputs = list(filter(lambda n: n.op == "placeholder", graph.nodes))
@@ -1026,32 +1054,49 @@ def _sink_params(
     # Also remove from call_module nodes
    call_module_nodes = filter(lambda n: n.op == "call_module", graph.nodes)
     for node in call_module_nodes:
-        node.args = tuple(filter(lambda n: n.name not in inputs_to_state, node.args))
+        submodule = _recursive_getattr(module, node.target.split("."))
+        # remove placeholder from call_module node arguments, only if we've
+        # erased the placeholder node in the corresponding _sink_params() call
+        if submodule is not None and id(submodule) in module_id_to_inputs_removed:
+            node.args = tuple(
+                filter(
+                    lambda n: n.name not in module_id_to_inputs_removed[id(submodule)],
+                    node.args,
+                )
+            )
 
+    # Filter inputs_to_state down to the entries corresponding to the current scope.
+    inputs_to_state_of_scope: Dict[torch.fx.Node, list[str]] = {}
     for node in inputs:
         if node.name not in inputs_to_state:
             continue
 
-        if len(node.users) > 0:
-            state_name = None
-            for sn in inputs_to_state[node.name]:
-                sn_split = sn.split(".")
-                if sn_split[: len(scope)] == scope:
-                    state_name = sn_split
-                    break
-
-            # If there's a mismatch beteewn scope name and state name, then
-            # there must be multuple scopes pointing to the same state name,
-            # meaning some modules are shared. In such case, we can simply skip
-            # updating the current node because another later iteration will
-            # take care of this input node when the unique match between scope
-            # and state name occurs. To make sure this always happen, we should
-            # enforce the invariant that no placeholder node in the unflattened
-            # graph appears in inputs_to_state dict, which means all the extra
-            # input nodes have been handled.
-            if state_name is None:
-                continue
+        state_name = None
+        for sn in inputs_to_state[node.name]:
+            sn_split = sn.split(".")
+            if sn_split[: len(scope)] == scope:
+                state_name = sn_split
+                break
+
+        # If there's a mismatch between scope name and state name, then
+        # there must be multiple scopes pointing to the same state name,
+        # meaning some modules are shared. In such a case, we can simply skip
+        # updating the current node because another later iteration will
+        # take care of this input node when the unique match between scope
+        # and state name occurs. To make sure this always happens, we should
+        # enforce the invariant that no placeholder node in the unflattened
+        # graph appears in the inputs_to_state dict, which means all the extra
+        # input nodes have been handled.
+        if state_name is None:
+            continue
+
+        inputs_to_state_of_scope[node] = state_name
+
+    # Record names of removed inputs for return purposes.
+    inputs_removed: List[str] = []
 
+    for node, state_name in inputs_to_state_of_scope.items():
+        if len(node.users) > 0:
             attr_path = state_name[len(scope) :]
             state_attr = _recursive_getattr(module, attr_path)
             assert isinstance(state_attr, (torch.Tensor, torch.ScriptObject))
@@ -1061,13 +1106,20 @@ def _sink_params(
             new_node = graph.create_node("get_attr", ".".join(attr_path))
 
             node.replace_all_uses_with(new_node, propagate_meta=True)
+
         graph.erase_node(node)
+        inputs_removed.append(node.name)
+
     if isinstance(module, InterpreterModule):
         module.finalize()
 
+    return {id(module): inputs_removed}
+
 
 def _recursive_getattr(obj, attr_path):
     for attr in attr_path:
+        if not hasattr(obj, attr):
+            return None
         obj = getattr(obj, attr)
 
     return obj
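
The diff above changes `_sink_params` to report what it removed rather than having the parent guess. As a rough illustration of that contract (hypothetical helper names, not code from this file), the parent merges each child call's `{id(submodule): [removed placeholder names]}` report and prunes a `call_module` node's args only against what its own target actually erased:

```python
# Sketch of the "report removed inputs, then prune call args" contract from
# fix 2. merge_removed and prune_call_args are illustrative helpers, not part
# of torch.export.unflatten.
from collections import defaultdict
from typing import Dict, List


def merge_removed(
    parent: Dict[int, List[str]], child: Dict[int, List[str]]
) -> Dict[int, List[str]]:
    # Fold a child call's report into the parent's running record.
    merged: Dict[int, List[str]] = defaultdict(list, {k: list(v) for k, v in parent.items()})
    for k, v in child.items():
        merged[k].extend(v)
    return dict(merged)


def prune_call_args(
    call_args: List[str], callee_id: int, removed: Dict[int, List[str]]
) -> List[str]:
    # Drop only the arguments whose placeholders the callee actually erased.
    gone = set(removed.get(callee_id, []))
    return [a for a in call_args if a not in gone]


if __name__ == "__main__":
    layer_id = 42  # stand-in for id(submodule)
    # The layer reports that it removed nothing: the root buffer passed to it
    # was out of its scope, so its placeholder (and the argument) must stay.
    removed = merge_removed({}, {layer_id: []})
    print(prune_call_args(["x", "const"], layer_id, removed))  # ['x', 'const']
```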
