Commit 1c36f62

[TRTLLM-7073][feat] Support torch compile for PP for Llama and DeepSeekV3

Signed-off-by: Jin Li <59594262+liji-nv@users.noreply.github.com>
1 parent 5792464 commit 1c36f62

File tree: 10 files changed, +88 -140 lines changed


tensorrt_llm/_torch/compilation/backend.py
Lines changed: 13 additions & 4 deletions

@@ -11,6 +11,7 @@
 
 import tensorrt_llm
 from tensorrt_llm import logger
+from tensorrt_llm.mapping import Mapping
 
 from .multi_stream.auto_multi_stream import multi_stream_schedule
 from .patterns.ar_residual_norm import register_ar_fusions
@@ -39,13 +40,16 @@ def __init__(
         enable_piecewise_cuda_graph: bool = False,
         capture_num_tokens: Optional[List[int]] = None,
         max_num_streams: int = 1,
+        mapping=None,
     ) -> None:
         super().__init__()
         self.elapsed_time = 0
         self.module_inference_event = []
         self.module_inference_time = 0
         self.call_count = 0
-        self.custom_passes = Backend.get_custom_pass(enable_userbuffers)
+        self.mapping = mapping
+        self.custom_passes = Backend.get_custom_pass(enable_userbuffers,
+                                                     mapping)
         self.rank = tensorrt_llm.mpi_rank()
         self.enable_inductor = enable_inductor
         self.capture_num_tokens = sorted(capture_num_tokens or [])
@@ -63,8 +67,7 @@ def __init__(
         self.match_count = []
 
     @classmethod
-    def get_custom_pass(cls, enable_userbuffers):
-        # TODO: add pp + tp support
+    def get_custom_pass(cls, enable_userbuffers, mapping: Mapping):
         world_size = tensorrt_llm.mpi_world_size()
         if not cls._custom_pass_instances:
             # Really naive pass manager here
@@ -75,7 +78,8 @@ def get_custom_pass(cls, enable_userbuffers):
                 os.environ["DISABLE_LAMPORT_REDUCE_NORM_FUSION"] = "1"
             ub_enabled = enable_userbuffers and tensorrt_llm.bindings.internal.userbuffers.ub_supported(
             )
-            register_ar_fusions(cls._custom_pass_instances, ub_enabled)
+            register_ar_fusions(cls._custom_pass_instances, mapping,
+                                ub_enabled)
         else:
             register_add_norm(cls._custom_pass_instances[0])
         return cls._custom_pass_instances
@@ -150,6 +154,11 @@ def __call__(self, gm: GraphModule,
                 assert isinstance(example_value, FakeTensor)
                 self.input_num_tokens = example_value.shape[0]
                 break
+            if node.name == "l_position_ids_":
+                example_value = node.meta["example_value"]
+                assert isinstance(example_value, FakeTensor)
+                self.input_num_tokens = example_value.shape[-1]
+                break
 
         if self.piecewise_cuda_graph:
             assert (
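
To illustrate the new plumbing, here is a minimal sketch of building the compile backend with an explicit parallel layout. The Mapping values describe an assumed 2-way TP x 2-way PP run and are not taken from this diff; only the `mapping=` keyword itself is what this commit adds.

```python
from tensorrt_llm.mapping import Mapping
from tensorrt_llm._torch.compilation.backend import Backend

# Assumed 2x2 layout: the fusion passes now see the real tp/pp split instead of
# treating the whole MPI world as tensor parallel.
mapping = Mapping(world_size=4, tp_size=2, pp_size=2, rank=0)
backend = Backend(mapping=mapping)  # forwarded to Backend.get_custom_pass()

# The object is then usable as a torch.compile backend (call site assumed):
# compiled = torch.compile(model, backend=backend, fullgraph=True)
```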

tensorrt_llm/_torch/compilation/multi_stream/auto_multi_stream.py
Lines changed: 3 additions & 1 deletion

@@ -209,7 +209,9 @@ def flatten_args(args):
             for inplace_arg in inplace_map[func].values():
                 # At this stage, all inplace op must be using kwargs for all params
                 assert inplace_arg in node.kwargs
-                latest_inplace_stat[node.kwargs[inplace_arg]] = vertex
+                args = flatten_args(node.kwargs[inplace_arg])
+                for arg in args:
+                    latest_inplace_stat[arg] = vertex
 
         for edge in in_edges.values():
             edge.out_edges.append(vertex)
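
The scheduler change above matters because an in-place kwarg can now be a list of graph nodes (the `tensors` argument of `trtllm::pp_recv`/`trtllm::pp_send`), and every element has to be recorded as the latest in-place write. A rough stand-in for the `flatten_args` helper named in the hunk header, assumed rather than copied from the file:

```python
def flatten_args(args):
    # Expand nested lists/tuples of graph nodes into a flat list; pass single
    # nodes through unchanged. This is an assumed equivalent, not the real code.
    if isinstance(args, (list, tuple)):
        flat = []
        for a in args:
            flat.extend(flatten_args(a))
        return flat
    return [args]
```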

tensorrt_llm/_torch/compilation/patterns/ar_residual_norm.py
Lines changed: 19 additions & 54 deletions

@@ -8,21 +8,14 @@
                                              PatternMatcherPass, fwd_only,
                                              register_replacement)
 
-import tensorrt_llm
-
 from ...distributed import AllReduceFusionOp, AllReduceStrategy
 
 aten = torch.ops.aten
 from tensorrt_llm.mapping import Mapping
 
 
-def register_ar_residual_norm(custom_pass: PatternMatcherPass):
-    # TODO: add pp + tp support
-    mapping = Mapping(
-        world_size=tensorrt_llm.mpi_world_size(),
-        tp_size=tensorrt_llm.mpi_world_size(),
-        rank=tensorrt_llm.mpi_rank(),
-    )
+def register_ar_residual_norm(custom_pass: PatternMatcherPass,
+                              mapping: Mapping):
     residual_key = KeywordArg("residual")
     trtllm_allreduce_default = CallFunction(
         torch.ops.trtllm.allreduce.default, KeywordArg("input"), None, None,
@@ -117,14 +110,8 @@ def check_non_ub_strategy(match, strategy_node) -> bool:
     return True
 
 
-def register_ar_residual_norm_out_fp8_quant(custom_pass: PatternMatcherPass):
-    # TODO: add pp + tp support
-    mapping = Mapping(
-        world_size=tensorrt_llm.mpi_world_size(),
-        tp_size=tensorrt_llm.mpi_world_size(),
-        rank=tensorrt_llm.mpi_rank(),
-    )
-
+def register_ar_residual_norm_out_fp8_quant(custom_pass: PatternMatcherPass,
+                                            mapping: Mapping):
     input_node = KeywordArg("input")
     strategy_node = KeywordArg("strategy")
     allreduce_default = CallFunction(torch.ops.trtllm.allreduce.default,
@@ -200,14 +187,8 @@ def extra_check(match: Match) -> bool:
     )
 
 
-def register_ar_residual_norm_fp8_quant(custom_pass: PatternMatcherPass):
-    # TODO: add pp + tp support
-    mapping = Mapping(
-        world_size=tensorrt_llm.mpi_world_size(),
-        tp_size=tensorrt_llm.mpi_world_size(),
-        rank=tensorrt_llm.mpi_rank(),
-    )
-
+def register_ar_residual_norm_fp8_quant(custom_pass: PatternMatcherPass,
+                                        mapping: Mapping):
     input_node = KeywordArg("input")
     strategy_node = KeywordArg("strategy")
     allreduce_default = CallFunction(torch.ops.trtllm.allreduce.default,
@@ -282,14 +263,8 @@ def extra_check(match: Match) -> bool:
     )
 
 
-def register_ar_residual_norm_out_fp4_quant(custom_pass: PatternMatcherPass):
-    # TODO: add pp + tp support
-    mapping = Mapping(
-        world_size=tensorrt_llm.mpi_world_size(),
-        tp_size=tensorrt_llm.mpi_world_size(),
-        rank=tensorrt_llm.mpi_rank(),
-    )
-
+def register_ar_residual_norm_out_fp4_quant(custom_pass: PatternMatcherPass,
+                                            mapping: Mapping):
     input_node = KeywordArg("input")
     strategy_node = KeywordArg("strategy")
     allreduce_default = CallFunction(torch.ops.trtllm.allreduce.default,
@@ -360,14 +335,8 @@ def extra_check(match: Match) -> bool:
     )
 
 
-def register_ar_residual_norm_fp4_quant(custom_pass: PatternMatcherPass):
-    # TODO: add pp + tp support
-    mapping = Mapping(
-        world_size=tensorrt_llm.mpi_world_size(),
-        tp_size=tensorrt_llm.mpi_world_size(),
-        rank=tensorrt_llm.mpi_rank(),
-    )
-
+def register_ar_residual_norm_fp4_quant(custom_pass: PatternMatcherPass,
+                                        mapping: Mapping):
     input_node = KeywordArg("input")
     strategy_node = KeywordArg("strategy")
     allreduce_default = CallFunction(torch.ops.trtllm.allreduce.default,
@@ -437,12 +406,8 @@ def extra_check(match: Match) -> bool:
     )
 
 
-def register_ub_patterns(custom_passes: List[PatternMatcherPass]):
-    mapping = Mapping(
-        world_size=tensorrt_llm.mpi_world_size(),
-        tp_size=tensorrt_llm.mpi_world_size(),
-        rank=tensorrt_llm.mpi_rank(),
-    )
+def register_ub_patterns(custom_passes: List[PatternMatcherPass],
+                         mapping: Mapping):
 
     def register_convert_supported_ar_to_ub(custom_pass: PatternMatcherPass):
         strategy = int(AllReduceStrategy.AUTO)
@@ -717,16 +682,16 @@ def target_finalize_pattern(
 
 
 def register_ar_fusions(custom_passes: List[PatternMatcherPass],
-                        enable_ub: bool):
-    register_ar_residual_norm(custom_passes[-1])
+                        mapping: Mapping, enable_ub: bool):
+    register_ar_residual_norm(custom_passes[-1], mapping)
 
     custom_passes.append(PatternMatcherPass())
-    register_ar_residual_norm_fp8_quant(custom_passes[-1])
-    register_ar_residual_norm_fp4_quant(custom_passes[-1])
+    register_ar_residual_norm_fp8_quant(custom_passes[-1], mapping)
+    register_ar_residual_norm_fp4_quant(custom_passes[-1], mapping)
     # AR-Residual-Norm-Out-Quant-X is not supported by Userbuffers kernel.
     if not enable_ub:
-        register_ar_residual_norm_out_fp8_quant(custom_passes[-1])
-        register_ar_residual_norm_out_fp4_quant(custom_passes[-1])
+        register_ar_residual_norm_out_fp8_quant(custom_passes[-1], mapping)
+        register_ar_residual_norm_out_fp4_quant(custom_passes[-1], mapping)
 
     if enable_ub:
-        register_ub_patterns(custom_passes)
+        register_ub_patterns(custom_passes, mapping)
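
The net effect of this file is that every pattern-registration helper now receives the caller's Mapping instead of fabricating a TP-only one from the MPI world size, which mislabeled pipeline ranks as tensor-parallel ranks. A sketch of the new call site; the 2x2 layout is illustrative and not taken from the diff:

```python
from torch._inductor.pattern_matcher import PatternMatcherPass

from tensorrt_llm.mapping import Mapping
from tensorrt_llm._torch.compilation.patterns.ar_residual_norm import register_ar_fusions

custom_passes = [PatternMatcherPass()]
mapping = Mapping(world_size=4, tp_size=2, pp_size=2, rank=0)  # assumed layout
register_ar_fusions(custom_passes, mapping, enable_ub=False)   # new signature
```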

tensorrt_llm/_torch/compilation/utils.py
Lines changed: 6 additions & 0 deletions

@@ -76,6 +76,12 @@ def inplace_info():
         },
         torch.ops.trtllm.logits_bitmask.default: {
            1: "logits"
+        },
+        torch.ops.trtllm.pp_recv.default: {
+            1: "tensors"
+        },
+        torch.ops.trtllm.pp_send.default: {
+            1: "tensors"
         }
     }
     return inplace_map
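
`inplace_info()` maps each mutating custom op to `{argument position: kwarg name}`; registering `pp_recv`/`pp_send` here is what lets the multi-stream scheduler (see the `auto_multi_stream.py` change above) treat their `tensors` list as written in place. A small usage sketch, assuming the trtllm ops have already been registered by importing the package:

```python
import torch
import tensorrt_llm  # assumption: importing the package registers torch.ops.trtllm.*
from tensorrt_llm._torch.compilation.utils import inplace_info

inplace_map = inplace_info()
# Which arguments does pp_send mutate?  ->  {1: "tensors"}
print(inplace_map[torch.ops.trtllm.pp_send.default])
```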

tensorrt_llm/_torch/distributed/communicator.py
Lines changed: 6 additions & 3 deletions

@@ -477,11 +477,14 @@ def init_pp_comm(mapping):
     _pp_comm = PPComm(mapping)
 
 
-def pp_recv(tensor):
+@torch.library.custom_op("trtllm::pp_recv", mutates_args=("tensors", ))
+def pp_recv(tensors: List[torch.Tensor]) -> None:
     """Receive tensors from previous pp rank."""
-    _pp_comm.recv(tensor)
+    for tensor in tensors:
+        _pp_comm.recv(tensor)
 
 
-def pp_send(tensor):
+@torch.library.custom_op("trtllm::pp_send", mutates_args=("tensors", ))
+def pp_send(tensors: List[torch.Tensor]) -> None:
     """Send tensors to next pp rank."""
     _pp_comm.send(tensor)
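
Exposing the point-to-point helpers as torch custom ops is what keeps them inside the Dynamo graph instead of forcing a graph break on an opaque Python call; `mutates_args` tells the compiler the listed tensors are written in place rather than returned. A generic, self-contained illustration of the same pattern (a hypothetical `demo::` namespace, not the commit's code):

```python
from typing import List

import torch


@torch.library.custom_op("demo::pp_send_like", mutates_args=("tensors", ))
def pp_send_like(tensors: List[torch.Tensor]) -> None:
    # Stand-in for a real point-to-point send: the op returns nothing and only
    # touches its inputs, which is exactly what mutates_args declares.
    for t in tensors:
        t.add_(0)  # placeholder side effect


@pp_send_like.register_fake
def _(tensors: List[torch.Tensor]) -> None:
    # Nothing to fabricate for tracing; the signature just has to match.
    return None


buffers = [torch.zeros(4), torch.zeros(4)]
torch.ops.demo.pp_send_like(buffers)  # callable like any other torch op
```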

tensorrt_llm/_torch/models/modeling_utils.py
Lines changed: 5 additions & 5 deletions

@@ -170,11 +170,12 @@ def forward_after_recv_fn(
         residual=...,
         **kwargs,
     ):
-        pp_recv(hidden_states)
         if residual is not ...:
             if residual is None:
                 residual = torch.empty_like(hidden_states)
-            pp_recv(residual)
+            pp_recv([hidden_states, residual])
+        else:
+            pp_recv([hidden_states])
         return forward_fn(
             position_ids,
             hidden_states,
@@ -207,11 +208,10 @@ def forward_before_send_fn(
         )
         if residual is not ...:
             hidden_states, residual = output
-            pp_send(hidden_states)
-            pp_send(residual)
+            pp_send([hidden_states, residual])
         else:
             hidden_states = output
-            pp_send(hidden_states)
+            pp_send([hidden_states])
         return output
 
     forward_before_send_fn.__wrapped_by_forward_before_send__ = True
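
With the batched helpers, each pipeline boundary contributes a single mutating `pp_send`/`pp_recv` node to the compiled graph instead of two. A condensed paraphrase of the send-side logic above, written as a standalone helper (the real wrapper keys on a `residual is not ...` sentinel; a tuple check stands in for it here):

```python
from typing import Callable, List

import torch


def send_boundary(output, pp_send: Callable[[List[torch.Tensor]], None]) -> None:
    # Decoders that return (hidden_states, residual) ship both tensors in one
    # batched call; otherwise only hidden_states is sent.
    if isinstance(output, tuple):
        hidden_states, residual = output
        pp_send([hidden_states, residual])
    else:
        pp_send([output])
```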

tensorrt_llm/_torch/pyexecutor/model_engine.py
Lines changed: 3 additions & 2 deletions

@@ -364,7 +364,8 @@ def __init__(
                 _torch_compile_piecewise_cuda_graph,
                 capture_num_tokens=self._piecewise_cuda_graph_num_tokens,
                 max_num_streams=pytorch_backend_config.
-                torch_compile_max_num_streams)
+                torch_compile_max_num_streams,
+                mapping=self.mapping)
             if isinstance(self.model, DecoderModelForCausalLM):
                 self.model.model = torch.compile(
                     self.model.model,
@@ -2496,7 +2497,7 @@ def _forward_step_mm_encoder_only(
         return {'mm_embeddings': mm_embeddings, 'logits': None}
 
     def _init_userbuffers(self, hidden_size):
-        if self.mapping.tp_size <= 1:
+        if self.mapping.tp_size <= 1 or self.mapping.pp_size > 1:
             return False
 
         # Disable UB for unsupported platforms
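
Besides threading `self.mapping` into the compile backend, the engine now skips userbuffers initialization whenever pipeline parallelism is active. The changed condition condenses to roughly this predicate (hypothetical helper name, same logic as the diff line):

```python
def userbuffers_allowed(mapping) -> bool:
    # Userbuffer-based allreduce fusion needs tensor parallelism and is now
    # turned off as soon as the run is pipeline parallel.
    return mapping.tp_size > 1 and mapping.pp_size == 1
```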

tests/integration/defs/accuracy/test_llm_api_pytorch.py
Lines changed: 0 additions & 17 deletions

@@ -113,11 +113,6 @@ def test_bfloat16(self, attn_backend, torch_compile):
                              ids=["tp4", "tp2pp2", "pp4"])
     def test_bfloat16_4gpus(self, tp_size, pp_size, attn_backend,
                             torch_compile):
-        if torch_compile and pp_size > 1:
-            pytest.skip(
-                "Pipeline parallel with torch.compile is not supported yet.\n"
-                "Issue: Unfusing flashinfer_fused_add_rmsnorm causes outputs to be "
-                "discarded at graph breaks.")
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=True,
@@ -1187,8 +1182,6 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
     def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                             attention_dp, cuda_graph, overlap_scheduler,
                             torch_compile):
-        if torch_compile and pp_size > 1:
-            pytest.skip("PP with torch.compile is not supported yet.")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -1226,8 +1219,6 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
     @parametrize_with_ids("mtp", ["disable", "eagle", "vanilla"])
     def test_fp8_block_scales(self, mtp, fp8kv, attention_dp, cuda_graph,
                               overlap_scheduler, torch_compile):
-        if torch_compile and mtp != "disable":
-            pytest.skip("https://nvbugs/5252313")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -1280,8 +1271,6 @@ def test_cute_dsl_fp8_block_scales(
         overlap_scheduler,
         torch_compile,
     ):
-        if torch_compile and attention_dp:
-            pytest.skip("https://nvbugs/5252559")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         torch_compile_config = (TorchCompileConfig(
             enable_fullgraph=True,
@@ -1384,8 +1373,6 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
     def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                                     fp8kv, attention_dp, cuda_graph,
                                     overlap_scheduler, torch_compile):
-        if torch_compile and pp_size > 1:
-            pytest.skip("PP with torch.compile is not supported yet.")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -1446,8 +1433,6 @@ def test_cute_dsl_fp8_block_scales_4gpus(
         overlap_scheduler,
         torch_compile,
     ):
-        if torch_compile and pp_size > 1:
-            pytest.skip("PP with torch.compile is not supported yet.")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         torch_compile_config = (TorchCompileConfig(
             enable_fullgraph=True,
@@ -1669,8 +1654,6 @@ def test_nvfp4_batch_waiting(self, torch_compile, fp8kv, cuda_graph,
     def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
                          overlap_scheduler, tp_size, pp_size, ep_size,
                          torch_compile, mtp_nextn, moe_backend):
-        if torch_compile and pp_size > 1:
-            pytest.skip("PP with torch.compile is not supported yet.")
         if moe_backend == "TRTLLM" and (get_sm_version() == 120
                                         or get_sm_version() == 121):
             pytest.skip(
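
With these skips removed, parametrizations such as `tp2pp2` and `pp4` now exercise torch.compile end to end. The configs visible in the tests are built roughly as below; only arguments that appear in this diff are used, the import path is assumed, and how the objects reach the `LLM` under test is omitted because it is not shown here:

```python
from tensorrt_llm.llmapi import KvCacheConfig, TorchCompileConfig  # import path assumed

torch_compile_config = TorchCompileConfig(
    enable_fullgraph=True,
    enable_piecewise_cuda_graph=True,  # flags taken from the test bodies above
)
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
# ...then passed to the LLM under test together with the tp_size/pp_size parametrization.
```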

tests/unittest/_torch/multi_gpu/test_ar_residual_norm.py
Lines changed: 3 additions & 1 deletion

@@ -66,7 +66,9 @@ def row_linear_residual_norm_fusion_forward(
         x: torch.Tensor, residual: torch.Tensor, hidden_size: int,
         dtype: torch.dtype, tensor_parallel_size: int,
         tensor_parallel_rank: int, weights: torch.Tensor, fused_add_norm: bool):
-    backend = Backend()
+    backend = Backend(mapping=Mapping(world_size=tensor_parallel_size,
+                                      tp_size=tensor_parallel_size,
+                                      rank=tensor_parallel_rank))
     x = x.cuda()
     residual = residual.cuda()
     norm_weight = torch.randn((hidden_size, ), dtype=dtype, device="cuda")
