@@ -1506,7 +1506,7 @@ def get_runner(
             self.num_experts,
             self.top_k,
         )
-        instance_key = (tile_tokens_dim, )
+        instance_key = (tile_tokens_dim, self.act_type)
         if instance_key not in FP8FP4BlockScaleMoERunner.runner_dict:
             FP8FP4BlockScaleMoERunner.runner_dict[
                 instance_key] = torch.classes.trtllm.FP8FP4BlockScaleMoERunner(
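The one-line change above widens the key of the lazily built runner cache so the activation type participates in the lookup. A minimal sketch of that keyed-cache pattern, with illustrative names (`get_cached_runner`, `make_runner`) that are not part of the TensorRT-LLM API: every argument that influences how the runner is constructed must appear in the key, otherwise two call sites that differ only in `act_type` would silently share one cached instance.

from typing import Callable, Dict, Tuple

# Hypothetical cache; in the real code this role is played by
# FP8FP4BlockScaleMoERunner.runner_dict.
_runner_cache: Dict[Tuple[int, int], object] = {}


def get_cached_runner(tile_tokens_dim: int, act_type: int,
                      make_runner: Callable[[int, int], object]) -> object:
    # The key covers every constructor argument that can vary between calls;
    # dropping act_type is exactly the collision the diff above fixes.
    key = (tile_tokens_dim, act_type)
    if key not in _runner_cache:
        _runner_cache[key] = make_runner(tile_tokens_dim, act_type)
    return _runner_cache[key]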
@@ -1668,30 +1668,6 @@ def fp8_fp4_block_scale_moe_runner(
     return kernel_runner(inputs, tactic=best_tactic)
 
 
-def fp8_fp4_block_scale_fake_output_without_finalize(
-        hidden_states: Union[torch.Tensor, Fp4QuantizedTensor],
-        num_experts: int,
-        top_k: int,
-        routing_bias: Optional[torch.Tensor],
-):
-    num_tokens = hidden_states.shape[0]
-    hidden_size = hidden_states.shape[1]
-
-    tile_tokens_dim = calculate_tile_tokens_dim(num_tokens, num_experts, top_k)
-
-    expanded_row_count = num_tokens * top_k
-    max_padding_required = (tile_tokens_dim - 1) * num_experts
-    max_num_padded_tokens = fp4_utils.pad_up(
-        expanded_row_count + max_padding_required, tile_tokens_dim)
-    wt_dtype = routing_bias.dtype if routing_bias is not None else torch.bfloat16
-    return [
-        hidden_states.new_empty((max_num_padded_tokens, hidden_size),
-                                dtype=torch.bfloat16),
-        hidden_states.new_empty((num_tokens, top_k), dtype=wt_dtype),
-        hidden_states.new_empty((num_tokens, top_k), dtype=torch.int32)
-    ]
-
-
 @fp8_fp4_block_scale_moe_runner.register_fake
 def _(
     routing_logits,
@@ -1716,17 +1692,25 @@ def _(
     do_finalize,
     act_type,
 ) -> List[torch.Tensor]:
+
+    num_tokens = hidden_states.shape[0]
+    hidden_size = hidden_states.shape[1]
+
     if do_finalize:
-        num_tokens = hidden_states.shape[0]
-        hidden_size = hidden_states.shape[1]
         return [
             hidden_states.new_empty((num_tokens, hidden_size),
                                     dtype=torch.bfloat16)
         ]
 
-    return fp8_fp4_block_scale_fake_output_without_finalize(
-        hidden_states,
-        num_experts,
-        top_k,
-        routing_bias,
-    )
+    tile_tokens_dim = calculate_tile_tokens_dim(num_tokens, num_experts, top_k)
+    expanded_row_count = num_tokens * top_k
+    max_padding_required = (tile_tokens_dim - 1) * num_experts
+    max_num_padded_tokens = fp4_utils.pad_up(
+        expanded_row_count + max_padding_required, tile_tokens_dim)
+    wt_dtype = routing_bias.dtype if routing_bias is not None else torch.bfloat16
+    return [
+        hidden_states.new_empty((max_num_padded_tokens, hidden_size),
+                                dtype=torch.bfloat16),
+        hidden_states.new_empty((num_tokens, top_k), dtype=wt_dtype),
+        hidden_states.new_empty((num_tokens, top_k), dtype=torch.int32)
+    ]
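The last two hunks inline the no-finalize fake-output computation directly into the op's `register_fake` implementation instead of delegating to the removed helper. The sketch below shows the same pattern on a toy op, assuming the public `torch.library.custom_op` API; the op name `example::moe_permute`, the stand-in kernel body, and the local `pad_up` helper are illustrative, not the TensorRT-LLM implementation. The fake function only needs to produce empty tensors with the worst-case padded shape so the op can be traced without running the kernel.

import torch


def pad_up(value: int, multiple: int) -> int:
    # Round value up to the next multiple (same role as fp4_utils.pad_up above).
    return ((value + multiple - 1) // multiple) * multiple


@torch.library.custom_op("example::moe_permute", mutates_args=())
def moe_permute(hidden_states: torch.Tensor, num_experts: int, top_k: int,
                tile_tokens_dim: int) -> torch.Tensor:
    # Stand-in "real" implementation: just allocate the padded buffer.
    num_tokens, hidden_size = hidden_states.shape
    max_padded = pad_up(
        num_tokens * top_k + (tile_tokens_dim - 1) * num_experts,
        tile_tokens_dim)
    return hidden_states.new_zeros((max_padded, hidden_size),
                                   dtype=torch.bfloat16)


@moe_permute.register_fake
def _(hidden_states, num_experts, top_k, tile_tokens_dim):
    num_tokens, hidden_size = hidden_states.shape
    # Worst case: each of the num_experts groups is one token short of a full
    # tile, so up to (tile_tokens_dim - 1) * num_experts padding rows appear.
    expanded_row_count = num_tokens * top_k
    max_padded = pad_up(
        expanded_row_count + (tile_tokens_dim - 1) * num_experts,
        tile_tokens_dim)
    return hidden_states.new_empty((max_padded, hidden_size),
                                   dtype=torch.bfloat16)

For example, with num_tokens = 3, top_k = 2, num_experts = 8, and tile_tokens_dim = 8, the expanded row count is 6, the worst-case padding is 7 * 8 = 56, and pad_up(6 + 56, 8) = 64, so the fake output is a (64, hidden_size) bfloat16 tensor.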