
Commit c0596e4

2ez4bz authored and usberkeley committed
[NVIDIA#9150][feat] Add code for nano v3 to custom implementation in AD (NVIDIA#9465)
* Why? We would like to show an alternative to monkey-patching in AutoDeploy.
* What? This commit builds on the existing custom model implementation for NemotronH and adds the bits relevant for MoE layers.

Part of NVIDIA#9150.

Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
1 parent ff44c3f commit c0596e4
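For reference, the "alternative to monkey-patching" here is a self-contained model class that AutoDeploy's factory can pick up via an explicit registration. The sketch below only mirrors the commented-out TODO at the bottom of modeling_nemotron_h.py in this diff; it is not enabled in this commit and is shown purely to illustrate the intended wiring.

# Sketch only: mirrors the commented-out registration in modeling_nemotron_h.py below.
# Not active in this commit; the existing monkey-patches still apply until they are removed.
from tensorrt_llm._torch.auto_deploy.models.hf import AutoModelForCausalLMFactory
from tensorrt_llm._torch.auto_deploy.models.modeling_nemotron_h import NemotronHForCausalLM

# Map the `NemotronHConfig` config class name to the custom implementation so the factory
# builds NemotronHForCausalLM (via `_from_config`) instead of patching the remote-code model.
AutoModelForCausalLMFactory.register_custom_model_cls("NemotronHConfig", NemotronHForCausalLM)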

File tree: 4 files changed, +145 -13 lines changed


tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/torch_moe.py

Lines changed: 3 additions & 1 deletion

@@ -45,7 +45,9 @@ def _template_moe(
         valid_mask, selected_experts, torch.full_like(selected_experts, num_experts)
     )
     # Create one-hot encoding with an extra class.
-    one_hot = F.one_hot(selected_experts_fixed, num_classes=num_experts + 1)
+    # NOTE: `F.one_hot` only accepts `LongTensor` as an input, and will throw an error if the tensor is of another
+    # dtype, even if `torch.int32`.
+    one_hot = F.one_hot(selected_experts_fixed.long(), num_classes=num_experts + 1)
     expert_mask = one_hot[..., :num_experts].permute(2, 1, 0)
 
     for expert_idx in range(num_experts):
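The NOTE added above can be reproduced outside this repository: `F.one_hot` only accepts int64 (`LongTensor`) indices, so an `int32` tensor of selected experts has to be cast first. A minimal standalone check:

import torch
import torch.nn.functional as F

idx_int32 = torch.tensor([0, 2, 1], dtype=torch.int32)
try:
    F.one_hot(idx_int32, num_classes=4)  # rejected: one_hot requires an int64 index tensor
except RuntimeError as err:
    print(f"int32 rejected: {err}")

one_hot = F.one_hot(idx_int32.long(), num_classes=4)  # cast to int64 first, as in the diff above
print(one_hot.shape)  # torch.Size([3, 4])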

tensorrt_llm/_torch/auto_deploy/models/hf.py

Lines changed: 1 addition & 0 deletions

@@ -222,6 +222,7 @@ def _build_model(self, device: DeviceLikeType) -> nn.Module:
         if custom_model_cls is not None:
             # `_from_config` has some behavior we would like to use where possible. It is
             # defined in the `PreTrainedModel` mixin.
+            ad_logger.info(f"Using custom model implementation {custom_model_cls}")
             if not hasattr(custom_model_cls, "_from_config"):
                 raise ValueError(
                     f"`{custom_model_cls.__name__}` must have a `_from_config` class method. "
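The new log line fires on the branch where a registered custom class is used; that branch ultimately relies on the `_from_config` classmethod inherited from `PreTrainedModel`, which builds the model from a config alone without loading checkpoint weights. A hedged illustration of that call (the model id is the one used in the tests below; the factory's device and dtype plumbing is omitted):

from transformers import AutoConfig
from tensorrt_llm._torch.auto_deploy.models.modeling_nemotron_h import NemotronHForCausalLM

# Illustration only: construct the custom model from its config, no weights downloaded.
cfg = AutoConfig.from_pretrained("nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3", trust_remote_code=True)
model = NemotronHForCausalLM._from_config(cfg)  # classmethod provided by the PreTrainedModel mixin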

tensorrt_llm/_torch/auto_deploy/models/modeling_nemotron_h.py

Lines changed: 55 additions & 2 deletions

@@ -32,6 +32,11 @@
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import ModelOutput
 
+from tensorrt_llm._torch.auto_deploy.models.patches.nemotron_h import (
+    _nemotron_h_moe_forward,
+    _nemotron_h_topk_router_forward,
+)
+
 
 class MambaRMSNormGated(torch.nn.Module):
     def __init__(self, hidden_size, group_size, eps=1e-5):

@@ -261,6 +266,8 @@ def __init__(self, config, layer_idx)
             self.mixer = NemotronHAttention(config, layer_idx=layer_idx)
         elif self.block_type == "mlp":
             self.mixer = NemotronHMLP(config, layer_idx=layer_idx)
+        elif self.block_type == "moe":
+            self.mixer = NemotronHMOE(config, layer_idx=layer_idx)
         else:
             raise ValueError(f"Invalid layer pattern {config.hybrid_override_pattern[layer_idx]}")

@@ -277,12 +284,12 @@ def forward(self, hidden_states)
 
 # Copied from transformers.models.nemotron.modeling_nemotron Nemotron->NemotronH
 class NemotronHMLP(nn.Module):
-    def __init__(self, config, layer_idx: int):
+    def __init__(self, config, layer_idx: int, intermediate_size: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
         self.hidden_size = config.hidden_size
-        self.intermediate_size = config.intermediate_size
+        self.intermediate_size = intermediate_size or config.intermediate_size
         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
         self.act_fn = ACT2FN[config.mlp_hidden_act]

@@ -291,6 +298,50 @@ def forward(self, x)
         return self.down_proj(self.act_fn(self.up_proj(x)))
 
 
+class NemotronHMOE(nn.Module):
+    def __init__(self, config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.experts = nn.ModuleList(
+            [
+                NemotronHMLP(
+                    config, intermediate_size=config.moe_intermediate_size, layer_idx=layer_idx
+                )
+                for _ in range(config.n_routed_experts)
+            ]
+        )
+        self.gate = NemotronHTopkRouter(config)
+        self.shared_experts = NemotronHMLP(
+            config=config,
+            intermediate_size=config.moe_shared_expert_intermediate_size,
+            layer_idx=layer_idx,
+        )
+
+    # TODO: inline code from `_nemotron_h_moe_forward` when removing patches.
+    forward = _nemotron_h_moe_forward
+
+
+class NemotronHTopkRouter(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.top_k = config.num_experts_per_tok
+        self.n_routed_experts = config.n_routed_experts
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.n_group = config.n_group
+        self.topk_group = config.topk_group
+        self.norm_topk_prob = config.norm_topk_prob
+
+        self.weight = nn.Parameter(
+            torch.empty((self.n_routed_experts, config.hidden_size), dtype=torch.float32)
+        )
+        self.register_buffer(
+            "e_score_correction_bias", torch.zeros(self.n_routed_experts, dtype=torch.float32)
+        )
+
+    forward = _nemotron_h_topk_router_forward
+
+
 # Copied from transformers.models.llama.modeling_llama.repeat_kv
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     """

@@ -544,4 +595,6 @@ def forward(
 
 
 # TODO: uncomment after removing patches (and make sure it is imported in `__init__.py`).
+# from tensorrt_llm._torch.auto_deploy.models.hf import AutoModelForCausalLMFactory
+#
 # AutoModelForCausalLMFactory.register_custom_model_cls("NemotronHConfig", NemotronHForCausalLM)
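The new `NemotronHMOE` block delegates its forward to `_nemotron_h_moe_forward` from the patches module, which is not shown in this diff. Purely as an illustration of the wiring above (gate scores, top-k routed experts, plus always-on shared experts), here is a generic top-k MoE forward sketch; it is not the actual patch, and it ignores the grouped routing, `e_score_correction_bias`, and `routed_scaling_factor` handling that the router attributes suggest:

import torch
import torch.nn.functional as F


def toy_topk_moe_forward(hidden_states, gate_weight, experts, shared_experts, top_k=2):
    """Generic top-k routing sketch (NOT `_nemotron_h_moe_forward`)."""
    batch, seq, hidden = hidden_states.shape
    x = hidden_states.reshape(-1, hidden)                       # flatten to (tokens, hidden)
    scores = F.linear(x.float(), gate_weight).softmax(dim=-1)   # (tokens, n_routed_experts)
    weights, selected = torch.topk(scores, top_k, dim=-1)
    weights = weights / weights.sum(dim=-1, keepdim=True)       # renormalize over selected experts

    out = torch.zeros_like(x)
    for expert_idx, expert in enumerate(experts):
        token_ids, slot_ids = torch.where(selected == expert_idx)
        if token_ids.numel() == 0:
            continue
        scale = weights[token_ids, slot_ids].unsqueeze(-1).to(x.dtype)
        out[token_ids] += scale * expert(x[token_ids])           # routed expert contribution

    out = out + shared_experts(x)                                # shared experts see every token
    return out.reshape(batch, seq, hidden)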

tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py

Lines changed: 86 additions & 10 deletions

@@ -1,19 +1,45 @@
+import functools
 import types
 
 import pytest
 import torch
 from _model_test_utils import _hf_model_dir_or_hub_id
 from transformers import AutoConfig
 
+from tensorrt_llm._torch.auto_deploy.models.modeling_nemotron_h import NemotronHForCausalLM
 from tensorrt_llm._torch.auto_deploy.models.patches.nemotron_h import (
     _from_config_original,
     _nemotron_h_moe_forward,
 )
 
-torch.manual_seed(42)
+_BATCH_AND_SEQUENCE_TEST_CASES = ((2, 6), (1, 8))
 
 
-def _load_nemotron_moe_layer(model_name_or_path: str):
+@pytest.fixture(scope="function", autouse=True)
+def set_seed():
+    torch.manual_seed(42)
+
+
+def skip_on_no_hf_access(func):
+    """Decorator for skipping tests that fail due to HF access issues.
+
+    This allows us to share the same test code for CI (where access may be restricted, especially for private
+    repositories) and locally.
+    """
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except OSError as e:
+            if "not a valid model identifier" in str(e):
+                pytest.skip("Test skipped due to (no) HF access.")
+            raise
+
+    return wrapper
+
+
+def _load_nemotron_moe_layer(model_name_or_path: str, custom_model_cls=None):
     """
     Build a tiny NemotronH model (1 layer, small dims) and return the first NemotronHMOE module.
     """

@@ -34,21 +60,37 @@ def _load_nemotron_moe_layer(model_name_or_path: str):
     cfg.num_key_value_heads = 2
     cfg.ssm_state_size = 32
 
-    model = _from_config_original(cfg, trust_remote_code=True)
+    if custom_model_cls is None:
+        model = _from_config_original(cfg, trust_remote_code=True)
+    else:
+        model = custom_model_cls._from_config(cfg)
     model.eval()
 
     nemotron_moe = None
-    for name, mod in model.named_modules():
+    for _, mod in model.named_modules():
         if type(mod).__name__ == "NemotronHMOE":
             nemotron_moe = mod
             break
 
     if nemotron_moe is None:
         raise RuntimeError("NemotronHMOE layer not found. Check your model id or config.")
 
+    _set_gate_weights(nemotron_moe)
+
     return nemotron_moe
 
 
+def _set_gate_weights(module):
+    # This helper function is necessary because the `weight` parameter of the `NemotronHTopkRouter`
+    # is initialized as `torch.empty` in the original model code, which no manner of random seed
+    # setting will have any effect on. We therefore set it like the below to ensure the
+    # reproducibility of the tests.
+    for _, mod in module.named_modules():
+        if type(mod).__name__ == "NemotronHTopkRouter":
+            if hasattr(mod, "weight"):
+                mod.weight = torch.nn.Parameter(torch.randn_like(mod.weight))
+
+
 @pytest.mark.parametrize(
     "model_name",
     [

@@ -57,10 +99,11 @@ def _load_nemotron_moe_layer(model_name_or_path: str):
         ),
     ],
 )
-@pytest.mark.parametrize("B,S", [(2, 6), (1, 8)])
+@pytest.mark.parametrize("B,S", _BATCH_AND_SEQUENCE_TEST_CASES)
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
+@torch.no_grad()
+@skip_on_no_hf_access
 def test_nemotronh_moe_patch_forward(model_name, B, S, dtype):
-    pytest.skip("Skipping due to permission issue")
     device = "cuda"
 
     module = _load_nemotron_moe_layer(model_name)

@@ -69,12 +112,45 @@ def test_nemotronh_moe_patch_forward(model_name, B, S, dtype):
     H = module.config.hidden_size
     x = torch.randn(B, S, H, device=device, dtype=dtype)
 
-    with torch.no_grad():
-        ref = module(x)
+    ref = module(x)
 
     module.forward = types.MethodType(_nemotron_h_moe_forward, module)
-    with torch.no_grad():
-        test = module(x)
+    test = module(x)
+
+    rtol = 0.05
+    atol = 0.05
+
+    torch.testing.assert_close(test, ref, rtol=rtol, atol=atol)
+
+
+@pytest.mark.parametrize(
+    "model_name",
+    [
+        _hf_model_dir_or_hub_id(
+            "NVIDIA-Nemotron-Nano-31B-A3-v3", "nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3"
+        ),
+    ],
+)
+@pytest.mark.parametrize("B,S", _BATCH_AND_SEQUENCE_TEST_CASES)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@torch.no_grad()
+@skip_on_no_hf_access
+def test_nemotronh_moe_custom_implementation(model_name, B, S, dtype):
+    device = "cuda"
+
+    module = _load_nemotron_moe_layer(model_name)
+    module.to(device)
+
+    H = module.config.hidden_size
+    x = torch.randn(B, S, H, device=device, dtype=dtype)
+
+    ref = module(x)
+
+    new_module = _load_nemotron_moe_layer(model_name, custom_model_cls=NemotronHForCausalLM)
+    new_module.to(device)
+    new_module.load_state_dict(module.state_dict())
+
+    test = new_module(x)
 
     rtol = 0.05
     atol = 0.05
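One detail worth making concrete about `_set_gate_weights`: `torch.empty` returns uninitialized memory, so `torch.manual_seed` (including the `set_seed` fixture above) has no effect on the router's `weight` parameter, whereas `torch.randn_like` draws from the seeded generator and is reproducible. A quick standalone illustration:

import torch

torch.manual_seed(42)
a = torch.empty(4)   # uninitialized memory: contents are arbitrary and not governed by the seed

torch.manual_seed(42)
b = torch.randn(4)   # drawn from the seeded RNG
torch.manual_seed(42)
c = torch.randn(4)

print(torch.equal(b, c))  # True: seeded sampling is reproducible
print(a)                  # arbitrary values; generally differs between runs and allocations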
