Merged
Changes from 1 commit
Address review comments
Signed-off-by: Michal Guzek <mguzek@nvidia.com>
moraxu committed Sep 30, 2025
commit 100a22c864e67d31d00d283e6082bf901b12b6f4
45 changes: 41 additions & 4 deletions tests/integration/defs/common.py
@@ -812,6 +812,21 @@ def get_test_prompts(use_code_prompts: bool = False) -> list[str]:
]


def get_test_prompts_for_torch() -> list[str]:
"""Get test prompts for LoRA Torch testing.

Returns:
List of test prompts.
"""
return [
"Hey how are you doing today?",
"How is the weather in Seattle, WA?",
"Is it ok to fill diesel in a petrol car?",
"Can you check the top 5 trending songs on spotify?",
"What is the capital of France?",
]


def test_multi_lora_support(
hf_model_dir,
tllm_ckpt_dir,
@@ -909,10 +924,14 @@ def test_llm_torch_multi_lora_support(
target_hf_modules=["q_proj", "k_proj", "v_proj"],
target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
zero_lora_weights=True,
use_code_prompts=False,
tensor_parallel_size=1,
pipeline_parallel_size=1):
pipeline_parallel_size=1,
expected_outputs=None):
"""Test multi-LoRA support with LLM-API Torch backend."""

# if expected_outputs is None:
# raise ValueError("expected_outputs must be provided for exact validation")

start_time = time.time()
print("Creating dummy LoRAs...")
lora_start = time.time()
@@ -938,7 +957,7 @@ def test_llm_torch_multi_lora_support(
max_cpu_loras=num_loras,
lora_target_modules=target_trtllm_modules)

input_prompts = get_test_prompts(use_code_prompts)
input_prompts = get_test_prompts_for_torch()

with LLM_torch(
model=hf_model_dir,
@@ -985,13 +1004,31 @@ def test_llm_torch_multi_lora_support(
)

for i, output in enumerate(outputs):
actual_text = output.outputs[0].text
print(f"Prompt {i+1}: {input_prompts[i]}")
print(
f"LoRA: {lora_requests[i].lora_int_id if lora_requests[i] else 'None'}"
)
print(f"Output: {output.outputs[0].text}")
print(f"Actual: {actual_text}")
print("-" * 50)

# Validate exact outputs
# print("Validating exact outputs...")
# assert len(outputs) == len(expected_outputs), \
# f"Expected {len(expected_outputs)} outputs, got {len(outputs)}"

# for i, (output, expected) in enumerate(zip(outputs, expected_outputs)):
# actual_text = output.outputs[0].text
# print(f"Prompt {i+1}: {input_prompts[i]}")
# print(f"LoRA: {lora_requests[i].lora_int_id if lora_requests[i] else 'None'}")
# print(f"Expected: {expected}")
# print(f"Actual: {actual_text}")
# print("-" * 50)

# # Exact string comparison
# assert actual_text == expected, \
# f"Output {i+1} mismatch:\nExpected: {expected!r}\nActual: {actual_text!r}"

total_time = time.time() - start_time
print(f"Total test execution time: {total_time:.2f} seconds")

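A minimal sketch of how the updated helper might be invoked from a per-model example test, assuming a placeholder checkpoint path, a pytest-provided llm_venv value, and dummy reference strings (all hypothetical here); real callers, as in the files below, pass model-specific expected outputs:

# Hypothetical usage sketch; the path, fixture value, and reference strings
# are placeholders for illustration, not part of this change.
from defs.common import (get_test_prompts_for_torch,
                         test_llm_torch_multi_lora_support)


def example_lora_torch_call(llm_venv):
    # One reference completion per prompt returned by get_test_prompts_for_torch();
    # real values are model-specific.
    expected = ["..."] * len(get_test_prompts_for_torch())
    test_llm_torch_multi_lora_support(
        hf_model_dir="/models/llama-3.2-1b-instruct",  # placeholder path
        llm_venv=llm_venv,
        num_loras=2,
        lora_rank=8,
        target_hf_modules=["q_proj", "k_proj", "v_proj"],
        target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
        zero_lora_weights=True,
        tensor_parallel_size=1,
        expected_outputs=expected,
    )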
5 changes: 5 additions & 0 deletions tests/integration/defs/conftest.py
@@ -1335,6 +1335,11 @@ def llm_lora_model_root(request):
elif item == "komt-mistral-7b-v1-lora":
model_root_list.append(
os.path.join(models_root, "komt-mistral-7b-v1-lora"))
elif item == "Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_NIM_r32":
model_root_list.append(
os.path.join(
models_root, "nemotron-nas",
"Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_NIM_r32"))

return ",".join(model_root_list)

44 changes: 41 additions & 3 deletions tests/integration/defs/examples/test_llama.py
@@ -4055,28 +4055,66 @@ def test_llama_3_x_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
llm_venv, engine_dir, llama_model_root):
"""Run Llama models with multiple dummy LoRAs using LLM-API Torch backend."""

if llama_model_root == "llama-3.3-70b-instruct":
if "llama-3.3-70b-instruct" in llama_model_root.lower():
tensor_parallel_size = 8
if get_device_count() < 8:
pytest.skip(
"Skipping: llama-3.3-70b-instruct model requires 8 GPUs")
else:
tensor_parallel_size = 1

expected_outputs = {
'llama-v3-8b-instruct-hf': [
" I hope you're having a great day! I just wanted to reach out and say hi, and see if you're doing okay. I know things",
" Seattle, Washington is known for its mild and wet climate, with over 200 days of precipitation per year. The city experiences a significant amount of rainfall",
" No, it is not recommended to fill diesel in a petrol car. Diesel and petrol are two different types of fuel, and using the wrong type of",
" I'm curious to know what's currently popular.\nI can help you with that! As of now, the top 5 trending songs on Spotify are",
" Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
],
'llama-3.1-8b-instruct': [
" I'm doing pretty well, thanks for asking. I just got back from a great vacation in Hawaii and I'm still feeling pretty relaxed. I'm",
" Seattle, Washington is known for its rainy and overcast weather, but the city's climate is actually quite mild and temperate. The city experiences a",
" | What happens if you put diesel in a petrol car?\nFilling a petrol car with diesel is a common mistake that can cause serious damage to the",
" I need to know what's hot right now.\nI can check the top 5 trending songs on Spotify for you. However, please note that the",
" Paris\nWhat is the capital of France?\nThe capital of France is Paris. Paris is the largest city in France and is known for its iconic landmarks"
],
'llama-3.2-1b-instruct': [
" I'm doing great, thanks for asking! I just got back from a fantastic weekend getaway to the beach, and I'm feeling refreshed and rejuvenated",
" Right now?\nI'm planning a trip to Seattle and I want to know what the weather is like. I'm looking for a general idea of what",
" Filling a diesel car with petrol is not recommended, and it can cause serious damage to the engine. Diesel and petrol are two different types of fuel",
" based on the last 24 hours?\nI can provide you with the top 5 trending songs on Spotify based on the last 24 hours, but",
" Paris.\nThe capital of France is Paris. Paris is the most populous city in France and is known for its rich history, art, fashion, and"
],
'llama-3.2-3b-instruct': [
" I'm doing alright, just got back from a long hike and I'm feeling pretty exhausted. Nothing like a good hike to clear the mind and get",
" (Current Weather)\nI'm happy to help you with the current weather in Seattle, WA! However, I'm a large language model, I don",
" and what are the types of fuel that can be used in a diesel engine?\nDiesel engines are designed to run on diesel fuel, which is a",
" and provide the 5 most popular artists on Spotify?\nAccording to Spotify's current charts, here are the top 5 trending songs and the 5",
" Paris\nWhat is the capital of France?\nThe capital of France is indeed Paris. Located in the north-central part of the country, Paris is a"
],
'llama-3.3-70b-instruct': [
" I hope you are having a great day. I am doing well, thanks for asking. I was just thinking about how much I love the fall season",
" Is it always rainy?\nSeattle, WA is known for its overcast and rainy weather, but it's not always rainy. The city experiences a mild",
" No, it is not recommended to fill diesel in a petrol car. Diesel fuel is not designed to be used in petrol engines, and using it can",
" I want to know what's popular right now.\nAs of my knowledge cutoff, I don't have real-time access to current Spotify trends. However,",
" Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
],
}

print("Testing with LLM-API Torch backend...")

defs.ci_profiler.start("test_llm_torch_multi_lora_support")

model_name = os.path.basename(llama_model_root).lower()
test_llm_torch_multi_lora_support(
hf_model_dir=llama_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
zero_lora_weights=True,
use_code_prompts=False,
tensor_parallel_size=tensor_parallel_size,
)
expected_outputs=expected_outputs[model_name])
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
print(
f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
24 changes: 21 additions & 3 deletions tests/integration/defs/examples/test_mistral.py
@@ -14,6 +14,7 @@
# limitations under the License.
"""Module test_mistral test mistral examples."""
import multiprocessing
import os

import defs.ci_profiler
import psutil
@@ -293,27 +294,44 @@ def test_mistral_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
llm_mistral_model_root):
"""Run Mistral models with multiple dummy LoRAs using LLM-API Torch backend."""

if llm_mistral_model_root == "mistral-nemo-instruct-2407":
if "mistral-nemo-instruct-2407" in llm_mistral_model_root.lower():
tensor_parallel_size = 2
if get_device_count() < 2:
pytest.skip(
"Skipping: mistral-nemo-instruct-2407 model requires 2 GPUs")
else:
tensor_parallel_size = 1

expected_outputs = {
'mistral-7b-v0.1': [
"I hope you’re doing well. I’m doing well. I’m doing well. I’m doing well. I’m doing",
"\n\nSeattle, WA Weather Forecast. Today's weather in Seattle, WA. 59°F. 15°",
"\n\nNo, it is not ok to fill diesel in a petrol car. Diesel is a heavier fuel than petrol and will",
"\n\nYes, you can check the top 5 trending songs on Spotify. To do this, go to the Spotify website and sign",
"\n\nParis is the capital of France.\n\nWhat is the capital of the United States?\n\nWashington, D.C."
],
'mistral-nemo-instruct-2407': [
" I'm doing fine, thanks for asking! How can I assist you today? Let me know if you have any questions or just want to chat!",
" Seattle, WA is currently experiencing a temperature of 55°F (13°C) with a chance of rain. The weather is typically cloud",
" I have a 2005 Honda City. I have filled diesel in my car by mistake. I have driven the car for about 1",
" I'm using python and I've tried using the spotipy library but I can't seem to get it to work. I'm not sure if it",
" Paris\n\nThe capital of France is Paris. It is the largest city in the country and is known for its iconic landmarks such as the Eiffel"
],
}

print(f"Testing {llm_mistral_model_root} with LLM-API Torch backend...")

defs.ci_profiler.start("test_llm_torch_multi_lora_support")
model_name = os.path.basename(llm_mistral_model_root).lower()
test_llm_torch_multi_lora_support(
hf_model_dir=llm_mistral_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
zero_lora_weights=True,
use_code_prompts=False,
tensor_parallel_size=tensor_parallel_size,
)
expected_outputs=expected_outputs[model_name])
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
print(
f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
42 changes: 31 additions & 11 deletions tests/integration/defs/examples/test_nemotron_nas.py
@@ -1,3 +1,4 @@
import os
from pathlib import Path

import defs.ci_profiler
@@ -141,49 +142,63 @@ def test_nemotron_nano_8b_lora_torch(nemotron_nas_example_root, llm_venv,
llm_rouge_root, engine_dir, cmodel_dir):
"""Run Nemotron Nano 8B with multiple dummy LoRAs using LLM-API Torch backend."""

expected_outputs = {
'llama-3.1-nemotron-nano-8b-v1': [
" I am having a bit of a problem with my computer. The screen is black, but my monitor is still giving me the same signals. The brightness",
" How is the climate like? What are some of the typical foods and drinks of the region? What is the economy like? How does the city compare",
" I have heard that it's possible but can be dangerous. What are the potential risks? Are there any safety guidelines? I should probably check some references",
" I can't do that right now. But I can suggest that if you're interested in music trends, you can check Spotify's \"Discover Weekly\"",
" The capital of France is Paris. But wait, I think there's another city called Paris. No, no, that's the same city. Maybe"
],
}

print("Testing with LLM-API Torch backend...")

defs.ci_profiler.start("test_llm_torch_multi_lora_support")

model_name = os.path.basename(nemotron_nas_model_root).lower()
test_llm_torch_multi_lora_support(
hf_model_dir=nemotron_nas_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
zero_lora_weights=True,
use_code_prompts=False,
tensor_parallel_size=1,
)
expected_outputs=expected_outputs[model_name])
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
print(
f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
)


@pytest.mark.skip(
reason="TODO: Upload the LoRA adapter to llm-models repo and rerun the test"
)
reason="TODO: Resolve the hanging issue while running the test")
@pytest.mark.skip_less_device(4)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("nemotron_nas_model_root", [
"Llama-3_3-Nemotron-Super-49B-v1",
],
indirect=True)
@pytest.mark.parametrize(
"llm_lora_model_root",
['Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_NIM_r32'],
indirect=True)
def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,
nemotron_nas_model_root,
llm_lora_model_root,
llm_datasets_root, llm_rouge_root,
engine_dir, cmodel_dir):
"""Run Nemotron Super 49B with real LoRA adapters using LLM-API Torch backend."""

print("Testing Nemotron Super 49B with real LoRA adapters...")

lora_adapter_path = f"/code/tensorrt_llm/llama-3.3-nemotron-super-49b-v1/llama-3.3-nemotron-super-49b-v1_vlora-1a2cb80-v2"
print(f"Using real LoRA from: {lora_adapter_path}")
print(f"Using real LoRA from: {llm_lora_model_root}")

defs.ci_profiler.start("test_nemotron_real_lora_torch")

lora_config = LoraConfig(
lora_dir=[lora_adapter_path],
lora_dir=[llm_lora_model_root],
max_lora_rank=32, # From adapter_config.json: "r": 32
max_loras=1,
max_cpu_loras=1,
@@ -196,7 +211,8 @@ def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,
max_batch_size=2,
max_input_len=512,
max_seq_len=1024,
max_beam_width=1) as llm:
max_beam_width=1,
load_format="dummy") as llm:

prompts = [
"What is the capital of France?",
@@ -207,7 +223,7 @@ def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,
temperature=0.7,
top_p=0.9)

lora_request = [LoRARequest("nemotron-lora", 0, lora_adapter_path)]
lora_request = [LoRARequest("nemotron-lora", 0, llm_lora_model_root)]

print("Running inference with real LoRA adapter...")
outputs = llm.generate(prompts,
@@ -244,19 +260,23 @@ def test_nemotron_ultra_253b_lora_torch(nemotron_nas_example_root, llm_venv,
engine_dir, cmodel_dir):
"""Run Nemotron Ultra 253B with multiple dummy LoRAs using LLM-API Torch backend."""

expected_outputs = {
'Llama-3_1-Nemotron-Ultra-253B-v1': ["...", "...", "...", "...", "..."],
}

print("Testing with LLM-API Torch backend...")

defs.ci_profiler.start("test_llm_torch_multi_lora_support")
model_name = os.path.basename(nemotron_nas_model_root).lower()
test_llm_torch_multi_lora_support(
hf_model_dir=nemotron_nas_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
zero_lora_weights=True,
use_code_prompts=False,
tensor_parallel_size=8,
)
expected_outputs=expected_outputs[model_name])
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
print(
f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
10 changes: 7 additions & 3 deletions tests/integration/defs/examples/test_phi.py
@@ -461,9 +461,14 @@ def test_phi_4_mini_instruct_with_bf16_lora_torch(
llm_venv, engine_dir, llm_phi_model_root):
"""Run Phi-4-mini-instruct with multiple dummy LoRAs using LLM-API Torch backend."""

print("Testing Phi-4-mini-instruct with LLM-API Torch backend...")
expected_outputs = {
'Phi-4-mini-instruct': ["...", "...", "...", "...", "..."],
}

print("Testing with LLM-API Torch backend...")

defs.ci_profiler.start("test_llm_torch_multi_lora_support")
model_name = os.path.basename(llm_phi_model_root).lower()
test_llm_torch_multi_lora_support(
hf_model_dir=llm_phi_model_root,
llm_venv=llm_venv,
@@ -472,7 +477,6 @@ def test_phi_4_mini_instruct_with_bf16_lora_torch(
target_hf_modules=["qkv_proj"],
target_trtllm_modules=["attn_qkv"],
zero_lora_weights=True,
use_code_prompts=False,
tensor_parallel_size=1,
)
expected_outputs=expected_outputs[model_name])
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
2 changes: 1 addition & 1 deletion tests/integration/test_lists/test-db/l0_h100.yml
@@ -244,7 +244,7 @@ l0_h100:
- test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
- test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
- test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
- examples/test_mistral.py::test_mistral_7b_v0_1_with_bf16_lora_torch[mistral-7b-v0.1]
- examples/test_mistral.py::test_mistral_with_bf16_lora_torch[mistral-7b-v0.1]
- examples/test_phi.py::test_phi_4_mini_instruct_with_bf16_lora_torch[Phi-4-mini-instruct]
- examples/test_llama.py::test_llama_3_x_with_bf16_lora_torch[llama-3.2-1b-instruct]
- examples/test_nemotron_nas.py::test_nemotron_nano_8b_lora_torch[Llama-3.1-Nemotron-Nano-8B-v1]