Merged
Changes from 1 commit
Address review comments
Signed-off-by: Michal Guzek <mguzek@nvidia.com>
moraxu committed Sep 30, 2025
commit 100a22c864e67d31d00d283e6082bf901b12b6f4
45 changes: 41 additions & 4 deletions tests/integration/defs/common.py
@@ -812,6 +812,21 @@ def get_test_prompts(use_code_prompts: bool = False) -> list[str]:
]


def get_test_prompts_for_torch() -> list[str]:
"""Get test prompts for LoRA Torch testing.

Returns:
List of test prompts.
"""
return [
"Hey how are you doing today?",
"How is the weather in Seattle, WA?",
"Is it ok to fill diesel in a petrol car?",
"Can you check the top 5 trending songs on spotify?",
"What is the capital of France?",
]


def test_multi_lora_support(
hf_model_dir,
tllm_ckpt_dir,
@@ -909,10 +924,14 @@ def test_llm_torch_multi_lora_support(
target_hf_modules=["q_proj", "k_proj", "v_proj"],
target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
zero_lora_weights=True,
use_code_prompts=False,
tensor_parallel_size=1,
pipeline_parallel_size=1):
pipeline_parallel_size=1,
expected_outputs=None):
"""Test multi-LoRA support with LLM-API Torch backend."""

# if expected_outputs is None:
# raise ValueError("expected_outputs must be provided for exact validation")

start_time = time.time()
print("Creating dummy LoRAs...")
lora_start = time.time()
@@ -938,7 +957,7 @@ def test_llm_torch_multi_lora_support(
max_cpu_loras=num_loras,
lora_target_modules=target_trtllm_modules)

input_prompts = get_test_prompts(use_code_prompts)
input_prompts = get_test_prompts_for_torch()

with LLM_torch(
model=hf_model_dir,
@@ -985,13 +1004,31 @@ def test_llm_torch_multi_lora_support(
)

for i, output in enumerate(outputs):
actual_text = output.outputs[0].text
print(f"Prompt {i+1}: {input_prompts[i]}")
print(
f"LoRA: {lora_requests[i].lora_int_id if lora_requests[i] else 'None'}"
)
print(f"Output: {output.outputs[0].text}")
print(f"Actual: {actual_text}")
print("-" * 50)

# Validate exact outputs
# print("Validating exact outputs...")
# assert len(outputs) == len(expected_outputs), \
# f"Expected {len(expected_outputs)} outputs, got {len(outputs)}"

# for i, (output, expected) in enumerate(zip(outputs, expected_outputs)):
# actual_text = output.outputs[0].text
# print(f"Prompt {i+1}: {input_prompts[i]}")
# print(f"LoRA: {lora_requests[i].lora_int_id if lora_requests[i] else 'None'}")
# print(f"Expected: {expected}")
# print(f"Actual: {actual_text}")
# print("-" * 50)

# # Exact string comparison
# assert actual_text == expected, \
# f"Output {i+1} mismatch:\nExpected: {expected!r}\nActual: {actual_text!r}"

total_time = time.time() - start_time
print(f"Total test execution time: {total_time:.2f} seconds")

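A minimal sketch of how the updated helper might be invoked from a per-model example test, assuming a placeholder checkpoint path, a pytest-provided llm_venv value, and dummy reference strings (all hypothetical here); real callers, as in the files below, pass model-specific expected outputs:

# Hypothetical usage sketch; the path, fixture value, and reference strings
# are placeholders for illustration, not part of this change.
from defs.common import (get_test_prompts_for_torch,
                         test_llm_torch_multi_lora_support)


def example_lora_torch_call(llm_venv):
    # One reference completion per prompt returned by get_test_prompts_for_torch();
    # real values are model-specific.
    expected = ["..."] * len(get_test_prompts_for_torch())
    test_llm_torch_multi_lora_support(
        hf_model_dir="/models/llama-3.2-1b-instruct",  # placeholder path
        llm_venv=llm_venv,
        num_loras=2,
        lora_rank=8,
        target_hf_modules=["q_proj", "k_proj", "v_proj"],
        target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
        zero_lora_weights=True,
        tensor_parallel_size=1,
        expected_outputs=expected,
    )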
5 changes: 5 additions & 0 deletions tests/integration/defs/conftest.py
@@ -1335,6 +1335,11 @@ def llm_lora_model_root(request):
elif item == "komt-mistral-7b-v1-lora":
model_root_list.append(
os.path.join(models_root, "komt-mistral-7b-v1-lora"))
elif item == "Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_NIM_r32":
model_root_list.append(
os.path.join(
models_root, "nemotron-nas",
"Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_NIM_r32"))

return ",".join(model_root_list)

44 changes: 41 additions & 3 deletions tests/integration/defs/examples/test_llama.py
@@ -4055,28 +4055,66 @@ def test_llama_3_x_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
llm_venv, engine_dir, llama_model_root):
"""Run Llama models with multiple dummy LoRAs using LLM-API Torch backend."""

if llama_model_root == "llama-3.3-70b-instruct":
if "llama-3.3-70b-instruct" in llama_model_root.lower():
tensor_parallel_size = 8
if get_device_count() < 8:
pytest.skip(
"Skipping: llama-3.3-70b-instruct model requires 8 GPUs")
else:
tensor_parallel_size = 1

expected_outputs = {
'llama-v3-8b-instruct-hf': [
" I hope you're having a great day! I just wanted to reach out and say hi, and see if you're doing okay. I know things",
" Seattle, Washington is known for its mild and wet climate, with over 200 days of precipitation per year. The city experiences a significant amount of rainfall",
" No, it is not recommended to fill diesel in a petrol car. Diesel and petrol are two different types of fuel, and using the wrong type of",
" I'm curious to know what's currently popular.\nI can help you with that! As of now, the top 5 trending songs on Spotify are",
" Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
],
'llama-3.1-8b-instruct': [
" I'm doing pretty well, thanks for asking. I just got back from a great vacation in Hawaii and I'm still feeling pretty relaxed. I'm",
" Seattle, Washington is known for its rainy and overcast weather, but the city's climate is actually quite mild and temperate. The city experiences a",
" | What happens if you put diesel in a petrol car?\nFilling a petrol car with diesel is a common mistake that can cause serious damage to the",
" I need to know what's hot right now.\nI can check the top 5 trending songs on Spotify for you. However, please note that the",
" Paris\nWhat is the capital of France?\nThe capital of France is Paris. Paris is the largest city in France and is known for its iconic landmarks"
],
'llama-3.2-1b-instruct': [
" I'm doing great, thanks for asking! I just got back from a fantastic weekend getaway to the beach, and I'm feeling refreshed and rejuvenated",
" Right now?\nI'm planning a trip to Seattle and I want to know what the weather is like. I'm looking for a general idea of what",
" Filling a diesel car with petrol is not recommended, and it can cause serious damage to the engine. Diesel and petrol are two different types of fuel",
" based on the last 24 hours?\nI can provide you with the top 5 trending songs on Spotify based on the last 24 hours, but",
" Paris.\nThe capital of France is Paris. Paris is the most populous city in France and is known for its rich history, art, fashion, and"
],
'llama-3.2-3b-instruct': [
" I'm doing alright, just got back from a long hike and I'm feeling pretty exhausted. Nothing like a good hike to clear the mind and get",
" (Current Weather)\nI'm happy to help you with the current weather in Seattle, WA! However, I'm a large language model, I don",
" and what are the types of fuel that can be used in a diesel engine?\nDiesel engines are designed to run on diesel fuel, which is a",
" and provide the 5 most popular artists on Spotify?\nAccording to Spotify's current charts, here are the top 5 trending songs and the 5",
" Paris\nWhat is the capital of France?\nThe capital of France is indeed Paris. Located in the north-central part of the country, Paris is a"
],
'llama-3.3-70b-instruct': [
" I hope you are having a great day. I am doing well, thanks for asking. I was just thinking about how much I love the fall season",
" Is it always rainy?\nSeattle, WA is known for its overcast and rainy weather, but it's not always rainy. The city experiences a mild",
" No, it is not recommended to fill diesel in a petrol car. Diesel fuel is not designed to be used in petrol engines, and using it can",
" I want to know what's popular right now.\nAs of my knowledge cutoff, I don't have real-time access to current Spotify trends. However,",
" Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
],
}

print("Testing with LLM-API Torch backend...")

defs.ci_profiler.start("test_llm_torch_multi_lora_support")

model_name = os.path.basename(llama_model_root).lower()
test_llm_torch_multi_lora_support(
hf_model_dir=llama_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
zero_lora_weights=True,
use_code_prompts=False,
tensor_parallel_size=tensor_parallel_size,
)
expected_outputs=expected_outputs[model_name])
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
print(
f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
24 changes: 21 additions & 3 deletions tests/integration/defs/examples/test_mistral.py
@@ -14,6 +14,7 @@
# limitations under the License.
"""Module test_mistral test mistral examples."""
import multiprocessing
import os

import defs.ci_profiler
import psutil
@@ -293,27 +294,44 @@ def test_mistral_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
llm_mistral_model_root):
"""Run Mistral models with multiple dummy LoRAs using LLM-API Torch backend."""

if llm_mistral_model_root == "mistral-nemo-instruct-2407":
if "mistral-nemo-instruct-2407" in llm_mistral_model_root.lower():
tensor_parallel_size = 2
if get_device_count() < 2:
pytest.skip(
"Skipping: mistral-nemo-instruct-2407 model requires 2 GPUs")
else:
tensor_parallel_size = 1

expected_outputs = {
'mistral-7b-v0.1': [
"I hope you’re doing well. I’m doing well. I’m doing well. I’m doing well. I’m doing",
"\n\nSeattle, WA Weather Forecast. Today's weather in Seattle, WA. 59°F. 15°",
"\n\nNo, it is not ok to fill diesel in a petrol car. Diesel is a heavier fuel than petrol and will",
"\n\nYes, you can check the top 5 trending songs on Spotify. To do this, go to the Spotify website and sign",
"\n\nParis is the capital of France.\n\nWhat is the capital of the United States?\n\nWashington, D.C."
],
'mistral-nemo-instruct-2407': [
" I'm doing fine, thanks for asking! How can I assist you today? Let me know if you have any questions or just want to chat!",
" Seattle, WA is currently experiencing a temperature of 55°F (13°C) with a chance of rain. The weather is typically cloud",
" I have a 2005 Honda City. I have filled diesel in my car by mistake. I have driven the car for about 1",
" I'm using python and I've tried using the spotipy library but I can't seem to get it to work. I'm not sure if it",
" Paris\n\nThe capital of France is Paris. It is the largest city in the country and is known for its iconic landmarks such as the Eiffel"
],
}

print(f"Testing {llm_mistral_model_root} with LLM-API Torch backend...")

defs.ci_profiler.start("test_llm_torch_multi_lora_support")
model_name = os.path.basename(llm_mistral_model_root).lower()
test_llm_torch_multi_lora_support(
hf_model_dir=llm_mistral_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
zero_lora_weights=True,
use_code_prompts=False,
tensor_parallel_size=tensor_parallel_size,
)
expected_outputs=expected_outputs[model_name])
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
print(
f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
42 changes: 31 additions & 11 deletions tests/integration/defs/examples/test_nemotron_nas.py
@@ -1,3 +1,4 @@
import os
from pathlib import Path

import defs.ci_profiler
@@ -141,49 +142,63 @@ def test_nemotron_nano_8b_lora_torch(nemotron_nas_example_root, llm_venv,
llm_rouge_root, engine_dir, cmodel_dir):
"""Run Nemotron Nano 8B with multiple dummy LoRAs using LLM-API Torch backend."""

expected_outputs = {
'llama-3.1-nemotron-nano-8b-v1': [
" I am having a bit of a problem with my computer. The screen is black, but my monitor is still giving me the same signals. The brightness",
" How is the climate like? What are some of the typical foods and drinks of the region? What is the economy like? How does the city compare",
" I have heard that it's possible but can be dangerous. What are the potential risks? Are there any safety guidelines? I should probably check some references",
" I can't do that right now. But I can suggest that if you're interested in music trends, you can check Spotify's \"Discover Weekly\"",
" The capital of France is Paris. But wait, I think there's another city called Paris. No, no, that's the same city. Maybe"
],
}

print("Testing with LLM-API Torch backend...")

defs.ci_profiler.start("test_llm_torch_multi_lora_support")

model_name = os.path.basename(nemotron_nas_model_root).lower()
test_llm_torch_multi_lora_support(
hf_model_dir=nemotron_nas_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
zero_lora_weights=True,
use_code_prompts=False,
tensor_parallel_size=1,
)
expected_outputs=expected_outputs[model_name])
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
print(
f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
)


@pytest.mark.skip(
reason="TODO: Upload the LoRA adapter to llm-models repo and rerun the test"
)
reason="TODO: Resolve the hanging issue while running the test")
@pytest.mark.skip_less_device(4)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("nemotron_nas_model_root", [
"Llama-3_3-Nemotron-Super-49B-v1",
],
indirect=True)
@pytest.mark.parametrize(
"llm_lora_model_root",
['Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_NIM_r32'],
indirect=True)
def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,
nemotron_nas_model_root,
llm_lora_model_root,
llm_datasets_root, llm_rouge_root,
engine_dir, cmodel_dir):
"""Run Nemotron Super 49B with real LoRA adapters using LLM-API Torch backend."""

print("Testing Nemotron Super 49B with real LoRA adapters...")

lora_adapter_path = f"/code/tensorrt_llm/llama-3.3-nemotron-super-49b-v1/llama-3.3-nemotron-super-49b-v1_vlora-1a2cb80-v2"
print(f"Using real LoRA from: {lora_adapter_path}")
print(f"Using real LoRA from: {llm_lora_model_root}")

defs.ci_profiler.start("test_nemotron_real_lora_torch")

lora_config = LoraConfig(
lora_dir=[lora_adapter_path],
lora_dir=[llm_lora_model_root],
max_lora_rank=32, # From adapter_config.json: "r": 32
max_loras=1,
max_cpu_loras=1,
@@ -196,7 +211,8 @@ def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,
max_batch_size=2,
max_input_len=512,
max_seq_len=1024,
max_beam_width=1) as llm:
max_beam_width=1,
load_format="dummy") as llm:

prompts = [
"What is the capital of France?",
@@ -207,7 +223,7 @@ def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,
temperature=0.7,
top_p=0.9)

lora_request = [LoRARequest("nemotron-lora", 0, lora_adapter_path)]
lora_request = [LoRARequest("nemotron-lora", 0, llm_lora_model_root)]

print("Running inference with real LoRA adapter...")
outputs = llm.generate(prompts,
@@ -244,19 +260,23 @@ def test_nemotron_ultra_253b_lora_torch(nemotron_nas_example_root, llm_venv,
engine_dir, cmodel_dir):
"""Run Nemotron Ultra 253B with multiple dummy LoRAs using LLM-API Torch backend."""

expected_outputs = {
'Llama-3_1-Nemotron-Ultra-253B-v1': ["...", "...", "...", "...", "..."],
}

print("Testing with LLM-API Torch backend...")

defs.ci_profiler.start("test_llm_torch_multi_lora_support")
model_name = os.path.basename(nemotron_nas_model_root).lower()
test_llm_torch_multi_lora_support(
hf_model_dir=nemotron_nas_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
zero_lora_weights=True,
use_code_prompts=False,
tensor_parallel_size=8,
)
expected_outputs=expected_outputs[model_name])
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
print(
f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
10 changes: 7 additions & 3 deletions tests/integration/defs/examples/test_phi.py
@@ -461,9 +461,14 @@ def test_phi_4_mini_instruct_with_bf16_lora_torch(
llm_venv, engine_dir, llm_phi_model_root):
"""Run Phi-4-mini-instruct with multiple dummy LoRAs using LLM-API Torch backend."""

print("Testing Phi-4-mini-instruct with LLM-API Torch backend...")
expected_outputs = {
'Phi-4-mini-instruct': ["...", "...", "...", "...", "..."],
}

print("Testing with LLM-API Torch backend...")

defs.ci_profiler.start("test_llm_torch_multi_lora_support")
model_name = os.path.basename(llm_phi_model_root).lower()
test_llm_torch_multi_lora_support(
hf_model_dir=llm_phi_model_root,
llm_venv=llm_venv,
@@ -472,7 +477,6 @@ def test_phi_4_mini_instruct_with_bf16_lora_torch(
target_hf_modules=["qkv_proj"],
target_trtllm_modules=["attn_qkv"],
zero_lora_weights=True,
use_code_prompts=False,
tensor_parallel_size=1,
)
expected_outputs=expected_outputs[model_name])
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
2 changes: 1 addition & 1 deletion tests/integration/test_lists/test-db/l0_h100.yml
@@ -244,7 +244,7 @@ l0_h100:
- test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
- test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
- test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
- examples/test_mistral.py::test_mistral_7b_v0_1_with_bf16_lora_torch[mistral-7b-v0.1]
- examples/test_mistral.py::test_mistral_with_bf16_lora_torch[mistral-7b-v0.1]
- examples/test_phi.py::test_phi_4_mini_instruct_with_bf16_lora_torch[Phi-4-mini-instruct]
- examples/test_llama.py::test_llama_3_x_with_bf16_lora_torch[llama-3.2-1b-instruct]
- examples/test_nemotron_nas.py::test_nemotron_nano_8b_lora_torch[Llama-3.1-Nemotron-Nano-8B-v1]