
Commit e3a4628

Merge branch 'main' into jiangs/1.1.0rc4/fix_sm89_fp8bmm

2 parents 6eed0cc + dd9627d


47 files changed: +670 -293 lines

cpp/include/tensorrt_llm/batch_manager/logitsPostProcessor.h

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ class LogitsPostProcessor : Algorithm

     bool operator()(DecoderInputBuffers& inputBuffers, bool replicateLogitsPostProcessor,
         runtime::WorldConfig const& worldConfig, CudaStreamPtr const& stream,
-        std::optional<LogitsPostProcessorBatched> logitsPostProcessorBatched = std::nullopt) const;
+        std::optional<LogitsPostProcessorBatched> const& logitsPostProcessorBatched = std::nullopt) const;
 };

 } // namespace tensorrt_llm::batch_manager

cpp/tensorrt_llm/batch_manager/logitsPostProcessor.cpp

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ using SizeType32 = tensorrt_llm::runtime::SizeType32;

 bool LogitsPostProcessor::operator()(DecoderInputBuffers& inputBuffers, bool replicateLogitsPostProcessor,
     tr::WorldConfig const& worldConfig, CudaStreamPtr const& stream,
-    std::optional<LogitsPostProcessorBatched> logitsPostProcessorBatched) const
+    std::optional<LogitsPostProcessorBatched> const& logitsPostProcessorBatched) const
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
     NVTX3_SCOPED_RANGE(LogitsPostProcessor);

docs/source/commands/trtllm-serve/trtllm-serve.rst

Lines changed: 34 additions & 30 deletions
@@ -201,56 +201,60 @@ Metrics Endpoint

 .. note::

-    This endpoint is beta maturity.
+    The metrics endpoint for the default PyTorch backend are in beta and are not as comprehensive as those for the TensorRT backend.

-    The statistics for the PyTorch backend are beta and not as comprehensive as those for the TensorRT backend.
+    Some fields, such as CPU memory usage, are not yet available for the PyTorch backend.

-    Some fields, such as CPU memory usage, are not available for the PyTorch backend.
+    Enabling ``enable_iter_perf_stats`` in the PyTorch backend can slightly impact performance, depending on the serving configuration.

-    Enabling ``enable_iter_perf_stats`` in the PyTorch backend can impact performance slightly, depending on the serving configuration.
+The ``/metrics`` endpoint provides runtime iteration statistics such as GPU memory usage and KV cache details.

-The ``/metrics`` endpoint provides runtime-iteration statistics such as GPU memory use and inflight-batching details.
-For the TensorRT backend, these statistics are enabled by default.
-However, for the PyTorch backend, you must explicitly enable iteration statistics logging by setting the `enable_iter_perf_stats` field in a YAML configuration file as shown in the following example:
+For the default PyTorch backend, iteration statistics logging is enabled by setting the ``enable_iter_perf_stats`` field in a YAML file:

 .. code-block:: yaml

-    # extra-llm-api-config.yml
-    pytorch_backend_config:
-      enable_iter_perf_stats: true
+    # extra_llm_config.yaml
+    enable_iter_perf_stats: true

-Then start the server and specify the ``--extra_llm_api_options`` argument with the path to the YAML file as shown in the following example:
+Start the server and specify the ``--extra_llm_api_options`` argument with the path to the YAML file:

 .. code-block:: bash

-    trtllm-serve <model> \
-      --extra_llm_api_options <path-to-extra-llm-api-config.yml> \
-      [--tp_size <tp> --pp_size <pp> --ep_size <ep> --host <host> --port <port>]
+    trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --extra_llm_api_options extra_llm_config.yaml

-After at least one inference request is sent to the server, you can fetch the runtime-iteration statistics by polling the `/metrics` endpoint:
+After sending at least one inference request to the server, you can fetch runtime iteration statistics by polling the ``/metrics`` endpoint.
+Since the statistics are stored in an internal queue and removed once retrieved, it's recommended to poll the endpoint shortly after each request and store the results if needed.

 .. code-block:: bash

-    curl -X GET http://<host>:<port>/metrics
+    curl -X GET http://localhost:8000/metrics

-*Example Output*
+Example output:

 .. code-block:: json

-    [
-      {
-        "gpuMemUsage": 56401920000,
-        "inflightBatchingStats": {
+    [
+      {
+        "gpuMemUsage": 76665782272,
+        "iter": 154,
+        "iterLatencyMS": 7.00688362121582,
+        "kvCacheStats": {
+          "allocNewBlocks": 3126,
+          "allocTotalBlocks": 3126,
+          "cacheHitRate": 0.00128,
+          "freeNumBlocks": 101253,
+          "maxNumBlocks": 101256,
+          "missedBlocks": 3121,
+          "reusedBlocks": 4,
+          "tokensPerBlock": 32,
+          "usedNumBlocks": 3
+        },
+        "numActiveRequests": 1
         ...
-      },
-        "iter": 1,
-        "iterLatencyMS": 16.505143404006958,
-        "kvCacheStats": {
-          ...
-        },
-        "newActiveRequestsQueueLatencyMS": 0.0007503032684326172
-      }
-    ]
+      }
+    ]
+

 Syntax
 ------
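For reference (not part of this diff), a minimal Python sketch of the polling pattern described in the documentation change above; it assumes a `trtllm-serve` instance is already listening on `localhost:8000` with the OpenAI-compatible `/v1/completions` route, and that the `requests` package is installed. Model name and prompt are placeholders.

```python
# Minimal sketch: send one completion request, then immediately poll /metrics.
# Assumes trtllm-serve is already running at localhost:8000 with
# enable_iter_perf_stats turned on; model name and prompt are placeholders.
import requests

BASE = "http://localhost:8000"

# Trigger at least one inference so iteration statistics are produced.
requests.post(
    f"{BASE}/v1/completions",
    json={
        "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "prompt": "Hello, my name is",
        "max_tokens": 16,
    },
    timeout=60,
).raise_for_status()

# Stats sit in an internal queue and are dropped once retrieved,
# so fetch them right after the request and keep whatever you need.
stats = requests.get(f"{BASE}/metrics", timeout=10).json()
for iteration in stats:
    print(iteration.get("iter"), iteration.get("iterLatencyMS"))
```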

docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md

Lines changed: 22 additions & 0 deletions
@@ -234,6 +234,28 @@ TODO: Use Chat Compeletions API / Responses API as the example after the PR is m
 We use OpenAI's official evaluation tool to test the model's accuracy. For more information see [https://github.com/openai/gpt-oss/tree/main/gpt_oss/evals](gpt-oss-eval).
 With the added support of Chat Completions and Responses API in `trtllm-serve,` `gpt_oss.evals` works directly without any modifications.

+You need to set `enable_attention_dp`, `tp_size`, `ep_size`, `max_batch_size` and `max_num_tokens` when launching the trtllm server and set `reasoning-effort` when launching evaluation in gpt-oss. Below are some reference configurations for accuracy evaluation on B200.
+
+| **reasoning-effort** | **parallel configuration** | **max_batch_size** | **max_num_tokens** |
+|:--------------------:|:--------------------------:|:------------------:|:------------------:|
+| low/medium           | DEP8 / DEP4                | 128                | 32768              |
+| high                 | DEP8 / DEP4                | 2                  | 133120             |
+| low/medium           | TP8 / TP4                  | 1024               | 32768              |
+| high                 | TP8 / TP4                  | 720                | 133120             |
+
+Below is an example command for evaluating the accuracy of gpt-oss-120b with low and medium reasoning-effort on GPQA and AIME2025.
+
+```shell
+# execute this command in gpt-oss
+python -m gpt_oss.evals \
+    --sampler chat_completions \
+    --eval gpqa,aime25 \
+    --model gpt-oss-120b \
+    --reasoning-effort low,medium
+```
+
+
 ## Benchmarking Performance

 To benchmark the performance of your TensorRT-LLM server you can leverage the built-in `benchmark_serving.py` script. To do this first creating a wrapper `bench.sh` script.
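For reference (not part of this diff), a hypothetical Python launch sketch for the low/medium, DEP8 row of the table above. The CLI flags mirror `trtllm-serve` usage shown elsewhere in this commit, and routing `enable_attention_dp` through the extra LLM API YAML file is an assumption to verify against your version.

```python
# Hypothetical launch helper for the "low/medium, DEP8" configuration above;
# flag names follow trtllm-serve usage in this commit, but verify against your install.
import subprocess
from pathlib import Path

# Assumption: enable_attention_dp is supplied via the extra LLM API YAML file.
extra_cfg = Path("extra_llm_config.yaml")
extra_cfg.write_text("enable_attention_dp: true\n")

subprocess.run(
    [
        "trtllm-serve", "openai/gpt-oss-120b",
        "--tp_size", "8",
        "--ep_size", "8",
        "--max_batch_size", "128",
        "--max_num_tokens", "32768",
        "--extra_llm_api_options", str(extra_cfg),
    ],
    check=True,
)
```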

docs/source/legacy/tensorrt_quickstart.md

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # LLM API with TensorRT Engine
 A simple inference example with TinyLlama using the LLM API:

-```{literalinclude} ../../examples/llm-api/_tensorrt_engine/quickstart_example.py
+```{literalinclude} ../../../examples/llm-api/_tensorrt_engine/quickstart_example.py
 :language: python
 :linenos:
 ```

examples/llm-api/_tensorrt_engine/quickstart_example.py

Lines changed: 8 additions & 2 deletions
@@ -1,11 +1,17 @@
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import BuildConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM  # NOTE the change


 def main():

+    build_config = BuildConfig()
+    build_config.max_batch_size = 256
+    build_config.max_num_tokens = 1024
+
     # Model could accept HF model name, a path to local HF model,
     # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
-    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+              build_config=build_config)

     # Sample prompts.
     prompts = [
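For reference (not part of this diff), a minimal end-to-end sketch of how the updated example typically continues, following the standard LLM API pattern; the prompts and sampling settings below are illustrative only.

```python
# Illustrative sketch built on the imports and BuildConfig shown in the hunk above.
from tensorrt_llm import BuildConfig, SamplingParams
from tensorrt_llm._tensorrt_engine import LLM

build_config = BuildConfig()
build_config.max_batch_size = 256
build_config.max_num_tokens = 1024

# Build a TensorRT engine for TinyLlama with the capped batch/token limits.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
          build_config=build_config)

# Placeholder prompts and sampling settings.
prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Generate a completion for every prompt and print it.
for output in llm.generate(prompts, sampling_params):
    print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")
```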

examples/llm-api/llm_mgmn_trtllm_bench.sh

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@ srun -l \

     # This is optional
     cat > /tmp/pytorch_extra_args.txt << EOF
+cuda_graph_config: null
 print_iter_log: true
 enable_attention_dp: false
 EOF

jenkins/L0_Test.groovy

Lines changed: 5 additions & 5 deletions
@@ -364,7 +364,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
             // Wait 10 minutes to check status of the node again
             sleep(time: 10, unit: 'MINUTES')
             // Avoid the node being stuck in the held state.
-            Utils.exec(pipeline, Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""))
+            Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
             counter++
         }
     }
@@ -1805,7 +1805,7 @@ def runLLMBuild(pipeline, cpu_arch, reinstall_dependencies=false, wheel_path="",
         if (env.alternativeTRT) {
             trtllm_utils.replaceWithAlternativeTRT(env.alternativeTRT, cpver)
         }
-        buildArgs = "--clean"
+        buildArgs = "--clean --nixl_root /opt/nvidia/nvda_nixl"
         if (cpu_arch == AARCH64_TRIPLE) {
             buildArgs += " -a '90-real;100-real;120-real'"
         }
@@ -2040,9 +2040,9 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_H200-4_GPUs-TensorRT-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 3, 4],
         "DGX_H200-4_GPUs-TensorRT-Post-Merge-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 3, 4],
         "DGX_H200-4_GPUs-TensorRT-Post-Merge-3": ["dgx-h200-x4", "l0_dgx_h200", 3, 3, 4],
-        //"RTXPro6000-Pytorch-Post-Merge-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1],
-        //"RTXPro6000-4_GPUs-Pytorch-Post-Merge-1": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 1, 2, 4],
-        //"RTXPro6000-4_GPUs-Pytorch-Post-Merge-2": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 2, 2, 4],
+        "RTXPro6000-PyTorch-Post-Merge-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1],
+        "RTXPro6000-4_GPUs-PyTorch-Post-Merge-1": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 1, 2, 4],
+        "RTXPro6000-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 2, 2, 4],
     ]

     parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {

tensorrt_llm/_torch/attention_backend/flashinfer.py

Lines changed: 2 additions & 1 deletion
@@ -170,7 +170,8 @@ def __post_init__(self) -> None:
     def create_cuda_graph_metadata(self,
                                    max_batch_size: int,
                                    sub_cross_metadata: bool = False,
-                                   max_draft_tokens: int = 0) -> Self:
+                                   max_draft_tokens: int = 0,
+                                   buffers=None) -> Self:
         metadata = super().create_cuda_graph_metadata(max_batch_size,
                                                       sub_cross_metadata,
                                                       max_draft_tokens)

tensorrt_llm/_torch/attention_backend/interface.py

Lines changed: 4 additions & 1 deletion
@@ -140,6 +140,7 @@ class AttentionMetadata:

     # This buffer is currently only used for TrtllmAttentionMetadata.
     cache_indirection: Optional[torch.Tensor] = None
+    cuda_graph_buffers: dict[str, list[torch.Tensor]] = None

     _saved_tensors: Dict[str, torch.Tensor] = field(init=False,
                                                     default_factory=dict)
@@ -288,7 +289,8 @@ def prepare(self):
     def create_cuda_graph_metadata(self,
                                    max_batch_size: int,
                                    sub_cross_metadata: bool = False,
-                                   max_draft_tokens: int = 0) -> Self:
+                                   max_draft_tokens: int = 0,
+                                   buffers=None) -> Self:
         """
         Creates metadata for CUDA graph execution.
         CUDA graphs require to use pre-allocated buffers for all tensors in fields.
@@ -300,6 +302,7 @@ def create_cuda_graph_metadata(self,

         cuda_graph_metadata = copy.copy(self)
         cuda_graph_metadata.is_cuda_graph = True
+        cuda_graph_metadata.cuda_graph_buffers = buffers
         if self.has_cross_sub_metadata:
             cuda_graph_metadata.cross = cuda_graph_metadata.cross.create_cuda_graph_metadata(
                 max_batch_size, True)
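To illustrate the `buffers` parameter threaded through the two attention-backend files above, a minimal caller-side sketch; the `metadata` object, buffer key, and tensor shape are hypothetical, and only the `buffers=` argument and the `cuda_graph_buffers` field come from this commit.

```python
# Hypothetical caller-side sketch; `metadata` stands in for an already-constructed
# AttentionMetadata subclass instance, and the buffer key/shape are illustrative only.
import torch

# Pre-allocate tensors once so repeated CUDA graph captures reuse the same storage.
shared_buffers: dict[str, list[torch.Tensor]] = {
    "kv_lens_cuda": [torch.zeros(256, dtype=torch.int32, device="cuda")],
}

graph_metadata = metadata.create_cuda_graph_metadata(
    max_batch_size=256,
    buffers=shared_buffers,  # new optional argument introduced by this change
)
# The shallow-copied metadata now carries the shared buffer pool.
assert graph_metadata.cuda_graph_buffers is shared_buffers
```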
