
Commit 6db1ac4

Fix test and comments

Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com>
1 parent: 1378f31

File tree: 3 files changed, +40 -30 lines

tensorrt_llm/llmapi/reasoning_parser.py
Lines changed: 1 addition & 1 deletion

@@ -57,7 +57,7 @@ def parse(self, text: str) -> ReasoningParserResult:
         # text before reasoning start tag is dropped
         text = splits[2]
         splits = text.partition(self.reasoning_end)
-        reasoning_content, content = splits[0].strip(), splits[2].strip()
+        reasoning_content, content = splits[0], splits[2]
         return ReasoningParserResult(content=content,
                                      reasoning_content=reasoning_content)


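The `.strip()` removal is the behavioral fix here: `str.partition` splits without consuming anything, so returning the pieces unmodified keeps the reasoning/content split lossless. A standalone illustration in plain Python (the tag string and sample text are made up, not taken from the parser):

    text = "9.9 > 9.11 since 0.9 > 0.11</think>\n\nThe larger number is 9.9."
    reasoning, tag, content = text.partition("</think>")

    # Unstripped pieces reassemble byte-for-byte into the original output.
    assert reasoning + tag + content == text

    # Stripping destroys the separating whitespace, which clients that
    # re-join reasoning and answer text cannot recover.
    assert reasoning.strip() + tag + content.strip() != text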
tensorrt_llm/serve/responses_utils.py
Lines changed: 15 additions & 10 deletions

@@ -741,7 +741,7 @@ def _create_output_messages(

 def _get_chat_completion_function_tools(
         tools: Optional[List[Tool]]) -> List[ChatCompletionToolsParam]:
-    function_tools: List[ChatCompletionToolsParam]() = []
+    function_tools: List[ChatCompletionToolsParam] = []
     if tools is None:
         return function_tools

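This hunk removes stray call parentheses from a variable annotation. `List[ChatCompletionToolsParam]()` is a call expression, not a type; it slipped through because Python never evaluates annotations inside function bodies, but type checkers reject it. A minimal repro with a stub class (nothing below is imported from tensorrt_llm):

    from typing import List

    class ChatCompletionToolsParam:  # stub for the real protocol model
        pass

    def build_tools() -> List[ChatCompletionToolsParam]:
        # Before the fix: `function_tools: List[ChatCompletionToolsParam]() = []`
        # -- runs silently (function-body annotations are never evaluated),
        # but mypy/pyright flag the annotation as invalid.
        function_tools: List[ChatCompletionToolsParam] = []
        return function_tools

    print(build_tools())  # []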
@@ -962,7 +962,7 @@ async def _create_output_content(
         reasoning_parser: Optional[str] = None,
         tool_parser: Optional[str] = None,
         tools: Optional[List[Tool]] = None,
-) -> List[ResponseOutputItem]:
+) -> Tuple[List[ResponseOutputItem], List[ChatCompletionMessageParam]]:
     output_items: List[ResponseOutputItem] = []
     output_messages: List[ChatCompletionMessageParam] = []
     available_tools = _get_chat_completion_function_tools(tools)

@@ -1036,7 +1036,8 @@ async def _create_output_content(


 async def _create_output_content_harmony(
-        final_res: RequestOutput) -> List[ResponseOutputItem]:
+        final_res: RequestOutput
+) -> Tuple[List[ResponseOutputItem], List[Message]]:
     output_messages = _parse_output_tokens(final_res.outputs[0].token_ids)
     output_content = []

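Both output-content builders now return the accumulated messages alongside the response items; judging from the surrounding code and the multi-turn tests below, this lets callers persist the raw conversation (e.g. to serve `previous_response_id` follow-ups). A sketch of the new contract with stub types, since the real ones live in tensorrt_llm and openai_harmony:

    from typing import List, Tuple

    class ResponseOutputItem:  # stub: OpenAI Responses protocol item
        pass

    class Message:  # stub: harmony-format message
        pass

    def build_output_content(
            token_ids: List[int]) -> Tuple[List[ResponseOutputItem], List[Message]]:
        # Stand-in for _create_output_content_harmony: parse the tokens, then
        # return the protocol items *and* the messages they were derived from.
        return [], []

    output_content, output_messages = build_output_content([1, 2, 3])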
@@ -1423,6 +1424,17 @@ def _generate_streaming_event(
     delta_text = output.text_diff
     calls = []

+    def check_parser(parser_id: Optional[str],
+                     parser_dict: Optional[Dict[int, BaseReasoningParser]]):
+        if parser_id is not None:
+            if parser_dict is None:
+                raise RuntimeError(
+                    f"Parser({parser_id}) dictionary is not provided for streaming"
+                )
+
+    check_parser(reasoning_parser_id, reasoning_parser_dict)
+    check_parser(tool_parser_id, tool_parser_dict)
+
     delta_text, reasoning_delta_text = _apply_reasoning_parser(
         reasoning_parser_id=reasoning_parser_id,
         output_index=output_idx,

@@ -1447,13 +1459,6 @@ def _generate_streaming_event(
         f" ---------> delta text: {delta_text}, reasoning delta text: {reasoning_delta_text}, calls: {calls}"
     ))

-    if reasoning_parser_dict is None:
-        raise RuntimeError(
-            "Reasoning parser dictionary is not provided for streaming")
-    if output_idx not in reasoning_parser_dict:
-        raise RuntimeError(
-            f"Reasoning parser for output index {output_idx} is not found")
-
     # Check if we need to send done events for completed sections
     should_send_reasoning_done, should_send_text_done, reasoning_full_content, text_full_content = _should_send_done_events(
         output=output,
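Net effect of the last two hunks: the old code raised whenever the reasoning-parser dictionary was absent, even for requests that configured no reasoning parser at all, while the new `check_parser` helper raises only when a parser is requested without its dictionary, and the tool parser now gets the same guard. A runnable repro of the new behavior (`BaseReasoningParser` stubbed out, parser name hypothetical):

    from typing import Dict, Optional

    class BaseReasoningParser:  # stub for tensorrt_llm.llmapi.reasoning_parser
        pass

    def check_parser(parser_id: Optional[str],
                     parser_dict: Optional[Dict[int, BaseReasoningParser]]):
        if parser_id is not None:
            if parser_dict is None:
                raise RuntimeError(
                    f"Parser({parser_id}) dictionary is not provided for streaming")

    check_parser(None, None)  # no parser configured: passes (old code raised here)
    check_parser("deepseek_r1", {0: BaseReasoningParser()})  # dict present: passes
    try:
        check_parser("deepseek_r1", None)  # configured but no dict: raises
    except RuntimeError as err:
        print(err)  # Parser(deepseek_r1) dictionary is not provided for streaming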

tests/unittest/llmapi/apps/_test_openai_responses.py
Lines changed: 24 additions & 19 deletions

@@ -86,7 +86,9 @@ def check_tool_calling(response, first_resp=True, prefix=""):
 @pytest.mark.asyncio(loop_scope="module")
 async def test_reasoning(client: openai.AsyncOpenAI, model: str):
     response = await client.responses.create(
-        model=model, input="Which one is larger as numeric, 9.9 or 9.11?")
+        model=model,
+        input="Which one is larger as numeric, 9.9 or 9.11?",
+        max_output_tokens=1024)

     check_reponse(response, "test_reasoning: ")

@@ -96,9 +98,10 @@ async def test_reasoning_effort(client: openai.AsyncOpenAI, model: str):
     for effort in ["low", "medium", "high"]:
         response = await client.responses.create(
             model=model,
-            instructions="Use less than 1024 tokens for reasoning",
+            instructions="Use less than 1024 tokens for the whole response",
             input="Which one is larger as numeric, 9.9 or 9.11?",
-            reasoning={"effort": effort})
+            reasoning={"effort": effort},
+            max_output_tokens=1024)
         check_reponse(response, f"test_reasoning_effort_{effort}: ")


@@ -121,20 +124,23 @@ async def test_chat(client: openai.AsyncOpenAI, model: str):
     }, {
         "role": "user",
         "content": "Tell me a joke."
-    }])
+    }],
+                                             max_output_tokens=1024)
     check_reponse(response, "test_chat: ")


 @pytest.mark.asyncio(loop_scope="module")
 async def test_multi_turn_chat(client: openai.AsyncOpenAI, model: str):
     response = await client.responses.create(model=model,
-                                             input="What is the answer of 1+1?")
+                                             input="What is the answer of 1+1?",
+                                             max_output_tokens=1024)
     check_reponse(response, "test_multi_turn_chat_1: ")

     response_2 = await client.responses.create(
         model=model,
         input="What is the answer of previous question?",
-        previous_response_id=response.id)
+        previous_response_id=response.id,
+        max_output_tokens=1024)
     check_reponse(response_2, "test_multi_turn_chat_2: ")


@@ -168,11 +174,10 @@ async def test_tool_calls(client: openai.AsyncOpenAI, model: str):
         }
     }
     messages = [{"role": "user", "content": "What is the weather like in SF?"}]
-    response = await client.responses.create(
-        model=model,
-        input=messages,
-        tools=[tool_get_current_weather],
-    )
+    response = await client.responses.create(model=model,
+                                             input=messages,
+                                             tools=[tool_get_current_weather],
+                                             max_output_tokens=1024)
     messages.extend(response.output)
     function_call = check_tool_calling(response, True, "test_tool_calls: ")

@@ -188,7 +193,8 @@ async def test_tool_calls(client: openai.AsyncOpenAI, model: str):

     response = await client.responses.create(model=model,
                                              input=messages,
-                                             tools=[tool_get_current_weather])
+                                             tools=[tool_get_current_weather],
+                                             max_output_tokens=1024)

     check_tool_calling(response, False, "test_tool_calls: ")

@@ -199,7 +205,7 @@ async def test_streaming(client: openai.AsyncOpenAI, model: str):
         model=model,
         input="Explain the theory of relativity in brief.",
         stream=True,
-    )
+        max_output_tokens=1024)

     reasoning_deltas, message_deltas = list(), list()
     async for event in stream:
@@ -240,12 +246,11 @@ async def test_streaming_tool_call(client: openai.AsyncOpenAI, model: str):
         }
     }
     messages = [{"role": "user", "content": "What is the weather like in SF?"}]
-    stream = await client.responses.create(
-        model=model,
-        input=messages,
-        tools=[tool_get_current_weather],
-        stream=True,
-    )
+    stream = await client.responses.create(model=model,
+                                           input=messages,
+                                           tools=[tool_get_current_weather],
+                                           stream=True,
+                                           max_output_tokens=1024)

     function_call = None
     reasoning_deltas = list()

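Every test now passes `max_output_tokens=1024`, which bounds reasoning plus visible output so a chatty reasoning model cannot run a CI job open-ended. A standalone sketch of the pattern; the endpoint URL, API key, and model name below are assumptions, not values from this repo:

    import asyncio

    import openai

    async def main():
        client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                    api_key="dummy")  # hypothetical local server
        response = await client.responses.create(
            model="local-model",  # hypothetical model name
            input="Which one is larger as numeric, 9.9 or 9.11?",
            max_output_tokens=1024)  # hard cap keeps the request bounded
        print(response.output_text)

    asyncio.run(main())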