[BugFix] Fix tool call finish reason in streaming case (#9209)

maxdebayser · web-flow · commit ec10cb8511b7 · 2024-10-11T18:24:26.000-07:00
Signed-off-by: Max de Bayser &lt;mbayser@br.ibm.com&gt;
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
@@ -538,10 +538,12 @@ async def chat_completion_stream_generator(
                         #   any tokens that were generated but previously
                         #   matched by partial json parsing
                         # only happens if we are NOT using guided decoding
+                        auto_tools_called = False
                         if tool_parser:
-                            index = len(
-                                tool_parser.prev_tool_call_arr) - 1 if len(
-                                    tool_parser.prev_tool_call_arr) > 0 else 0
+                            auto_tools_called = len(
+                                tool_parser.prev_tool_call_arr) > 0
+                            index = len(tool_parser.prev_tool_call_arr
+                                        ) - 1 if auto_tools_called else 0
                         else:
                             index = 0
 
@@ -576,9 +578,7 @@ async def chat_completion_stream_generator(
                             delta=delta_message,
                             logprobs=logprobs,
                             finish_reason=output.finish_reason
-                            if not (tool_parser
-                                    and len(tool_parser.prev_tool_call_arr))
-                            else "tool_calls",
+                            if not auto_tools_called else "tool_calls",
                             stop_reason=output.stop_reason)
                         chunk = ChatCompletionStreamResponse(
                             id=request_id,
@@ -680,8 +680,10 @@ async def chat_completion_full_generator(
             else:
                 logprobs = None
 
-            # by default, tools are not used.
-            tools_called = False
+            # In the OpenAI API the finish_reason is "tools_called"
+            # if the tool choice is auto and the model produced a tool
+            # call. The same is not true for named function calls
+            auto_tools_called = False
 
             # if auto tools are not enabled, and a named tool choice using
             #   outlines is not being used
@@ -703,7 +705,6 @@ async def chat_completion_full_generator(
                             name=request.tool_choice.function.name,
                             arguments=output.text))
                     ])
-                tools_called = True
 
             # if the request doesn't use tool choice
             # OR specifies to not use a tool
@@ -725,7 +726,10 @@ async def chat_completion_full_generator(
 
                 tool_call_info = tool_parser.extract_tool_calls(
                     output.text, request=request)
-                tools_called = tool_call_info.tools_called
+                # In the OpenAI API the finish_reason is "tools_called"
+                # if the tool choice is auto and the model produced a tool
+                # call. The same is not true for named function calls
+                auto_tools_called = tool_call_info.tools_called
                 if tool_call_info.tools_called:
                     message = ChatMessage(role=role,
                                           content=tool_call_info.content,
@@ -748,7 +752,7 @@ async def chat_completion_full_generator(
                 index=output.index,
                 message=message,
                 logprobs=logprobs,
-                finish_reason="tool_calls" if tools_called else
+                finish_reason="tool_calls" if auto_tools_called else
                 output.finish_reason if output.finish_reason else "stop",
                 stop_reason=output.stop_reason)
             choices.append(choice_data)