diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index bca8e37be5..39211628eb 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -22197,6 +22197,19 @@
       "reasoning"
     ],
     "model_description": "gpt-oss series, OpenAI’s open-weight models designed for powerful reasoning, agentic tasks, and versatile developer use cases.",
+    "tool_parser": "qwen",
+    "reasoning_start_tag": "analysis",
+    "reasoning_end_tag": "assistantfinal",
+    "cache_config": {
+      "ignore_patterns": [
+        "metal/**",
+        "original/**"
+      ],
+      "ignore_file_pattern": [
+        "metal/*",
+        "original/*"
+      ]
+    },
     "model_specs": [
       {
         "model_format": "pytorch",
@@ -22272,12 +22285,11 @@
         "#system_numpy#"
       ]
     },
-    "updated_at": 1768444084,
+    "updated_at": 1768550062,
     "featured": true,
     "architectures": [
       "GptOssForCausalLM"
-    ],
-    "model_type": "gpt_oss"
+    ]
   },
   {
     "version": 2,
diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py
index c3de92e0d5..e863ecd7b9 100644
--- a/xinference/model/llm/utils.py
+++ b/xinference/model/llm/utils.py
@@ -78,6 +78,7 @@
     "Qwen3-Omni-Instruct",
     "Qwen3-Omni-Thinking",
     "MiniMax-M2",
+    "gpt-oss",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 7225a5ece3..ed9cac1789 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -57,6 +57,7 @@
 )
 from .. import BUILTIN_LLM_FAMILIES, LLM, LLMFamilyV2, LLMSpecV1
 from ..core import chat_context_var
+from ..harmony import async_stream_harmony_chat_completion
 from ..llm_family import cache_model_tokenizer_and_config
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
@@ -807,7 +808,10 @@ def _sanitize_model_config(
         else:
             model_config.setdefault("quantization", None)
         model_config.setdefault("max_model_len", None)
-        model_config.setdefault("reasoning_content", False)
+        if self.model_family.model_name == "gpt-oss":
+            model_config.setdefault("reasoning_content", True)
+        else:
+            model_config.setdefault("reasoning_content", False)
 
         if "speculative_config" in model_config:
             model_config["speculative_config"] = self.parse_str_field_to_dict(
@@ -1632,9 +1636,12 @@ async def async_chat(
             assert isinstance(agen, AsyncGenerator)
             if tools:
                 return self._async_to_tool_completion_chunks(agen, chat_template_kwargs)
-            return self._async_to_chat_completion_chunks(
+            chunks = self._async_to_chat_completion_chunks(
                 agen, self.reasoning_parser, chat_template_kwargs
             )
+            if self.model_family.model_name == "gpt-oss":
+                return async_stream_harmony_chat_completion(chunks)
+            return chunks
         else:
             c = await self.async_generate(
                 full_prompt, generate_config, request_id=request_id
             )
@@ -1644,7 +1651,13 @@ async def async_chat(
             return self._post_process_completion(
                 self.model_family, self.model_uid, c
             )
-            return self._to_chat_completion(c, self.reasoning_parser)
+            completion = self._to_chat_completion(c, self.reasoning_parser)
+            if self.model_family.model_name == "gpt-oss":
+                async for parsed_completion in async_stream_harmony_chat_completion(
+                    completion
+                ):
+                    return parsed_completion
+            return completion
 
 
 class VLLMMultiModel(VLLMModel, ChatModelMixin):