18 changes: 15 additions & 3 deletions xinference/model/llm/llm_family.json
@@ -22197,6 +22197,19 @@
"reasoning"
],
"model_description": "gpt-oss series, OpenAI’s open-weight models designed for powerful reasoning, agentic tasks, and versatile developer use cases.",
"tool_parser": "qwen",
"reasoning_start_tag": "analysis",
"reasoning_end_tag": "assistantfinal",
"cache_config": {
"ignore_patterns": [
"metal/**",
"original/**"
],
"ignore_file_pattern": [
"metal/*",
"original/*"
]
},
"model_specs": [
{
"model_format": "pytorch",
@@ -22272,12 +22285,11 @@
"#system_numpy#"
]
},
"updated_at": 1768444084,
"updated_at": 1768550062,
"featured": true,
"architectures": [
"GptOssForCausalLM"
],
"model_type": "gpt_oss"
]
},
{
"version": 2,
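Note on the llm_family.json changes: the second hunk refreshes updated_at and drops the family-level "model_type" key. The more interesting part is that reasoning_start_tag and reasoning_end_tag are the literal strings "analysis" and "assistantfinal", not XML-style tags like <think>. gpt-oss emits OpenAI's Harmony format, where chain-of-thought goes to an "analysis" channel and the answer to a "final" channel; once the channel/message special tokens are stripped from decoded text, the markers plausibly collapse into exactly these strings. A minimal splitting sketch under that assumption (this helper is illustrative, not the project's actual parser):

def split_harmony_text(text, start_tag="analysis", end_tag="assistantfinal"):
    """Split raw gpt-oss output into (reasoning_content, content).

    Assumes the detokenized text looks like
    "analysis<reasoning>assistantfinal<answer>", which is how Harmony
    channel markers commonly surface once special tokens are removed.
    Illustrative only.
    """
    if text.startswith(start_tag) and end_tag in text:
        reasoning, _, final = text[len(start_tag):].partition(end_tag)
        return reasoning.strip(), final.strip()
    return None, text  # no recognizable reasoning section

reasoning, answer = split_harmony_text(
    "analysisUser asks 2+2. Simple arithmetic.assistantfinal4"
)
assert reasoning == "User asks 2+2. Simple arithmetic."
assert answer == "4"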
1 change: 1 addition & 0 deletions xinference/model/llm/utils.py
@@ -78,6 +78,7 @@
"Qwen3-Omni-Instruct",
"Qwen3-Omni-Thinking",
"MiniMax-M2",
"gpt-oss",
]

GLM4_TOOL_CALL_FAMILY = [
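Note: judging from the neighbors shown (MiniMax-M2 above, GLM4_TOOL_CALL_FAMILY below), the list gaining "gpt-oss" is presumably the Qwen tool-call family, which matches the "tool_parser": "qwen" registration above. Membership in such a family list typically selects which parser post-processes raw tool-call text; a rough sketch with assumed names:

# Sketch only: list contents abbreviated, pick_tool_parser is invented.
QWEN_TOOL_CALL_FAMILY = ["MiniMax-M2", "gpt-oss"]

def pick_tool_parser(model_name):
    if model_name in QWEN_TOOL_CALL_FAMILY:
        return "qwen"  # e.g. <tool_call>{...}</tool_call> blocks
    return "none"

assert pick_tool_parser("gpt-oss") == "qwen"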
19 changes: 16 additions & 3 deletions xinference/model/llm/vllm/core.py
@@ -57,6 +57,7 @@
 )
 from .. import BUILTIN_LLM_FAMILIES, LLM, LLMFamilyV2, LLMSpecV1
 from ..core import chat_context_var
+from ..harmony import async_stream_harmony_chat_completion
 from ..llm_family import cache_model_tokenizer_and_config
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
@@ -807,7 +808,10 @@ def _sanitize_model_config(
         else:
             model_config.setdefault("quantization", None)
         model_config.setdefault("max_model_len", None)
-        model_config.setdefault("reasoning_content", False)
+        if self.model_family.model_name == "gpt-oss":
+            model_config.setdefault("reasoning_content", True)
+        else:
+            model_config.setdefault("reasoning_content", False)

         if "speculative_config" in model_config:
             model_config["speculative_config"] = self.parse_str_field_to_dict(
@@ -1632,9 +1636,12 @@ async def async_chat(
             assert isinstance(agen, AsyncGenerator)
             if tools:
                 return self._async_to_tool_completion_chunks(agen, chat_template_kwargs)
-            return self._async_to_chat_completion_chunks(
+            chunks = self._async_to_chat_completion_chunks(
                 agen, self.reasoning_parser, chat_template_kwargs
             )
+            if self.model_family.model_name == "gpt-oss":
+                return async_stream_harmony_chat_completion(chunks)
+            return chunks
         else:
             c = await self.async_generate(
                 full_prompt, generate_config, request_id=request_id
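Note: in the streaming branch, the already-converted chunk generator is routed through async_stream_harmony_chat_completion (imported from the new ..harmony module) for gpt-oss. Its internals are not part of this diff; the general shape is an async pass-through that rewrites each chunk, sketched here with an invented per-chunk rewrite:

from typing import AsyncGenerator, AsyncIterator

async def stream_postprocess(chunks: AsyncIterator) -> AsyncGenerator:
    """Stand-in for async_stream_harmony_chat_completion.

    Rewrites each streamed chunk as it passes through; the real
    per-chunk logic lives in xinference's harmony module.
    """
    async for chunk in chunks:
        # Invented example rewrite: drop a literal "analysis" prefix
        # from a delta, echoing the tag splitting described earlier.
        for choice in chunk.get("choices", []):
            delta = choice.get("delta", {})
            if isinstance(delta.get("content"), str):
                delta["content"] = delta["content"].removeprefix("analysis")
        yield chunk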
@@ -1644,7 +1651,13 @@
                 return self._post_process_completion(
                     self.model_family, self.model_uid, c
                 )
-            return self._to_chat_completion(c, self.reasoning_parser)
+            completion = self._to_chat_completion(c, self.reasoning_parser)
+            if self.model_family.model_name == "gpt-oss":
+                async for parsed_completion in async_stream_harmony_chat_completion(
+                    completion
+                ):
+                    return parsed_completion
+            return completion


 class VLLMMultiModel(VLLMModel, ChatModelMixin):
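Note: the non-streaming branch reuses the same async generator by handing it one finished completion and returning the first item it yields, with the unparsed completion as a fallback if nothing is yielded. The async-for-then-return idiom in isolation:

import asyncio

async def first_or(default, agen):
    # First item an async generator yields, or default if it
    # finishes without yielding; the same pattern as above.
    async for item in agen:
        return item
    return default

async def demo():
    async def one():
        yield "parsed"

    assert await first_or("raw", one()) == "parsed"

asyncio.run(demo())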