18 changes: 15 additions & 3 deletions xinference/model/llm/llm_family.json
@@ -22197,6 +22197,19 @@
"reasoning"
],
"model_description": "gpt-oss series, OpenAI’s open-weight models designed for powerful reasoning, agentic tasks, and versatile developer use cases.",
"tool_parser": "qwen",
"reasoning_start_tag": "analysis",
"reasoning_end_tag": "assistantfinal",
"cache_config": {
"ignore_patterns": [
"metal/**",
"original/**"
],
"ignore_file_pattern": [
"metal/*",
"original/*"
]
},
"model_specs": [
{
"model_format": "pytorch",
@@ -22272,12 +22285,11 @@
"#system_numpy#"
]
},
"updated_at": 1768444084,
"updated_at": 1768550062,
"featured": true,
"architectures": [
"GptOssForCausalLM"
],
"model_type": "gpt_oss"
]
},
{
"version": 2,
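Note on the llm_family.json changes: the second hunk refreshes updated_at and drops the family-level "model_type" key. The more interesting part is that reasoning_start_tag and reasoning_end_tag are the literal strings "analysis" and "assistantfinal", not XML-style tags like <think>. gpt-oss emits OpenAI's Harmony format, where chain-of-thought goes to an "analysis" channel and the answer to a "final" channel; once the channel/message special tokens are stripped from decoded text, the markers plausibly collapse into exactly these strings. A minimal splitting sketch under that assumption (this helper is illustrative, not the project's actual parser):

def split_harmony_text(text, start_tag="analysis", end_tag="assistantfinal"):
    """Split raw gpt-oss output into (reasoning_content, content).

    Assumes the detokenized text looks like
    "analysis<reasoning>assistantfinal<answer>", which is how Harmony
    channel markers commonly surface once special tokens are removed.
    Illustrative only.
    """
    if text.startswith(start_tag) and end_tag in text:
        reasoning, _, final = text[len(start_tag):].partition(end_tag)
        return reasoning.strip(), final.strip()
    return None, text  # no recognizable reasoning section

reasoning, answer = split_harmony_text(
    "analysisUser asks 2+2. Simple arithmetic.assistantfinal4"
)
assert reasoning == "User asks 2+2. Simple arithmetic."
assert answer == "4"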
1 change: 1 addition & 0 deletions xinference/model/llm/utils.py
@@ -78,6 +78,7 @@
"Qwen3-Omni-Instruct",
"Qwen3-Omni-Thinking",
"MiniMax-M2",
"gpt-oss",
]

GLM4_TOOL_CALL_FAMILY = [
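Note: judging from the neighbors shown (MiniMax-M2 above, GLM4_TOOL_CALL_FAMILY below), the list gaining "gpt-oss" is presumably the Qwen tool-call family, which matches the "tool_parser": "qwen" registration above. Membership in such a family list typically selects which parser post-processes raw tool-call text; a rough sketch with assumed names:

# Sketch only: list contents abbreviated, pick_tool_parser is invented.
QWEN_TOOL_CALL_FAMILY = ["MiniMax-M2", "gpt-oss"]

def pick_tool_parser(model_name):
    if model_name in QWEN_TOOL_CALL_FAMILY:
        return "qwen"  # e.g. <tool_call>{...}</tool_call> blocks
    return "none"

assert pick_tool_parser("gpt-oss") == "qwen"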
19 changes: 16 additions & 3 deletions xinference/model/llm/vllm/core.py
@@ -57,6 +57,7 @@
 )
 from .. import BUILTIN_LLM_FAMILIES, LLM, LLMFamilyV2, LLMSpecV1
 from ..core import chat_context_var
+from ..harmony import async_stream_harmony_chat_completion
 from ..llm_family import cache_model_tokenizer_and_config
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
@@ -807,7 +808,10 @@ def _sanitize_model_config(
         else:
             model_config.setdefault("quantization", None)
         model_config.setdefault("max_model_len", None)
-        model_config.setdefault("reasoning_content", False)
+        if self.model_family.model_name == "gpt-oss":
+            model_config.setdefault("reasoning_content", True)
+        else:
+            model_config.setdefault("reasoning_content", False)

         if "speculative_config" in model_config:
             model_config["speculative_config"] = self.parse_str_field_to_dict(
@@ -1632,9 +1636,12 @@ async def async_chat(
             assert isinstance(agen, AsyncGenerator)
             if tools:
                 return self._async_to_tool_completion_chunks(agen, chat_template_kwargs)
-            return self._async_to_chat_completion_chunks(
+            chunks = self._async_to_chat_completion_chunks(
                 agen, self.reasoning_parser, chat_template_kwargs
             )
+            if self.model_family.model_name == "gpt-oss":
+                return async_stream_harmony_chat_completion(chunks)
+            return chunks
         else:
             c = await self.async_generate(
                 full_prompt, generate_config, request_id=request_id
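Note: in the streaming branch, the already-converted chunk generator is routed through async_stream_harmony_chat_completion (imported from the new ..harmony module) for gpt-oss. Its internals are not part of this diff; the general shape is an async pass-through that rewrites each chunk, sketched here with an invented per-chunk rewrite:

from typing import AsyncGenerator, AsyncIterator

async def stream_postprocess(chunks: AsyncIterator) -> AsyncGenerator:
    """Stand-in for async_stream_harmony_chat_completion.

    Rewrites each streamed chunk as it passes through; the real
    per-chunk logic lives in xinference's harmony module.
    """
    async for chunk in chunks:
        # Invented example rewrite: drop a literal "analysis" prefix
        # from a delta, echoing the tag splitting described earlier.
        for choice in chunk.get("choices", []):
            delta = choice.get("delta", {})
            if isinstance(delta.get("content"), str):
                delta["content"] = delta["content"].removeprefix("analysis")
        yield chunk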
@@ -1644,7 +1651,13 @@
                 return self._post_process_completion(
                     self.model_family, self.model_uid, c
                 )
-            return self._to_chat_completion(c, self.reasoning_parser)
+            completion = self._to_chat_completion(c, self.reasoning_parser)
+            if self.model_family.model_name == "gpt-oss":
+                async for parsed_completion in async_stream_harmony_chat_completion(
+                    completion
+                ):
+                    return parsed_completion
+            return completion


 class VLLMMultiModel(VLLMModel, ChatModelMixin):
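Note: the non-streaming branch reuses the same async generator by handing it one finished completion and returning the first item it yields, with the unparsed completion as a fallback if nothing is yielded. The async-for-then-return idiom in isolation:

import asyncio

async def first_or(default, agen):
    # First item an async generator yields, or default if it
    # finishes without yielding; the same pattern as above.
    async for item in agen:
        return item
    return default

async def demo():
    async def one():
        yield "parsed"

    assert await first_or("raw", one()) == "parsed"

asyncio.run(demo())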