Commit 9c931d0: update gpt-oss app (#2128)

1 parent: 227d200

3 files changed: 15 additions, 16 deletions

examples/transformers/inference/gpt-oss/app_multiprocess.py (11 additions, 11 deletions)

@@ -22,10 +22,8 @@
     torch_dtype="auto",
 )

-system_prompt = "You are a helpful and friendly chatbot"
-
 def build_input_from_chat_history(chat_history, msg: str):
-    messages = [{'role': 'system', 'content': system_prompt}]
+    messages = [{"role": "system", "content": "You are a helpful and friendly chatbot"}]
     for user_msg, ai_msg in chat_history:
         messages.append({'role': 'user', 'content': user_msg})
         messages.append({'role': 'assistant', 'content': ai_msg})
@@ -37,20 +35,22 @@ def predict(message, history):
     dist.barrier()
     # Formatting the input for the model.
     messages = build_input_from_chat_history(history, message)
-    input_ids = tokenizer.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        return_tensors="pt",
-        tokenize=True
-    )
-    input_len = core.tensor(input_ids.shape[1])
+    inputs = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        return_tensors="pt",
+        return_dict=True,
+    ).to(model.device)
+
+    input_len = core.tensor(inputs['input_ids'].shape[1])
     dist.broadcast(input_len, 0)
     dist.barrier()
     streamer = TextIteratorStreamer(tokenizer, timeout=1200, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-        input_ids=input_ids.to('npu'),
+        **inputs,
         streamer=streamer,
         max_new_tokens=1024,
+        temperature=0.7
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()  # Starting the generation in a separate thread.
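With return_dict=True, apply_chat_template returns a BatchEncoding holding both input_ids and attention_mask, so **inputs forwards the mask to generate() instead of input_ids alone, and .to(model.device) replaces the hard-coded .to('npu'). A minimal sketch of the same pattern with a stock Hugging Face tokenizer/model pair (the checkpoint name, prompt, and do_sample flag are illustrative, not from this commit):

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
model = AutoModelForCausalLM.from_pretrained(
    "openai/gpt-oss-20b", torch_dtype="auto", device_map="auto"
)

messages = [
    {"role": "system", "content": "You are a helpful and friendly chatbot"},
    {"role": "user", "content": "Hello!"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,   # returns input_ids AND attention_mask
).to(model.device)

# do_sample=True so that temperature actually takes effect.
outputs = model.generate(**inputs, max_new_tokens=64, do_sample=True, temperature=0.7)
# Decode only the newly generated tokens, skipping the prompt.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))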

mindnlp/__init__.py (3 additions, 4 deletions)

@@ -35,17 +35,16 @@
     from mindspore._c_expression import disable_multi_thread
 except:
     disable_multi_thread = None
-# for different ascend devices
-context.set_context(device_target='CPU')

+# for different ascend devices
 if platform.system().lower() == 'linux':
     SOC = MSContext.get_instance().get_ascend_soc_version()
     if ('910b' not in SOC and '310' not in SOC) or version.parse(mindspore.__version__) < version.parse('2.4.0'):
         os.environ["MS_ALLOC_CONF"] = 'enable_vmm:True,vmm_align_size:2MB'

     if SOC in ('ascend910', 'ascend310b'):
-        context.set_context(ascend_config={"precision_mode": "allow_mix_precision"})
-
+        # context.set_context(ascend_config={"precision_mode": "allow_mix_precision"})
+        mindspore.device_context.ascend.op_precision.precision_mode('allow_mix_precision')
     if SOC == 'ascend310b' and disable_multi_thread is not None:
         disable_multi_thread()
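Here the always-on context.set_context(device_target='CPU') is dropped, and the Ascend mixed-precision switch moves from the legacy ascend_config dict to the mindspore.device_context.ascend.op_precision.precision_mode API used in this commit. A hedged sketch of a version-guarded call (the 2.4.0 cutoff is an assumption mirroring the version check already present in this file, not something the commit states):

import mindspore
from packaging import version

# Assumption: the device_context API is only available on newer MindSpore
# builds, so fall back to the legacy ascend_config form otherwise.
if version.parse(mindspore.__version__) >= version.parse('2.4.0'):
    mindspore.device_context.ascend.op_precision.precision_mode('allow_mix_precision')
else:
    mindspore.set_context(ascend_config={"precision_mode": "allow_mix_precision"})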

mindnlp/transformers/tokenization_utils.py (1 addition, 1 deletion)

@@ -1,6 +1,6 @@
 def apply_chat_template_wrapper(fn):
     def wrapper(*args, **kwargs):
-        return_tensors = kwargs.pop('return_tensors', None)
+        return_tensors = kwargs.get('return_tensors', None)
         if return_tensors is not None and return_tensors == 'ms':
             kwargs['return_tensors'] = 'pt'
         return fn(*args, **kwargs)
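This one-line fix matters because pop() removes 'return_tensors' from kwargs entirely: unless the value was the MindSpore alias 'ms' (which re-inserts it as 'pt'), an explicit return_tensors='pt' was silently dropped before reaching the wrapped function. get() only reads the value and leaves the key in place. A toy repro of the difference (standalone, not project code):

def echo_kwargs(**kwargs):
    return kwargs

kwargs = {'return_tensors': 'pt'}
rt = kwargs.pop('return_tensors', None)   # old behavior: key removed
print(echo_kwargs(**kwargs))              # {} -- 'pt' never reaches the callee

kwargs = {'return_tensors': 'pt'}
rt = kwargs.get('return_tensors', None)   # new behavior: key preserved
print(echo_kwargs(**kwargs))              # {'return_tensors': 'pt'}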
