
Commit 255d5aa

Add 310P example
Signed-off-by: leo-pony <nengjunma@outlook.com>
1 parent 097e714 commit 255d5aa

3 files changed: 203 additions, 0 deletions

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
#
import gc
import os

import torch

from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import (destroy_distributed_environment,
                                             destroy_model_parallel)


def clean_up():
    # Tear down distributed/model-parallel state and release NPU memory
    # so the process exits cleanly.
    destroy_model_parallel()
    destroy_distributed_environment()
    gc.collect()
    torch.npu.empty_cache()


# Use the vLLM V1 engine and spawn (rather than fork) worker processes.
os.environ["VLLM_USE_V1"] = "1"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"


if __name__ == "__main__":
    # Update model_path to point to your local model weights.
    model_path = "/home/xxx/pangu_model/pangu-pro-moe-model"

    prompts = [
        "Hello, my name is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(min_tokens=8, max_tokens=8, temperature=0.0)
    llm = LLM(model=model_path,
              tensor_parallel_size=8,
              max_num_batched_tokens=2048,
              gpu_memory_utilization=0.5,
              max_num_seqs=4,
              enforce_eager=True,
              trust_remote_code=True,
              max_model_len=1024,
              disable_custom_all_reduce=True,  # IMPORTANT: required on 310P
              enable_expert_parallel=True,
              dtype="float16",  # IMPORTANT: some ATB ops do not support bf16 on 310P
              compilation_config={"custom_ops": ["+rms_norm", "+rotary_embedding"]},  # IMPORTANT: 310P needs these custom ops
              additional_config={
                  'ascend_scheduler_config': {
                      'enabled': True,
                      'enable_chunked_prefill': False,
                      'chunked_prefill_enabled': False
                  }
              })

    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    del llm
    clean_up()
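
Note: with tensor_parallel_size=8 the example above expects eight visible NPUs. As a side note (not part of this commit), one way to pin which devices the process sees is CANN's device visibility variable, set before vLLM initializes; this sketch assumes the runtime in your environment honours ASCEND_RT_VISIBLE_DEVICES:

# Sketch only: restrict the example to specific NPUs (assumed CANN env var).
import os
os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"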
Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
#
import gc

import torch

from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import (destroy_distributed_environment,
                                             destroy_model_parallel)


def clean_up():
    # Tear down distributed/model-parallel state and release NPU memory.
    destroy_model_parallel()
    destroy_distributed_environment()
    gc.collect()
    torch.npu.empty_cache()


prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
# Create an LLM.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",
    max_model_len=4096,
    max_num_seqs=4,
    trust_remote_code=True,
    tensor_parallel_size=2,
    dtype="float16",  # IMPORTANT: some ATB ops do not support bf16 on 310P
    disable_custom_all_reduce=True,  # IMPORTANT: required on 310P
    compilation_config={"custom_ops": ["+rms_norm", "+rotary_embedding"]},  # IMPORTANT: 310P needs these custom ops
)

# Generate texts from the prompts.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

del llm
clean_up()
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
#
import gc

import torch

from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import (destroy_distributed_environment,
                                             destroy_model_parallel)


def clean_up():
    # Tear down distributed/model-parallel state and release NPU memory.
    destroy_model_parallel()
    destroy_distributed_environment()
    gc.collect()
    torch.npu.empty_cache()


prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
# Create an LLM.
llm = LLM(
    model="Qwen/Qwen3-0.6B",
    max_model_len=4096,
    max_num_seqs=4,
    trust_remote_code=True,
    tensor_parallel_size=2,
    dtype="float16",  # IMPORTANT: some ATB ops do not support bf16 on 310P
    disable_custom_all_reduce=True,  # IMPORTANT: required on 310P
    compilation_config={"custom_ops": ["+rms_norm", "+rotary_embedding"]},  # IMPORTANT: 310P needs these custom ops
)

# Generate texts from the prompts.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

del llm
clean_up()
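
Note: all three examples rely on an Ascend environment where torch_npu is installed (clean_up() calls torch.npu.empty_cache()). A minimal pre-flight check, sketched here and not part of the commit, can confirm that NPU devices are visible before running any of the scripts; it assumes the usual torch_npu device API:

# Sketch: verify the Ascend NPU backend is usable before launching an example.
import torch
import torch_npu  # noqa: F401  # registers the torch.npu backend

print("NPU available:", torch.npu.is_available())
print("NPU device count:", torch.npu.device_count())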
