Commits (82)
6f7044a
[Feat] long_seq_optim support cp&&sp (#2565)
LookAround0301 Sep 1, 2025
5f4e17e
Merge remote-tracking branch 'refs/remotes/origin_vllm-ascend/main' i…
LookAround0301 Sep 1, 2025
bb0ab43
[long_seq_optim] update mla
LookAround0301 Sep 1, 2025
baa9a75
support qwen3-32B sp and cp
Sep 2, 2025
11de0eb
support qwen3-32B sp and cp
Sep 2, 2025
51f03ff
support qwen3-32B sp and cp
Sep 2, 2025
ff03287
support qwen3-32B sp and cp
Sep 3, 2025
5cba09d
[long_seq_optim] support cp&sp
LookAround0301 Sep 3, 2025
00e1c14
Merge pull request #2 from Delphine-Nic/long_seq_tmp
LookAround0301 Sep 3, 2025
5fe2d7d
[long_seq_optim] fix sp bug
LookAround0301 Sep 3, 2025
6e9b4d1
[bugfix] 128K Long Sequence Freezes in CP&SP Scenario
Sep 4, 2025
ae2438b
Merge pull request #3 from Delphine-Nic/long_seq_tmp
LookAround0301 Sep 4, 2025
64c8ca6
[long_seq_optim] clean code
LookAround0301 Sep 4, 2025
91b22c9
fix verify rules
SunnyLee151064 Sep 4, 2025
5f60018
Merge pull request #1 from LookAround0301/long_seq_tmp
Delphine-Nic Sep 4, 2025
98ac745
[bugfix] Fix the bug in CP&SP features when max_num_seqs > 1.
Sep 4, 2025
00163ca
Merge pull request #4 from Delphine-Nic/long_seq_tmp
LookAround0301 Sep 5, 2025
44edeec
[bug-fix] remove original_len
Apocalypse990923-qshi Sep 5, 2025
abcb752
Merge pull request #7 from Apocalypse990923-qshi/long_seq_tmp
LookAround0301 Sep 8, 2025
b9d52c2
[long_seq_optim] deepseek wordembedding remove sp
LookAround0301 Sep 8, 2025
47c3c8e
[long_seq_optim] deepseek remove all_gather
LookAround0301 Sep 8, 2025
a02f8ad
[long_seq_optim] deepseek replace RowParallelLinear
LookAround0301 Sep 8, 2025
158be21
[long_seq_optim] bug fix & remove some enable sp
LookAround0301 Sep 9, 2025
4edb07f
[long_seq_optim] deepseek remove all enable sp
LookAround0301 Sep 9, 2025
8a5db84
[long_seq_optim] sp bug fix
LookAround0301 Sep 9, 2025
4383c0f
[Refactor] CP & SP Model Side Code Refactoring
Sep 10, 2025
321a0d7
Merge branch 'long_seq_tmp' of https://github.com/LookAround0301/vllm…
Sep 10, 2025
930225a
Merge pull request #2 from LookAround0301/long_seq_tmp
Delphine-Nic Sep 10, 2025
aba85f0
support 128k decode
weiguihua2 Sep 10, 2025
f523790
Merge branch 'long_seq_tmp' of https://github.com/LookAround0301/vllm…
weiguihua2 Sep 10, 2025
7f06220
Merge pull request #3 from LookAround0301/long_seq_tmp
Delphine-Nic Sep 12, 2025
9a493ed
add model runner test
weiguihua2 Sep 12, 2025
f90c724
[long_seq_optim] remove DeepseekV2MLP
LookAround0301 Sep 12, 2025
b533ae3
Merge remote-tracking branch 'refs/remotes/origin_vllm-ascend/main' i…
LookAround0301 Sep 13, 2025
939608b
[long_seq_optim] linear bug fix
LookAround0301 Sep 13, 2025
65cd330
Merge remote-tracking branch 'refs/remotes/origin_vllm-ascend/main' i…
LookAround0301 Sep 13, 2025
cfb4e77
[long_seq_optim] deepseek bug fix
LookAround0301 Sep 13, 2025
232d8f5
update main branch
Sep 13, 2025
11f3dc7
[bugfix] The CP & SP model focuses on construction.
Sep 13, 2025
36d31c8
Merge remote-tracking branch 'refs/remotes/origin_vllm-ascend/main' i…
LookAround0301 Sep 15, 2025
cbe1657
[long_seq_optim] ci bug fix
LookAround0301 Sep 15, 2025
19bacf3
[long_seq_optim] utils ci bug fix
LookAround0301 Sep 15, 2025
cc0aa83
clean code
weiguihua2 Sep 15, 2025
c19ae77
Merge pull request #4 from LookAround0301/long_seq_tmp
Delphine-Nic Sep 15, 2025
b166bbf
[bugfix] Change the flashcomm switch type
Sep 15, 2025
c41c0b8
Merge pull request #11 from Delphine-Nic/long_seq_tmp
LookAround0301 Sep 15, 2025
781cf5e
cleancode
weiguihua2 Sep 15, 2025
6805391
Merge pull request #5 from LookAround0301/long_seq_tmp
Delphine-Nic Sep 15, 2025
be19818
clean code
LookAround0301 Sep 15, 2025
ef191fb
ut test mla_v1
Apocalypse990923-qshi Sep 15, 2025
0bc71a2
Merge pull request #6 from LookAround0301/long_seq_tmp
Delphine-Nic Sep 15, 2025
0d8a0b8
cleancode
Sep 15, 2025
0b5db1a
Merge pull request #12 from Delphine-Nic/long_seq_tmp
Delphine-Nic Sep 15, 2025
ad6563d
mla_v1 ut test add forward_decode_sp
Apocalypse990923-qshi Sep 15, 2025
519cd75
Merge pull request #9 from Apocalypse990923-qshi/long_seq_tmp
LookAround0301 Sep 15, 2025
3ecba13
Merge pull request #7 from LookAround0301/long_seq_tmp
Delphine-Nic Sep 15, 2025
adc4b7f
cleancode
weiguihua2 Sep 15, 2025
d9cba36
[bugfix] fix enable_sp
Apocalypse990923-qshi Sep 15, 2025
2a18b60
Merge pull request #13 from Apocalypse990923-qshi/long_seq_tmp
LookAround0301 Sep 15, 2025
f117ada
cleancode
Sep 15, 2025
572e2af
Merge pull request #14 from Delphine-Nic/long_seq_tmp
Delphine-Nic Sep 15, 2025
f3ef5cf
cleancode
Sep 15, 2025
7a5d54f
Merge pull request #15 from Delphine-Nic/long_seq_tmp
Delphine-Nic Sep 15, 2025
b63bd9d
cleancode
weiguihua2 Sep 15, 2025
0035bbc
[long_seq_optim] clean code
LookAround0301 Sep 15, 2025
4d8864c
[long_seq_optim] clean code
LookAround0301 Sep 15, 2025
42b2d7b
clean code
weiguihua2 Sep 15, 2025
11404e2
clean code
weiguihua2 Sep 15, 2025
d86963e
add example
weiguihua2 Sep 15, 2025
815121b
[bugfix] Change the flashcomm switch type
Sep 15, 2025
adcf83d
[bugfix] Change the flashcomm switch type
Sep 16, 2025
b53d858
[long_seq_optim] add env for cp&sp
LookAround0301 Sep 16, 2025
0a14bf3
[long_seq_optim] modify mla op
LookAround0301 Sep 19, 2025
cc74e96
[long_seq_optim] fix 128k bug
LookAround0301 Sep 19, 2025
8ecf93a
fix tuple error
weiguihua2 Sep 19, 2025
48e4456
support cp sp pd disaggregate
zhangsicheng5 Sep 19, 2025
68408bd
Merge pull request #26 from zhangsicheng5/long_seq_tmp
LookAround0301 Sep 19, 2025
11191fb
bugfix: Qwen3-moe support sp
Sep 20, 2025
2b62cd9
bugfix: add import
Sep 20, 2025
925a446
remove chinese comment
Apocalypse990923-qshi Sep 23, 2025
48cdd95
remove chinese comment[2]
Apocalypse990923-qshi Sep 23, 2025
da50531
Merge pull request #27 from Apocalypse990923-qshi/long_seq_tmp
LookAround0301 Sep 23, 2025
Files changed
121 changes: 121 additions & 0 deletions examples/offline_inference_npu_128k.py
@@ -0,0 +1,121 @@
import os
import time
import argparse
import random

from vllm import LLM, SamplingParams
from datasets import load_dataset, Features, Value, Sequence
from transformers import AutoTokenizer

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
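# Note (assumption, not from the original script): 'spawn' is typically required here
# because forked worker processes cannot safely re-initialize the device context.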


def generate_prompts_128K(model_path):
# Define the features schema
ft = Features({
"id": Value("int64"),
"context": Value("string"),
"input": Value("string"),
"answer": Sequence(Value("string")),
"options": Sequence(Value("string"))
})
# 100k-token context dataset: https://huggingface.co/datasets/xinrongzhang2022/InfiniteBench
dataset_dict = load_dataset("./InfiniteBench", features=ft)
dataset = dataset_dict["train"]

tokenizer = AutoTokenizer.from_pretrained(model_path)
token_ids = []

prompt = str(dataset['context'][0]) + '\n' + dataset['input'][0]
encoded = tokenizer(
prompt,
truncation=True,
max_length=100*1024,
return_tensors="pt"
)

token_ids.extend(encoded["input_ids"].squeeze(0).tolist())

prompt = str(dataset['context'][1]) + '\n' + dataset['input'][1]
encoded = tokenizer(
prompt,
truncation=True,
max_length=28*1024,
return_tensors="pt"
)
token_ids.extend(encoded["input_ids"].squeeze(0).tolist())
token_ids_text = tokenizer.decode(token_ids)
return token_ids_text


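# Alternative prompt source (unused by the default flow below): random token IDs of a
# fixed length per batch entry, handy for purely synthetic benchmarking.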
def generate_prompt_token_ids(input_len, batchsize):
token_ids = [[random.randint(1,128000) for _ in range(input_len)] for _ in range(batchsize)]
return token_ids


# Performance testing function
def run_performance(args):
"""Run performance tests and return timing results."""

sampling_params = SamplingParams(temperature=0.8, top_p=0.95, ignore_eos=True, max_tokens=args.output_len)

prompt_token = generate_prompts_128K(args.model_path)

# Create an LLM
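# Long-sequence setup: context_parallel_size splits the 128k prompt across CP ranks and
# enable_sequence_parallel shards activations further; max_num_batched_tokens is presumably
# sized to the per-CP/TP-rank share of the input plus a small headroom (+138).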
llm = LLM(
model=args.model_path,
trust_remote_code=True,
enforce_eager=True,
tensor_parallel_size=args.tp,
data_parallel_size=args.dp,
context_parallel_size=args.cp,
enable_prefix_caching=False,
enable_chunked_prefill=False,
enable_sequence_parallel=True,
enable_expert_parallel=True,
max_num_batched_tokens=args.input_len // args.cp // args.tp + 138,
max_model_len=args.input_len + 138,
quantization="ascend",
additional_config={"ascend_scheduler_config": {"enabled": True}},
max_num_seqs=1,
block_size=128,
gpu_memory_utilization=0.85
)

print("========================= Warmup =========================")
t0 = time.time()
llm.generate(prompts=prompt_token, sampling_params=sampling_params)
t1 = time.time()
dt0 = t1 - t0
print(f"E2E: {dt0} s")
print("============================= Warmup finished. ============================")

# Second run for comparison
print("========================= Infer ===========================")
t2 = time.time()
for _ in range(args.iter_times):
llm.generate(prompts=prompt_token, sampling_params=sampling_params)
t3 = time.time()

# Give engines time to pause their processing loops before exiting.
time.sleep(1)
dt1 = t3 - t2
print(f"E2E: {dt1} s")
print("============================= Infer finished. ============================")

if __name__ == "__main__":
parser = argparse.ArgumentParser()

parser.add_argument('--input_len', type=int, default=128*1024)
# currently output_len only supports 1 for the long_seq prefill stage
parser.add_argument('--output_len', type=int, default=1)
parser.add_argument('--bs', type=int, default=1)
parser.add_argument('--model_path', type=str, default="./DeepSeek-R1_w8a8/")
parser.add_argument('--tp', type=int, default=8)
parser.add_argument('--cp', type=int, default=2)
parser.add_argument('--dp', type=int, default=1)
parser.add_argument('--iter_times', type=int, default=1)

args = parser.parse_args()
# Run performance test using our new function
run_performance(args)
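
# Example invocation (illustrative values; adjust the model path and parallel sizes to your setup):
#   python examples/offline_inference_npu_128k.py --model_path ./DeepSeek-R1_w8a8/ --tp 8 --cp 2 --dp 1 --output_len 1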
3 changes: 2 additions & 1 deletion vllm_ascend/ascend_config.py
@@ -49,7 +49,8 @@ def __init__(self, vllm_config):
"chunked_prefill_for_mla", False)
self.enable_shared_expert_dp = additional_config.get(
"enable_shared_expert_dp", False
) and not self.torchair_graph_config.enabled and vllm_config.parallel_config.enable_expert_parallel
) and not self.torchair_graph_config.enabled and vllm_config.parallel_config.enable_expert_parallel \
and not vllm_config.parallel_config.enable_sequence_parallel
self.enable_prefetch = additional_config.get("enable_prefetch", False)
self.lmhead_tensor_parallel_size = additional_config.get(
"lmhead_tensor_parallel_size", None)