
Commit c894ff7

Merge remote-tracking branch 'upstream/main' into flashcomm2_new
2 parents: eef2d73 + eab3635


44 files changed (+1116, -574 lines)

.github/workflows/vllm_ascend_test.yaml

Lines changed: 1 addition & 1 deletion
@@ -258,4 +258,4 @@ jobs:
 VLLM_WORKER_MULTIPROC_METHOD: spawn
 VLLM_USE_MODELSCOPE: True
 run: |
-pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP
+pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP

.github/workflows/vllm_ascend_test_full.yaml

Lines changed: 2 additions & 0 deletions
@@ -226,6 +226,8 @@ jobs:
 pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
 pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
 pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
+pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
+pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight

 #pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
 pytest -sv tests/e2e/multicard/test_prefix_caching.py

.github/workflows/vllm_ascend_test_pd.yaml

Lines changed: 1 addition & 0 deletions
@@ -108,4 +108,5 @@ jobs:

 - name: Run vllm-project/vllm-ascend PD Disaggregation edge test
   run: |
+    git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend
     bash tests/e2e/pd_disaggreate/run_edge_case_test.sh

examples/disaggregated_prefill_v1/README.md

Lines changed: 4 additions & 4 deletions
@@ -42,7 +42,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 export VLLM_USE_V1=1
-export VLLM_LLMDD_RPC_PORT=5559
+export VLLM_ASCEND_LLMDD_RPC_PORT=5559

 vllm serve /models/deepseek_r1_w8a8 \
   --host 0.0.0.0 \
@@ -85,7 +85,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 export VLLM_USE_V1=1
-export VLLM_LLMDD_RPC_PORT=5659
+export VLLM_ASCEND_LLMDD_RPC_PORT=5659

 vllm serve /models/deepseek_r1_w8a8 \
   --host 0.0.0.0 \
@@ -131,7 +131,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 export VLLM_USE_V1=1
-export VLLM_LLMDD_RPC_PORT=5759
+export VLLM_ASCEND_LLMDD_RPC_PORT=5759

 vllm serve /models/deepseek_r1_w8a8 \
   --host 0.0.0.0 \
@@ -173,7 +173,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 export VLLM_USE_V1=1
-export VLLM_LLMDD_RPC_PORT=5859
+export VLLM_ASCEND_LLMDD_RPC_PORT=5859

 vllm serve /models/deepseek_r1_w8a8 \
   --host 0.0.0.0 \
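The only functional change in these four examples is the RPC-port variable rename. A minimal sketch of the updated launch prologue, assuming (as this README change implies) that the connector now reads the VLLM_ASCEND_-prefixed name:

export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_ASCEND_LLMDD_RPC_PORT=5559   # was: export VLLM_LLMDD_RPC_PORT=5559

vllm serve /models/deepseek_r1_w8a8 \
    --host 0.0.0.0 \
    ...   # remaining flags unchanged from the README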

requirements-dev.txt

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ openai
 pytest >= 6.0
 pytest-asyncio
 pytest-mock
-lm-eval==0.4.8
+lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
 types-jsonschema
 xgrammar
 zmq
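For reference, the pinned harness can be installed through the requirements file as usual, or pulled directly with pip's direct-reference syntax; a sketch, with the commit hash copied verbatim from the requirement line above:

# install all dev requirements, including the git-pinned lm-eval[api]
pip install -r requirements-dev.txt

# or install just the pinned harness
pip install "lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d"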

tests/e2e/multicard/test_offline_inference_distributed.py

Lines changed: 27 additions & 2 deletions
@@ -31,7 +31,9 @@

 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"

-QWEN_DENSE_MODELS = ["Qwen/QwQ-32B", "Qwen/Qwen-32B"]
+QWEN_DENSE_MODELS = [
+    "vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
+]


 def test_models_distributed_QwQ():
@@ -170,6 +172,29 @@ def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
             max_model_len=8192,
             enforce_eager=enforce_eager,
             dtype="auto",
-            tensor_parallel_size=4,
+            tensor_parallel_size=2,
+            quantization="ascend",
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+@pytest.mark.parametrize("enforce_eager", [True, False])
+@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
+def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(
+        model, enforce_eager):
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+
+    with VllmRunner(
+            snapshot_download(model),
+            max_model_len=8192,
+            enforce_eager=enforce_eager,
+            dtype="auto",
+            tensor_parallel_size=2,
+            quantization="ascend",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
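These are the two cases wired into vllm_ascend_test_full.yaml above; to run them locally, a sketch assuming a multi-NPU host, since both cases use tensor_parallel_size=2:

# new dense-model e2e cases added by this commit
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight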

tests/e2e/multicard/test_qwen3_moe.py

Lines changed: 0 additions & 1 deletion
@@ -66,7 +66,6 @@ def test_models_distributed_Qwen3_MOE_W8A8():
             max_model_len=8192,
             tensor_parallel_size=2,
             quantization="ascend",
-            enforce_eager=True,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)

tests/e2e/singlecard/ops/test_fused_moe.py

Lines changed: 82 additions & 0 deletions
@@ -29,6 +29,7 @@
 from vllm.model_executor.layers.activation import SiluAndMul

 from vllm_ascend.ops.moe.experts_selector import select_experts
+from vllm_ascend.ops.moe.moe_mlp import unified_apply_mlp
 from vllm_ascend.ops.moe.token_dispatcher import TokenDispatcherWithAllGather

 NUM_EXPERTS = [8, 64]
@@ -165,6 +166,87 @@ def test_token_dispatcher_with_all_gather(
     torch.npu.reset_peak_memory_stats()


+@pytest.mark.parametrize("m", [1, 33, 64])
+@pytest.mark.parametrize("n", [128, 1024, 2048])
+@pytest.mark.parametrize("k", [128, 511, 1024])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("ep_size", EP_SIZE)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("device", DEVICE)
+def test_token_dispatcher_with_all_gather_quant(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    ep_size: int,
+    dtype: torch.dtype,
+    device: str,
+):
+    context_mock = MagicMock()
+    context_mock.fused_moe_state = 0
+    with patch("vllm_ascend.ops.moe.moe_mlp.get_forward_context",
+               return_value=context_mock):
+        a = torch.randn((m, k), device=device, dtype=dtype) / 10
+        w1 = torch.randn((e, k, 2 * n), device=device, dtype=torch.int8)
+        w1_scale = torch.empty((e, 2 * n), device=device, dtype=dtype)
+        w2 = torch.randn((e, n, k), device=device, dtype=torch.int8)
+        w2_scale = torch.empty((e, k), device=device, dtype=dtype)
+
+        score = torch.randn((m, e), device=device, dtype=dtype)
+        expert_map = None
+        local_e = e
+
+        score = torch.softmax(score, dim=-1, dtype=dtype)
+        topk_weights, topk_ids = torch.topk(score, topk)
+        topk_ids = topk_ids.to(torch.int32)
+        row_idx = (torch.arange(
+            0,
+            m * topk,
+            device=device,
+            dtype=torch.int32,
+        ).view(topk, -1).permute(1, 0).contiguous())
+
+        dispatcher_kwargs = {
+            "num_experts": e,
+            "top_k": topk,
+            "num_local_experts": local_e,
+        }
+        dispatcher = TokenDispatcherWithAllGather(**dispatcher_kwargs)
+
+        apply_router_weight_on_input = False
+        dispatch_output = dispatcher.token_dispatch(
+            hidden_states=a,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            row_idx=row_idx,
+            expert_map=expert_map,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            with_quant=True)
+
+        sorted_hidden_states = dispatch_output["hidden_states"]
+        group_list = dispatch_output["group_list"]
+        group_list_type = dispatch_output.get("group_list_type", 1)
+        dynamic_scale = dispatch_output["dynamic_scale"]
+
+        expert_output = unified_apply_mlp(hidden_states=sorted_hidden_states,
+                                          w1=w1,
+                                          w1_scale=w1_scale,
+                                          w2=w2,
+                                          w2_scale=w2_scale,
+                                          group_list=group_list,
+                                          group_list_type=group_list_type,
+                                          dynamic_scale=dynamic_scale,
+                                          with_quant=True)
+        combined_output = dispatcher.token_combine(hidden_states=expert_output,
+                                                   bias=None)
+        assert combined_output.shape == (m, k)
+        gc.collect()
+        torch.npu.empty_cache()
+        torch.npu.reset_peak_memory_stats()
+
+
 @pytest.mark.parametrize("m", [1, 33, 64])
 @pytest.mark.parametrize("n", [128, 1024, 2048])
 @pytest.mark.parametrize("e", NUM_EXPERTS)
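A sketch for running only the new quantized dispatch/MLP case locally, assuming a single Ascend NPU as the singlecard directory implies:

pytest -sv tests/e2e/singlecard/ops/test_fused_moe.py::test_token_dispatcher_with_all_gather_quant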

tests/e2e/singlecard/test_ascend_scheduler.py

Lines changed: 23 additions & 0 deletions
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
+from vllm import SamplingParams

 from tests.e2e.conftest import VllmRunner
 from tests.e2e.model_utils import check_outputs_equal
@@ -86,3 +87,25 @@ def test_chunked_prefill_with_ascend_scheduler(
         name_0="vllm_output",
         name_1="chunked_prefill_output",
     )
+
+
+def test_async_scheduling() -> None:
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ] * 10
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=10,
+                                     stop_token_ids=None)
+
+    with VllmRunner(
+            "Qwen/Qwen2.5-0.5B-Instruct",
+            max_model_len=4096,
+            max_num_seqs=50,
+            dtype="bfloat16",
+            gpu_memory_utilization=0.9,
+            async_scheduling=True,
+    ) as vllm_model:
+        vllm_model.generate(prompts, sampling_params=sampling_params)
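Likewise, a sketch for exercising the new async-scheduling case on its own (single-card environment assumed):

pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py::test_async_scheduling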

tests/ut/distributed/test_distributed_tensor_parallel.py

Lines changed: 0 additions & 139 deletions
This file was deleted.
