
Commit 2e32824

Author: offline0806

Merge remote-tracking branch 'upstream_gitee/main' into main_eplb_0916

# Conflicts:
#   vllm_ascend/ops/common_fused_moe.py

2 parents: b44cca5 + 3e60aa5

22 files changed: +299 / -1903 lines

.github/workflows/format_pr_body.yaml

Lines changed: 1 addition & 1 deletion

@@ -49,7 +49,7 @@ jobs:
         uses: actions/checkout@ff7abcd0c3c05ccf6adc123a8cd1fd4fb30fb493 # v4.2.2

       - name: Set up Python
-        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0

       - name: Get vLLM release version
         run: |

.github/workflows/pre-commit.yml

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ jobs:
     steps:
       - name: Checkout vllm-project/vllm-ascend repo
         uses: actions/checkout@v4
-      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+      - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
        with:
          python-version: "3.11"
      - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"

.github/workflows/release_code.yml

Lines changed: 1 addition & 1 deletion

@@ -50,7 +50,7 @@ jobs:
          lscpu

       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
        with:
          python-version: ${{ matrix.python-version }}

.github/workflows/release_whl.yml

Lines changed: 1 addition & 1 deletion

@@ -73,7 +73,7 @@ jobs:

       - name: Set up Python ${{ matrix.python-version }}
         if: startsWith(github.ref, 'refs/tags/')
-        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
        with:
          python-version: ${{ matrix.python-version }}

tests/e2e/multicard/test_expert_parallel.py

Lines changed: 16 additions & 6 deletions

@@ -14,14 +14,24 @@ def test_e2e_ep_correctness(model_name):
     ]
     max_tokens = 5

-    with VllmRunner(model_name, tensor_parallel_size=2,
-                    enforce_eager=True) as vllm_model:
+    # FIXME: Really strange that chunked prefill might lead to different results, investigate further
+    with VllmRunner(
+            model_name,
+            tensor_parallel_size=2,
+            additional_config={"ascend_scheduler_config": {
+                "enabled": True
+            }},
+            enforce_eager=True) as vllm_model:
         tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(model_name,
-                    tensor_parallel_size=2,
-                    enable_expert_parallel=True,
-                    enforce_eager=True) as vllm_model:
+    with VllmRunner(
+            model_name,
+            tensor_parallel_size=2,
+            enable_expert_parallel=True,
+            additional_config={"ascend_scheduler_config": {
+                "enabled": True
+            }},
+            enforce_eager=True) as vllm_model:
         ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     check_outputs_equal(
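The added additional_config block pins both runs to the Ascend scheduler (sidestepping the chunked-prefill discrepancy noted in the FIXME), so the TP and EP outputs are compared under the same scheduling path. Below is a minimal sketch of the same setting outside the test harness; it assumes the vllm-ascend plugin is installed and that vLLM's LLM entry point forwards additional_config to the platform, and the model name is a placeholder:

# Hypothetical standalone usage, not part of this commit.
from vllm import LLM, SamplingParams

llm = LLM(
    model="some/moe-model",  # placeholder model name
    tensor_parallel_size=2,
    enforce_eager=True,
    additional_config={"ascend_scheduler_config": {"enabled": True}},
)
# Greedy decoding, mirroring generate_greedy() in the test.
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=5))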

tests/e2e/multicard/test_torchair_graph_mode.py

Lines changed: 3 additions & 0 deletions

@@ -22,6 +22,8 @@
 import os
 from typing import Dict

+import pytest
+
 from tests.e2e.conftest import VllmRunner

 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"

@@ -153,6 +155,7 @@ def _pangu_torchair_test_fixture(
         print(f"Generated text: {vllm_output[i][1]!r}")


+@pytest.mark.skip("skipping test_e2e_pangu_with_torchair")
 def test_e2e_pangu_with_torchair():
     additional_config = {
         "torchair_graph_config": {

tests/ut/models/conftest.py

Lines changed: 114 additions & 0 deletions (new file)

@@ -0,0 +1,114 @@
from types import SimpleNamespace
from unittest.mock import MagicMock, Mock, patch

import pytest
import torch
from transformers import PretrainedConfig
from vllm.config import CacheConfig, EPLBConfig, ParallelConfig
from vllm.distributed.parallel_state import GroupCoordinator


@pytest.fixture
def base_config():
    config = PretrainedConfig(
        hidden_size=128,
        num_attention_heads=8,
        num_hidden_layers=2,
        intermediate_size=256,
        hidden_act="silu",
        rms_norm_eps=1e-6,
        rope_theta=10000.0,
        max_position_embeddings=2048,
        n_routed_experts=4,
        n_shared_experts=1,
        moe_intermediate_size=256,
        num_experts_per_tok=2,
        routed_scaling_factor=1.0,
        first_k_dense_replace=0,
        moe_layer_freq=1,
        kv_lora_rank=16,
        qk_nope_head_dim=16,
        qk_rope_head_dim=16,
        v_head_dim=32,
        topk_method="noaux_tc",
        scoring_func="softmax",
        norm_topk_prob=True,
        n_group=1,
        topk_group=1,
        vocab_size=10000,
    )
    return config


@pytest.fixture
def vllm_config(base_config):
    model_config = SimpleNamespace(
        hf_config=base_config,
        tensor_parallel_size=1,
        dtype=torch.float32,
        use_mla=True,
        quant_config=None,
        max_model_len=2048,
    )
    parallel_config = MagicMock(spec=ParallelConfig)
    eplb_config = MagicMock(spec=EPLBConfig)
    eplb_config.num_redundant_experts = 0
    parallel_config.eplb_config = eplb_config

    cache_config = CacheConfig()
    vllm_config = Mock()
    vllm_config.model_config = model_config
    vllm_config.cache_config = cache_config
    vllm_config.quant_config = None
    vllm_config.parallel_config = parallel_config
    return vllm_config


@pytest.fixture
def mock_distributed():
    tp_group = Mock(spec=GroupCoordinator)
    tp_group.rank_in_group = 0
    tp_group.world_size = 1
    tp_group.device_group = Mock()

    dp_group = Mock(spec=GroupCoordinator)
    dp_group.rank_in_group = 0
    dp_group.world_size = 1

    ep_group = Mock(spec=GroupCoordinator)
    ep_group.rank_in_group = 0
    ep_group.world_size = 1
    ep_group.device_group = Mock()
    ep_group.device_group.rank.return_value = 0
    ep_group.device_group.size.return_value = 1

    pp_group = Mock(spec=GroupCoordinator)
    pp_group.rank_in_group = 0
    pp_group.world_size = 1

    mock_vllm_config = Mock()
    mock_vllm_config.scheduler_config = Mock(max_num_seqs=256)
    mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None)

    with patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_rank", return_value=0), \
         patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_world_size", return_value=1), \
         patch("vllm_ascend.models.deepseek_v2.get_tp_group", return_value=tp_group), \
         patch("vllm_ascend.models.deepseek_v2.get_pp_group", return_value=pp_group), \
         patch("vllm_ascend.models.deepseek_v2.get_pp_group",
               return_value=Mock(is_first_rank=False, is_last_rank=False)), \
         patch("vllm_ascend.ops.fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \
         patch("vllm_ascend.ops.moe.token_dispatcher.torch.distributed.get_rank", return_value=0), \
         patch("vllm_ascend.ops.moe.token_dispatcher.get_ascend_soc_version", return_value=None), \
         patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group,
                    _PP=pp_group), \
         patch.dict("vllm_ascend.distributed.parallel_state.__dict__", _MC2=ep_group), \
         patch("torch.npu.current_device", return_value=0):
        yield


@pytest.fixture
def mock_forward_context():
    forward_context = Mock(in_profile_run=False, with_prefill=False)
    with patch("vllm_ascend.models.deepseek_v2.get_forward_context",
               return_value=forward_context):
        yield
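Because these fixtures live in tests/ut/models/conftest.py, pytest injects them by name into any test in that package. A minimal sketch of how a unit test might request them (hypothetical test, not part of this commit; the asserted values come straight from the fixture definitions above):

# Hypothetical consumer of the shared fixtures.
def test_fixture_wiring(vllm_config, mock_distributed, mock_forward_context):
    # base_config() feeds the small DeepSeek-style HF config into vllm_config.
    assert vllm_config.model_config.hf_config.hidden_size == 128
    # parallel_config is a MagicMock carrying a concrete eplb_config.
    assert vllm_config.parallel_config.eplb_config.num_redundant_experts == 0
    # mock_distributed / mock_forward_context patch global parallel and forward
    # state for the duration of the test body, so no NPU or process group is needed.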

tests/ut/models/test_deepseek_mtp.py

Lines changed: 7 additions & 4 deletions

@@ -13,10 +13,13 @@
 class TestCustomDeepSeekMultiTokenPredictorLayer(PytestBase):

     @pytest.fixture
-    def setup_mtp_layer(self, mocker: MockerFixture):
+    def setup_mtp_layer(self, mocker: MockerFixture, vllm_config: VllmConfig,
+                        mock_distributed):
         config = PretrainedConfig(vocab_size=1000,
                                   hidden_size=768,
                                   rms_norm_eps=1e-5)
+        mocker.patch("vllm_ascend.models.deepseek_mtp.get_current_vllm_config",
+                     return_value=vllm_config)
         mocker.patch(
             "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
             return_value=None)
@@ -29,15 +32,15 @@ def setup_mtp_layer(self, mocker: MockerFixture):
             "vllm_ascend.models.deepseek_mtp.CustomDeepSeekShareHead.__init__",
             return_value=None)
         mocker_deepseek_v2_decode_layer = mocker.patch(
-            "vllm_ascend.models.deepseek_v2.CustomDeepseekV2DecoderLayer.__init__",
+            "vllm.model_executor.models.deepseek_v2.DeepseekV2DecoderLayer.__init__",
             return_value=None)
         mocker.patch(
             "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
             return_value=None)
-        mocker.patch("vllm_ascend.utils.get_ascend_config",
+        mocker.patch("vllm_ascend.models.deepseek_v2.get_ascend_config",
                      return_value=mocker.Mock())

-        mtp_layer = CustomDeepSeekMultiTokenPredictorLayer(config, "", None)
+        mtp_layer = CustomDeepSeekMultiTokenPredictorLayer(config, "0", None)
         mocker_deepseek_v2_decode_layer.assert_called_once()
         return mtp_layer
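The updated fixture signature annotates vllm_config with VllmConfig, and that type is not imported anywhere in this hunk; presumably the module already has (or gains) something like the line below. A hedged note, assuming vLLM's usual location for the type:

# Assumed import for the VllmConfig annotation (not visible in this hunk).
from vllm.config import VllmConfig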
