
Commit f47716a

[Dist][EP] Remove ETP/EP maintained in vllm-ascend
Signed-off-by: MengqingCao <cmq0113@163.com>
1 parent: ef99fe1

File tree

21 files changed: +65 -409 lines
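This commit removes the expert/expert-tensor parallelism (ETP/EP) plumbing that vllm-ascend maintained itself: the `expert_tensor_parallel_size` knob in `additional_config`, its parsing in `vllm_ascend/ascend_config.py`, and the custom parallel-state code covered by the now-deleted `tests/ut/distributed/test_parallel_state.py`. Expert parallelism is now driven by vLLM's native `enable_expert_parallel` option. A minimal before/after sketch of the user-facing change (the model path is a placeholder):

```python
from vllm import LLM

# Before this commit (no longer supported): expert parallelism was
# configured through vllm-ascend's own additional_config knob.
# llm = LLM(model="/path/to/moe-model",
#           tensor_parallel_size=4,
#           additional_config={"expert_tensor_parallel_size": 1})

# After this commit: use vLLM's built-in expert-parallel option,
# exactly as the updated tutorial below does.
llm = LLM(model="/path/to/moe-model",
          tensor_parallel_size=4,
          enable_expert_parallel=True)
```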

docs/source/tutorials/multi_npu_moge.md
Lines changed: 2 additions & 0 deletions

@@ -48,6 +48,7 @@ Run the following script to start the vLLM server on Multi-NPU:
 ```bash
 vllm serve /path/to/pangu-pro-moe-model \
 --tensor-parallel-size 4 \
+--enable-expert-parallel \
 --trust-remote-code \
 --enforce-eager
 ```

@@ -113,6 +114,7 @@ if __name__ == "__main__":
 
 llm = LLM(model="/path/to/pangu-pro-moe-model",
           tensor_parallel_size=4,
+          enable_expert_parallel=True,
           distributed_executor_backend="mp",
           max_model_len=1024,
           trust_remote_code=True,

docs/source/user_guide/configuration/additional_config.md
Lines changed: 0 additions & 2 deletions

@@ -28,7 +28,6 @@ The following table lists the additional configuration options available in vLLM
 |-------------------------------| ---- |------|-----------------------------------------------------------------------------------------------|
 | `torchair_graph_config` | dict | `{}` | The config options for torchair graph mode |
 | `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
-| `expert_tensor_parallel_size` | str | `0` | Expert tensor parallel size the model to use. |
 | `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf or ut/e2e test case. |
 | `expert_map_path` | str | `None` | When using expert load balancing for the MOE model, an expert map path needs to be passed in. |
 | `chunked_prefill_for_mla` | bool | `False` | Whether to enable the fused operator-like chunked_prefill. |

@@ -75,7 +74,6 @@ An example of additional configuration is as follows:
         "enabled": True,
         "enable_chunked_prefill": True,
     },
-    "expert_tensor_parallel_size": 1,
     "refresh": False,
 }
 ```
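For illustration, a hedged sketch of how a post-change configuration might look: the `additional_config` keys below come from the table above, the values are made up for the example, and expert parallelism moves to the top-level `enable_expert_parallel` argument:

```python
from vllm import LLM

# Values are illustrative; only the keys still documented above remain
# valid in additional_config after this commit.
llm = LLM(model="/path/to/moe-model",
          tensor_parallel_size=4,
          enable_expert_parallel=True,  # replaces "expert_tensor_parallel_size"
          additional_config={
              "torchair_graph_config": {"enabled": False},
              "ascend_scheduler_config": {"enabled": True},
              "refresh": False,
          })
```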

tests/e2e/multicard/test_ep.py
Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+import pytest
+
+from tests.conftest import VllmRunner
+from tests.model_utils import check_outputs_equal
+
+
+@pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
+def test_e2e_ep_correctness(model_name):
+    example_prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    max_tokens = 5
+
+    with VllmRunner(model_name, tensor_parallel_size=2) as vllm_model:
+        tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    with VllmRunner(model_name,
+                    tensor_parallel_size=2,
+                    enable_expert_parallel=True) as vllm_model:
+        ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=ep_output,
+        outputs_1_lst=tp_output,
+        name_0="ep_output",
+        name_1="tp_output",
+    )
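The new test runs the same greedy decode twice, TP-only and TP+EP, and requires token-identical outputs; greedy sampling is what makes the two runs directly comparable. As a rough, hypothetical sketch of the equality check (the real `check_outputs_equal` lives in `tests/model_utils.py` and may differ in details such as mismatch reporting):

```python
from typing import List, Tuple

# Hypothetical re-implementation for illustration only. Each output is
# assumed to be a (token_ids, text) pair as returned by generate_greedy.
def check_outputs_equal(outputs_0_lst: List[Tuple[List[int], str]],
                        outputs_1_lst: List[Tuple[List[int], str]],
                        name_0: str, name_1: str) -> None:
    assert len(outputs_0_lst) == len(outputs_1_lst)
    for i, (out_0, out_1) in enumerate(zip(outputs_0_lst, outputs_1_lst)):
        ids_0, text_0 = out_0
        ids_1, text_1 = out_1
        # Token-level equality implies string equality for greedy decoding.
        assert ids_0 == ids_1, (
            f"prompt {i}: {name_0} != {name_1}: {text_0!r} vs {text_1!r}")
```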

tests/e2e/multicard/test_fused_moe_allgather_ep.py
Lines changed: 0 additions & 2 deletions

@@ -50,7 +50,6 @@ def test_generate_with_allgather():
             "enabled": True,
             "chunked_prefill_enabled": False,
         },
-        "expert_tensor_parallel_size": 1
     }) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
 

@@ -74,6 +73,5 @@ def test_generate_with_alltoall():
             "enabled": True,
             "chunked_prefill_enabled": False,
         },
-        "expert_tensor_parallel_size": 1
     }) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)

tests/e2e/multicard/test_torchair_graph_mode.py
Lines changed: 1 addition & 0 deletions

@@ -123,6 +123,7 @@ def _pangu_torchair_test_fixture(
             distributed_executor_backend="mp",
             enforce_eager=False,
             additional_config=additional_config,
+            enable_expert_parallel=True,
     ) as vllm_model:
         # use greedy sampler to make sure the generated results are fix
         vllm_output = vllm_model.generate_greedy(example_prompts, 5)

tests/ut/distributed/test_parallel_state.py

Lines changed: 0 additions & 208 deletions
This file was deleted.

tests/ut/test_ascend_config.py
Lines changed: 0 additions & 3 deletions

@@ -42,7 +42,6 @@ def test_init_ascend_config_without_additional_config(self):
         test_vllm_config = VllmConfig()
         # No additional config given, check the default value here.
         ascend_config = init_ascend_config(test_vllm_config)
-        self.assertEqual(ascend_config.expert_tensor_parallel_size, 0)
         self.assertIsNone(ascend_config.expert_map_path)
 
         torchair_graph_config = ascend_config.torchair_graph_config

@@ -75,12 +74,10 @@ def test_init_ascend_config_with_additional_config(self):
             "ascend_scheduler_config": {
                 "enabled": True
             },
-            "expert_tensor_parallel_size": 1,
             "expert_map_path": "test_expert_map_path",
             "refresh": True
         }
         ascend_config = init_ascend_config(test_vllm_config)
-        self.assertEqual(ascend_config.expert_tensor_parallel_size, 1)
         self.assertEqual(ascend_config.expert_map_path, "test_expert_map_path")
 
         torchair_graph_config = ascend_config.torchair_graph_config

tests/ut/test_platform.py
Lines changed: 0 additions & 25 deletions

@@ -28,7 +28,6 @@ def setUp(self):
         self.mock_vllm_config.speculative_config = None
 
         self.mock_ascend_config = MagicMock()
-        self.mock_ascend_config.expert_tensor_parallel_size = 0
         self.mock_ascend_config.torchair_graph_config.enabled = False
         self.mock_ascend_config.ascend_scheduler_config.enabled = False
 

@@ -253,30 +252,6 @@ def test_check_and_update_config_basic_config_update(
         mock_init_ascend.assert_called_once_with(self.mock_vllm_config)
         mock_check_ascend.assert_called_once()
 
-    @patch("vllm_ascend.utils.is_310p", return_value=False)
-    @patch("vllm_ascend.ascend_config.check_ascend_config")
-    @patch("vllm_ascend.ascend_config.init_ascend_config")
-    def test_check_and_update_config_expert_parallel_enabled(
-            self, mock_init_ascend, mock_check_ascend, mock_is_310p):
-        mock_init_ascend.return_value = self.mock_ascend_config
-        self.mock_vllm_config.parallel_config.enable_expert_parallel = True
-        self.mock_vllm_config.parallel_config.tensor_parallel_size = 2
-        self.mock_vllm_config.parallel_config.world_size_across_dp = 4
-
-        from vllm_ascend import platform
-
-        importlib.reload(platform)
-
-        self.platform.check_and_update_config(self.mock_vllm_config)
-
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.expert_tensor_parallel_size,
-            1)
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.expert_parallel_size,
-            self.mock_vllm_config.parallel_config.world_size_across_dp,
-        )
-
     @patch("vllm_ascend.utils.is_310p", return_value=False)
     @patch("vllm_ascend.ascend_config.check_ascend_config")
     @patch("vllm_ascend.ascend_config.init_ascend_config")

vllm_ascend/ascend_config.py
Lines changed: 0 additions & 2 deletions

@@ -44,8 +44,6 @@ def __init__(self, vllm_config):
         self.ascend_scheduler_config = AscendSchedulerConfig(
             ascend_scheduler_config)
 
-        self.expert_tensor_parallel_size = int(
-            additional_config.get("expert_tensor_parallel_size", 0))
         self.expert_map_path = additional_config.get("expert_map_path", None)
         self.chunked_prefill_for_mla = additional_config.get(
             "chunked_prefill_for_mla", False)
