
Commit 8cfd257

[Dist][EP] Remove ETP/EP maintained in vllm-ascend (#1681)

### What this PR does / why we need it?
Remove the ETP/EP implementation maintained in branch main. We drop it because there are no relevant scenarios that use ETP now; we may subsequently advocate implementing expert tensor parallelism in vLLM itself to support scenarios where experts need to be sliced.

This is part of the #1422 backport.

Fixes #1396 #1154

### Does this PR introduce _any_ user-facing change?
ETP/EP will no longer be maintained in vllm-ascend; use the TP/EP support in vLLM instead.

### How was this patch tested?
CI passed with the newly added and existing tests.

- vLLM version: v0.9.2
- vLLM main: vllm-project/vllm@fe8a2c5

Signed-off-by: MengqingCao <cmq0113@163.com>

1 parent a8b316a · commit 8cfd257
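For users migrating off the removed option, the change amounts to dropping `expert_tensor_parallel_size` from `--additional-config` and relying on vLLM's built-in flags. A minimal migration sketch, with the model path and sizes taken from the tutorial change below (the commented "before" form shows the removed vllm-ascend-specific option, not a supported invocation):

```bash
# Before (vllm-ascend specific, removed in this commit):
#   vllm serve /path/to/pangu-pro-moe-model \
#     --tensor-parallel-size 4 \
#     --additional-config '{"expert_tensor_parallel_size": "4"}'

# After: use vLLM's native tensor + expert parallelism.
vllm serve /path/to/pangu-pro-moe-model \
  --tensor-parallel-size 4 \
  --enable-expert-parallel \
  --trust-remote-code \
  --enforce-eager
```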

24 files changed (+66, -548 lines)

docs/source/tutorials/multi_npu_moge.md

Lines changed: 2 additions & 0 deletions

@@ -48,6 +48,7 @@ Run the following script to start the vLLM server on Multi-NPU:
 ```bash
 vllm serve /path/to/pangu-pro-moe-model \
 --tensor-parallel-size 4 \
+--enable-expert-parallel \
 --trust-remote-code \
 --enforce-eager
 ```
@@ -145,6 +146,7 @@ if __name__ == "__main__":

 llm = LLM(model="/path/to/pangu-pro-moe-model",
           tensor_parallel_size=4,
+          enable_expert_parallel=True,
           distributed_executor_backend="mp",
           max_model_len=1024,
           trust_remote_code=True,

docs/source/user_guide/configuration/additional_config.md

Lines changed: 0 additions & 2 deletions

@@ -28,7 +28,6 @@ The following table lists the additional configuration options available in vLLM
 |-------------------------------| ---- |------|-----------------------------------------------------------------------------------------------|
 | `torchair_graph_config` | dict | `{}` | The config options for torchair graph mode |
 | `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
-| `expert_tensor_parallel_size` | str | `0` | Expert tensor parallel size the model to use. |
 | `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf or ut/e2e test case. |
 | `expert_map_path` | str | `None` | When using expert load balancing for the MOE model, an expert map path needs to be passed in. |
 | `chunked_prefill_for_mla` | bool | `False` | Whether to enable the fused operator-like chunked_prefill. |
@@ -75,7 +74,6 @@ An example of additional configuration is as follows:
         "enabled": True,
         "enable_chunked_prefill": True,
     },
-    "expert_tensor_parallel_size": 1,
     "refresh": False,
 }
 ```

examples/run_dp_attention_etp16.sh

Lines changed: 0 additions & 22 deletions
This file was deleted.

examples/run_dp_attention_etp16_benmark.sh

Lines changed: 0 additions & 57 deletions
This file was deleted.

tests/e2e/long_term/accuracy/accuracy_multicard.py

Lines changed: 1 addition & 60 deletions

@@ -36,7 +36,7 @@

 # pre-trained model path on Hugging Face.
 # Qwen/Qwen2.5-0.5B-Instruct: accuracy test for DP.
-# Qwen/Qwen3-30B-A3B: accuracy test for EP and ETP.
+# Qwen/Qwen3-30B-A3B: accuracy test for EP.
 # deepseek-ai/DeepSeek-V2-Lite: accuracy test for TP.
 MODEL_NAME = ["Qwen/Qwen3-30B-A3B", "deepseek-ai/DeepSeek-V2-Lite"]

@@ -200,62 +200,3 @@ def test_lm_eval_accuracy_dp(model, max_tokens):
     except subprocess.TimeoutExpired:
         server_proc.kill()
         server_proc.wait()
-
-
-@pytest.mark.parametrize("max_tokens", [10])
-@pytest.mark.parametrize("model", ["Qwen/Qwen3-30B-A3B"])
-def test_lm_eval_accuracy_etp(model, max_tokens):
-    log_file = open("accuracy_etp.log", "a+")
-    cmd = [
-        "vllm", "serve", model, "--max_model_len", "4096",
-        "--tensor_parallel_size", "4", "--enforce_eager",
-        "--enable_expert_parallel", "--additional_config",
-        '{"expert_tensor_parallel_size": "4"}'
-    ]
-    server_proc = subprocess.Popen(cmd,
-                                   stdout=log_file,
-                                   stderr=subprocess.DEVNULL)
-
-    try:
-        for _ in range(300):
-            try:
-                r = requests.get(HEALTH_URL, timeout=1)
-                if r.status_code == 200:
-                    break
-            except requests.exceptions.RequestException:
-                pass
-            time.sleep(1)
-        else:
-            log_file.flush()
-            log_file.seek(0)
-            log_content = log_file.read()
-            pytest.fail(
-                f"vLLM serve did not become healthy after 300s: {HEALTH_URL}\n"
-                f"==== vLLM Serve Log Start ===\n{log_content}\n==== vLLM Serve Log End ==="
-            )
-
-        prompt = "bejing is a"
-        payload = {
-            "prompt": prompt,
-            "max_tokens": max_tokens,
-            "sampling_params": {
-                "temperature": 0.0,
-                "top_p": 1.0,
-                "seed": 123
-            }
-        }
-        resp = requests.post(COMPLETIONS_URL, json=payload, timeout=30)
-        resp.raise_for_status()
-        data = resp.json()
-
-        generated = data["choices"][0]["text"].strip()
-        expected = "city in china. it is the capital city of"
-        assert generated == expected, f"Expected `{expected}`, got `{generated}`"
-
-    finally:
-        server_proc.send_signal(signal.SIGINT)
-        try:
-            server_proc.wait(timeout=10)
-        except subprocess.TimeoutExpired:
-            server_proc.kill()
-            server_proc.wait()
Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+import pytest
+
+from tests.e2e.conftest import VllmRunner
+from tests.e2e.model_utils import check_outputs_equal
+
+
+@pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
+def test_e2e_ep_correctness(model_name):
+    example_prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    max_tokens = 5
+
+    with VllmRunner(model_name, tensor_parallel_size=2) as vllm_model:
+        tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    with VllmRunner(model_name,
+                    tensor_parallel_size=2,
+                    enable_expert_parallel=True) as vllm_model:
+        ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=ep_output,
+        outputs_1_lst=tp_output,
+        name_0="ep_output",
+        name_1="tp_output",
+    )
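The new test asserts that greedy outputs with expert parallelism enabled match a pure tensor-parallel baseline, effectively replacing the removed ETP accuracy test. Assuming the file lives under `tests/e2e/multicard/` (its path is not shown in this view), it could be run on a multi-NPU host along these lines:

```bash
# Illustrative only: substitute the actual test file name from the commit.
pytest -sv tests/e2e/multicard/<new_ep_test_file>.py -k test_e2e_ep_correctness
```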

tests/e2e/multicard/test_fused_moe_allgather_ep.py

Lines changed: 0 additions & 2 deletions

@@ -50,7 +50,6 @@ def test_generate_with_allgather():
                 "enabled": True,
                 "chunked_prefill_enabled": False,
             },
-            "expert_tensor_parallel_size": 1
         }) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)

@@ -74,6 +73,5 @@ def test_generate_with_alltoall():
                 "enabled": True,
                 "chunked_prefill_enabled": False,
             },
-            "expert_tensor_parallel_size": 1
         }) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)

tests/e2e/multicard/test_torchair_graph_mode.py

Lines changed: 1 addition & 0 deletions

@@ -123,6 +123,7 @@ def _pangu_torchair_test_fixture(
             distributed_executor_backend="mp",
             enforce_eager=False,
             additional_config=additional_config,
+            enable_expert_parallel=True,
     ) as vllm_model:
         # use greedy sampler to make sure the generated results are fix
         vllm_output = vllm_model.generate_greedy(example_prompts, 5)
