vllm-project · wangxiyuan · Jul 21, 2025 · Jun 24, 2025 · MengqingCao · Jul 11, 2025
diff --git a/docs/source/tutorials/multi_npu_moge.md b/docs/source/tutorials/multi_npu_moge.md
@@ -48,6 +48,7 @@ Run the following script to start the vLLM server on Multi-NPU:
 ```bash
 vllm serve /path/to/pangu-pro-moe-model \
 --tensor-parallel-size 4 \
+--enable-expert-parallel \
 --trust-remote-code \
 --enforce-eager
 ```
@@ -145,6 +146,7 @@ if __name__ == "__main__":
 
     llm = LLM(model="/path/to/pangu-pro-moe-model",
             tensor_parallel_size=4,
+            enable_expert_parallel=True,
             distributed_executor_backend="mp",
             max_model_len=1024,
             trust_remote_code=True,

diff --git a/docs/source/user_guide/configuration/additional_config.md b/docs/source/user_guide/configuration/additional_config.md
@@ -28,7 +28,6 @@ The following table lists the additional configuration options available in vLLM
 |-------------------------------| ---- |------|-----------------------------------------------------------------------------------------------|
 | `torchair_graph_config`       | dict | `{}` | The config options for torchair graph mode                                                    |
 | `ascend_scheduler_config`     | dict | `{}` | The config options for ascend scheduler                                                       |
-| `expert_tensor_parallel_size` | str  | `0`  | Expert tensor parallel size the model to use.                                                 |
 | `refresh`                     | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf or ut/e2e test case.     |
 | `expert_map_path`             | str  | `None` | When using expert load balancing for the MOE model, an expert map path needs to be passed in. |
 | `chunked_prefill_for_mla`     | bool | `False` | Whether to enable the fused operator-like chunked_prefill. |
@@ -75,7 +74,6 @@ An example of additional configuration is as follows:
         "enabled": True,
         "enable_chunked_prefill": True,
     },
-    "expert_tensor_parallel_size": 1,
     "refresh": False,
 }
 ```
diff --git a/examples/run_dp_attention_etp16.sh b/examples/run_dp_attention_etp16.sh
diff --git a/examples/run_dp_attention_etp16_benmark.sh b/examples/run_dp_attention_etp16_benmark.sh
diff --git a/tests/e2e/long_term/accuracy/accuracy_multicard.py b/tests/e2e/long_term/accuracy/accuracy_multicard.py
@@ -36,7 +36,7 @@
 
 # pre-trained model path on Hugging Face.
 # Qwen/Qwen2.5-0.5B-Instruct: accuracy test for DP.
-# Qwen/Qwen3-30B-A3B: accuracy test for EP and ETP.
+# Qwen/Qwen3-30B-A3B: accuracy test for EP.
 # deepseek-ai/DeepSeek-V2-Lite: accuracy test for TP.
 MODEL_NAME = ["Qwen/Qwen3-30B-A3B", "deepseek-ai/DeepSeek-V2-Lite"]
 
@@ -200,62 +200,3 @@ def test_lm_eval_accuracy_dp(model, max_tokens):
         except subprocess.TimeoutExpired:
             server_proc.kill()
             server_proc.wait()
-
-
-@pytest.mark.parametrize("max_tokens", [10])
-@pytest.mark.parametrize("model", ["Qwen/Qwen3-30B-A3B"])
-def test_lm_eval_accuracy_etp(model, max_tokens):
-    log_file = open("accuracy_etp.log", "a+")
-    cmd = [
-        "vllm", "serve", model, "--max_model_len", "4096",
-        "--tensor_parallel_size", "4", "--enforce_eager",
-        "--enable_expert_parallel", "--additional_config",
-        '{"expert_tensor_parallel_size": "4"}'
-    ]
-    server_proc = subprocess.Popen(cmd,
-                                   stdout=log_file,
-                                   stderr=subprocess.DEVNULL)
-
-    try:
-        for _ in range(300):
-            try:
-                r = requests.get(HEALTH_URL, timeout=1)
-                if r.status_code == 200:
-                    break
-            except requests.exceptions.RequestException:
-                pass
-            time.sleep(1)
-        else:
-            log_file.flush()
-            log_file.seek(0)
-            log_content = log_file.read()
-            pytest.fail(
-                f"vLLM serve did not become healthy after 300s: {HEALTH_URL}\n"
-                f"==== vLLM Serve Log Start ===\n{log_content}\n==== vLLM Serve Log End ==="
-            )
-
-        prompt = "bejing is a"
-        payload = {
-            "prompt": prompt,
-            "max_tokens": max_tokens,
-            "sampling_params": {
-                "temperature": 0.0,
-                "top_p": 1.0,
-                "seed": 123
-            }
-        }
-        resp = requests.post(COMPLETIONS_URL, json=payload, timeout=30)
-        resp.raise_for_status()
-        data = resp.json()
-
-        generated = data["choices"][0]["text"].strip()
-        expected = "city in china. it is the capital city of"
-        assert generated == expected, f"Expected `{expected}`, got `{generated}`"
-
-    finally:
-        server_proc.send_signal(signal.SIGINT)
-        try:
-            server_proc.wait(timeout=10)
-        except subprocess.TimeoutExpired:
-            server_proc.kill()
-            server_proc.wait()
diff --git a/tests/e2e/multicard/test_expert_parallel.py b/tests/e2e/multicard/test_expert_parallel.py
@@ -0,0 +1,30 @@
+import pytest
+
+from tests.e2e.conftest import VllmRunner
+from tests.e2e.model_utils import check_outputs_equal
+
+
+@pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
+def test_e2e_ep_correctness(model_name):  # compare greedy output, EP vs. no EP flag
+    example_prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    max_tokens = 5  # passed unchanged to both generate_greedy calls below
+
+    with VllmRunner(model_name, tensor_parallel_size=2) as vllm_model:
+        tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    with VllmRunner(model_name,
+                    tensor_parallel_size=2,
+                    enable_expert_parallel=True) as vllm_model:  # EP run
+        ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(  # same model/prompts/TP size; only the EP flag differs
+        outputs_0_lst=ep_output,
+        outputs_1_lst=tp_output,
+        name_0="ep_output",
+        name_1="tp_output",
+    )
diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -50,7 +50,6 @@ def test_generate_with_allgather():
                             "enabled": True,
                             "chunked_prefill_enabled": False,
                         },
-                        "expert_tensor_parallel_size": 1
                     }) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
 
@@ -74,6 +73,5 @@ def test_generate_with_alltoall():
                             "enabled": True,
                             "chunked_prefill_enabled": False,
                         },
-                        "expert_tensor_parallel_size": 1
                     }) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py
@@ -123,6 +123,7 @@ def _pangu_torchair_test_fixture(
             distributed_executor_backend="mp",
             enforce_eager=False,
             additional_config=additional_config,
+            enable_expert_parallel=True,
     ) as vllm_model:
         # use greedy sampler to make sure the generated results are fix
         vllm_output = vllm_model.generate_greedy(example_prompts, 5)