Commit e99edf9

support pipeline parallel in V1 engine
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
1 parent: 509548f

File tree

1 file changed: +8 -24 lines

tests/e2e/multicard/test_pipeline_parallel.py

Lines changed: 8 additions & 24 deletions
@@ -15,11 +15,10 @@
 # This file is a part of the vllm-ascend project.
 #
 import pytest
-import torch
-from vllm import LLM, SamplingParams
+from tests.conftest import VllmRunner
 
 MODELS = [
-    "deepseek-ai/DeepSeek-V2-Lite",
+    "Qwen/Qwen3-0.6B",
 ]
 
 TENSOR_PARALLELS = [2]
@@ -35,24 +34,9 @@
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
 @pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
 def test_models(model: str, tp_size: int, pp_size: int) -> None:
-    # Create an LLM.
-    llm = LLM(
-        model=model,
-        tensor_parallel_size=tp_size,
-        pipeline_parallel_size=pp_size,
-        trust_remote_code=True,
-        enforce_eager=True,
-    )
-    # Prepare sampling_parames
-    sampling_params = SamplingParams(
-        max_tokens=64,
-        temperature=0,
-        ignore_eos=True,
-    )
-
-    # Generate texts from the prompts.
-    # The output is a list of RequestOutput objects
-    outputs = llm.generate(prompts, sampling_params)
-    torch.npu.synchronize()
-    # The output length should be equal to prompts length.
-    assert len(outputs) == len(prompts)
+    with VllmRunner(model,
+                    tensor_parallel_size=tp_size,
+                    pipeline_parallel_size=pp_size,
+                    enforce_eager=True,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_model.generate_greedy(prompts, 64)
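
For context, below is a rough sketch of what tests/e2e/multicard/test_pipeline_parallel.py could look like as a whole after this commit. The prompts list, the PIPELINE_PARALLELS value, and the parametrization over MODELS are defined outside the hunks shown above, so the values used for them here are illustrative assumptions rather than the file's actual contents.

import pytest

from tests.conftest import VllmRunner

MODELS = [
    "Qwen/Qwen3-0.6B",
]

TENSOR_PARALLELS = [2]
PIPELINE_PARALLELS = [2]  # assumed value; defined outside the diff context

# Assumed example prompts; the real list also lives outside the diff context.
prompts = [
    "Hello, my name is",
    "The future of AI is",
]


@pytest.mark.parametrize("model", MODELS)  # assumed; implied by the model argument
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
def test_models(model: str, tp_size: int, pp_size: int) -> None:
    # VllmRunner (from tests/conftest.py) wraps engine construction and teardown
    # in a context manager, so the test no longer builds LLM/SamplingParams directly.
    with VllmRunner(model,
                    tensor_parallel_size=tp_size,
                    pipeline_parallel_size=pp_size,
                    enforce_eager=True,
                    gpu_memory_utilization=0.7) as vllm_model:
        # Greedy decoding of up to 64 new tokens per prompt.
        vllm_model.generate_greedy(prompts, 64)

Assuming a multi-NPU host with vllm-ascend installed, the test would typically be invoked with something like pytest -sv tests/e2e/multicard/test_pipeline_parallel.py.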
