vllm-project · wangxiyuan · Jul 15, 2025 · Jul 11, 2025
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
@@ -41,16 +41,10 @@ concurrency:
 
 jobs:
   lint:
-    # Only trigger lint on pull request
-    if: ${{ github.event_name == 'pull_request' }}
     uses: ./.github/workflows/pre-commit.yml
 
   changes:
-    # Only trigger changes on pull request
-    if: ${{ github.event_name == 'pull_request' }}
     runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read
     outputs:
       e2e_tracker: ${{ steps.filter.outputs.e2e_tracker }}
       ut_tracker: ${{ steps.filter.outputs.ut_tracker }}
@@ -60,20 +54,24 @@ jobs:
       with:
         filters: |
           e2e_tracker:
+            - '.github/workflows/vllm_ascend_test.yaml'
             - 'vllm_ascend/**'
             - 'csrc/**'
             - 'cmake/**'
             - 'tests/e2e/**'
-            - 'tests/conftest.py'
-            - 'tests/model_utils.py'
-            - 'tests/utils.py'
+            - 'CMakeLists.txt'
+            - 'setup.py'
+            - 'requirements.txt'
+            - 'requirements-dev.txt'
+            - 'requirements-lint.txt'
+            - 'packages.txt'
           ut_tracker:
             - 'tests/ut/**'
   ut:
     needs: [lint, changes]
     name: unit test
-    # only trigger unit test after lint passed and the change is e2e and ut related. Or the PR is merged.
-    if: ${{ github.event_name == 'push' || (needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true')) }}
+    # Only trigger unit tests after lint has passed and the change is e2e or ut related.
+    if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     runs-on: ubuntu-latest
     container:
       image: quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
@@ -112,9 +110,8 @@ jobs:
           python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
           python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/
 
-      - name: Run unit test for V1 Engine
+      - name: Run unit test
         env:
-          VLLM_USE_V1: 1
           VLLM_WORKER_MULTIPROC_METHOD: spawn
           TORCH_DEVICE_BACKEND_AUTOLOAD: 0
         run: |
@@ -133,8 +130,8 @@ jobs:
 
   e2e:
     needs: [lint, changes]
-    # only trigger e2e test after lint passed and the change is e2e related.
-    if: ${{ needs.lint.result == 'success' && needs.changes.outputs.e2e_tracker == 'true' }}
+    # Only trigger e2e tests on pull requests, after lint has passed and the change is e2e related.
+    if: ${{ github.event_name == 'pull_request' && needs.lint.result == 'success' && needs.changes.outputs.e2e_tracker == 'true' }}
     strategy:
       max-parallel: 2
       matrix:
@@ -189,9 +186,8 @@ jobs:
           pip install -r requirements-dev.txt
           pip install -v -e .
 
-      - name: Run e2e test for V1 Engine
+      - name: Run e2e test
         env:
-          VLLM_USE_V1: 1
           VLLM_WORKER_MULTIPROC_METHOD: spawn
           VLLM_USE_MODELSCOPE: True
         run: |
@@ -213,26 +209,6 @@ jobs:
           # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
           VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
 
-      - name: Run e2e test on V0 engine
-        if: ${{ github.event_name == 'schedule' }}
-        env:
-          VLLM_USE_V1: 0
-          VLLM_USE_MODELSCOPE: True
-        run: |
-          pytest -sv tests/e2e/singlecard/test_offline_inference.py
-          pytest -sv tests/e2e/singlecard/test_ilama_lora.py
-          pytest -sv tests/e2e/singlecard/test_guided_decoding.py
-          pytest -sv tests/e2e/singlecard/test_camem.py
-          pytest -sv tests/e2e/singlecard/test_prompt_embedding.py
-          pytest -sv tests/e2e/singlecard/test_embedding.py
-          pytest -sv tests/e2e/singlecard/ \
-            --ignore=tests/e2e/singlecard/test_offline_inference.py \
-            --ignore=tests/e2e/singlecard/test_ilama_lora.py \
-            --ignore=tests/e2e/singlecard/test_guided_decoding.py \
-            --ignore=tests/e2e/singlecard/test_camem.py \
-            --ignore=tests/e2e/singlecard/test_prompt_embedding.py \
-            --ignore=tests/e2e/singlecard/test_embedding.py
-
   e2e-4-cards:
     needs: [e2e]
     if: ${{ needs.e2e.result == 'success' }}
@@ -290,9 +266,8 @@ jobs:
           pip install -r requirements-dev.txt
           pip install -v -e .
 
-      - name: Run vllm-project/vllm-ascend test for V1 Engine
+      - name: Run vllm-project/vllm-ascend test
         env:
-          VLLM_USE_V1: 1
           VLLM_WORKER_MULTIPROC_METHOD: spawn
           VLLM_USE_MODELSCOPE: True
         run: |
@@ -308,19 +283,3 @@ jobs:
           pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
             --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
             --ignore=tests/e2e/multicard/test_data_parallel.py
-
-      - name: Run vllm-project/vllm-ascend test on V0 engine
-        if: ${{ github.event_name == 'schedule' }}
-        env:
-          VLLM_USE_V1: 0
-          VLLM_USE_MODELSCOPE: True
-        run: |
-          pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
-          # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
-          # To avoid oom, we need to run the test in a single process.
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
-          pytest -sv tests/e2e/multicard/test_data_parallel.py
-          pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
-            --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
-            --ignore=tests/e2e/multicard/test_data_parallel.py
diff --git a/examples/offline_data_parallel.py b/examples/offline_data_parallel.py
@@ -120,7 +120,6 @@ def main(
     trust_remote_code,
 ):
     # DP only support on V1 engine
-    os.environ["VLLM_USE_V1"] = "1"
     os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
     os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
     os.environ["VLLM_DP_SIZE"] = str(dp_size)

diff --git a/examples/offline_dualbatch_overlap_npu.py b/examples/offline_dualbatch_overlap_npu.py
@@ -5,7 +5,6 @@
 
 # enable dual-batch overlap for vllm ascend
 os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1"
-os.environ["VLLM_USE_V1"] = "1"
 
 # Sample prompts.
 prompts = ["The president of the United States is"] * 41

diff --git a/examples/offline_inference_sleep_mode_npu.py b/examples/offline_inference_sleep_mode_npu.py
@@ -22,7 +22,6 @@
 from vllm import LLM, SamplingParams
 from vllm.utils import GiB_bytes
 
-os.environ["VLLM_USE_V1"] = "1"
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 

diff --git a/examples/run_dp_attention_etp16.sh b/examples/run_dp_attention_etp16.sh
@@ -1,4 +1,3 @@
-export VLLM_USE_V1=1
 export TASK_QUEUE_ENABLE=1
 source /usr/local/Ascend/ascend-toolkit/set_env.sh
 source /usr/local/Ascend/nnal/atb/set_env.sh

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -12,4 +12,5 @@ xgrammar
 zmq
 types-psutil
 pytest-cov
+regex
 sentence_transformers
diff --git a/requirements-lint.txt b/requirements-lint.txt
@@ -4,5 +4,6 @@ pre-commit==4.0.1
 # type checking
 mypy==1.11.1
 types-PyYAML
+types-regex
 types-requests
 types-setuptools
diff --git a/tests/conftest.py → tests/e2e/conftest.py b/tests/conftest.py → tests/e2e/conftest.py
@@ -39,8 +39,8 @@
 from vllm.transformers_utils.utils import maybe_model_redirect
 from vllm.utils import is_list_of
 
-from tests.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs,
-                               TokensTextLogprobsPromptLogprobs)
+from tests.e2e.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs,
+                                   TokensTextLogprobsPromptLogprobs)
 # TODO: remove this part after the patch merged into vllm, if
 # we not explicitly patch here, some of them might be effectiveless
 # in pytest scenario
@@ -62,7 +62,7 @@
 PromptVideoInput = _PromptMultiModalInput[np.ndarray]
 
 _TEST_DIR = os.path.dirname(__file__)
-_TEST_PROMPTS = [os.path.join(_TEST_DIR, "e2e", "prompts", "example.txt")]
+_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 
 
 def cleanup_dist_env_and_memory(shutdown_ray: bool = False):

diff --git a/tests/model_utils.py → tests/e2e/model_utils.py b/tests/model_utils.py → tests/e2e/model_utils.py
diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -26,12 +26,11 @@
 from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams
 
-from tests.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner
 
 
 @patch.dict(
     os.environ, {
-        "VLLM_USE_V1": "1",
         "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
         "TASK_QUEUE_ENABLE": "1",
         "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": "1"
@@ -56,12 +55,10 @@ def test_generate_with_allgather():
         vllm_model.generate(example_prompts, sampling_params)
 
 
-@patch.dict(
-    os.environ, {
-        "VLLM_USE_V1": "1",
-        "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
-        "TASK_QUEUE_ENABLE": "1"
-    })
+@patch.dict(os.environ, {
+    "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
+    "TASK_QUEUE_ENABLE": "1"
+})
 def test_generate_with_alltoall():
     example_prompts = ["Hello, my name is"]
     sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
@@ -79,4 +76,4 @@ def test_generate_with_alltoall():
                         },
                         "expert_tensor_parallel_size": 1
                     }) as vllm_model:
-        vllm_model.generate(example_prompts, sampling_params)
+        vllm_model.generate(example_prompts, sampling_params)
diff --git a/tests/e2e/multicard/test_ilama_lora_tp2.py b/tests/e2e/multicard/test_ilama_lora_tp2.py
@@ -1,7 +1,7 @@
 import pytest
 from modelscope import snapshot_download  # type: ignore
 
-from tests.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner
 from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
                                                   MODEL_PATH, do_sample)
 

diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -27,7 +27,7 @@
 from vllm import SamplingParams
 from vllm.model_executor.models.registry import ModelRegistry
 
-from tests.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner
 
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 

diff --git a/tests/e2e/multicard/test_pipeline_parallel.py b/tests/e2e/multicard/test_pipeline_parallel.py
@@ -16,7 +16,7 @@
 #
 import pytest
 
-from tests.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner
 
 MODELS = [
     "Qwen/Qwen3-0.6B",

diff --git a/tests/e2e/multicard/test_prefix_caching.py b/tests/e2e/multicard/test_prefix_caching.py
@@ -2,12 +2,10 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the with and without prefix caching on V1 scheduler or AscendScheduler."""
 
-import os
-
 import pytest
 
-from tests.conftest import VllmRunner
-from tests.model_utils import check_outputs_equal
+from tests.e2e.conftest import VllmRunner
+from tests.e2e.model_utils import check_outputs_equal
 
 MODELS = [
     # for MHA
@@ -60,8 +58,6 @@
 ]
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="mtp is not supported on v1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [50])
 def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
@@ -89,8 +85,6 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
     )
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="mtp is not supported on v1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [50])
 def test_prefix_cache_with_ascend_scheduler(model: str,

diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py
@@ -22,9 +22,7 @@
 import os
 from typing import Dict
 
-import pytest
-
-from tests.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner
 
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 
@@ -78,8 +76,6 @@ def _deepseek_torchair_test_fixture(
         print(f"Generated text: {vllm_output[i][1]!r}")
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="torchair graph is not supported on v0")
 def test_e2e_deepseekv3_with_torchair():
     additional_config = {
         "torchair_graph_config": {
@@ -89,8 +85,6 @@ def test_e2e_deepseekv3_with_torchair():
     _deepseek_torchair_test_fixture(additional_config)
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="torchair graph is not supported on v0")
 def test_e2e_deepseekv3_with_torchair_ms_mla():
     additional_config = {
         "torchair_graph_config": {
@@ -150,8 +144,6 @@ def _pangu_torchair_test_fixture(
         print(f"Generated text: {vllm_output[i][1]!r}")
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="torchair graph is not supported on v0")
 def test_e2e_pangu_with_torchair():
     additional_config = {
         "torchair_graph_config": {

diff --git a/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py b/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py
@@ -1,15 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-import os
 
 import pytest
 import torch
 from vllm import LLM
 
-if os.getenv("VLLM_USE_V1", "0") != "1":
-    pytest.skip("Test package requires V1", allow_module_level=True)
-
 MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
 PROMPT = "Hello my name is Robert and I"
 

diff --git a/tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py b/tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py
@@ -9,8 +9,8 @@
 """
 import pytest
 
-from tests.conftest import VllmRunner
-from tests.model_utils import check_outputs_equal
+from tests.e2e.conftest import VllmRunner
+from tests.e2e.model_utils import check_outputs_equal
 
 MODELS = [
     "Qwen/Qwen3-0.6B-Base",