vllm-project · shen-shanshan · Jul 14, 2025
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
@@ -213,26 +213,6 @@ jobs:
           # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
           VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
 
-      - name: Run e2e test on V0 engine
-        if: ${{ github.event_name == 'schedule' }}
-        env:
-          VLLM_USE_V1: 0
-          VLLM_USE_MODELSCOPE: True
-        run: |
-          pytest -sv tests/e2e/singlecard/test_offline_inference.py
-          pytest -sv tests/e2e/singlecard/test_ilama_lora.py
-          pytest -sv tests/e2e/singlecard/test_guided_decoding.py
-          pytest -sv tests/e2e/singlecard/test_camem.py
-          pytest -sv tests/e2e/singlecard/test_prompt_embedding.py
-          pytest -sv tests/e2e/singlecard/test_embedding.py
-          pytest -sv tests/e2e/singlecard/ \
-            --ignore=tests/e2e/singlecard/test_offline_inference.py \
-            --ignore=tests/e2e/singlecard/test_ilama_lora.py \
-            --ignore=tests/e2e/singlecard/test_guided_decoding.py \
-            --ignore=tests/e2e/singlecard/test_camem.py \
-            --ignore=tests/e2e/singlecard/test_prompt_embedding.py \
-            --ignore=tests/e2e/singlecard/test_embedding.py
-
   e2e-4-cards:
     needs: [e2e]
     if: ${{ needs.e2e.result == 'success' }}
@@ -308,19 +288,3 @@ jobs:
           pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
             --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
             --ignore=tests/e2e/multicard/test_data_parallel.py
-
-      - name: Run vllm-project/vllm-ascend test on V0 engine
-        if: ${{ github.event_name == 'schedule' }}
-        env:
-          VLLM_USE_V1: 0
-          VLLM_USE_MODELSCOPE: True
-        run: |
-          pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
-          # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
-          # To avoid oom, we need to run the test in a single process.
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
-          pytest -sv tests/e2e/multicard/test_data_parallel.py
-          pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
-            --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
-            --ignore=tests/e2e/multicard/test_data_parallel.py
diff --git a/.gitignore b/.gitignore
@@ -198,3 +198,4 @@ kernel_meta/
 /vllm_ascend/_version.py
 # build info file generated by setup.py
 /vllm_ascend/_build_info.py
+/vllm_ascend/include/
diff --git a/examples/disaggregated_prefill/run_decode_server.sh b/examples/disaggregated_prefill/run_decode_server.sh
@@ -6,8 +6,6 @@ export HCCL_SOCKET_IFNAME="enp189s0f0"
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 
-export VLLM_USE_V1=0
-
 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 
 

diff --git a/examples/disaggregated_prefill/run_prefill_server.sh b/examples/disaggregated_prefill/run_prefill_server.sh
@@ -6,8 +6,6 @@ export HCCL_SOCKET_IFNAME="enp189s0f0"
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 
-export VLLM_USE_V1=0
-
 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 
 

diff --git a/examples/offline_inference_npu.py b/examples/offline_inference_npu.py
@@ -34,7 +34,7 @@
 # Create a sampling params object.
 sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
 # Create an LLM.
-llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
+llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
 
 # Generate texts from the prompts.
 outputs = llm.generate(prompts, sampling_params)

diff --git a/examples/offline_inference_npu_v0.py b/examples/offline_inference_npu_v0.py
diff --git a/examples/run_dp_server.sh b/examples/run_dp_server.sh
@@ -6,8 +6,6 @@ export HCCL_SOCKET_IFNAME="enp189s0f0"
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 
-export VLLM_USE_V1=0
-
 export ASCEND_RT_VISIBLE_DEVICES=0,1
 export VLLM_DP_SIZE=2
 export VLLM_DP_RANK=0

diff --git a/tests/e2e/multicard/test_prefix_caching.py b/tests/e2e/multicard/test_prefix_caching.py
@@ -60,8 +60,6 @@
 ]
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="mtp is not supported on v1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [50])
 def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
@@ -89,8 +87,6 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
     )
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="mtp is not supported on v1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [50])
 def test_prefix_cache_with_ascend_scheduler(model: str,

diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py
@@ -78,8 +78,6 @@ def _deepseek_torchair_test_fixture(
         print(f"Generated text: {vllm_output[i][1]!r}")
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="torchair graph is not supported on v0")
 def test_e2e_deepseekv3_with_torchair():
     additional_config = {
         "torchair_graph_config": {
@@ -89,8 +87,6 @@ def test_e2e_deepseekv3_with_torchair():
     _deepseek_torchair_test_fixture(additional_config)
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="torchair graph is not supported on v0")
 def test_e2e_deepseekv3_with_torchair_ms_mla():
     additional_config = {
         "torchair_graph_config": {
@@ -150,8 +146,6 @@ def _pangu_torchair_test_fixture(
         print(f"Generated text: {vllm_output[i][1]!r}")
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="torchair graph is not supported on v0")
 def test_e2e_pangu_with_torchair():
     additional_config = {
         "torchair_graph_config": {

diff --git a/tests/e2e/pd_disaggreate/setup_pd.sh b/tests/e2e/pd_disaggreate/setup_pd.sh