diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index ec23fcef1a..ffdf72b471 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -213,26 +213,6 @@ jobs: # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py - - name: Run e2e test on V0 engine - if: ${{ github.event_name == 'schedule' }} - env: - VLLM_USE_V1: 0 - VLLM_USE_MODELSCOPE: True - run: | - pytest -sv tests/e2e/singlecard/test_offline_inference.py - pytest -sv tests/e2e/singlecard/test_ilama_lora.py - pytest -sv tests/e2e/singlecard/test_guided_decoding.py - pytest -sv tests/e2e/singlecard/test_camem.py - pytest -sv tests/e2e/singlecard/test_prompt_embedding.py - pytest -sv tests/e2e/singlecard/test_embedding.py - pytest -sv tests/e2e/singlecard/ \ - --ignore=tests/e2e/singlecard/test_offline_inference.py \ - --ignore=tests/e2e/singlecard/test_ilama_lora.py \ - --ignore=tests/e2e/singlecard/test_guided_decoding.py \ - --ignore=tests/e2e/singlecard/test_camem.py \ - --ignore=tests/e2e/singlecard/test_prompt_embedding.py \ - --ignore=tests/e2e/singlecard/test_embedding.py - e2e-4-cards: needs: [e2e] if: ${{ needs.e2e.result == 'success' }} @@ -308,19 +288,3 @@ jobs: pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \ --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \ --ignore=tests/e2e/multicard/test_data_parallel.py - - - name: Run vllm-project/vllm-ascend test on V0 engine - if: ${{ github.event_name == 'schedule' }} - env: - VLLM_USE_V1: 0 - VLLM_USE_MODELSCOPE: True - run: | - pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py - # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error. - # To avoid oom, we need to run the test in a single process. 
- pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 - pytest -sv tests/e2e/multicard/test_data_parallel.py - pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \ - --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \ - --ignore=tests/e2e/multicard/test_data_parallel.py diff --git a/.gitignore b/.gitignore index ef8fc873a6..da7300fca1 100644 --- a/.gitignore +++ b/.gitignore @@ -198,3 +198,4 @@ kernel_meta/ /vllm_ascend/_version.py # build info file generated by setup.py /vllm_ascend/_build_info.py +/vllm_ascend/include/ diff --git a/examples/disaggregated_prefill/run_decode_server.sh b/examples/disaggregated_prefill/run_decode_server.sh index a3bbaa189f..8dbbf5d634 100644 --- a/examples/disaggregated_prefill/run_decode_server.sh +++ b/examples/disaggregated_prefill/run_decode_server.sh @@ -6,8 +6,6 @@ export HCCL_SOCKET_IFNAME="enp189s0f0" export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=0 - export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 diff --git a/examples/disaggregated_prefill/run_prefill_server.sh b/examples/disaggregated_prefill/run_prefill_server.sh index dc929f8a49..341a4feea9 100644 --- a/examples/disaggregated_prefill/run_prefill_server.sh +++ b/examples/disaggregated_prefill/run_prefill_server.sh @@ -6,8 +6,6 @@ export HCCL_SOCKET_IFNAME="enp189s0f0" export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=0 - export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 diff --git a/examples/offline_inference_npu.py b/examples/offline_inference_npu.py index 3e88c00176..499c09a647 100644 --- a/examples/offline_inference_npu.py +++ b/examples/offline_inference_npu.py @@ -34,7 +34,7 @@ # Create a sampling params object. sampling_params = SamplingParams(max_tokens=100, temperature=0.0) # Create an LLM. -llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct") +llm = LLM(model="/shared/cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct") # Generate texts from the prompts. outputs = llm.generate(prompts, sampling_params) diff --git a/examples/offline_inference_npu_v0.py b/examples/offline_inference_npu_v0.py deleted file mode 100644 index b6a1156e43..0000000000 --- a/examples/offline_inference_npu_v0.py +++ /dev/null @@ -1,44 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# Adapted from vllm-project/vllm/examples/offline_inference/basic.py -# - -import os - -os.environ["VLLM_USE_V1"] = "0" -os.environ["VLLM_USE_MODELSCOPE"] = "True" - -from vllm import LLM, SamplingParams - -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] - -# Create a sampling params object. 
-sampling_params = SamplingParams(max_tokens=100, temperature=0.0) -# Create an LLM. -llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct") - -# Generate texts from the prompts. -outputs = llm.generate(prompts, sampling_params) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/run_dp_server.sh b/examples/run_dp_server.sh index e2bf4c8158..0f271d1352 100644 --- a/examples/run_dp_server.sh +++ b/examples/run_dp_server.sh @@ -6,8 +6,6 @@ export HCCL_SOCKET_IFNAME="enp189s0f0" export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=0 - export ASCEND_RT_VISIBLE_DEVICES=0,1 export VLLM_DP_SIZE=2 export VLLM_DP_RANK=0 diff --git a/tests/e2e/multicard/test_prefix_caching.py b/tests/e2e/multicard/test_prefix_caching.py index 368d3ff953..2b07718e77 100644 --- a/tests/e2e/multicard/test_prefix_caching.py +++ b/tests/e2e/multicard/test_prefix_caching.py @@ -60,8 +60,6 @@ ] -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="mtp is not supported on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [50]) def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None: @@ -89,8 +87,6 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None: ) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="mtp is not supported on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [50]) def test_prefix_cache_with_ascend_scheduler(model: str, diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py index ce628f9d35..26d908c7f5 100644 --- a/tests/e2e/multicard/test_torchair_graph_mode.py +++ b/tests/e2e/multicard/test_torchair_graph_mode.py @@ -78,8 +78,6 @@ def _deepseek_torchair_test_fixture( print(f"Generated text: {vllm_output[i][1]!r}") -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="torchair graph is not supported on v0") def test_e2e_deepseekv3_with_torchair(): additional_config = { "torchair_graph_config": { @@ -89,8 +87,6 @@ def test_e2e_deepseekv3_with_torchair(): _deepseek_torchair_test_fixture(additional_config) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="torchair graph is not supported on v0") def test_e2e_deepseekv3_with_torchair_ms_mla(): additional_config = { "torchair_graph_config": { @@ -150,8 +146,6 @@ def _pangu_torchair_test_fixture( print(f"Generated text: {vllm_output[i][1]!r}") -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="torchair graph is not supported on v0") def test_e2e_pangu_with_torchair(): additional_config = { "torchair_graph_config": { diff --git a/tests/e2e/pd_disaggreate/setup_pd.sh b/tests/e2e/pd_disaggreate/setup_pd.sh deleted file mode 100644 index c15f109299..0000000000 --- a/tests/e2e/pd_disaggreate/setup_pd.sh +++ /dev/null @@ -1,136 +0,0 @@ -#!/bin/bash - -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -function run_prefill_instance() { - local model_name=$1 - local tp_size=$2 - local prefill_port=$3 - local register_port=$4 - local prefill_device_ips=$5 - local decode_device_ips=$6 - - echo "================================" - echo "Testing model: $model_name" - echo "================================" - # Start prefill instance - - KV_CONFIG=$(jq -n \ - --arg kv_connector "AscendSimpleConnector" \ - --arg kv_buffer_device "npu" \ - --arg kv_role "kv_producer" \ - --argjson kv_parallel_size 8 \ - --arg kv_port 11001 \ - --argjson prefill_device_ips "$prefill_device_ips" \ - --argjson decode_device_ips "$decode_device_ips" \ - --argjson llmdatadist_comm_port 26000 \ - --arg proxy_ip "0.0.0.0" \ - --argjson proxy_port "$register_port" \ - --argjson http_port "$prefill_port" \ - '{ - "kv_connector": $kv_connector, - "kv_buffer_device": $kv_buffer_device, - "kv_role": $kv_role, - "kv_parallel_size": $kv_parallel_size, - "kv_port": $kv_port, - "kv_connector_extra_config": { - "prefill_device_ips": $prefill_device_ips, - "decode_device_ips": $decode_device_ips, - "llmdatadist_comm_port": $llmdatadist_comm_port, - "proxy_ip": $proxy_ip, - "proxy_port": $proxy_port, - "http_port": $http_port - } - }') - - # start prefill instance - ASCEND_RT_VISIBLE_DEVICES=0 vllm serve $model_name \ - --host 0.0.0.0 \ - --port $prefill_port \ - --tensor-parallel-size $tp_size \ - --served-model-name Deepseek \ - --max-model-len 2000 \ - --trust-remote-code \ - --enforce-eager \ - --kv-transfer-config "$KV_CONFIG" -} - - - -function run_decode_instance() { - # Start decode instance - local model_name=$1 - local tp_size=$2 - local decode_port=$3 - local register_port=$4 - local prefill_device_ips=$5 - local decode_device_ips=$6 - - KV_CONFIG=$(jq -n \ - --arg kv_connector "AscendSimpleConnector" \ - --arg kv_buffer_device "npu" \ - --arg kv_role "kv_consumer" \ - --argjson kv_parallel_size 8 \ - --arg kv_port 21001 \ - --argjson prefill_device_ips "$prefill_device_ips" \ - --argjson decode_device_ips "$decode_device_ips" \ - --argjson llmdatadist_comm_port 26000 \ - --arg proxy_ip "0.0.0.0" \ - --argjson proxy_port "$register_port" \ - --argjson http_port "$decode_port" \ - '{ - "kv_connector": $kv_connector, - "kv_buffer_device": $kv_buffer_device, - "kv_role": $kv_role, - "kv_parallel_size": $kv_parallel_size, - "kv_port": $kv_port, - "kv_connector_extra_config": { - "prefill_device_ips": $prefill_device_ips, - "decode_device_ips": $decode_device_ips, - "llmdatadist_comm_port": $llmdatadist_comm_port, - "proxy_ip": $proxy_ip, - "proxy_port": $proxy_port, - "http_port": $http_port - } - }') - - # start decode instance - ASCEND_RT_VISIBLE_DEVICES=1 vllm serve $model_name \ - --host 0.0.0.0 \ - --port $decode_port \ - --tensor-parallel-size $tp_size \ - --seed 1024 \ - --served-model-name Deepseek \ - --max-model-len 2000 \ - --max-num-batched-tokens 2000 \ - --trust-remote-code \ - --gpu-memory-utilization 0.9 \ - --enforce-eager \ - --kv-transfer-config "$KV_CONFIG" -} - -function run_proxy_server() { - # Build the command for the proxy server with all the hosts and ports - register_port=$1 - proxy_port=$2 - PROXY_CMD="python examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py --http-port $proxy_port --register-port $register_port" - - # Start the proxy server - echo "Starting proxy server with command: $PROXY_CMD" - $PROXY_CMD & -} diff 
--git a/tests/e2e/pd_disaggreate/test_pd_e2e.py b/tests/e2e/pd_disaggreate/test_pd_e2e.py deleted file mode 100644 index 5fd923211c..0000000000 --- a/tests/e2e/pd_disaggreate/test_pd_e2e.py +++ /dev/null @@ -1,109 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -import os -import signal -import subprocess -import time - -import psutil -import requests - - -def kill_process_and_children(pid): - try: - parent = psutil.Process(pid) - children = parent.children(recursive=True) - for child in children: - print(f"Killing child process {child.pid}") - child.kill() - print(f"Killing parent process {pid}") - parent.kill() - except psutil.NoSuchProcess: - pass - - -def kill_all_vllm_related(): - current_pid = os.getpid() - - for proc in psutil.process_iter(['pid', 'cmdline']): - try: - if proc.pid == current_pid: - continue - cmd = ' '.join(proc.info['cmdline']) - if "vllm" in cmd or "proxy" in cmd or "engine_worker" in cmd: - kill_process_and_children(proc.pid) - except Exception: - continue - - -PROXY_PORT = 10102 -DECODE_PORT = 8002 - -SCRIPT_PATH = os.path.abspath("./tests/e2e/run_disagg_pd.sh") - - -def wait_for_port(port, timeout=30): - import socket - start = time.time() - while time.time() - start < timeout: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - if sock.connect_ex(("127.0.0.1", port)) == 0: - return True - time.sleep(1) - raise TimeoutError(f"Port {port} not ready after {timeout}s") - - -def start_and_test_pipeline(): - print("Launching bash script to run vLLM PD setup...") - proc = subprocess.Popen(["bash", SCRIPT_PATH]) - try: - print("Waiting for proxy port to be available...") - wait_for_port(PROXY_PORT, 180) - wait_for_port(DECODE_PORT, 600) - - # request - payload = { - "model": "Deepseek", - "prompt": "The future of AI is", - "max_tokens": 64, - "temperature": 0, - } - response = requests.post( - f"http://localhost:{PROXY_PORT}/v1/completions", - headers={"Content-Type": "application/json"}, - json=payload, - timeout=10) - assert response.status_code == 200, f"HTTP failed: {response.status_code}" - result = response.json() - print("Response:", result) - assert "text" in result["choices"][0] - assert len(result["choices"][0]["text"].strip()) > 0 - - finally: - # clean up subprocesses - print("Cleaning up subprocess...") - proc.send_signal(signal.SIGINT) - try: - proc.wait(timeout=10) - except subprocess.TimeoutExpired: - proc.kill() - kill_all_vllm_related() - - -def test_disaggregated_pd_pipeline(): - start_and_test_pipeline() diff --git a/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py b/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py index 17116ab59a..2d8689d5ae 100644 --- a/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py +++ b/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py @@ -7,9 +7,6 @@ import torch from vllm import LLM 
-if os.getenv("VLLM_USE_V1", "0") != "1": - pytest.skip("Test package requires V1", allow_module_level=True) - MODEL = "Qwen/Qwen2.5-0.5B-Instruct" PROMPT = "Hello my name is Robert and I" diff --git a/tests/e2e/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph.py index 89dfa08e41..605384ae0a 100644 --- a/tests/e2e/singlecard/test_aclgraph.py +++ b/tests/e2e/singlecard/test_aclgraph.py @@ -36,8 +36,6 @@ ] -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="aclgraph only support on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [32]) def test_models( @@ -86,8 +84,6 @@ def test_models( ) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="aclgraph only support on v1") def test_deepseek_raises_error(monkeypatch: pytest.MonkeyPatch) -> None: with monkeypatch.context() as m: m.setenv("VLLM_USE_MODELSCOPE", "True") diff --git a/tests/e2e/singlecard/test_chunked.py b/tests/e2e/singlecard/test_chunked.py index 2240b88e2c..b185671500 100644 --- a/tests/e2e/singlecard/test_chunked.py +++ b/tests/e2e/singlecard/test_chunked.py @@ -29,8 +29,6 @@ MODELS = ["deepseek-ai/DeepSeek-V2-Lite"] -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="new chunked only support on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [1]) def test_models( diff --git a/tests/e2e/singlecard/test_guided_decoding.py b/tests/e2e/singlecard/test_guided_decoding.py index 9d103a5308..e8c33a49ea 100644 --- a/tests/e2e/singlecard/test_guided_decoding.py +++ b/tests/e2e/singlecard/test_guided_decoding.py @@ -30,10 +30,8 @@ os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct" -GuidedDecodingBackendV0 = ["outlines", "lm-format-enforcer", "xgrammar"] GuidedDecodingBackendV1 = ["xgrammar", "guidance"] -GuidedDecodingBackend = list( - set(GuidedDecodingBackendV0 + GuidedDecodingBackendV1)) +GuidedDecodingBackend = GuidedDecodingBackendV1 @pytest.fixture(scope="module") @@ -85,9 +83,6 @@ def sample_json_schema(): def check_backend(guided_decoding_backend: str): - if guided_decoding_backend not in GuidedDecodingBackendV0 and os.getenv( - "VLLM_USE_V1") == "0": - pytest.skip(f"{guided_decoding_backend} does not support v0, skip it.") if guided_decoding_backend not in GuidedDecodingBackendV1 and os.getenv( "VLLM_USE_V1") == "1": pytest.skip(f"{guided_decoding_backend} does not support v1, skip it.") diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py index a123790dbd..cfaec1bfd3 100644 --- a/tests/ut/test_ascend_config.py +++ b/tests/ut/test_ascend_config.py @@ -193,26 +193,6 @@ def test_check_ascend_config_pass(self): @_clean_up_ascend_config def test_check_ascend_config_wrong_case(self): test_vllm_config = VllmConfig() - # For V0 engine - with mock.patch.dict(os.environ, {"VLLM_USE_V1": "0"}): - with self.assertRaises(NotImplementedError): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, False) - with self.assertRaises(NotImplementedError): - test_vllm_config.additional_config = { - "ascend_scheduler_config": { - "enabled": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, True) # For V1 engine with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}): # torchair + eager mode diff --git a/tests/ut/test_platform.py 
b/tests/ut/test_platform.py index c09964a745..72286236b9 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -416,24 +416,6 @@ def test_check_and_update_config_speculative_worker_config( "vllm_ascend.worker.worker.NPUWorker", ) - @patch("vllm_ascend.ascend_config.check_ascend_config") - @patch("vllm_ascend.ascend_config.init_ascend_config") - @patch("vllm.envs.VLLM_USE_V1", False) - def test_check_and_update_config_multi_step_worker_config( - self, mock_init_ascend, mock_check_ascend): - mock_init_ascend.return_value = self.mock_ascend_config - self.mock_vllm_config.scheduler_config.is_multi_step = True - self.mock_vllm_config.parallel_config.worker_cls = "auto" - - from vllm_ascend import platform - - importlib.reload(platform) - self.platform.check_and_update_config(self.mock_vllm_config) - self.assertEqual( - self.mock_vllm_config.parallel_config.worker_cls, - "vllm_ascend.worker.multi_step_worker.MultiStepWorker", - ) - @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm.envs.VLLM_USE_V1", False) diff --git a/tests/ut/worker/test_pooling_model_runner.py b/tests/ut/worker/test_pooling_model_runner.py deleted file mode 100644 index 28a0a7d3c6..0000000000 --- a/tests/ut/worker/test_pooling_model_runner.py +++ /dev/null @@ -1,355 +0,0 @@ -import unittest -from unittest.mock import MagicMock, patch - -import torch -from vllm.distributed.parallel_state import GroupCoordinator -from vllm.engine.arg_utils import EngineArgs -from vllm.pooling_params import PoolingParams -from vllm.sequence import SequenceData, SequenceGroupMetadata - -from vllm_ascend.worker.pooling_model_runner import ( - ModelInputForNPUWithPoolingMetadata, NPUPoolingModelRunner) - - -class TestPoolingModelRunner(unittest.TestCase): - """Unit tests for the NPUPoolingModelRunner class.""" - - def _create_model_runner(self, model: str, *args, - **kwargs) -> NPUPoolingModelRunner: - engine_args = EngineArgs(model, *args, **kwargs) - engine_config = engine_args.create_engine_config() - model_runner = NPUPoolingModelRunner(vllm_config=engine_config, ) - return model_runner - - def setUp(self): - """Initialize test fixtures and common mocks""" - self.attn_backend = "npu" - - model_runner = self._create_model_runner( - "tests/ut/fake_weight", - trust_remote_code=True, - enable_chunked_prefill=False, - ) - - self.runner = model_runner - self.runner.attn_backend = self.attn_backend - model_runner.model = MagicMock() - self.runner = model_runner - # Sample test data - self.sample_tensor_dict = {"tensor1": torch.randn(3, 4)} - self.sample_seq_group = [MagicMock(spec=SequenceGroupMetadata)] - self.sample_finished_ids = ["req1", "req2"] - - @patch( - 'vllm_ascend.worker.pooling_model_runner.ModelInputForNPUWithPoolingMetadata.from_broadcasted_tensor_dict' - ) - def test_make_model_input_from_broadcasted_tensor_dict( - self, mock_from_dict): - """Test tensor dictionary conversion to model input""" - # Setup mock return - expected_output = MagicMock() - mock_from_dict.return_value = expected_output - - # Execute - result = self.runner.make_model_input_from_broadcasted_tensor_dict( - self.sample_tensor_dict) - - # Verify - mock_from_dict.assert_called_once_with(self.sample_tensor_dict, - attn_backend=self.attn_backend) - self.assertEqual(result, expected_output) - - @patch.object(NPUPoolingModelRunner, '_prepare_pooling') - @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors') - def test_prepare_model_input_normal_case(self, 
mock_prepare_tensors, - mock_prepare_pooling): - """Test normal flow of model input preparation""" - # Setup mocks - mock_model_input = ModelInputForNPUWithPoolingMetadata( - seq_lens=[1, 2, 3]) - mock_prepare_tensors.return_value = mock_model_input - - mock_pooling_metadata = MagicMock() - mock_prepare_pooling.return_value = mock_pooling_metadata - - # Execute - result = self.runner.prepare_model_input( - seq_group_metadata_list=self.sample_seq_group, - finished_requests_ids=self.sample_finished_ids) - - # Verify - mock_prepare_tensors.assert_called_once_with(self.sample_seq_group, - self.sample_finished_ids) - mock_prepare_pooling.assert_called_once_with(self.sample_seq_group, - mock_model_input.seq_lens) - self.assertEqual(result.pooling_metadata, mock_pooling_metadata) - - def test_prepare_model_input_null_sequence_group(self): - """Test assertion when seq_group_metadata_list is None""" - with self.assertRaises(AssertionError): - self.runner.prepare_model_input( - seq_group_metadata_list=None, - finished_requests_ids=self.sample_finished_ids) - - @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors') - def test_prepare_model_input_null_seq_lens(self, mock_prepare_tensors): - """Test assertion when seq_lens is None in model input""" - # Setup mock with None seq_lens - mock_model_input = MagicMock() - mock_model_input.seq_lens = None - mock_prepare_tensors.return_value = mock_model_input - - with self.assertRaises(AssertionError): - self.runner.prepare_model_input( - seq_group_metadata_list=self.sample_seq_group, - finished_requests_ids=self.sample_finished_ids) - - @patch.object(NPUPoolingModelRunner, '_prepare_pooling') - @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors') - def test_prepare_model_input_with_virtual_engine(self, - mock_prepare_tensors, - mock_prepare_pooling): - """Test virtual engine parameter is properly handled""" - # Setup mocks - mock_model_input = ModelInputForNPUWithPoolingMetadata( - seq_lens=[1, 2, 3]) - mock_prepare_tensors.return_value = mock_model_input - - # Execute with virtual_engine parameter - result = self.runner.prepare_model_input( - seq_group_metadata_list=self.sample_seq_group, - virtual_engine=1, - finished_requests_ids=self.sample_finished_ids) - - # Verify virtual_engine doesn't affect the flow - self.assertIsNotNone(result) - - @patch.object(NPUPoolingModelRunner, '_prepare_pooling') - @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors') - def test_prepare_model_input_with_null_finished_ids( - self, mock_prepare_tensors, mock_prepare_pooling): - """Test case when finished_requests_ids is None""" - # Setup mocks - mock_model_input = ModelInputForNPUWithPoolingMetadata( - seq_lens=[1, 2, 3]) - mock_prepare_tensors.return_value = mock_model_input - - # Execute with None finished_ids - result = self.runner.prepare_model_input( - seq_group_metadata_list=self.sample_seq_group, - finished_requests_ids=None) - - # Verify - mock_prepare_tensors.assert_called_once_with(self.sample_seq_group, - None) - self.assertIsNotNone(result) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_normal_case(self, mock_pooling_metadata): - """Test normal case with multiple sequences in group""" - # Setup test data - mock_pooling_metadata.return_value = None - seq_data = { - 1: MagicMock(spec=SequenceData), - 2: MagicMock(spec=SequenceData) - } - pooling_params = MagicMock(spec=PoolingParams) - seq_group = MagicMock(spec=SequenceGroupMetadata) - seq_group.seq_data = 
seq_data - seq_group.pooling_params = pooling_params - - # Call the function - self.runner._prepare_pooling([seq_group], [10, 20]) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[ - ([1, 2], pooling_params) - ], - seq_data=seq_data, - prompt_lens=[10, 20]) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_empty_group(self, mock_pooling_metadata): - """Test case with empty sequence group""" - # Setup empty group - mock_pooling_metadata.return_value = None - empty_seq_data: dict[int, SequenceData] = {} - pooling_params = MagicMock(spec=PoolingParams) - empty_group = MagicMock(spec=SequenceGroupMetadata) - empty_group.seq_data = empty_seq_data - empty_group.pooling_params = pooling_params - - # Call the function - self.runner._prepare_pooling([empty_group], []) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[ - ([], pooling_params) - ], - seq_data={}, - prompt_lens=[]) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_single_sequence(self, mock_pooling_metadata): - """Test case with single sequence in group""" - # Setup single sequence - mock_pooling_metadata.return_value = None - single_seq_data = {3: MagicMock(spec=SequenceData)} - pooling_params = MagicMock(spec=PoolingParams) - single_group = MagicMock(spec=SequenceGroupMetadata) - single_group.seq_data = single_seq_data - single_group.pooling_params = pooling_params - - # Call the function - self.runner._prepare_pooling([single_group], [5]) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[ - ([3], pooling_params) - ], - seq_data=single_seq_data, - prompt_lens=[5]) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_multiple_groups(self, mock_pooling_metadata): - """Test case with multiple sequence groups""" - # Setup multiple groups - mock_pooling_metadata.return_value = None - seq_data1 = {1: MagicMock(spec=SequenceData)} - seq_data2 = {2: MagicMock(spec=SequenceData)} - params1 = MagicMock(spec=PoolingParams) - params2 = MagicMock(spec=PoolingParams) - - group1 = MagicMock(spec=SequenceGroupMetadata) - group1.seq_data = seq_data1 - group1.pooling_params = params1 - - group2 = MagicMock(spec=SequenceGroupMetadata) - group2.seq_data = seq_data2 - group2.pooling_params = params2 - - # Call the function - self.runner._prepare_pooling([group1, group2], [10, 20]) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[ - ([1], params1), ([2], params2) - ], - seq_data={ - **seq_data1, - **seq_data2 - }, - prompt_lens=[10, 20]) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_empty_input(self, mock_pooling_metadata): - """Test case with empty input lists""" - # Call the function with empty inputs - mock_pooling_metadata.return_value = None - self.runner._prepare_pooling([], []) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[], - seq_data={}, - prompt_lens=[]) - - @patch('vllm.forward_context.set_forward_context') - @patch('vllm.distributed.parallel_state._PP', - new_callable=lambda: MagicMock(spec=GroupCoordinator, - is_last_rank=True)) - @patch('torch.npu.Event') - @patch.object(NPUPoolingModelRunner, 'set_active_loras') - @patch.object(NPUPoolingModelRunner, 'set_active_prompt_adapters') - def test_execute_model_normal_flow(self, mock_set_adapters, mock_set_loras, - mock_event, 
mock_pp, mock_set_forward): - """Test normal execution path with all dependencies mocked""" - - # Setup model input mock - mock_input = MagicMock() - mock_input.input_tokens = torch.tensor([1]) - mock_input.input_positions = torch.tensor([0]) - mock_input.multi_modal_kwargs = {} - self.runner.is_driver_worker = True - # Execute - self.runner.execute_model(model_input=mock_input, - kv_caches=[], - num_steps=1) - - # Verify core calls - self.runner.model.pooler.assert_called_once() - - @patch('vllm.forward_context.set_forward_context') - def test_execute_model_invalid_steps(self, mock_set_forward): - """Test ValueError when num_steps != 1""" - with self.assertRaises(ValueError): - self.runner.execute_model(model_input=MagicMock(), - kv_caches=[], - num_steps=2) - mock_set_forward.assert_not_called() - - @patch('vllm.forward_context.set_forward_context') - @patch('vllm.distributed.parallel_state._PP', - new_callable=lambda: MagicMock(spec=GroupCoordinator, - is_last_rank=False)) - @patch('torch.npu.Event') - def test_execute_model_perf_monitoring(self, mock_event, mock_pp, - mock_set_forward): - """Test performance monitoring with timing mocks""" - # Setup mocks - - mock_event.return_value.elapsed_time.return_value = 15.0 - self.runner.observability_config = MagicMock( - collect_model_forward_time=True) - - # Execute - self.runner.execute_model(model_input=MagicMock( - input_tokens=torch.tensor([1]), - input_positions=torch.tensor([0]), - multi_modal_kwargs={}), - kv_caches=[], - num_steps=1) - - # Verify timing calls - self.assertEqual(mock_event.call_count, 2) - - @patch('vllm.forward_context.set_forward_context') - @patch.object(NPUPoolingModelRunner, 'set_active_loras') - @patch('vllm.distributed.parallel_state._PP', - new_callable=lambda: MagicMock(spec=GroupCoordinator, - is_last_rank=False)) - def test_execute_model_lora_config(self, mock_pp, set_active_loras, - mock_set_forward): - """Test LoRA configuration handling""" - # Setup - - self.runner.lora_config = True - mock_input = MagicMock() - mock_input.lora_requests = ["req1"] - mock_input.lora_mapping = {"map": 1} - - # Execute - self.runner.execute_model(model_input=mock_input, - kv_caches=[], - num_steps=1) - - # Verify LoRA call - set_active_loras.assert_called_once_with(["req1"], {"map": 1}) - - @patch('vllm.forward_context.set_forward_context') - @patch('vllm.distributed.parallel_state._PP', - new_callable=lambda: MagicMock(spec=GroupCoordinator, - is_last_rank=False)) - def test_execute_model_not_last_rank(self, mock_pp, mock_set_forward): - """Test behavior when not the last pipeline rank""" - # Setup - - # Execute - self.runner.execute_model(model_input=MagicMock( - input_tokens=torch.tensor([1]), - input_positions=torch.tensor([0]), - multi_modal_kwargs={}), - kv_caches=[], - num_steps=1) - - # Verify pooler not called - self.runner.model.pooler.assert_not_called() diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py deleted file mode 100644 index 944e8c9a65..0000000000 --- a/vllm_ascend/attention/attention.py +++ /dev/null @@ -1,1228 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type - -import numpy as np -import torch -import torch_npu -import torchair._contrib.custom_torch_ops # type: ignore # noqa: F401 -from torch.nn.functional import scaled_dot_product_attention -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, AttentionType, - MLAAttentionImpl) -from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState, - CommonMetadataBuilder, - compute_slot_mapping, - compute_slot_mapping_start_idx, - is_block_tables_empty) -from vllm.utils import async_tensor_h2d, make_tensor_with_pad - -from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.attention.attention_mask import AttentionMaskBuilder -from vllm_ascend.ops.cache import concat_and_cache_mla -from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, - enable_custom_op, is_310p, nd_to_nz_2d) -from vllm_ascend.worker.model_runner import ( - ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata) - -_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128] - - -class AscendAttentionBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "ASCEND" - - @staticmethod - def get_impl_cls() -> Type["AscendAttentionBackendImpl"]: - return AscendAttentionBackendImpl - - @staticmethod - def get_metadata_cls() -> Type["AscendMetadata"]: - return AscendMetadata - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - if is_310p(): - return (2, num_blocks, num_kv_heads * head_size // 16, block_size, - 16) - else: - return (2, num_blocks, block_size, num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: List[torch.Tensor], - dst_kv_cache: List[torch.Tensor], - src_to_dst: torch.Tensor, - ) -> None: - src_key_cache, src_value_cache = src_kv_cache[0], src_kv_cache[1] - dst_key_cache, dst_value_cache = dst_kv_cache[0], dst_kv_cache[1] - src_indices = src_to_dst[:, 0] - dst_indices = src_to_dst[:, 1] - - dst_key_cache[dst_indices] = src_key_cache[src_indices].to( - dst_key_cache.device) - dst_value_cache[dst_indices] = src_value_cache[src_indices].to( - dst_key_cache.device) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - src_indices = src_to_dists[:, 0] - dst_indices = src_to_dists[:, 1] - - for kv_cache in kv_caches: - key_caches = kv_cache[0] - value_caches = kv_cache[1] - key_caches[dst_indices] = key_caches[src_indices] - value_caches[dst_indices] = value_caches[src_indices] - - @staticmethod - def get_builder_cls() -> Type["AscendMetadataBuilder"]: - return AscendMetadataBuilder - - @classmethod - def make_metadata_builder(cls, *args, **kwargs) -> "AscendMetadataBuilder": - return cls.get_builder_cls()(*args, **kwargs) - - -class 
AscendMLAAttentionBackend(AscendAttentionBackend): - - @staticmethod - def get_impl_cls() -> Type["AscendMLAAttentionBackendImpl"]: - return AscendMLAAttentionBackendImpl - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return (num_blocks, block_size, num_kv_heads, head_size) - - -@dataclass -class AscendMetadata(AttentionMetadata): - """Metadata for Ascendbackend. - * modified from XFormersbackend - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| - # |-- query_len ---| - - # FIXME: It is for flash attn. - # Maximum sequence length among prefill batch. 0 if there are decoding - # Avoid mypy error - # Total number of prefill requests. - num_prefills: int - # Number of prefill tokens. - num_prefill_tokens: int - # (num_tokens,). The indices of the token slots that input tokens will be - # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size - # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot - # in block 0, and 1st slot in block 1, respectively. - slot_mapping: torch.Tensor - - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - - chunked_prefill_enabled: bool - - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - block_tables: Optional[torch.Tensor] - - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] = None - - # The query lengths of the input sequences - query_lens: Optional[List[int]] = None - - # Maximum query length in the batch. None for decoding. - max_query_len: Optional[int] = None - - # Self-attention prefill/decode metadata cache - _cached_prefill_metadata: Optional["AscendMetadata"] = None - _cached_decode_metadata: Optional["AscendMetadata"] = None - - # Begin encoder attn & enc/dec cross-attn fields... 
- - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Mask for normal situation - attn_mask: Optional[torch.Tensor] = None - - # Mask for prefix caching - compress_mask: Optional[torch.Tensor] = None - - # Mask for chunked prefill - chunk_mask: Optional[torch.Tensor] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - @property - def prefill_metadata(self) -> Optional["AscendMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - # Recover cached prefill-phase attention - # metadata structure. - return self._cached_prefill_metadata - - assert ((self.seq_lens is not None) - or (self.encoder_seq_lens is not None)) - - # Compute some attn_metadata fields which default to None. - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[:self.num_prefill_tokens]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[:self.num_prefills]) - query_lens = (None if self.query_lens is None else - self.query_lens[:self.num_prefills]) - block_tables = (None if self.block_tables is None else - self.block_tables[:self.num_prefills]) - - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[:self.num_prefills]) - - # Construct & cache prefill-phase attention metadata structure. - self._cached_prefill_metadata = AscendMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=slot_mapping, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - query_lens=query_lens, - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_seq_len=0, - chunked_prefill_enabled=self.chunked_prefill_enabled, - block_tables=block_tables, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - multi_modal_placeholder_index_maps=self. - multi_modal_placeholder_index_maps, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables, - enable_kv_scales_calculation=False) - return self._cached_prefill_metadata - - @property - def decode_metadata(self) -> Optional["AscendMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - # Recover cached decode-phase attention - # metadata structure. - return self._cached_decode_metadata - - # Compute some attn_metadata fields which default to None. - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[self.num_prefill_tokens:]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[self.num_prefills:]) - query_lens = (None if self.query_lens is None else - self.query_lens[self.num_prefills:]) - block_tables = (None if self.block_tables is None else - self.block_tables[self.num_prefills:]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[self.num_prefills:]) - # Construct & cache decode-phase attention metadata structure. 
- self._cached_decode_metadata = AscendMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=slot_mapping, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - query_lens=query_lens, - max_query_len=self.max_query_len, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - chunked_prefill_enabled=self.chunked_prefill_enabled, - block_tables=block_tables, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - multi_modal_placeholder_index_maps=self. - multi_modal_placeholder_index_maps, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables, - enable_kv_scales_calculation=False) - return self._cached_decode_metadata - - def advance_step(self, - model_input: "ModelInputForNPUWithSamplingMetadata", - sampled_token_ids: Optional[torch.Tensor], - block_size: int, - num_seqs: int, - num_queries: int, - turn_prefills_into_decodes: bool = False): - """ - Update metadata in-place to advance one decode step. - """ - # When using cudagraph, the num_seqs is padded to the next captured - # batch sized, but num_queries tracks the actual number of requests in - # the batch. For --enforce-eager mode, num_seqs == num_queries - if num_seqs != num_queries: - assert num_seqs > num_queries - - if turn_prefills_into_decodes: - # When Mutli-Step is enabled with Chunked-Prefill, prefills and - # decodes are scheduled together. In the first step, all the - # prefills turn into decodes. This update reflects that - # conversion. - assert self.num_decode_tokens + self.num_prefills == num_seqs - self.num_decode_tokens += self.num_prefills - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.max_prefill_seq_len = 0 - self.max_query_len = 1 - - self.slot_mapping = self.slot_mapping[:num_seqs] - else: - assert self.seq_lens is not None - assert self.max_decode_seq_len == max(self.seq_lens) - - assert self.num_prefills == 0 - assert self.num_prefill_tokens == 0 - assert self.num_decode_tokens == num_seqs - assert self.slot_mapping.shape == (num_seqs, ) - - assert self.seq_lens is not None - assert len(self.seq_lens) == num_seqs - assert self.seq_lens_tensor is not None - assert self.seq_lens_tensor.shape == (num_seqs, ) - assert self.max_query_len == 1 - assert self.max_prefill_seq_len == 0 - - assert self.block_tables is not None - assert self.block_tables.shape[0] == num_seqs - - # Update query lengths. Note that we update only queries and not seqs, - # since tensors may be padded due to captured cuda graph batch size - for i in range(num_queries): - self.seq_lens[i] += 1 - self.max_decode_seq_len = max(self.seq_lens) - if enable_custom_op(): - #advance a step on NPU for existing inputs for a multi-step runner if custom ops is enabled - torch.ops._C.advance_step_flashattn_ascendc( - num_seqs=num_seqs, - num_queries=num_queries, - block_size=block_size, - input_tokens=model_input.input_tokens, - sampled_token_ids=sampled_token_ids, - input_positions=model_input.input_positions, - seq_lens=self.seq_lens_tensor, - slot_mapping=self.slot_mapping, - block_tables=self.block_tables) - else: - # use traditional Pytorch method for updating these tensors. 
- # update input_tokens - sampled_token_ids_list = sampled_token_ids[: - num_queries].squeeze( # type: ignore - -1) - model_input.input_tokens[: - num_queries] = sampled_token_ids_list # type: ignore - - # get seq_lens and input_positions - seq_lens = self.seq_lens_tensor[:num_queries] - next_seq_lens = seq_lens + 1 - next_input_pos = next_seq_lens - 1 - - # update seq_lens and input_positions - self.seq_lens_tensor[:num_queries] = next_seq_lens - model_input.input_positions[: - num_queries] = next_input_pos # type: ignore - - # 计算 block index 和 offset - block_idx = next_input_pos // block_size - block_offset = next_input_pos % block_size - - current_block_table = self.block_tables.gather( - 1, block_idx.unsqueeze(-1)).squeeze(-1) - slot_num = current_block_table * block_size + block_offset - - # update slot_mapping - self.slot_mapping[:num_queries] = slot_num - - -class AscendMetadataBuilder(CommonMetadataBuilder[AscendMetadata]): - - _attn_mask_builder = None # noqa - - def __init__(self, input_builder: "ModelInputForNPUBuilder"): - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - - self.attn_mask = None - self.compress_mask = None - self.chunk_mask = None - if AscendMetadataBuilder._attn_mask_builder is None: - AscendMetadataBuilder._attn_mask_builder = AttentionMaskBuilder( - 128, self.input_builder.runner.model_config.dtype) - - def _add_seq_group( - self, inter_data: ModelInputForNPUBuilder.InterDataForSeqGroup, - chunked_prefill_enabled: bool): - """Add a sequence group to the metadata. Specifically update/append - 1. context length. - 2. block table. - 3. slot mapping. - """ - is_prompt = inter_data.is_prompt - block_tables = inter_data.block_tables - - for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, - curr_sliding_window_block) in zip( - inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], - inter_data.orig_seq_lens, inter_data.seq_lens, - inter_data.query_lens, inter_data.context_lens, - inter_data.curr_sliding_window_blocks): - self.context_lens.append(context_len) - if is_prompt: - self.num_prefills += 1 - self.num_prefill_tokens += token_len - self.prefill_seq_lens.append(seq_len) - else: - self.num_decode_tokens += query_len - self.curr_seq_lens.append(curr_seq_len) - - # Compute block table. - # TODO(sang): Combine chunked prefill and prefix caching by - # only allowing multiple of block_size chunk size. - # NOTE: This only works for oooooooxxx style attention. - block_table: List[int] = [] - prefix_cache_hit = any([ - inter_data.prefix_cache_hit - for inter_data in self.input_builder.inter_data_list - ]) - if prefix_cache_hit: - # NOTE(woosuk): For flash-attn, the block table should - # include the entries for the incoming prefill tokens. - if block_tables is not None: - block_table = block_tables[seq_id] - elif ((chunked_prefill_enabled or not is_prompt) - and block_tables is not None): - if curr_sliding_window_block == 0: - block_table = block_tables[seq_id] - else: - block_table = block_tables[seq_id][ - -curr_sliding_window_block:] - self.block_tables.append(block_table) - - # Compute slot mapping. 
- is_profile_run = is_block_tables_empty(block_tables) - start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, - context_len, - self.sliding_window) - compute_slot_mapping( - is_profile_run, - self.slot_mapping, - seq_id, - seq_len, - context_len, - start_idx, - self.block_size, - inter_data.block_tables, - ) - - def _get_graph_runner_block_tables( - self, num_seqs: int, - block_tables: List[List[int]]) -> torch.Tensor: - # The shape of graph_block_tables is - # [max batch size, max context len // block size]. - - max_batch_size, max_blocks = self.runner.graph_block_tables.shape - assert max_batch_size >= num_seqs - - graph_block_tables = self.runner.graph_block_tables # [:num_seqs] - for i, block_table in enumerate(block_tables): - if block_table: - num_blocks = len(block_table) - if num_blocks <= max_blocks: - graph_block_tables[i, :num_blocks] = block_table - else: - graph_block_tables[ - i, :max_blocks] = block_table[:max_blocks] - - return torch.from_numpy(graph_block_tables).to( - device=self.runner.device, non_blocking=True) - - def build( - self, - seq_lens: List[int], - query_lens: List[int], - graph_pad_size: int, - ): - """Build attention metadata with on-device tensors. - - Args: - seq_lens: The maybe padded sequence lengths of the input sequences. - query_lens: The query lengths of the input sequences. - """ - for inter_data in self.input_builder.inter_data_list: - self._add_seq_group(inter_data, - self.input_builder.chunked_prefill_enabled) - - device = self.runner.device - dtype = self.runner.model_config.dtype - use_npu_graph = graph_pad_size != -1 - - max_query_len = max(query_lens) - max_prefill_seq_len = max(self.prefill_seq_lens, default=0) - max_decode_seq_len = max(self.curr_seq_lens, default=0) - max_seq_len = max(max_prefill_seq_len, max_decode_seq_len) - num_decode_tokens = self.num_decode_tokens - - if self.num_prefills == 0 and use_npu_graph: - num_seqs = len(seq_lens) - self.slot_mapping.extend([PAD_SLOT_ID] * graph_pad_size) - self.block_tables.extend([[]] * graph_pad_size) - block_tables = self._get_graph_runner_block_tables( - num_seqs, self.block_tables) - else: - block_tables = make_tensor_with_pad( - self.block_tables, - pad=0, - dtype=torch.int32, - device=device, - ) - - if self.num_prefills > 0: - if block_tables is None or block_tables.numel() == 0: - # normal mask - self.attn_mask = AscendMetadataBuilder._attn_mask_builder.get_attn_mask( # type: ignore - max_prefill_seq_len, dtype, device) - if is_310p(): - mask_nz = nd_to_nz_2d(self.attn_mask) - mask_nz = torch_npu.npu_format_cast( - mask_nz.contiguous(), ACL_FORMAT_FRACTAL_NZ) - self.attn_mask = mask_nz - elif self.num_decode_tokens == 0 and not self.input_builder.chunked_prefill_enabled: - # compress mask for prefix cache - self.compress_mask = AscendMetadataBuilder._attn_mask_builder.get_attn_mask( # type: ignore - 128, dtype, device) - else: - # chunk_mask for chunk prefill - attn_mask = AscendMetadataBuilder._attn_mask_builder.get_attn_mask( # type: ignore - max_seq_len, dtype, device) - if attn_mask.numel() > 1 and attn_mask[0][1] > 0: - # Do not use in-place multiplication to avoid modifying `attn_mask_cache`! 
- attn_mask = attn_mask * -10000 - chunk_mask_list = [] - for i, seq_len in enumerate(seq_lens): - context_len = self.context_lens[i] - chunk_mask_list.append(attn_mask[context_len:seq_len]) - self.chunk_mask = torch.cat(chunk_mask_list, 0) - else: - self.attn_mask = None - self.compress_mask = None - self.chunk_mask = None - - assert max_query_len > 0, "query_lens: {}".format(query_lens) - - assert device is not None - slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.int32, - device, self.runner.pin_memory) - seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, - self.runner.pin_memory) - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - self.multimodal_placeholder_maps.items() - } - - return AscendMetadata( - num_prefills=self.num_prefills, - slot_mapping=slot_mapping_tensor, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - seq_lens=seq_lens, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=True, - seq_lens_tensor=seq_lens_tensor, - query_lens=query_lens, - max_query_len=max_query_len, - max_prefill_seq_len=max_prefill_seq_len, - max_decode_seq_len=max_decode_seq_len, - block_tables=block_tables, - attn_mask=self.attn_mask, - compress_mask=self.compress_mask, - chunk_mask=self.chunk_mask, - chunked_prefill_enabled=self.input_builder.chunked_prefill_enabled, - ) - - -class AscendAttentionBackendImpl(AttentionImpl): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads - self.hidden_size = self.num_heads * self.head_size - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = sliding_window - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, - dtype=torch.float32, - device="npu") - self.alibi_slopes = alibi_slopes - self.attn_type = attn_type - - assert self.num_heads % self.num_kv_heads == 0 - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.seq_len_cpu_tensor = None - self.query_len_cpu_tensor = None - self.key_cache = None - self.value_cache = None - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AscendMetadata, - attn_type: str = AttentionType.DECODER, - output: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with Ascend attention. - Args: - query: shape = [num_tokens, num_heads * head_size] - num_tokens = batch_size * seq_len - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache: shape = [2, num_blocks, block_size, - num_kv_heads, head_size] - key_cache = [num_blocks, block_size, - num_kv_heads, head_size] - value_cache = [num_blocks, block_size, - num_kv_heads, head_size] - attn_metadata: Metadata for attention. 
- Returns: - shape = [batch_size, seq_len * num_heads * head_size] - """ - assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 - # View q k v to BSH. - num_tokens = query.shape[0] - query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - # TODO: Remove this contiguous in the future. - value = value.contiguous() - attn_type = self.attn_type - - output = torch.empty(num_tokens, - self.num_heads, - self.head_size, - dtype=query.dtype, - device=query.device) - - if kv_cache.numel() > 0: - if self.key_cache is None: - self.key_cache, self.value_cache = kv_cache[0], kv_cache[1] - slots = attn_metadata.slot_mapping - - if hasattr(layer, 'quant_method'): - isPrefill = True if attn_metadata.num_prefills > 0 else False - if isPrefill: - assert attn_metadata.prefill_metadata is not None - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.prefill_metadata.seq_lens).astype( - np.int32)) - else: - assert attn_metadata.decode_metadata is not None - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.decode_metadata.seq_lens).astype( - np.int32)) - block_tables = attn_metadata.decode_metadata.block_tables if attn_metadata.decode_metadata else None - # Details of kv_cache arrangement in attention quantization - # are implemented by quant_method. - layer.quant_method.apply( - layer, - query, - key, - value, - self.key_cache, - self.value_cache, - self.scale, - block_tables, - isPrefill, - attn_metadata, - output, - seq_lens_tensor_cpu=self.seq_lens_tensor_cpu) - else: - if self.key_cache is not None: - torch_npu._npu_reshape_and_cache(key=key, - value=value, - key_cache=self.key_cache, - value_cache=self.value_cache, - slot_indices=slots) - - if attn_metadata.num_prefills > 0: - # Prefix cache disabled and chunk prefill disabled or no prefix cache hit - if (attn_metadata.block_tables is None - or attn_metadata.block_tables.numel() == 0): - if attn_type == AttentionType.ENCODER_ONLY: - # TODO: change to use torch_npu encoder attention op, instead - # of torch sdpa - query = query.movedim(0, query.dim() - 2) - key = key.movedim(0, key.dim() - 2) - value = value.movedim(0, value.dim() - 2) - - causal_attn = (attn_type == AttentionType.DECODER) - if attn_metadata.seq_lens is not None: - seq_lens_q = seq_lens_kv = attn_metadata.seq_lens - attn_masks = [None] * len(seq_lens_q) - start_q, start_kv = 0, 0 - for seq_len_q, seq_len_kv, mask in zip( - seq_lens_q, seq_lens_kv, attn_masks): - end_q = start_q + seq_len_q - end_kv = start_kv + seq_len_kv - sub_out = scaled_dot_product_attention( - query[None, :, start_q:end_q, :], - key[None, :, start_kv:end_kv, :], - value[None, :, start_kv:end_kv, :], - attn_mask=mask, - dropout_p=0.0, - is_causal=causal_attn and mask is None, - scale=self.scale).squeeze(0).movedim( - query.dim() - 2, 0) - output[start_q:end_q, :, :] = sub_out - start_q, start_kv = end_q, end_kv - else: - assert attn_metadata.attn_mask is not None - mask = attn_metadata.attn_mask - assert attn_metadata.prefill_metadata is not None - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.prefill_metadata.seq_lens). 
- astype(np.int32)) - if is_310p(): - # align q k v output tensors - query = aligned_16(query) - key = aligned_16(key) - value = aligned_16(value) - output = aligned_16(output) - - # do reformat in case of broadcasted tensors - mask = mask.repeat( - self.seq_lens_tensor_cpu.size(0), 1, 1, 1) - mask = torch_npu.npu_format_cast( - mask.contiguous(), ACL_FORMAT_FRACTAL_NZ) - torch_npu._npu_flash_attention( - query=query, - key=key, - value=value, - mask=mask, - seq_len=self.seq_lens_tensor_cpu, - scale_value=self.scale, - num_heads=self.num_heads, - num_kv_heads=self.num_kv_heads, - out=output) - output = output[:num_tokens, :, :] - # Prefix cache only and cache hit - elif attn_metadata.num_decode_tokens == 0 and not attn_metadata.chunked_prefill_enabled: - assert kv_cache is not None - assert attn_metadata.prefill_metadata is not None - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array( - attn_metadata.prefill_metadata.seq_lens).astype( - np.int32)) - self.query_lens_tensor_cpu = torch.from_numpy( - np.array( - attn_metadata.prefill_metadata.query_lens).astype( - np.int32)) - block_tables = attn_metadata.prefill_metadata.block_tables - assert attn_metadata.compress_mask is not None - compress_mask = attn_metadata.compress_mask - torch_npu._npu_flash_attention_qlens( - query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - block_table=block_tables, - mask=compress_mask, - seq_len=self.query_lens_tensor_cpu, - context_lens=self.seq_lens_tensor_cpu, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - out=output) - # Splitfuse - else: - assert kv_cache is not None - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.seq_lens).astype(np.int32)) - self.query_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.query_lens).astype(np.int32)) - block_tables = attn_metadata.block_tables - assert attn_metadata.chunk_mask is not None - chunk_mask = attn_metadata.chunk_mask - torch_npu._npu_paged_attention_splitfuse( - query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - block_table=block_tables, - context_lens=self.seq_lens_tensor_cpu, - mask=chunk_mask, - seq_len=self.query_lens_tensor_cpu, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - out=output) - # Decode only - else: - assert self.key_cache is not None - assert self.value_cache is not None - assert attn_metadata.decode_metadata is not None - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.decode_metadata.seq_lens).astype( - np.int32)) - if is_310p(): - # # seq_lens_tensor needs to be transferred to the device for 310P - self.seq_lens_tensor_cpu = self.seq_lens_tensor_cpu.to( - device=self.key_cache.device) - block_tables = attn_metadata.decode_metadata.block_tables - torch_npu._npu_paged_attention( - query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - block_table=block_tables, - context_lens=self.seq_lens_tensor_cpu, - out=output) - - return output.view(num_tokens, self.hidden_size) - - -class AscendMLAAttentionBackendImpl(MLAAttentionImpl): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = 
AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - **extra_impl_args, - ) -> None: - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads - self.hidden_size = self.num_heads * self.head_size - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = sliding_window - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, - dtype=torch.float32, - device="npu") - self.alibi_slopes = alibi_slopes - self.attn_type = attn_type - - assert self.num_heads % self.num_kv_heads == 0 - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.seq_len_cpu_tensor = None - - # MLA Args - self.q_lora_rank = extra_impl_args['q_lora_rank'] - self.kv_lora_rank = extra_impl_args['kv_lora_rank'] - self.qk_nope_head_dim = extra_impl_args['qk_nope_head_dim'] - self.qk_rope_head_dim = extra_impl_args['qk_rope_head_dim'] - self.qk_head_dim = extra_impl_args['qk_head_dim'] - self.v_head_dim = extra_impl_args['v_head_dim'] - self.rotary_emb = extra_impl_args['rotary_emb'] - self.q_proj = extra_impl_args['q_proj'] - self.kv_b_proj = extra_impl_args['kv_b_proj'] - self.o_proj = extra_impl_args['o_proj'] - self.kv_a_proj_with_mqa = extra_impl_args.get('kv_a_proj_with_mqa', - None) - self.kv_a_layernorm = extra_impl_args.get('kv_a_layernorm', None) - self.k_pe_cache = None - self.k_nope_cache = None - self.w_kc = None - self.w_vc = None - - ascend_config = get_ascend_config() - self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - - # TODO: support numHeads / numKvHeads < 16 in MLA kernel - if self.torchair_graph_enabled: - assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \ - ("The allowed number of queries per kv when enabling both MLA and Graph mode" - " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite," - " as it only has 16 attention heads. And if you're using DeepSeek-V3 or DeepSeek-R1," - " please make sure after the tensor parallel split, num_heads / num_kv_heads in " - "{32, 64, 128}.") - - def exec_kv( - self, - hidden_states: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - kv_cache: Tuple, - slots: torch.Tensor, - ): - B = hidden_states.shape[0] - N = self.num_kv_heads - S = 1 - kv = self.kv_a_proj_with_mqa(hidden_states)[0] - # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] - kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) - - k_pe, k_nope, _, _ = torch.ops.npu_inference.npu_kv_rmsnorm_rope_cache( - kv, - self.kv_a_layernorm.weight, - cos, - sin, - slots.to(torch.int64), - kv_cache[1], - kv_cache[0], - epsilon=self.kv_a_layernorm.variance_epsilon, - cache_mode="PA", - ) - - return k_pe, k_nope - - def apply_rotary_emb( - self, - x: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - is_neox_style: bool, - ) -> torch.Tensor: - """ - Args: - x: [num_tokens, num_heads, head_size] - cos: [num_tokens, head_size // 2] - sin: [num_tokens, head_size // 2] - is_neox_style: Whether to use the Neox-style or GPT-J-style rotary - positional embeddings. 
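The Neox-style vs GPT-J-style distinction is easiest to see on a toy tensor; the following self-contained sketch reproduces the same math with assumed toy values (one token, one head, head_size = 4):

import torch

x = torch.tensor([[[1.0, 2.0, 3.0, 4.0]]])   # [num_tokens, num_heads, head_size]
theta = torch.tensor([[0.5, 0.5]])           # [num_tokens, head_size // 2]
cos, sin = torch.cos(theta), torch.sin(theta)

def rope_ref(x, cos, sin, is_neox_style):
    # Same math as apply_rotary_emb: Neox rotates the two contiguous halves,
    # GPT-J rotates interleaved even/odd lanes.
    cos = cos.unsqueeze(-2).to(x.dtype)
    sin = sin.unsqueeze(-2).to(x.dtype)
    if is_neox_style:
        x1, x2 = torch.chunk(x, 2, dim=-1)
    else:
        x1, x2 = x[..., ::2], x[..., 1::2]
    o1 = x1 * cos - x2 * sin
    o2 = x2 * cos + x1 * sin
    return torch.cat((o1, o2), dim=-1) if is_neox_style else torch.stack(
        (o1, o2), dim=-1).flatten(-2)

print(rope_ref(x, cos, sin, True))   # rotates the pairs (1.0, 3.0) and (2.0, 4.0)
print(rope_ref(x, cos, sin, False))  # rotates the pairs (1.0, 2.0) and (3.0, 4.0)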
- """ - cos = cos.unsqueeze(-2).to(x.dtype) - sin = sin.unsqueeze(-2).to(x.dtype) - if is_neox_style: - x1, x2 = torch.chunk(x, 2, dim=-1) - else: - x1 = x[..., ::2] - x2 = x[..., 1::2] - o1 = x1 * cos - x2 * sin - o2 = x2 * cos + x1 * sin - if is_neox_style: - return torch.cat((o1, o2), dim=-1) - else: - return torch.stack((o1, o2), dim=-1).flatten(-2) - - def rope_single( - self, - x: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - ) -> torch.Tensor: - B, N, D = x.shape - S = 1 - x = x.view(B, N, S, D) - x = torch.ops.npu_inference.npu_interleave_rope(x, cos, sin) - return x.view(B, N, D) - - def process_weights_after_loading(self, act_dtype: torch.dtype): - if self.w_kc is None or self.w_vc is None: - kv_b_proj_weight = self.kv_b_proj.weight.reshape( - self.num_heads, self.qk_nope_head_dim + self.v_head_dim, - self.kv_lora_rank) - self.w_kc = kv_b_proj_weight[:, :self. - qk_nope_head_dim, :].contiguous() - self.w_vc = kv_b_proj_weight[:, - self.qk_nope_head_dim:, :].transpose( - 1, 2).contiguous() - - def forward( - self, - layer: AttentionLayer, - hidden_states_or_q_c: torch.Tensor, - hidden_states_or_kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AscendMetadata, - attn_type: str = AttentionType.DECODER, - output: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with Ascend attention. - Args: - hidden_states_or_q_c: shape = [num_tokens, num_heads * head_size] - num_tokens = batch_size * seq_len - hidden_states_or_kv_c_normed: shape = [num_tokens, num_kv_heads * head_size] - k_pe: shape = [num_tokens, num_kv_heads * head_size] - kv_cache: shape = [1, num_blocks, block_size, - num_kv_heads * head_size] - attn_metadata: Metadata for attention. - Returns: - shape = [batch_size, seq_len * num_heads * head_size] - """ - assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 - attn_type = self.attn_type - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "PallasAttentionBackendImpl") - - if attn_metadata is None: - # for profile run - return hidden_states_or_q_c - - num_tokens = hidden_states_or_q_c.shape[0] - q = self.q_proj(hidden_states_or_q_c)[0].view(-1, self.num_heads, - self.qk_head_dim) - q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], - dim=-1) - if k_pe is None and attn_metadata.decode_metadata: - seq_len = self.rotary_emb.max_position_embeddings - - cos = self.rotary_emb.cos_cached[:seq_len].to(dtype=q_pe.dtype) - sin = self.rotary_emb.sin_cached[:seq_len].to(dtype=q_pe.dtype) - cos = cos[attn_metadata.input_positions] - sin = sin[attn_metadata.input_positions] - cos = cos[:, None, None, :] - sin = sin[:, None, None, :] - - q_pe = self.rope_single(q_pe, cos, sin) - k_pe, k_nope = self.exec_kv(hidden_states_or_kv_c_normed, cos, sin, - kv_cache, attn_metadata.slot_mapping) - else: - if k_pe is None: - # NOTE: k_pe is None when graph mode enabled - kv_c, k_pe = self.kv_a_proj_with_mqa( - hidden_states_or_kv_c_normed)[0].split( - [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) - else: - kv_c_normed = hidden_states_or_kv_c_normed - k_pe = k_pe.view(num_tokens, self.num_kv_heads, -1) - if self.rotary_emb.__class__.__name__ == 'RotaryEmbedding': - # NOTE: When scaling not specified - ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape - q_pe = q_pe.reshape(num_tokens, -1) - k_pe = k_pe.reshape(num_tokens, 
-1) - q_pe, k_pe = self.rotary_emb(attn_metadata.input_positions, - q_pe, k_pe) - q_pe = q_pe.view(ori_q_pe_shape) - k_pe = k_pe.view(ori_k_pe_shape) - else: - q_pe, k_pe = self.rotary_emb(attn_metadata.input_positions, - q_pe, k_pe) - - if attn_metadata.num_prefills > 0: - kv = self.kv_b_proj(kv_c_normed)[0].view(num_tokens, - self.num_heads, -1) - k_nope, value = kv.split([self.qk_nope_head_dim, self.v_head_dim], - dim=-1) - else: - q_nope_t = torch.transpose(q_nope, 0, 1) - q_nope_out = torch.bmm(q_nope_t, self.w_kc) - q_nope = torch.transpose(q_nope_out, 0, 1) - - query = torch.cat([q_nope, q_pe], dim=-1).view(num_tokens, - self.num_heads, -1) - - # TODO: Replace the env with more flexible expressions - if self.torchair_graph_enabled: - if len(kv_cache) > 0 and kv_cache[0].numel( - ) > 0 and attn_metadata.num_prefills > 0: - slots = attn_metadata.slot_mapping - # NOTE: Separate the kv cache in advance to avoid OOM or other issues - torch_npu._npu_reshape_and_cache(key=kv_c_normed.view( - num_tokens, self.num_kv_heads, -1), - value=k_pe, - key_cache=kv_cache[0], - value_cache=kv_cache[1], - slot_indices=slots) - elif kv_cache.numel() > 0: - # TODO replace this naive implement with fusion kernel - concat_and_cache_mla(kv_c_normed, k_pe, kv_cache, - attn_metadata.slot_mapping) - - if attn_metadata.num_prefills > 0: - attn_output = torch.empty(num_tokens, - self.num_heads, - self.v_head_dim, - dtype=query.dtype, - device=query.device) - if (attn_metadata.block_tables is None - or attn_metadata.block_tables.numel() == 0): - assert attn_metadata.attn_mask is not None - assert attn_metadata.prefill_metadata is not None - assert attn_metadata.prefill_metadata.seq_lens is not None - mask = attn_metadata.attn_mask - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.prefill_metadata.seq_lens).astype( - np.int32)) - k_pe = k_pe.repeat(1, self.num_heads, 1) - key = torch.cat( - [k_nope.view(num_tokens, self.num_heads, -1), k_pe], dim=2) - torch_npu._npu_flash_attention( - query=query, - key=key, - value=value, - mask=mask, - seq_len=self.seq_lens_tensor_cpu, - scale_value=self.scale, - num_heads=self.num_heads, - num_kv_heads=self.num_heads, - out=attn_output) - else: - # TODO: Will support prefix cache and chunked prefill soon. - raise RuntimeError( - "Prefix cache and chunked prefill are currently not supported." 
- ) - elif attn_metadata.decode_metadata: - assert kv_cache is not None - if self.torchair_graph_enabled: - # shape of query for npu graph mode should be: - # [bs, num_heads_per_rank, seq_len, dim] - q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) - q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) - # shape of knope/k_pe for npu graph mode should be: - # [num_blocks, num_kv_heads, block_size, self.kv_lora_rank/self.qk_rope_head_dim] - block_size = kv_cache[0].shape[1] - k_nope = k_nope.view(-1, self.num_kv_heads, block_size, - self.kv_lora_rank) - k_pe = k_pe.view(-1, self.num_kv_heads, block_size, - self.qk_rope_head_dim) - attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score( - q_nope, - k_nope, - k_nope, - query_rope=q_pe, - key_rope=k_pe, - num_heads=self.num_heads, - num_key_value_heads=self.num_kv_heads, - input_layout="BNSD", - atten_mask=attn_metadata.attn_mask, - scale=self.scale, - antiquant_mode=0, - antiquant_scale=None, - block_table=attn_metadata.block_tables, - block_size=block_size, - actual_seq_lengths_kv=attn_metadata.seq_lens, - ) - attn_output = attn_output.view(num_tokens, -1, - self.kv_lora_rank).transpose( - 0, 1) - attn_output = torch.bmm(attn_output, self.w_vc).transpose(0, 1) - else: - # if torch.empty is used here, the preemptive scheduling case of - # test_mtp_correctness.py will fail to run. - attn_output = torch.randn( - [num_tokens, self.num_heads, self.kv_lora_rank], - dtype=query.dtype, - device=query.device) - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.decode_metadata.seq_lens).astype( - np.int32)) - block_tables = attn_metadata.decode_metadata.block_tables - torch_npu._npu_paged_attention_mla( - query=query, - key_cache=kv_cache, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - block_table=block_tables, - context_lens=self.seq_lens_tensor_cpu, - mla_vheadsize=self.kv_lora_rank, - out=attn_output) - attn_output_t = torch.transpose(attn_output, 0, 1) - attn_output_t = torch.bmm(attn_output_t, self.w_vc) - attn_output = torch.transpose(attn_output_t, 0, 1) - - output, _ = self.o_proj(attn_output.reshape(num_tokens, -1)) - - return output diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index 391e41d6ce..e60344836d 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -73,38 +73,6 @@ # Future Plan: # Keep this patch in vllm-ascend. # -# ** File: worker/patch_common/patch_multi_step_worker.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. `vllm.spec_decode.multi_step_worker.MultiStepWorker.sampler_output` -# Why: -# There are cuda hard code (current_platform.is_cuda_alike()) in -# `MultiStepWorker.sampler_output`, and we need to use the patched `TP1DraftModelRunner` in it. -# How: -# Make speculative decoding extensible to different backends. -# - support attention metadata register to the set supported spec decode -# - offer a api in platform to determine whether spec decode is supported, -# and deprecate is_cuda_alike in it. -# Related PR (if no, explain why): -# - https://github.com/vllm-project/vllm/pull/15195 -# - https://github.com/vllm-project/vllm-ascend/pull/395 -# Future Plan: -# Revert it when the related pr is merged in vllm and vllm-ascend. -# -# ** File: worker/patch_common/patch_spec_decode_worker.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. 
`vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker` -# Why: -# We need to use the patched `TP1DraftModelRunner` in `SpecDecodeWorker.create_worker`. -# The mainly reason to overwrite `TP1DraftModelRunner`is the hard code of -# `FlashAttentionMetadata` -# How: -# ditto -# Related PR (if no, explain why): -# - https://github.com/vllm-project/vllm/pull/15195 -# - https://github.com/vllm-project/vllm-ascend/pull/395 -# Future Plan: -# Revert it when the related pr is merged in vllm and vllm-ascend. -# # ** File: worker/patch_common/patch_distributed.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.distributed.parallel_state.GroupCoordinator` diff --git a/vllm_ascend/patch/worker/patch_common/__init__.py b/vllm_ascend/patch/worker/patch_common/__init__.py index d78b6dc8b4..c9358682a7 100644 --- a/vllm_ascend/patch/worker/patch_common/__init__.py +++ b/vllm_ascend/patch/worker/patch_common/__init__.py @@ -20,6 +20,4 @@ import vllm_ascend.patch.worker.patch_common.patch_utils # noqa isort:skip import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa -import vllm_ascend.patch.worker.patch_common.patch_multi_step_worker # noqa import vllm_ascend.patch.worker.patch_common.patch_sampler # noqa -import vllm_ascend.patch.worker.patch_common.patch_spec_decode_worker # noqa diff --git a/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py b/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py deleted file mode 100644 index 53ce312676..0000000000 --- a/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py +++ /dev/null @@ -1,91 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import List, Set, Tuple - -import torch -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.multi_step_worker import MultiStepWorker - -from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner - - -def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - seq_ids_with_bonus_token_in_last_step: Set[int], -) -> Tuple[List[SamplerOutput], bool]: - """Run the model forward pass sample_len times. Returns the list of - sampler output, one per model forward pass, along with indicator of - whether torch tensor in sampler output need to be transposed in latter - sampler_output_to_torch logic. - - For multi step worker, this indicator shall be True. - """ - self._raise_if_unsupported(execute_model_req) - # Expand the batch for sequences with a bonus token. - # Perform a forward pass on the expanded batch and filter the - # response to retain only the original sequences' responses. 
- expanded_request, indices_of_seq_with_bonus_tokens =\ - self._expand_execute_model_request( - execute_model_req, seq_ids_with_bonus_token_in_last_step) - - # Run model sample_len times. - model_outputs: List[SamplerOutput] = [] - - # TODO: supports_gpu_multi_step is False in ASCEND - if isinstance(self.model_runner, TP1DraftModelRunner) and \ - self.model_runner.supports_gpu_multi_step(expanded_request): - # Here we run the draft_model_runner with multi-step prepare - # on the GPU directly - expanded_request.num_steps = sample_len - self.model_runner.set_indices_of_seq_with_bonus_tokens( - indices_of_seq_with_bonus_tokens) - model_outputs = self.execute_model(execute_model_req=expanded_request) - else: - # Here we run multi-step directly, with every step prepared - # on the CPU. - # TODO Remove this branch once DraftModelRunner supports TP>1 - # and other restrictions that are part of DraftModelRunner's - # supports_gpu_multi_step(..) - if expanded_request.previous_hidden_states is not None: - self.worker.model_runner.return_hidden_states = True - for _ in range(sample_len): - model_output: List[SamplerOutput] = self.worker.execute_model( - execute_model_req=expanded_request) - assert (len(model_output) == 1 - ), "composing multistep workers not supported" - model_output = model_output[0] - self._maybe_update_previous_hidden_states(model_output, - expanded_request) - - self._append_new_tokens(model_output, - expanded_request.seq_group_metadata_list, - indices_of_seq_with_bonus_tokens) - model_outputs.append(model_output) - - # move indices to device to avoid stream sync - indices_of_seq_with_bonus_tokens = torch.tensor( - indices_of_seq_with_bonus_tokens, device=self.device) - filtered_model_outputs = self._filter_model_output( - model_outputs, indices_of_seq_with_bonus_tokens) - return filtered_model_outputs, True - - -MultiStepWorker.sampler_output = torch.inference_mode()(sampler_output) diff --git a/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py b/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py deleted file mode 100644 index d271e65bfc..0000000000 --- a/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py +++ /dev/null @@ -1,157 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from typing import Any, Dict, Optional - -from vllm.config import ParallelConfig -from vllm.logger import logger -from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.model_executor.layers.spec_decode_base_sampler import \ - SpecDecodeBaseSampler -from vllm.model_executor.layers.typical_acceptance_sampler import \ - TypicalAcceptanceSampler -from vllm.spec_decode.medusa_worker import MedusaWorker -from vllm.spec_decode.mlp_speculator_worker import MLPSpeculatorWorker -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.ngram_worker import NGramWorker -from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker -from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker -from vllm.worker.worker_base import WorkerBase - -from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner - - -def create_worker( - cls, - scorer_worker: WorkerBase, - draft_worker_kwargs: Dict[str, Any], - disable_mqa_scorer: bool, - disable_by_batch_size: Optional[int], - draft_token_acceptance_method: str, - typical_acceptance_sampler_posterior_threshold: float, - typical_acceptance_sampler_posterior_alpha: float, - disable_logprobs: bool, - disable_log_stats: bool, - num_speculative_tokens: int, -) -> "SpecDecodeWorker": - - allow_zero_draft_token_step = True - enable_lm_head_weight_load = False - num_spec_prefill_steps = 1 - ngram_prompt_lookup_max = ( - draft_worker_kwargs.pop("ngram_prompt_lookup_max")) - ngram_prompt_lookup_min = ( - draft_worker_kwargs.pop("ngram_prompt_lookup_min")) - - draft_model_config = draft_worker_kwargs["vllm_config"].model_config - draft_parallel_config: ParallelConfig = draft_worker_kwargs[ - 'vllm_config'].parallel_config - if ngram_prompt_lookup_max > 0: - draft_worker_kwargs[ - "device_type"] = scorer_worker.device_config.device.type - proposer_worker = NGramWorker(**draft_worker_kwargs) - proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, - ngram_prompt_lookup_max) - else: - # TODO(Yizhou): A quick fix, must be refactored ASAP - # ngram need not this fix. 
- draft_worker_kwargs[ - "vllm_config"].parallel_config.expert_parallel_size = 1 - draft_worker_kwargs[ - "vllm_config"].parallel_config.expert_tensor_parallel_size = 1 - - draft_tp = draft_parallel_config.tensor_parallel_size - target_tp = scorer_worker.parallel_config.tensor_parallel_size - - if draft_model_config.hf_config.model_type == "mlp_speculator": - proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs) - elif draft_model_config.hf_config.model_type == "medusa": - proposer_worker = MedusaWorker(**draft_worker_kwargs) - else: - # Note: The current version of the MTP module doer not support - # the use of TP1DraftModelRunner - if draft_tp == 1 and draft_model_config.hf_config.model_type !=\ - "deepseek_mtp": - draft_worker_kwargs["model_runner_cls"] = TP1DraftModelRunner - else: - if draft_model_config.hf_config.model_type == "eagle": - raise NotImplementedError( - f"{draft_model_config.hf_config.model_type} " - "does not support TP > 1 yet") - - allow_zero_draft_token_step = False - - # Load lm_head weight for eagle in init_device - if draft_model_config.hf_config.model_type == "eagle": - enable_lm_head_weight_load = True - - proposer_worker = MultiStepWorker(**draft_worker_kwargs) - if draft_model_config.hf_config.model_type == "deepseek_mtp": - num_spec_prefill_steps = draft_model_config.hf_config.n_predict - - proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker( - proposer_worker, draft_tp, target_tp) - - logger.info("Configuring SpecDecodeWorker with proposer=%s", - type(proposer_worker)) - - spec_decode_sampler: SpecDecodeBaseSampler = None - if draft_token_acceptance_method == "rejection_sampler": - spec_decode_sampler = RejectionSampler() - elif draft_token_acceptance_method == "typical_acceptance_sampler": - spec_decode_sampler = TypicalAcceptanceSampler( - posterior_threshold=\ - typical_acceptance_sampler_posterior_threshold, - posterior_alpha=typical_acceptance_sampler_posterior_alpha, - ) - logger.info( - "[Speculative Decoding] Configuring" - " SpecDecodeWorker with sampler=%s", type(spec_decode_sampler)) - - if not disable_mqa_scorer: - if scorer_worker.model_runner.attn_backend.get_name() != "FLASH_ATTN": - disable_mqa_scorer = True - logger.info("[Speculative Decoding] Disabling MQA scorer as the " - "MQA is only available with flash attn backend.") - - if draft_model_config and \ - draft_model_config.max_model_len < \ - scorer_worker.model_config.max_model_len: - disable_mqa_scorer = True - logger.info("[Speculative Decoding] Disabling MQA scorer as the " - "draft model max_model_len is smaller than the target " - "model max_model_len.") - - if not scorer_worker.model_runner.model_config.enforce_eager: - disable_mqa_scorer = True - logger.info("[Speculative Decoding] Disabling MQA scorer as the " - "target model is not running in eager mode.") - - return SpecDecodeWorker( - proposer_worker, - scorer_worker, - disable_mqa_scorer=disable_mqa_scorer, - disable_logprobs=disable_logprobs, - disable_log_stats=disable_log_stats, - disable_by_batch_size=disable_by_batch_size, - spec_decode_sampler=spec_decode_sampler, - allow_zero_draft_token_step=allow_zero_draft_token_step, - enable_lm_head_weight_load=enable_lm_head_weight_load, - num_spec_prefill_steps=num_spec_prefill_steps) - - -SpecDecodeWorker.create_worker = classmethod(create_worker) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 07fb07fcb6..111e13c5dc 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -180,18 +180,7 @@ def check_and_update_config(cls, 
vllm_config: VllmConfig) -> None: update_aclgraph_sizes(vllm_config) if parallel_config and parallel_config.worker_cls == "auto": - if envs.VLLM_USE_V1: - parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker" - elif vllm_config.speculative_config: - # NOTE: We set this var to `1` in vllm-ascend to avoid segment - # fault when using spec decode with V0 engine. - os.environ["ACL_OP_INIT_MODE"] = "1" - parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker" - parallel_config.sd_worker_cls = "vllm_ascend.worker.worker.NPUWorker" - elif vllm_config.scheduler_config.is_multi_step: - parallel_config.worker_cls = "vllm_ascend.worker.multi_step_worker.MultiStepWorker" - else: - parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker" + parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker" if cache_config: if cache_config.block_size is None: @@ -202,34 +191,33 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: ) cache_config.block_size = 128 - if envs.VLLM_USE_V1: - # Activate custom ops for v1, except on 310P - if not is_310p(): - compilation_config.custom_ops = ["all"] - - # If ascend_scheduler_config is enabled, - # extents original scheduler_config to use AscendScheduler. - if ascend_config.ascend_scheduler_config.enabled: - from vllm_ascend.core.schedule_config import \ - AscendSchedulerConfig - ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config( - vllm_config.scheduler_config, - ascend_config.ascend_scheduler_config) - vllm_config.scheduler_config = ascend_scheduler_config + # Activate custom ops for v1, except on 310P + if not is_310p(): + compilation_config.custom_ops = ["all"] + + # If ascend_scheduler_config is enabled, + # extents original scheduler_config to use AscendScheduler. + if ascend_config.ascend_scheduler_config.enabled: + from vllm_ascend.core.schedule_config import \ + AscendSchedulerConfig + ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config( + vllm_config.scheduler_config, + ascend_config.ascend_scheduler_config) + vllm_config.scheduler_config = ascend_scheduler_config @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, use_mla): - if use_v1 and use_mla: - return "vllm_ascend.attention.mla_v1.AscendMLABackend" + if not use_v1: + raise RuntimeError("V0 engine is not supported on vllm-ascend now!") + use_torchair = get_ascend_config().torchair_graph_config.enabled - if use_v1 and use_torchair: + if use_mla: + return "vllm_ascend.attention.mla_v1.AscendMLABackend" + elif use_torchair: return "vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend" - if use_v1: + else: return "vllm_ascend.attention.attention_v1.AscendAttentionBackend" - if use_mla: - return "vllm_ascend.attention.attention.AscendMLAAttentionBackend" - return "vllm_ascend.attention.attention.AscendAttentionBackend" @classmethod def get_punica_wrapper(cls) -> str: diff --git a/vllm_ascend/worker/__init__.py b/vllm_ascend/worker/__init__.py index ee59a056ef..116c73c06c 100644 --- a/vllm_ascend/worker/__init__.py +++ b/vllm_ascend/worker/__init__.py @@ -14,4 +14,3 @@ # See the License for the specific language governing permissions and # limitations under the License. 
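To spell out the selection order added to `get_attn_backend_cls` above, a standalone mirror of that branch (plain Python, no vLLM imports) shows that MLA takes precedence over the torchair graph backend:

def select_backend(use_v1: bool, use_mla: bool, use_torchair: bool) -> str:
    # Mirrors the updated get_attn_backend_cls: V0 is rejected outright,
    # then MLA > torchair > default V1 backend.
    if not use_v1:
        raise RuntimeError("V0 engine is not supported on vllm-ascend now!")
    if use_mla:
        return "vllm_ascend.attention.mla_v1.AscendMLABackend"
    if use_torchair:
        return "vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend"
    return "vllm_ascend.attention.attention_v1.AscendAttentionBackend"

assert select_backend(True, True, True).endswith("AscendMLABackend")
assert select_backend(True, False, True).endswith("AscendAttentionTorchairBackend")
assert select_backend(True, False, False).endswith("AscendAttentionBackend")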
# -import vllm_ascend.worker.cache_engine # noqa \ No newline at end of file diff --git a/vllm_ascend/worker/cache_engine.py b/vllm_ascend/worker/cache_engine.py deleted file mode 100644 index d8d9087745..0000000000 --- a/vllm_ascend/worker/cache_engine.py +++ /dev/null @@ -1,83 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# Adapted from vllm-project/vllm/vllm/worker/model_runner.py -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import Any, List - -import torch -from vllm.utils import is_pin_memory_available -from vllm.worker.cache_engine import CacheEngine - -from vllm_ascend.ascend_config import get_ascend_config - - -def allocate_kv_cache( - self, - num_blocks: int, - device: str, -) -> List[Any]: - """Allocates KV cache on the specified device.""" - kv_cache_shape = self.attn_backend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_kv_heads, self.head_size) - pin_memory = is_pin_memory_available() if device == "cpu" else False - kv_cache: List[Any] = [] - - ascend_config = get_ascend_config() - if ascend_config.torchair_graph_config.enabled: - # Align entries so they are 256 byte aligned for better performance - # Primarily targets MLA as this typically only ends up having entries - # be 128 byte aligned. - alloc_shape = kv_cache_shape - - for _ in range(self.num_attention_layers): - # null block in CpuGpuBlockAllocator requires at least that - # block to be zeroed-out. - # We zero-out everything for simplicity. - layer_kv_cache_nope = torch.zeros( - alloc_shape[:-1] + - (self.model_config.hf_text_config.kv_lora_rank, ), - dtype=self.dtype, - pin_memory=pin_memory, - device=device) - layer_kv_cache_pe = torch.zeros( - alloc_shape[:-1] + - (self.model_config.hf_text_config.qk_rope_head_dim, ), - dtype=self.dtype, - pin_memory=pin_memory, - device=device) - - # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) for cases - # when entry_shape is higher than 1D - kv_cache.append((layer_kv_cache_nope, layer_kv_cache_pe)) - else: - for _ in range(self.num_attention_layers): - # null block in CpuGpuBlockAllocator requires at least that - # block to be zeroed-out. - # We zero-out everything for simplicity. - layer_kv_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - pin_memory=pin_memory, - device=device) - - # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) for cases - # when entry_shape is higher than 1D - kv_cache.append(layer_kv_cache) - return kv_cache - - -CacheEngine._allocate_kv_cache = allocate_kv_cache diff --git a/vllm_ascend/worker/draft_model_runner.py b/vllm_ascend/worker/draft_model_runner.py deleted file mode 100644 index b070da1a7f..0000000000 --- a/vllm_ascend/worker/draft_model_runner.py +++ /dev/null @@ -1,320 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. 
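For a rough sense of what the split nope/pe allocation in the deleted cache_engine above costs per block, assume DeepSeek-style MLA dimensions (kv_lora_rank=512, qk_rope_head_dim=64, one latent KV head), block_size=128 and bfloat16; all of these numbers are illustrative assumptions:

block_size, num_kv_heads = 128, 1
kv_lora_rank, qk_rope_head_dim = 512, 64
bytes_per_elem = 2  # bfloat16

nope_bytes = block_size * num_kv_heads * kv_lora_rank * bytes_per_elem      # 131072
pe_bytes = block_size * num_kv_heads * qk_rope_head_dim * bytes_per_elem    # 16384
print(f"{(nope_bytes + pe_bytes) / 1024:.0f} KiB per layer per block")      # 144 KiB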
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import List, Optional - -import torch -from vllm.forward_context import set_forward_context -from vllm.logger import logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.worker.model_runner_base import (ModelRunnerBase, - ModelRunnerInputBase, - ModelRunnerWrapperBase) - -from vllm_ascend.attention.attention import AscendMetadata - -# A flag to enable debug prints for the updated input tensors -# before each step. -debug_advance_input = False -# A flag to allow GPU advance step for draft model runner. -# Set to False for debugging. -allow_gpu_advance_step = True - - -class TP1DraftModelRunner(ModelRunnerWrapperBase): - """Specialized model runner for speculative decoding draft model. - Since the draft model always execute k forward passes consecutively to - generate k speculative tokens in a single speculative decoding step, - we could get rid of most CPU-GPU synchronization and data transfer - overheads by keeping model input and output tensors on GPU all the time. - - TODOs: - 1. Currently supports only flash-attn, add support for other attn_backends. - 2. Support TP > 1 (this requires some designs because we do not expect - any broadcasting inside execute_model). 
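The behaviour this class docstring describes boils down to a loop like the following toy sketch (interface and greedy sampling assumed): each of the k draft steps feeds the freshly sampled token straight back in, so nothing has to round-trip through the CPU between steps.

import torch

@torch.inference_mode()
def draft_k_tokens(model, input_ids, positions, k: int):
    # Toy view of the k consecutive draft passes: decode-only, greedy sampling,
    # one new token per sequence per step, everything stays on the device.
    draft_tokens = []
    for _ in range(k):
        logits = model(input_ids=input_ids, positions=positions)  # [batch, vocab]
        next_token = logits.argmax(dim=-1, keepdim=True)          # [batch, 1], no .cpu()/.item()
        draft_tokens.append(next_token)
        input_ids = next_token          # feed back only the newly sampled token
        positions = positions + 1
    return torch.cat(draft_tokens, dim=-1)                        # [batch, k]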
- """ - - def __init__(self, model_runner: ModelRunnerBase): - super().__init__(model_runner) - - self.indices_of_seq_with_bonus_tokens = None - - def _update_sampling_metadata(self, sampling_metadata, num_seqs, - num_queries): - - assert sampling_metadata.num_prompts == 0 - assert len(sampling_metadata.seq_groups) == num_queries - assert sampling_metadata.selected_token_indices.shape == ( - num_queries, ) - # assert sampling_metadata.categorized_sample_indices == TODO: Add if needed # noqa: E501 - - # Verify that all sequences are decodes - for i in range(num_queries): - seq_group = sampling_metadata.seq_groups[i] - - assert seq_group.is_prompt is False # No prompt - assert seq_group.prompt_logprob_indices == [] # No prompt - assert seq_group.sample_indices == [i] # Simple - - def _gpu_advance_step(self, model_input: ModelRunnerInputBase, - last_output: SamplerOutput) -> ModelRunnerInputBase: - # Currently, we expect "decode mode" only - assert not model_input.is_prompt - - # Get num_seqs - num_seqs = len(model_input.seq_lens) - num_queries = len(model_input.query_lens) - - # Get output tokens GPU tensor - sampled_token_ids = last_output.sampled_token_ids - assert sampled_token_ids is not None - - # Update attn_metadata - attn_metadata = model_input.attn_metadata - assert isinstance(attn_metadata, AscendMetadata) - - attn_metadata.advance_step(model_input, sampled_token_ids, - self.block_size, num_seqs, num_queries) - - # Update sampling_metadata - sampling_metadata = model_input.sampling_metadata - self._update_sampling_metadata(sampling_metadata, num_seqs, - num_queries) - - # Create new input - new_model_input = self._model_input_cls( - input_tokens=model_input.input_tokens, - input_positions=model_input.input_positions, - attn_metadata=attn_metadata, - seq_lens=attn_metadata.seq_lens, - query_lens=model_input.query_lens, - # Notes: If vllm_ascend supports LORA, we need to - # add the following two params. - # lora_mapping=model_input.lora_mapping, - # lora_requests=model_input.lora_requests, - multi_modal_kwargs=model_input.multi_modal_kwargs, - sampling_metadata=model_input.sampling_metadata, - is_prompt=False, - ) - - # Ensure we skip CPU samples - assert new_model_input.sampling_metadata.skip_sampler_cpu_output is True - # We can reuse sampling tensors since every decode iteration is the same - new_model_input.sampling_metadata.reuse_sampling_tensors = True - - if debug_advance_input: - logger.debug("NEW INPUT: ") - logger.debug(" input_tokens = %s", new_model_input.input_tokens) - logger.debug(" input_positions = %s", - new_model_input.input_positions) - logger.debug(" seq_lens = %d", new_model_input.seq_lens) - logger.debug(" query_lens = %d", new_model_input.query_lens) - logger.debug(" attn_metadata:") - logger.debug(" seq_lens_tensor: %s", - attn_metadata.seq_lens_tensor) - logger.debug(" slot_mapping: %s", attn_metadata.slot_mapping) - logger.debug(" block_tables: %s", attn_metadata.block_tables) - - return new_model_input - - def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest): - """Determines if draft_model_runner GPU multi-step can be used. - Currently required conditions are: - 1. Only decodes - 2. Only flash-attn - 3. No LORA - 4. No prompt_adapter_config - """ - if not allow_gpu_advance_step: - return False - - # We allow multi-step GPU only in decode mode - for seq_group in execute_model_req.seq_group_metadata_list: - if seq_group.is_prompt: - return False - - # TODO: Add support for ASCEND when outer multi_step_worker - # could work correct. 
- if self.attn_backend.get_name() not in ("FLASH_ATTN", "TRITON_MLA"): - return False - - # TODO: Add support for LORA - if self.lora_config: - return False - - # TODO: Add soft-tuning prompt adapter support - return not self.prompt_adapter_config - - def set_indices_of_seq_with_bonus_tokens(self, - indices_of_seq_with_bonus_tokens): - self.indices_of_seq_with_bonus_tokens = indices_of_seq_with_bonus_tokens - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelRunnerInputBase, - kv_caches: List[torch.Tensor], - previous_hidden_states: Optional[torch.Tensor] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - **kwargs, - ) -> Optional[List[SamplerOutput]]: - """Executes num_steps forward passes with advacement of input tensors - on the GPU. Look at supports_gpu_multi_step(..) for pre-conditions. - - Optimizations used: - 1. Input tensors are updated on the GPU directly - 2. Skips GPU=>CPU serialization of sampler outputs (we don't need - them since we do batch expansion later that uses GPU outputs) - 3. Reuses sampling tensors (since we run only decodes and they have - a repeating sampling logic) - """ - - # When num_steps == 1, we execute the fallback here for the GPU - # advance_step, which runs prepare_inputs on CPU and for each spec - # iteration invokes this function only once - # (Look at multi-step-worker code) - is_fallback = num_steps == 1 - if not is_fallback: - # Since we do not broadcast data inside execute_model anymore, - # we need to figure out the best way to support TP > 1 in this - # case, because we will at least need to broadcast the sampled - # tokens to all workers. - if not self.is_driver_worker: - raise ValueError("TP1DraftModelRunner only supports TP=1.") - - # Sanity - if self.lora_config is not None: - raise ValueError("TP1DraftModelRunner has no support for LORA") - if self.prompt_adapter_config is not None: - raise ValueError("TP1DraftModelRunner has no support for " - "prompt_adapter_config") - if model_input.inputs_embeds is not None: - raise ValueError("TP1DraftModelRunner has no support for " - "inputs_embeds") - if model_input.multi_modal_kwargs: - raise ValueError( - "TP1DraftModelRunner has no support for multi_modal_kwargs" - ) - else: - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - if self.prompt_adapter_config: - assert model_input.prompt_adapter_requests is not None - assert model_input.prompt_adapter_mapping is not None - self.set_active_prompt_adapters( - model_input.prompt_adapter_requests, - model_input.prompt_adapter_mapping) - - self.attn_state.begin_forward(model_input) - - # Detect exec mode - assert model_input.attn_metadata is not None - if model_input.attn_metadata.num_prefills > 0: - # In this case, execute_model(..) was called directly - if num_steps > 1: - raise ValueError( - "execute_model(..) of draft_model_runner can be called " - "directly only with a single-step prefill") - else: - # We can skip CPU samples for spec token generation. - # (We do allow CPU samples for num_steps == 1 to support the - # fallback case, where supports_gpu_multi_step(..) 
does not pass) - model_input.sampling_metadata.skip_sampler_cpu_output = ( - not is_fallback) - - model_executable = self.model - hidden_states = previous_hidden_states - - outputs: List[SamplerOutput] = [] - for step in range(num_steps): - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - - model_execute_kwargs = {"previous_hidden_states": hidden_states} \ - if previous_hidden_states is not None else {} - - compute_logits_kwargs = {} - # Run model - if hasattr(self.model.config, "num_nextn_predict_layers"): - # for DeepSeek MTP only to use the corresponding layer for - # each step - spec_step_idx = kwargs.get("spec_step_idx", step) - model_execute_kwargs["spec_step_idx"] = spec_step_idx - compute_logits_kwargs["spec_step_idx"] = spec_step_idx - with set_forward_context(model_input.attn_metadata, - self.vllm_config): - - if model_input.attn_metadata is not None: - model_input.attn_metadata.input_positions = model_input.input_positions - - hidden_states = model_executable( - input_ids=model_input.input_tokens, - inputs_embeds=None, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device), - **model_execute_kwargs, - ) - - # Compute the logits. - logits = self.model.compute_logits(hidden_states, - model_input.sampling_metadata, - **compute_logits_kwargs) - if not self.is_driver_worker: - return [] - # Sample the next token. - assert self.model_runner.sampler is not None - output = self.model_runner.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - outputs.append(output) - - if self.return_hidden_states and is_fallback: - output.hidden_states = hidden_states - - if model_input.attn_metadata.num_prefills == 0 \ - and self.indices_of_seq_with_bonus_tokens is not None: - assert output.sampled_token_ids is not None - # output.sampled_token_ids should be of shape (num_seqs, 1) - nums_seqs, num_tokens_per_seq = output.sampled_token_ids.shape - assert num_tokens_per_seq == 1 - count = 0 - for i in range(nums_seqs): - bonus_seq_idx = self.indices_of_seq_with_bonus_tokens[ - count] - if i != bonus_seq_idx: - # The following might cause a cpu->gpu sync - # However, the performance impact is negligible as we - # benchmarked on H100. - output.sampled_token_ids[ - i, :] = model_input.input_tokens[bonus_seq_idx] - else: - count += 1 - - # Prepare inputs for the next step - if step != num_steps - 1: - model_input = self._gpu_advance_step(model_input, outputs[-1]) - - return outputs diff --git a/vllm_ascend/worker/model_runner.py b/vllm_ascend/worker/model_runner.py deleted file mode 100644 index 48c5d4b68f..0000000000 --- a/vllm_ascend/worker/model_runner.py +++ /dev/null @@ -1,1607 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# Adapted from vllm-project/vllm/vllm/worker/model_runner.py -# - -import dataclasses -import itertools -import weakref -from contextlib import contextmanager -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, - Type, TypeVar, Union) - -import numpy as np -import torch -import torch.nn as nn -import vllm.envs as envs -from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.attention.backends.utils import CommonAttentionState -from vllm.config import VllmConfig -from vllm.core.scheduler import SchedulerOutputs -from vllm.distributed import broadcast_tensor_dict, get_dp_group, get_pp_group -from vllm.distributed.kv_transfer import get_kv_transfer_group -from vllm.forward_context import set_forward_context -from vllm.inputs import INPUT_REGISTRY, InputRegistry -from vllm.logger import logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager -from vllm.model_executor import SamplingMetadata, SamplingMetadataCache -from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, - get_sampler) -from vllm.model_executor.model_loader import get_model -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.model_executor.models import supports_lora, supports_multimodal -from vllm.model_executor.models.utils import set_cpu_offload_max_bytes -from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalKwargs, MultiModalPlaceholderMap, - MultiModalRegistry) -from vllm.prompt_adapter.layers import PromptAdapterMapping -from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import SamplingParams -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, flatten_2d_lists, - is_pin_memory_available) -from vllm.worker.model_runner_base import ( - ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, - _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict, - _init_attn_metadata_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -from vllm_ascend.ascend_config import get_ascend_config - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -TModelInputForNPU = TypeVar('TModelInputForNPU', bound="ModelInputForNPU") -ENCODER_NUM = 0 -# if True, allow tensor initialization and casting with internal format (e.g., NZ) -torch.npu.config.allow_internal_format = True - - -@dataclass(frozen=True) -class ModelInputForNPU(ModelRunnerInputBase): - """ - This base class contains metadata needed for the base model forward pass - but not metadata for possible additional steps, e.g., sampling. Model - runners that run additional steps should subclass this method to add - additional fields. 
- """ - input_tokens: Optional[torch.Tensor] = None - inputs_embeds: Optional[torch.Tensor] = None - input_positions: Optional[torch.Tensor] = None - token_types: Optional[torch.Tensor] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None - lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[Set[LoRARequest]] = None - attn_metadata: Optional["AttentionMetadata"] = None - multi_modal_kwargs: Optional[BatchedTensorInputs] = None - request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None - finished_requests_ids: Optional[List[str]] = None - virtual_engine: int = 0 - async_callback: Optional[Callable] = None - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None - scheduler_outputs: Optional[SchedulerOutputs] = None - previous_hidden_states: Optional[torch.Tensor] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "inputs_embeds": self.inputs_embeds, - "input_positions": self.input_positions, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - "multi_modal_kwargs": self.multi_modal_kwargs, - "virtual_engine": self.virtual_engine, - "request_ids_to_seq_ids": self.request_ids_to_seq_ids, - "finished_requests_ids": self.finished_requests_ids, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type[TModelInputForNPU], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> TModelInputForNPU: - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - # Exclude `async_callback` to be able to pickle this object - def __getstate__(self): - state = self.__dict__.copy() - del state["async_callback"] - return state - - # TODO: What happens when we depickle this object? - # How can we update this callback to properly pass it to the engine? - def __setstate__(self, state): - self.__dict__.update(state) - self.__dict__.update({'async_callback': None}) - - -@dataclass(frozen=True) -class ModelInputForNPUWithSamplingMetadata(ModelInputForNPU): - """ - Used by the ModelRunner. - """ - sampling_metadata: Optional["SamplingMetadata"] = None - # Used for speculative decoding. We do not broadcast it because it is only - # used by the driver worker. 
- is_prompt: Optional[bool] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "inputs_embeds": self.inputs_embeds, - "input_positions": self.input_positions, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - "multi_modal_kwargs": self.multi_modal_kwargs, - "virtual_engine": self.virtual_engine, - "request_ids_to_seq_ids": self.request_ids_to_seq_ids, - "finished_requests_ids": self.finished_requests_ids, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForNPUWithSamplingMetadata": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]): - """Build ModelInputForNPU from SequenceGroupMetadata.""" - - # Note: ideally we would be using a dataclass(kw_only=True) - # here, so that this can be subclassed easily, - # but kw_only is not supported in python<3.10. - class InterDataForSeqGroup: - """Intermediate data for the current sequence group.""" - - def simple_reinit(self): - self.input_tokens[0].clear() # type: ignore - self.inputs_embeds = None # type: ignore - self.input_positions[0].clear() # type: ignore - self.token_types[0].clear() # type: ignore - self.mrope_input_positions = None # type: ignore - self.seq_lens[0] = 0 # type: ignore - self.orig_seq_lens[0] = 0 # type: ignore - self.query_lens[0] = 0 # type: ignore - self.context_lens[0] = 0 # type: ignore - self.curr_sliding_window_blocks[0] = 0 # type: ignore - self.lora_index_mapping.clear() # type: ignore - self.lora_prompt_mapping.clear() # type: ignore - self.lora_requests.clear() # type: ignore - - def __init__( - self, - *, - # From sequence group metadata. - request_id: str, - seq_ids: List[int], - is_prompt: bool, - block_tables: Optional[Dict[int, List[int]]], - computed_block_nums: List[int], - n_seqs: int = 0, - - # Input tokens and positions. - input_tokens: Optional[List[List[int]]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - input_positions: Optional[List[List[int]]] = None, - token_types: Optional[List[List[int]]] = None, - mrope_input_positions: Optional[List[List[List[int]]]] = None, - - # The sequence length (may be capped to the sliding window). - seq_lens: Optional[List[int]] = None, - # The original sequence length (before applying sliding window). - # This is used to compute slot mapping. - orig_seq_lens: Optional[List[int]] = None, - # The query length. - query_lens: Optional[List[int]] = None, - # The number of tokens that are already computed. - context_lens: Optional[List[int]] = None, - # The current sliding window block. - curr_sliding_window_blocks: Optional[List[int]] = None, - - # LoRA inputs. - lora_index_mapping: Optional[List[List[int]]] = None, - lora_prompt_mapping: Optional[List[List[int]]] = None, - lora_requests: Optional[Set[LoRARequest]] = None, - - # Multi-modal inputs. 
- multi_modal_kwargs: Optional[MultiModalKwargs] = None, - multi_modal_placeholder_maps: Optional[Dict[ - str, MultiModalPlaceholderMap]] = None, - - # Whether the prefix cache is hit (prefill only). - prefix_cache_hit: bool = False, - reinit: bool = False, - reinit_use_defaults: bool = False, - encoder_seq_len: int = 0, - ): - if reinit: - assert len(self.seq_ids) == len(seq_ids) # type: ignore - for i, seq_id in enumerate(seq_ids): - self.seq_ids[i] = seq_id # type: ignore - else: - self.seq_ids = seq_ids - - self.request_id = request_id - self.is_prompt = is_prompt - self.block_tables = block_tables - self.computed_block_nums = computed_block_nums - self.n_seqs = n_seqs - self.encoder_seq_len = encoder_seq_len - - if reinit: - if len(self.seq_ids) == 1 and reinit_use_defaults: - self.simple_reinit() - else: - if input_tokens: - self.input_tokens = input_tokens - else: - for seq_id in range(len(self.seq_ids)): - self.input_tokens[seq_id].clear() - self.inputs_embeds = inputs_embeds - - if input_positions: - self.input_positions = input_positions - else: - for seq_id in range(len(self.seq_ids)): - self.input_positions[seq_id].clear() - - if token_types: - self.token_types = token_types - else: - for seq_id in range(len(self.seq_ids)): - self.token_types[seq_id].clear() - - self.mrope_input_positions = None - - if seq_lens: - self.seq_lens = seq_lens - else: - for seq_id in range(len(self.seq_ids)): - self.seq_lens[seq_id] = 0 - - if orig_seq_lens: - self.orig_seq_lens = orig_seq_lens - else: - for seq_id in range(len(self.seq_ids)): - self.orig_seq_lens[seq_id] = 0 - - if query_lens: - self.query_lens = query_lens - else: - for seq_id in range(len(self.seq_ids)): - self.query_lens[seq_id] = 0 - - if context_lens: - self.context_lens = context_lens - else: - for seq_id in range(len(self.seq_ids)): - self.context_lens[seq_id] = 0 - - if curr_sliding_window_blocks: - self.curr_sliding_window_blocks = \ - curr_sliding_window_blocks - else: - for seq_id in range(len(self.seq_ids)): - self.curr_sliding_window_blocks[seq_id] = 0 - - if lora_index_mapping: - self.lora_index_mapping = lora_index_mapping - else: - self.lora_index_mapping.clear() - if lora_prompt_mapping: - self.lora_prompt_mapping = lora_prompt_mapping - else: - self.lora_prompt_mapping.clear() - if lora_requests: - self.lora_requests = lora_requests - else: - self.lora_requests.clear() - - else: - self.input_tokens = input_tokens or [] - self.inputs_embeds = inputs_embeds - self.input_positions = input_positions or [] - self.token_types = token_types or [] - self.mrope_input_positions = mrope_input_positions or None - self.seq_lens = seq_lens or [] - self.orig_seq_lens = orig_seq_lens or [] - self.query_lens = query_lens or [] - self.context_lens = context_lens or [] - self.curr_sliding_window_blocks = \ - curr_sliding_window_blocks or [] - - self.lora_index_mapping = lora_index_mapping or [] - self.lora_prompt_mapping = lora_prompt_mapping or [] - self.lora_requests = lora_requests or set() - - self.multi_modal_kwargs = multi_modal_kwargs - self.multi_modal_placeholder_maps = multi_modal_placeholder_maps - self.prefix_cache_hit = prefix_cache_hit - - self.n_seqs = len(self.seq_ids) - - if not reinit: - self.__post_init__() - - def __post_init__(self): - self.n_seqs = len(self.seq_ids) - - self.input_tokens = [[] for _ in range(self.n_seqs)] - self.input_positions = [[] for _ in range(self.n_seqs)] - self.token_types = [[] for _ in range(self.n_seqs)] - self.mrope_input_positions = None - self.seq_lens = [0] * self.n_seqs - 
self.orig_seq_lens = [0] * self.n_seqs - self.query_lens = [0] * self.n_seqs - self.context_lens = [0] * self.n_seqs - self.curr_sliding_window_blocks = [0] * self.n_seqs - - self.lora_index_mapping = [] - self.lora_prompt_mapping = [] - - def __repr__(self) -> str: - return (f"InterDataForSeqGroup(" - f"request_id={self.request_id}, " - f"seq_ids={self.seq_ids}, " - f"is_prompt={self.is_prompt}, " - f"block_tables={self.block_tables}, " - f"computed_block_nums={self.computed_block_nums}, " - f"n_seqs={self.n_seqs}, " - f"input_tokens={self.input_tokens}, " - f"inputs_embeds.shape=" - f"{getattr(self.inputs_embeds, 'shape', None)}, " - f"input_positions={self.input_positions}, " - f"token_types={self.token_types}, " - f"mrope_input_positions={self.mrope_input_positions}, " - f"seq_lens={self.seq_lens}, " - f"orig_seq_lens={self.orig_seq_lens}, " - f"query_lens={self.query_lens}, " - f"context_lens={self.context_lens}, " - f"multi_modal_kwargs={self.multi_modal_kwargs}") - - def __init__(self, - runner, - finished_requests_ids: Optional[List[str]] = None): - super().__init__() - # Compute functions for each sequence in a sequence group. - # WARNING: The order of the functions matters! - self.per_seq_compute_fns = [ - self._compute_lens, - self._compute_for_prefix_cache_hit, - self._compute_for_sliding_window, - self._compute_lora_input, - ] - # Compute functions for each sequence group. - # WARNING: The order of the functions matters! - self.per_seq_group_compute_fns = [ - self._compute_multi_modal_input, - ] - - self.runner = runner - self.model_input_cls = self.runner._model_input_cls - self.attn_backend = self.runner.attn_backend - self.scheduler_config = self.runner.scheduler_config - self.sliding_window = self.runner.sliding_window - self.block_size = self.runner.block_size - self.enable_lora = self.runner.lora_config is not None - self.finished_requests_ids = finished_requests_ids - self.decode_only = True - self.is_encoder_decoder = self.runner.model_config.is_encoder_decoder - - # Attention metadata inputs. - self.attn_metadata_builder = self.attn_backend.make_metadata_builder( - weakref.proxy(self)) - - # Engine/Model configurations. - self.chunked_prefill_enabled = ( - self.scheduler_config is not None - and self.scheduler_config.chunked_prefill_enabled) - if self.sliding_window is not None: - self.sliding_window_blocks = ( - self.sliding_window + self.block_size - 1) // self.block_size - self.block_aligned_sliding_window = \ - self.sliding_window_blocks * self.block_size - - def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - self.finished_requests_ids = finished_requests_ids - - # if the current batch is decode-only. - # will be set to False if there is any non-decode request. - self.decode_only = True - - # Intermediate data (data in CPU before going to NPU) for - # the current sequence group. 
- self.inter_data_list: List[ - ModelInputForNPUBuilder.InterDataForSeqGroup] = [] - - self.attn_metadata_builder.prepare() - - def gen_inter_data_builder(self, num_seqs: int): - return lambda: ModelInputForNPUBuilder.InterDataForSeqGroup( - request_id="", - seq_ids=[0] * num_seqs, - is_prompt=True, - block_tables=None, - computed_block_nums=[]) - - def init_cached_inter_data(self, *args, **kwargs): - assert len(args) == 0 - assert "seq_ids" in kwargs - seq_ids = kwargs["seq_ids"] - num_seqs = len(seq_ids) - - # The inter-data cache is per model_runner - inter_data_cache = self.runner.inter_data_cache - if num_seqs not in inter_data_cache: - inter_data_cache[num_seqs] = PyObjectCache( - self.gen_inter_data_builder(num_seqs)) - - obj = inter_data_cache[num_seqs].get_object() - obj.__init__(*args, **kwargs) - return obj - - def reset_cached_inter_data(self): - for cache in self.runner.inter_data_cache.values(): - cache.reset() - - def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): - """Add a sequence group to the builder.""" - seq_ids = seq_group_metadata.seq_data.keys() - n_seqs = len(seq_ids) - is_prompt = seq_group_metadata.is_prompt - - if is_prompt: - assert n_seqs == 1 - self.decode_only = False - - encoder_seq_len = 0 - - if self.is_encoder_decoder: - encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len() - - inter_data = self.init_cached_inter_data( - request_id=seq_group_metadata.request_id, - seq_ids=seq_ids, - is_prompt=is_prompt, - block_tables=seq_group_metadata.block_tables, - computed_block_nums=seq_group_metadata.computed_block_nums, - reinit=True, - reinit_use_defaults=True, - encoder_seq_len=encoder_seq_len) - - self.inter_data_list.append(inter_data) - - for seq_idx in range(n_seqs): - for per_seq_fn in self.per_seq_compute_fns: - per_seq_fn(inter_data, seq_idx, seq_group_metadata) - for per_seq_group_fn in self.per_seq_group_compute_fns: - per_seq_group_fn(inter_data, seq_group_metadata) - - def build(self) -> ModelInputForNPU: - """Finalize the builder intermediate data and - create on-device tensors. - """ - # Combine and flatten intermediate data. - input_tokens = list[int]() - inputs_embeds_list = list[torch.Tensor]() - token_types = list[int]() - for inter_data in self.inter_data_list: - for cur_input_tokens in inter_data.input_tokens: - input_tokens.extend(cur_input_tokens) - for cur_token_types in inter_data.token_types: - token_types.extend(cur_token_types) - if inter_data.inputs_embeds is not None: - inputs_embeds_list.append( - inter_data.inputs_embeds.to( - dtype=self.runner.model_config.dtype, - device=self.runner.device)) - - inputs_embeds: Optional[torch.Tensor] - if len(inputs_embeds_list) == 0: - inputs_embeds = None - else: - inputs_embeds = torch.cat(inputs_embeds_list, dim=0).to( - dtype=self.runner.model_config.dtype, - device=self.runner.device) - assert len(inputs_embeds) == len(input_tokens) - - if not input_tokens and inputs_embeds is None: - # This may happen when all prefill requests hit - # prefix caching and there is no decode request. 
- return self.model_input_cls() - - mrope_input_positions: Optional[List[List[int]]] = None - if any(inter_data.mrope_input_positions is not None - for inter_data in self.inter_data_list): - mrope_input_positions = [[] for _ in range(3)] - - for idx in range(3): - for inter_data in self.inter_data_list: - msections = inter_data.mrope_input_positions - if msections is None: - for _seq_input_positions in inter_data.input_positions: - mrope_input_positions[idx].extend( - _seq_input_positions) - else: - for _seq_mrope_input_positions in msections: - mrope_input_positions[idx].extend( - _seq_mrope_input_positions[idx]) - input_positions = None - else: - input_positions = [ - flatten_2d_lists(inter_data.input_positions) - for inter_data in self.inter_data_list - ] - - seq_lens = [] - max_decode_seq_len = 0 - is_prompt = self.inter_data_list[0].is_prompt - for inter_data in self.inter_data_list: - seq_lens.extend(inter_data.seq_lens) - if not inter_data.is_prompt: - max_decode_seq_len = max(max_decode_seq_len, - max(inter_data.seq_lens)) - query_lens = flatten_2d_lists( - [inter_data.query_lens for inter_data in self.inter_data_list]) - # Mapping from request IDs to sequence IDs. Used for Jamba models - # that manages the cache by itself. - request_ids_to_seq_ids = { - data.request_id: data.seq_ids - for data in self.inter_data_list - } - - # Add graph_pad_size here - if self.runner.torchair_graph_enabled: - graph_pad_size = self.runner.scheduler_config.max_num_seqs - len( - seq_lens) - else: - graph_pad_size = -1 - - if input_positions: - input_positions = flatten_2d_lists(input_positions) - if graph_pad_size != -1 and not is_prompt: - input_tokens.extend(itertools.repeat(0, graph_pad_size)) - input_positions.extend( # type: ignore - itertools.repeat(0, graph_pad_size)) - seq_lens.extend(itertools.repeat(1, graph_pad_size)) - query_lens.extend(itertools.repeat(1, graph_pad_size)) - input_tokens_tensor = torch.tensor(input_tokens, - dtype=torch.long, - device=self.runner.device) - token_types_tensor = torch.tensor(token_types, - dtype=torch.long, - device=self.runner.device) \ - if token_types else None - if mrope_input_positions is not None: - input_positions_tensor = torch.tensor(mrope_input_positions, - dtype=torch.long, - device=self.runner.device) - else: - input_positions_tensor = torch.tensor(input_positions, - dtype=torch.long, - device=self.runner.device) - #print(f"after tensor input_tokens_tensor: {input_tokens_tensor}") - #print(f"after tensor input_positions_tensor: {input_positions_tensor}") - #print(f"after list seq_lens: {seq_lens}") - - # Attention metadata. - attn_metadata = self.attn_metadata_builder.build( - seq_lens, query_lens, graph_pad_size) - - # LoRA data. - lora_requests = set() - lora_mapping = None - if self.enable_lora: - lora_requests = set(r for data in self.inter_data_list - for r in data.lora_requests) - lora_index_mapping = flatten_2d_lists([ - flatten_2d_lists(inter_data.lora_index_mapping) - for inter_data in self.inter_data_list - ]) - lora_prompt_mapping = flatten_2d_lists([ - flatten_2d_lists(inter_data.lora_prompt_mapping) - for inter_data in self.inter_data_list - ]) - lora_mapping = LoRAMapping( - **dict(index_mapping=lora_index_mapping, - prompt_mapping=lora_prompt_mapping, - is_prefill=not self.decode_only)) - - # Multi-modal data. 
-        multi_modal_kwargs_list = [
-            data.multi_modal_kwargs for data in self.inter_data_list
-            if data.multi_modal_kwargs is not None
-        ]
-        multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
-
-        if self.runner.torchair_graph_enabled:
-            torch._dynamo.mark_static(input_tokens_tensor)
-            torch._dynamo.mark_static(input_positions_tensor)
-            torch._dynamo.mark_static(attn_metadata.block_tables)
-            torch._dynamo.mark_static(attn_metadata.slot_mapping)
-
-        return self.model_input_cls(
-            input_tokens=input_tokens_tensor,
-            inputs_embeds=inputs_embeds,
-            token_types=token_types_tensor,
-            input_positions=input_positions_tensor,
-            attn_metadata=attn_metadata,
-            seq_lens=seq_lens,
-            query_lens=query_lens,
-            lora_mapping=lora_mapping,
-            lora_requests=lora_requests,
-            multi_modal_kwargs=multi_modal_kwargs,
-            request_ids_to_seq_ids=request_ids_to_seq_ids,
-            finished_requests_ids=self.finished_requests_ids)
-
-    def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int,
-                      seq_group_metadata: SequenceGroupMetadata):
-        """Compute the context length, sequence length and tokens
-        for the given sequence data.
-        """
-        seq_data = seq_group_metadata.seq_data[inter_data.seq_ids[seq_idx]]
-        token_chunk_size = seq_group_metadata.token_chunk_size
-
-        # Compute the context length (the number of tokens that are
-        # already computed) and the sequence length (total number of tokens).
-
-        seq_len = seq_data.get_len()
-        if inter_data.is_prompt:
-            context_len = seq_data.get_num_computed_tokens()
-            seq_len = min(seq_len, context_len + token_chunk_size)
-        elif self.runner.scheduler_config.is_multi_step or \
-            self.is_encoder_decoder:
-            context_len = seq_len - 1
-        else:
-            context_len = seq_data.get_num_computed_tokens()
-
-        # Compute tokens.
-        # Fixme: this is for version compatibility; remove it once vllm v0.8.5 is no longer supported.
-        if not hasattr(seq_data,
-                       "prompt_embeds") or seq_data.prompt_embeds is None:
-            tokens = seq_data.get_token_ids()[context_len:seq_len]
-            prompt_embeds = None
-        else:
-            tokens = [0] * (seq_len - context_len)
-            prompt_embeds = seq_data.get_token_embeddings(
-            )[context_len:seq_len]
-
-        token_types = seq_group_metadata.token_type_ids
-
-        inter_data.seq_lens[seq_idx] = seq_len
-        inter_data.orig_seq_lens[seq_idx] = seq_len
-        inter_data.context_lens[seq_idx] = context_len
-        inter_data.input_tokens[seq_idx].extend(tokens)
-        inter_data.inputs_embeds = prompt_embeds
-        inter_data.input_positions[seq_idx].extend(range(context_len, seq_len))
-        inter_data.token_types[seq_idx].extend(
-            token_types if token_types else [])
-        inter_data.query_lens[seq_idx] = seq_len - context_len
-
-        if seq_data.mrope_position_delta is not None:
-            if inter_data.mrope_input_positions is None:
-                inter_data.mrope_input_positions = [None] * inter_data.n_seqs
-
-            inter_data.mrope_input_positions[
-                seq_idx] = MRotaryEmbedding.get_next_input_positions(
-                    seq_data.mrope_position_delta,
-                    context_len,
-                    seq_len,
-                )
-
-    def _compute_for_prefix_cache_hit(
-            self, inter_data: InterDataForSeqGroup, seq_idx: int,
-            seq_group_metadata: SequenceGroupMetadata):
-        """Check whether the prefix cache is hit (i.e., some blocks are
-        already computed). If so, update the input tokens and positions
-        to compute only the remaining blocks.
-        """
-        computed_block_nums = inter_data.computed_block_nums
-
-        # Note that prefix caching does not support sliding window.
-        prefix_cache_hit = (computed_block_nums is not None
-                            and len(computed_block_nums) > 0
-                            and self.sliding_window is None
-                            and inter_data.is_prompt)
-        inter_data.prefix_cache_hit = prefix_cache_hit
-
-        if not prefix_cache_hit:
-            return
-
-        assert computed_block_nums is not None
-        # The number of prompt tokens in this sequence that hit the prefix
-        # cache. Note that this may be larger than the sequence length if
-        # chunked prefill is enabled.
-        prefix_cache_len = len(computed_block_nums) * self.block_size
-
-        # The total number of prompt tokens in this sequence.
-        # When chunked prefill is enabled, this is the number of tokens in
-        # the computed chunks plus the current chunk.
-        seq_len = inter_data.seq_lens[seq_idx]
-
-        # On a full hit, compute the last block rather than the last token,
-        # due to the requirements of the prefix operator.
-        if seq_len <= prefix_cache_len:
-            prefix_cache_len -= self.block_size
-
-        seq_group_metadata.seq_data[inter_data.seq_ids[
-            seq_idx]].update_num_cached_tokens(prefix_cache_len)
-
-        # The number of prompt tokens computed so far in this sequence.
-        context_len = inter_data.context_lens[seq_idx]
-
-        if prefix_cache_len <= context_len:
-            # We already passed the cache hit region,
-            # so do normal computation.
-            pass
-        elif context_len < prefix_cache_len < seq_len:
-            # Partial hit. Compute the missing part.
-            uncomputed_start = prefix_cache_len - context_len
-            inter_data.input_tokens[seq_idx] = inter_data.input_tokens[
-                seq_idx][uncomputed_start:]
-            inter_data.input_positions[seq_idx] = inter_data.input_positions[
-                seq_idx][uncomputed_start:]
-            inter_data.token_types[seq_idx] = inter_data.token_types[seq_idx][
-                uncomputed_start:]
-            context_len = prefix_cache_len
-
-            inter_data.context_lens[seq_idx] = context_len
-            inter_data.query_lens[
-                seq_idx] = inter_data.seq_lens[seq_idx] - context_len
-        elif seq_len <= prefix_cache_len:
-            # Full hit. Only compute the last token to avoid
-            # erroneous behavior. FIXME: Ideally we should directly
-            # mark all tokens as computed in the scheduler and do not
-            # schedule this sequence, so this case should not happen.
-            inter_data.input_tokens[seq_idx] = inter_data.input_tokens[
-                seq_idx][-1:]
-            inter_data.input_positions[seq_idx] = inter_data.input_positions[
-                seq_idx][-1:]
-            inter_data.token_types[seq_idx] = inter_data.token_types[seq_idx][
-                -1:]
-            inter_data.query_lens[seq_idx] = 1
-            inter_data.context_lens[seq_idx] = inter_data.seq_lens[seq_idx] - 1
-
-    def _compute_for_sliding_window(self, inter_data: InterDataForSeqGroup,
-                                    seq_idx: int,
-                                    seq_group_metadata: SequenceGroupMetadata):
-        """Update seq_len and curr_sliding_window_block for the given
-        sequence data (only required for decoding) if sliding window is
-        enabled.
-        """
-        curr_sliding_window_block = 0
-        sliding_seq_len = inter_data.seq_lens[seq_idx]
-        if not inter_data.is_prompt and self.sliding_window is not None:
-            # TODO(sang): This is a hack to make sliding window work with
-            # paged attn. We can remove it once the paged attn kernel
-            # properly handles sliding window attn.
- curr_sliding_window_block = self.sliding_window_blocks - # number of elements in last block - suff_len = inter_data.seq_lens[seq_idx] % self.block_size - sliding_seq_len = min(inter_data.seq_lens[seq_idx], - self.block_aligned_sliding_window + suff_len) - if suff_len > 0: - curr_sliding_window_block += 1 - - inter_data.curr_sliding_window_blocks[ - seq_idx] = curr_sliding_window_block - inter_data.seq_lens[seq_idx] = sliding_seq_len - - def _compute_lora_input(self, inter_data: InterDataForSeqGroup, - seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """If LoRA is enabled, compute LoRA index and prompt mapping.""" - if not self.enable_lora: - return - lora_id = seq_group_metadata.lora_int_id - if lora_id > 0: - inter_data.lora_requests.add(seq_group_metadata.lora_request) - query_len = inter_data.query_lens[seq_idx] - inter_data.lora_index_mapping.append([lora_id] * query_len) - sampling_params = seq_group_metadata.sampling_params - if sampling_params and sampling_params.prompt_logprobs is not None: - inter_data.lora_prompt_mapping.append([lora_id] * query_len) - elif not self.chunked_prefill_enabled or seq_group_metadata.do_sample: - inter_data.lora_prompt_mapping.append([lora_id]) - else: - inter_data.lora_prompt_mapping.append([]) - - def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, - seq_group_metadata: SequenceGroupMetadata): - """If multi-modal data is given, add it to the input.""" - # NOTE: mm_kwargs only includes the subset of multi-modal items that - # intersect with the current prefill positions. - positions = inter_data.input_positions[0] - mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( - seq_group_metadata, - range(positions[0], positions[0] + len(positions))) - if not mm_kwargs: - return - - inter_data.multi_modal_kwargs = mm_kwargs - inter_data.multi_modal_placeholder_maps = placeholder_maps - - # special processing for mrope position deltas. - if self.runner.model_config.uses_mrope: - image_grid_thw = mm_kwargs.get("image_grid_thw", None) - video_grid_thw = mm_kwargs.get("video_grid_thw", None) - assert image_grid_thw is not None or video_grid_thw is not None, ( - "mrope embedding type requires multi-modal input mapper " - "returns 'image_grid_thw' or 'video_grid_thw'.") - second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) - - hf_config = self.runner.model_config.hf_config - - inter_data.mrope_input_positions = [None] * inter_data.n_seqs - for seq_idx in range(inter_data.n_seqs): - seq_data = seq_group_metadata.seq_data[ - inter_data.seq_ids[seq_idx]] - token_ids = seq_data.get_token_ids() - - mrope_input_positions, mrope_position_delta = \ - MRotaryEmbedding.get_input_positions( - token_ids, - hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=inter_data.context_lens[seq_idx], - seq_len=inter_data.seq_lens[seq_idx], - ) - - seq_data.mrope_position_delta = mrope_position_delta - inter_data.mrope_input_positions[ - seq_idx] = mrope_input_positions - - -class NPUModelRunnerBase(ModelRunnerBase[TModelInputForNPU]): - """ - Helper class for shared methods between NPU model runners. 
- """ - _model_input_cls: Type[TModelInputForNPU] - _builder_cls: Type[ModelInputForNPUBuilder] - - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - return_hidden_states: bool = False, - input_registry: InputRegistry = INPUT_REGISTRY, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - ): - - ModelRunnerBase.__init__(self, vllm_config) - model_config = self.model_config - cache_config = self.cache_config - - self.is_driver_worker = is_driver_worker - self.return_hidden_states = return_hidden_states - - self.device = self.device_config.device - self.pin_memory = is_pin_memory_available() - - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = model_config.get_sliding_window() - self.block_size = cache_config.block_size - self.max_seq_len_to_capture = self.model_config.max_seq_len_to_capture - self.max_batchsize_to_capture = \ - self.vllm_config.compilation_config.max_capture_size - - ascend_config = get_ascend_config() - self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - self.use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph - - self.has_inner_state = model_config.has_inner_state - - self.in_profile_run = False - - self.graph_block_tables = np.zeros( - (self.vllm_config.scheduler_config.max_num_seqs, - (model_config.max_model_len + self.block_size - 1) // - self.block_size), - dtype=np.int32) - - # Attention-free but stateful models like Mamba need a placeholder attn - # backend, as the attention metadata is needed to manage internal state. - # However we must bypass attention selection altogether for some models - # used for speculative decoding to avoid a divide-by-zero in - # model_config.get_head_size() - num_attn_heads = self.model_config.get_num_attention_heads( - self.parallel_config) - needs_attn_backend = (num_attn_heads != 0 - or self.model_config.is_attention_free) - - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - ) if needs_attn_backend else None - if self.attn_backend: - self.attn_state = self.attn_backend.get_state_cls()( - weakref.proxy(self)) - else: - self.attn_state = CommonAttentionState(weakref.proxy(self)) - - # Multi-modal data support - self.input_registry = input_registry - self.mm_registry = mm_registry - - # Lazy initialization - self.model: nn.Module # Set after load_model - # Set after load_model. - self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None - - set_cpu_offload_max_bytes( - int(self.cache_config.cpu_offload_gb * 1024**3)) - - # Used to cache python objects - self.inter_data_cache: Dict[int, PyObjectCache] = {} - - # Using the PythonizationCache in Pipeline-Parallel clobbers the - # SequenceGroupToSample object. In Pipeline-Parallel, we have - # more than 1 Scheduler, resulting in a potential back-to-back - # prepare_model_inputs() call. This clobbers the cached - # SequenceGroupToSample objects, as we reset the cache during - # every prepare_model_inputs() call. 
-        self.sampling_metadata_cache: SamplingMetadataCache = \
-              SamplingMetadataCache() \
-                if self.parallel_config.pipeline_parallel_size == 1 else None
-        self.sampler = get_sampler()
-
-    def get_model(self) -> nn.Module:
-        return self.model
-
-    def load_model(self) -> None:
-        logger.info("Starting to load model %s...", self.model_config.model)
-        with DeviceMemoryProfiler() as m:
-            self.model = get_model(vllm_config=self.vllm_config)
-
-        self.model_memory_usage = m.consumed_memory
-        logger.info("Loading model weights took %.4f GB",
-                    self.model_memory_usage / float(2**30))
-
-        if self.lora_config:
-            assert supports_lora(
-                self.model
-            ), f"{self.model.__class__.__name__} does not support LoRA yet."
-            if supports_multimodal(self.model):
-                logger.warning("Regarding multimodal models, vLLM currently "
-                               "only supports adding LoRA to language model.")
-            # It's necessary to distinguish between the max_position_embeddings
-            # of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = self.model.config.max_position_embeddings
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
-            self.lora_manager = LRUCacheWorkerLoRAManager(
-                self.scheduler_config.max_num_seqs,
-                self.scheduler_config.max_num_batched_tokens,
-                self.vocab_size,
-                self.lora_config,
-                self.device,
-                self.model.embedding_modules,
-                self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
-            )
-            self.model = self.lora_manager.create_lora_manager(self.model)
-
-        # Adapt torch.compile to the NPU backend.
-        if self.torchair_graph_enabled:
-            import torchair  # type: ignore
-            from torchair import patch_for_hcom  # type: ignore
-
-            # Capture communication ops into the graph.
-            patch_for_hcom()
-            # Configure the NPU compiler. If no config is set, the defaults
-            # can be used by passing npu_backend="npu".
-            config = torchair.CompilerConfig()
-            config.experimental_config.frozen_parameter = True
-            config.experimental_config.tiling_schedule_optimize = True
-            config.experimental_config.enable_view_optimize = \
-                get_ascend_config().torchair_graph_config.enable_view_optimize
-            torch.npu.set_compile_mode(jit_compile=False)
-            if not self.use_cached_npu_graph:
-                npu_backend = torchair.get_npu_backend(compiler_config=config)
-                self.compile_model = torch.compile(
-                    self.model,
-                    dynamic=True,
-                    fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                    backend=npu_backend)
-            else:
-                self.compile_model = torchair.inference.cache_compile(
-                    self.model.forward,
-                    dynamic=True,
-                    fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                    config=config,
-                    ge_cache=False)
-
-    def save_sharded_state(
-        self,
-        path: str,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
-    ) -> None:
-
-        from vllm.model_executor.model_loader import ShardedStateLoader
-        ShardedStateLoader.save_model(
-            self.model,
-            path,
-            pattern=pattern,
-            max_size=max_size,
-        )
-
-    def save_tensorized_model(
-        self,
-        tensorizer_config: TensorizerConfig,
-    ) -> None:
-
-        from vllm.model_executor.model_loader import \
-            TensorizerLoader  # type: ignore # noqa
-        TensorizerLoader.save_model(
-            self.model,
-            tensorizer_config=tensorizer_config,
-        )
-
-    def get_max_block_per_batch(self) -> int:
-        block_size = self.block_size
-        return (self.max_seq_len_to_capture + block_size - 1) // block_size
-
-    def _prepare_model_input_tensors(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        finished_requests_ids: Optional[List[str]] = None
-    ) -> TModelInputForNPU:
-        """Helper method to prepare the model input based on a given sequence
-        group.
Prepares metadata needed for the base model forward pass but not - metadata for possible additional steps, e.g., sampling. - - The API assumes seq_group_metadata_list is sorted by prefill -> decode. - - The result tensors and data structure also batches input in prefill - -> decode order. For example, - - - input_tokens[:num_prefill_tokens] contains prefill tokens. - - input_tokens[num_prefill_tokens:] contains decode tokens. - """ - builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) - builder.prepare(finished_requests_ids) - for seq_group_metadata in seq_group_metadata_list: - builder.add_seq_group(seq_group_metadata) - - builder.reset_cached_inter_data() - - return builder.build() # type: ignore - - @contextmanager - def set_in_profile_run(self): - self.in_profile_run = True - try: - yield - finally: - self.in_profile_run = False - - @torch.inference_mode() - def profile_run(self) -> None: - with self.set_in_profile_run(): - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = \ - SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = \ - self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - - # Profile memory usage with max_num_sequences sequences and the - # total number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] - # Additional GPU memory may be needed for multi-modal encoding, - # which needs to be accounted for when calculating the GPU blocks - # for vLLM blocker manager. - # To exercise the worst scenario for GPU memory consumption, - # the number of seqs (batch_size) is chosen to maximize the number - # of images processed. - - max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( - self.model_config) - if max_mm_tokens > 0: - max_num_seqs_orig = max_num_seqs - max_num_seqs = min(max_num_seqs, - max_num_batched_tokens // max_mm_tokens) - if max_num_seqs < 1: - expr = (f"min({max_num_seqs_orig}, " - f"{max_num_batched_tokens} // {max_mm_tokens})") - logger.warning( - "Computed max_num_seqs (%s) to be less than 1. " - "Setting it to the minimum value of 1.", expr) - max_num_seqs = 1 - - batch_size = 0 - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - batch_size += seq_len - - dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry) - - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: dummy_data.seq_data}, - sampling_params=sampling_params, - block_tables=None, - lora_request=None, - multi_modal_data=dummy_data.multi_modal_data, - multi_modal_placeholders=dummy_data. - multi_modal_placeholders, - ) - seqs.append(seq) - - # Run the model with the dummy inputs. - num_layers = self.model_config.get_num_layers(self.parallel_config) - # use an empty tensor instead of `None`` to force Dynamo to pass - # it by reference, rather by specializing on the value ``None``. - # the `dtype` argument does not matter, and we use `float32` as - # a placeholder (it has wide hardware support). - # it is important to create tensors inside the loop, rather than - # multiplying the list, to avoid Dynamo from treating them as - # tensor aliasing. 
- kv_caches = [ - torch.tensor([], dtype=torch.float32, device=self.device) - for _ in range(num_layers) - ] - finished_requests_ids = [seq.request_id for seq in seqs] - model_input = self.prepare_model_input( - seqs, finished_requests_ids=finished_requests_ids) - intermediate_tensors = None - if not get_pp_group().is_first_rank: - intermediate_tensors = \ - self.model.make_empty_intermediate_tensors( - batch_size=batch_size, - dtype=self.model_config.dtype, - device=self.device) - - self.execute_model(model_input, kv_caches, intermediate_tensors) - torch.npu.synchronize() - return - - def remove_all_loras(self): - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.remove_all_adapters() - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.set_active_adapters(lora_requests, lora_mapping) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.add_adapter(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_adapter(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.pin_adapter(lora_id) - - def list_loras(self) -> Set[int]: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.list_adapters() - - def remove_all_prompt_adapters(self): - raise RuntimeError("PromptAdapter is not supported on NPU now.") - - def set_active_prompt_adapters( - self, prompt_adapter_requests: Set[PromptAdapterRequest], - prompt_adapter_mapping: PromptAdapterMapping) -> None: - raise RuntimeError("PromptAdapter is not supported on NPU now.") - - def add_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise RuntimeError("PromptAdapter is not supported on NPU now.") - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise RuntimeError("PromptAdapter is not supported on NPU now.") - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise RuntimeError("PromptAdapter is not supported on NPU now.") - - def list_prompt_adapters(self) -> Set[int]: - raise RuntimeError("PromptAdapter is not supported on NPU now.") - - @property - def vocab_size(self) -> int: - return self.model_config.get_vocab_size() - - -class NPUModelRunner(NPUModelRunnerBase[ModelInputForNPUWithSamplingMetadata]): - """ - NPU model runner with sampling step. 
- """ - _model_input_cls: Type[ModelInputForNPUWithSamplingMetadata] = ( - ModelInputForNPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForNPUBuilder] = ModelInputForNPUBuilder - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> ModelInputForNPUWithSamplingMetadata: - model_input = \ - ModelInputForNPUWithSamplingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - return model_input - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, - ) -> ModelInputForNPUWithSamplingMetadata: - """Prepare the model input based on a given sequence group, including - metadata for the sampling step. - The API assumes seq_group_metadata_list is sorted by prefill -> decode. - The result tensors and data structure also batches input in prefill - -> decode order. For example, - - input_tokens[:num_prefill_tokens] contains prefill tokens. - - input_tokens[num_prefill_tokens:] contains decode tokens. - """ - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - if get_pp_group().is_last_rank: - # Sampling metadata is only required for the final pp group - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - model_input.seq_lens, - model_input.query_lens, - self.device, - self.pin_memory, - generators, - self.sampling_metadata_cache, - # TODO (cmq): enable this after supported in vllm - # pad_for_invariant_seq_len=True, - ) - # Get hash value of request id list to perform sampling param cache in sampler. - request_ids = model_input.request_ids_to_seq_ids.keys( # type: ignore - ) # type: ignore - request_ids_hash = hash("".join(request_ids)) - sampling_metadata.request_ids_hash = request_ids_hash # type: ignore - else: - sampling_metadata = None - is_prompt = (seq_group_metadata_list[0].is_prompt - if seq_group_metadata_list else None) - return dataclasses.replace(model_input, - sampling_metadata=sampling_metadata, - is_prompt=is_prompt, - virtual_engine=virtual_engine) - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - **kwargs, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - if num_steps > 1: - raise ValueError("num_steps > 1 is not supported in ModelRunner") - - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - self.attn_state.begin_forward(model_input) - - assert model_input.attn_metadata is not None - # TODO(zzzzwwjj): Do we need to do it every time? - if self.torchair_graph_enabled: - torch._dynamo.mark_static(model_input.input_tokens) - torch._dynamo.mark_static(model_input.input_positions) - torch._dynamo.mark_static(model_input.attn_metadata.block_tables) - torch._dynamo.mark_static(model_input.attn_metadata.slot_mapping) - for kv in kv_caches: - if isinstance(kv, tuple): - torch._dynamo.mark_static(kv[0]) - torch._dynamo.mark_static(kv[1]) - - # TODO(andoorve): We can remove this once all - # virtual engines share the same kv cache. 
-        virtual_engine = model_input.virtual_engine
-        prefill_meta = model_input.attn_metadata.prefill_metadata
-        previous_hidden_states = kwargs.get("previous_hidden_states")
-        if prefill_meta is None and self.torchair_graph_enabled:
-            model_executable = self.compile_model
-            # Note: the graph_batch_size value differs from the GPU path.
-            graph_batch_size = model_input.input_tokens.shape[  # type: ignore
-                0]  # type: ignore
-            # Note: previous_hidden_states may be None, unlike the GPU path.
-            if previous_hidden_states is not None:
-                previous_hidden_states = torch.cat([
-                    previous_hidden_states,
-                    torch.empty([
-                        graph_batch_size - previous_hidden_states.shape[0],
-                        *previous_hidden_states.shape[1:]
-                    ],
-                                dtype=previous_hidden_states.dtype,
-                                device=previous_hidden_states.device)
-                ])
-        else:
-            model_executable = self.model
-
-        # Receive the KV cache in the distributed KV cache transfer setting.
-        # In the disaggregated prefill setting, this also receives hidden
-        # states and bypasses model forwarding.
-        # In the KV cache database setting, it changes the model input so
-        # that prefilling can be skipped for tokens whose KV caches were
-        # successfully received.
-        # NOTE: The receive operation is blocking.
-        bypass_model_exec = False
-        if self.need_recv_kv(model_input, kv_caches):
-            hidden_or_intermediate_states, bypass_model_exec, model_input = \
-                get_kv_transfer_group().recv_kv_caches_and_hidden_states(
-                    # model is used to know which layer the current worker
-                    # is working on, so that we can receive KV for only those
-                    # layers.
-                    model_executable,
-                    model_input,
-                    kv_caches=kv_caches
-                )
-
-        if get_dp_group().world_size > 1:
-            bypass_model_exec_tensor = torch.tensor(
-                1, dtype=torch.int32) if bypass_model_exec else torch.tensor(
-                    0, dtype=torch.int32)
-            torch.distributed.all_reduce(bypass_model_exec_tensor,
-                                         op=torch.distributed.ReduceOp.MIN,
-                                         group=get_dp_group().cpu_group)
-            # If any DP group has not received the necessary hidden states or
-            # KV cache, force all DP groups to execute the model.
-            if bypass_model_exec_tensor.item() == 0:
-                bypass_model_exec = False
-
-        multi_modal_kwargs = model_input.multi_modal_kwargs or {}
-        seqlen_agnostic_kwargs = {
-            "finished_requests_ids": model_input.finished_requests_ids,
-            "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids,
-        } if self.has_inner_state else {}
-
-        if self.torchair_graph_enabled:
-            model_kwargs: Dict[str, Any] = {"inputs_embeds": None}
-        else:
-            model_kwargs = {}
-        if previous_hidden_states is not None:
-            model_kwargs["previous_hidden_states"] = previous_hidden_states
-
-        if (self.observability_config is not None
-                and self.observability_config.collect_model_forward_time):
-            model_forward_start = torch.npu.Event(enable_timing=True)
-            model_forward_end = torch.npu.Event(enable_timing=True)
-            model_forward_start.record()
-
-        if not bypass_model_exec:
-            with set_forward_context(model_input.attn_metadata,
-                                     self.vllm_config, virtual_engine):
-                if model_input.attn_metadata is not None:
-                    model_input.attn_metadata.input_positions = model_input.input_positions
-                if self.torchair_graph_enabled:
-                    model_kwargs["kv_caches"] = kv_caches
-                    model_kwargs["attn_metadata"] = model_input.attn_metadata
-                hidden_or_intermediate_states = model_executable(
-                    input_ids=model_input.input_tokens,
-                    inputs_embeds=model_input.inputs_embeds,
-                    positions=model_input.input_positions,
-                    intermediate_tensors=intermediate_tensors,
-                    **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
-                                                 device=self.device),
-                    **seqlen_agnostic_kwargs,
-                    **model_kwargs)
-
-        # Compute the logits in the last pipeline stage.
- if not get_pp_group().is_last_rank: - if (self.is_driver_worker - and hidden_or_intermediate_states is not None - and isinstance(hidden_or_intermediate_states, - IntermediateTensors) - and self.observability_config is not None and - self.observability_config.collect_model_forward_time): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - hidden_or_intermediate_states.tensors[ - "model_forward_time"] = ( - torch.tensor(model_forward_time + - orig_model_forward_time)) - return hidden_or_intermediate_states - - logits = self.model.compute_logits(hidden_or_intermediate_states, - model_input.sampling_metadata) - - # Sending KV cache in distributed KV cache transfer setting - if self.need_send_kv(model_input, kv_caches): - get_kv_transfer_group().send_kv_caches_and_hidden_states( - # model_executable is used to know which layer the current - # worker is working on, so that we can send KV for only those - # layers. - model_executable, - model_input, - kv_caches, - hidden_or_intermediate_states, - ) - - if self.is_driver_worker: - if model_input.async_callback is not None: - model_input.async_callback() - - # Sample the next token. - assert isinstance(self.sampler, Sampler) - orig_include_gpu_probs = self.sampler.include_gpu_probs_tensor - if model_input.inputs_embeds is not None: - self.sampler.include_gpu_probs_tensor = True - - output: SamplerOutput = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time - and output is not None): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - # If there are multiple workers, we are still tracking the - # latency from the start time of the driver worker to the end - # time of the driver worker. The model forward time will then - # end up covering the communication time as well. 
- output.model_forward_time = (orig_model_forward_time + - model_forward_time) - - if model_input.inputs_embeds is not None: - if self.is_driver_worker: - sampled = broadcast_tensor_dict( - {"token_ids": output.sampled_token_ids}) - else: - sampled = broadcast_tensor_dict() - if sampled["token_ids"] is not None: - sampled_token_embeds = self.model.get_input_embeddings( - sampled["token_ids"].squeeze(1)) - if self.is_driver_worker: - self.sampler.include_gpu_probs_tensor = \ - orig_include_gpu_probs - - output.sampled_token_embeds = sampled_token_embeds - - for token_embed, sequence_group_output in zip( - output.sampled_token_embeds, output.outputs): - assert len(sequence_group_output.samples) == 1 - sequence_group_output.samples[ - 0].output_embed = token_embed - - if not self.is_driver_worker: - return [] - - if self.return_hidden_states: - # we only need to pass hidden states of most recent token - assert model_input.sampling_metadata is not None - indices = model_input.sampling_metadata.selected_token_indices - if model_input.is_prompt: - hidden_states = hidden_or_intermediate_states.index_select( - 0, indices) - output.prefill_hidden_states = hidden_or_intermediate_states - elif self.torchair_graph_enabled: - hidden_states = hidden_or_intermediate_states[:len(indices)] - else: - hidden_states = hidden_or_intermediate_states - - output.hidden_states = hidden_states - - return [output] - - def need_recv_kv(self, model_input, kv_caches) -> bool: - """Check if we need to receive kv-cache from the other worker. - We need to receive KV when - 1. current vLLM instance is KV cache consumer/decode vLLM instance - 2. this batch is not a profiling run - 3. this batch is a prefill run - - Args: - model_input: input to the model executable - kv_caches: vLLM's paged memory - """ - - if self.vllm_config.kv_transfer_config is None: - return False - - prefill_meta = model_input.attn_metadata.prefill_metadata - - # check if the current run is profiling - is_profile_run = (kv_caches[0].numel() == 0) - # check if the current run is prefill - is_prefill_run = prefill_meta is not None - - return self.vllm_config.kv_transfer_config.is_kv_consumer and ( - not is_profile_run) and is_prefill_run - - def need_send_kv(self, model_input, kv_caches) -> bool: - """Check if we need to send kv-cache to the other worker. - We need to send KV when - 1. current vLLM instance is KV cache producer/prefill vLLM instance - 2. this batch is not a profiling run - 3. 
this batch is a prefill run - - Args: - model_input: input to the model executable - kv_caches: vLLM's paged memory - """ - - if self.vllm_config.kv_transfer_config is None: - return False - - prefill_meta = model_input.attn_metadata.prefill_metadata - - # check if the current run is profiling - is_profile_run = (kv_caches[0].numel() == 0) - # check if the current run is prefill - is_prefill_run = prefill_meta is not None - - return self.vllm_config.kv_transfer_config.is_kv_producer and ( - not is_profile_run) and is_prefill_run diff --git a/vllm_ascend/worker/multi_step_runner.py b/vllm_ascend/worker/multi_step_runner.py deleted file mode 100644 index 028bcd05df..0000000000 --- a/vllm_ascend/worker/multi_step_runner.py +++ /dev/null @@ -1,737 +0,0 @@ -import dataclasses -import functools -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Union) - -import torch -from torch import nn -from vllm.distributed import get_pp_group -from vllm.model_executor.layers.sampler import (PromptLogprobs, SampleLogprobs, - SamplerOutput, - SamplingMetadata, get_logprobs, - get_pythonized_sample_results) -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - Logprob, SequenceGroupMetadata, SequenceOutput) -from vllm.worker.model_runner_base import ( - _init_attn_metadata_from_tensor_dict, - _init_frozen_model_input_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) -from vllm.worker.multi_step_model_runner import (ModelOutput, - PythonizationCache, - StatefulModelInput) - -from vllm_ascend.utils import current_stream -from vllm_ascend.worker.model_runner import ( - ModelInputForNPUWithSamplingMetadata, NPUModelRunnerBase) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - - -@dataclass(frozen=False) -class StatefulModelInputForNPU(StatefulModelInput): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def record_step_event(self, current_stream: torch.npu.Stream): - # record the event for the current step so that the next step can sync - # on it. We modulo by 2 to keep the events in a circular buffer and - # support any attn backends that may be supported in the future. ie - # Flashinfer would want two DecodeWrappers to overlap the CPU and NPU. - self.step_cuda_events[self.current_step & 1] = \ - torch.npu.Event(blocking=True) - self.step_cuda_events[self.current_step & 1].record(current_stream) - - # actual frozen model input dataclass passed to _base_model_runner - frozen_model_input: Optional[ModelInputForNPUWithSamplingMetadata] = None - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "StatefulModelInputForNPU": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - tensor_dict = _init_frozen_model_input_from_tensor_dict( - ModelInputForNPUWithSamplingMetadata, tensor_dict) - return cls(**tensor_dict) - - def maybe_advance_frozen_model_input(self, device: str, pin_memory: bool): - """ - Advancing the datastructures of StatefulModelInput::frozen_model_input - is only required when prefills are scheduled with decodes to run in - multi-step. 
This advancement/correction is required to account for - the conversion of Prefills to Decodes after the first multi-step. - """ - if self.current_step != 1 or self.num_single_step_prefills == 0: - return - - assert self.frozen_model_input is not None - fmi = self.frozen_model_input - - # Truncate input_tokens - assert fmi.input_tokens is not None - assert fmi.input_tokens.shape[0] >= self.num_seqs - fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs] - - # Update frozen_model_input::input_positons. - assert fmi.input_positions is not None - assert fmi.input_positions.shape[0] >= self.num_seqs - fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self. - num_seqs] - - # Assert unsupported - # TODO Uncomment the following codes when NPU supported - # assert fmi.lora_mapping is None - # assert fmi.lora_requests is not None - # assert len(fmi.lora_requests) == 0 - # assert fmi.prompt_adapter_mapping is None - # assert fmi.prompt_adapter_requests is not None - # assert len(fmi.prompt_adapter_requests) == 0 - assert fmi.attn_metadata is not None - assert fmi.multi_modal_kwargs is not None - assert len(fmi.multi_modal_kwargs) == 0 - - self.frozen_model_input = dataclasses.replace( - self.frozen_model_input, - input_tokens=fmi_new_input_tokens, - input_positions=fmi_new_input_positions) - - self.maybe_advance_sampling_metadata(device, pin_memory) - - -@dataclass(frozen=False) -class NPUModelOutput(ModelOutput): - - logprobs: Optional["torch.Tensor"] = None - - def _pythonize_sampler_output(self, input_metadata: "StatefulModelInput", - copy_stream: torch.npu.Stream, - pinned_sampled_token_buffer: torch.Tensor, - blocking: bool) -> bool: - """ - If blocking is set, will block until the forward pass for the output is - ready and pythonize the output. Upon completing Pythonization, erases - self.logprobs (note that a non-blocking call that is performed when - the sampler output is not yet ready, will not erase self.logprobs.) - """ - assert self.sampled_token_ids is not None - if not blocking and not self.sampler_output_ready_event.query(): - return False - - if blocking: - self.sampler_output_ready_event.synchronize() - with torch.npu.stream(copy_stream): - _pythonize_sampler_output(input_metadata, self.sampler_output, - pinned_sampled_token_buffer, - self.sampled_token_ids, self.logprobs, - self.pythonization_cache) - - # Erase the logprobs GPU-side tensor. - # Note that although _pythonize_sampler_output() runs in its - # own CUDA stream, nonetheless _pythonize_sampler_output() - # cannot return until Pythonization is complete; therefore - # we know that by the time the CPU reaches this point, - # `self.logprobs` is no longer needed. - self.logprobs = None - return True - - -class MultiStepModelNPURunner(NPUModelRunnerBase[StatefulModelInputForNPU]): - # mypy: enable-error-code=type-var - - def __init__(self, base_model_runner: NPUModelRunnerBase, *args, **kwargs): - super().__init__(*args, **kwargs) - - # uses the base model runner to execute the model and wraps it with - # multi-step logic - self._base_model_runner: NPUModelRunnerBase = base_model_runner - - self.is_multi_step = self.scheduler_config.is_multi_step - self.pinned_sampled_token_ids: Optional[torch.Tensor] = None - - # Using the PythonizationCache in Pipeline-Parallel clobbers the - # SequenceOutput and CompletionSequenceGroupOutput object. - # When cache-reset happens at the last step of a multi-step - # execution, there may be other on-going single-step/multi-step - # executions. 
The current caching implementation does not check - # for this. - self.pythonization_cache = PythonizationCache() \ - if self.parallel_config.pipeline_parallel_size == 1 else None - - def get_model(self) -> nn.Module: - return self._base_model_runner.get_model() - - @functools.cached_property - def _copy_stream(self): - # used to copy tensors from NPU to CPU asynchronously - return torch.npu.Stream() - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> StatefulModelInputForNPU: - model_input = (StatefulModelInputForNPU.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - )) - return model_input - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> StatefulModelInputForNPU: - frozen_model_input: ModelInputForNPUWithSamplingMetadata = \ - self._base_model_runner.prepare_model_input( - seq_group_metadata_list, - virtual_engine, - finished_requests_ids) - - assert frozen_model_input.query_lens is not None - assert frozen_model_input.seq_lens is not None - assert frozen_model_input.attn_metadata is not None - num_queries = len(frozen_model_input.query_lens) - num_seqs = len(frozen_model_input.seq_lens) - num_single_step_prefills = frozen_model_input.attn_metadata.num_prefills - - model_input = StatefulModelInputForNPU( - frozen_model_input=frozen_model_input, - num_seqs=num_seqs, - num_queries=num_queries, - num_single_step_prefills=num_single_step_prefills, - step_cuda_events=[torch.npu.Event(blocking=True)] * 2, - ) - - return model_input - - def _async_process_outputs(self, model_input: StatefulModelInputForNPU, - output_proc_callback: Callable): - # Proceed with pythonization and output_proc in order. 
- # Stop on the first one that fails to pythonize - output_proc_callback() - - cont = True - for step_num, model_output in enumerate(model_input.cached_outputs): - if not model_output.pythonized: - model_output.maybe_pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - if model_output.pythonized: - ctx = output_proc_callback.keywords["ctx"] # type: ignore - ctx.append_output( - outputs=[model_output.sampler_output], - seq_group_metadata_list=ctx.seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=step_num == 0) - - output_proc_callback() - else: - cont = False - - if not cont: - break - - def _final_process_outputs( - self, model_input: StatefulModelInputForNPU, - output_proc_callback: Optional[Callable]) -> List[SamplerOutput]: - assert model_input.frozen_model_input is not None - - has_async_callback = output_proc_callback is not None - - outputs = [] - for step_num, output in enumerate(model_input.cached_outputs): - is_last_step = step_num == len(model_input.cached_outputs) - 1 - - # For non-async case: - # -- We simply add the outputs - # For async case: - # -- Invoke callback, pythonize, add to callback queue and repeat - # -- For last output, just add to callback queue - if has_async_callback: - assert output_proc_callback is not None - - # Invoke callback before pythonize (to overlap with NPU) - output_proc_callback() - - # Pythonize - if not output.pythonized: - output.pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - - # For non last step, add to callback queue to chain - # callbacks=>pythonize pairs (for NPU overlap) - if not is_last_step: - ctx = output_proc_callback.keywords[ # type: ignore - "ctx"] # type: ignore - ctx.append_output( - outputs=[output.sampler_output], - seq_group_metadata_list=ctx. - seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=step_num == 0) - else: - outputs.append(output.sampler_output) - else: - output.pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - outputs.append(output.sampler_output) - - return outputs - - @torch.inference_mode() - def execute_model( - self, - model_input: StatefulModelInputForNPU, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - """ - Execute the model for a single step and update multi-step - metadata - """ - assert num_steps == 1, "MultiStepModelRunner only supports num_steps=1" - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - # path for warm up runs - if not model_input.is_multi_step: - return self._base_model_runner.execute_model( - frozen_model_input, kv_caches, intermediate_tensors, num_steps) - - # make sure we skip the sampler on the lask rank and only pythonize - # if CPU is ahead. 
- if self.is_driver_worker and get_pp_group().is_last_rank: - if self.pinned_sampled_token_ids is None: - self.pinned_sampled_token_ids = torch.zeros( - (self.scheduler_config.max_num_seqs, 1), - dtype=torch.long, - device="cpu", - pin_memory=True) - self._base_model_runner.sampler.include_gpu_probs_tensor = True - if frozen_model_input.sampling_metadata: - frozen_model_input.sampling_metadata.skip_sampler_cpu_output = ( - True) - - # some pre-execute model logic for multi-step: - # - if it's the first step, we need to reset the sampling tensors - # - if it's not the first step, we need to advance the step using the - # appended sampler output from last iteration - # - also maybe pythonize if CPU is ahead of NPU - - stream = current_stream() - if not model_input.is_first_multi_step: - # Explicitly block on the previous step's forward to make sure we - # don't clobber any NPU tensors still in use. - # This is not needed for flashattn backend, but for other attn - # backends such as flashinfer that performs extra CPU operations on - # input metadata we may need to synchronize any CPU operations that - # might clobber enqueued forwards. (prevents CPU from running too - # far ahead if needed) - model_input.wait_previous_step() - model_input = self._advance_step( - model_input, model_input.cached_outputs[-1].sampler_output) - - # frozen_model_input may have been updated - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - if model_input.base_output_proc_callback is None: - assert frozen_model_input is not None - model_input.base_output_proc_callback = \ - frozen_model_input.async_callback - - if frozen_model_input.async_callback is not None: - assert model_input.base_output_proc_callback is not None - async_callback = functools.partial( - self._async_process_outputs, - model_input=model_input, - output_proc_callback=model_input.base_output_proc_callback) - - model_input.frozen_model_input = dataclasses.replace( # type: ignore - model_input.frozen_model_input, - async_callback=async_callback) - # Update the local instance - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - # Execute the model - output = self._base_model_runner.execute_model(frozen_model_input, - kv_caches, - intermediate_tensors, - num_steps=1) - - # record the event for the current step so that the next step can sync - model_input.record_step_event(stream) - - if get_pp_group().is_last_rank and self.is_driver_worker: - assert isinstance(output, list) - assert len( - output - ) == 1, "MultiStepModelRunner requires single-step base_models" - - # event for the pythonization so that we only pythonize if the - # tensors are ready. May be able to be combined with the step event - output_ready_event = torch.npu.Event() - output_ready_event.record(stream) - if self.parallel_config.pipeline_parallel_size > 1: - output[0].sampled_token_ids_cpu = output[ - 0].sampled_token_ids.cpu() - model_input.cached_outputs.append( - NPUModelOutput(output[0], output_ready_event, - output[0].sampled_token_ids, False, - output[0].logprobs, self.pythonization_cache)) - - # These NPU tensors are not required by multi-step; - # erase them to ensure they are not pythonized or - # transferred to CPU - output[0].sampled_token_ids = None - output[0].sampled_token_probs = None - output[0].logprobs = None - - # Pythonize the output if CPU is ahead and the previous step is - # ready. 
- if frozen_model_input.async_callback is None: - for model_output in model_input.cached_outputs: - model_output.maybe_pythonize(model_input, - self._copy_stream, - self.pinned_sampled_token_ids) - - model_input.current_step += 1 - - if not get_pp_group().is_last_rank: - # Should be IntermediateTensors - assert isinstance(output, IntermediateTensors) - return output - if not self.is_driver_worker: - return [] - - # Pythonize the output and block if needed since it is the last step - if model_input.is_last_step: - outputs = self._final_process_outputs( - model_input, model_input.base_output_proc_callback) - if self.pythonization_cache: - self.pythonization_cache.reset() - return outputs - - # should be [SamplerOutput] - return output - - def _update_sampling_metadata(self, sampling_metadata: SamplingMetadata, - num_seqs: Optional[int], num_queries: int): - - assert sampling_metadata.num_prompts == 0 - assert len(sampling_metadata.seq_groups) == num_queries - assert sampling_metadata.selected_token_indices.shape == ( - num_queries, ) - # assert sampling_metadata.categorized_sample_indices == TODO: Add if needed # noqa: E501 - - # Verify that all sequences are decodes - for i in range(num_queries): - seq_group = sampling_metadata.seq_groups[i] - - assert seq_group.is_prompt is False # No prompt - assert seq_group.prompt_logprob_indices == [] # No prompt - assert seq_group.sample_indices == [i] # Simple - assert seq_group.seq_len is None # Decode - assert seq_group.query_len is None # Decode - - def _advance_step(self, model_input: StatefulModelInputForNPU, - out: SamplerOutput) -> StatefulModelInputForNPU: - - model_input.maybe_advance_frozen_model_input(self.device, - self.pin_memory) - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - assert frozen_model_input.input_tokens is not None - assert frozen_model_input.input_tokens.shape[0] == model_input.num_seqs - assert frozen_model_input.attn_metadata is not None - - sampled_token_ids = model_input.cached_outputs[-1].sampled_token_ids - num_seqs = model_input.num_seqs - num_queries = model_input.num_queries - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - attn_metadata = frozen_model_input.attn_metadata - assert attn_metadata is not None - - turn_prefills_into_decodes: bool = model_input.current_step == 1 and \ - model_input.num_single_step_prefills != 0 - attn_metadata.advance_step( - frozen_model_input, - sampled_token_ids, - self.block_size, - num_seqs, - num_queries, - turn_prefills_into_decodes=turn_prefills_into_decodes) - - return model_input - - def load_model(self) -> None: - self._base_model_runner.load_model() - self.model_memory_usage = self._base_model_runner.model_memory_usage - - def save_sharded_state( - self, - path: str, - pattern: Optional[str] = None, - max_size: Optional[int] = None, - ) -> None: - return self._base_model_runner.save_sharded_state( - path, pattern, max_size) - - def save_tensorized_model(self, - tensorizer_config: TensorizerConfig) -> None: - return self._base_model_runner.save_tensorized_model(tensorizer_config) - - def profile_run(self) -> None: - return self._base_model_runner.profile_run() - - def remove_all_loras(self): - return self._base_model_runner.remove_all_loras() - - def capture_model(self, kv_caches: List[List]) -> None: - return self._base_model_runner.capture_model(kv_caches) - - @property - def vocab_size(self) -> int: - return self._base_model_runner.vocab_size - - -DeferredLogprobsReturnType = 
Tuple[Optional[List[Optional[PromptLogprobs]]], - Optional[List[SampleLogprobs]]] - - -def deferred_pythonize_logprobs( - output: SamplerOutput, - sampling_metadata: SamplingMetadata, - logprobs_tensor: Optional[torch.Tensor], -) -> DeferredLogprobsReturnType: - """Perform deferred logprob Pythonization. - - 1. Pythonize NPU-side sampler result tensors into CPU-side sampler result. - 2. Pythonize NPU-side logprobs tensor into CPU-side logprobs lists, - utilizing the Pythonized sampler result computed in step 1. - - These deferred computations are not required for single-step scheduling - or the `profile_run()` phase of multi-step scheduling. - - Args: - output: sampler output (under deferred Pythonization) - sampling_metadata - - Returns: - prompt_logprobs (CPU), sample_logprobs (CPU) - """ - - # - Deferred pythonization of sample result - sampler_result = get_pythonized_sample_results( - output.deferred_sample_results_args) - - # - Erase the NPU-side deferred sample_result - # computation args to ensure it is never - # pythonized or transferred to CPU - output.deferred_sample_results_args = None - - # - Deferred pythonization of logprobs - ( - prompt_logprobs, - sample_logprobs, - ) = get_logprobs(logprobs_tensor, sampling_metadata, sampler_result) - assert len(prompt_logprobs) == len(sampling_metadata.seq_groups) - assert len(sample_logprobs) == len(sampling_metadata.seq_groups) - - return prompt_logprobs, sample_logprobs - - -def _pythonize_sampler_output( - model_input: StatefulModelInputForNPU, - output: SamplerOutput, - pinned_sampled_token_buffer: torch.Tensor, - sampled_token_ids: torch.Tensor, - logprobs_tensor: Optional[torch.Tensor], - cache: Optional[PythonizationCache], -) -> None: - """ This function is only called when the output tensors are ready. - See :class:`ModelOutput`. - - Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, - adding a Pythonized output data structure - (:class:`CompletionSequenceGroupOutput`) for each :class:`SequenceGroup`. - - Args: - model_input - output: sampler output - pinned_sampled_token_token_buffer: CPU-side pinned memory - (receives copy of - NPU-side token buffer.) - sampled_token_ids: NPU-side token buffer - logprobs_tensor: NPU-side tensor containing - logprobs computed during sampling - """ - - assert model_input.frozen_model_input is not None - - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input.sampling_metadata is not None - sampling_metadata = frozen_model_input.sampling_metadata - # samples generation should have been skipped - assert not output.outputs - - pinned_buffer = pinned_sampled_token_buffer[:model_input.num_queries] - - # We guarantee output tensors are ready, so it is safe to - # pythonize the sampler output & obtain CPU-side logprobs. - # - # However we should check whether logprobs pythonization may - # be skipped entirely, i.e. because no logprobs were requested - # or pythonization was not deferred. To that end, - # - # * `prompt_logprobs_are_requested_for_prefill` signals that - # there are *any* prefill-phase requests which specify that - # prompt logprobs should be returned. - # - # * `any_logprobs_are_requested` signals that there are any - # requests which (1) specify that sample logprobs should be - # returned, or (2) are in the prefill phase AND specify that - # prompt logprobs should be returned. - # - # Later on, these flags cause adjustments to the pythonization - # process to accommodate logprobs. 
- - seq_groups = sampling_metadata.seq_groups - prompt_logprobs_are_requested_for_prefill = any([ - sg.sampling_params.prompt_logprobs is not None and sg.is_prompt - for sg in seq_groups - ]) - any_logprobs_are_requested = ( - prompt_logprobs_are_requested_for_prefill - or any([sg.sampling_params.logprobs is not None for sg in seq_groups])) - - if prompt_logprobs_are_requested_for_prefill: - # CPU NPU sync, after gathering *only* sampled tokens (since - # requesting prompt logprobs leads `sampled_token_ids` to - # include prompt token ids in addition to sampled token ids.) - sample_idx_tensor = torch.tensor( - [sdx for sg in seq_groups for sdx in sg.sample_indices]) - pinned_buffer = pinned_buffer.copy_( - sampled_token_ids[sample_idx_tensor, :], non_blocking=False) - else: - # CPU NPU sync - pinned_buffer = pinned_buffer.copy_(sampled_token_ids, - non_blocking=False) - - # this will not block as the tensors are already on CPU - samples_list = pinned_buffer.tolist() - - skip_sampler_cpu_output = ( - frozen_model_input.sampling_metadata.skip_sampler_cpu_output) - - # *Don't* skip logprobs pythonization *if*: - # * Any requests require logprobs to be returned in this - # iteration AND - # * These requests are being scheduled in a fashion which - # defers pythonization (i.e. multi-step scheduling.) - do_pythonize_logprobs = (skip_sampler_cpu_output - and any_logprobs_are_requested) - ( - prompt_logprobs, - sample_logprobs, - ) = (deferred_pythonize_logprobs(output, sampling_metadata, - logprobs_tensor) - if do_pythonize_logprobs else (None, None)) - - for sgdx, (seq_group, - sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/source/features/compatibility_matrix.md - # If the feature combo become valid - # (Check for Guided Decoding) - if seq_group.sampling_params.logits_processors: - assert len(seq_group.sampling_params.logits_processors) == 0, ( - "Logits Processors are not supported in multi-step decoding") - - if do_pythonize_logprobs: - assert prompt_logprobs is not None - assert sample_logprobs is not None - - ( - group_prompt_logprobs, - group_sample_logprobs, - ) = ( # Utilize deferred pythonization results - prompt_logprobs[sgdx], - sample_logprobs[sgdx], - ) - elif any_logprobs_are_requested: - ( - group_prompt_logprobs, - group_sample_logprobs, - ) = ( - # profile_run: use already-computed logprobs - output.outputs[sgdx].prompt_logprobs, - [sample.logprobs for sample in output.outputs[sgdx].samples]) - - seq_ids = seq_group.seq_ids - next_token_ids = sample_result - parent_ids = [0] - seq_outputs: List[SequenceOutput] - - if cache is not None: - completion_seq_group_output: CompletionSequenceGroupOutput = \ - cache.cached_completion_seq_group_output.get_object() - completion_seq_group_output.samples.clear() - seq_outputs = completion_seq_group_output.samples - else: - seq_outputs = [] - - for tdx, (parent_id, - next_token_id) in enumerate(zip(parent_ids, next_token_ids)): - if cache is not None: - seq_output: SequenceOutput = cache.cached_seq_output.get_object( - ) - seq_output.parent_seq_id = seq_ids[parent_id] - seq_output.output_token = next_token_id - - if any_logprobs_are_requested: - seq_output.logprobs = group_sample_logprobs[tdx] - else: - logprobs = next(iter(seq_output.logprobs.values())) - seq_output.logprobs.clear() - - logprobs.logprob = float('inf') - logprobs.rank = None - logprobs.decoded_token = None - - seq_output.logprobs[next_token_id] = logprobs - - seq_outputs.append(seq_output) - - else: - seq_outputs.append( - 
SequenceOutput(seq_ids[parent_id], next_token_id, - (group_sample_logprobs[tdx] - if any_logprobs_are_requested else { - next_token_id: - Logprob(logprob=float('inf'), - rank=None, - decoded_token=None) - }))) - if cache is not None: - completion_seq_group_output.prompt_logprobs = \ - group_prompt_logprobs if any_logprobs_are_requested else None - output.outputs.append(completion_seq_group_output) - else: - output.outputs.append( - CompletionSequenceGroupOutput( - seq_outputs, (group_prompt_logprobs - if any_logprobs_are_requested else None))) - - assert len(output.outputs) > 0 diff --git a/vllm_ascend/worker/multi_step_worker.py b/vllm_ascend/worker/multi_step_worker.py deleted file mode 100644 index 6d092805d5..0000000000 --- a/vllm_ascend/worker/multi_step_worker.py +++ /dev/null @@ -1,194 +0,0 @@ -import dataclasses -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple - -import torch -from vllm.distributed import broadcast_tensor_dict, get_pp_group -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.worker.model_runner_base import BroadcastableModelInput -from vllm.worker.multi_step_model_runner import StatefulModelInput - -from vllm_ascend.worker.multi_step_runner import MultiStepModelNPURunner -from vllm_ascend.worker.worker import NPUWorker, WorkerInput - - -@dataclass -class MultiStepState: - worker_input: WorkerInput - model_input: StatefulModelInput - - -class MultiStepWorker(NPUWorker): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - base_model_runner = self.model_runner - # for multi-step model, wrap the model runner with MultiStepModelRunner - self.model_runner = MultiStepModelNPURunner( - base_model_runner, - vllm_config=base_model_runner.vllm_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=base_model_runner.is_driver_worker, - ) - - pipeline_parallel_size = self.parallel_config.pipeline_parallel_size - self.multi_step_states: List[ - Optional[MultiStepState]] = [None] * pipeline_parallel_size - self.temp_output = None - - def _get_driver_input_and_broadcast( - self, execute_model_req: ExecuteModelRequest - ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: - """ - Get the driver input and broadcast it to other workers. - """ - assert self.is_driver_worker - virtual_engine = execute_model_req.virtual_engine - is_first_multi_step = execute_model_req.is_first_multi_step - if is_first_multi_step: - # on first step we prepare the worker input and model input normally - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: StatefulModelInput = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list, - execute_model_req.virtual_engine, - execute_model_req.finished_requests_ids)) - - if execute_model_req.async_callback: - model_input.frozen_model_input = dataclasses.replace( # type: ignore - model_input.frozen_model_input, - async_callback=execute_model_req.async_callback) - else: - # on subsequent steps we reuse the worker input and model input - multi_step_state = self.multi_step_states[virtual_engine] - worker_input = multi_step_state.worker_input - model_input = multi_step_state.model_input - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - assert frozen_model_input.attn_metadata is not None - # clear the cached metadata so that it can be recomputed on - # the workers. 
- frozen_model_input.attn_metadata._cached_prefill_metadata = None
- frozen_model_input.attn_metadata._cached_decode_metadata = None
-
- model_input.is_first_multi_step = is_first_multi_step
- model_input.is_last_step = execute_model_req.is_last_step
-
- if not is_first_multi_step:
- # we broadcast the last sampled token ids to all TP workers so they
- # can update their model input metadata in-place.
- self._prepare_last_sampled_token_ids_for_tp_workers(
- execute_model_req=execute_model_req, model_input=model_input)
-
- if self.do_metadata_broadcast:
- broadcast_data = worker_input.as_broadcastable_tensor_dict()
- broadcast_data.update(model_input.as_broadcastable_tensor_dict())
- broadcast_tensor_dict(broadcast_data, src=0)
-
- # Returning an empty dict here to keep this compatible with
- # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast`
- return model_input, worker_input, {}
-
- def _prepare_last_sampled_token_ids_for_tp_workers(
- self,
- execute_model_req: ExecuteModelRequest,
- model_input: StatefulModelInput,
- ) -> None:
- """
- Prepare the last sampled token ids for TP workers. If it's the last
- PP rank, then the last sampled token ids are already in the model_input.
- If it is NOT the last PP rank, then we need to get the last sampled
- token that is cached in the execute_model_req.
- """
- if get_pp_group().is_last_rank:
- assert model_input.cached_outputs[
- -1].sampler_output.sampled_token_ids is None
- assert model_input.cached_outputs[-1].sampled_token_ids is not None
- model_input.last_sampled_token_ids = model_input.cached_outputs[
- -1].sampled_token_ids
- # free sampled token ids from the previous step if it has been
- # pythonized. Cannot free the last sampled token ids because
- # we need it for GPU advance_step.
- for output in model_input.cached_outputs[:-1]:
- if output.pythonized:
- output.sampled_token_ids = None
- else:
- # otherwise we need to get the cached sampled token ids from the
- # execute_model_req
- assert execute_model_req.last_sampled_token_ids is not None
- model_input.last_sampled_token_ids = (
- execute_model_req.last_sampled_token_ids.npu())
- model_input.add_sampler_output(
- SamplerOutput(outputs=[], sampled_token_ids=None),
- model_input.last_sampled_token_ids)
-
- # free sampled token ids from the previous step.
- # TODO(will) we could reuse the sampled token ids tensor from
- # the previous step instead.
- for output in model_input.cached_outputs[:-1]:
- output.sampled_token_ids = None
- assert model_input.cached_outputs[-1].sampled_token_ids is not None
-
- def prepare_input(
- self,
- execute_model_req: Optional[ExecuteModelRequest] = None,
- ) -> Optional[Tuple[StatefulModelInput, WorkerInput, Dict[str,
- torch.Tensor]]]:
- """
- Depending on the current state of the request and the multi-step worker,
- this method may skip the normal _prepare_model_input and
- _prepare_worker_input methods and instead use cached values.
- """
- if self.is_driver_worker:
- if execute_model_req is None:
- if self.do_metadata_broadcast:
- # This signals that there are no more requests to process for
- # now. All workers are running an infinite loop with
- # broadcast_tensor_dict, and it stops the loop when the
- # driver broadcasts an empty input. Send an empty input to
- # notify all other workers to stop their execution loop.
- broadcast_tensor_dict({}, src=0) - return None - - virtual_engine = execute_model_req.virtual_engine - (model_input, worker_input, - kwargs) = self._get_driver_input_and_broadcast(execute_model_req) - assert isinstance(model_input, StatefulModelInput) - if execute_model_req.is_first_multi_step: - # cache the worker input and model input for the next steps - self.multi_step_states[virtual_engine] = MultiStepState( - worker_input=worker_input, model_input=model_input) - # if TP workers - else: - broadcast_data = self._get_worker_input_from_broadcast() - # if the driver has sent an empty input, we should stop the worker - # loop - if broadcast_data is None: - return None - model_input, worker_input, kwargs = broadcast_data - assert isinstance(model_input, StatefulModelInput) - virtual_engine = worker_input.virtual_engine - if model_input.is_first_multi_step: - pass - # TODO(will) Can cache the worker input and model input for the - # next steps. See below for details - else: - # TODO(will) possible to also cache and reuse the cached worker - # input and model input. The idea is essentially the delta - # optimization for model_inputs. Where the TP workers can cache - # the model input states and we only broadcast the delta need - # for the next step (sampled_token_ids from the previous step) - - assert isinstance(model_input, StatefulModelInput) - # we need to update the last sampled token ids in the model - # input for the workers so that they can run inplace - # advance_step - model_input.add_sampler_output( - SamplerOutput(outputs=[], sampled_token_ids=None), - model_input.last_sampled_token_ids) - - assert model_input is not None - assert worker_input is not None - return model_input, worker_input, kwargs diff --git a/vllm_ascend/worker/pooling_model_runner.py b/vllm_ascend/worker/pooling_model_runner.py deleted file mode 100644 index e1262fb0a2..0000000000 --- a/vllm_ascend/worker/pooling_model_runner.py +++ /dev/null @@ -1,186 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# Adapted from vllm-project/vllm/vllm/worker/worker.py -# -import dataclasses -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch -from vllm.distributed import get_pp_group -from vllm.forward_context import set_forward_context -from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.multimodal import MultiModalKwargs -from vllm.pooling_params import PoolingParams -from vllm.sequence import (IntermediateTensors, SequenceData, - SequenceGroupMetadata) - -from vllm_ascend.worker.model_runner import (ModelInputForNPU, - ModelInputForNPUBuilder, - NPUModelRunnerBase) - - -@dataclasses.dataclass(frozen=True) -class ModelInputForNPUWithPoolingMetadata(ModelInputForNPU): - """ - Used by the PoolingModelRunner. 
- """ - pooling_metadata: Optional["PoolingMetadata"] = None - - -class NPUPoolingModelRunner( - NPUModelRunnerBase[ModelInputForNPUWithPoolingMetadata]): - - _model_input_cls: Type[ModelInputForNPUWithPoolingMetadata] = ( - ModelInputForNPUWithPoolingMetadata) - _builder_cls: Type[ModelInputForNPUBuilder] = ModelInputForNPUBuilder - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, - Any]) -> ModelInputForNPUWithPoolingMetadata: - return ModelInputForNPUWithPoolingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - - def prepare_model_input( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForNPUWithPoolingMetadata: - assert seq_group_metadata_list is not None - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - # Prepare PoolingMetadata. - assert model_input.seq_lens is not None - pooling_metadata = self._prepare_pooling(seq_group_metadata_list, - model_input.seq_lens) - - return dataclasses.replace(model_input, - pooling_metadata=pooling_metadata) - - def _prepare_pooling( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - prompt_lens: List[int], - ) -> PoolingMetadata: - """Prepare PoolingMetadata for the sequence group metadata list.""" - seq_groups: List[Tuple[List[int], PoolingParams]] = [] - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - seq_ids = list(seq_group_metadata.seq_data.keys()) - pooling_params = seq_group_metadata.pooling_params - seq_groups.append((seq_ids, pooling_params)) - - seq_data: Dict[int, SequenceData] = {} - for seq_group_metadata in seq_group_metadata_list: - seq_data.update(seq_group_metadata.seq_data) - - pooling_metadata = PoolingMetadata( - seq_groups=seq_groups, - seq_data=seq_data, - prompt_lens=prompt_lens, - ) - - return pooling_metadata - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNPUWithPoolingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ): - if num_steps > 1: - raise ValueError( - "PoolingModelRunner does not support multi-step execution.") - - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - if self.prompt_adapter_config: - assert model_input.prompt_adapter_requests is not None - assert model_input.prompt_adapter_mapping is not None - self.set_active_prompt_adapters( - model_input.prompt_adapter_requests, - model_input.prompt_adapter_mapping) - - assert model_input.attn_metadata is not None - virtual_engine = model_input.virtual_engine - model_executable = self.model - - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - seqlen_agnostic_kwargs = { - "finished_requests_ids": model_input.finished_requests_ids, - "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_inner_state else {} - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_start = torch.npu.Event(enable_timing=True) - model_forward_end = torch.npu.Event(enable_timing=True) - model_forward_start.record() - - cross_enc_kwargs = {} - if model_input.token_types is not None: - cross_enc_kwargs["token_type_ids"] = model_input.token_types - - with 
set_forward_context(model_input.attn_metadata, self.vllm_config, - virtual_engine): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device), - **cross_enc_kwargs, - **seqlen_agnostic_kwargs) - - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.record() - - # Only perform pooling in the last pipeline stage. - if not get_pp_group().is_last_rank: - if (self.is_driver_worker - and hidden_or_intermediate_states is not None - and isinstance(hidden_or_intermediate_states, - IntermediateTensors) - and self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - hidden_or_intermediate_states.tensors["model_forward_time"] = ( - torch.tensor(model_forward_time + orig_model_forward_time)) - return hidden_or_intermediate_states - - # Only perform pooling in the driver worker. - if not self.is_driver_worker: - return [] - - return [ - self.model.pooler(hidden_states=hidden_or_intermediate_states, - pooling_metadata=model_input.pooling_metadata) - ] diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py deleted file mode 100644 index bffc6a8de8..0000000000 --- a/vllm_ascend/worker/worker.py +++ /dev/null @@ -1,579 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# Adapted from vllm-project/vllm/vllm/worker/worker.py -# - -import gc -import os -from typing import Dict, List, Optional, Set, Tuple, Type, Union - -import msgpack # type: ignore -import torch -import torch.distributed -import zmq -from torch import nn -from vllm import envs -from vllm.config import VllmConfig, set_current_vllm_config -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment, - set_custom_all_reduce) -from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized -from vllm.logger import logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, - SequenceGroupMetadata, SequenceGroupMetadataDelta) -from vllm.utils import GiB_bytes, bind_kv_cache, get_ip -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner -from vllm.worker.model_runner_base import ModelRunnerBase -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, - WorkerInput) - -from vllm_ascend.ascend_config import init_ascend_config -from vllm_ascend.device_allocator.camem import CaMemAllocator -from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel -from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, - is_310p, try_register_lib) -from vllm_ascend.worker.model_runner import NPUModelRunner -from vllm_ascend.worker.pooling_model_runner import NPUPoolingModelRunner - - -class NPUWorker(LocalOrDistributedWorkerBase): - """A worker class that executes (a partition of) the model on a NPU. - Each worker is associated with a single NPU. The worker is responsible for - maintaining the KV cache and executing the model on the NPU. In case of - distributed inference, each worker is assigned a partition of the model. - """ - - def __init__(self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool = False, - model_runner_cls: Optional[Type[ModelRunnerBase]] = None): - # register patch for vllm - from vllm_ascend.utils import adapt_patch - adapt_patch() - # Register ops when worker init. - from vllm_ascend import ops # noqa: F401 - - # init ascend config - init_ascend_config(vllm_config) - - WorkerBase.__init__(self, vllm_config=vllm_config) - # Try to import mindie_turbo to accelerate vLLM inference. - try_register_lib( - "mindie_turbo", - "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo." 
- ) - # distribute related config - self.parallel_config.rank = rank - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() - - # Return hidden states from target model if the draft model is an - # mlp_speculator - speculative_config = self.speculative_config - model_config = self.model_config - speculative_args = {} if speculative_config is None \ - or (speculative_config.draft_model_config.hf_config.model_type == - model_config.hf_config.model_type) \ - or (speculative_config.draft_model_config.hf_config.model_type - not in ["medusa", "mlp_speculator", "eagle", "deepseek_mtp"]) \ - else {"return_hidden_states": True} - - ModelRunnerClass: Type[ModelRunnerBase] = NPUModelRunner - if model_config.runner_type == "pooling": - ModelRunnerClass = NPUPoolingModelRunner - elif self.model_config.is_encoder_decoder: - ModelRunnerClass = EncoderDecoderModelRunner - self.model_runner: ModelRunnerBase = ModelRunnerClass( - vllm_config=self.vllm_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker, - **speculative_args, - ) - if model_runner_cls is not None: - self.model_runner = model_runner_cls(self.model_runner) - - # Uninitialized cache engine. Will be initialized by - # initialize_cache. - self.cache_engine: List[CacheEngine] - # Initialize gpu_cache as embedding models don't initialize kv_caches - self.gpu_cache: Optional[List[List[torch.Tensor]]] = None - self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} - - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace - if envs.VLLM_TORCH_PROFILER_DIR: - # lazy import so that torch_npu is not required for normal use. - import torch_npu - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR - logger.info("Profiling enabled. 
Traces will be saved to: %s", - torch_profiler_trace_dir) - - experimental_config = torch_npu.profiler._ExperimentalConfig( - export_type=torch_npu.profiler.ExportType.Text, - profiler_level=torch_npu.profiler.ProfilerLevel.Level1, - msprof_tx=False, - aic_metrics=torch_npu.profiler.AiCMetrics.AiCoreNone, - l2_cache=False, - op_attr=False, - data_simplification=False, - record_op_args=False, - gc_detect_threshold=None, - ) - - self.profiler = torch_npu.profiler.profile( - activities=[ - torch_npu.profiler.ProfilerActivity.CPU, - torch_npu.profiler.ProfilerActivity.NPU, - ], - with_stack=False, - profile_memory=False, - with_modules=False, - experimental_config=experimental_config, - on_trace_ready=torch_npu.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir)) - else: - self.profiler = None - - self.enable_dummy_run = False - if os.getenv("VLLM_DP_PROXY_IP", None): - logger.warning("enable dummy run for the DP") - self.enable_dummy_run = True - # dp_rank = os.environ["VLLM_DP_RANK"] - dp_master_ip = os.environ["VLLM_DP_PROXY_IP"] - dp_proxy_listener_port = os.environ["VLLM_DP_PROXY_PORT"] - dp_proxy_monitor_port = os.environ["VLLM_DP_MONITOR_PORT"] - dp_proxy_listener_addr = f"{dp_master_ip}:{dp_proxy_listener_port}" - self.dp_proxy_monitor_addr = f"{dp_master_ip}:{dp_proxy_monitor_port}" - http_ip = get_ip() - port = os.environ["VLLM_HTTP_PORT"] - self.http_addr = f"{http_ip}:{port}" - context = zmq.Context() # type: ignore - sock = context.socket(zmq.DEALER) # type: ignore - - logger.debug("ping dp proxy start, DP_RANK:%s", 0) - # logger.debug("ping dp proxy start, DP_RANK:%s", dp_rank) - - sock.connect(f"tcp://{dp_proxy_listener_addr}") - data = {"type": "DP", "http_address": self.http_addr} - for _ in range(10): - sock.send(msgpack.dumps(data)) - - self.notify_socket = context.socket(zmq.PUSH) # type: ignore - self.notify_socket.connect(f"tcp://{self.dp_proxy_monitor_addr}") - - def sleep(self, level: int = 1) -> None: - NPUPlatform.set_device(self.device) - free_bytes_before_sleep = NPUPlatform.mem_get_info()[0] - allocator = CaMemAllocator.get_instance() - allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) - free_bytes_after_sleep, total = NPUPlatform.mem_get_info() - freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep - used_bytes = total - free_bytes_after_sleep - assert freed_bytes >= 0, "Memory usage increased after sleeping." - logger.info( - "Sleep mode freed %.2f GiB memory, " - "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes, - used_bytes / GiB_bytes) - - def wake_up(self, tags: Optional[list[str]] = None) -> None: - allocator = CaMemAllocator.get_instance() - allocator.wake_up(tags=tags) - - def init_device(self) -> None: - if self.device_config.device.type == "npu": - self.device = torch.device(f"npu:{self.local_rank}") - NPUPlatform.set_device(self.device) - NPUPlatform.empty_cache() - self.init_npu_memory = NPUPlatform.mem_get_info()[0] - else: - raise RuntimeError( - f"Not support device type: {self.device_config.device}") - # Initialize the distributed environment. - self._init_worker_distributed_environment(self.vllm_config, self.rank, - self.distributed_init_method, - self.local_rank) - # Set random seed. 
- set_random_seed(self.model_config.seed)
-
- def load_model(self):
- if self.vllm_config.model_config.enable_sleep_mode:
- allocator = CaMemAllocator.get_instance()
- assert allocator.get_current_usage() == 0, (
- "Sleep mode can only be "
- "used for one instance per process.")
- context = allocator.use_memory_pool(tag="weights")
- else:
- from contextlib import nullcontext
- context = nullcontext() # type: ignore
- with context:
- self.model_runner.load_model()
-
- def start_profile(self):
- if self.profiler is None:
- raise RuntimeError("Profiler is not enabled.")
- self.profiler.start()
-
- def stop_profile(self):
- if self.profiler is None:
- raise RuntimeError("Profiler is not enabled.")
- self.profiler.stop()
-
- def save_sharded_state(
- self,
- path: str,
- pattern: Optional[str] = None,
- max_size: Optional[int] = None,
- ) -> None:
- self.model_runner.save_sharded_state(
- path,
- pattern=pattern,
- max_size=max_size,
- )
-
- def save_tensorized_model(
- self,
- tensorizer_config: TensorizerConfig,
- ) -> None:
- self.model_runner.save_tensorized_model(
- tensorizer_config=tensorizer_config, )
-
- @NPUPlatform.inference_mode()
- def determine_num_available_blocks(self) -> Tuple[int, int]:
- """Profiles the peak memory usage of the model to determine how many
- KV blocks may be allocated without OOMs.
- The engine first profiles the existing memory usage.
- Then, it calculates the maximum possible number of NPU and CPU blocks
- that can be allocated with the remaining free memory.
- .. tip::
- You may limit the usage of NPU memory
- by adjusting the `gpu_memory_utilization` parameter.
- """
- # Profile the memory usage of the model and get the maximum number of
- # cache blocks that can be allocated with the remaining free memory.
- NPUPlatform.empty_cache()
-
- # Execute a forward pass with dummy inputs to profile the memory usage
- # of the model.
- self.model_runner.profile_run()
-
- # Calculate the number of blocks that can be allocated with the
- # profiled peak memory.
- free_npu_memory, total_npu_memory = NPUPlatform.mem_get_info()
- # NOTE(woosuk): Here we assume that the other processes using the same
- # GPU did not change their memory usage during the profiling.
- peak_memory = self.init_npu_memory - free_npu_memory
- assert peak_memory > 0, (
- "Error in memory profiling. "
- f"Initial free memory {self.init_npu_memory}, current free memory"
- f" {free_npu_memory}. This happens when the NPU memory was "
- "not properly cleaned up before initializing the vLLM instance.")
-
- cache_block_size = self.get_cache_block_size_bytes()
- num_npu_blocks = int(
- (total_npu_memory * self.cache_config.gpu_memory_utilization -
- peak_memory) // cache_block_size)
- num_cpu_blocks = int(self.cache_config.swap_space_bytes //
- cache_block_size)
- num_npu_blocks = max(num_npu_blocks, 0)
- num_cpu_blocks = max(num_cpu_blocks, 0)
- gc.collect()
- # TODO: this function is no longer needed once empty_cache in
- # Worker.determine_num_available_blocks() is unified
- NPUPlatform.empty_cache()
- return num_npu_blocks, num_cpu_blocks
-
- def initialize_cache(self, num_gpu_blocks: int,
- num_cpu_blocks: int) -> None:
- """Allocate NPU and CPU KV cache with the specified number of blocks.
- """ - raise_if_cache_size_invalid(num_gpu_blocks, - self.cache_config.block_size, - self.cache_config.is_attention_free, - self.model_config.max_model_len) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - if self.vllm_config.model_config.enable_sleep_mode: - allocator = CaMemAllocator.get_instance() - context = allocator.use_memory_pool(tag="kv_cache") - else: - from contextlib import nullcontext - context = nullcontext() # type: ignore - with context: - with set_current_vllm_config(self.vllm_config): - self._init_cache_engine() - self._warm_up_model() - - def _init_cache_engine(self): - assert self.cache_config.num_gpu_blocks is not None - self.cache_engine = [ - CacheEngine(self.cache_config, self.model_config, - self.parallel_config, self.device_config) - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - import torch_npu - acl_format = ACL_FORMAT_FRACTAL_NZ if is_310p( - ) else ACL_FORMAT_FRACTAL_ND - for ve in range(self.parallel_config.pipeline_parallel_size): - num_layers = len(self.cache_engine[ve].gpu_cache) - for i in range(num_layers): - if torch.is_tensor(self.cache_engine[ve].gpu_cache[i]): - self.cache_engine[ve].gpu_cache[ - i] = torch_npu.npu_format_cast( - self.cache_engine[ve].gpu_cache[i], acl_format) - else: - self.cache_engine[ve].gpu_cache[i][ - 0] = torch_npu.npu_format_cast( - self.cache_engine[ve].gpu_cache[i][0], acl_format) - self.cache_engine[ve].gpu_cache[i][ - 1] = torch_npu.npu_format_cast( - self.cache_engine[ve].gpu_cache[i][1], acl_format) - self.gpu_cache = [ - self.cache_engine[ve].gpu_cache - for ve in range(self.parallel_config.pipeline_parallel_size) - ] - bind_kv_cache(self.compilation_config.static_forward_context, - self.gpu_cache) - - def _warm_up_model(self) -> None: - # model capture is not supported, thus we just set seed here. - # Reset the seed to ensure that the random state is not affected by - # the model initialization and profiling. - set_random_seed(self.model_config.seed) - - @property - def do_metadata_broadcast(self) -> bool: - return self.parallel_config.tensor_parallel_size > 1 - - @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - return self.gpu_cache - - @torch.inference_mode() - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - virtual_engine = execute_model_req.virtual_engine - num_steps = execute_model_req.num_steps - num_seq_groups = len(execute_model_req.seq_group_metadata_list) - # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. - # they contain parameters to launch cudamemcpyasync. - blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, - device="cpu", - dtype=torch.int64).view(-1, 2) - blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, - device="cpu", - dtype=torch.int64).view(-1, 2) - # `blocks_to_copy` is a gpu tensor. The src and tgt of - # blocks to copy are in the same device, and `blocks_to_copy` - # can be used directly within cuda kernels. 
- blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device=self.device, - dtype=torch.int64).view(-1, 2) - - return WorkerInput( - num_seq_groups=num_seq_groups, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - virtual_engine=virtual_engine, - num_steps=num_steps, - ) - - def get_model(self) -> nn.Module: - return self.model_runner.get_model() - - @torch.inference_mode() - def execute_worker(self, worker_input: WorkerInput) -> None: - if self.enable_dummy_run: - logger.debug( - f"send notify to the dp proxy: {self.dp_proxy_monitor_addr}") - data = {"info": "notify_step", "http_address": self.http_addr} - self.notify_socket.send(msgpack.dumps(data)) - virtual_engine = worker_input.virtual_engine - # Issue cache operations. - if (worker_input.blocks_to_swap_in is not None - and worker_input.blocks_to_swap_in.numel() > 0): - self.cache_engine[virtual_engine].swap_in( - worker_input.blocks_to_swap_in) - if (worker_input.blocks_to_swap_out is not None - and worker_input.blocks_to_swap_out.numel() > 0): - self.cache_engine[virtual_engine].swap_out( - worker_input.blocks_to_swap_out) - if (worker_input.blocks_to_copy is not None - and worker_input.blocks_to_copy.numel() > 0): - self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) - - def _get_cached_seq_group_metadata( - self, - seq_group_metadata_list: List[Union[SequenceGroupMetadata, - SequenceGroupMetadataDelta]], - finished_request_ids: List[str]) -> List[SequenceGroupMetadata]: - """Return a list of cached Sequence Group Metadata after updating its - state. - - It is used because scheduler only sends delta to workers to reduce - the data payload size. The function also cleans up cache based on - a given `finished_request_ids`. - """ - new_seq_group_metadata_list = [] - for metadata_or_delta in seq_group_metadata_list: - request_id = metadata_or_delta.request_id - if request_id not in self._seq_group_metadata_cache: - # The first prefill. - assert isinstance(metadata_or_delta, SequenceGroupMetadata) - self._seq_group_metadata_cache[request_id] = metadata_or_delta - else: - # The first prefill is already cached. - if isinstance(metadata_or_delta, SequenceGroupMetadataDelta): - self._seq_group_metadata_cache[request_id].apply_delta( - metadata_or_delta) - else: - # If metadata snapshot is sent again, it is - # preempted. Reset the cache because we need to start - # from scratch. 
- assert isinstance(metadata_or_delta, SequenceGroupMetadata) - self._seq_group_metadata_cache[ - request_id] = metadata_or_delta - - new_seq_group_metadata_list.append( - self._seq_group_metadata_cache[request_id]) - - # Clean up finished ids - for finished_id in finished_request_ids: - del self._seq_group_metadata_cache[finished_id] - - return new_seq_group_metadata_list - - def _execute_model_spmd( - self, - execute_model_req: ExecuteModelRequest, - intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> Optional[List[SamplerOutput]]: - if execute_model_req is not None: - new_seq_group_metadata_list = self._get_cached_seq_group_metadata( - execute_model_req.seq_group_metadata_list, - execute_model_req.finished_requests_ids) - - execute_model_req.seq_group_metadata_list = ( - new_seq_group_metadata_list) - output = super()._execute_model_spmd(execute_model_req, - intermediate_tensors) - return output - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.model_runner.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.model_runner.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - return self.model_runner.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.model_runner.list_loras() - - def add_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise NotImplementedError( - "Prompt Adapter is not implemented for NPU backend currently.") - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Prompt Adapter is not implemented for NPU backend currently.") - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Prompt Adapter is not implemented for NPU backend currently.") - - def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError( - "Prompt Adapter is not implemented for NPU backend currently.") - - @property - def max_model_len(self) -> int: - return self.model_config.max_model_len - - @property - def vocab_size(self) -> int: - return self.model_runner.vocab_size - - def get_cache_block_size_bytes(self) -> int: - """Get the size of the KV cache block size in bytes. 
- """ - return CacheEngine.get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) - - def _init_worker_distributed_environment( - self, - vllm_config: VllmConfig, - rank: int, - distributed_init_method: Optional[str] = None, - local_rank: int = -1, - backend: str = "hccl") -> None: - """Initialize the distributed environment.""" - parallel_config = self.parallel_config - set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) - init_distributed_environment(parallel_config.world_size, rank, - distributed_init_method, local_rank, - backend) - ensure_model_parallel_initialized( - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) - init_ascend_model_parallel( - parallel_config.expert_parallel_size, - parallel_config.expert_tensor_parallel_size, - parallel_config.world_size_across_dp, - ) - ensure_kv_transfer_initialized(vllm_config) - - -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free, - max_model_len) -> None: - if is_attention_free and num_gpu_blocks != 0: - raise ValueError("No memory should be allocated for the cache blocks " - f"for an attention-free model, but {num_gpu_blocks}" - "blocks are allocated.") - if not is_attention_free and num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = block_size * num_gpu_blocks - if not is_attention_free and max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.")