diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index ec23fcef1a..ffdf72b471 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -213,26 +213,6 @@ jobs: # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py - - name: Run e2e test on V0 engine - if: ${{ github.event_name == 'schedule' }} - env: - VLLM_USE_V1: 0 - VLLM_USE_MODELSCOPE: True - run: | - pytest -sv tests/e2e/singlecard/test_offline_inference.py - pytest -sv tests/e2e/singlecard/test_ilama_lora.py - pytest -sv tests/e2e/singlecard/test_guided_decoding.py - pytest -sv tests/e2e/singlecard/test_camem.py - pytest -sv tests/e2e/singlecard/test_prompt_embedding.py - pytest -sv tests/e2e/singlecard/test_embedding.py - pytest -sv tests/e2e/singlecard/ \ - --ignore=tests/e2e/singlecard/test_offline_inference.py \ - --ignore=tests/e2e/singlecard/test_ilama_lora.py \ - --ignore=tests/e2e/singlecard/test_guided_decoding.py \ - --ignore=tests/e2e/singlecard/test_camem.py \ - --ignore=tests/e2e/singlecard/test_prompt_embedding.py \ - --ignore=tests/e2e/singlecard/test_embedding.py - e2e-4-cards: needs: [e2e] if: ${{ needs.e2e.result == 'success' }} @@ -308,19 +288,3 @@ jobs: pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \ --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \ --ignore=tests/e2e/multicard/test_data_parallel.py - - - name: Run vllm-project/vllm-ascend test on V0 engine - if: ${{ github.event_name == 'schedule' }} - env: - VLLM_USE_V1: 0 - VLLM_USE_MODELSCOPE: True - run: | - pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py - # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error. - # To avoid oom, we need to run the test in a single process. 
- pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 - pytest -sv tests/e2e/multicard/test_data_parallel.py - pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \ - --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \ - --ignore=tests/e2e/multicard/test_data_parallel.py diff --git a/.gitignore b/.gitignore index ef8fc873a6..da7300fca1 100644 --- a/.gitignore +++ b/.gitignore @@ -198,3 +198,4 @@ kernel_meta/ /vllm_ascend/_version.py # build info file generated by setup.py /vllm_ascend/_build_info.py +/vllm_ascend/include/ diff --git a/examples/disaggregated_prefill/run_decode_server.sh b/examples/disaggregated_prefill/run_decode_server.sh index a3bbaa189f..8dbbf5d634 100644 --- a/examples/disaggregated_prefill/run_decode_server.sh +++ b/examples/disaggregated_prefill/run_decode_server.sh @@ -6,8 +6,6 @@ export HCCL_SOCKET_IFNAME="enp189s0f0" export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=0 - export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 diff --git a/examples/disaggregated_prefill/run_prefill_server.sh b/examples/disaggregated_prefill/run_prefill_server.sh index dc929f8a49..341a4feea9 100644 --- a/examples/disaggregated_prefill/run_prefill_server.sh +++ b/examples/disaggregated_prefill/run_prefill_server.sh @@ -6,8 +6,6 @@ export HCCL_SOCKET_IFNAME="enp189s0f0" export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=0 - export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 diff --git a/examples/offline_inference_npu.py b/examples/offline_inference_npu.py index 3e88c00176..499c09a647 100644 --- a/examples/offline_inference_npu.py +++ b/examples/offline_inference_npu.py @@ -34,7 +34,7 @@ # Create a sampling params object. sampling_params = SamplingParams(max_tokens=100, temperature=0.0) # Create an LLM. -llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct") +llm = LLM(model="/shared/cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct") # Generate texts from the prompts. outputs = llm.generate(prompts, sampling_params) diff --git a/examples/offline_inference_npu_v0.py b/examples/offline_inference_npu_v0.py deleted file mode 100644 index b6a1156e43..0000000000 --- a/examples/offline_inference_npu_v0.py +++ /dev/null @@ -1,44 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# Adapted from vllm-project/vllm/examples/offline_inference/basic.py -# - -import os - -os.environ["VLLM_USE_V1"] = "0" -os.environ["VLLM_USE_MODELSCOPE"] = "True" - -from vllm import LLM, SamplingParams - -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] - -# Create a sampling params object. 
-sampling_params = SamplingParams(max_tokens=100, temperature=0.0) -# Create an LLM. -llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct") - -# Generate texts from the prompts. -outputs = llm.generate(prompts, sampling_params) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/run_dp_server.sh b/examples/run_dp_server.sh index e2bf4c8158..0f271d1352 100644 --- a/examples/run_dp_server.sh +++ b/examples/run_dp_server.sh @@ -6,8 +6,6 @@ export HCCL_SOCKET_IFNAME="enp189s0f0" export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=0 - export ASCEND_RT_VISIBLE_DEVICES=0,1 export VLLM_DP_SIZE=2 export VLLM_DP_RANK=0 diff --git a/tests/e2e/multicard/test_prefix_caching.py b/tests/e2e/multicard/test_prefix_caching.py index 368d3ff953..2b07718e77 100644 --- a/tests/e2e/multicard/test_prefix_caching.py +++ b/tests/e2e/multicard/test_prefix_caching.py @@ -60,8 +60,6 @@ ] -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="mtp is not supported on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [50]) def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None: @@ -89,8 +87,6 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None: ) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="mtp is not supported on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [50]) def test_prefix_cache_with_ascend_scheduler(model: str, diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py index ce628f9d35..26d908c7f5 100644 --- a/tests/e2e/multicard/test_torchair_graph_mode.py +++ b/tests/e2e/multicard/test_torchair_graph_mode.py @@ -78,8 +78,6 @@ def _deepseek_torchair_test_fixture( print(f"Generated text: {vllm_output[i][1]!r}") -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="torchair graph is not supported on v0") def test_e2e_deepseekv3_with_torchair(): additional_config = { "torchair_graph_config": { @@ -89,8 +87,6 @@ def test_e2e_deepseekv3_with_torchair(): _deepseek_torchair_test_fixture(additional_config) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="torchair graph is not supported on v0") def test_e2e_deepseekv3_with_torchair_ms_mla(): additional_config = { "torchair_graph_config": { @@ -150,8 +146,6 @@ def _pangu_torchair_test_fixture( print(f"Generated text: {vllm_output[i][1]!r}") -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="torchair graph is not supported on v0") def test_e2e_pangu_with_torchair(): additional_config = { "torchair_graph_config": { diff --git a/tests/e2e/pd_disaggreate/setup_pd.sh b/tests/e2e/pd_disaggreate/setup_pd.sh deleted file mode 100644 index c15f109299..0000000000 --- a/tests/e2e/pd_disaggreate/setup_pd.sh +++ /dev/null @@ -1,136 +0,0 @@ -#!/bin/bash - -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -function run_prefill_instance() { - local model_name=$1 - local tp_size=$2 - local prefill_port=$3 - local register_port=$4 - local prefill_device_ips=$5 - local decode_device_ips=$6 - - echo "================================" - echo "Testing model: $model_name" - echo "================================" - # Start prefill instance - - KV_CONFIG=$(jq -n \ - --arg kv_connector "AscendSimpleConnector" \ - --arg kv_buffer_device "npu" \ - --arg kv_role "kv_producer" \ - --argjson kv_parallel_size 8 \ - --arg kv_port 11001 \ - --argjson prefill_device_ips "$prefill_device_ips" \ - --argjson decode_device_ips "$decode_device_ips" \ - --argjson llmdatadist_comm_port 26000 \ - --arg proxy_ip "0.0.0.0" \ - --argjson proxy_port "$register_port" \ - --argjson http_port "$prefill_port" \ - '{ - "kv_connector": $kv_connector, - "kv_buffer_device": $kv_buffer_device, - "kv_role": $kv_role, - "kv_parallel_size": $kv_parallel_size, - "kv_port": $kv_port, - "kv_connector_extra_config": { - "prefill_device_ips": $prefill_device_ips, - "decode_device_ips": $decode_device_ips, - "llmdatadist_comm_port": $llmdatadist_comm_port, - "proxy_ip": $proxy_ip, - "proxy_port": $proxy_port, - "http_port": $http_port - } - }') - - # start prefill instance - ASCEND_RT_VISIBLE_DEVICES=0 vllm serve $model_name \ - --host 0.0.0.0 \ - --port $prefill_port \ - --tensor-parallel-size $tp_size \ - --served-model-name Deepseek \ - --max-model-len 2000 \ - --trust-remote-code \ - --enforce-eager \ - --kv-transfer-config "$KV_CONFIG" -} - - - -function run_decode_instance() { - # Start decode instance - local model_name=$1 - local tp_size=$2 - local decode_port=$3 - local register_port=$4 - local prefill_device_ips=$5 - local decode_device_ips=$6 - - KV_CONFIG=$(jq -n \ - --arg kv_connector "AscendSimpleConnector" \ - --arg kv_buffer_device "npu" \ - --arg kv_role "kv_consumer" \ - --argjson kv_parallel_size 8 \ - --arg kv_port 21001 \ - --argjson prefill_device_ips "$prefill_device_ips" \ - --argjson decode_device_ips "$decode_device_ips" \ - --argjson llmdatadist_comm_port 26000 \ - --arg proxy_ip "0.0.0.0" \ - --argjson proxy_port "$register_port" \ - --argjson http_port "$decode_port" \ - '{ - "kv_connector": $kv_connector, - "kv_buffer_device": $kv_buffer_device, - "kv_role": $kv_role, - "kv_parallel_size": $kv_parallel_size, - "kv_port": $kv_port, - "kv_connector_extra_config": { - "prefill_device_ips": $prefill_device_ips, - "decode_device_ips": $decode_device_ips, - "llmdatadist_comm_port": $llmdatadist_comm_port, - "proxy_ip": $proxy_ip, - "proxy_port": $proxy_port, - "http_port": $http_port - } - }') - - # start decode instance - ASCEND_RT_VISIBLE_DEVICES=1 vllm serve $model_name \ - --host 0.0.0.0 \ - --port $decode_port \ - --tensor-parallel-size $tp_size \ - --seed 1024 \ - --served-model-name Deepseek \ - --max-model-len 2000 \ - --max-num-batched-tokens 2000 \ - --trust-remote-code \ - --gpu-memory-utilization 0.9 \ - --enforce-eager \ - --kv-transfer-config "$KV_CONFIG" -} - -function run_proxy_server() { - # Build the command for the proxy server with all the hosts and ports - register_port=$1 - proxy_port=$2 - PROXY_CMD="python examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py --http-port $proxy_port --register-port $register_port" - - # Start the proxy server - echo "Starting proxy server with command: $PROXY_CMD" - $PROXY_CMD & -} diff 
--git a/tests/e2e/pd_disaggreate/test_pd_e2e.py b/tests/e2e/pd_disaggreate/test_pd_e2e.py deleted file mode 100644 index 5fd923211c..0000000000 --- a/tests/e2e/pd_disaggreate/test_pd_e2e.py +++ /dev/null @@ -1,109 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -import os -import signal -import subprocess -import time - -import psutil -import requests - - -def kill_process_and_children(pid): - try: - parent = psutil.Process(pid) - children = parent.children(recursive=True) - for child in children: - print(f"Killing child process {child.pid}") - child.kill() - print(f"Killing parent process {pid}") - parent.kill() - except psutil.NoSuchProcess: - pass - - -def kill_all_vllm_related(): - current_pid = os.getpid() - - for proc in psutil.process_iter(['pid', 'cmdline']): - try: - if proc.pid == current_pid: - continue - cmd = ' '.join(proc.info['cmdline']) - if "vllm" in cmd or "proxy" in cmd or "engine_worker" in cmd: - kill_process_and_children(proc.pid) - except Exception: - continue - - -PROXY_PORT = 10102 -DECODE_PORT = 8002 - -SCRIPT_PATH = os.path.abspath("./tests/e2e/run_disagg_pd.sh") - - -def wait_for_port(port, timeout=30): - import socket - start = time.time() - while time.time() - start < timeout: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - if sock.connect_ex(("127.0.0.1", port)) == 0: - return True - time.sleep(1) - raise TimeoutError(f"Port {port} not ready after {timeout}s") - - -def start_and_test_pipeline(): - print("Launching bash script to run vLLM PD setup...") - proc = subprocess.Popen(["bash", SCRIPT_PATH]) - try: - print("Waiting for proxy port to be available...") - wait_for_port(PROXY_PORT, 180) - wait_for_port(DECODE_PORT, 600) - - # request - payload = { - "model": "Deepseek", - "prompt": "The future of AI is", - "max_tokens": 64, - "temperature": 0, - } - response = requests.post( - f"http://localhost:{PROXY_PORT}/v1/completions", - headers={"Content-Type": "application/json"}, - json=payload, - timeout=10) - assert response.status_code == 200, f"HTTP failed: {response.status_code}" - result = response.json() - print("Response:", result) - assert "text" in result["choices"][0] - assert len(result["choices"][0]["text"].strip()) > 0 - - finally: - # clean up subprocesses - print("Cleaning up subprocess...") - proc.send_signal(signal.SIGINT) - try: - proc.wait(timeout=10) - except subprocess.TimeoutExpired: - proc.kill() - kill_all_vllm_related() - - -def test_disaggregated_pd_pipeline(): - start_and_test_pipeline() diff --git a/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py b/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py index 17116ab59a..2d8689d5ae 100644 --- a/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py +++ b/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py @@ -7,9 +7,6 @@ import torch from vllm import LLM 
-if os.getenv("VLLM_USE_V1", "0") != "1": - pytest.skip("Test package requires V1", allow_module_level=True) - MODEL = "Qwen/Qwen2.5-0.5B-Instruct" PROMPT = "Hello my name is Robert and I" diff --git a/tests/e2e/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph.py index 89dfa08e41..605384ae0a 100644 --- a/tests/e2e/singlecard/test_aclgraph.py +++ b/tests/e2e/singlecard/test_aclgraph.py @@ -36,8 +36,6 @@ ] -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="aclgraph only support on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [32]) def test_models( @@ -86,8 +84,6 @@ def test_models( ) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="aclgraph only support on v1") def test_deepseek_raises_error(monkeypatch: pytest.MonkeyPatch) -> None: with monkeypatch.context() as m: m.setenv("VLLM_USE_MODELSCOPE", "True") diff --git a/tests/e2e/singlecard/test_chunked.py b/tests/e2e/singlecard/test_chunked.py index 2240b88e2c..b185671500 100644 --- a/tests/e2e/singlecard/test_chunked.py +++ b/tests/e2e/singlecard/test_chunked.py @@ -29,8 +29,6 @@ MODELS = ["deepseek-ai/DeepSeek-V2-Lite"] -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="new chunked only support on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [1]) def test_models( diff --git a/tests/e2e/singlecard/test_guided_decoding.py b/tests/e2e/singlecard/test_guided_decoding.py index 9d103a5308..e8c33a49ea 100644 --- a/tests/e2e/singlecard/test_guided_decoding.py +++ b/tests/e2e/singlecard/test_guided_decoding.py @@ -30,10 +30,8 @@ os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct" -GuidedDecodingBackendV0 = ["outlines", "lm-format-enforcer", "xgrammar"] GuidedDecodingBackendV1 = ["xgrammar", "guidance"] -GuidedDecodingBackend = list( - set(GuidedDecodingBackendV0 + GuidedDecodingBackendV1)) +GuidedDecodingBackend = GuidedDecodingBackendV1 @pytest.fixture(scope="module") @@ -85,9 +83,6 @@ def sample_json_schema(): def check_backend(guided_decoding_backend: str): - if guided_decoding_backend not in GuidedDecodingBackendV0 and os.getenv( - "VLLM_USE_V1") == "0": - pytest.skip(f"{guided_decoding_backend} does not support v0, skip it.") if guided_decoding_backend not in GuidedDecodingBackendV1 and os.getenv( "VLLM_USE_V1") == "1": pytest.skip(f"{guided_decoding_backend} does not support v1, skip it.") diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py index a123790dbd..cfaec1bfd3 100644 --- a/tests/ut/test_ascend_config.py +++ b/tests/ut/test_ascend_config.py @@ -193,26 +193,6 @@ def test_check_ascend_config_pass(self): @_clean_up_ascend_config def test_check_ascend_config_wrong_case(self): test_vllm_config = VllmConfig() - # For V0 engine - with mock.patch.dict(os.environ, {"VLLM_USE_V1": "0"}): - with self.assertRaises(NotImplementedError): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, False) - with self.assertRaises(NotImplementedError): - test_vllm_config.additional_config = { - "ascend_scheduler_config": { - "enabled": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, True) # For V1 engine with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}): # torchair + eager mode diff --git a/tests/ut/test_platform.py 
b/tests/ut/test_platform.py index c09964a745..72286236b9 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -416,24 +416,6 @@ def test_check_and_update_config_speculative_worker_config( "vllm_ascend.worker.worker.NPUWorker", ) - @patch("vllm_ascend.ascend_config.check_ascend_config") - @patch("vllm_ascend.ascend_config.init_ascend_config") - @patch("vllm.envs.VLLM_USE_V1", False) - def test_check_and_update_config_multi_step_worker_config( - self, mock_init_ascend, mock_check_ascend): - mock_init_ascend.return_value = self.mock_ascend_config - self.mock_vllm_config.scheduler_config.is_multi_step = True - self.mock_vllm_config.parallel_config.worker_cls = "auto" - - from vllm_ascend import platform - - importlib.reload(platform) - self.platform.check_and_update_config(self.mock_vllm_config) - self.assertEqual( - self.mock_vllm_config.parallel_config.worker_cls, - "vllm_ascend.worker.multi_step_worker.MultiStepWorker", - ) - @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm.envs.VLLM_USE_V1", False) diff --git a/tests/ut/worker/test_pooling_model_runner.py b/tests/ut/worker/test_pooling_model_runner.py deleted file mode 100644 index 28a0a7d3c6..0000000000 --- a/tests/ut/worker/test_pooling_model_runner.py +++ /dev/null @@ -1,355 +0,0 @@ -import unittest -from unittest.mock import MagicMock, patch - -import torch -from vllm.distributed.parallel_state import GroupCoordinator -from vllm.engine.arg_utils import EngineArgs -from vllm.pooling_params import PoolingParams -from vllm.sequence import SequenceData, SequenceGroupMetadata - -from vllm_ascend.worker.pooling_model_runner import ( - ModelInputForNPUWithPoolingMetadata, NPUPoolingModelRunner) - - -class TestPoolingModelRunner(unittest.TestCase): - """Unit tests for the NPUPoolingModelRunner class.""" - - def _create_model_runner(self, model: str, *args, - **kwargs) -> NPUPoolingModelRunner: - engine_args = EngineArgs(model, *args, **kwargs) - engine_config = engine_args.create_engine_config() - model_runner = NPUPoolingModelRunner(vllm_config=engine_config, ) - return model_runner - - def setUp(self): - """Initialize test fixtures and common mocks""" - self.attn_backend = "npu" - - model_runner = self._create_model_runner( - "tests/ut/fake_weight", - trust_remote_code=True, - enable_chunked_prefill=False, - ) - - self.runner = model_runner - self.runner.attn_backend = self.attn_backend - model_runner.model = MagicMock() - self.runner = model_runner - # Sample test data - self.sample_tensor_dict = {"tensor1": torch.randn(3, 4)} - self.sample_seq_group = [MagicMock(spec=SequenceGroupMetadata)] - self.sample_finished_ids = ["req1", "req2"] - - @patch( - 'vllm_ascend.worker.pooling_model_runner.ModelInputForNPUWithPoolingMetadata.from_broadcasted_tensor_dict' - ) - def test_make_model_input_from_broadcasted_tensor_dict( - self, mock_from_dict): - """Test tensor dictionary conversion to model input""" - # Setup mock return - expected_output = MagicMock() - mock_from_dict.return_value = expected_output - - # Execute - result = self.runner.make_model_input_from_broadcasted_tensor_dict( - self.sample_tensor_dict) - - # Verify - mock_from_dict.assert_called_once_with(self.sample_tensor_dict, - attn_backend=self.attn_backend) - self.assertEqual(result, expected_output) - - @patch.object(NPUPoolingModelRunner, '_prepare_pooling') - @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors') - def test_prepare_model_input_normal_case(self, 
mock_prepare_tensors, - mock_prepare_pooling): - """Test normal flow of model input preparation""" - # Setup mocks - mock_model_input = ModelInputForNPUWithPoolingMetadata( - seq_lens=[1, 2, 3]) - mock_prepare_tensors.return_value = mock_model_input - - mock_pooling_metadata = MagicMock() - mock_prepare_pooling.return_value = mock_pooling_metadata - - # Execute - result = self.runner.prepare_model_input( - seq_group_metadata_list=self.sample_seq_group, - finished_requests_ids=self.sample_finished_ids) - - # Verify - mock_prepare_tensors.assert_called_once_with(self.sample_seq_group, - self.sample_finished_ids) - mock_prepare_pooling.assert_called_once_with(self.sample_seq_group, - mock_model_input.seq_lens) - self.assertEqual(result.pooling_metadata, mock_pooling_metadata) - - def test_prepare_model_input_null_sequence_group(self): - """Test assertion when seq_group_metadata_list is None""" - with self.assertRaises(AssertionError): - self.runner.prepare_model_input( - seq_group_metadata_list=None, - finished_requests_ids=self.sample_finished_ids) - - @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors') - def test_prepare_model_input_null_seq_lens(self, mock_prepare_tensors): - """Test assertion when seq_lens is None in model input""" - # Setup mock with None seq_lens - mock_model_input = MagicMock() - mock_model_input.seq_lens = None - mock_prepare_tensors.return_value = mock_model_input - - with self.assertRaises(AssertionError): - self.runner.prepare_model_input( - seq_group_metadata_list=self.sample_seq_group, - finished_requests_ids=self.sample_finished_ids) - - @patch.object(NPUPoolingModelRunner, '_prepare_pooling') - @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors') - def test_prepare_model_input_with_virtual_engine(self, - mock_prepare_tensors, - mock_prepare_pooling): - """Test virtual engine parameter is properly handled""" - # Setup mocks - mock_model_input = ModelInputForNPUWithPoolingMetadata( - seq_lens=[1, 2, 3]) - mock_prepare_tensors.return_value = mock_model_input - - # Execute with virtual_engine parameter - result = self.runner.prepare_model_input( - seq_group_metadata_list=self.sample_seq_group, - virtual_engine=1, - finished_requests_ids=self.sample_finished_ids) - - # Verify virtual_engine doesn't affect the flow - self.assertIsNotNone(result) - - @patch.object(NPUPoolingModelRunner, '_prepare_pooling') - @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors') - def test_prepare_model_input_with_null_finished_ids( - self, mock_prepare_tensors, mock_prepare_pooling): - """Test case when finished_requests_ids is None""" - # Setup mocks - mock_model_input = ModelInputForNPUWithPoolingMetadata( - seq_lens=[1, 2, 3]) - mock_prepare_tensors.return_value = mock_model_input - - # Execute with None finished_ids - result = self.runner.prepare_model_input( - seq_group_metadata_list=self.sample_seq_group, - finished_requests_ids=None) - - # Verify - mock_prepare_tensors.assert_called_once_with(self.sample_seq_group, - None) - self.assertIsNotNone(result) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_normal_case(self, mock_pooling_metadata): - """Test normal case with multiple sequences in group""" - # Setup test data - mock_pooling_metadata.return_value = None - seq_data = { - 1: MagicMock(spec=SequenceData), - 2: MagicMock(spec=SequenceData) - } - pooling_params = MagicMock(spec=PoolingParams) - seq_group = MagicMock(spec=SequenceGroupMetadata) - seq_group.seq_data = 
seq_data - seq_group.pooling_params = pooling_params - - # Call the function - self.runner._prepare_pooling([seq_group], [10, 20]) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[ - ([1, 2], pooling_params) - ], - seq_data=seq_data, - prompt_lens=[10, 20]) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_empty_group(self, mock_pooling_metadata): - """Test case with empty sequence group""" - # Setup empty group - mock_pooling_metadata.return_value = None - empty_seq_data: dict[int, SequenceData] = {} - pooling_params = MagicMock(spec=PoolingParams) - empty_group = MagicMock(spec=SequenceGroupMetadata) - empty_group.seq_data = empty_seq_data - empty_group.pooling_params = pooling_params - - # Call the function - self.runner._prepare_pooling([empty_group], []) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[ - ([], pooling_params) - ], - seq_data={}, - prompt_lens=[]) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_single_sequence(self, mock_pooling_metadata): - """Test case with single sequence in group""" - # Setup single sequence - mock_pooling_metadata.return_value = None - single_seq_data = {3: MagicMock(spec=SequenceData)} - pooling_params = MagicMock(spec=PoolingParams) - single_group = MagicMock(spec=SequenceGroupMetadata) - single_group.seq_data = single_seq_data - single_group.pooling_params = pooling_params - - # Call the function - self.runner._prepare_pooling([single_group], [5]) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[ - ([3], pooling_params) - ], - seq_data=single_seq_data, - prompt_lens=[5]) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_multiple_groups(self, mock_pooling_metadata): - """Test case with multiple sequence groups""" - # Setup multiple groups - mock_pooling_metadata.return_value = None - seq_data1 = {1: MagicMock(spec=SequenceData)} - seq_data2 = {2: MagicMock(spec=SequenceData)} - params1 = MagicMock(spec=PoolingParams) - params2 = MagicMock(spec=PoolingParams) - - group1 = MagicMock(spec=SequenceGroupMetadata) - group1.seq_data = seq_data1 - group1.pooling_params = params1 - - group2 = MagicMock(spec=SequenceGroupMetadata) - group2.seq_data = seq_data2 - group2.pooling_params = params2 - - # Call the function - self.runner._prepare_pooling([group1, group2], [10, 20]) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[ - ([1], params1), ([2], params2) - ], - seq_data={ - **seq_data1, - **seq_data2 - }, - prompt_lens=[10, 20]) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_empty_input(self, mock_pooling_metadata): - """Test case with empty input lists""" - # Call the function with empty inputs - mock_pooling_metadata.return_value = None - self.runner._prepare_pooling([], []) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[], - seq_data={}, - prompt_lens=[]) - - @patch('vllm.forward_context.set_forward_context') - @patch('vllm.distributed.parallel_state._PP', - new_callable=lambda: MagicMock(spec=GroupCoordinator, - is_last_rank=True)) - @patch('torch.npu.Event') - @patch.object(NPUPoolingModelRunner, 'set_active_loras') - @patch.object(NPUPoolingModelRunner, 'set_active_prompt_adapters') - def test_execute_model_normal_flow(self, mock_set_adapters, mock_set_loras, - mock_event, 
mock_pp, mock_set_forward): - """Test normal execution path with all dependencies mocked""" - - # Setup model input mock - mock_input = MagicMock() - mock_input.input_tokens = torch.tensor([1]) - mock_input.input_positions = torch.tensor([0]) - mock_input.multi_modal_kwargs = {} - self.runner.is_driver_worker = True - # Execute - self.runner.execute_model(model_input=mock_input, - kv_caches=[], - num_steps=1) - - # Verify core calls - self.runner.model.pooler.assert_called_once() - - @patch('vllm.forward_context.set_forward_context') - def test_execute_model_invalid_steps(self, mock_set_forward): - """Test ValueError when num_steps != 1""" - with self.assertRaises(ValueError): - self.runner.execute_model(model_input=MagicMock(), - kv_caches=[], - num_steps=2) - mock_set_forward.assert_not_called() - - @patch('vllm.forward_context.set_forward_context') - @patch('vllm.distributed.parallel_state._PP', - new_callable=lambda: MagicMock(spec=GroupCoordinator, - is_last_rank=False)) - @patch('torch.npu.Event') - def test_execute_model_perf_monitoring(self, mock_event, mock_pp, - mock_set_forward): - """Test performance monitoring with timing mocks""" - # Setup mocks - - mock_event.return_value.elapsed_time.return_value = 15.0 - self.runner.observability_config = MagicMock( - collect_model_forward_time=True) - - # Execute - self.runner.execute_model(model_input=MagicMock( - input_tokens=torch.tensor([1]), - input_positions=torch.tensor([0]), - multi_modal_kwargs={}), - kv_caches=[], - num_steps=1) - - # Verify timing calls - self.assertEqual(mock_event.call_count, 2) - - @patch('vllm.forward_context.set_forward_context') - @patch.object(NPUPoolingModelRunner, 'set_active_loras') - @patch('vllm.distributed.parallel_state._PP', - new_callable=lambda: MagicMock(spec=GroupCoordinator, - is_last_rank=False)) - def test_execute_model_lora_config(self, mock_pp, set_active_loras, - mock_set_forward): - """Test LoRA configuration handling""" - # Setup - - self.runner.lora_config = True - mock_input = MagicMock() - mock_input.lora_requests = ["req1"] - mock_input.lora_mapping = {"map": 1} - - # Execute - self.runner.execute_model(model_input=mock_input, - kv_caches=[], - num_steps=1) - - # Verify LoRA call - set_active_loras.assert_called_once_with(["req1"], {"map": 1}) - - @patch('vllm.forward_context.set_forward_context') - @patch('vllm.distributed.parallel_state._PP', - new_callable=lambda: MagicMock(spec=GroupCoordinator, - is_last_rank=False)) - def test_execute_model_not_last_rank(self, mock_pp, mock_set_forward): - """Test behavior when not the last pipeline rank""" - # Setup - - # Execute - self.runner.execute_model(model_input=MagicMock( - input_tokens=torch.tensor([1]), - input_positions=torch.tensor([0]), - multi_modal_kwargs={}), - kv_caches=[], - num_steps=1) - - # Verify pooler not called - self.runner.model.pooler.assert_not_called() diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py deleted file mode 100644 index 944e8c9a65..0000000000 --- a/vllm_ascend/attention/attention.py +++ /dev/null @@ -1,1228 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type - -import numpy as np -import torch -import torch_npu -import torchair._contrib.custom_torch_ops # type: ignore # noqa: F401 -from torch.nn.functional import scaled_dot_product_attention -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, AttentionType, - MLAAttentionImpl) -from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState, - CommonMetadataBuilder, - compute_slot_mapping, - compute_slot_mapping_start_idx, - is_block_tables_empty) -from vllm.utils import async_tensor_h2d, make_tensor_with_pad - -from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.attention.attention_mask import AttentionMaskBuilder -from vllm_ascend.ops.cache import concat_and_cache_mla -from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, - enable_custom_op, is_310p, nd_to_nz_2d) -from vllm_ascend.worker.model_runner import ( - ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata) - -_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128] - - -class AscendAttentionBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "ASCEND" - - @staticmethod - def get_impl_cls() -> Type["AscendAttentionBackendImpl"]: - return AscendAttentionBackendImpl - - @staticmethod - def get_metadata_cls() -> Type["AscendMetadata"]: - return AscendMetadata - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - if is_310p(): - return (2, num_blocks, num_kv_heads * head_size // 16, block_size, - 16) - else: - return (2, num_blocks, block_size, num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: List[torch.Tensor], - dst_kv_cache: List[torch.Tensor], - src_to_dst: torch.Tensor, - ) -> None: - src_key_cache, src_value_cache = src_kv_cache[0], src_kv_cache[1] - dst_key_cache, dst_value_cache = dst_kv_cache[0], dst_kv_cache[1] - src_indices = src_to_dst[:, 0] - dst_indices = src_to_dst[:, 1] - - dst_key_cache[dst_indices] = src_key_cache[src_indices].to( - dst_key_cache.device) - dst_value_cache[dst_indices] = src_value_cache[src_indices].to( - dst_key_cache.device) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - src_indices = src_to_dists[:, 0] - dst_indices = src_to_dists[:, 1] - - for kv_cache in kv_caches: - key_caches = kv_cache[0] - value_caches = kv_cache[1] - key_caches[dst_indices] = key_caches[src_indices] - value_caches[dst_indices] = value_caches[src_indices] - - @staticmethod - def get_builder_cls() -> Type["AscendMetadataBuilder"]: - return AscendMetadataBuilder - - @classmethod - def make_metadata_builder(cls, *args, **kwargs) -> "AscendMetadataBuilder": - return cls.get_builder_cls()(*args, **kwargs) - - -class 
AscendMLAAttentionBackend(AscendAttentionBackend): - - @staticmethod - def get_impl_cls() -> Type["AscendMLAAttentionBackendImpl"]: - return AscendMLAAttentionBackendImpl - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return (num_blocks, block_size, num_kv_heads, head_size) - - -@dataclass -class AscendMetadata(AttentionMetadata): - """Metadata for Ascendbackend. - * modified from XFormersbackend - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| - # |-- query_len ---| - - # FIXME: It is for flash attn. - # Maximum sequence length among prefill batch. 0 if there are decoding - # Avoid mypy error - # Total number of prefill requests. - num_prefills: int - # Number of prefill tokens. - num_prefill_tokens: int - # (num_tokens,). The indices of the token slots that input tokens will be - # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size - # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot - # in block 0, and 1st slot in block 1, respectively. - slot_mapping: torch.Tensor - - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - - chunked_prefill_enabled: bool - - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - block_tables: Optional[torch.Tensor] - - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] = None - - # The query lengths of the input sequences - query_lens: Optional[List[int]] = None - - # Maximum query length in the batch. None for decoding. - max_query_len: Optional[int] = None - - # Self-attention prefill/decode metadata cache - _cached_prefill_metadata: Optional["AscendMetadata"] = None - _cached_decode_metadata: Optional["AscendMetadata"] = None - - # Begin encoder attn & enc/dec cross-attn fields... 
- - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Mask for normal situation - attn_mask: Optional[torch.Tensor] = None - - # Mask for prefix caching - compress_mask: Optional[torch.Tensor] = None - - # Mask for chunked prefill - chunk_mask: Optional[torch.Tensor] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - @property - def prefill_metadata(self) -> Optional["AscendMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - # Recover cached prefill-phase attention - # metadata structure. - return self._cached_prefill_metadata - - assert ((self.seq_lens is not None) - or (self.encoder_seq_lens is not None)) - - # Compute some attn_metadata fields which default to None. - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[:self.num_prefill_tokens]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[:self.num_prefills]) - query_lens = (None if self.query_lens is None else - self.query_lens[:self.num_prefills]) - block_tables = (None if self.block_tables is None else - self.block_tables[:self.num_prefills]) - - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[:self.num_prefills]) - - # Construct & cache prefill-phase attention metadata structure. - self._cached_prefill_metadata = AscendMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=slot_mapping, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - query_lens=query_lens, - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_seq_len=0, - chunked_prefill_enabled=self.chunked_prefill_enabled, - block_tables=block_tables, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - multi_modal_placeholder_index_maps=self. - multi_modal_placeholder_index_maps, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables, - enable_kv_scales_calculation=False) - return self._cached_prefill_metadata - - @property - def decode_metadata(self) -> Optional["AscendMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - # Recover cached decode-phase attention - # metadata structure. - return self._cached_decode_metadata - - # Compute some attn_metadata fields which default to None. - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[self.num_prefill_tokens:]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[self.num_prefills:]) - query_lens = (None if self.query_lens is None else - self.query_lens[self.num_prefills:]) - block_tables = (None if self.block_tables is None else - self.block_tables[self.num_prefills:]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[self.num_prefills:]) - # Construct & cache decode-phase attention metadata structure. 
- self._cached_decode_metadata = AscendMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=slot_mapping, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - query_lens=query_lens, - max_query_len=self.max_query_len, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - chunked_prefill_enabled=self.chunked_prefill_enabled, - block_tables=block_tables, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - multi_modal_placeholder_index_maps=self. - multi_modal_placeholder_index_maps, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables, - enable_kv_scales_calculation=False) - return self._cached_decode_metadata - - def advance_step(self, - model_input: "ModelInputForNPUWithSamplingMetadata", - sampled_token_ids: Optional[torch.Tensor], - block_size: int, - num_seqs: int, - num_queries: int, - turn_prefills_into_decodes: bool = False): - """ - Update metadata in-place to advance one decode step. - """ - # When using cudagraph, the num_seqs is padded to the next captured - # batch sized, but num_queries tracks the actual number of requests in - # the batch. For --enforce-eager mode, num_seqs == num_queries - if num_seqs != num_queries: - assert num_seqs > num_queries - - if turn_prefills_into_decodes: - # When Mutli-Step is enabled with Chunked-Prefill, prefills and - # decodes are scheduled together. In the first step, all the - # prefills turn into decodes. This update reflects that - # conversion. - assert self.num_decode_tokens + self.num_prefills == num_seqs - self.num_decode_tokens += self.num_prefills - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.max_prefill_seq_len = 0 - self.max_query_len = 1 - - self.slot_mapping = self.slot_mapping[:num_seqs] - else: - assert self.seq_lens is not None - assert self.max_decode_seq_len == max(self.seq_lens) - - assert self.num_prefills == 0 - assert self.num_prefill_tokens == 0 - assert self.num_decode_tokens == num_seqs - assert self.slot_mapping.shape == (num_seqs, ) - - assert self.seq_lens is not None - assert len(self.seq_lens) == num_seqs - assert self.seq_lens_tensor is not None - assert self.seq_lens_tensor.shape == (num_seqs, ) - assert self.max_query_len == 1 - assert self.max_prefill_seq_len == 0 - - assert self.block_tables is not None - assert self.block_tables.shape[0] == num_seqs - - # Update query lengths. Note that we update only queries and not seqs, - # since tensors may be padded due to captured cuda graph batch size - for i in range(num_queries): - self.seq_lens[i] += 1 - self.max_decode_seq_len = max(self.seq_lens) - if enable_custom_op(): - #advance a step on NPU for existing inputs for a multi-step runner if custom ops is enabled - torch.ops._C.advance_step_flashattn_ascendc( - num_seqs=num_seqs, - num_queries=num_queries, - block_size=block_size, - input_tokens=model_input.input_tokens, - sampled_token_ids=sampled_token_ids, - input_positions=model_input.input_positions, - seq_lens=self.seq_lens_tensor, - slot_mapping=self.slot_mapping, - block_tables=self.block_tables) - else: - # use traditional Pytorch method for updating these tensors. 
- # update input_tokens - sampled_token_ids_list = sampled_token_ids[: - num_queries].squeeze( # type: ignore - -1) - model_input.input_tokens[: - num_queries] = sampled_token_ids_list # type: ignore - - # get seq_lens and input_positions - seq_lens = self.seq_lens_tensor[:num_queries] - next_seq_lens = seq_lens + 1 - next_input_pos = next_seq_lens - 1 - - # update seq_lens and input_positions - self.seq_lens_tensor[:num_queries] = next_seq_lens - model_input.input_positions[: - num_queries] = next_input_pos # type: ignore - - # 计算 block index 和 offset - block_idx = next_input_pos // block_size - block_offset = next_input_pos % block_size - - current_block_table = self.block_tables.gather( - 1, block_idx.unsqueeze(-1)).squeeze(-1) - slot_num = current_block_table * block_size + block_offset - - # update slot_mapping - self.slot_mapping[:num_queries] = slot_num - - -class AscendMetadataBuilder(CommonMetadataBuilder[AscendMetadata]): - - _attn_mask_builder = None # noqa - - def __init__(self, input_builder: "ModelInputForNPUBuilder"): - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - - self.attn_mask = None - self.compress_mask = None - self.chunk_mask = None - if AscendMetadataBuilder._attn_mask_builder is None: - AscendMetadataBuilder._attn_mask_builder = AttentionMaskBuilder( - 128, self.input_builder.runner.model_config.dtype) - - def _add_seq_group( - self, inter_data: ModelInputForNPUBuilder.InterDataForSeqGroup, - chunked_prefill_enabled: bool): - """Add a sequence group to the metadata. Specifically update/append - 1. context length. - 2. block table. - 3. slot mapping. - """ - is_prompt = inter_data.is_prompt - block_tables = inter_data.block_tables - - for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, - curr_sliding_window_block) in zip( - inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], - inter_data.orig_seq_lens, inter_data.seq_lens, - inter_data.query_lens, inter_data.context_lens, - inter_data.curr_sliding_window_blocks): - self.context_lens.append(context_len) - if is_prompt: - self.num_prefills += 1 - self.num_prefill_tokens += token_len - self.prefill_seq_lens.append(seq_len) - else: - self.num_decode_tokens += query_len - self.curr_seq_lens.append(curr_seq_len) - - # Compute block table. - # TODO(sang): Combine chunked prefill and prefix caching by - # only allowing multiple of block_size chunk size. - # NOTE: This only works for oooooooxxx style attention. - block_table: List[int] = [] - prefix_cache_hit = any([ - inter_data.prefix_cache_hit - for inter_data in self.input_builder.inter_data_list - ]) - if prefix_cache_hit: - # NOTE(woosuk): For flash-attn, the block table should - # include the entries for the incoming prefill tokens. - if block_tables is not None: - block_table = block_tables[seq_id] - elif ((chunked_prefill_enabled or not is_prompt) - and block_tables is not None): - if curr_sliding_window_block == 0: - block_table = block_tables[seq_id] - else: - block_table = block_tables[seq_id][ - -curr_sliding_window_block:] - self.block_tables.append(block_table) - - # Compute slot mapping. 
- is_profile_run = is_block_tables_empty(block_tables) - start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, - context_len, - self.sliding_window) - compute_slot_mapping( - is_profile_run, - self.slot_mapping, - seq_id, - seq_len, - context_len, - start_idx, - self.block_size, - inter_data.block_tables, - ) - - def _get_graph_runner_block_tables( - self, num_seqs: int, - block_tables: List[List[int]]) -> torch.Tensor: - # The shape of graph_block_tables is - # [max batch size, max context len // block size]. - - max_batch_size, max_blocks = self.runner.graph_block_tables.shape - assert max_batch_size >= num_seqs - - graph_block_tables = self.runner.graph_block_tables # [:num_seqs] - for i, block_table in enumerate(block_tables): - if block_table: - num_blocks = len(block_table) - if num_blocks <= max_blocks: - graph_block_tables[i, :num_blocks] = block_table - else: - graph_block_tables[ - i, :max_blocks] = block_table[:max_blocks] - - return torch.from_numpy(graph_block_tables).to( - device=self.runner.device, non_blocking=True) - - def build( - self, - seq_lens: List[int], - query_lens: List[int], - graph_pad_size: int, - ): - """Build attention metadata with on-device tensors. - - Args: - seq_lens: The maybe padded sequence lengths of the input sequences. - query_lens: The query lengths of the input sequences. - """ - for inter_data in self.input_builder.inter_data_list: - self._add_seq_group(inter_data, - self.input_builder.chunked_prefill_enabled) - - device = self.runner.device - dtype = self.runner.model_config.dtype - use_npu_graph = graph_pad_size != -1 - - max_query_len = max(query_lens) - max_prefill_seq_len = max(self.prefill_seq_lens, default=0) - max_decode_seq_len = max(self.curr_seq_lens, default=0) - max_seq_len = max(max_prefill_seq_len, max_decode_seq_len) - num_decode_tokens = self.num_decode_tokens - - if self.num_prefills == 0 and use_npu_graph: - num_seqs = len(seq_lens) - self.slot_mapping.extend([PAD_SLOT_ID] * graph_pad_size) - self.block_tables.extend([[]] * graph_pad_size) - block_tables = self._get_graph_runner_block_tables( - num_seqs, self.block_tables) - else: - block_tables = make_tensor_with_pad( - self.block_tables, - pad=0, - dtype=torch.int32, - device=device, - ) - - if self.num_prefills > 0: - if block_tables is None or block_tables.numel() == 0: - # normal mask - self.attn_mask = AscendMetadataBuilder._attn_mask_builder.get_attn_mask( # type: ignore - max_prefill_seq_len, dtype, device) - if is_310p(): - mask_nz = nd_to_nz_2d(self.attn_mask) - mask_nz = torch_npu.npu_format_cast( - mask_nz.contiguous(), ACL_FORMAT_FRACTAL_NZ) - self.attn_mask = mask_nz - elif self.num_decode_tokens == 0 and not self.input_builder.chunked_prefill_enabled: - # compress mask for prefix cache - self.compress_mask = AscendMetadataBuilder._attn_mask_builder.get_attn_mask( # type: ignore - 128, dtype, device) - else: - # chunk_mask for chunk prefill - attn_mask = AscendMetadataBuilder._attn_mask_builder.get_attn_mask( # type: ignore - max_seq_len, dtype, device) - if attn_mask.numel() > 1 and attn_mask[0][1] > 0: - # Do not use in-place multiplication to avoid modifying `attn_mask_cache`! 
- attn_mask = attn_mask * -10000 - chunk_mask_list = [] - for i, seq_len in enumerate(seq_lens): - context_len = self.context_lens[i] - chunk_mask_list.append(attn_mask[context_len:seq_len]) - self.chunk_mask = torch.cat(chunk_mask_list, 0) - else: - self.attn_mask = None - self.compress_mask = None - self.chunk_mask = None - - assert max_query_len > 0, "query_lens: {}".format(query_lens) - - assert device is not None - slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.int32, - device, self.runner.pin_memory) - seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, - self.runner.pin_memory) - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - self.multimodal_placeholder_maps.items() - } - - return AscendMetadata( - num_prefills=self.num_prefills, - slot_mapping=slot_mapping_tensor, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - seq_lens=seq_lens, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=True, - seq_lens_tensor=seq_lens_tensor, - query_lens=query_lens, - max_query_len=max_query_len, - max_prefill_seq_len=max_prefill_seq_len, - max_decode_seq_len=max_decode_seq_len, - block_tables=block_tables, - attn_mask=self.attn_mask, - compress_mask=self.compress_mask, - chunk_mask=self.chunk_mask, - chunked_prefill_enabled=self.input_builder.chunked_prefill_enabled, - ) - - -class AscendAttentionBackendImpl(AttentionImpl): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads - self.hidden_size = self.num_heads * self.head_size - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = sliding_window - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, - dtype=torch.float32, - device="npu") - self.alibi_slopes = alibi_slopes - self.attn_type = attn_type - - assert self.num_heads % self.num_kv_heads == 0 - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.seq_len_cpu_tensor = None - self.query_len_cpu_tensor = None - self.key_cache = None - self.value_cache = None - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AscendMetadata, - attn_type: str = AttentionType.DECODER, - output: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with Ascend attention. - Args: - query: shape = [num_tokens, num_heads * head_size] - num_tokens = batch_size * seq_len - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache: shape = [2, num_blocks, block_size, - num_kv_heads, head_size] - key_cache = [num_blocks, block_size, - num_kv_heads, head_size] - value_cache = [num_blocks, block_size, - num_kv_heads, head_size] - attn_metadata: Metadata for attention. 
- Returns: - shape = [batch_size, seq_len * num_heads * head_size] - """ - assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 - # View q k v to BSH. - num_tokens = query.shape[0] - query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - # TODO: Remove this contiguous in the future. - value = value.contiguous() - attn_type = self.attn_type - - output = torch.empty(num_tokens, - self.num_heads, - self.head_size, - dtype=query.dtype, - device=query.device) - - if kv_cache.numel() > 0: - if self.key_cache is None: - self.key_cache, self.value_cache = kv_cache[0], kv_cache[1] - slots = attn_metadata.slot_mapping - - if hasattr(layer, 'quant_method'): - isPrefill = True if attn_metadata.num_prefills > 0 else False - if isPrefill: - assert attn_metadata.prefill_metadata is not None - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.prefill_metadata.seq_lens).astype( - np.int32)) - else: - assert attn_metadata.decode_metadata is not None - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.decode_metadata.seq_lens).astype( - np.int32)) - block_tables = attn_metadata.decode_metadata.block_tables if attn_metadata.decode_metadata else None - # Details of kv_cache arrangement in attention quantization - # are implemented by quant_method. - layer.quant_method.apply( - layer, - query, - key, - value, - self.key_cache, - self.value_cache, - self.scale, - block_tables, - isPrefill, - attn_metadata, - output, - seq_lens_tensor_cpu=self.seq_lens_tensor_cpu) - else: - if self.key_cache is not None: - torch_npu._npu_reshape_and_cache(key=key, - value=value, - key_cache=self.key_cache, - value_cache=self.value_cache, - slot_indices=slots) - - if attn_metadata.num_prefills > 0: - # Prefix cache disabled and chunk prefill disabled or no prefix cache hit - if (attn_metadata.block_tables is None - or attn_metadata.block_tables.numel() == 0): - if attn_type == AttentionType.ENCODER_ONLY: - # TODO: change to use torch_npu encoder attention op, instead - # of torch sdpa - query = query.movedim(0, query.dim() - 2) - key = key.movedim(0, key.dim() - 2) - value = value.movedim(0, value.dim() - 2) - - causal_attn = (attn_type == AttentionType.DECODER) - if attn_metadata.seq_lens is not None: - seq_lens_q = seq_lens_kv = attn_metadata.seq_lens - attn_masks = [None] * len(seq_lens_q) - start_q, start_kv = 0, 0 - for seq_len_q, seq_len_kv, mask in zip( - seq_lens_q, seq_lens_kv, attn_masks): - end_q = start_q + seq_len_q - end_kv = start_kv + seq_len_kv - sub_out = scaled_dot_product_attention( - query[None, :, start_q:end_q, :], - key[None, :, start_kv:end_kv, :], - value[None, :, start_kv:end_kv, :], - attn_mask=mask, - dropout_p=0.0, - is_causal=causal_attn and mask is None, - scale=self.scale).squeeze(0).movedim( - query.dim() - 2, 0) - output[start_q:end_q, :, :] = sub_out - start_q, start_kv = end_q, end_kv - else: - assert attn_metadata.attn_mask is not None - mask = attn_metadata.attn_mask - assert attn_metadata.prefill_metadata is not None - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.prefill_metadata.seq_lens). 
- astype(np.int32)) - if is_310p(): - # align q k v output tensors - query = aligned_16(query) - key = aligned_16(key) - value = aligned_16(value) - output = aligned_16(output) - - # do reformat in case of broadcasted tensors - mask = mask.repeat( - self.seq_lens_tensor_cpu.size(0), 1, 1, 1) - mask = torch_npu.npu_format_cast( - mask.contiguous(), ACL_FORMAT_FRACTAL_NZ) - torch_npu._npu_flash_attention( - query=query, - key=key, - value=value, - mask=mask, - seq_len=self.seq_lens_tensor_cpu, - scale_value=self.scale, - num_heads=self.num_heads, - num_kv_heads=self.num_kv_heads, - out=output) - output = output[:num_tokens, :, :] - # Prefix cache only and cache hit - elif attn_metadata.num_decode_tokens == 0 and not attn_metadata.chunked_prefill_enabled: - assert kv_cache is not None - assert attn_metadata.prefill_metadata is not None - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array( - attn_metadata.prefill_metadata.seq_lens).astype( - np.int32)) - self.query_lens_tensor_cpu = torch.from_numpy( - np.array( - attn_metadata.prefill_metadata.query_lens).astype( - np.int32)) - block_tables = attn_metadata.prefill_metadata.block_tables - assert attn_metadata.compress_mask is not None - compress_mask = attn_metadata.compress_mask - torch_npu._npu_flash_attention_qlens( - query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - block_table=block_tables, - mask=compress_mask, - seq_len=self.query_lens_tensor_cpu, - context_lens=self.seq_lens_tensor_cpu, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - out=output) - # Splitfuse - else: - assert kv_cache is not None - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.seq_lens).astype(np.int32)) - self.query_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.query_lens).astype(np.int32)) - block_tables = attn_metadata.block_tables - assert attn_metadata.chunk_mask is not None - chunk_mask = attn_metadata.chunk_mask - torch_npu._npu_paged_attention_splitfuse( - query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - block_table=block_tables, - context_lens=self.seq_lens_tensor_cpu, - mask=chunk_mask, - seq_len=self.query_lens_tensor_cpu, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - out=output) - # Decode only - else: - assert self.key_cache is not None - assert self.value_cache is not None - assert attn_metadata.decode_metadata is not None - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.decode_metadata.seq_lens).astype( - np.int32)) - if is_310p(): - # # seq_lens_tensor needs to be transferred to the device for 310P - self.seq_lens_tensor_cpu = self.seq_lens_tensor_cpu.to( - device=self.key_cache.device) - block_tables = attn_metadata.decode_metadata.block_tables - torch_npu._npu_paged_attention( - query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - block_table=block_tables, - context_lens=self.seq_lens_tensor_cpu, - out=output) - - return output.view(num_tokens, self.hidden_size) - - -class AscendMLAAttentionBackendImpl(MLAAttentionImpl): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = 
AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - **extra_impl_args, - ) -> None: - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads - self.hidden_size = self.num_heads * self.head_size - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = sliding_window - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, - dtype=torch.float32, - device="npu") - self.alibi_slopes = alibi_slopes - self.attn_type = attn_type - - assert self.num_heads % self.num_kv_heads == 0 - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.seq_len_cpu_tensor = None - - # MLA Args - self.q_lora_rank = extra_impl_args['q_lora_rank'] - self.kv_lora_rank = extra_impl_args['kv_lora_rank'] - self.qk_nope_head_dim = extra_impl_args['qk_nope_head_dim'] - self.qk_rope_head_dim = extra_impl_args['qk_rope_head_dim'] - self.qk_head_dim = extra_impl_args['qk_head_dim'] - self.v_head_dim = extra_impl_args['v_head_dim'] - self.rotary_emb = extra_impl_args['rotary_emb'] - self.q_proj = extra_impl_args['q_proj'] - self.kv_b_proj = extra_impl_args['kv_b_proj'] - self.o_proj = extra_impl_args['o_proj'] - self.kv_a_proj_with_mqa = extra_impl_args.get('kv_a_proj_with_mqa', - None) - self.kv_a_layernorm = extra_impl_args.get('kv_a_layernorm', None) - self.k_pe_cache = None - self.k_nope_cache = None - self.w_kc = None - self.w_vc = None - - ascend_config = get_ascend_config() - self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - - # TODO: support numHeads / numKvHeads < 16 in MLA kernel - if self.torchair_graph_enabled: - assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \ - ("The allowed number of queries per kv when enabling both MLA and Graph mode" - " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite," - " as it only has 16 attention heads. And if you're using DeepSeek-V3 or DeepSeek-R1," - " please make sure after the tensor parallel split, num_heads / num_kv_heads in " - "{32, 64, 128}.") - - def exec_kv( - self, - hidden_states: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - kv_cache: Tuple, - slots: torch.Tensor, - ): - B = hidden_states.shape[0] - N = self.num_kv_heads - S = 1 - kv = self.kv_a_proj_with_mqa(hidden_states)[0] - # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] - kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) - - k_pe, k_nope, _, _ = torch.ops.npu_inference.npu_kv_rmsnorm_rope_cache( - kv, - self.kv_a_layernorm.weight, - cos, - sin, - slots.to(torch.int64), - kv_cache[1], - kv_cache[0], - epsilon=self.kv_a_layernorm.variance_epsilon, - cache_mode="PA", - ) - - return k_pe, k_nope - - def apply_rotary_emb( - self, - x: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - is_neox_style: bool, - ) -> torch.Tensor: - """ - Args: - x: [num_tokens, num_heads, head_size] - cos: [num_tokens, head_size // 2] - sin: [num_tokens, head_size // 2] - is_neox_style: Whether to use the Neox-style or GPT-J-style rotary - positional embeddings. 
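The Neox-style vs GPT-J-style distinction is easiest to see on a toy tensor; the following self-contained sketch reproduces the same math with assumed toy values (one token, one head, head_size = 4):

import torch

x = torch.tensor([[[1.0, 2.0, 3.0, 4.0]]])   # [num_tokens, num_heads, head_size]
theta = torch.tensor([[0.5, 0.5]])           # [num_tokens, head_size // 2]
cos, sin = torch.cos(theta), torch.sin(theta)

def rope_ref(x, cos, sin, is_neox_style):
    # Same math as apply_rotary_emb: Neox rotates the two contiguous halves,
    # GPT-J rotates interleaved even/odd lanes.
    cos = cos.unsqueeze(-2).to(x.dtype)
    sin = sin.unsqueeze(-2).to(x.dtype)
    if is_neox_style:
        x1, x2 = torch.chunk(x, 2, dim=-1)
    else:
        x1, x2 = x[..., ::2], x[..., 1::2]
    o1 = x1 * cos - x2 * sin
    o2 = x2 * cos + x1 * sin
    return torch.cat((o1, o2), dim=-1) if is_neox_style else torch.stack(
        (o1, o2), dim=-1).flatten(-2)

print(rope_ref(x, cos, sin, True))   # rotates the pairs (1.0, 3.0) and (2.0, 4.0)
print(rope_ref(x, cos, sin, False))  # rotates the pairs (1.0, 2.0) and (3.0, 4.0)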
- """ - cos = cos.unsqueeze(-2).to(x.dtype) - sin = sin.unsqueeze(-2).to(x.dtype) - if is_neox_style: - x1, x2 = torch.chunk(x, 2, dim=-1) - else: - x1 = x[..., ::2] - x2 = x[..., 1::2] - o1 = x1 * cos - x2 * sin - o2 = x2 * cos + x1 * sin - if is_neox_style: - return torch.cat((o1, o2), dim=-1) - else: - return torch.stack((o1, o2), dim=-1).flatten(-2) - - def rope_single( - self, - x: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - ) -> torch.Tensor: - B, N, D = x.shape - S = 1 - x = x.view(B, N, S, D) - x = torch.ops.npu_inference.npu_interleave_rope(x, cos, sin) - return x.view(B, N, D) - - def process_weights_after_loading(self, act_dtype: torch.dtype): - if self.w_kc is None or self.w_vc is None: - kv_b_proj_weight = self.kv_b_proj.weight.reshape( - self.num_heads, self.qk_nope_head_dim + self.v_head_dim, - self.kv_lora_rank) - self.w_kc = kv_b_proj_weight[:, :self. - qk_nope_head_dim, :].contiguous() - self.w_vc = kv_b_proj_weight[:, - self.qk_nope_head_dim:, :].transpose( - 1, 2).contiguous() - - def forward( - self, - layer: AttentionLayer, - hidden_states_or_q_c: torch.Tensor, - hidden_states_or_kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AscendMetadata, - attn_type: str = AttentionType.DECODER, - output: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with Ascend attention. - Args: - hidden_states_or_q_c: shape = [num_tokens, num_heads * head_size] - num_tokens = batch_size * seq_len - hidden_states_or_kv_c_normed: shape = [num_tokens, num_kv_heads * head_size] - k_pe: shape = [num_tokens, num_kv_heads * head_size] - kv_cache: shape = [1, num_blocks, block_size, - num_kv_heads * head_size] - attn_metadata: Metadata for attention. - Returns: - shape = [batch_size, seq_len * num_heads * head_size] - """ - assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 - attn_type = self.attn_type - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "PallasAttentionBackendImpl") - - if attn_metadata is None: - # for profile run - return hidden_states_or_q_c - - num_tokens = hidden_states_or_q_c.shape[0] - q = self.q_proj(hidden_states_or_q_c)[0].view(-1, self.num_heads, - self.qk_head_dim) - q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], - dim=-1) - if k_pe is None and attn_metadata.decode_metadata: - seq_len = self.rotary_emb.max_position_embeddings - - cos = self.rotary_emb.cos_cached[:seq_len].to(dtype=q_pe.dtype) - sin = self.rotary_emb.sin_cached[:seq_len].to(dtype=q_pe.dtype) - cos = cos[attn_metadata.input_positions] - sin = sin[attn_metadata.input_positions] - cos = cos[:, None, None, :] - sin = sin[:, None, None, :] - - q_pe = self.rope_single(q_pe, cos, sin) - k_pe, k_nope = self.exec_kv(hidden_states_or_kv_c_normed, cos, sin, - kv_cache, attn_metadata.slot_mapping) - else: - if k_pe is None: - # NOTE: k_pe is None when graph mode enabled - kv_c, k_pe = self.kv_a_proj_with_mqa( - hidden_states_or_kv_c_normed)[0].split( - [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) - else: - kv_c_normed = hidden_states_or_kv_c_normed - k_pe = k_pe.view(num_tokens, self.num_kv_heads, -1) - if self.rotary_emb.__class__.__name__ == 'RotaryEmbedding': - # NOTE: When scaling not specified - ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape - q_pe = q_pe.reshape(num_tokens, -1) - k_pe = k_pe.reshape(num_tokens, 
-1) - q_pe, k_pe = self.rotary_emb(attn_metadata.input_positions, - q_pe, k_pe) - q_pe = q_pe.view(ori_q_pe_shape) - k_pe = k_pe.view(ori_k_pe_shape) - else: - q_pe, k_pe = self.rotary_emb(attn_metadata.input_positions, - q_pe, k_pe) - - if attn_metadata.num_prefills > 0: - kv = self.kv_b_proj(kv_c_normed)[0].view(num_tokens, - self.num_heads, -1) - k_nope, value = kv.split([self.qk_nope_head_dim, self.v_head_dim], - dim=-1) - else: - q_nope_t = torch.transpose(q_nope, 0, 1) - q_nope_out = torch.bmm(q_nope_t, self.w_kc) - q_nope = torch.transpose(q_nope_out, 0, 1) - - query = torch.cat([q_nope, q_pe], dim=-1).view(num_tokens, - self.num_heads, -1) - - # TODO: Replace the env with more flexible expressions - if self.torchair_graph_enabled: - if len(kv_cache) > 0 and kv_cache[0].numel( - ) > 0 and attn_metadata.num_prefills > 0: - slots = attn_metadata.slot_mapping - # NOTE: Separate the kv cache in advance to avoid OOM or other issues - torch_npu._npu_reshape_and_cache(key=kv_c_normed.view( - num_tokens, self.num_kv_heads, -1), - value=k_pe, - key_cache=kv_cache[0], - value_cache=kv_cache[1], - slot_indices=slots) - elif kv_cache.numel() > 0: - # TODO replace this naive implement with fusion kernel - concat_and_cache_mla(kv_c_normed, k_pe, kv_cache, - attn_metadata.slot_mapping) - - if attn_metadata.num_prefills > 0: - attn_output = torch.empty(num_tokens, - self.num_heads, - self.v_head_dim, - dtype=query.dtype, - device=query.device) - if (attn_metadata.block_tables is None - or attn_metadata.block_tables.numel() == 0): - assert attn_metadata.attn_mask is not None - assert attn_metadata.prefill_metadata is not None - assert attn_metadata.prefill_metadata.seq_lens is not None - mask = attn_metadata.attn_mask - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.prefill_metadata.seq_lens).astype( - np.int32)) - k_pe = k_pe.repeat(1, self.num_heads, 1) - key = torch.cat( - [k_nope.view(num_tokens, self.num_heads, -1), k_pe], dim=2) - torch_npu._npu_flash_attention( - query=query, - key=key, - value=value, - mask=mask, - seq_len=self.seq_lens_tensor_cpu, - scale_value=self.scale, - num_heads=self.num_heads, - num_kv_heads=self.num_heads, - out=attn_output) - else: - # TODO: Will support prefix cache and chunked prefill soon. - raise RuntimeError( - "Prefix cache and chunked prefill are currently not supported." 
- ) - elif attn_metadata.decode_metadata: - assert kv_cache is not None - if self.torchair_graph_enabled: - # shape of query for npu graph mode should be: - # [bs, num_heads_per_rank, seq_len, dim] - q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) - q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) - # shape of knope/k_pe for npu graph mode should be: - # [num_blocks, num_kv_heads, block_size, self.kv_lora_rank/self.qk_rope_head_dim] - block_size = kv_cache[0].shape[1] - k_nope = k_nope.view(-1, self.num_kv_heads, block_size, - self.kv_lora_rank) - k_pe = k_pe.view(-1, self.num_kv_heads, block_size, - self.qk_rope_head_dim) - attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score( - q_nope, - k_nope, - k_nope, - query_rope=q_pe, - key_rope=k_pe, - num_heads=self.num_heads, - num_key_value_heads=self.num_kv_heads, - input_layout="BNSD", - atten_mask=attn_metadata.attn_mask, - scale=self.scale, - antiquant_mode=0, - antiquant_scale=None, - block_table=attn_metadata.block_tables, - block_size=block_size, - actual_seq_lengths_kv=attn_metadata.seq_lens, - ) - attn_output = attn_output.view(num_tokens, -1, - self.kv_lora_rank).transpose( - 0, 1) - attn_output = torch.bmm(attn_output, self.w_vc).transpose(0, 1) - else: - # if torch.empty is used here, the preemptive scheduling case of - # test_mtp_correctness.py will fail to run. - attn_output = torch.randn( - [num_tokens, self.num_heads, self.kv_lora_rank], - dtype=query.dtype, - device=query.device) - self.seq_lens_tensor_cpu = torch.from_numpy( - np.array(attn_metadata.decode_metadata.seq_lens).astype( - np.int32)) - block_tables = attn_metadata.decode_metadata.block_tables - torch_npu._npu_paged_attention_mla( - query=query, - key_cache=kv_cache, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - block_table=block_tables, - context_lens=self.seq_lens_tensor_cpu, - mla_vheadsize=self.kv_lora_rank, - out=attn_output) - attn_output_t = torch.transpose(attn_output, 0, 1) - attn_output_t = torch.bmm(attn_output_t, self.w_vc) - attn_output = torch.transpose(attn_output_t, 0, 1) - - output, _ = self.o_proj(attn_output.reshape(num_tokens, -1)) - - return output diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index 391e41d6ce..e60344836d 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -73,38 +73,6 @@ # Future Plan: # Keep this patch in vllm-ascend. # -# ** File: worker/patch_common/patch_multi_step_worker.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. `vllm.spec_decode.multi_step_worker.MultiStepWorker.sampler_output` -# Why: -# There are cuda hard code (current_platform.is_cuda_alike()) in -# `MultiStepWorker.sampler_output`, and we need to use the patched `TP1DraftModelRunner` in it. -# How: -# Make speculative decoding extensible to different backends. -# - support attention metadata register to the set supported spec decode -# - offer a api in platform to determine whether spec decode is supported, -# and deprecate is_cuda_alike in it. -# Related PR (if no, explain why): -# - https://github.com/vllm-project/vllm/pull/15195 -# - https://github.com/vllm-project/vllm-ascend/pull/395 -# Future Plan: -# Revert it when the related pr is merged in vllm and vllm-ascend. -# -# ** File: worker/patch_common/patch_spec_decode_worker.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. 
`vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker` -# Why: -# We need to use the patched `TP1DraftModelRunner` in `SpecDecodeWorker.create_worker`. -# The mainly reason to overwrite `TP1DraftModelRunner`is the hard code of -# `FlashAttentionMetadata` -# How: -# ditto -# Related PR (if no, explain why): -# - https://github.com/vllm-project/vllm/pull/15195 -# - https://github.com/vllm-project/vllm-ascend/pull/395 -# Future Plan: -# Revert it when the related pr is merged in vllm and vllm-ascend. -# # ** File: worker/patch_common/patch_distributed.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.distributed.parallel_state.GroupCoordinator` diff --git a/vllm_ascend/patch/worker/patch_common/__init__.py b/vllm_ascend/patch/worker/patch_common/__init__.py index d78b6dc8b4..c9358682a7 100644 --- a/vllm_ascend/patch/worker/patch_common/__init__.py +++ b/vllm_ascend/patch/worker/patch_common/__init__.py @@ -20,6 +20,4 @@ import vllm_ascend.patch.worker.patch_common.patch_utils # noqa isort:skip import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa -import vllm_ascend.patch.worker.patch_common.patch_multi_step_worker # noqa import vllm_ascend.patch.worker.patch_common.patch_sampler # noqa -import vllm_ascend.patch.worker.patch_common.patch_spec_decode_worker # noqa diff --git a/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py b/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py deleted file mode 100644 index 53ce312676..0000000000 --- a/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py +++ /dev/null @@ -1,91 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import List, Set, Tuple - -import torch -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.multi_step_worker import MultiStepWorker - -from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner - - -def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - seq_ids_with_bonus_token_in_last_step: Set[int], -) -> Tuple[List[SamplerOutput], bool]: - """Run the model forward pass sample_len times. Returns the list of - sampler output, one per model forward pass, along with indicator of - whether torch tensor in sampler output need to be transposed in latter - sampler_output_to_torch logic. - - For multi step worker, this indicator shall be True. - """ - self._raise_if_unsupported(execute_model_req) - # Expand the batch for sequences with a bonus token. - # Perform a forward pass on the expanded batch and filter the - # response to retain only the original sequences' responses. 
- expanded_request, indices_of_seq_with_bonus_tokens =\ - self._expand_execute_model_request( - execute_model_req, seq_ids_with_bonus_token_in_last_step) - - # Run model sample_len times. - model_outputs: List[SamplerOutput] = [] - - # TODO: supports_gpu_multi_step is False in ASCEND - if isinstance(self.model_runner, TP1DraftModelRunner) and \ - self.model_runner.supports_gpu_multi_step(expanded_request): - # Here we run the draft_model_runner with multi-step prepare - # on the GPU directly - expanded_request.num_steps = sample_len - self.model_runner.set_indices_of_seq_with_bonus_tokens( - indices_of_seq_with_bonus_tokens) - model_outputs = self.execute_model(execute_model_req=expanded_request) - else: - # Here we run multi-step directly, with every step prepared - # on the CPU. - # TODO Remove this branch once DraftModelRunner supports TP>1 - # and other restrictions that are part of DraftModelRunner's - # supports_gpu_multi_step(..) - if expanded_request.previous_hidden_states is not None: - self.worker.model_runner.return_hidden_states = True - for _ in range(sample_len): - model_output: List[SamplerOutput] = self.worker.execute_model( - execute_model_req=expanded_request) - assert (len(model_output) == 1 - ), "composing multistep workers not supported" - model_output = model_output[0] - self._maybe_update_previous_hidden_states(model_output, - expanded_request) - - self._append_new_tokens(model_output, - expanded_request.seq_group_metadata_list, - indices_of_seq_with_bonus_tokens) - model_outputs.append(model_output) - - # move indices to device to avoid stream sync - indices_of_seq_with_bonus_tokens = torch.tensor( - indices_of_seq_with_bonus_tokens, device=self.device) - filtered_model_outputs = self._filter_model_output( - model_outputs, indices_of_seq_with_bonus_tokens) - return filtered_model_outputs, True - - -MultiStepWorker.sampler_output = torch.inference_mode()(sampler_output) diff --git a/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py b/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py deleted file mode 100644 index d271e65bfc..0000000000 --- a/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py +++ /dev/null @@ -1,157 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from typing import Any, Dict, Optional - -from vllm.config import ParallelConfig -from vllm.logger import logger -from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.model_executor.layers.spec_decode_base_sampler import \ - SpecDecodeBaseSampler -from vllm.model_executor.layers.typical_acceptance_sampler import \ - TypicalAcceptanceSampler -from vllm.spec_decode.medusa_worker import MedusaWorker -from vllm.spec_decode.mlp_speculator_worker import MLPSpeculatorWorker -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.ngram_worker import NGramWorker -from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker -from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker -from vllm.worker.worker_base import WorkerBase - -from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner - - -def create_worker( - cls, - scorer_worker: WorkerBase, - draft_worker_kwargs: Dict[str, Any], - disable_mqa_scorer: bool, - disable_by_batch_size: Optional[int], - draft_token_acceptance_method: str, - typical_acceptance_sampler_posterior_threshold: float, - typical_acceptance_sampler_posterior_alpha: float, - disable_logprobs: bool, - disable_log_stats: bool, - num_speculative_tokens: int, -) -> "SpecDecodeWorker": - - allow_zero_draft_token_step = True - enable_lm_head_weight_load = False - num_spec_prefill_steps = 1 - ngram_prompt_lookup_max = ( - draft_worker_kwargs.pop("ngram_prompt_lookup_max")) - ngram_prompt_lookup_min = ( - draft_worker_kwargs.pop("ngram_prompt_lookup_min")) - - draft_model_config = draft_worker_kwargs["vllm_config"].model_config - draft_parallel_config: ParallelConfig = draft_worker_kwargs[ - 'vllm_config'].parallel_config - if ngram_prompt_lookup_max > 0: - draft_worker_kwargs[ - "device_type"] = scorer_worker.device_config.device.type - proposer_worker = NGramWorker(**draft_worker_kwargs) - proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, - ngram_prompt_lookup_max) - else: - # TODO(Yizhou): A quick fix, must be refactored ASAP - # ngram need not this fix. 
- draft_worker_kwargs[ - "vllm_config"].parallel_config.expert_parallel_size = 1 - draft_worker_kwargs[ - "vllm_config"].parallel_config.expert_tensor_parallel_size = 1 - - draft_tp = draft_parallel_config.tensor_parallel_size - target_tp = scorer_worker.parallel_config.tensor_parallel_size - - if draft_model_config.hf_config.model_type == "mlp_speculator": - proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs) - elif draft_model_config.hf_config.model_type == "medusa": - proposer_worker = MedusaWorker(**draft_worker_kwargs) - else: - # Note: The current version of the MTP module doer not support - # the use of TP1DraftModelRunner - if draft_tp == 1 and draft_model_config.hf_config.model_type !=\ - "deepseek_mtp": - draft_worker_kwargs["model_runner_cls"] = TP1DraftModelRunner - else: - if draft_model_config.hf_config.model_type == "eagle": - raise NotImplementedError( - f"{draft_model_config.hf_config.model_type} " - "does not support TP > 1 yet") - - allow_zero_draft_token_step = False - - # Load lm_head weight for eagle in init_device - if draft_model_config.hf_config.model_type == "eagle": - enable_lm_head_weight_load = True - - proposer_worker = MultiStepWorker(**draft_worker_kwargs) - if draft_model_config.hf_config.model_type == "deepseek_mtp": - num_spec_prefill_steps = draft_model_config.hf_config.n_predict - - proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker( - proposer_worker, draft_tp, target_tp) - - logger.info("Configuring SpecDecodeWorker with proposer=%s", - type(proposer_worker)) - - spec_decode_sampler: SpecDecodeBaseSampler = None - if draft_token_acceptance_method == "rejection_sampler": - spec_decode_sampler = RejectionSampler() - elif draft_token_acceptance_method == "typical_acceptance_sampler": - spec_decode_sampler = TypicalAcceptanceSampler( - posterior_threshold=\ - typical_acceptance_sampler_posterior_threshold, - posterior_alpha=typical_acceptance_sampler_posterior_alpha, - ) - logger.info( - "[Speculative Decoding] Configuring" - " SpecDecodeWorker with sampler=%s", type(spec_decode_sampler)) - - if not disable_mqa_scorer: - if scorer_worker.model_runner.attn_backend.get_name() != "FLASH_ATTN": - disable_mqa_scorer = True - logger.info("[Speculative Decoding] Disabling MQA scorer as the " - "MQA is only available with flash attn backend.") - - if draft_model_config and \ - draft_model_config.max_model_len < \ - scorer_worker.model_config.max_model_len: - disable_mqa_scorer = True - logger.info("[Speculative Decoding] Disabling MQA scorer as the " - "draft model max_model_len is smaller than the target " - "model max_model_len.") - - if not scorer_worker.model_runner.model_config.enforce_eager: - disable_mqa_scorer = True - logger.info("[Speculative Decoding] Disabling MQA scorer as the " - "target model is not running in eager mode.") - - return SpecDecodeWorker( - proposer_worker, - scorer_worker, - disable_mqa_scorer=disable_mqa_scorer, - disable_logprobs=disable_logprobs, - disable_log_stats=disable_log_stats, - disable_by_batch_size=disable_by_batch_size, - spec_decode_sampler=spec_decode_sampler, - allow_zero_draft_token_step=allow_zero_draft_token_step, - enable_lm_head_weight_load=enable_lm_head_weight_load, - num_spec_prefill_steps=num_spec_prefill_steps) - - -SpecDecodeWorker.create_worker = classmethod(create_worker) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 07fb07fcb6..111e13c5dc 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -180,18 +180,7 @@ def check_and_update_config(cls, 
vllm_config: VllmConfig) -> None: update_aclgraph_sizes(vllm_config) if parallel_config and parallel_config.worker_cls == "auto": - if envs.VLLM_USE_V1: - parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker" - elif vllm_config.speculative_config: - # NOTE: We set this var to `1` in vllm-ascend to avoid segment - # fault when using spec decode with V0 engine. - os.environ["ACL_OP_INIT_MODE"] = "1" - parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker" - parallel_config.sd_worker_cls = "vllm_ascend.worker.worker.NPUWorker" - elif vllm_config.scheduler_config.is_multi_step: - parallel_config.worker_cls = "vllm_ascend.worker.multi_step_worker.MultiStepWorker" - else: - parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker" + parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker" if cache_config: if cache_config.block_size is None: @@ -202,34 +191,33 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: ) cache_config.block_size = 128 - if envs.VLLM_USE_V1: - # Activate custom ops for v1, except on 310P - if not is_310p(): - compilation_config.custom_ops = ["all"] - - # If ascend_scheduler_config is enabled, - # extents original scheduler_config to use AscendScheduler. - if ascend_config.ascend_scheduler_config.enabled: - from vllm_ascend.core.schedule_config import \ - AscendSchedulerConfig - ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config( - vllm_config.scheduler_config, - ascend_config.ascend_scheduler_config) - vllm_config.scheduler_config = ascend_scheduler_config + # Activate custom ops for v1, except on 310P + if not is_310p(): + compilation_config.custom_ops = ["all"] + + # If ascend_scheduler_config is enabled, + # extents original scheduler_config to use AscendScheduler. + if ascend_config.ascend_scheduler_config.enabled: + from vllm_ascend.core.schedule_config import \ + AscendSchedulerConfig + ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config( + vllm_config.scheduler_config, + ascend_config.ascend_scheduler_config) + vllm_config.scheduler_config = ascend_scheduler_config @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, use_mla): - if use_v1 and use_mla: - return "vllm_ascend.attention.mla_v1.AscendMLABackend" + if not use_v1: + raise RuntimeError("V0 engine is not supported on vllm-ascend now!") + use_torchair = get_ascend_config().torchair_graph_config.enabled - if use_v1 and use_torchair: + if use_mla: + return "vllm_ascend.attention.mla_v1.AscendMLABackend" + elif use_torchair: return "vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend" - if use_v1: + else: return "vllm_ascend.attention.attention_v1.AscendAttentionBackend" - if use_mla: - return "vllm_ascend.attention.attention.AscendMLAAttentionBackend" - return "vllm_ascend.attention.attention.AscendAttentionBackend" @classmethod def get_punica_wrapper(cls) -> str: diff --git a/vllm_ascend/worker/__init__.py b/vllm_ascend/worker/__init__.py index ee59a056ef..116c73c06c 100644 --- a/vllm_ascend/worker/__init__.py +++ b/vllm_ascend/worker/__init__.py @@ -14,4 +14,3 @@ # See the License for the specific language governing permissions and # limitations under the License. 
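To spell out the selection order added to `get_attn_backend_cls` above, a standalone mirror of that branch (plain Python, no vLLM imports) shows that MLA takes precedence over the torchair graph backend:

def select_backend(use_v1: bool, use_mla: bool, use_torchair: bool) -> str:
    # Mirrors the updated get_attn_backend_cls: V0 is rejected outright,
    # then MLA > torchair > default V1 backend.
    if not use_v1:
        raise RuntimeError("V0 engine is not supported on vllm-ascend now!")
    if use_mla:
        return "vllm_ascend.attention.mla_v1.AscendMLABackend"
    if use_torchair:
        return "vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend"
    return "vllm_ascend.attention.attention_v1.AscendAttentionBackend"

assert select_backend(True, True, True).endswith("AscendMLABackend")
assert select_backend(True, False, True).endswith("AscendAttentionTorchairBackend")
assert select_backend(True, False, False).endswith("AscendAttentionBackend")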
# -import vllm_ascend.worker.cache_engine # noqa \ No newline at end of file diff --git a/vllm_ascend/worker/cache_engine.py b/vllm_ascend/worker/cache_engine.py deleted file mode 100644 index d8d9087745..0000000000 --- a/vllm_ascend/worker/cache_engine.py +++ /dev/null @@ -1,83 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# Adapted from vllm-project/vllm/vllm/worker/model_runner.py -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import Any, List - -import torch -from vllm.utils import is_pin_memory_available -from vllm.worker.cache_engine import CacheEngine - -from vllm_ascend.ascend_config import get_ascend_config - - -def allocate_kv_cache( - self, - num_blocks: int, - device: str, -) -> List[Any]: - """Allocates KV cache on the specified device.""" - kv_cache_shape = self.attn_backend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_kv_heads, self.head_size) - pin_memory = is_pin_memory_available() if device == "cpu" else False - kv_cache: List[Any] = [] - - ascend_config = get_ascend_config() - if ascend_config.torchair_graph_config.enabled: - # Align entries so they are 256 byte aligned for better performance - # Primarily targets MLA as this typically only ends up having entries - # be 128 byte aligned. - alloc_shape = kv_cache_shape - - for _ in range(self.num_attention_layers): - # null block in CpuGpuBlockAllocator requires at least that - # block to be zeroed-out. - # We zero-out everything for simplicity. - layer_kv_cache_nope = torch.zeros( - alloc_shape[:-1] + - (self.model_config.hf_text_config.kv_lora_rank, ), - dtype=self.dtype, - pin_memory=pin_memory, - device=device) - layer_kv_cache_pe = torch.zeros( - alloc_shape[:-1] + - (self.model_config.hf_text_config.qk_rope_head_dim, ), - dtype=self.dtype, - pin_memory=pin_memory, - device=device) - - # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) for cases - # when entry_shape is higher than 1D - kv_cache.append((layer_kv_cache_nope, layer_kv_cache_pe)) - else: - for _ in range(self.num_attention_layers): - # null block in CpuGpuBlockAllocator requires at least that - # block to be zeroed-out. - # We zero-out everything for simplicity. - layer_kv_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - pin_memory=pin_memory, - device=device) - - # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) for cases - # when entry_shape is higher than 1D - kv_cache.append(layer_kv_cache) - return kv_cache - - -CacheEngine._allocate_kv_cache = allocate_kv_cache diff --git a/vllm_ascend/worker/draft_model_runner.py b/vllm_ascend/worker/draft_model_runner.py deleted file mode 100644 index b070da1a7f..0000000000 --- a/vllm_ascend/worker/draft_model_runner.py +++ /dev/null @@ -1,320 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. 
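For a rough sense of what the split nope/pe allocation in the deleted cache_engine above costs per block, assume DeepSeek-style MLA dimensions (kv_lora_rank=512, qk_rope_head_dim=64, one latent KV head), block_size=128 and bfloat16; all of these numbers are illustrative assumptions:

block_size, num_kv_heads = 128, 1
kv_lora_rank, qk_rope_head_dim = 512, 64
bytes_per_elem = 2  # bfloat16

nope_bytes = block_size * num_kv_heads * kv_lora_rank * bytes_per_elem      # 131072
pe_bytes = block_size * num_kv_heads * qk_rope_head_dim * bytes_per_elem    # 16384
print(f"{(nope_bytes + pe_bytes) / 1024:.0f} KiB per layer per block")      # 144 KiB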
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import List, Optional - -import torch -from vllm.forward_context import set_forward_context -from vllm.logger import logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.worker.model_runner_base import (ModelRunnerBase, - ModelRunnerInputBase, - ModelRunnerWrapperBase) - -from vllm_ascend.attention.attention import AscendMetadata - -# A flag to enable debug prints for the updated input tensors -# before each step. -debug_advance_input = False -# A flag to allow GPU advance step for draft model runner. -# Set to False for debugging. -allow_gpu_advance_step = True - - -class TP1DraftModelRunner(ModelRunnerWrapperBase): - """Specialized model runner for speculative decoding draft model. - Since the draft model always execute k forward passes consecutively to - generate k speculative tokens in a single speculative decoding step, - we could get rid of most CPU-GPU synchronization and data transfer - overheads by keeping model input and output tensors on GPU all the time. - - TODOs: - 1. Currently supports only flash-attn, add support for other attn_backends. - 2. Support TP > 1 (this requires some designs because we do not expect - any broadcasting inside execute_model). 
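The behaviour this class docstring describes boils down to a loop like the following toy sketch (interface and greedy sampling assumed): each of the k draft steps feeds the freshly sampled token straight back in, so nothing has to round-trip through the CPU between steps.

import torch

@torch.inference_mode()
def draft_k_tokens(model, input_ids, positions, k: int):
    # Toy view of the k consecutive draft passes: decode-only, greedy sampling,
    # one new token per sequence per step, everything stays on the device.
    draft_tokens = []
    for _ in range(k):
        logits = model(input_ids=input_ids, positions=positions)  # [batch, vocab]
        next_token = logits.argmax(dim=-1, keepdim=True)          # [batch, 1], no .cpu()/.item()
        draft_tokens.append(next_token)
        input_ids = next_token          # feed back only the newly sampled token
        positions = positions + 1
    return torch.cat(draft_tokens, dim=-1)                        # [batch, k]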
- """ - - def __init__(self, model_runner: ModelRunnerBase): - super().__init__(model_runner) - - self.indices_of_seq_with_bonus_tokens = None - - def _update_sampling_metadata(self, sampling_metadata, num_seqs, - num_queries): - - assert sampling_metadata.num_prompts == 0 - assert len(sampling_metadata.seq_groups) == num_queries - assert sampling_metadata.selected_token_indices.shape == ( - num_queries, ) - # assert sampling_metadata.categorized_sample_indices == TODO: Add if needed # noqa: E501 - - # Verify that all sequences are decodes - for i in range(num_queries): - seq_group = sampling_metadata.seq_groups[i] - - assert seq_group.is_prompt is False # No prompt - assert seq_group.prompt_logprob_indices == [] # No prompt - assert seq_group.sample_indices == [i] # Simple - - def _gpu_advance_step(self, model_input: ModelRunnerInputBase, - last_output: SamplerOutput) -> ModelRunnerInputBase: - # Currently, we expect "decode mode" only - assert not model_input.is_prompt - - # Get num_seqs - num_seqs = len(model_input.seq_lens) - num_queries = len(model_input.query_lens) - - # Get output tokens GPU tensor - sampled_token_ids = last_output.sampled_token_ids - assert sampled_token_ids is not None - - # Update attn_metadata - attn_metadata = model_input.attn_metadata - assert isinstance(attn_metadata, AscendMetadata) - - attn_metadata.advance_step(model_input, sampled_token_ids, - self.block_size, num_seqs, num_queries) - - # Update sampling_metadata - sampling_metadata = model_input.sampling_metadata - self._update_sampling_metadata(sampling_metadata, num_seqs, - num_queries) - - # Create new input - new_model_input = self._model_input_cls( - input_tokens=model_input.input_tokens, - input_positions=model_input.input_positions, - attn_metadata=attn_metadata, - seq_lens=attn_metadata.seq_lens, - query_lens=model_input.query_lens, - # Notes: If vllm_ascend supports LORA, we need to - # add the following two params. - # lora_mapping=model_input.lora_mapping, - # lora_requests=model_input.lora_requests, - multi_modal_kwargs=model_input.multi_modal_kwargs, - sampling_metadata=model_input.sampling_metadata, - is_prompt=False, - ) - - # Ensure we skip CPU samples - assert new_model_input.sampling_metadata.skip_sampler_cpu_output is True - # We can reuse sampling tensors since every decode iteration is the same - new_model_input.sampling_metadata.reuse_sampling_tensors = True - - if debug_advance_input: - logger.debug("NEW INPUT: ") - logger.debug(" input_tokens = %s", new_model_input.input_tokens) - logger.debug(" input_positions = %s", - new_model_input.input_positions) - logger.debug(" seq_lens = %d", new_model_input.seq_lens) - logger.debug(" query_lens = %d", new_model_input.query_lens) - logger.debug(" attn_metadata:") - logger.debug(" seq_lens_tensor: %s", - attn_metadata.seq_lens_tensor) - logger.debug(" slot_mapping: %s", attn_metadata.slot_mapping) - logger.debug(" block_tables: %s", attn_metadata.block_tables) - - return new_model_input - - def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest): - """Determines if draft_model_runner GPU multi-step can be used. - Currently required conditions are: - 1. Only decodes - 2. Only flash-attn - 3. No LORA - 4. No prompt_adapter_config - """ - if not allow_gpu_advance_step: - return False - - # We allow multi-step GPU only in decode mode - for seq_group in execute_model_req.seq_group_metadata_list: - if seq_group.is_prompt: - return False - - # TODO: Add support for ASCEND when outer multi_step_worker - # could work correct. 
- if self.attn_backend.get_name() not in ("FLASH_ATTN", "TRITON_MLA"): - return False - - # TODO: Add support for LORA - if self.lora_config: - return False - - # TODO: Add soft-tuning prompt adapter support - return not self.prompt_adapter_config - - def set_indices_of_seq_with_bonus_tokens(self, - indices_of_seq_with_bonus_tokens): - self.indices_of_seq_with_bonus_tokens = indices_of_seq_with_bonus_tokens - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelRunnerInputBase, - kv_caches: List[torch.Tensor], - previous_hidden_states: Optional[torch.Tensor] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - **kwargs, - ) -> Optional[List[SamplerOutput]]: - """Executes num_steps forward passes with advacement of input tensors - on the GPU. Look at supports_gpu_multi_step(..) for pre-conditions. - - Optimizations used: - 1. Input tensors are updated on the GPU directly - 2. Skips GPU=>CPU serialization of sampler outputs (we don't need - them since we do batch expansion later that uses GPU outputs) - 3. Reuses sampling tensors (since we run only decodes and they have - a repeating sampling logic) - """ - - # When num_steps == 1, we execute the fallback here for the GPU - # advance_step, which runs prepare_inputs on CPU and for each spec - # iteration invokes this function only once - # (Look at multi-step-worker code) - is_fallback = num_steps == 1 - if not is_fallback: - # Since we do not broadcast data inside execute_model anymore, - # we need to figure out the best way to support TP > 1 in this - # case, because we will at least need to broadcast the sampled - # tokens to all workers. - if not self.is_driver_worker: - raise ValueError("TP1DraftModelRunner only supports TP=1.") - - # Sanity - if self.lora_config is not None: - raise ValueError("TP1DraftModelRunner has no support for LORA") - if self.prompt_adapter_config is not None: - raise ValueError("TP1DraftModelRunner has no support for " - "prompt_adapter_config") - if model_input.inputs_embeds is not None: - raise ValueError("TP1DraftModelRunner has no support for " - "inputs_embeds") - if model_input.multi_modal_kwargs: - raise ValueError( - "TP1DraftModelRunner has no support for multi_modal_kwargs" - ) - else: - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - if self.prompt_adapter_config: - assert model_input.prompt_adapter_requests is not None - assert model_input.prompt_adapter_mapping is not None - self.set_active_prompt_adapters( - model_input.prompt_adapter_requests, - model_input.prompt_adapter_mapping) - - self.attn_state.begin_forward(model_input) - - # Detect exec mode - assert model_input.attn_metadata is not None - if model_input.attn_metadata.num_prefills > 0: - # In this case, execute_model(..) was called directly - if num_steps > 1: - raise ValueError( - "execute_model(..) of draft_model_runner can be called " - "directly only with a single-step prefill") - else: - # We can skip CPU samples for spec token generation. - # (We do allow CPU samples for num_steps == 1 to support the - # fallback case, where supports_gpu_multi_step(..) 
does not pass) - model_input.sampling_metadata.skip_sampler_cpu_output = ( - not is_fallback) - - model_executable = self.model - hidden_states = previous_hidden_states - - outputs: List[SamplerOutput] = [] - for step in range(num_steps): - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - - model_execute_kwargs = {"previous_hidden_states": hidden_states} \ - if previous_hidden_states is not None else {} - - compute_logits_kwargs = {} - # Run model - if hasattr(self.model.config, "num_nextn_predict_layers"): - # for DeepSeek MTP only to use the corresponding layer for - # each step - spec_step_idx = kwargs.get("spec_step_idx", step) - model_execute_kwargs["spec_step_idx"] = spec_step_idx - compute_logits_kwargs["spec_step_idx"] = spec_step_idx - with set_forward_context(model_input.attn_metadata, - self.vllm_config): - - if model_input.attn_metadata is not None: - model_input.attn_metadata.input_positions = model_input.input_positions - - hidden_states = model_executable( - input_ids=model_input.input_tokens, - inputs_embeds=None, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device), - **model_execute_kwargs, - ) - - # Compute the logits. - logits = self.model.compute_logits(hidden_states, - model_input.sampling_metadata, - **compute_logits_kwargs) - if not self.is_driver_worker: - return [] - # Sample the next token. - assert self.model_runner.sampler is not None - output = self.model_runner.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - outputs.append(output) - - if self.return_hidden_states and is_fallback: - output.hidden_states = hidden_states - - if model_input.attn_metadata.num_prefills == 0 \ - and self.indices_of_seq_with_bonus_tokens is not None: - assert output.sampled_token_ids is not None - # output.sampled_token_ids should be of shape (num_seqs, 1) - nums_seqs, num_tokens_per_seq = output.sampled_token_ids.shape - assert num_tokens_per_seq == 1 - count = 0 - for i in range(nums_seqs): - bonus_seq_idx = self.indices_of_seq_with_bonus_tokens[ - count] - if i != bonus_seq_idx: - # The following might cause a cpu->gpu sync - # However, the performance impact is negligible as we - # benchmarked on H100. - output.sampled_token_ids[ - i, :] = model_input.input_tokens[bonus_seq_idx] - else: - count += 1 - - # Prepare inputs for the next step - if step != num_steps - 1: - model_input = self._gpu_advance_step(model_input, outputs[-1]) - - return outputs diff --git a/vllm_ascend/worker/model_runner.py b/vllm_ascend/worker/model_runner.py deleted file mode 100644 index 48c5d4b68f..0000000000 --- a/vllm_ascend/worker/model_runner.py +++ /dev/null @@ -1,1607 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# Adapted from vllm-project/vllm/vllm/worker/model_runner.py -# - -import dataclasses -import itertools -import weakref -from contextlib import contextmanager -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, - Type, TypeVar, Union) - -import numpy as np -import torch -import torch.nn as nn -import vllm.envs as envs -from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.attention.backends.utils import CommonAttentionState -from vllm.config import VllmConfig -from vllm.core.scheduler import SchedulerOutputs -from vllm.distributed import broadcast_tensor_dict, get_dp_group, get_pp_group -from vllm.distributed.kv_transfer import get_kv_transfer_group -from vllm.forward_context import set_forward_context -from vllm.inputs import INPUT_REGISTRY, InputRegistry -from vllm.logger import logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager -from vllm.model_executor import SamplingMetadata, SamplingMetadataCache -from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, - get_sampler) -from vllm.model_executor.model_loader import get_model -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.model_executor.models import supports_lora, supports_multimodal -from vllm.model_executor.models.utils import set_cpu_offload_max_bytes -from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalKwargs, MultiModalPlaceholderMap, - MultiModalRegistry) -from vllm.prompt_adapter.layers import PromptAdapterMapping -from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import SamplingParams -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, flatten_2d_lists, - is_pin_memory_available) -from vllm.worker.model_runner_base import ( - ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, - _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict, - _init_attn_metadata_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -from vllm_ascend.ascend_config import get_ascend_config - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -TModelInputForNPU = TypeVar('TModelInputForNPU', bound="ModelInputForNPU") -ENCODER_NUM = 0 -# if True, allow tensor initialization and casting with internal format (e.g., NZ) -torch.npu.config.allow_internal_format = True - - -@dataclass(frozen=True) -class ModelInputForNPU(ModelRunnerInputBase): - """ - This base class contains metadata needed for the base model forward pass - but not metadata for possible additional steps, e.g., sampling. Model - runners that run additional steps should subclass this method to add - additional fields. 
- """ - input_tokens: Optional[torch.Tensor] = None - inputs_embeds: Optional[torch.Tensor] = None - input_positions: Optional[torch.Tensor] = None - token_types: Optional[torch.Tensor] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None - lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[Set[LoRARequest]] = None - attn_metadata: Optional["AttentionMetadata"] = None - multi_modal_kwargs: Optional[BatchedTensorInputs] = None - request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None - finished_requests_ids: Optional[List[str]] = None - virtual_engine: int = 0 - async_callback: Optional[Callable] = None - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None - scheduler_outputs: Optional[SchedulerOutputs] = None - previous_hidden_states: Optional[torch.Tensor] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "inputs_embeds": self.inputs_embeds, - "input_positions": self.input_positions, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - "multi_modal_kwargs": self.multi_modal_kwargs, - "virtual_engine": self.virtual_engine, - "request_ids_to_seq_ids": self.request_ids_to_seq_ids, - "finished_requests_ids": self.finished_requests_ids, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type[TModelInputForNPU], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> TModelInputForNPU: - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - # Exclude `async_callback` to be able to pickle this object - def __getstate__(self): - state = self.__dict__.copy() - del state["async_callback"] - return state - - # TODO: What happens when we depickle this object? - # How can we update this callback to properly pass it to the engine? - def __setstate__(self, state): - self.__dict__.update(state) - self.__dict__.update({'async_callback': None}) - - -@dataclass(frozen=True) -class ModelInputForNPUWithSamplingMetadata(ModelInputForNPU): - """ - Used by the ModelRunner. - """ - sampling_metadata: Optional["SamplingMetadata"] = None - # Used for speculative decoding. We do not broadcast it because it is only - # used by the driver worker. 
- is_prompt: Optional[bool] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "inputs_embeds": self.inputs_embeds, - "input_positions": self.input_positions, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - "multi_modal_kwargs": self.multi_modal_kwargs, - "virtual_engine": self.virtual_engine, - "request_ids_to_seq_ids": self.request_ids_to_seq_ids, - "finished_requests_ids": self.finished_requests_ids, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForNPUWithSamplingMetadata": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]): - """Build ModelInputForNPU from SequenceGroupMetadata.""" - - # Note: ideally we would be using a dataclass(kw_only=True) - # here, so that this can be subclassed easily, - # but kw_only is not supported in python<3.10. - class InterDataForSeqGroup: - """Intermediate data for the current sequence group.""" - - def simple_reinit(self): - self.input_tokens[0].clear() # type: ignore - self.inputs_embeds = None # type: ignore - self.input_positions[0].clear() # type: ignore - self.token_types[0].clear() # type: ignore - self.mrope_input_positions = None # type: ignore - self.seq_lens[0] = 0 # type: ignore - self.orig_seq_lens[0] = 0 # type: ignore - self.query_lens[0] = 0 # type: ignore - self.context_lens[0] = 0 # type: ignore - self.curr_sliding_window_blocks[0] = 0 # type: ignore - self.lora_index_mapping.clear() # type: ignore - self.lora_prompt_mapping.clear() # type: ignore - self.lora_requests.clear() # type: ignore - - def __init__( - self, - *, - # From sequence group metadata. - request_id: str, - seq_ids: List[int], - is_prompt: bool, - block_tables: Optional[Dict[int, List[int]]], - computed_block_nums: List[int], - n_seqs: int = 0, - - # Input tokens and positions. - input_tokens: Optional[List[List[int]]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - input_positions: Optional[List[List[int]]] = None, - token_types: Optional[List[List[int]]] = None, - mrope_input_positions: Optional[List[List[List[int]]]] = None, - - # The sequence length (may be capped to the sliding window). - seq_lens: Optional[List[int]] = None, - # The original sequence length (before applying sliding window). - # This is used to compute slot mapping. - orig_seq_lens: Optional[List[int]] = None, - # The query length. - query_lens: Optional[List[int]] = None, - # The number of tokens that are already computed. - context_lens: Optional[List[int]] = None, - # The current sliding window block. - curr_sliding_window_blocks: Optional[List[int]] = None, - - # LoRA inputs. - lora_index_mapping: Optional[List[List[int]]] = None, - lora_prompt_mapping: Optional[List[List[int]]] = None, - lora_requests: Optional[Set[LoRARequest]] = None, - - # Multi-modal inputs. 
- multi_modal_kwargs: Optional[MultiModalKwargs] = None, - multi_modal_placeholder_maps: Optional[Dict[ - str, MultiModalPlaceholderMap]] = None, - - # Whether the prefix cache is hit (prefill only). - prefix_cache_hit: bool = False, - reinit: bool = False, - reinit_use_defaults: bool = False, - encoder_seq_len: int = 0, - ): - if reinit: - assert len(self.seq_ids) == len(seq_ids) # type: ignore - for i, seq_id in enumerate(seq_ids): - self.seq_ids[i] = seq_id # type: ignore - else: - self.seq_ids = seq_ids - - self.request_id = request_id - self.is_prompt = is_prompt - self.block_tables = block_tables - self.computed_block_nums = computed_block_nums - self.n_seqs = n_seqs - self.encoder_seq_len = encoder_seq_len - - if reinit: - if len(self.seq_ids) == 1 and reinit_use_defaults: - self.simple_reinit() - else: - if input_tokens: - self.input_tokens = input_tokens - else: - for seq_id in range(len(self.seq_ids)): - self.input_tokens[seq_id].clear() - self.inputs_embeds = inputs_embeds - - if input_positions: - self.input_positions = input_positions - else: - for seq_id in range(len(self.seq_ids)): - self.input_positions[seq_id].clear() - - if token_types: - self.token_types = token_types - else: - for seq_id in range(len(self.seq_ids)): - self.token_types[seq_id].clear() - - self.mrope_input_positions = None - - if seq_lens: - self.seq_lens = seq_lens - else: - for seq_id in range(len(self.seq_ids)): - self.seq_lens[seq_id] = 0 - - if orig_seq_lens: - self.orig_seq_lens = orig_seq_lens - else: - for seq_id in range(len(self.seq_ids)): - self.orig_seq_lens[seq_id] = 0 - - if query_lens: - self.query_lens = query_lens - else: - for seq_id in range(len(self.seq_ids)): - self.query_lens[seq_id] = 0 - - if context_lens: - self.context_lens = context_lens - else: - for seq_id in range(len(self.seq_ids)): - self.context_lens[seq_id] = 0 - - if curr_sliding_window_blocks: - self.curr_sliding_window_blocks = \ - curr_sliding_window_blocks - else: - for seq_id in range(len(self.seq_ids)): - self.curr_sliding_window_blocks[seq_id] = 0 - - if lora_index_mapping: - self.lora_index_mapping = lora_index_mapping - else: - self.lora_index_mapping.clear() - if lora_prompt_mapping: - self.lora_prompt_mapping = lora_prompt_mapping - else: - self.lora_prompt_mapping.clear() - if lora_requests: - self.lora_requests = lora_requests - else: - self.lora_requests.clear() - - else: - self.input_tokens = input_tokens or [] - self.inputs_embeds = inputs_embeds - self.input_positions = input_positions or [] - self.token_types = token_types or [] - self.mrope_input_positions = mrope_input_positions or None - self.seq_lens = seq_lens or [] - self.orig_seq_lens = orig_seq_lens or [] - self.query_lens = query_lens or [] - self.context_lens = context_lens or [] - self.curr_sliding_window_blocks = \ - curr_sliding_window_blocks or [] - - self.lora_index_mapping = lora_index_mapping or [] - self.lora_prompt_mapping = lora_prompt_mapping or [] - self.lora_requests = lora_requests or set() - - self.multi_modal_kwargs = multi_modal_kwargs - self.multi_modal_placeholder_maps = multi_modal_placeholder_maps - self.prefix_cache_hit = prefix_cache_hit - - self.n_seqs = len(self.seq_ids) - - if not reinit: - self.__post_init__() - - def __post_init__(self): - self.n_seqs = len(self.seq_ids) - - self.input_tokens = [[] for _ in range(self.n_seqs)] - self.input_positions = [[] for _ in range(self.n_seqs)] - self.token_types = [[] for _ in range(self.n_seqs)] - self.mrope_input_positions = None - self.seq_lens = [0] * self.n_seqs - 
self.orig_seq_lens = [0] * self.n_seqs - self.query_lens = [0] * self.n_seqs - self.context_lens = [0] * self.n_seqs - self.curr_sliding_window_blocks = [0] * self.n_seqs - - self.lora_index_mapping = [] - self.lora_prompt_mapping = [] - - def __repr__(self) -> str: - return (f"InterDataForSeqGroup(" - f"request_id={self.request_id}, " - f"seq_ids={self.seq_ids}, " - f"is_prompt={self.is_prompt}, " - f"block_tables={self.block_tables}, " - f"computed_block_nums={self.computed_block_nums}, " - f"n_seqs={self.n_seqs}, " - f"input_tokens={self.input_tokens}, " - f"inputs_embeds.shape=" - f"{getattr(self.inputs_embeds, 'shape', None)}, " - f"input_positions={self.input_positions}, " - f"token_types={self.token_types}, " - f"mrope_input_positions={self.mrope_input_positions}, " - f"seq_lens={self.seq_lens}, " - f"orig_seq_lens={self.orig_seq_lens}, " - f"query_lens={self.query_lens}, " - f"context_lens={self.context_lens}, " - f"multi_modal_kwargs={self.multi_modal_kwargs}") - - def __init__(self, - runner, - finished_requests_ids: Optional[List[str]] = None): - super().__init__() - # Compute functions for each sequence in a sequence group. - # WARNING: The order of the functions matters! - self.per_seq_compute_fns = [ - self._compute_lens, - self._compute_for_prefix_cache_hit, - self._compute_for_sliding_window, - self._compute_lora_input, - ] - # Compute functions for each sequence group. - # WARNING: The order of the functions matters! - self.per_seq_group_compute_fns = [ - self._compute_multi_modal_input, - ] - - self.runner = runner - self.model_input_cls = self.runner._model_input_cls - self.attn_backend = self.runner.attn_backend - self.scheduler_config = self.runner.scheduler_config - self.sliding_window = self.runner.sliding_window - self.block_size = self.runner.block_size - self.enable_lora = self.runner.lora_config is not None - self.finished_requests_ids = finished_requests_ids - self.decode_only = True - self.is_encoder_decoder = self.runner.model_config.is_encoder_decoder - - # Attention metadata inputs. - self.attn_metadata_builder = self.attn_backend.make_metadata_builder( - weakref.proxy(self)) - - # Engine/Model configurations. - self.chunked_prefill_enabled = ( - self.scheduler_config is not None - and self.scheduler_config.chunked_prefill_enabled) - if self.sliding_window is not None: - self.sliding_window_blocks = ( - self.sliding_window + self.block_size - 1) // self.block_size - self.block_aligned_sliding_window = \ - self.sliding_window_blocks * self.block_size - - def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - self.finished_requests_ids = finished_requests_ids - - # if the current batch is decode-only. - # will be set to False if there is any non-decode request. - self.decode_only = True - - # Intermediate data (data in CPU before going to NPU) for - # the current sequence group. 
- self.inter_data_list: List[ - ModelInputForNPUBuilder.InterDataForSeqGroup] = [] - - self.attn_metadata_builder.prepare() - - def gen_inter_data_builder(self, num_seqs: int): - return lambda: ModelInputForNPUBuilder.InterDataForSeqGroup( - request_id="", - seq_ids=[0] * num_seqs, - is_prompt=True, - block_tables=None, - computed_block_nums=[]) - - def init_cached_inter_data(self, *args, **kwargs): - assert len(args) == 0 - assert "seq_ids" in kwargs - seq_ids = kwargs["seq_ids"] - num_seqs = len(seq_ids) - - # The inter-data cache is per model_runner - inter_data_cache = self.runner.inter_data_cache - if num_seqs not in inter_data_cache: - inter_data_cache[num_seqs] = PyObjectCache( - self.gen_inter_data_builder(num_seqs)) - - obj = inter_data_cache[num_seqs].get_object() - obj.__init__(*args, **kwargs) - return obj - - def reset_cached_inter_data(self): - for cache in self.runner.inter_data_cache.values(): - cache.reset() - - def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): - """Add a sequence group to the builder.""" - seq_ids = seq_group_metadata.seq_data.keys() - n_seqs = len(seq_ids) - is_prompt = seq_group_metadata.is_prompt - - if is_prompt: - assert n_seqs == 1 - self.decode_only = False - - encoder_seq_len = 0 - - if self.is_encoder_decoder: - encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len() - - inter_data = self.init_cached_inter_data( - request_id=seq_group_metadata.request_id, - seq_ids=seq_ids, - is_prompt=is_prompt, - block_tables=seq_group_metadata.block_tables, - computed_block_nums=seq_group_metadata.computed_block_nums, - reinit=True, - reinit_use_defaults=True, - encoder_seq_len=encoder_seq_len) - - self.inter_data_list.append(inter_data) - - for seq_idx in range(n_seqs): - for per_seq_fn in self.per_seq_compute_fns: - per_seq_fn(inter_data, seq_idx, seq_group_metadata) - for per_seq_group_fn in self.per_seq_group_compute_fns: - per_seq_group_fn(inter_data, seq_group_metadata) - - def build(self) -> ModelInputForNPU: - """Finalize the builder intermediate data and - create on-device tensors. - """ - # Combine and flatten intermediate data. - input_tokens = list[int]() - inputs_embeds_list = list[torch.Tensor]() - token_types = list[int]() - for inter_data in self.inter_data_list: - for cur_input_tokens in inter_data.input_tokens: - input_tokens.extend(cur_input_tokens) - for cur_token_types in inter_data.token_types: - token_types.extend(cur_token_types) - if inter_data.inputs_embeds is not None: - inputs_embeds_list.append( - inter_data.inputs_embeds.to( - dtype=self.runner.model_config.dtype, - device=self.runner.device)) - - inputs_embeds: Optional[torch.Tensor] - if len(inputs_embeds_list) == 0: - inputs_embeds = None - else: - inputs_embeds = torch.cat(inputs_embeds_list, dim=0).to( - dtype=self.runner.model_config.dtype, - device=self.runner.device) - assert len(inputs_embeds) == len(input_tokens) - - if not input_tokens and inputs_embeds is None: - # This may happen when all prefill requests hit - # prefix caching and there is no decode request. 
- return self.model_input_cls() - - mrope_input_positions: Optional[List[List[int]]] = None - if any(inter_data.mrope_input_positions is not None - for inter_data in self.inter_data_list): - mrope_input_positions = [[] for _ in range(3)] - - for idx in range(3): - for inter_data in self.inter_data_list: - msections = inter_data.mrope_input_positions - if msections is None: - for _seq_input_positions in inter_data.input_positions: - mrope_input_positions[idx].extend( - _seq_input_positions) - else: - for _seq_mrope_input_positions in msections: - mrope_input_positions[idx].extend( - _seq_mrope_input_positions[idx]) - input_positions = None - else: - input_positions = [ - flatten_2d_lists(inter_data.input_positions) - for inter_data in self.inter_data_list - ] - - seq_lens = [] - max_decode_seq_len = 0 - is_prompt = self.inter_data_list[0].is_prompt - for inter_data in self.inter_data_list: - seq_lens.extend(inter_data.seq_lens) - if not inter_data.is_prompt: - max_decode_seq_len = max(max_decode_seq_len, - max(inter_data.seq_lens)) - query_lens = flatten_2d_lists( - [inter_data.query_lens for inter_data in self.inter_data_list]) - # Mapping from request IDs to sequence IDs. Used for Jamba models - # that manages the cache by itself. - request_ids_to_seq_ids = { - data.request_id: data.seq_ids - for data in self.inter_data_list - } - - # Add graph_pad_size here - if self.runner.torchair_graph_enabled: - graph_pad_size = self.runner.scheduler_config.max_num_seqs - len( - seq_lens) - else: - graph_pad_size = -1 - - if input_positions: - input_positions = flatten_2d_lists(input_positions) - if graph_pad_size != -1 and not is_prompt: - input_tokens.extend(itertools.repeat(0, graph_pad_size)) - input_positions.extend( # type: ignore - itertools.repeat(0, graph_pad_size)) - seq_lens.extend(itertools.repeat(1, graph_pad_size)) - query_lens.extend(itertools.repeat(1, graph_pad_size)) - input_tokens_tensor = torch.tensor(input_tokens, - dtype=torch.long, - device=self.runner.device) - token_types_tensor = torch.tensor(token_types, - dtype=torch.long, - device=self.runner.device) \ - if token_types else None - if mrope_input_positions is not None: - input_positions_tensor = torch.tensor(mrope_input_positions, - dtype=torch.long, - device=self.runner.device) - else: - input_positions_tensor = torch.tensor(input_positions, - dtype=torch.long, - device=self.runner.device) - #print(f"after tensor input_tokens_tensor: {input_tokens_tensor}") - #print(f"after tensor input_positions_tensor: {input_positions_tensor}") - #print(f"after list seq_lens: {seq_lens}") - - # Attention metadata. - attn_metadata = self.attn_metadata_builder.build( - seq_lens, query_lens, graph_pad_size) - - # LoRA data. - lora_requests = set() - lora_mapping = None - if self.enable_lora: - lora_requests = set(r for data in self.inter_data_list - for r in data.lora_requests) - lora_index_mapping = flatten_2d_lists([ - flatten_2d_lists(inter_data.lora_index_mapping) - for inter_data in self.inter_data_list - ]) - lora_prompt_mapping = flatten_2d_lists([ - flatten_2d_lists(inter_data.lora_prompt_mapping) - for inter_data in self.inter_data_list - ]) - lora_mapping = LoRAMapping( - **dict(index_mapping=lora_index_mapping, - prompt_mapping=lora_prompt_mapping, - is_prefill=not self.decode_only)) - - # Multi-modal data. 
-        multi_modal_kwargs_list = [
-            data.multi_modal_kwargs for data in self.inter_data_list
-            if data.multi_modal_kwargs is not None
-        ]
-        multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
-
-        if self.runner.torchair_graph_enabled:
-            torch._dynamo.mark_static(input_tokens_tensor)
-            torch._dynamo.mark_static(input_positions_tensor)
-            torch._dynamo.mark_static(attn_metadata.block_tables)
-            torch._dynamo.mark_static(attn_metadata.slot_mapping)
-
-        return self.model_input_cls(
-            input_tokens=input_tokens_tensor,
-            inputs_embeds=inputs_embeds,
-            token_types=token_types_tensor,
-            input_positions=input_positions_tensor,
-            attn_metadata=attn_metadata,
-            seq_lens=seq_lens,
-            query_lens=query_lens,
-            lora_mapping=lora_mapping,
-            lora_requests=lora_requests,
-            multi_modal_kwargs=multi_modal_kwargs,
-            request_ids_to_seq_ids=request_ids_to_seq_ids,
-            finished_requests_ids=self.finished_requests_ids)
-
-    def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int,
-                      seq_group_metadata: SequenceGroupMetadata):
-        """Compute the context length, sequence length and tokens
-        for the given sequence data.
-        """
-        seq_data = seq_group_metadata.seq_data[inter_data.seq_ids[seq_idx]]
-        token_chunk_size = seq_group_metadata.token_chunk_size
-
-        # Compute the context length (the number of tokens that are
-        # already computed) and the sequence length (total number of tokens).
-
-        seq_len = seq_data.get_len()
-        if inter_data.is_prompt:
-            context_len = seq_data.get_num_computed_tokens()
-            seq_len = min(seq_len, context_len + token_chunk_size)
-        elif self.runner.scheduler_config.is_multi_step or \
-            self.is_encoder_decoder:
-            context_len = seq_len - 1
-        else:
-            context_len = seq_data.get_num_computed_tokens()
-
-        # Compute tokens.
-        # Fixme: this is for version compatibility; remove it once vllm v0.8.5 is no longer supported.
-        if not hasattr(seq_data,
-                       "prompt_embeds") or seq_data.prompt_embeds is None:
-            tokens = seq_data.get_token_ids()[context_len:seq_len]
-            prompt_embeds = None
-        else:
-            tokens = [0] * (seq_len - context_len)
-            prompt_embeds = seq_data.get_token_embeddings(
-            )[context_len:seq_len]
-
-        token_types = seq_group_metadata.token_type_ids
-
-        inter_data.seq_lens[seq_idx] = seq_len
-        inter_data.orig_seq_lens[seq_idx] = seq_len
-        inter_data.context_lens[seq_idx] = context_len
-        inter_data.input_tokens[seq_idx].extend(tokens)
-        inter_data.inputs_embeds = prompt_embeds
-        inter_data.input_positions[seq_idx].extend(range(context_len, seq_len))
-        inter_data.token_types[seq_idx].extend(
-            token_types if token_types else [])
-        inter_data.query_lens[seq_idx] = seq_len - context_len
-
-        if seq_data.mrope_position_delta is not None:
-            if inter_data.mrope_input_positions is None:
-                inter_data.mrope_input_positions = [None] * inter_data.n_seqs
-
-            inter_data.mrope_input_positions[
-                seq_idx] = MRotaryEmbedding.get_next_input_positions(
-                    seq_data.mrope_position_delta,
-                    context_len,
-                    seq_len,
-                )
-
-    def _compute_for_prefix_cache_hit(
-            self, inter_data: InterDataForSeqGroup, seq_idx: int,
-            seq_group_metadata: SequenceGroupMetadata):
-        """Check whether the prefix cache is hit (i.e., some blocks are
-        already computed). If so, update the input tokens and positions
-        to compute only the remaining blocks.
-        """
-        computed_block_nums = inter_data.computed_block_nums
-
-        # Note that prefix caching does not support sliding window.
-        prefix_cache_hit = (computed_block_nums is not None
-                            and len(computed_block_nums) > 0
-                            and self.sliding_window is None
-                            and inter_data.is_prompt)
-        inter_data.prefix_cache_hit = prefix_cache_hit
-
-        if not prefix_cache_hit:
-            return
-
-        assert computed_block_nums is not None
-        # The number of prompt tokens in this sequence that hit the prefix
-        # cache. Note that this may be larger than the sequence length if
-        # chunked prefill is enabled.
-        prefix_cache_len = len(computed_block_nums) * self.block_size
-
-        # The total number of prompt tokens in this sequence.
-        # When chunked prefill is enabled, this is the number of tokens in
-        # the computed chunks plus the current chunk.
-        seq_len = inter_data.seq_lens[seq_idx]
-
-        # On a full hit, compute the last block rather than the last token,
-        # due to the requirements of the prefix operator.
-        if seq_len <= prefix_cache_len:
-            prefix_cache_len -= self.block_size
-
-        seq_group_metadata.seq_data[inter_data.seq_ids[
-            seq_idx]].update_num_cached_tokens(prefix_cache_len)
-
-        # The number of prompt tokens computed so far in this sequence.
-        context_len = inter_data.context_lens[seq_idx]
-
-        if prefix_cache_len <= context_len:
-            # We already passed the cache hit region,
-            # so do normal computation.
-            pass
-        elif context_len < prefix_cache_len < seq_len:
-            # Partial hit. Compute the missing part.
-            uncomputed_start = prefix_cache_len - context_len
-            inter_data.input_tokens[seq_idx] = inter_data.input_tokens[
-                seq_idx][uncomputed_start:]
-            inter_data.input_positions[seq_idx] = inter_data.input_positions[
-                seq_idx][uncomputed_start:]
-            inter_data.token_types[seq_idx] = inter_data.token_types[seq_idx][
-                uncomputed_start:]
-            context_len = prefix_cache_len
-
-            inter_data.context_lens[seq_idx] = context_len
-            inter_data.query_lens[
-                seq_idx] = inter_data.seq_lens[seq_idx] - context_len
-        elif seq_len <= prefix_cache_len:
-            # Full hit. Only compute the last token to avoid
-            # erroneous behavior. FIXME: Ideally we should directly
-            # mark all tokens as computed in the scheduler and do not
-            # schedule this sequence, so this case should not happen.
-            inter_data.input_tokens[seq_idx] = inter_data.input_tokens[
-                seq_idx][-1:]
-            inter_data.input_positions[seq_idx] = inter_data.input_positions[
-                seq_idx][-1:]
-            inter_data.token_types[seq_idx] = inter_data.token_types[seq_idx][
-                -1:]
-            inter_data.query_lens[seq_idx] = 1
-            inter_data.context_lens[seq_idx] = inter_data.seq_lens[seq_idx] - 1
-
-    def _compute_for_sliding_window(self, inter_data: InterDataForSeqGroup,
-                                    seq_idx: int,
-                                    seq_group_metadata: SequenceGroupMetadata):
-        """Update seq_len and curr_sliding_window_block for the given
-        sequence data (only required for decoding) if sliding window is
-        enabled.
-        """
-        curr_sliding_window_block = 0
-        sliding_seq_len = inter_data.seq_lens[seq_idx]
-        if not inter_data.is_prompt and self.sliding_window is not None:
-            # TODO(sang): This is a hack to make sliding window work with
-            # paged attn. We can remove it once the paged attn kernel
-            # properly handles sliding window attn.
- curr_sliding_window_block = self.sliding_window_blocks - # number of elements in last block - suff_len = inter_data.seq_lens[seq_idx] % self.block_size - sliding_seq_len = min(inter_data.seq_lens[seq_idx], - self.block_aligned_sliding_window + suff_len) - if suff_len > 0: - curr_sliding_window_block += 1 - - inter_data.curr_sliding_window_blocks[ - seq_idx] = curr_sliding_window_block - inter_data.seq_lens[seq_idx] = sliding_seq_len - - def _compute_lora_input(self, inter_data: InterDataForSeqGroup, - seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """If LoRA is enabled, compute LoRA index and prompt mapping.""" - if not self.enable_lora: - return - lora_id = seq_group_metadata.lora_int_id - if lora_id > 0: - inter_data.lora_requests.add(seq_group_metadata.lora_request) - query_len = inter_data.query_lens[seq_idx] - inter_data.lora_index_mapping.append([lora_id] * query_len) - sampling_params = seq_group_metadata.sampling_params - if sampling_params and sampling_params.prompt_logprobs is not None: - inter_data.lora_prompt_mapping.append([lora_id] * query_len) - elif not self.chunked_prefill_enabled or seq_group_metadata.do_sample: - inter_data.lora_prompt_mapping.append([lora_id]) - else: - inter_data.lora_prompt_mapping.append([]) - - def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, - seq_group_metadata: SequenceGroupMetadata): - """If multi-modal data is given, add it to the input.""" - # NOTE: mm_kwargs only includes the subset of multi-modal items that - # intersect with the current prefill positions. - positions = inter_data.input_positions[0] - mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( - seq_group_metadata, - range(positions[0], positions[0] + len(positions))) - if not mm_kwargs: - return - - inter_data.multi_modal_kwargs = mm_kwargs - inter_data.multi_modal_placeholder_maps = placeholder_maps - - # special processing for mrope position deltas. - if self.runner.model_config.uses_mrope: - image_grid_thw = mm_kwargs.get("image_grid_thw", None) - video_grid_thw = mm_kwargs.get("video_grid_thw", None) - assert image_grid_thw is not None or video_grid_thw is not None, ( - "mrope embedding type requires multi-modal input mapper " - "returns 'image_grid_thw' or 'video_grid_thw'.") - second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) - - hf_config = self.runner.model_config.hf_config - - inter_data.mrope_input_positions = [None] * inter_data.n_seqs - for seq_idx in range(inter_data.n_seqs): - seq_data = seq_group_metadata.seq_data[ - inter_data.seq_ids[seq_idx]] - token_ids = seq_data.get_token_ids() - - mrope_input_positions, mrope_position_delta = \ - MRotaryEmbedding.get_input_positions( - token_ids, - hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=inter_data.context_lens[seq_idx], - seq_len=inter_data.seq_lens[seq_idx], - ) - - seq_data.mrope_position_delta = mrope_position_delta - inter_data.mrope_input_positions[ - seq_idx] = mrope_input_positions - - -class NPUModelRunnerBase(ModelRunnerBase[TModelInputForNPU]): - """ - Helper class for shared methods between NPU model runners. 
- """ - _model_input_cls: Type[TModelInputForNPU] - _builder_cls: Type[ModelInputForNPUBuilder] - - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - return_hidden_states: bool = False, - input_registry: InputRegistry = INPUT_REGISTRY, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - ): - - ModelRunnerBase.__init__(self, vllm_config) - model_config = self.model_config - cache_config = self.cache_config - - self.is_driver_worker = is_driver_worker - self.return_hidden_states = return_hidden_states - - self.device = self.device_config.device - self.pin_memory = is_pin_memory_available() - - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = model_config.get_sliding_window() - self.block_size = cache_config.block_size - self.max_seq_len_to_capture = self.model_config.max_seq_len_to_capture - self.max_batchsize_to_capture = \ - self.vllm_config.compilation_config.max_capture_size - - ascend_config = get_ascend_config() - self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - self.use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph - - self.has_inner_state = model_config.has_inner_state - - self.in_profile_run = False - - self.graph_block_tables = np.zeros( - (self.vllm_config.scheduler_config.max_num_seqs, - (model_config.max_model_len + self.block_size - 1) // - self.block_size), - dtype=np.int32) - - # Attention-free but stateful models like Mamba need a placeholder attn - # backend, as the attention metadata is needed to manage internal state. - # However we must bypass attention selection altogether for some models - # used for speculative decoding to avoid a divide-by-zero in - # model_config.get_head_size() - num_attn_heads = self.model_config.get_num_attention_heads( - self.parallel_config) - needs_attn_backend = (num_attn_heads != 0 - or self.model_config.is_attention_free) - - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - ) if needs_attn_backend else None - if self.attn_backend: - self.attn_state = self.attn_backend.get_state_cls()( - weakref.proxy(self)) - else: - self.attn_state = CommonAttentionState(weakref.proxy(self)) - - # Multi-modal data support - self.input_registry = input_registry - self.mm_registry = mm_registry - - # Lazy initialization - self.model: nn.Module # Set after load_model - # Set after load_model. - self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None - - set_cpu_offload_max_bytes( - int(self.cache_config.cpu_offload_gb * 1024**3)) - - # Used to cache python objects - self.inter_data_cache: Dict[int, PyObjectCache] = {} - - # Using the PythonizationCache in Pipeline-Parallel clobbers the - # SequenceGroupToSample object. In Pipeline-Parallel, we have - # more than 1 Scheduler, resulting in a potential back-to-back - # prepare_model_inputs() call. This clobbers the cached - # SequenceGroupToSample objects, as we reset the cache during - # every prepare_model_inputs() call. 
-        self.sampling_metadata_cache: SamplingMetadataCache = \
-              SamplingMetadataCache() \
-                if self.parallel_config.pipeline_parallel_size == 1 else None
-        self.sampler = get_sampler()
-
-    def get_model(self) -> nn.Module:
-        return self.model
-
-    def load_model(self) -> None:
-        logger.info("Starting to load model %s...", self.model_config.model)
-        with DeviceMemoryProfiler() as m:
-            self.model = get_model(vllm_config=self.vllm_config)
-
-        self.model_memory_usage = m.consumed_memory
-        logger.info("Loading model weights took %.4f GB",
-                    self.model_memory_usage / float(2**30))
-
-        if self.lora_config:
-            assert supports_lora(
-                self.model
-            ), f"{self.model.__class__.__name__} does not support LoRA yet."
-            if supports_multimodal(self.model):
-                logger.warning("Regarding multimodal models, vLLM currently "
-                               "only supports adding LoRA to language model.")
-            # It's necessary to distinguish between the max_position_embeddings
-            # of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = self.model.config.max_position_embeddings
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
-            self.lora_manager = LRUCacheWorkerLoRAManager(
-                self.scheduler_config.max_num_seqs,
-                self.scheduler_config.max_num_batched_tokens,
-                self.vocab_size,
-                self.lora_config,
-                self.device,
-                self.model.embedding_modules,
-                self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
-            )
-            self.model = self.lora_manager.create_lora_manager(self.model)
-
-        # Adapt torch.compile to the NPU backend.
-        if self.torchair_graph_enabled:
-            import torchair  # type: ignore
-            from torchair import patch_for_hcom  # type: ignore
-
-            # Capture communication ops into the graph.
-            patch_for_hcom()
-            # Configure the NPU compiler. If no config is set, the defaults
-            # can be used by passing npu_backend="npu".
-            config = torchair.CompilerConfig()
-            config.experimental_config.frozen_parameter = True
-            config.experimental_config.tiling_schedule_optimize = True
-            config.experimental_config.enable_view_optimize = \
-                get_ascend_config().torchair_graph_config.enable_view_optimize
-            torch.npu.set_compile_mode(jit_compile=False)
-            if not self.use_cached_npu_graph:
-                npu_backend = torchair.get_npu_backend(compiler_config=config)
-                self.compile_model = torch.compile(
-                    self.model,
-                    dynamic=True,
-                    fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                    backend=npu_backend)
-            else:
-                self.compile_model = torchair.inference.cache_compile(
-                    self.model.forward,
-                    dynamic=True,
-                    fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                    config=config,
-                    ge_cache=False)
-
-    def save_sharded_state(
-        self,
-        path: str,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
-    ) -> None:
-
-        from vllm.model_executor.model_loader import ShardedStateLoader
-        ShardedStateLoader.save_model(
-            self.model,
-            path,
-            pattern=pattern,
-            max_size=max_size,
-        )
-
-    def save_tensorized_model(
-        self,
-        tensorizer_config: TensorizerConfig,
-    ) -> None:
-
-        from vllm.model_executor.model_loader import \
-            TensorizerLoader  # type: ignore # noqa
-        TensorizerLoader.save_model(
-            self.model,
-            tensorizer_config=tensorizer_config,
-        )
-
-    def get_max_block_per_batch(self) -> int:
-        block_size = self.block_size
-        return (self.max_seq_len_to_capture + block_size - 1) // block_size
-
-    def _prepare_model_input_tensors(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        finished_requests_ids: Optional[List[str]] = None
-    ) -> TModelInputForNPU:
-        """Helper method to prepare the model input based on a given sequence
-        group.
Prepares metadata needed for the base model forward pass but not - metadata for possible additional steps, e.g., sampling. - - The API assumes seq_group_metadata_list is sorted by prefill -> decode. - - The result tensors and data structure also batches input in prefill - -> decode order. For example, - - - input_tokens[:num_prefill_tokens] contains prefill tokens. - - input_tokens[num_prefill_tokens:] contains decode tokens. - """ - builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) - builder.prepare(finished_requests_ids) - for seq_group_metadata in seq_group_metadata_list: - builder.add_seq_group(seq_group_metadata) - - builder.reset_cached_inter_data() - - return builder.build() # type: ignore - - @contextmanager - def set_in_profile_run(self): - self.in_profile_run = True - try: - yield - finally: - self.in_profile_run = False - - @torch.inference_mode() - def profile_run(self) -> None: - with self.set_in_profile_run(): - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = \ - SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = \ - self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - - # Profile memory usage with max_num_sequences sequences and the - # total number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] - # Additional GPU memory may be needed for multi-modal encoding, - # which needs to be accounted for when calculating the GPU blocks - # for vLLM blocker manager. - # To exercise the worst scenario for GPU memory consumption, - # the number of seqs (batch_size) is chosen to maximize the number - # of images processed. - - max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( - self.model_config) - if max_mm_tokens > 0: - max_num_seqs_orig = max_num_seqs - max_num_seqs = min(max_num_seqs, - max_num_batched_tokens // max_mm_tokens) - if max_num_seqs < 1: - expr = (f"min({max_num_seqs_orig}, " - f"{max_num_batched_tokens} // {max_mm_tokens})") - logger.warning( - "Computed max_num_seqs (%s) to be less than 1. " - "Setting it to the minimum value of 1.", expr) - max_num_seqs = 1 - - batch_size = 0 - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - batch_size += seq_len - - dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry) - - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: dummy_data.seq_data}, - sampling_params=sampling_params, - block_tables=None, - lora_request=None, - multi_modal_data=dummy_data.multi_modal_data, - multi_modal_placeholders=dummy_data. - multi_modal_placeholders, - ) - seqs.append(seq) - - # Run the model with the dummy inputs. - num_layers = self.model_config.get_num_layers(self.parallel_config) - # use an empty tensor instead of `None`` to force Dynamo to pass - # it by reference, rather by specializing on the value ``None``. - # the `dtype` argument does not matter, and we use `float32` as - # a placeholder (it has wide hardware support). - # it is important to create tensors inside the loop, rather than - # multiplying the list, to avoid Dynamo from treating them as - # tensor aliasing. 
- kv_caches = [ - torch.tensor([], dtype=torch.float32, device=self.device) - for _ in range(num_layers) - ] - finished_requests_ids = [seq.request_id for seq in seqs] - model_input = self.prepare_model_input( - seqs, finished_requests_ids=finished_requests_ids) - intermediate_tensors = None - if not get_pp_group().is_first_rank: - intermediate_tensors = \ - self.model.make_empty_intermediate_tensors( - batch_size=batch_size, - dtype=self.model_config.dtype, - device=self.device) - - self.execute_model(model_input, kv_caches, intermediate_tensors) - torch.npu.synchronize() - return - - def remove_all_loras(self): - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.remove_all_adapters() - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.set_active_adapters(lora_requests, lora_mapping) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.add_adapter(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_adapter(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.pin_adapter(lora_id) - - def list_loras(self) -> Set[int]: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.list_adapters() - - def remove_all_prompt_adapters(self): - raise RuntimeError("PromptAdapter is not supported on NPU now.") - - def set_active_prompt_adapters( - self, prompt_adapter_requests: Set[PromptAdapterRequest], - prompt_adapter_mapping: PromptAdapterMapping) -> None: - raise RuntimeError("PromptAdapter is not supported on NPU now.") - - def add_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise RuntimeError("PromptAdapter is not supported on NPU now.") - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise RuntimeError("PromptAdapter is not supported on NPU now.") - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise RuntimeError("PromptAdapter is not supported on NPU now.") - - def list_prompt_adapters(self) -> Set[int]: - raise RuntimeError("PromptAdapter is not supported on NPU now.") - - @property - def vocab_size(self) -> int: - return self.model_config.get_vocab_size() - - -class NPUModelRunner(NPUModelRunnerBase[ModelInputForNPUWithSamplingMetadata]): - """ - NPU model runner with sampling step. 
- """ - _model_input_cls: Type[ModelInputForNPUWithSamplingMetadata] = ( - ModelInputForNPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForNPUBuilder] = ModelInputForNPUBuilder - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> ModelInputForNPUWithSamplingMetadata: - model_input = \ - ModelInputForNPUWithSamplingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - return model_input - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, - ) -> ModelInputForNPUWithSamplingMetadata: - """Prepare the model input based on a given sequence group, including - metadata for the sampling step. - The API assumes seq_group_metadata_list is sorted by prefill -> decode. - The result tensors and data structure also batches input in prefill - -> decode order. For example, - - input_tokens[:num_prefill_tokens] contains prefill tokens. - - input_tokens[num_prefill_tokens:] contains decode tokens. - """ - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - if get_pp_group().is_last_rank: - # Sampling metadata is only required for the final pp group - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - model_input.seq_lens, - model_input.query_lens, - self.device, - self.pin_memory, - generators, - self.sampling_metadata_cache, - # TODO (cmq): enable this after supported in vllm - # pad_for_invariant_seq_len=True, - ) - # Get hash value of request id list to perform sampling param cache in sampler. - request_ids = model_input.request_ids_to_seq_ids.keys( # type: ignore - ) # type: ignore - request_ids_hash = hash("".join(request_ids)) - sampling_metadata.request_ids_hash = request_ids_hash # type: ignore - else: - sampling_metadata = None - is_prompt = (seq_group_metadata_list[0].is_prompt - if seq_group_metadata_list else None) - return dataclasses.replace(model_input, - sampling_metadata=sampling_metadata, - is_prompt=is_prompt, - virtual_engine=virtual_engine) - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - **kwargs, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - if num_steps > 1: - raise ValueError("num_steps > 1 is not supported in ModelRunner") - - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - self.attn_state.begin_forward(model_input) - - assert model_input.attn_metadata is not None - # TODO(zzzzwwjj): Do we need to do it every time? - if self.torchair_graph_enabled: - torch._dynamo.mark_static(model_input.input_tokens) - torch._dynamo.mark_static(model_input.input_positions) - torch._dynamo.mark_static(model_input.attn_metadata.block_tables) - torch._dynamo.mark_static(model_input.attn_metadata.slot_mapping) - for kv in kv_caches: - if isinstance(kv, tuple): - torch._dynamo.mark_static(kv[0]) - torch._dynamo.mark_static(kv[1]) - - # TODO(andoorve): We can remove this once all - # virtual engines share the same kv cache. 
-        virtual_engine = model_input.virtual_engine
-        prefill_meta = model_input.attn_metadata.prefill_metadata
-        previous_hidden_states = kwargs.get("previous_hidden_states")
-        if prefill_meta is None and self.torchair_graph_enabled:
-            model_executable = self.compile_model
-            # Note: the graph_batch_size value differs from the GPU path.
-            graph_batch_size = model_input.input_tokens.shape[  # type: ignore
-                0]  # type: ignore
-            # Note: previous_hidden_states may be None, unlike the GPU path.
-            if previous_hidden_states is not None:
-                previous_hidden_states = torch.cat([
-                    previous_hidden_states,
-                    torch.empty([
-                        graph_batch_size - previous_hidden_states.shape[0],
-                        *previous_hidden_states.shape[1:]
-                    ],
-                                dtype=previous_hidden_states.dtype,
-                                device=previous_hidden_states.device)
-                ])
-        else:
-            model_executable = self.model
-
-        # Receive the KV cache in the distributed KV cache transfer setting.
-        # In the disaggregated prefill setting, this also receives hidden
-        # states and bypasses model forwarding.
-        # In the KV cache database setting, it changes the model input so
-        # that prefilling can be skipped for tokens whose KV caches were
-        # successfully received.
-        # NOTE: The receive operation is blocking.
-        bypass_model_exec = False
-        if self.need_recv_kv(model_input, kv_caches):
-            hidden_or_intermediate_states, bypass_model_exec, model_input = \
-                get_kv_transfer_group().recv_kv_caches_and_hidden_states(
-                    # model is used to know which layer the current worker
-                    # is working on, so that we can receive KV for only those
-                    # layers.
-                    model_executable,
-                    model_input,
-                    kv_caches=kv_caches
-                )
-
-        if get_dp_group().world_size > 1:
-            bypass_model_exec_tensor = torch.tensor(
-                1, dtype=torch.int32) if bypass_model_exec else torch.tensor(
-                    0, dtype=torch.int32)
-            torch.distributed.all_reduce(bypass_model_exec_tensor,
-                                         op=torch.distributed.ReduceOp.MIN,
-                                         group=get_dp_group().cpu_group)
-            # If any DP group has not received the necessary hidden states or
-            # KV cache, force all DP groups to execute the model.
-            if bypass_model_exec_tensor.item() == 0:
-                bypass_model_exec = False
-
-        multi_modal_kwargs = model_input.multi_modal_kwargs or {}
-        seqlen_agnostic_kwargs = {
-            "finished_requests_ids": model_input.finished_requests_ids,
-            "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids,
-        } if self.has_inner_state else {}
-
-        if self.torchair_graph_enabled:
-            model_kwargs: Dict[str, Any] = {"inputs_embeds": None}
-        else:
-            model_kwargs = {}
-        if previous_hidden_states is not None:
-            model_kwargs["previous_hidden_states"] = previous_hidden_states
-
-        if (self.observability_config is not None
-                and self.observability_config.collect_model_forward_time):
-            model_forward_start = torch.npu.Event(enable_timing=True)
-            model_forward_end = torch.npu.Event(enable_timing=True)
-            model_forward_start.record()
-
-        if not bypass_model_exec:
-            with set_forward_context(model_input.attn_metadata,
-                                     self.vllm_config, virtual_engine):
-                if model_input.attn_metadata is not None:
-                    model_input.attn_metadata.input_positions = model_input.input_positions
-                if self.torchair_graph_enabled:
-                    model_kwargs["kv_caches"] = kv_caches
-                    model_kwargs["attn_metadata"] = model_input.attn_metadata
-                hidden_or_intermediate_states = model_executable(
-                    input_ids=model_input.input_tokens,
-                    inputs_embeds=model_input.inputs_embeds,
-                    positions=model_input.input_positions,
-                    intermediate_tensors=intermediate_tensors,
-                    **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
-                                                 device=self.device),
-                    **seqlen_agnostic_kwargs,
-                    **model_kwargs)
-
-        # Compute the logits in the last pipeline stage.
- if not get_pp_group().is_last_rank: - if (self.is_driver_worker - and hidden_or_intermediate_states is not None - and isinstance(hidden_or_intermediate_states, - IntermediateTensors) - and self.observability_config is not None and - self.observability_config.collect_model_forward_time): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - hidden_or_intermediate_states.tensors[ - "model_forward_time"] = ( - torch.tensor(model_forward_time + - orig_model_forward_time)) - return hidden_or_intermediate_states - - logits = self.model.compute_logits(hidden_or_intermediate_states, - model_input.sampling_metadata) - - # Sending KV cache in distributed KV cache transfer setting - if self.need_send_kv(model_input, kv_caches): - get_kv_transfer_group().send_kv_caches_and_hidden_states( - # model_executable is used to know which layer the current - # worker is working on, so that we can send KV for only those - # layers. - model_executable, - model_input, - kv_caches, - hidden_or_intermediate_states, - ) - - if self.is_driver_worker: - if model_input.async_callback is not None: - model_input.async_callback() - - # Sample the next token. - assert isinstance(self.sampler, Sampler) - orig_include_gpu_probs = self.sampler.include_gpu_probs_tensor - if model_input.inputs_embeds is not None: - self.sampler.include_gpu_probs_tensor = True - - output: SamplerOutput = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time - and output is not None): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - # If there are multiple workers, we are still tracking the - # latency from the start time of the driver worker to the end - # time of the driver worker. The model forward time will then - # end up covering the communication time as well. 
- output.model_forward_time = (orig_model_forward_time + - model_forward_time) - - if model_input.inputs_embeds is not None: - if self.is_driver_worker: - sampled = broadcast_tensor_dict( - {"token_ids": output.sampled_token_ids}) - else: - sampled = broadcast_tensor_dict() - if sampled["token_ids"] is not None: - sampled_token_embeds = self.model.get_input_embeddings( - sampled["token_ids"].squeeze(1)) - if self.is_driver_worker: - self.sampler.include_gpu_probs_tensor = \ - orig_include_gpu_probs - - output.sampled_token_embeds = sampled_token_embeds - - for token_embed, sequence_group_output in zip( - output.sampled_token_embeds, output.outputs): - assert len(sequence_group_output.samples) == 1 - sequence_group_output.samples[ - 0].output_embed = token_embed - - if not self.is_driver_worker: - return [] - - if self.return_hidden_states: - # we only need to pass hidden states of most recent token - assert model_input.sampling_metadata is not None - indices = model_input.sampling_metadata.selected_token_indices - if model_input.is_prompt: - hidden_states = hidden_or_intermediate_states.index_select( - 0, indices) - output.prefill_hidden_states = hidden_or_intermediate_states - elif self.torchair_graph_enabled: - hidden_states = hidden_or_intermediate_states[:len(indices)] - else: - hidden_states = hidden_or_intermediate_states - - output.hidden_states = hidden_states - - return [output] - - def need_recv_kv(self, model_input, kv_caches) -> bool: - """Check if we need to receive kv-cache from the other worker. - We need to receive KV when - 1. current vLLM instance is KV cache consumer/decode vLLM instance - 2. this batch is not a profiling run - 3. this batch is a prefill run - - Args: - model_input: input to the model executable - kv_caches: vLLM's paged memory - """ - - if self.vllm_config.kv_transfer_config is None: - return False - - prefill_meta = model_input.attn_metadata.prefill_metadata - - # check if the current run is profiling - is_profile_run = (kv_caches[0].numel() == 0) - # check if the current run is prefill - is_prefill_run = prefill_meta is not None - - return self.vllm_config.kv_transfer_config.is_kv_consumer and ( - not is_profile_run) and is_prefill_run - - def need_send_kv(self, model_input, kv_caches) -> bool: - """Check if we need to send kv-cache to the other worker. - We need to send KV when - 1. current vLLM instance is KV cache producer/prefill vLLM instance - 2. this batch is not a profiling run - 3. 
this batch is a prefill run - - Args: - model_input: input to the model executable - kv_caches: vLLM's paged memory - """ - - if self.vllm_config.kv_transfer_config is None: - return False - - prefill_meta = model_input.attn_metadata.prefill_metadata - - # check if the current run is profiling - is_profile_run = (kv_caches[0].numel() == 0) - # check if the current run is prefill - is_prefill_run = prefill_meta is not None - - return self.vllm_config.kv_transfer_config.is_kv_producer and ( - not is_profile_run) and is_prefill_run diff --git a/vllm_ascend/worker/multi_step_runner.py b/vllm_ascend/worker/multi_step_runner.py deleted file mode 100644 index 028bcd05df..0000000000 --- a/vllm_ascend/worker/multi_step_runner.py +++ /dev/null @@ -1,737 +0,0 @@ -import dataclasses -import functools -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Union) - -import torch -from torch import nn -from vllm.distributed import get_pp_group -from vllm.model_executor.layers.sampler import (PromptLogprobs, SampleLogprobs, - SamplerOutput, - SamplingMetadata, get_logprobs, - get_pythonized_sample_results) -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - Logprob, SequenceGroupMetadata, SequenceOutput) -from vllm.worker.model_runner_base import ( - _init_attn_metadata_from_tensor_dict, - _init_frozen_model_input_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) -from vllm.worker.multi_step_model_runner import (ModelOutput, - PythonizationCache, - StatefulModelInput) - -from vllm_ascend.utils import current_stream -from vllm_ascend.worker.model_runner import ( - ModelInputForNPUWithSamplingMetadata, NPUModelRunnerBase) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - - -@dataclass(frozen=False) -class StatefulModelInputForNPU(StatefulModelInput): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def record_step_event(self, current_stream: torch.npu.Stream): - # record the event for the current step so that the next step can sync - # on it. We modulo by 2 to keep the events in a circular buffer and - # support any attn backends that may be supported in the future. ie - # Flashinfer would want two DecodeWrappers to overlap the CPU and NPU. - self.step_cuda_events[self.current_step & 1] = \ - torch.npu.Event(blocking=True) - self.step_cuda_events[self.current_step & 1].record(current_stream) - - # actual frozen model input dataclass passed to _base_model_runner - frozen_model_input: Optional[ModelInputForNPUWithSamplingMetadata] = None - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "StatefulModelInputForNPU": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - tensor_dict = _init_frozen_model_input_from_tensor_dict( - ModelInputForNPUWithSamplingMetadata, tensor_dict) - return cls(**tensor_dict) - - def maybe_advance_frozen_model_input(self, device: str, pin_memory: bool): - """ - Advancing the datastructures of StatefulModelInput::frozen_model_input - is only required when prefills are scheduled with decodes to run in - multi-step. 
This advancement/correction is required to account for - the conversion of Prefills to Decodes after the first multi-step. - """ - if self.current_step != 1 or self.num_single_step_prefills == 0: - return - - assert self.frozen_model_input is not None - fmi = self.frozen_model_input - - # Truncate input_tokens - assert fmi.input_tokens is not None - assert fmi.input_tokens.shape[0] >= self.num_seqs - fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs] - - # Update frozen_model_input::input_positons. - assert fmi.input_positions is not None - assert fmi.input_positions.shape[0] >= self.num_seqs - fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self. - num_seqs] - - # Assert unsupported - # TODO Uncomment the following codes when NPU supported - # assert fmi.lora_mapping is None - # assert fmi.lora_requests is not None - # assert len(fmi.lora_requests) == 0 - # assert fmi.prompt_adapter_mapping is None - # assert fmi.prompt_adapter_requests is not None - # assert len(fmi.prompt_adapter_requests) == 0 - assert fmi.attn_metadata is not None - assert fmi.multi_modal_kwargs is not None - assert len(fmi.multi_modal_kwargs) == 0 - - self.frozen_model_input = dataclasses.replace( - self.frozen_model_input, - input_tokens=fmi_new_input_tokens, - input_positions=fmi_new_input_positions) - - self.maybe_advance_sampling_metadata(device, pin_memory) - - -@dataclass(frozen=False) -class NPUModelOutput(ModelOutput): - - logprobs: Optional["torch.Tensor"] = None - - def _pythonize_sampler_output(self, input_metadata: "StatefulModelInput", - copy_stream: torch.npu.Stream, - pinned_sampled_token_buffer: torch.Tensor, - blocking: bool) -> bool: - """ - If blocking is set, will block until the forward pass for the output is - ready and pythonize the output. Upon completing Pythonization, erases - self.logprobs (note that a non-blocking call that is performed when - the sampler output is not yet ready, will not erase self.logprobs.) - """ - assert self.sampled_token_ids is not None - if not blocking and not self.sampler_output_ready_event.query(): - return False - - if blocking: - self.sampler_output_ready_event.synchronize() - with torch.npu.stream(copy_stream): - _pythonize_sampler_output(input_metadata, self.sampler_output, - pinned_sampled_token_buffer, - self.sampled_token_ids, self.logprobs, - self.pythonization_cache) - - # Erase the logprobs GPU-side tensor. - # Note that although _pythonize_sampler_output() runs in its - # own CUDA stream, nonetheless _pythonize_sampler_output() - # cannot return until Pythonization is complete; therefore - # we know that by the time the CPU reaches this point, - # `self.logprobs` is no longer needed. - self.logprobs = None - return True - - -class MultiStepModelNPURunner(NPUModelRunnerBase[StatefulModelInputForNPU]): - # mypy: enable-error-code=type-var - - def __init__(self, base_model_runner: NPUModelRunnerBase, *args, **kwargs): - super().__init__(*args, **kwargs) - - # uses the base model runner to execute the model and wraps it with - # multi-step logic - self._base_model_runner: NPUModelRunnerBase = base_model_runner - - self.is_multi_step = self.scheduler_config.is_multi_step - self.pinned_sampled_token_ids: Optional[torch.Tensor] = None - - # Using the PythonizationCache in Pipeline-Parallel clobbers the - # SequenceOutput and CompletionSequenceGroupOutput object. - # When cache-reset happens at the last step of a multi-step - # execution, there may be other on-going single-step/multi-step - # executions. 
The current caching implementation does not check - # for this. - self.pythonization_cache = PythonizationCache() \ - if self.parallel_config.pipeline_parallel_size == 1 else None - - def get_model(self) -> nn.Module: - return self._base_model_runner.get_model() - - @functools.cached_property - def _copy_stream(self): - # used to copy tensors from NPU to CPU asynchronously - return torch.npu.Stream() - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> StatefulModelInputForNPU: - model_input = (StatefulModelInputForNPU.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - )) - return model_input - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> StatefulModelInputForNPU: - frozen_model_input: ModelInputForNPUWithSamplingMetadata = \ - self._base_model_runner.prepare_model_input( - seq_group_metadata_list, - virtual_engine, - finished_requests_ids) - - assert frozen_model_input.query_lens is not None - assert frozen_model_input.seq_lens is not None - assert frozen_model_input.attn_metadata is not None - num_queries = len(frozen_model_input.query_lens) - num_seqs = len(frozen_model_input.seq_lens) - num_single_step_prefills = frozen_model_input.attn_metadata.num_prefills - - model_input = StatefulModelInputForNPU( - frozen_model_input=frozen_model_input, - num_seqs=num_seqs, - num_queries=num_queries, - num_single_step_prefills=num_single_step_prefills, - step_cuda_events=[torch.npu.Event(blocking=True)] * 2, - ) - - return model_input - - def _async_process_outputs(self, model_input: StatefulModelInputForNPU, - output_proc_callback: Callable): - # Proceed with pythonization and output_proc in order. 
- # Stop on the first one that fails to pythonize - output_proc_callback() - - cont = True - for step_num, model_output in enumerate(model_input.cached_outputs): - if not model_output.pythonized: - model_output.maybe_pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - if model_output.pythonized: - ctx = output_proc_callback.keywords["ctx"] # type: ignore - ctx.append_output( - outputs=[model_output.sampler_output], - seq_group_metadata_list=ctx.seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=step_num == 0) - - output_proc_callback() - else: - cont = False - - if not cont: - break - - def _final_process_outputs( - self, model_input: StatefulModelInputForNPU, - output_proc_callback: Optional[Callable]) -> List[SamplerOutput]: - assert model_input.frozen_model_input is not None - - has_async_callback = output_proc_callback is not None - - outputs = [] - for step_num, output in enumerate(model_input.cached_outputs): - is_last_step = step_num == len(model_input.cached_outputs) - 1 - - # For non-async case: - # -- We simply add the outputs - # For async case: - # -- Invoke callback, pythonize, add to callback queue and repeat - # -- For last output, just add to callback queue - if has_async_callback: - assert output_proc_callback is not None - - # Invoke callback before pythonize (to overlap with NPU) - output_proc_callback() - - # Pythonize - if not output.pythonized: - output.pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - - # For non last step, add to callback queue to chain - # callbacks=>pythonize pairs (for NPU overlap) - if not is_last_step: - ctx = output_proc_callback.keywords[ # type: ignore - "ctx"] # type: ignore - ctx.append_output( - outputs=[output.sampler_output], - seq_group_metadata_list=ctx. - seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=step_num == 0) - else: - outputs.append(output.sampler_output) - else: - output.pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - outputs.append(output.sampler_output) - - return outputs - - @torch.inference_mode() - def execute_model( - self, - model_input: StatefulModelInputForNPU, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - """ - Execute the model for a single step and update multi-step - metadata - """ - assert num_steps == 1, "MultiStepModelRunner only supports num_steps=1" - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - # path for warm up runs - if not model_input.is_multi_step: - return self._base_model_runner.execute_model( - frozen_model_input, kv_caches, intermediate_tensors, num_steps) - - # make sure we skip the sampler on the lask rank and only pythonize - # if CPU is ahead. 
- if self.is_driver_worker and get_pp_group().is_last_rank: - if self.pinned_sampled_token_ids is None: - self.pinned_sampled_token_ids = torch.zeros( - (self.scheduler_config.max_num_seqs, 1), - dtype=torch.long, - device="cpu", - pin_memory=True) - self._base_model_runner.sampler.include_gpu_probs_tensor = True - if frozen_model_input.sampling_metadata: - frozen_model_input.sampling_metadata.skip_sampler_cpu_output = ( - True) - - # some pre-execute model logic for multi-step: - # - if it's the first step, we need to reset the sampling tensors - # - if it's not the first step, we need to advance the step using the - # appended sampler output from last iteration - # - also maybe pythonize if CPU is ahead of NPU - - stream = current_stream() - if not model_input.is_first_multi_step: - # Explicitly block on the previous step's forward to make sure we - # don't clobber any NPU tensors still in use. - # This is not needed for flashattn backend, but for other attn - # backends such as flashinfer that performs extra CPU operations on - # input metadata we may need to synchronize any CPU operations that - # might clobber enqueued forwards. (prevents CPU from running too - # far ahead if needed) - model_input.wait_previous_step() - model_input = self._advance_step( - model_input, model_input.cached_outputs[-1].sampler_output) - - # frozen_model_input may have been updated - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - if model_input.base_output_proc_callback is None: - assert frozen_model_input is not None - model_input.base_output_proc_callback = \ - frozen_model_input.async_callback - - if frozen_model_input.async_callback is not None: - assert model_input.base_output_proc_callback is not None - async_callback = functools.partial( - self._async_process_outputs, - model_input=model_input, - output_proc_callback=model_input.base_output_proc_callback) - - model_input.frozen_model_input = dataclasses.replace( # type: ignore - model_input.frozen_model_input, - async_callback=async_callback) - # Update the local instance - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - # Execute the model - output = self._base_model_runner.execute_model(frozen_model_input, - kv_caches, - intermediate_tensors, - num_steps=1) - - # record the event for the current step so that the next step can sync - model_input.record_step_event(stream) - - if get_pp_group().is_last_rank and self.is_driver_worker: - assert isinstance(output, list) - assert len( - output - ) == 1, "MultiStepModelRunner requires single-step base_models" - - # event for the pythonization so that we only pythonize if the - # tensors are ready. May be able to be combined with the step event - output_ready_event = torch.npu.Event() - output_ready_event.record(stream) - if self.parallel_config.pipeline_parallel_size > 1: - output[0].sampled_token_ids_cpu = output[ - 0].sampled_token_ids.cpu() - model_input.cached_outputs.append( - NPUModelOutput(output[0], output_ready_event, - output[0].sampled_token_ids, False, - output[0].logprobs, self.pythonization_cache)) - - # These NPU tensors are not required by multi-step; - # erase them to ensure they are not pythonized or - # transferred to CPU - output[0].sampled_token_ids = None - output[0].sampled_token_probs = None - output[0].logprobs = None - - # Pythonize the output if CPU is ahead and the previous step is - # ready. 
- if frozen_model_input.async_callback is None: - for model_output in model_input.cached_outputs: - model_output.maybe_pythonize(model_input, - self._copy_stream, - self.pinned_sampled_token_ids) - - model_input.current_step += 1 - - if not get_pp_group().is_last_rank: - # Should be IntermediateTensors - assert isinstance(output, IntermediateTensors) - return output - if not self.is_driver_worker: - return [] - - # Pythonize the output and block if needed since it is the last step - if model_input.is_last_step: - outputs = self._final_process_outputs( - model_input, model_input.base_output_proc_callback) - if self.pythonization_cache: - self.pythonization_cache.reset() - return outputs - - # should be [SamplerOutput] - return output - - def _update_sampling_metadata(self, sampling_metadata: SamplingMetadata, - num_seqs: Optional[int], num_queries: int): - - assert sampling_metadata.num_prompts == 0 - assert len(sampling_metadata.seq_groups) == num_queries - assert sampling_metadata.selected_token_indices.shape == ( - num_queries, ) - # assert sampling_metadata.categorized_sample_indices == TODO: Add if needed # noqa: E501 - - # Verify that all sequences are decodes - for i in range(num_queries): - seq_group = sampling_metadata.seq_groups[i] - - assert seq_group.is_prompt is False # No prompt - assert seq_group.prompt_logprob_indices == [] # No prompt - assert seq_group.sample_indices == [i] # Simple - assert seq_group.seq_len is None # Decode - assert seq_group.query_len is None # Decode - - def _advance_step(self, model_input: StatefulModelInputForNPU, - out: SamplerOutput) -> StatefulModelInputForNPU: - - model_input.maybe_advance_frozen_model_input(self.device, - self.pin_memory) - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - assert frozen_model_input.input_tokens is not None - assert frozen_model_input.input_tokens.shape[0] == model_input.num_seqs - assert frozen_model_input.attn_metadata is not None - - sampled_token_ids = model_input.cached_outputs[-1].sampled_token_ids - num_seqs = model_input.num_seqs - num_queries = model_input.num_queries - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - attn_metadata = frozen_model_input.attn_metadata - assert attn_metadata is not None - - turn_prefills_into_decodes: bool = model_input.current_step == 1 and \ - model_input.num_single_step_prefills != 0 - attn_metadata.advance_step( - frozen_model_input, - sampled_token_ids, - self.block_size, - num_seqs, - num_queries, - turn_prefills_into_decodes=turn_prefills_into_decodes) - - return model_input - - def load_model(self) -> None: - self._base_model_runner.load_model() - self.model_memory_usage = self._base_model_runner.model_memory_usage - - def save_sharded_state( - self, - path: str, - pattern: Optional[str] = None, - max_size: Optional[int] = None, - ) -> None: - return self._base_model_runner.save_sharded_state( - path, pattern, max_size) - - def save_tensorized_model(self, - tensorizer_config: TensorizerConfig) -> None: - return self._base_model_runner.save_tensorized_model(tensorizer_config) - - def profile_run(self) -> None: - return self._base_model_runner.profile_run() - - def remove_all_loras(self): - return self._base_model_runner.remove_all_loras() - - def capture_model(self, kv_caches: List[List]) -> None: - return self._base_model_runner.capture_model(kv_caches) - - @property - def vocab_size(self) -> int: - return self._base_model_runner.vocab_size - - -DeferredLogprobsReturnType = 
Tuple[Optional[List[Optional[PromptLogprobs]]], - Optional[List[SampleLogprobs]]] - - -def deferred_pythonize_logprobs( - output: SamplerOutput, - sampling_metadata: SamplingMetadata, - logprobs_tensor: Optional[torch.Tensor], -) -> DeferredLogprobsReturnType: - """Perform deferred logprob Pythonization. - - 1. Pythonize NPU-side sampler result tensors into CPU-side sampler result. - 2. Pythonize NPU-side logprobs tensor into CPU-side logprobs lists, - utilizing the Pythonized sampler result computed in step 1. - - These deferred computations are not required for single-step scheduling - or the `profile_run()` phase of multi-step scheduling. - - Args: - output: sampler output (under deferred Pythonization) - sampling_metadata - - Returns: - prompt_logprobs (CPU), sample_logprobs (CPU) - """ - - # - Deferred pythonization of sample result - sampler_result = get_pythonized_sample_results( - output.deferred_sample_results_args) - - # - Erase the NPU-side deferred sample_result - # computation args to ensure it is never - # pythonized or transferred to CPU - output.deferred_sample_results_args = None - - # - Deferred pythonization of logprobs - ( - prompt_logprobs, - sample_logprobs, - ) = get_logprobs(logprobs_tensor, sampling_metadata, sampler_result) - assert len(prompt_logprobs) == len(sampling_metadata.seq_groups) - assert len(sample_logprobs) == len(sampling_metadata.seq_groups) - - return prompt_logprobs, sample_logprobs - - -def _pythonize_sampler_output( - model_input: StatefulModelInputForNPU, - output: SamplerOutput, - pinned_sampled_token_buffer: torch.Tensor, - sampled_token_ids: torch.Tensor, - logprobs_tensor: Optional[torch.Tensor], - cache: Optional[PythonizationCache], -) -> None: - """ This function is only called when the output tensors are ready. - See :class:`ModelOutput`. - - Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, - adding a Pythonized output data structure - (:class:`CompletionSequenceGroupOutput`) for each :class:`SequenceGroup`. - - Args: - model_input - output: sampler output - pinned_sampled_token_token_buffer: CPU-side pinned memory - (receives copy of - NPU-side token buffer.) - sampled_token_ids: NPU-side token buffer - logprobs_tensor: NPU-side tensor containing - logprobs computed during sampling - """ - - assert model_input.frozen_model_input is not None - - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input.sampling_metadata is not None - sampling_metadata = frozen_model_input.sampling_metadata - # samples generation should have been skipped - assert not output.outputs - - pinned_buffer = pinned_sampled_token_buffer[:model_input.num_queries] - - # We guarantee output tensors are ready, so it is safe to - # pythonize the sampler output & obtain CPU-side logprobs. - # - # However we should check whether logprobs pythonization may - # be skipped entirely, i.e. because no logprobs were requested - # or pythonization was not deferred. To that end, - # - # * `prompt_logprobs_are_requested_for_prefill` signals that - # there are *any* prefill-phase requests which specify that - # prompt logprobs should be returned. - # - # * `any_logprobs_are_requested` signals that there are any - # requests which (1) specify that sample logprobs should be - # returned, or (2) are in the prefill phase AND specify that - # prompt logprobs should be returned. - # - # Later on, these flags cause adjustments to the pythonization - # process to accommodate logprobs. 
- - seq_groups = sampling_metadata.seq_groups - prompt_logprobs_are_requested_for_prefill = any([ - sg.sampling_params.prompt_logprobs is not None and sg.is_prompt - for sg in seq_groups - ]) - any_logprobs_are_requested = ( - prompt_logprobs_are_requested_for_prefill - or any([sg.sampling_params.logprobs is not None for sg in seq_groups])) - - if prompt_logprobs_are_requested_for_prefill: - # CPU NPU sync, after gathering *only* sampled tokens (since - # requesting prompt logprobs leads `sampled_token_ids` to - # include prompt token ids in addition to sampled token ids.) - sample_idx_tensor = torch.tensor( - [sdx for sg in seq_groups for sdx in sg.sample_indices]) - pinned_buffer = pinned_buffer.copy_( - sampled_token_ids[sample_idx_tensor, :], non_blocking=False) - else: - # CPU NPU sync - pinned_buffer = pinned_buffer.copy_(sampled_token_ids, - non_blocking=False) - - # this will not block as the tensors are already on CPU - samples_list = pinned_buffer.tolist() - - skip_sampler_cpu_output = ( - frozen_model_input.sampling_metadata.skip_sampler_cpu_output) - - # *Don't* skip logprobs pythonization *if*: - # * Any requests require logprobs to be returned in this - # iteration AND - # * These requests are being scheduled in a fashion which - # defers pythonization (i.e. multi-step scheduling.) - do_pythonize_logprobs = (skip_sampler_cpu_output - and any_logprobs_are_requested) - ( - prompt_logprobs, - sample_logprobs, - ) = (deferred_pythonize_logprobs(output, sampling_metadata, - logprobs_tensor) - if do_pythonize_logprobs else (None, None)) - - for sgdx, (seq_group, - sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/source/features/compatibility_matrix.md - # If the feature combo become valid - # (Check for Guided Decoding) - if seq_group.sampling_params.logits_processors: - assert len(seq_group.sampling_params.logits_processors) == 0, ( - "Logits Processors are not supported in multi-step decoding") - - if do_pythonize_logprobs: - assert prompt_logprobs is not None - assert sample_logprobs is not None - - ( - group_prompt_logprobs, - group_sample_logprobs, - ) = ( # Utilize deferred pythonization results - prompt_logprobs[sgdx], - sample_logprobs[sgdx], - ) - elif any_logprobs_are_requested: - ( - group_prompt_logprobs, - group_sample_logprobs, - ) = ( - # profile_run: use already-computed logprobs - output.outputs[sgdx].prompt_logprobs, - [sample.logprobs for sample in output.outputs[sgdx].samples]) - - seq_ids = seq_group.seq_ids - next_token_ids = sample_result - parent_ids = [0] - seq_outputs: List[SequenceOutput] - - if cache is not None: - completion_seq_group_output: CompletionSequenceGroupOutput = \ - cache.cached_completion_seq_group_output.get_object() - completion_seq_group_output.samples.clear() - seq_outputs = completion_seq_group_output.samples - else: - seq_outputs = [] - - for tdx, (parent_id, - next_token_id) in enumerate(zip(parent_ids, next_token_ids)): - if cache is not None: - seq_output: SequenceOutput = cache.cached_seq_output.get_object( - ) - seq_output.parent_seq_id = seq_ids[parent_id] - seq_output.output_token = next_token_id - - if any_logprobs_are_requested: - seq_output.logprobs = group_sample_logprobs[tdx] - else: - logprobs = next(iter(seq_output.logprobs.values())) - seq_output.logprobs.clear() - - logprobs.logprob = float('inf') - logprobs.rank = None - logprobs.decoded_token = None - - seq_output.logprobs[next_token_id] = logprobs - - seq_outputs.append(seq_output) - - else: - seq_outputs.append( - 
SequenceOutput(seq_ids[parent_id], next_token_id, - (group_sample_logprobs[tdx] - if any_logprobs_are_requested else { - next_token_id: - Logprob(logprob=float('inf'), - rank=None, - decoded_token=None) - }))) - if cache is not None: - completion_seq_group_output.prompt_logprobs = \ - group_prompt_logprobs if any_logprobs_are_requested else None - output.outputs.append(completion_seq_group_output) - else: - output.outputs.append( - CompletionSequenceGroupOutput( - seq_outputs, (group_prompt_logprobs - if any_logprobs_are_requested else None))) - - assert len(output.outputs) > 0 diff --git a/vllm_ascend/worker/multi_step_worker.py b/vllm_ascend/worker/multi_step_worker.py deleted file mode 100644 index 6d092805d5..0000000000 --- a/vllm_ascend/worker/multi_step_worker.py +++ /dev/null @@ -1,194 +0,0 @@ -import dataclasses -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple - -import torch -from vllm.distributed import broadcast_tensor_dict, get_pp_group -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.worker.model_runner_base import BroadcastableModelInput -from vllm.worker.multi_step_model_runner import StatefulModelInput - -from vllm_ascend.worker.multi_step_runner import MultiStepModelNPURunner -from vllm_ascend.worker.worker import NPUWorker, WorkerInput - - -@dataclass -class MultiStepState: - worker_input: WorkerInput - model_input: StatefulModelInput - - -class MultiStepWorker(NPUWorker): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - base_model_runner = self.model_runner - # for multi-step model, wrap the model runner with MultiStepModelRunner - self.model_runner = MultiStepModelNPURunner( - base_model_runner, - vllm_config=base_model_runner.vllm_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=base_model_runner.is_driver_worker, - ) - - pipeline_parallel_size = self.parallel_config.pipeline_parallel_size - self.multi_step_states: List[ - Optional[MultiStepState]] = [None] * pipeline_parallel_size - self.temp_output = None - - def _get_driver_input_and_broadcast( - self, execute_model_req: ExecuteModelRequest - ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: - """ - Get the driver input and broadcast it to other workers. - """ - assert self.is_driver_worker - virtual_engine = execute_model_req.virtual_engine - is_first_multi_step = execute_model_req.is_first_multi_step - if is_first_multi_step: - # on first step we prepare the worker input and model input normally - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: StatefulModelInput = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list, - execute_model_req.virtual_engine, - execute_model_req.finished_requests_ids)) - - if execute_model_req.async_callback: - model_input.frozen_model_input = dataclasses.replace( # type: ignore - model_input.frozen_model_input, - async_callback=execute_model_req.async_callback) - else: - # on subsequent steps we reuse the worker input and model input - multi_step_state = self.multi_step_states[virtual_engine] - worker_input = multi_step_state.worker_input - model_input = multi_step_state.model_input - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - assert frozen_model_input.attn_metadata is not None - # clear the cached metadata so that it can be recomputed on - # the workers. 
- frozen_model_input.attn_metadata._cached_prefill_metadata = None
- frozen_model_input.attn_metadata._cached_decode_metadata = None
-
- model_input.is_first_multi_step = is_first_multi_step
- model_input.is_last_step = execute_model_req.is_last_step
-
- if not is_first_multi_step:
- # we broadcast the last sampled token ids to all TP workers so they
- # can update their model input metadata in-place.
- self._prepare_last_sampled_token_ids_for_tp_workers(
- execute_model_req=execute_model_req, model_input=model_input)
-
- if self.do_metadata_broadcast:
- broadcast_data = worker_input.as_broadcastable_tensor_dict()
- broadcast_data.update(model_input.as_broadcastable_tensor_dict())
- broadcast_tensor_dict(broadcast_data, src=0)
-
- # Returning an empty dict here to keep this compatible with
- # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast`
- return model_input, worker_input, {}
-
- def _prepare_last_sampled_token_ids_for_tp_workers(
- self,
- execute_model_req: ExecuteModelRequest,
- model_input: StatefulModelInput,
- ) -> None:
- """
- Prepare the last sampled token ids for TP workers. If it's the last
- PP rank, then the last sampled token ids are already in the model_input.
- If it is NOT the last PP rank, then we need to get the last sampled
- token that is cached in the execute_model_req.
- """
- if get_pp_group().is_last_rank:
- assert model_input.cached_outputs[
- -1].sampler_output.sampled_token_ids is None
- assert model_input.cached_outputs[-1].sampled_token_ids is not None
- model_input.last_sampled_token_ids = model_input.cached_outputs[
- -1].sampled_token_ids
- # free sampled token ids from the previous step if it has been
- # pythonized. Cannot free the last sampled token ids because
- # we need it for GPU advance_step.
- for output in model_input.cached_outputs[:-1]:
- if output.pythonized:
- output.sampled_token_ids = None
- else:
- # otherwise we need to get the cached sampled token ids from the
- # execute_model_req
- assert execute_model_req.last_sampled_token_ids is not None
- model_input.last_sampled_token_ids = (
- execute_model_req.last_sampled_token_ids.npu())
- model_input.add_sampler_output(
- SamplerOutput(outputs=[], sampled_token_ids=None),
- model_input.last_sampled_token_ids)
-
- # free sampled token ids from the previous step.
- # TODO(will) we could reuse the sampled token ids tensor from
- # the previous step instead.
- for output in model_input.cached_outputs[:-1]:
- output.sampled_token_ids = None
- assert model_input.cached_outputs[-1].sampled_token_ids is not None
-
- def prepare_input(
- self,
- execute_model_req: Optional[ExecuteModelRequest] = None,
- ) -> Optional[Tuple[StatefulModelInput, WorkerInput, Dict[str,
- torch.Tensor]]]:
- """
- Depending on the current state of the request and the multi-step worker,
- this method may skip the normal _prepare_model_input and
- _prepare_worker_input methods and instead use cached values.
- """
- if self.is_driver_worker:
- if execute_model_req is None:
- if self.do_metadata_broadcast:
- # This signals that there are no more requests to process for
- # now. All workers are running an infinite loop with
- # broadcast_tensor_dict, and it stops the loop when the
- # driver broadcasts an empty input. Send an empty input to
- # notify all other workers to stop their execution loop.
- broadcast_tensor_dict({}, src=0) - return None - - virtual_engine = execute_model_req.virtual_engine - (model_input, worker_input, - kwargs) = self._get_driver_input_and_broadcast(execute_model_req) - assert isinstance(model_input, StatefulModelInput) - if execute_model_req.is_first_multi_step: - # cache the worker input and model input for the next steps - self.multi_step_states[virtual_engine] = MultiStepState( - worker_input=worker_input, model_input=model_input) - # if TP workers - else: - broadcast_data = self._get_worker_input_from_broadcast() - # if the driver has sent an empty input, we should stop the worker - # loop - if broadcast_data is None: - return None - model_input, worker_input, kwargs = broadcast_data - assert isinstance(model_input, StatefulModelInput) - virtual_engine = worker_input.virtual_engine - if model_input.is_first_multi_step: - pass - # TODO(will) Can cache the worker input and model input for the - # next steps. See below for details - else: - # TODO(will) possible to also cache and reuse the cached worker - # input and model input. The idea is essentially the delta - # optimization for model_inputs. Where the TP workers can cache - # the model input states and we only broadcast the delta need - # for the next step (sampled_token_ids from the previous step) - - assert isinstance(model_input, StatefulModelInput) - # we need to update the last sampled token ids in the model - # input for the workers so that they can run inplace - # advance_step - model_input.add_sampler_output( - SamplerOutput(outputs=[], sampled_token_ids=None), - model_input.last_sampled_token_ids) - - assert model_input is not None - assert worker_input is not None - return model_input, worker_input, kwargs diff --git a/vllm_ascend/worker/pooling_model_runner.py b/vllm_ascend/worker/pooling_model_runner.py deleted file mode 100644 index e1262fb0a2..0000000000 --- a/vllm_ascend/worker/pooling_model_runner.py +++ /dev/null @@ -1,186 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# Adapted from vllm-project/vllm/vllm/worker/worker.py -# -import dataclasses -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch -from vllm.distributed import get_pp_group -from vllm.forward_context import set_forward_context -from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.multimodal import MultiModalKwargs -from vllm.pooling_params import PoolingParams -from vllm.sequence import (IntermediateTensors, SequenceData, - SequenceGroupMetadata) - -from vllm_ascend.worker.model_runner import (ModelInputForNPU, - ModelInputForNPUBuilder, - NPUModelRunnerBase) - - -@dataclasses.dataclass(frozen=True) -class ModelInputForNPUWithPoolingMetadata(ModelInputForNPU): - """ - Used by the PoolingModelRunner. 
- """ - pooling_metadata: Optional["PoolingMetadata"] = None - - -class NPUPoolingModelRunner( - NPUModelRunnerBase[ModelInputForNPUWithPoolingMetadata]): - - _model_input_cls: Type[ModelInputForNPUWithPoolingMetadata] = ( - ModelInputForNPUWithPoolingMetadata) - _builder_cls: Type[ModelInputForNPUBuilder] = ModelInputForNPUBuilder - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, - Any]) -> ModelInputForNPUWithPoolingMetadata: - return ModelInputForNPUWithPoolingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - - def prepare_model_input( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForNPUWithPoolingMetadata: - assert seq_group_metadata_list is not None - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - # Prepare PoolingMetadata. - assert model_input.seq_lens is not None - pooling_metadata = self._prepare_pooling(seq_group_metadata_list, - model_input.seq_lens) - - return dataclasses.replace(model_input, - pooling_metadata=pooling_metadata) - - def _prepare_pooling( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - prompt_lens: List[int], - ) -> PoolingMetadata: - """Prepare PoolingMetadata for the sequence group metadata list.""" - seq_groups: List[Tuple[List[int], PoolingParams]] = [] - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - seq_ids = list(seq_group_metadata.seq_data.keys()) - pooling_params = seq_group_metadata.pooling_params - seq_groups.append((seq_ids, pooling_params)) - - seq_data: Dict[int, SequenceData] = {} - for seq_group_metadata in seq_group_metadata_list: - seq_data.update(seq_group_metadata.seq_data) - - pooling_metadata = PoolingMetadata( - seq_groups=seq_groups, - seq_data=seq_data, - prompt_lens=prompt_lens, - ) - - return pooling_metadata - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNPUWithPoolingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ): - if num_steps > 1: - raise ValueError( - "PoolingModelRunner does not support multi-step execution.") - - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - if self.prompt_adapter_config: - assert model_input.prompt_adapter_requests is not None - assert model_input.prompt_adapter_mapping is not None - self.set_active_prompt_adapters( - model_input.prompt_adapter_requests, - model_input.prompt_adapter_mapping) - - assert model_input.attn_metadata is not None - virtual_engine = model_input.virtual_engine - model_executable = self.model - - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - seqlen_agnostic_kwargs = { - "finished_requests_ids": model_input.finished_requests_ids, - "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_inner_state else {} - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_start = torch.npu.Event(enable_timing=True) - model_forward_end = torch.npu.Event(enable_timing=True) - model_forward_start.record() - - cross_enc_kwargs = {} - if model_input.token_types is not None: - cross_enc_kwargs["token_type_ids"] = model_input.token_types - - with 
set_forward_context(model_input.attn_metadata, self.vllm_config, - virtual_engine): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device), - **cross_enc_kwargs, - **seqlen_agnostic_kwargs) - - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.record() - - # Only perform pooling in the last pipeline stage. - if not get_pp_group().is_last_rank: - if (self.is_driver_worker - and hidden_or_intermediate_states is not None - and isinstance(hidden_or_intermediate_states, - IntermediateTensors) - and self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - hidden_or_intermediate_states.tensors["model_forward_time"] = ( - torch.tensor(model_forward_time + orig_model_forward_time)) - return hidden_or_intermediate_states - - # Only perform pooling in the driver worker. - if not self.is_driver_worker: - return [] - - return [ - self.model.pooler(hidden_states=hidden_or_intermediate_states, - pooling_metadata=model_input.pooling_metadata) - ] diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py deleted file mode 100644 index bffc6a8de8..0000000000 --- a/vllm_ascend/worker/worker.py +++ /dev/null @@ -1,579 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# Adapted from vllm-project/vllm/vllm/worker/worker.py -# - -import gc -import os -from typing import Dict, List, Optional, Set, Tuple, Type, Union - -import msgpack # type: ignore -import torch -import torch.distributed -import zmq -from torch import nn -from vllm import envs -from vllm.config import VllmConfig, set_current_vllm_config -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment, - set_custom_all_reduce) -from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized -from vllm.logger import logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, - SequenceGroupMetadata, SequenceGroupMetadataDelta) -from vllm.utils import GiB_bytes, bind_kv_cache, get_ip -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner -from vllm.worker.model_runner_base import ModelRunnerBase -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, - WorkerInput) - -from vllm_ascend.ascend_config import init_ascend_config -from vllm_ascend.device_allocator.camem import CaMemAllocator -from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel -from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, - is_310p, try_register_lib) -from vllm_ascend.worker.model_runner import NPUModelRunner -from vllm_ascend.worker.pooling_model_runner import NPUPoolingModelRunner - - -class NPUWorker(LocalOrDistributedWorkerBase): - """A worker class that executes (a partition of) the model on a NPU. - Each worker is associated with a single NPU. The worker is responsible for - maintaining the KV cache and executing the model on the NPU. In case of - distributed inference, each worker is assigned a partition of the model. - """ - - def __init__(self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool = False, - model_runner_cls: Optional[Type[ModelRunnerBase]] = None): - # register patch for vllm - from vllm_ascend.utils import adapt_patch - adapt_patch() - # Register ops when worker init. - from vllm_ascend import ops # noqa: F401 - - # init ascend config - init_ascend_config(vllm_config) - - WorkerBase.__init__(self, vllm_config=vllm_config) - # Try to import mindie_turbo to accelerate vLLM inference. - try_register_lib( - "mindie_turbo", - "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo." 
- ) - # distribute related config - self.parallel_config.rank = rank - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() - - # Return hidden states from target model if the draft model is an - # mlp_speculator - speculative_config = self.speculative_config - model_config = self.model_config - speculative_args = {} if speculative_config is None \ - or (speculative_config.draft_model_config.hf_config.model_type == - model_config.hf_config.model_type) \ - or (speculative_config.draft_model_config.hf_config.model_type - not in ["medusa", "mlp_speculator", "eagle", "deepseek_mtp"]) \ - else {"return_hidden_states": True} - - ModelRunnerClass: Type[ModelRunnerBase] = NPUModelRunner - if model_config.runner_type == "pooling": - ModelRunnerClass = NPUPoolingModelRunner - elif self.model_config.is_encoder_decoder: - ModelRunnerClass = EncoderDecoderModelRunner - self.model_runner: ModelRunnerBase = ModelRunnerClass( - vllm_config=self.vllm_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker, - **speculative_args, - ) - if model_runner_cls is not None: - self.model_runner = model_runner_cls(self.model_runner) - - # Uninitialized cache engine. Will be initialized by - # initialize_cache. - self.cache_engine: List[CacheEngine] - # Initialize gpu_cache as embedding models don't initialize kv_caches - self.gpu_cache: Optional[List[List[torch.Tensor]]] = None - self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} - - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace - if envs.VLLM_TORCH_PROFILER_DIR: - # lazy import so that torch_npu is not required for normal use. - import torch_npu - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR - logger.info("Profiling enabled. 
Traces will be saved to: %s", - torch_profiler_trace_dir) - - experimental_config = torch_npu.profiler._ExperimentalConfig( - export_type=torch_npu.profiler.ExportType.Text, - profiler_level=torch_npu.profiler.ProfilerLevel.Level1, - msprof_tx=False, - aic_metrics=torch_npu.profiler.AiCMetrics.AiCoreNone, - l2_cache=False, - op_attr=False, - data_simplification=False, - record_op_args=False, - gc_detect_threshold=None, - ) - - self.profiler = torch_npu.profiler.profile( - activities=[ - torch_npu.profiler.ProfilerActivity.CPU, - torch_npu.profiler.ProfilerActivity.NPU, - ], - with_stack=False, - profile_memory=False, - with_modules=False, - experimental_config=experimental_config, - on_trace_ready=torch_npu.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir)) - else: - self.profiler = None - - self.enable_dummy_run = False - if os.getenv("VLLM_DP_PROXY_IP", None): - logger.warning("enable dummy run for the DP") - self.enable_dummy_run = True - # dp_rank = os.environ["VLLM_DP_RANK"] - dp_master_ip = os.environ["VLLM_DP_PROXY_IP"] - dp_proxy_listener_port = os.environ["VLLM_DP_PROXY_PORT"] - dp_proxy_monitor_port = os.environ["VLLM_DP_MONITOR_PORT"] - dp_proxy_listener_addr = f"{dp_master_ip}:{dp_proxy_listener_port}" - self.dp_proxy_monitor_addr = f"{dp_master_ip}:{dp_proxy_monitor_port}" - http_ip = get_ip() - port = os.environ["VLLM_HTTP_PORT"] - self.http_addr = f"{http_ip}:{port}" - context = zmq.Context() # type: ignore - sock = context.socket(zmq.DEALER) # type: ignore - - logger.debug("ping dp proxy start, DP_RANK:%s", 0) - # logger.debug("ping dp proxy start, DP_RANK:%s", dp_rank) - - sock.connect(f"tcp://{dp_proxy_listener_addr}") - data = {"type": "DP", "http_address": self.http_addr} - for _ in range(10): - sock.send(msgpack.dumps(data)) - - self.notify_socket = context.socket(zmq.PUSH) # type: ignore - self.notify_socket.connect(f"tcp://{self.dp_proxy_monitor_addr}") - - def sleep(self, level: int = 1) -> None: - NPUPlatform.set_device(self.device) - free_bytes_before_sleep = NPUPlatform.mem_get_info()[0] - allocator = CaMemAllocator.get_instance() - allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) - free_bytes_after_sleep, total = NPUPlatform.mem_get_info() - freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep - used_bytes = total - free_bytes_after_sleep - assert freed_bytes >= 0, "Memory usage increased after sleeping." - logger.info( - "Sleep mode freed %.2f GiB memory, " - "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes, - used_bytes / GiB_bytes) - - def wake_up(self, tags: Optional[list[str]] = None) -> None: - allocator = CaMemAllocator.get_instance() - allocator.wake_up(tags=tags) - - def init_device(self) -> None: - if self.device_config.device.type == "npu": - self.device = torch.device(f"npu:{self.local_rank}") - NPUPlatform.set_device(self.device) - NPUPlatform.empty_cache() - self.init_npu_memory = NPUPlatform.mem_get_info()[0] - else: - raise RuntimeError( - f"Not support device type: {self.device_config.device}") - # Initialize the distributed environment. - self._init_worker_distributed_environment(self.vllm_config, self.rank, - self.distributed_init_method, - self.local_rank) - # Set random seed. 
- set_random_seed(self.model_config.seed)
-
- def load_model(self):
- if self.vllm_config.model_config.enable_sleep_mode:
- allocator = CaMemAllocator.get_instance()
- assert allocator.get_current_usage() == 0, (
- "Sleep mode can only be "
- "used for one instance per process.")
- context = allocator.use_memory_pool(tag="weights")
- else:
- from contextlib import nullcontext
- context = nullcontext() # type: ignore
- with context:
- self.model_runner.load_model()
-
- def start_profile(self):
- if self.profiler is None:
- raise RuntimeError("Profiler is not enabled.")
- self.profiler.start()
-
- def stop_profile(self):
- if self.profiler is None:
- raise RuntimeError("Profiler is not enabled.")
- self.profiler.stop()
-
- def save_sharded_state(
- self,
- path: str,
- pattern: Optional[str] = None,
- max_size: Optional[int] = None,
- ) -> None:
- self.model_runner.save_sharded_state(
- path,
- pattern=pattern,
- max_size=max_size,
- )
-
- def save_tensorized_model(
- self,
- tensorizer_config: TensorizerConfig,
- ) -> None:
- self.model_runner.save_tensorized_model(
- tensorizer_config=tensorizer_config, )
-
- @NPUPlatform.inference_mode()
- def determine_num_available_blocks(self) -> Tuple[int, int]:
- """Profiles the peak memory usage of the model to determine how many
- KV blocks may be allocated without OOMs.
- The engine first profiles the existing memory usage.
- Then, it calculates the maximum possible number of NPU and CPU blocks
- that can be allocated with the remaining free memory.
- .. tip::
- You may limit the usage of NPU memory
- by adjusting the `gpu_memory_utilization` parameter.
- """
- # Profile the memory usage of the model and get the maximum number of
- # cache blocks that can be allocated with the remaining free memory.
- NPUPlatform.empty_cache()
-
- # Execute a forward pass with dummy inputs to profile the memory usage
- # of the model.
- self.model_runner.profile_run()
-
- # Calculate the number of blocks that can be allocated with the
- # profiled peak memory.
- free_npu_memory, total_npu_memory = NPUPlatform.mem_get_info()
- # NOTE(woosuk): Here we assume that the other processes using the same
- # GPU did not change their memory usage during the profiling.
- peak_memory = self.init_npu_memory - free_npu_memory
- assert peak_memory > 0, (
- "Error in memory profiling. "
- f"Initial free memory {self.init_npu_memory}, current free memory"
- f" {free_npu_memory}. This happens when the NPU memory was "
- "not properly cleaned up before initializing the vLLM instance.")
-
- cache_block_size = self.get_cache_block_size_bytes()
- num_npu_blocks = int(
- (total_npu_memory * self.cache_config.gpu_memory_utilization -
- peak_memory) // cache_block_size)
- num_cpu_blocks = int(self.cache_config.swap_space_bytes //
- cache_block_size)
- num_npu_blocks = max(num_npu_blocks, 0)
- num_cpu_blocks = max(num_cpu_blocks, 0)
- gc.collect()
- # TODO: this function is no longer needed once empty_cache in
- # Worker.determine_num_available_blocks() is unified
- NPUPlatform.empty_cache()
- return num_npu_blocks, num_cpu_blocks
-
- def initialize_cache(self, num_gpu_blocks: int,
- num_cpu_blocks: int) -> None:
- """Allocate NPU and CPU KV cache with the specified number of blocks.
- """ - raise_if_cache_size_invalid(num_gpu_blocks, - self.cache_config.block_size, - self.cache_config.is_attention_free, - self.model_config.max_model_len) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - if self.vllm_config.model_config.enable_sleep_mode: - allocator = CaMemAllocator.get_instance() - context = allocator.use_memory_pool(tag="kv_cache") - else: - from contextlib import nullcontext - context = nullcontext() # type: ignore - with context: - with set_current_vllm_config(self.vllm_config): - self._init_cache_engine() - self._warm_up_model() - - def _init_cache_engine(self): - assert self.cache_config.num_gpu_blocks is not None - self.cache_engine = [ - CacheEngine(self.cache_config, self.model_config, - self.parallel_config, self.device_config) - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - import torch_npu - acl_format = ACL_FORMAT_FRACTAL_NZ if is_310p( - ) else ACL_FORMAT_FRACTAL_ND - for ve in range(self.parallel_config.pipeline_parallel_size): - num_layers = len(self.cache_engine[ve].gpu_cache) - for i in range(num_layers): - if torch.is_tensor(self.cache_engine[ve].gpu_cache[i]): - self.cache_engine[ve].gpu_cache[ - i] = torch_npu.npu_format_cast( - self.cache_engine[ve].gpu_cache[i], acl_format) - else: - self.cache_engine[ve].gpu_cache[i][ - 0] = torch_npu.npu_format_cast( - self.cache_engine[ve].gpu_cache[i][0], acl_format) - self.cache_engine[ve].gpu_cache[i][ - 1] = torch_npu.npu_format_cast( - self.cache_engine[ve].gpu_cache[i][1], acl_format) - self.gpu_cache = [ - self.cache_engine[ve].gpu_cache - for ve in range(self.parallel_config.pipeline_parallel_size) - ] - bind_kv_cache(self.compilation_config.static_forward_context, - self.gpu_cache) - - def _warm_up_model(self) -> None: - # model capture is not supported, thus we just set seed here. - # Reset the seed to ensure that the random state is not affected by - # the model initialization and profiling. - set_random_seed(self.model_config.seed) - - @property - def do_metadata_broadcast(self) -> bool: - return self.parallel_config.tensor_parallel_size > 1 - - @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - return self.gpu_cache - - @torch.inference_mode() - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - virtual_engine = execute_model_req.virtual_engine - num_steps = execute_model_req.num_steps - num_seq_groups = len(execute_model_req.seq_group_metadata_list) - # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. - # they contain parameters to launch cudamemcpyasync. - blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, - device="cpu", - dtype=torch.int64).view(-1, 2) - blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, - device="cpu", - dtype=torch.int64).view(-1, 2) - # `blocks_to_copy` is a gpu tensor. The src and tgt of - # blocks to copy are in the same device, and `blocks_to_copy` - # can be used directly within cuda kernels. 
- blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device=self.device, - dtype=torch.int64).view(-1, 2) - - return WorkerInput( - num_seq_groups=num_seq_groups, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - virtual_engine=virtual_engine, - num_steps=num_steps, - ) - - def get_model(self) -> nn.Module: - return self.model_runner.get_model() - - @torch.inference_mode() - def execute_worker(self, worker_input: WorkerInput) -> None: - if self.enable_dummy_run: - logger.debug( - f"send notify to the dp proxy: {self.dp_proxy_monitor_addr}") - data = {"info": "notify_step", "http_address": self.http_addr} - self.notify_socket.send(msgpack.dumps(data)) - virtual_engine = worker_input.virtual_engine - # Issue cache operations. - if (worker_input.blocks_to_swap_in is not None - and worker_input.blocks_to_swap_in.numel() > 0): - self.cache_engine[virtual_engine].swap_in( - worker_input.blocks_to_swap_in) - if (worker_input.blocks_to_swap_out is not None - and worker_input.blocks_to_swap_out.numel() > 0): - self.cache_engine[virtual_engine].swap_out( - worker_input.blocks_to_swap_out) - if (worker_input.blocks_to_copy is not None - and worker_input.blocks_to_copy.numel() > 0): - self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) - - def _get_cached_seq_group_metadata( - self, - seq_group_metadata_list: List[Union[SequenceGroupMetadata, - SequenceGroupMetadataDelta]], - finished_request_ids: List[str]) -> List[SequenceGroupMetadata]: - """Return a list of cached Sequence Group Metadata after updating its - state. - - It is used because scheduler only sends delta to workers to reduce - the data payload size. The function also cleans up cache based on - a given `finished_request_ids`. - """ - new_seq_group_metadata_list = [] - for metadata_or_delta in seq_group_metadata_list: - request_id = metadata_or_delta.request_id - if request_id not in self._seq_group_metadata_cache: - # The first prefill. - assert isinstance(metadata_or_delta, SequenceGroupMetadata) - self._seq_group_metadata_cache[request_id] = metadata_or_delta - else: - # The first prefill is already cached. - if isinstance(metadata_or_delta, SequenceGroupMetadataDelta): - self._seq_group_metadata_cache[request_id].apply_delta( - metadata_or_delta) - else: - # If metadata snapshot is sent again, it is - # preempted. Reset the cache because we need to start - # from scratch. 
- assert isinstance(metadata_or_delta, SequenceGroupMetadata) - self._seq_group_metadata_cache[ - request_id] = metadata_or_delta - - new_seq_group_metadata_list.append( - self._seq_group_metadata_cache[request_id]) - - # Clean up finished ids - for finished_id in finished_request_ids: - del self._seq_group_metadata_cache[finished_id] - - return new_seq_group_metadata_list - - def _execute_model_spmd( - self, - execute_model_req: ExecuteModelRequest, - intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> Optional[List[SamplerOutput]]: - if execute_model_req is not None: - new_seq_group_metadata_list = self._get_cached_seq_group_metadata( - execute_model_req.seq_group_metadata_list, - execute_model_req.finished_requests_ids) - - execute_model_req.seq_group_metadata_list = ( - new_seq_group_metadata_list) - output = super()._execute_model_spmd(execute_model_req, - intermediate_tensors) - return output - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.model_runner.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.model_runner.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - return self.model_runner.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.model_runner.list_loras() - - def add_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise NotImplementedError( - "Prompt Adapter is not implemented for NPU backend currently.") - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Prompt Adapter is not implemented for NPU backend currently.") - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Prompt Adapter is not implemented for NPU backend currently.") - - def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError( - "Prompt Adapter is not implemented for NPU backend currently.") - - @property - def max_model_len(self) -> int: - return self.model_config.max_model_len - - @property - def vocab_size(self) -> int: - return self.model_runner.vocab_size - - def get_cache_block_size_bytes(self) -> int: - """Get the size of the KV cache block size in bytes. 
- """ - return CacheEngine.get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) - - def _init_worker_distributed_environment( - self, - vllm_config: VllmConfig, - rank: int, - distributed_init_method: Optional[str] = None, - local_rank: int = -1, - backend: str = "hccl") -> None: - """Initialize the distributed environment.""" - parallel_config = self.parallel_config - set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) - init_distributed_environment(parallel_config.world_size, rank, - distributed_init_method, local_rank, - backend) - ensure_model_parallel_initialized( - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) - init_ascend_model_parallel( - parallel_config.expert_parallel_size, - parallel_config.expert_tensor_parallel_size, - parallel_config.world_size_across_dp, - ) - ensure_kv_transfer_initialized(vllm_config) - - -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free, - max_model_len) -> None: - if is_attention_free and num_gpu_blocks != 0: - raise ValueError("No memory should be allocated for the cache blocks " - f"for an attention-free model, but {num_gpu_blocks}" - "blocks are allocated.") - if not is_attention_free and num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = block_size * num_gpu_blocks - if not is_attention_free and max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.")