diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py
index a123790dbd..85557fd1e0 100644
--- a/tests/ut/test_ascend_config.py
+++ b/tests/ut/test_ascend_config.py
@@ -193,71 +193,48 @@ def test_check_ascend_config_pass(self):
     @_clean_up_ascend_config
     def test_check_ascend_config_wrong_case(self):
         test_vllm_config = VllmConfig()
-        # For V0 engine
-        with mock.patch.dict(os.environ, {"VLLM_USE_V1": "0"}):
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, False)
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "ascend_scheduler_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, True)
-        # For V1 engine
-        with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}):
-            # torchair + eager mode
-            with self.assertRaises(RuntimeError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                init_ascend_config(test_vllm_config)
-                enforce_eager = True
-                check_ascend_config(test_vllm_config, enforce_eager)
-            # torchair + non deepseek model
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                model_path = os.path.join(os.path.dirname(__file__),
-                                          "fake_weight")
-                fake_model_config = ModelConfig(model=model_path)
-                fake_model_config.hf_config = PretrainedConfig()
-                fake_model_config.hf_config.model_type = "llama"
-                test_vllm_config.model_config = fake_model_config
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, False)
-            # aclgraph + deepseek model
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": False,
-                    },
-                    "refresh": True
-                }
-                model_path = os.path.join(os.path.dirname(__file__),
-                                          "fake_weight")
-                fake_model_config = ModelConfig(model=model_path)
-                fake_model_config.hf_config = PretrainedConfig()
-                fake_model_config.hf_config.model_type = "deepseek"
-                test_vllm_config.model_config = fake_model_config
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, False)
+
+        # torchair + eager mode
+        with self.assertRaises(RuntimeError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": True,
+                },
+                "refresh": True
+            }
+            init_ascend_config(test_vllm_config)
+            enforce_eager = True
+            check_ascend_config(test_vllm_config, enforce_eager)
+        # torchair + non deepseek model
+        with self.assertRaises(NotImplementedError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": True,
+                },
+                "refresh": True
+            }
+            model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
+            fake_model_config = ModelConfig(model=model_path)
+            fake_model_config.hf_config = PretrainedConfig()
+            fake_model_config.hf_config.model_type = "llama"
+            test_vllm_config.model_config = fake_model_config
+            init_ascend_config(test_vllm_config)
+            check_ascend_config(test_vllm_config, False)
+        # aclgraph + deepseek model
+        with self.assertRaises(NotImplementedError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": False,
+                },
+                "refresh": True
+            }
+            model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
+            fake_model_config = ModelConfig(model=model_path)
+            fake_model_config.hf_config = PretrainedConfig()
+            fake_model_config.hf_config.model_type = "deepseek"
+            test_vllm_config.model_config = fake_model_config
+            init_ascend_config(test_vllm_config)
+            check_ascend_config(test_vllm_config, False)
 
     def test_check_torchair_supported(self):
         test_cases = [('deepseek_v3', True), ('PanguProMoE', True),
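Note on the surviving test body: with the V0 cases gone, test_check_ascend_config_wrong_case asserts exactly three invalid combinations (torchair + eager, torchair + non-deepseek model, aclgraph + deepseek model). A minimal standalone sketch of the first combination, assuming VllmConfig, init_ascend_config, and check_ascend_config are importable the same way this test module imports them:

    # Sketch only: reproduces the torchair + eager conflict outside the test class.
    from vllm.config import VllmConfig

    from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config

    vllm_config = VllmConfig()
    vllm_config.additional_config = {
        "torchair_graph_config": {
            "enabled": True,  # request NPU graph mode...
        },
        "refresh": True,
    }
    init_ascend_config(vllm_config)
    # ...while also forcing eager execution, which now raises RuntimeError.
    check_ascend_config(vllm_config, enforce_eager=True)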
diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py
index c09964a745..fd4e99980a 100644
--- a/tests/ut/test_platform.py
+++ b/tests/ut/test_platform.py
@@ -389,69 +389,6 @@ def test_check_and_update_config_v1_worker_class_selection(
             "vllm_ascend.worker.worker_v1.NPUWorker",
         )
 
-    @patch("vllm_ascend.ascend_config.check_ascend_config")
-    @patch("vllm_ascend.ascend_config.init_ascend_config")
-    @patch("vllm.envs.VLLM_USE_V1", False)
-    def test_check_and_update_config_speculative_worker_config(
-            self, mock_init_ascend, mock_check_ascend):
-        mock_init_ascend.return_value = self.mock_ascend_config
-        self.mock_vllm_config.speculative_config = MagicMock()
-        self.mock_vllm_config.speculative_config.disable_logprobs = True
-        self.mock_vllm_config.parallel_config.worker_cls = "auto"
-
-        with patch.dict("os.environ", {}):
-            from vllm_ascend import platform
-
-            importlib.reload(platform)
-            self.platform.check_and_update_config(self.mock_vllm_config)
-            import os
-
-            self.assertEqual(os.environ.get("ACL_OP_INIT_MODE"), "1")
-            self.assertEqual(
-                self.mock_vllm_config.parallel_config.worker_cls,
-                "vllm.spec_decode.spec_decode_worker.create_spec_worker",
-            )
-            self.assertEqual(
-                self.mock_vllm_config.parallel_config.sd_worker_cls,
-                "vllm_ascend.worker.worker.NPUWorker",
-            )
-
-    @patch("vllm_ascend.ascend_config.check_ascend_config")
-    @patch("vllm_ascend.ascend_config.init_ascend_config")
-    @patch("vllm.envs.VLLM_USE_V1", False)
-    def test_check_and_update_config_multi_step_worker_config(
-            self, mock_init_ascend, mock_check_ascend):
-        mock_init_ascend.return_value = self.mock_ascend_config
-        self.mock_vllm_config.scheduler_config.is_multi_step = True
-        self.mock_vllm_config.parallel_config.worker_cls = "auto"
-
-        from vllm_ascend import platform
-
-        importlib.reload(platform)
-        self.platform.check_and_update_config(self.mock_vllm_config)
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.worker_cls,
-            "vllm_ascend.worker.multi_step_worker.MultiStepWorker",
-        )
-
-    @patch("vllm_ascend.ascend_config.check_ascend_config")
-    @patch("vllm_ascend.ascend_config.init_ascend_config")
-    @patch("vllm.envs.VLLM_USE_V1", False)
-    def test_check_and_update_config_default_worker_config(
-            self, mock_init_ascend, mock_check_ascend):
-        mock_init_ascend.return_value = self.mock_ascend_config
-        self.mock_vllm_config.parallel_config.worker_cls = "auto"
-        self.mock_vllm_config.scheduler_config.is_multi_step = False
-
-        from vllm_ascend import platform
-
-        importlib.reload(platform)
-        self.platform.check_and_update_config(self.mock_vllm_config)
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.worker_cls,
-            "vllm_ascend.worker.worker.NPUWorker",
-        )
-
     @patch("vllm_ascend.ascend_config.check_ascend_config")
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch("vllm_ascend.utils.is_310p", return_value=True)
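The three deleted tests all pinned V0-only worker wiring (the spec-decode worker, the multi-step worker, and the V0 NPUWorker default). After this change the only selection the platform performs when worker_cls is "auto" is the one the surviving test_check_and_update_config_v1_worker_class_selection still covers, condensed here from the platform.py hunk further below:

    # With the V0 branches gone, "auto" always resolves to the V1 worker.
    if parallel_config and parallel_config.worker_cls == "auto":
        parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"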
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index 82ac32acac..50b0e83618 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 from typing import Optional
 
-import vllm.envs as envs
 from vllm.logger import logger
 
 TORCHAIR_MODEL_LIST = ["deepseek", "pangu"]
@@ -126,46 +125,36 @@ def get_ascend_config():
 
 def check_ascend_config(vllm_config, enforce_eager):
     ascend_config = get_ascend_config()
 
-    # for v0 engine
-    if not envs.VLLM_USE_V1:
+    # for eager mode
+    if enforce_eager:
+        # torchair_graph cannot be enabled with eager mode.
         if ascend_config.torchair_graph_config.enabled:
-            raise NotImplementedError(
-                "Torchair graph mode is only supported for V1 Engine.")
-        if ascend_config.ascend_scheduler_config.enabled:
-            raise NotImplementedError(
-                "Ascend scheduler is only supported for V1 Engine.")
-    # for v1 engine
+            raise RuntimeError(
+                "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
+            )
+    # for graph mode
     else:
-        # for eager mode
-        if enforce_eager:
-            # torchair_graph cannot be enabled with eager mode.
-            if ascend_config.torchair_graph_config.enabled:
-                raise RuntimeError(
-                    "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
-                )
-        # for graph mode
+        # torchair_graph case
+        if ascend_config.torchair_graph_config.enabled:
+            # torchair_graph is supported for deepseek/pangu model only.
+            if vllm_config.model_config:
+                model_type = vllm_config.model_config.hf_config.model_type
+                if not _check_torchair_supported(model_type):
+                    raise NotImplementedError(
+                        "Torchair graph mode only works with the following model types:"
+                        f"{TORCHAIR_MODEL_LIST}.")
+        # aclgraph case
         else:
-            # torchair_graph case
-            if ascend_config.torchair_graph_config.enabled:
-                # torchair_graph is supported for deepseek/pangu model only.
-                if vllm_config.model_config:
-                    model_type = vllm_config.model_config.hf_config.model_type
-                    if not _check_torchair_supported(model_type):
-                        raise NotImplementedError(
-                            "Torchair graph mode only works with following model types:"
-                            f"{TORCHAIR_MODEL_LIST}.")
-            # aclgraph case
-            else:
-                # aclgraph doesn't work with deepseek model and only qwen model is well tested.
-                if vllm_config.model_config:
-                    model_type = vllm_config.model_config.hf_config.model_type
-                    if "deepseek" in model_type:
-                        raise NotImplementedError(
-                            "ACL Graph does not support deepseek. Please "
-                            "try torchair graph mode to serve deepseek models on vllm-ascend."
-                            " Or set `enforce_eager=True` to use eager mode.")
-                    if "qwen" not in model_type:
-                        logger.warning(
-                            "ACL Graph is currently experimental. Please "
-                            "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
-                            " if you encourage any Error")
+            # aclgraph doesn't work with deepseek model and only qwen model is well tested.
+            if vllm_config.model_config:
+                model_type = vllm_config.model_config.hf_config.model_type
+                if "deepseek" in model_type:
+                    raise NotImplementedError(
+                        "ACL Graph does not support deepseek. Please "
+                        "try torchair graph mode to serve deepseek models on vllm-ascend."
+                        " Or set `enforce_eager=True` to use eager mode.")
+                if "qwen" not in model_type:
+                    logger.warning(
+                        "ACL Graph is currently experimental. Please "
+                        "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
+                        " if you encounter any Error")
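With the envs.VLLM_USE_V1 guard gone, the whole check reduces to one decision tree keyed on enforce_eager, the torchair flag, and the model type. A behavioral sketch, not the patch text; supports_torchair stands in for the module's _check_torchair_supported helper and is assumed to do a case-insensitive substring match against TORCHAIR_MODEL_LIST:

    TORCHAIR_MODEL_LIST = ["deepseek", "pangu"]

    def supports_torchair(model_type: str) -> bool:
        # Assumed semantics of _check_torchair_supported.
        return any(name in model_type.lower() for name in TORCHAIR_MODEL_LIST)

    def check(enforce_eager: bool, torchair_enabled: bool, model_type: str) -> None:
        if enforce_eager:
            if torchair_enabled:
                raise RuntimeError("graph mode and eager mode conflict")
            return  # plain eager mode is always accepted
        if torchair_enabled:
            if not supports_torchair(model_type):
                raise NotImplementedError("torchair: deepseek/pangu only")
        elif "deepseek" in model_type:
            raise NotImplementedError("aclgraph does not support deepseek")
        elif "qwen" not in model_type:
            # the patch logs an "experimental, please report issues" warning here
            pass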
Please " + "raise an issue on https://github.com/vllm-project/vllm-ascend/issues" + " if you encourage any Error") diff --git a/vllm_ascend/models/deepseek_dbo.py b/vllm_ascend/models/deepseek_dbo.py index 679bbc2c0e..3d7778513f 100644 --- a/vllm_ascend/models/deepseek_dbo.py +++ b/vllm_ascend/models/deepseek_dbo.py @@ -30,7 +30,6 @@ import torch import torch.distributed as dist import torch_npu # noqa: F401 -import vllm.envs as envs from torch import nn from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata @@ -397,20 +396,17 @@ def forward( hidden_states_or_q_c = hidden_states if self.torchair_graph_enabled: forward_kwargs = {} - if envs.VLLM_USE_V1: - output_shape = hidden_states.shape - output = torch.empty(output_shape, - dtype=hidden_states_or_q_c.dtype, - device=hidden_states_or_q_c.device) - forward_kwargs['output'] = output - + output_shape = hidden_states.shape + output = torch.empty(output_shape, + dtype=hidden_states_or_q_c.dtype, + device=hidden_states_or_q_c.device) + forward_kwargs['output'] = output output = self.mla_attn.impl.forward(self.mla_attn, hidden_states_or_q_c, hidden_states, None, kv_cache, attn_metadata, **forward_kwargs) - if envs.VLLM_USE_V1: - output = output.view(-1, output_shape[-1]) + output = output.view(-1, output_shape[-1]) return output else: kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split( @@ -885,7 +881,7 @@ def forward( def can_run_ms(self): attn_metadata = get_forward_context().attn_metadata # support mla attention and V1 engine at present - if not self.use_mla or not envs.VLLM_USE_V1: + if not self.use_mla: return False # enable prefill overlap if attn_metadata is None or attn_metadata.num_prefills == 0: diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index 1de4e90ba9..bfa86f0ee2 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -29,7 +29,6 @@ import torch import torch_npu -import vllm.envs as envs from torch import nn from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata @@ -579,20 +578,17 @@ def forward( else: hidden_states_or_q_c = hidden_states if self.torchair_graph_enabled: - if envs.VLLM_USE_V1: - output_shape = hidden_states.shape - output = torch.empty(output_shape, - dtype=hidden_states_or_q_c.dtype, - device=hidden_states_or_q_c.device) - forward_kwargs['output'] = output - + output_shape = hidden_states.shape + output = torch.empty(output_shape, + dtype=hidden_states_or_q_c.dtype, + device=hidden_states_or_q_c.device) + forward_kwargs['output'] = output output = self.mla_attn.impl.forward(self.mla_attn, hidden_states_or_q_c, hidden_states, None, kv_cache, attn_metadata, **forward_kwargs) - if envs.VLLM_USE_V1: - output = output.view(-1, output_shape[-1]) + output = output.view(-1, output_shape[-1]) return output else: kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split( @@ -660,7 +656,7 @@ def __init__( prefix=f"{prefix}.mlp", ) self.mla_moe_communication = ascend_config.torchair_graph_config.enable_multistream_moe \ - and model_config.use_mla and envs.VLLM_USE_V1 and self.tp_size > 1 + and model_config.use_mla and self.tp_size > 1 else: self.mlp = CustomDeepseekV2MLP( hidden_size=config.hidden_size, diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 07fb07fcb6..fede975d84 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -16,7 +16,6 @@ # import gc -import os from datetime import timedelta from typing import 
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 07fb07fcb6..fede975d84 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -16,7 +16,6 @@
 #
 
 import gc
-import os
 from datetime import timedelta
 from typing import TYPE_CHECKING, Optional, Tuple
 
@@ -117,6 +116,8 @@ def clear_npu_memory(cls):
 
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        if not envs.VLLM_USE_V1:
+            raise ValueError("vLLM Ascend does not support the V0 engine")
         # initialize ascend config from vllm additional_config
         ascend_config = init_ascend_config(vllm_config)
 
@@ -180,18 +181,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             update_aclgraph_sizes(vllm_config)
 
         if parallel_config and parallel_config.worker_cls == "auto":
-            if envs.VLLM_USE_V1:
-                parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
-            elif vllm_config.speculative_config:
-                # NOTE: We set this var to `1` in vllm-ascend to avoid segment
-                # fault when using spec decode with V0 engine.
-                os.environ["ACL_OP_INIT_MODE"] = "1"
-                parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker"
-                parallel_config.sd_worker_cls = "vllm_ascend.worker.worker.NPUWorker"
-            elif vllm_config.scheduler_config.is_multi_step:
-                parallel_config.worker_cls = "vllm_ascend.worker.multi_step_worker.MultiStepWorker"
-            else:
-                parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker"
+            parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
 
         if cache_config:
             if cache_config.block_size is None:
@@ -202,20 +192,18 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 )
                 cache_config.block_size = 128
 
-        if envs.VLLM_USE_V1:
-            # Activate custom ops for v1, except on 310P
-            if not is_310p():
-                compilation_config.custom_ops = ["all"]
-
-            # If ascend_scheduler_config is enabled,
-            # extents original scheduler_config to use AscendScheduler.
-            if ascend_config.ascend_scheduler_config.enabled:
-                from vllm_ascend.core.schedule_config import \
-                    AscendSchedulerConfig
-                ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
-                    vllm_config.scheduler_config,
-                    ascend_config.ascend_scheduler_config)
-                vllm_config.scheduler_config = ascend_scheduler_config
+        # Activate custom ops, except on 310P
+        if not is_310p():
+            compilation_config.custom_ops = ["all"]
+
+        # If ascend_scheduler_config is enabled,
+        # extend the original scheduler_config to use AscendScheduler.
+        if ascend_config.ascend_scheduler_config.enabled:
+            from vllm_ascend.core.schedule_config import AscendSchedulerConfig
+            ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
+                vllm_config.scheduler_config,
+                ascend_config.ascend_scheduler_config)
+            vllm_config.scheduler_config = ascend_scheduler_config
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
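For completeness, the Ascend scheduler path that is now applied unconditionally is driven by the same additional_config keys the tests at the top of this diff exercise. A hedged usage example; the model name is a placeholder, and additional_config is assumed to be the standard vLLM engine argument that init_ascend_config reads:

    from vllm import LLM

    llm = LLM(
        model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model
        additional_config={
            "ascend_scheduler_config": {
                "enabled": True,  # swap in AscendSchedulerConfig at startup
            },
        },
    )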