
[Refactor] Refactor torchair #1661
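This PR moves torchair-specific code into a dedicated `vllm_ascend/torchair` package: the torchair attention backend relocates from `vllm_ascend.attention.attention_v1_torchair` to `vllm_ascend.torchair.attention_torchair`, a separate `NPUTorchairWorker` is selected when torchair graph mode is enabled, the torchair cache helpers and the `npu_stream_switch`/`npu_wait_tensor` utilities move from `vllm_ascend.utils` to `vllm_ascend.torchair.utils`, and the corresponding unit tests move under `tests/ut/torchair/`.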


Closed · wants to merge 1 commit
Empty file added tests/ut/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion tests/ut/test_platform.py
@@ -523,7 +523,7 @@ def test_get_attn_backend_cls_use_v1_and_torchair(self,
)
self.assertEqual(
result,
"vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend"
"vllm_ascend.torchair.attention_torchair.AscendAttentionTorchairBackend"
)

@patch('vllm_ascend.platform.get_ascend_config')
47 changes: 4 additions & 43 deletions tests/ut/test_utils.py
@@ -123,16 +123,13 @@ def test_aligned_16(self):
@mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
new=mock.MagicMock)
@mock.patch('vllm_ascend.utils.is_310p')
@mock.patch('vllm_ascend.utils.get_ascend_config')
def test_maybe_converting_weight_acl_format(self, mock_get_config,
mock_310p, mock_npu_cast,
def test_maybe_converting_weight_acl_format(self, mock_310p, mock_npu_cast,
mock_get_format):
ACL_FORMAT_FRACTAL_NZ = 29
mock_310p.return_value = True

mock_config = mock.MagicMock()
mock_config.torchair_graph_config.enabled = True
mock_get_config.return_value = mock_config
mock_get_format.return_value = 1

mock_npu_cast.return_value = 1
@@ -145,23 +142,21 @@ def test_maybe_converting_weight_acl_format(self, mock_get_config,
model = mock.MagicMock()
model.modules.return_value = [fused_moe]

utils.maybe_converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
self.assertEqual(fused_moe.w13_weight.data, 1)

@mock.patch('torch_npu.get_npu_format')
@mock.patch('torch_npu.npu_format_cast')
@mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
new=mock.MagicMock)
@mock.patch('vllm_ascend.utils.is_310p')
@mock.patch('vllm_ascend.utils.get_ascend_config')
def test_maybe_converting_weight_acl_format_format_true(
self, mock_get_config, mock_310p, mock_npu_cast, mock_get_format):
self, mock_310p, mock_npu_cast, mock_get_format):
ACL_FORMAT_FRACTAL_NZ = 29
mock_310p.return_value = True

mock_config = mock.MagicMock()
mock_config.torchair_graph_config.enabled = True
mock_get_config.return_value = mock_config
mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ

mock_npu_cast.return_value = 1
@@ -176,20 +171,7 @@ def test_maybe_converting_weight_acl_format_format_true(

mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ

utils.maybe_converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)

@mock.patch('vllm_ascend.utils.get_ascend_config')
@mock.patch('vllm_ascend.utils.is_310p', return_value=False)
def test_maybe_converting_weight_acl_format_not_310_not_graph(
self, mock_310p, mock_get_config):
mock_config = mock.MagicMock()
mock_config.torchair_graph_config.enabled = False
mock_get_config.return_value = mock_config

mock_constant = mock.MagicMock()

mock_model = mock.MagicMock()
utils.maybe_converting_weight_acl_format(mock_model, mock_constant)
utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)

@mock.patch('importlib.util.find_spec')
@mock.patch('importlib.import_module')
@@ -280,27 +262,6 @@ def test_update_aclgraph_sizes(self):
3,
len(test_vllm_config.compilation_config.cudagraph_capture_sizes))

def test_get_torchair_current_work_dir(self):
cache_dir = utils.TORCHAIR_CACHE_DIR
work_dir = utils.get_torchair_current_work_dir()
self.assertEqual(cache_dir, work_dir)
work_dir = utils.get_torchair_current_work_dir("test")
self.assertEqual(os.path.join(cache_dir, "test"), work_dir)

def test_torchair_cache_dir(self):
utils.write_kv_cache_bytes_to_file(0, 100)
self.assertTrue(utils.check_torchair_cache_exist(),
"Create torchair cache dir failed")
self.assertTrue(utils.check_kv_cache_bytes_cache_exist(),
"Create kv cache bytes cache dir failed")
kv_cache_bytes = utils.read_kv_cache_bytes_from_file(0)
self.assertEqual(100, kv_cache_bytes)
utils.delete_torchair_cache_file()
self.assertFalse(utils.check_torchair_cache_exist(),
"Delete torchair cache dir failed")
self.assertFalse(utils.check_kv_cache_bytes_cache_exist(),
"Delete kv cache bytes cache dir failed")


class TestProfileExecuteDuration(unittest.TestCase):

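The two tests above show the helper being renamed from `maybe_converting_weight_acl_format` to `converting_weight_acl_format`, with the `get_ascend_config`/torchair-graph gate removed. Below is a minimal sketch of what the renamed helper plausibly does, reconstructed only from what the tests mock (`torch_npu.get_npu_format`, `torch_npu.npu_format_cast`, `FusedMoE`); the `w2_weight` handling is an assumption, not taken from the source.

```python
import torch_npu
from vllm.model_executor.layers.fused_moe.layer import FusedMoE


def converting_weight_acl_format(model, acl_format: int) -> None:
    # Cast FusedMoE expert weights to the requested ACL format
    # (e.g. ACL_FORMAT_FRACTAL_NZ == 29). The second test expects no
    # cast when get_npu_format() already reports the target format.
    for module in model.modules():
        if not isinstance(module, FusedMoE):
            continue
        if torch_npu.get_npu_format(module.w13_weight.data) == acl_format:
            continue
        module.w13_weight.data = torch_npu.npu_format_cast(
            module.w13_weight.data, acl_format)
        # Assumed symmetric handling of the second expert weight.
        module.w2_weight.data = torch_npu.npu_format_cast(
            module.w2_weight.data, acl_format)
```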
Empty file added tests/ut/torchair/__init__.py
Empty file.
28 changes: 28 additions & 0 deletions tests/ut/torchair/test_utils.py
@@ -0,0 +1,28 @@
import os

from tests.ut.base import TestBase
from vllm_ascend.torchair import utils


class TestTorchairUtils(TestBase):

def test_get_torchair_current_work_dir(self):
cache_dir = utils.TORCHAIR_CACHE_DIR
work_dir = utils.get_torchair_current_work_dir()
self.assertEqual(cache_dir, work_dir)
work_dir = utils.get_torchair_current_work_dir("test")
self.assertEqual(os.path.join(cache_dir, "test"), work_dir)

def test_torchair_cache_dir(self):
utils.write_kv_cache_bytes_to_file(0, 100)
self.assertTrue(utils.check_torchair_cache_exist(),
"Create torchair cache dir failed")
self.assertTrue(utils.check_kv_cache_bytes_cache_exist(),
"Create kv cache bytes cache dir failed")
kv_cache_bytes = utils.read_kv_cache_bytes_from_file(0)
self.assertEqual(100, kv_cache_bytes)
utils.delete_torchair_cache_file()
self.assertFalse(utils.check_torchair_cache_exist(),
"Delete torchair cache dir failed")
self.assertFalse(utils.check_kv_cache_bytes_cache_exist(),
"Delete kv cache bytes cache dir failed")
6 changes: 4 additions & 2 deletions tests/ut/worker/test_pooling_model_runner.py
@@ -1,3 +1,4 @@
import os
import unittest
from unittest.mock import MagicMock, patch

@@ -24,9 +25,10 @@ def _create_model_runner(self, model: str, *args,
def setUp(self):
"""Initialize test fixtures and common mocks"""
self.attn_backend = "npu"

model_path = os.path.join(os.path.dirname(__file__), "..",
"fake_weight")
model_runner = self._create_model_runner(
"tests/ut/fake_weight",
model_path,
trust_remote_code=True,
enable_chunked_prefill=False,
)
2 changes: 1 addition & 1 deletion vllm_ascend/attention/mla_v1.py
@@ -21,7 +21,7 @@
from vllm_ascend.multistream.context import get_multistream_comm_context
from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor
from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
from vllm_ascend.worker.npu_input_batch import InputBatch

if TYPE_CHECKING:
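This import move repeats across the files below (`deepseek_v2.py`, `fused_moe.py`, `w8a8_dynamic.py`): `npu_stream_switch` and `npu_wait_tensor` now live in `vllm_ascend.torchair.utils`. Their bodies are not shown in the diff; the following is only a rough sketch of how such stream helpers are commonly shaped, assuming the `torch.npu` stream API that importing `torch_npu` installs.

```python
import contextlib

import torch
import torch_npu  # noqa: F401  (registers the torch.npu device module)

_SECONDARY_STREAMS: dict = {}


def npu_stream_switch(tag: str, enabled: bool = True):
    # Context manager: run the enclosed ops on a named secondary NPU
    # stream so they can overlap with work on the default stream.
    if not enabled:
        return contextlib.nullcontext()
    if tag not in _SECONDARY_STREAMS:
        _SECONDARY_STREAMS[tag] = torch.npu.Stream()
    return torch.npu.stream(_SECONDARY_STREAMS[tag])


def npu_wait_tensor(tensor: torch.Tensor,
                    dependency: torch.Tensor,
                    enabled: bool = True) -> torch.Tensor:
    # Ensure `dependency` has been produced before `tensor` is consumed.
    # A coarse host-side wait stands in here for whatever finer-grained
    # event mechanism the real helper uses.
    if enabled:
        torch.npu.current_stream().synchronize()
    return tensor
```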
4 changes: 2 additions & 2 deletions vllm_ascend/models/deepseek_v2.py
@@ -74,8 +74,8 @@
from vllm_ascend.ops.fused_moe import AscendFusedMoE
from vllm_ascend.quantization.quant_config import AscendLinearMethod
from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
from vllm_ascend.utils import (dispose_tensor, npu_stream_switch,
npu_wait_tensor)
from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
from vllm_ascend.utils import dispose_tensor


class CustomDeepseekV2SiluAndMul(SiluAndMul):
4 changes: 2 additions & 2 deletions vllm_ascend/ops/fused_moe.py
@@ -41,9 +41,9 @@
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
from vllm_ascend.utils import (FusedMoEState, dispose_tensor,
get_fused_moe_state, is_310p, npu_stream_switch,
npu_wait_tensor)
get_fused_moe_state, is_310p)

MOE_ALL2ALL_BUFFER: bool = envs_ascend.MOE_ALL2ALL_BUFFER

7 changes: 5 additions & 2 deletions vllm_ascend/platform.py
@@ -181,7 +181,10 @@

if parallel_config and parallel_config.worker_cls == "auto":
if envs.VLLM_USE_V1:
parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
if ascend_config.torchair_graph_config.enabled:
parallel_config.worker_cls = "vllm_ascend.torchair.worker_torchair.NPUTorchairWorker"

(Codecov / codecov/patch: added line vllm_ascend/platform.py#L185 was not covered by tests)
else:
parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
elif vllm_config.speculative_config:
# NOTE: We set this var to `1` in vllm-ascend to avoid segment
# fault when using spec decode with V0 engine.
@@ -224,7 +227,7 @@
return "vllm_ascend.attention.mla_v1.AscendMLABackend"
use_torchair = get_ascend_config().torchair_graph_config.enabled
if use_v1 and use_torchair:
return "vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend"
return "vllm_ascend.torchair.attention_torchair.AscendAttentionTorchairBackend"
if use_v1:
return "vllm_ascend.attention.attention_v1.AscendAttentionBackend"
if use_mla:
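Taken together, the two hunks above mean that enabling torchair graph mode now routes both the worker class (`NPUTorchairWorker`) and the attention backend (`AscendAttentionTorchairBackend`) to the new package. A hypothetical usage sketch, assuming vllm-ascend's `additional_config` keys for the torchair graph config:

```python
from vllm import LLM

# Hypothetical example; the model name and config keys are assumptions.
llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",
    additional_config={"torchair_graph_config": {"enabled": True}},
)
print(llm.generate("Hello, torchair!"))
```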
4 changes: 2 additions & 2 deletions vllm_ascend/quantization/w8a8_dynamic.py
@@ -26,9 +26,9 @@
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.parallel_state import get_ep_group
from vllm_ascend.ops.fused_moe import select_experts
from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, FusedMoEState,
dispose_tensor, get_fused_moe_state,
npu_stream_switch, npu_wait_tensor)
dispose_tensor, get_fused_moe_state)


def apply_mlp(hidden_states: torch.Tensor,