
[Refactor] Refactor torchair #1661
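This PR moves torchair-specific code into a dedicated `vllm_ascend/torchair` package: the torchair attention backend relocates from `vllm_ascend.attention.attention_v1_torchair` to `vllm_ascend.torchair.attention_torchair`, a separate `NPUTorchairWorker` is selected when torchair graph mode is enabled, the torchair cache helpers and the `npu_stream_switch`/`npu_wait_tensor` utilities move from `vllm_ascend.utils` to `vllm_ascend.torchair.utils`, and the corresponding unit tests move under `tests/ut/torchair/`.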


Closed · wants to merge 1 commit
Empty file added tests/ut/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion tests/ut/test_platform.py
@@ -523,7 +523,7 @@ def test_get_attn_backend_cls_use_v1_and_torchair(self,
)
self.assertEqual(
result,
"vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend"
"vllm_ascend.torchair.attention_torchair.AscendAttentionTorchairBackend"
)

@patch('vllm_ascend.platform.get_ascend_config')
47 changes: 4 additions & 43 deletions tests/ut/test_utils.py
@@ -123,16 +123,13 @@ def test_aligned_16(self):
@mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
new=mock.MagicMock)
@mock.patch('vllm_ascend.utils.is_310p')
@mock.patch('vllm_ascend.utils.get_ascend_config')
def test_maybe_converting_weight_acl_format(self, mock_get_config,
mock_310p, mock_npu_cast,
def test_maybe_converting_weight_acl_format(self, mock_310p, mock_npu_cast,
mock_get_format):
ACL_FORMAT_FRACTAL_NZ = 29
mock_310p.return_value = True

mock_config = mock.MagicMock()
mock_config.torchair_graph_config.enabled = True
mock_get_config.return_value = mock_config
mock_get_format.return_value = 1

mock_npu_cast.return_value = 1
@@ -145,23 +142,21 @@ def test_maybe_converting_weight_acl_format(self, mock_get_config,
model = mock.MagicMock()
model.modules.return_value = [fused_moe]

utils.maybe_converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
self.assertEqual(fused_moe.w13_weight.data, 1)

@mock.patch('torch_npu.get_npu_format')
@mock.patch('torch_npu.npu_format_cast')
@mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
new=mock.MagicMock)
@mock.patch('vllm_ascend.utils.is_310p')
@mock.patch('vllm_ascend.utils.get_ascend_config')
def test_maybe_converting_weight_acl_format_format_true(
self, mock_get_config, mock_310p, mock_npu_cast, mock_get_format):
self, mock_310p, mock_npu_cast, mock_get_format):
ACL_FORMAT_FRACTAL_NZ = 29
mock_310p.return_value = True

mock_config = mock.MagicMock()
mock_config.torchair_graph_config.enabled = True
mock_get_config.return_value = mock_config
mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ

mock_npu_cast.return_value = 1
@@ -176,20 +171,7 @@ def test_maybe_converting_weight_acl_format_format_true(

mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ

utils.maybe_converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)

@mock.patch('vllm_ascend.utils.get_ascend_config')
@mock.patch('vllm_ascend.utils.is_310p', return_value=False)
def test_maybe_converting_weight_acl_format_not_310_not_graph(
self, mock_310p, mock_get_config):
mock_config = mock.MagicMock()
mock_config.torchair_graph_config.enabled = False
mock_get_config.return_value = mock_config

mock_constant = mock.MagicMock()

mock_model = mock.MagicMock()
utils.maybe_converting_weight_acl_format(mock_model, mock_constant)
utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)

@mock.patch('importlib.util.find_spec')
@mock.patch('importlib.import_module')
@@ -280,27 +262,6 @@ def test_update_aclgraph_sizes(self):
3,
len(test_vllm_config.compilation_config.cudagraph_capture_sizes))

def test_get_torchair_current_work_dir(self):
cache_dir = utils.TORCHAIR_CACHE_DIR
work_dir = utils.get_torchair_current_work_dir()
self.assertEqual(cache_dir, work_dir)
work_dir = utils.get_torchair_current_work_dir("test")
self.assertEqual(os.path.join(cache_dir, "test"), work_dir)

def test_torchair_cache_dir(self):
utils.write_kv_cache_bytes_to_file(0, 100)
self.assertTrue(utils.check_torchair_cache_exist(),
"Create torchair cache dir failed")
self.assertTrue(utils.check_kv_cache_bytes_cache_exist(),
"Create kv cache bytes cache dir failed")
kv_cache_bytes = utils.read_kv_cache_bytes_from_file(0)
self.assertEqual(100, kv_cache_bytes)
utils.delete_torchair_cache_file()
self.assertFalse(utils.check_torchair_cache_exist(),
"Delete torchair cache dir failed")
self.assertFalse(utils.check_kv_cache_bytes_cache_exist(),
"Delete kv cache bytes cache dir failed")


class TestProfileExecuteDuration(unittest.TestCase):

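The two tests above show the helper being renamed from `maybe_converting_weight_acl_format` to `converting_weight_acl_format`, with the `get_ascend_config`/torchair-graph gate removed. Below is a minimal sketch of what the renamed helper plausibly does, reconstructed only from what the tests mock (`torch_npu.get_npu_format`, `torch_npu.npu_format_cast`, `FusedMoE`); the `w2_weight` handling is an assumption, not taken from the source.

```python
import torch_npu
from vllm.model_executor.layers.fused_moe.layer import FusedMoE


def converting_weight_acl_format(model, acl_format: int) -> None:
    # Cast FusedMoE expert weights to the requested ACL format
    # (e.g. ACL_FORMAT_FRACTAL_NZ == 29). The second test expects no
    # cast when get_npu_format() already reports the target format.
    for module in model.modules():
        if not isinstance(module, FusedMoE):
            continue
        if torch_npu.get_npu_format(module.w13_weight.data) == acl_format:
            continue
        module.w13_weight.data = torch_npu.npu_format_cast(
            module.w13_weight.data, acl_format)
        # Assumed symmetric handling of the second expert weight.
        module.w2_weight.data = torch_npu.npu_format_cast(
            module.w2_weight.data, acl_format)
```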
Empty file added tests/ut/torchair/__init__.py
Empty file.
28 changes: 28 additions & 0 deletions tests/ut/torchair/test_utils.py
@@ -0,0 +1,28 @@
import os

from tests.ut.base import TestBase
from vllm_ascend.torchair import utils


class TestTorchairUtils(TestBase):

def test_get_torchair_current_work_dir(self):
cache_dir = utils.TORCHAIR_CACHE_DIR
work_dir = utils.get_torchair_current_work_dir()
self.assertEqual(cache_dir, work_dir)
work_dir = utils.get_torchair_current_work_dir("test")
self.assertEqual(os.path.join(cache_dir, "test"), work_dir)

def test_torchair_cache_dir(self):
utils.write_kv_cache_bytes_to_file(0, 100)
self.assertTrue(utils.check_torchair_cache_exist(),
"Create torchair cache dir failed")
self.assertTrue(utils.check_kv_cache_bytes_cache_exist(),
"Create kv cache bytes cache dir failed")
kv_cache_bytes = utils.read_kv_cache_bytes_from_file(0)
self.assertEqual(100, kv_cache_bytes)
utils.delete_torchair_cache_file()
self.assertFalse(utils.check_torchair_cache_exist(),
"Delete torchair cache dir failed")
self.assertFalse(utils.check_kv_cache_bytes_cache_exist(),
"Delete kv cache bytes cache dir failed")
6 changes: 4 additions & 2 deletions tests/ut/worker/test_pooling_model_runner.py
@@ -1,3 +1,4 @@
import os
import unittest
from unittest.mock import MagicMock, patch

@@ -24,9 +25,10 @@ def _create_model_runner(self, model: str, *args,
def setUp(self):
"""Initialize test fixtures and common mocks"""
self.attn_backend = "npu"

model_path = os.path.join(os.path.dirname(__file__), "..",
"fake_weight")
model_runner = self._create_model_runner(
"tests/ut/fake_weight",
model_path,
trust_remote_code=True,
enable_chunked_prefill=False,
)
2 changes: 1 addition & 1 deletion vllm_ascend/attention/mla_v1.py
@@ -21,7 +21,7 @@
from vllm_ascend.multistream.context import get_multistream_comm_context
from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor
from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
from vllm_ascend.worker.npu_input_batch import InputBatch

if TYPE_CHECKING:
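This import move repeats across the files below (`deepseek_v2.py`, `fused_moe.py`, `w8a8_dynamic.py`): `npu_stream_switch` and `npu_wait_tensor` now live in `vllm_ascend.torchair.utils`. Their bodies are not shown in the diff; the following is only a rough sketch of how such stream helpers are commonly shaped, assuming the `torch.npu` stream API that importing `torch_npu` installs.

```python
import contextlib

import torch
import torch_npu  # noqa: F401  (registers the torch.npu device module)

_SECONDARY_STREAMS: dict = {}


def npu_stream_switch(tag: str, enabled: bool = True):
    # Context manager: run the enclosed ops on a named secondary NPU
    # stream so they can overlap with work on the default stream.
    if not enabled:
        return contextlib.nullcontext()
    if tag not in _SECONDARY_STREAMS:
        _SECONDARY_STREAMS[tag] = torch.npu.Stream()
    return torch.npu.stream(_SECONDARY_STREAMS[tag])


def npu_wait_tensor(tensor: torch.Tensor,
                    dependency: torch.Tensor,
                    enabled: bool = True) -> torch.Tensor:
    # Ensure `dependency` has been produced before `tensor` is consumed.
    # A coarse host-side wait stands in here for whatever finer-grained
    # event mechanism the real helper uses.
    if enabled:
        torch.npu.current_stream().synchronize()
    return tensor
```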
4 changes: 2 additions & 2 deletions vllm_ascend/models/deepseek_v2.py
@@ -74,8 +74,8 @@
from vllm_ascend.ops.fused_moe import AscendFusedMoE
from vllm_ascend.quantization.quant_config import AscendLinearMethod
from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
from vllm_ascend.utils import (dispose_tensor, npu_stream_switch,
npu_wait_tensor)
from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
from vllm_ascend.utils import dispose_tensor


class CustomDeepseekV2SiluAndMul(SiluAndMul):
4 changes: 2 additions & 2 deletions vllm_ascend/ops/fused_moe.py
@@ -41,9 +41,9 @@
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
from vllm_ascend.utils import (FusedMoEState, dispose_tensor,
get_fused_moe_state, is_310p, npu_stream_switch,
npu_wait_tensor)
get_fused_moe_state, is_310p)

MOE_ALL2ALL_BUFFER: bool = envs_ascend.MOE_ALL2ALL_BUFFER

7 changes: 5 additions & 2 deletions vllm_ascend/platform.py
@@ -181,7 +181,10 @@

if parallel_config and parallel_config.worker_cls == "auto":
if envs.VLLM_USE_V1:
parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
if ascend_config.torchair_graph_config.enabled:
parallel_config.worker_cls = "vllm_ascend.torchair.worker_torchair.NPUTorchairWorker"

(Codecov / codecov/patch: added line vllm_ascend/platform.py#L185 was not covered by tests)
else:
parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
elif vllm_config.speculative_config:
# NOTE: We set this var to `1` in vllm-ascend to avoid segment
# fault when using spec decode with V0 engine.
@@ -224,7 +227,7 @@
return "vllm_ascend.attention.mla_v1.AscendMLABackend"
use_torchair = get_ascend_config().torchair_graph_config.enabled
if use_v1 and use_torchair:
return "vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend"
return "vllm_ascend.torchair.attention_torchair.AscendAttentionTorchairBackend"
if use_v1:
return "vllm_ascend.attention.attention_v1.AscendAttentionBackend"
if use_mla:
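Taken together, the two hunks above mean that enabling torchair graph mode now routes both the worker class (`NPUTorchairWorker`) and the attention backend (`AscendAttentionTorchairBackend`) to the new package. A hypothetical usage sketch, assuming vllm-ascend's `additional_config` keys for the torchair graph config:

```python
from vllm import LLM

# Hypothetical example; the model name and config keys are assumptions.
llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",
    additional_config={"torchair_graph_config": {"enabled": True}},
)
print(llm.generate("Hello, torchair!"))
```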
4 changes: 2 additions & 2 deletions vllm_ascend/quantization/w8a8_dynamic.py
@@ -26,9 +26,9 @@
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.parallel_state import get_ep_group
from vllm_ascend.ops.fused_moe import select_experts
from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, FusedMoEState,
dispose_tensor, get_fused_moe_state,
npu_stream_switch, npu_wait_tensor)
dispose_tensor, get_fused_moe_state)


def apply_mlp(hidden_states: torch.Tensor,