
Commit 2c6f2dc

22dimensions authored and offline0806 committed
[1/N][Refactor][Quantization] remove redundant quantizer class (vllm-project#2680)
### What this PR does / why we need it?

The AscendQuantizer/LLMQuantizer classes were used to select a quant method based on the quant config and a few other arguments, but replacing these classes with a plain map is simpler and cleaner, so I removed them.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

UT and e2e tests.

- vLLM version: v0.10.1.1
- vLLM main: vllm-project/vllm@6997a25

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
Signed-off-by: offline0806 <z00858301@china.huawei.com>
1 parent 5c933b5 commit 2c6f2dc
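
For orientation, the map-based dispatch that replaces the quantizer classes can be pictured roughly as below. This is a minimal sketch inferred from the diff: the `{quant_type: {layer_type: method_class}}` shape of `ASCEND_QUANTIZATION_METHOD_MAP`, the `get_quant_method` signature, and the `NotImplementedError` behavior are visible in the new tests, while the map entries and the lookup details are illustrative assumptions, not the actual implementation.

```python
# Sketch of the map-based dispatch in vllm_ascend/quantization/utils.py.
# The map entries below are placeholders; the real module maps quant types
# such as W8A8 or C8 to concrete Ascend quant method classes.
from typing import Any, Dict, Optional, Type

ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type]] = {
    # "W8A8": {"linear": AscendW8A8LinearMethod, "moe": AscendW8A8FusedMoEMethod},
    # "C8":   {"attention": AscendC8KVCacheMethod},
}


def get_quant_method(quant_description: Dict[str, Any],
                     prefix: str,
                     layer_type: str,
                     packed_modules_mapping: Optional[Dict[str, Any]] = None):
    # Attention layers carry their quant type in fa_quant_type/kv_quant_type;
    # other layers are assumed here to be looked up by their weight name.
    # (The real function presumably also uses packed_modules_mapping to
    # resolve prefixes of packed modules for linear/moe layers.)
    if layer_type == "attention":
        quant_type = (quant_description.get("fa_quant_type")
                      or quant_description.get("kv_quant_type"))
    else:
        quant_type = quant_description.get(f"{prefix}.weight")

    layer_map = ASCEND_QUANTIZATION_METHOD_MAP.get(quant_type, {})
    if layer_type not in layer_map:
        raise NotImplementedError(
            f"{quant_type} is not supported for layer type {layer_type}.")
    return layer_map[layer_type]()
```

Under this shape, every call site in quant_config.py (see the diff below) reduces to a single `get_quant_method(...)` call per layer type.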

File tree

10 files changed, +322 -555 lines changed


tests/ut/quantization/test_quant_config.py

Lines changed: 8 additions & 19 deletions
@@ -156,33 +156,22 @@ class TestAscendKVCacheMethod(TestBase):
     def setUp(self):
         # Setup common test fixtures
         self.mock_quant_config = MagicMock(spec=AscendQuantConfig)
-        self.mock_quant_config.quant_description = {"some_config": "value"}
-        self.prefix = "attention_layer"
+        self.mock_quant_config.quant_description = {"kv_quant_type": "C8"}
+        self.prefix = "layer.attn"

-        # Mock the quantizer and quant_method
-        self.mock_quantizer = MagicMock()
+        # Mock quant_method
         self.mock_quant_method = MagicMock()
-
-        # Patch the AscendQuantizer
-        self.quantizer_patcher = patch(
-            'vllm_ascend.quantization.quant_config.AscendQuantizer.get_quantizer',
-            return_value=self.mock_quantizer)
-        self.mock_get_quantizer = self.quantizer_patcher.start()
-
-        self.mock_quantizer.build_attention_method.return_value = self.mock_quant_method
+        self.patcher = patch(
+            'vllm_ascend.quantization.quant_config.get_quant_method')
+        self.mock_get_quant_method = self.patcher.start()
+        self.mock_get_quant_method.return_value = self.mock_quant_method

         # Create instance
         self.kv_cache_method = AscendKVCacheMethod(self.mock_quant_config,
                                                    self.prefix)

     def tearDown(self):
-        self.quantizer_patcher.stop()
-
-    def test_init(self):
-        """Test initialization with proper quantizer setup."""
-        self.mock_get_quantizer.assert_called_once_with(
-            self.mock_quant_config.quant_description, self.prefix)
-        self.mock_quantizer.build_attention_method.assert_called_once()
+        self.patcher.stop()

     def test_create_weights(self):
         """Test create_weights delegates to quant_method."""

tests/ut/quantization/test_quantizer.py

Lines changed: 0 additions & 145 deletions
This file was deleted.

(new file)

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+import types
+
+from tests.ut.base import TestBase
+from vllm_ascend.quantization.utils import (ASCEND_QUANTIZATION_METHOD_MAP,
+                                            get_quant_method)
+
+
+class TestGetQuantMethod(TestBase):
+
+    def setUp(self):
+        self.original_quantization_method_map = ASCEND_QUANTIZATION_METHOD_MAP.copy(
+        )
+        for quant_type, layer_map in ASCEND_QUANTIZATION_METHOD_MAP.items():
+            for layer_type in layer_map.keys():
+                ASCEND_QUANTIZATION_METHOD_MAP[quant_type][
+                    layer_type] = types.new_class(f"{quant_type}_{layer_type}")
+
+    def tearDown(self):
+        # Restore original map
+        ASCEND_QUANTIZATION_METHOD_MAP.clear()
+        ASCEND_QUANTIZATION_METHOD_MAP.update(
+            self.original_quantization_method_map)
+
+    def test_linear_quant_methods(self):
+        for quant_type, layer_map in ASCEND_QUANTIZATION_METHOD_MAP.items():
+            if "linear" in layer_map.keys():
+                prefix = "linear_layer"
+                cls = layer_map["linear"]
+                method = get_quant_method({"linear_layer.weight": quant_type},
+                                          prefix, "linear")
+                self.assertIsInstance(method, cls)
+
+    def test_moe_quant_methods(self):
+        for quant_type, layer_map in ASCEND_QUANTIZATION_METHOD_MAP.items():
+            if "moe" in layer_map.keys():
+                prefix = "layer"
+                cls = layer_map["moe"]
+                method = get_quant_method({"layer.weight": quant_type}, prefix,
+                                          "moe")
+                self.assertIsInstance(method, cls)
+
+    def test_with_fa_quant_type(self):
+        quant_description = {"fa_quant_type": "C8"}
+        method = get_quant_method(quant_description, ".attn", "attention")
+        self.assertIsInstance(
+            method, ASCEND_QUANTIZATION_METHOD_MAP["C8"]["attention"])
+
+    def test_with_kv_quant_type(self):
+        quant_description = {"kv_quant_type": "C8"}
+        method = get_quant_method(quant_description, ".attn", "attention")
+        self.assertIsInstance(
+            method, ASCEND_QUANTIZATION_METHOD_MAP["C8"]["attention"])
+
+    def test_invalid_layer_type(self):
+        quant_description = {"linear_layer.weight": "W8A8"}
+        with self.assertRaises(NotImplementedError):
+            get_quant_method(quant_description, "linear_layer", "unsupported")
+
+    def test_invalid_quant_type(self):
+        quant_description = {"linear_layer.weight": "UNKNOWN"}
+        with self.assertRaises(NotImplementedError):
+            get_quant_method(quant_description, "linear_layer", "linear")

tests/ut/torchair/ops/test_torchair_fused_moe.py

Lines changed: 1 addition & 5 deletions
@@ -24,7 +24,6 @@

 from vllm_ascend.ascend_forward_context import _get_fused_moe_state
 from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
-from vllm_ascend.quantization.quantizer import W8A8Quantizer
 from vllm_ascend.torchair.ops.torchair_fused_moe import (
     TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod)
 from vllm_ascend.utils import AscendSocVersion, adapt_patch  # noqa E402
@@ -236,12 +235,9 @@ def test_init_with_quant(self, mock_dist_env, default_moe_config):
         mock_quant_method = MockFusedMoEMethod()
         mock_quant_config.get_quant_method.return_value = mock_quant_method
         mock_quant_config.is_layer_skipped_ascend.return_value = False
-        with patch(
-                'vllm_ascend.quantization.quantizer.AscendQuantizer.get_quantizer',
-                return_value=W8A8Quantizer):
+        with patch("vllm_ascend.quantization.quant_config.get_quant_method"):
             moe = TorchairAscendFusedMoE(**default_moe_config,
                                          quant_config=mock_quant_config)
-
         assert moe.quant_method is not None
         assert isinstance(moe.quant_method, AscendFusedMoEMethod)

tests/ut/torchair/test_utils.py

Lines changed: 0 additions & 13 deletions
@@ -6,7 +6,6 @@
 import torch

 from tests.ut.base import TestBase
-from vllm_ascend.quantization.quantizer import SUPPORT_ASCEND_QUANTIZER_TYPE
 from vllm_ascend.torchair import utils


@@ -135,15 +134,3 @@ def test_converting_weight_acl_format_format_true(self, mock_npu_cast,

         utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
         mock_npu_cast.assert_not_called()
-
-    def test_torchair_quant_method_register(self):
-
-        TorchairW8A8DYNAMICQuantizer = SUPPORT_ASCEND_QUANTIZER_TYPE[
-            "W8A8_DYNAMIC"]
-        TorchairW4A8DYNAMICQuantizer = SUPPORT_ASCEND_QUANTIZER_TYPE[
-            "W4A8_DYNAMIC"]
-        utils.torchair_quant_method_register()
-        self.assertNotEqual(TorchairW8A8DYNAMICQuantizer,
-                            SUPPORT_ASCEND_QUANTIZER_TYPE["W8A8_DYNAMIC"])
-        self.assertNotEqual(TorchairW4A8DYNAMICQuantizer,
-                            SUPPORT_ASCEND_QUANTIZER_TYPE["W4A8_DYNAMIC"])

vllm_ascend/quantization/quant_config.py

Lines changed: 13 additions & 24 deletions
@@ -38,7 +38,7 @@
 from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
 from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD

-from .quantizer import AscendQuantizer
+from .utils import get_quant_method


 @register_quantization_config(ASCEND_QUANTIZATION_METHOD)
@@ -150,18 +150,15 @@ def get_scaled_act_names(self) -> List[str]:
 class AscendLinearMethod(LinearMethodBase):
     """Linear method for Ascend quantization.

-    This class calls AscendQuantizer to search a specific quantization
-    implementations supported on ascend hardware for linear methods.
-
     Args:
         quant_config: The Ascend quantization config.
     """

     def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                  packed_modules_mapping: Dict[str, Any]) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_linear_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "linear",
+                                             packed_modules_mapping)

     def create_weights(
         self,
@@ -231,17 +228,13 @@ def apply(
 class AscendKVCacheMethod(BaseKVCacheMethod):
     """KVCache method for Ascend quantization.

-    This class calls AscendQuantizer to search a specific quantization
-    implementations supported on ascend hardware for kvcache methods.
-
     Args:
         quant_config: The Ascend quantization config.
     """

     def __init__(self, quant_config: AscendQuantConfig, prefix: str) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix)
-        self.quant_method = self.quantizer.build_attention_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "attention")

     def create_weights(self, layer: torch.nn.Module) -> None:
         # Different from linear method, there are no weight processing/slicing
@@ -263,18 +256,15 @@ def apply(self, layer: torch.nn.Module, query: torch.Tensor,
 class AscendFusedMoEMethod(FusedMoEMethodBase):
     """FusedMoE method for Ascend quantization.

-    This class calls AscendQuantizer to search a specific quantization
-    implementations supported on ascend hardware for kvcache methods.
-
     Args:
         quant_config: The Ascend quantization config.
     """

     def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                  packed_modules_mapping: Dict[str, Any]):
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_moe_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "moe",
+                                             packed_modules_mapping)

     def create_weights(
         self,
@@ -344,14 +334,13 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:

 class AscendEmbeddingMethod(AscendLinearMethod):
     """Embedding method for Ascend quantization.
-    This class calls AscendQuantizer to search a specific quantization
-    implementations supported on ascend hardware for Embedding methods.
+
     Args:
         quant_config: The Ascend quantization config.
     """

     def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                  packed_modules_mapping: Dict[str, Any]) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_linear_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "linear",
+                                             packed_modules_mapping)
