Skip to content

Commit 074cc48

Browse files
committed
refactor quantization
Signed-off-by: hust17yixuan <303660421@qq.com>
1 parent 99bf25a commit 074cc48

File tree

8 files changed

+1494
-2
lines changed

8 files changed

+1494
-2
lines changed

tests/ut/torchair/test_utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from tests.ut.base import TestBase
99
from vllm_ascend.torchair import utils
10+
from vllm_ascend.quantization.quantizer import SUPPORT_ASCEND_QUANTIZER_TYPE
1011

1112

1213
class TestTorchairUtils(TestBase):
@@ -120,3 +121,11 @@ def test_converting_weight_acl_format_format_true(self, mock_npu_cast,
120121

121122
utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
122123
mock_npu_cast.assert_not_called()
124+
125+
def test_torchair_quant_method_register(self):
    """Registering torchair quant methods replaces both registry entries."""
    # Capture the currently registered quantizer classes for both schemes.
    w8a8_before = SUPPORT_ASCEND_QUANTIZER_TYPE["W8A8_DYNAMIC"]
    w4a8_before = SUPPORT_ASCEND_QUANTIZER_TYPE["W4A8_DYNAMIC"]

    utils.torchair_quant_method_register()

    # The registry slots should no longer hold the original classes.
    self.assertNotEqual(w8a8_before,
                        SUPPORT_ASCEND_QUANTIZER_TYPE["W8A8_DYNAMIC"])
    self.assertNotEqual(w4a8_before,
                        SUPPORT_ASCEND_QUANTIZER_TYPE["W4A8_DYNAMIC"])

vllm_ascend/torchair/models/torchair_deepseek_v2.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@
7171

7272
from vllm_ascend.ascend_config import get_ascend_config
7373
from vllm_ascend.quantization.quant_config import AscendLinearMethod
74-
from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
74+
from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import TorchairAscendW8A8DynamicLinearMethod
7575
from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE
7676
from vllm_ascend.utils import dispose_tensor, npu_prefetch
7777

@@ -262,7 +262,7 @@ def __init__(
262262
if isinstance(quant_method, UnquantizedLinearMethod):
263263
self.act_fn = TorchairDeepseekV2SiluAndMul()
264264
elif (isinstance(quant_method, AscendLinearMethod) and isinstance(
265-
quant_method.quant_method, AscendW8A8DynamicLinearMethod)):
265+
quant_method.quant_method, TorchairAscendW8A8DynamicLinearMethod)):
266266
# TODO(sdmyzlp): Currently preserved as before:
267267
# 1. The only quantization supported for silu is W8A8Dynamic
268268
# 2. Output dtype of gate_up/down is fixed to be int32/bfloat16

vllm_ascend/torchair/quantization/__init__.py

Whitespace-only changes.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from vllm_ascend.quantization.quantizer import VLLMAscendQuantizer
2+
from vllm_ascend.torchair.quantization.torchair_w4a8_dynamic import (
3+
TorchairAscendW4A8DynamicFusedMoEMethod,
4+
TorchairAscendW4A8DynamicLinearMethod)
5+
from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import (
6+
TorchairAscendW8A8DynamicFusedMoEMethod,
7+
TorchairAscendW8A8DynamicLinearMethod)
8+
9+
10+
class TorchairW8A8DYNAMICQuantizer(VLLMAscendQuantizer):
11+
12+
@staticmethod
13+
def build_linear_method():
14+
return TorchairAscendW8A8DynamicLinearMethod()
15+
16+
@staticmethod
17+
def build_moe_method():
18+
return TorchairAscendW8A8DynamicFusedMoEMethod()
19+
20+
21+
class TorchairW4A8DYNAMICQuantizer(VLLMAscendQuantizer):
22+
23+
@staticmethod
24+
def build_linear_method():
25+
return TorchairAscendW4A8DynamicLinearMethod()
26+
27+
@staticmethod
28+
def build_moe_method():
29+
return TorchairAscendW4A8DynamicFusedMoEMethod()

0 commit comments

Comments
 (0)