
Commit 0942d9a

[3/N][Refactor][Quantization]remove packed_modules_mapping from models (#3021)
### What this PR does / why we need it?
Some custom models in vllm-ascend define their own `packed_modules_mapping`, which prevents them from sharing the same model classes as the vLLM community. This PR moves those custom mappings into the quantization layer (`vllm_ascend/quantization/quant_config.py`), so that some of the custom model classes can be removed afterwards.

### Does this PR introduce _any_ user-facing change?
Tested by CI.

### How was this patch tested?
Tested by CI.

- vLLM version: v0.10.2
- vLLM main: vllm-project/vllm@5089fd7

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
1 parent 4ba5671 commit 0942d9a
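
Note: the core of the change is a lookup keyed on the HuggingFace `model_type` instead of a class attribute on each custom model. Below is a minimal, self-contained sketch of that idea; `AscendQuantConfigSketch` and the explicit `model_type` argument are illustrative stand-ins, while the real code in `vllm_ascend/quantization/quant_config.py` (see the diff further down) reads the model type from `get_current_vllm_config()` inside `get_quant_method()`.

packed_modules_model_mapping = {
    # One entry per supported HF model_type; see the quant_config.py diff below
    # for the full table (deepseek_v2/v3, deepseek_mtp, qwen3_next, qwen2_5_vl).
    "qwen3_moe": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts":
        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
}


class AscendQuantConfigSketch:
    """Hypothetical stand-in for AscendQuantConfig, for illustration only."""

    packed_modules_mapping: dict = {}

    def resolve_packed_modules(self, model_type: str) -> dict:
        # The real get_quant_method() does this lookup using
        # get_current_vllm_config().model_config.hf_config.model_type.
        if model_type in packed_modules_model_mapping:
            self.packed_modules_mapping = packed_modules_model_mapping[model_type]
        return self.packed_modules_mapping


if __name__ == "__main__":
    cfg = AscendQuantConfigSketch()
    print(cfg.resolve_packed_modules("qwen3_moe")["qkv_proj"])
    # -> ['q_proj', 'k_proj', 'v_proj']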

8 files changed: +76 −80 lines changed


tests/ut/models/test_qwen3_moe.py

Lines changed: 0 additions & 30 deletions
@@ -15,41 +15,11 @@
 import math
 import unittest
 
-import pytest
 import torch
-from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM
 
-from vllm_ascend.models.qwen3_moe import CustomQwen3MoeForCausalLM
 from vllm_ascend.torchair.models.qwen3_moe import CustomQwen3MoeAttention
 
 
-class TestCustomQwen3MoeForCausalLM:
-
-    def test_class_inheritance(self):
-        assert issubclass(CustomQwen3MoeForCausalLM, Qwen3MoeForCausalLM)
-
-    @pytest.mark.parametrize("key, expected", [
-        ("qkv_proj", ["q_proj", "k_proj", "v_proj"]),
-        ("gate_up_proj", ["gate_proj", "up_proj"]),
-        ("experts",
-         ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]),
-    ])
-    def test_packed_modules_mapping(self, key, expected):
-        assert CustomQwen3MoeForCausalLM.packed_modules_mapping[
-            key] == expected
-
-    def test_packed_modules_mapping_structure(self):
-        expected_mapping = {
-            "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-            "gate_up_proj": ["gate_proj", "up_proj"],
-            "experts": [
-                "experts.0.gate_proj", "experts.0.up_proj",
-                "experts.0.down_proj"
-            ]
-        }
-        assert CustomQwen3MoeForCausalLM.packed_modules_mapping == expected_mapping
-
-
 class DummyRMSNorm:
 
     def __init__(self, dim: int, eps: float = 1e-6):

tests/ut/quantization/test_quant_config.py

Lines changed: 15 additions & 3 deletions
@@ -73,16 +73,20 @@ def test_override_quantization_method(self, mock_is_available):
         self.assertIsNone(result)
 
     def test_get_quant_method_for_linear(self):
+        mock_config = MagicMock()
+        mock_config.model_config.hf_config.model_type = None
         linear_layer = MagicMock(spec=LinearBase)
         # Test skipped layer
-        with patch.object(self.ascend_config,
+        with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
+                patch.object(self.ascend_config, \
                           'is_layer_skipped_ascend',
                           return_value=True):
             method = self.ascend_config.get_quant_method(linear_layer, ".attn")
             self.assertIsInstance(method, UnquantizedLinearMethod)
 
         # Test quantized layer
         with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \
+             patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
             patch('vllm_ascend.quantization.quant_config.AscendLinearMethod', return_value=MagicMock()) as mock_ascend_linear:
 
             method = self.ascend_config.get_quant_method(linear_layer, ".attn")
@@ -93,14 +97,18 @@ def test_get_quant_method_for_linear(self):
 
     def test_get_quant_method_for_attention(self):
         attention_layer = MagicMock(spec=Attention)
-        with patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod',
+        mock_config = MagicMock()
+        mock_config.model_config.hf_config.model_type = None
+        with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
+             patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
                    return_value=MagicMock()) as mock_ascend_kvcache:
             # Test with fa_quant_type
             method = self.ascend_config.get_quant_method(
                 attention_layer, ".attn")
             self.assertIs(method, mock_ascend_kvcache.return_value)
 
-        with patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod',
+        with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
+             patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
                    return_value=MagicMock()) as mock_ascend_kvcache:
             # Test with kv_quant_type
             modified_config = {"kv_quant_type": "C8"}
@@ -113,16 +121,20 @@ def test_get_quant_method_for_fused_moe(self):
         fused_moe_layer = MagicMock(spec=FusedMoE)
         fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
         fused_moe_layer.moe_config = MagicMock(spec=FusedMoEConfig)
+        mock_config = MagicMock()
+        mock_config.model_config.hf_config.model_type = None
 
         # Test skipped layer
         with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=True), \
+             patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
             patch('vllm_ascend.quantization.quant_config.AscendUnquantizedFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
             method = self.ascend_config.get_quant_method(
                 fused_moe_layer, "moe_layer")
             self.assertIs(method, mock_ascend_moe.return_value)
 
         # Test quantized layer
         with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \
+             patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
             patch('vllm_ascend.quantization.quant_config.AscendFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
             method = self.ascend_config.get_quant_method(
                 fused_moe_layer, "moe_layer")

vllm_ascend/models/deepseek_mtp.py

Lines changed: 0 additions & 8 deletions
@@ -180,14 +180,6 @@ def compute_logits(
 
 
 class CustomDeepSeekMTP(DeepSeekMTP):
-    # NOTE 1.The quantized MTP layer of deepseek on the NPU is not quantized;
-    # NOTE 2.The description file generated by the current msmodelslim tool does not have
-    # MTP layer info. Please manually add it and set the value to FLOAT.
-    packed_modules_mapping = {
-        "gate_up_proj": ["gate_proj", "up_proj"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
-    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)

vllm_ascend/models/deepseek_v2.py

Lines changed: 0 additions & 6 deletions
@@ -320,12 +320,6 @@ def __init__(self, vllm_config: VllmConfig, prefix: str) -> None:
 
 
 class CustomDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
-    # add `packed_modules_mapping` in `DeepseekV2ForCausalLM` to support weight merging
-    packed_modules_mapping = {
-        "gate_up_proj": ["gate_proj", "up_proj"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
-    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)

vllm_ascend/models/qwen2_5_vl.py

Lines changed: 0 additions & 11 deletions
@@ -491,17 +491,6 @@ def forward(
                         dummy_inputs=Qwen2_5_VLDummyInputsBuilder)
 class AscendQwen2_5_VLForConditionalGeneration(
         Qwen2_5_VLForConditionalGeneration):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)

vllm_ascend/models/qwen3_moe.py

Lines changed: 0 additions & 13 deletions
@@ -318,19 +318,6 @@ def forward(
 
 
 class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
-    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)

vllm_ascend/models/qwen3_next.py

Lines changed: 0 additions & 9 deletions
@@ -1166,15 +1166,6 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
                            MixtureOfExperts, IsHybrid):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": ["gate_proj", "up_proj"],
-        "in_proj": ["in_proj_qkvz", "in_proj_ba"],
-    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config

vllm_ascend/quantization/quant_config.py

Lines changed: 61 additions & 0 deletions
@@ -19,6 +19,7 @@
 from typing import Any, Callable, Dict, List, Mapping, Optional
 
 import torch
+from vllm.config import get_current_vllm_config
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
@@ -89,6 +90,11 @@ def override_quantization_method(cls, hf_quant_cfg,
 
     def get_quant_method(self, layer: torch.nn.Module,
                          prefix: str) -> Optional["QuantizeMethodBase"]:
+        vllm_config = get_current_vllm_config()
+        model_type = vllm_config.model_config.hf_config.model_type
+        if model_type in packed_modules_model_mapping:
+            self.packed_modules_mapping = packed_modules_model_mapping[
+                model_type]
         from vllm.attention.layer import Attention
         if prefix.startswith("language_model"):
             prefix = prefix.split('.', 1)[-1]
@@ -153,6 +159,61 @@ def get_scaled_act_names(self) -> List[str]:
         return []
 
 
+packed_modules_model_mapping = {
+    "qwen3_moe": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+    },
+    "deepseek_v2": {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    },
+    "deepseek_v3": {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    },
+    # NOTE 1.The quantized MTP layer of deepseek on the NPU is not quantized;
+    # NOTE 2.The description file generated by the current msmodelslim tool does not have
+    # MTP layer info. Please manually add it and set the value to FLOAT.
+    "deepseek_mtp": {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    },
+    "qwen3_next": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "in_proj": ["in_proj_qkvz", "in_proj_ba"],
+    },
+    "qwen2_5_vl": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+}
+
+
 class AscendLinearMethod(LinearMethodBase):
     """Linear method for Ascend quantization.