Commit 8d0a39a

support group-wise weight quant for qwen2 and change cpu kernel to gpu kernel
1 parent e16f4e2 · commit 8d0a39a

File tree: 2 files changed, +61 -45 lines

paddlenlp/experimental/transformers/deepseek_v2/modeling.py

Lines changed: 45 additions & 41 deletions
@@ -696,16 +696,18 @@ def set_state_dict(self, state_dict):
                 ).cast(dtype)
 
                 if self.use_weight_only:
-                    (
-                        self.transformer_block.q_a_proj_weights[idx],
-                        self.transformer_block.q_a_proj_weights_scale[idx],
-                    ) = weight_quantize(q_a_proj_weight, algo=self.quant_algo, group_size=self.weightonly_group_size)
-
-                    (
-                        self.transformer_block.q_b_proj_weights[idx],
-                        self.transformer_block.q_b_proj_weights_scale[idx],
-                    ) = weight_quantize(q_b_proj_weight, algo=self.quant_algo, group_size=self.weightonly_group_size)
+                    q_a_proj_quanted_weight, q_a_proj_weight_scale = weight_quantize(
+                        q_a_proj_weight, algo=self.quant_algo, group_size=self.weightonly_group_size
+                    )
+                    self.transformer_block.q_a_proj_weights[idx].set_value(q_a_proj_quanted_weight)
+                    self.transformer_block.q_a_proj_weights_scale[idx].set_value(q_a_proj_weight_scale)
+
+                    q_b_proj_quanted_weight, q_b_proj_weight_scale = weight_quantize(
+                        q_b_proj_weight, algo=self.quant_algo, group_size=self.weightonly_group_size
+                    )
+                    self.transformer_block.q_b_proj_weights[idx].set_value(q_b_proj_quanted_weight)
                     self.transformer_block.q_a_layernorm_weights[idx].set_value(q_a_layernorm_weight)
+                    self.transformer_block.q_b_proj_weights_scale[idx].set_value(q_b_proj_weight_scale)
                 elif "fp8" in self.quant_type:
                     q_a_proj_quanted_weight = (
                         paddle.to_tensor(
@@ -752,10 +754,11 @@ def set_state_dict(self, state_dict):
                 ).cast(dtype)
 
                 if self.use_weight_only:
-                    (
-                        self.transformer_block.q_proj_weights[idx],
-                        self.transformer_block.q_proj_weights_scale[idx],
-                    ) = weight_quantize(q_proj_weight, algo=self.quant_algo, group_size=self.weightonly_group_size)
+                    q_proj_quanted_weight, q_proj_weight_scale = weight_quantize(
+                        q_proj_weight, algo=self.quant_algo, group_size=self.weightonly_group_size
+                    )
+                    self.transformer_block.q_proj_weights[idx].set_value(q_proj_quanted_weight)
+                    self.transformer_block.q_proj_weights_scale[idx].set_value(q_proj_weight_scale)
                 elif "fp8" in self.quant_type:
                     q_proj_quanted_weight = (
                         paddle.to_tensor(state_dict[f"{self.base_model_prefix}.layers.{idx}.self_attn.q_proj.weight"])
@@ -822,18 +825,18 @@ def set_state_dict(self, state_dict):
            self.transformer_block.v_b_proj_weights[idx].set_value(wv_b)
 
            if self.use_weight_only:
-                (
-                    self.transformer_block.kv_a_proj_with_mqa_weights[idx],
-                    self.transformer_block.kv_a_proj_with_mqa_weights_scale[idx],
-                ) = weight_quantize(
+                kv_a_proj_with_mqa_quanted_weight, kv_a_proj_with_mqa_weight_scale = weight_quantize(
                     kv_a_proj_with_mqa_weight, algo=self.quant_algo, group_size=self.weightonly_group_size
                 )
+                self.transformer_block.kv_a_proj_with_mqa_weights[idx].set_value(kv_a_proj_with_mqa_quanted_weight)
+                self.transformer_block.kv_a_proj_with_mqa_weights_scale[idx].set_value(kv_a_proj_with_mqa_weight_scale)
 
-                (
-                    self.transformer_block.kv_b_proj_weights[idx],
-                    self.transformer_block.kv_b_proj_weights_scale[idx],
-                ) = weight_quantize(kv_b_proj_weight, algo=self.quant_algo, group_size=self.weightonly_group_size)
+                kv_b_proj_quanted_weight, kv_b_proj_weight_scale = weight_quantize(
+                    kv_b_proj_weight, algo=self.quant_algo, group_size=self.weightonly_group_size
+                )
+                self.transformer_block.kv_b_proj_weights[idx].set_value(kv_b_proj_quanted_weight)
                 self.transformer_block.kv_a_layernorm_weights[idx].set_value(kv_a_layernorm_weight)
+                self.transformer_block.kv_b_proj_weights_scale[idx].set_value(kv_b_proj_weight_scale)
            elif "fp8" in self.quant_type:
                kv_a_proj_with_mqa_quanted_weight = (
                    paddle.to_tensor(
@@ -876,10 +879,11 @@ def set_state_dict(self, state_dict):
                self.transformer_block.kv_b_proj_weights[idx].set_value(kv_b_proj_weight)
 
            if self.use_weight_only:
-                (
-                    self.transformer_block.linear_weights[idx],
-                    self.transformer_block.linear_weights_scale[idx],
-                ) = weight_quantize(linear_weight, algo=self.quant_algo, group_size=self.weightonly_group_size)
+                linear_quanted_weight, linear_weight_scale = weight_quantize(
+                    linear_weight, algo=self.quant_algo, group_size=self.weightonly_group_size
+                )
+                self.transformer_block.linear_weights[idx].set_value(linear_quanted_weight)
+                self.transformer_block.linear_weights_scale[idx].set_value(linear_weight_scale)
            elif "fp8" in self.quant_type:
                linear_quanted_weight = (
                    paddle.to_tensor(state_dict[f"{self.base_model_prefix}.layers.{idx}.self_attn.o_proj.weight"])
@@ -915,12 +919,11 @@ def set_state_dict(self, state_dict):
                ffn1_weight_tensor = paddle.to_tensor(concated_ffn1_weight).cast(paddle.get_default_dtype())
 
                if self.use_weight_only:
-                    (
-                        self.transformer_block.ffn1_weights[idx],
-                        self.transformer_block.ffn1_weights_scale[idx],
-                    ) = weight_quantize(
+                    ffn1_quanted_weight_tensor, ffn1_weight_scale_tensor = weight_quantize(
                         ffn1_weight_tensor, algo=self.quant_algo, group_size=self.weightonly_group_size
                     )
+                    self.transformer_block.ffn1_weights[idx].set_value(ffn1_quanted_weight_tensor)
+                    self.transformer_block.ffn1_weights_scale[idx].set_value(ffn1_weight_scale_tensor)
                elif "fp8" in self.quant_type:
                    ffn1_quanted_weight_tensor = (
                        paddle.to_tensor(concated_ffn1_weight).transpose((1, 0)).cast(paddle.float8_e4m3fn)
@@ -949,12 +952,11 @@ def set_state_dict(self, state_dict):
                        state_dict[f"{self.base_model_prefix}.layers.{idx}.mlp.down_proj.weight"]
                    ).cast(paddle.get_default_dtype())
                if self.use_weight_only:
-                    (
-                        self.transformer_block.ffn2_weights[idx],
-                        self.transformer_block.ffn2_weights_scale[idx],
-                    ) = weight_quantize(
+                    ffn2_quanted_weight_tensor, ffn2_weight_scale_tensor = weight_quantize(
                         ffn2_weight_tensor, algo=self.quant_algo, group_size=self.weightonly_group_size
                     )
+                    self.transformer_block.ffn2_weights[idx].set_value(ffn2_quanted_weight_tensor)
+                    self.transformer_block.ffn2_weights_scale[idx].set_value(ffn2_weight_scale_tensor)
                elif "fp8" in self.quant_type:
                    ffn2_quanted_weight_tensor = (
                        paddle.to_tensor(state_dict[f"{self.base_model_prefix}.layers.{idx}.mlp.down_proj.weight"])
@@ -1199,19 +1201,21 @@ def set_state_dict(self, state_dict):
                ).cast(dtype)
 
                if self.use_weight_only:
-                    (
-                        self.transformer_block.shared_expert_ffn1_weights[idx],
-                        self.transformer_block.shared_expert_ffn1_weights_scale[idx],
-                    ) = weight_quantize(
+                    shared_expert_ffn1_quanted_weight, shared_expert_ffn1_weight_scale = weight_quantize(
                         shared_expert_ffn1_weight, algo=self.quant_algo, group_size=self.weightonly_group_size
                     )
+                    self.transformer_block.shared_expert_ffn1_weights[idx].set_value(shared_expert_ffn1_quanted_weight)
+                    self.transformer_block.shared_expert_ffn1_weights_scale[idx].set_value(
+                        shared_expert_ffn1_weight_scale
+                    )
 
-                    (
-                        self.transformer_block.shared_expert_ffn2_weights[idx],
-                        self.transformer_block.shared_expert_ffn2_weights_scale[idx],
-                    ) = weight_quantize(
+                    shared_expert_ffn2_quanted_weight, shared_expert_ffn2_weight_scale = weight_quantize(
                         shared_expert_ffn2_weight, algo=self.quant_algo, group_size=self.weightonly_group_size
                     )
+                    self.transformer_block.shared_expert_ffn2_weights[idx].set_value(shared_expert_ffn2_quanted_weight)
+                    self.transformer_block.shared_expert_ffn2_weights_scale[idx].set_value(
+                        shared_expert_ffn2_weight_scale
+                    )
 
                elif "fp8" in self.quant_type:
                    shared_expert_ffn1_quanted_weight = (
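Every weight-only hunk above follows the same pattern: call weight_quantize with the new group_size=self.weightonly_group_size argument, then copy the quantized weight and its scales into the fused transformer block's pre-created parameters with set_value, instead of rebinding the list entry to the tensors returned by the quantizer. Below is a minimal, self-contained sketch of group-wise weight-only quantization with Paddle's public ops; the shapes, the group size of 64, and the weight_only_linear consumer are illustrative assumptions, not code from this commit.

import paddle
from paddle.nn.quant import weight_quantize, weight_only_linear

# Assumption: a GPU build of Paddle, so quantization runs through the GPU kernel.
paddle.set_device("gpu")

x = paddle.randn([2, 4096], dtype="float16")          # illustrative activations
weight = paddle.randn([4096, 4096], dtype="float16")  # illustrative checkpoint weight

# group_size=-1 keeps per-channel scales; 64 or 128 switches to group-wise scales,
# which is what self.weightonly_group_size threads through this diff.
quanted_weight, weight_scale = weight_quantize(weight, algo="weight_only_int8", group_size=64)

# The fused block later consumes the (weight, scale) pair with the same group size.
out = weight_only_linear(x, quanted_weight, weight_scale=weight_scale, weight_dtype="int8", group_size=64)
print(out.shape)  # [2, 4096]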

paddlenlp/experimental/transformers/qwen2/modeling.py

Lines changed: 16 additions & 4 deletions
@@ -109,12 +109,15 @@ def __init__(self, config: Qwen2Config, base_model_prefix: str):
        self.use_fake_parameter = config.get("use_fake_parameter", False)
 
        self.use_weight_only = False
+        self.weightonly_group_size = -1
        if config.quant_type == "weight_only_int8":
            self.use_weight_only = True
            self.quant_algo = "weight_only_int8"
+            self.weightonly_group_size = config.weightonly_group_size
        elif config.quant_type == "weight_only_int4":
            self.use_weight_only = True
            self.quant_algo = "weight_only_int4"
+            self.weightonly_group_size = config.weightonly_group_size
        elif "a8w8" in config.quant_type:
            self.quant_model_path = config.model_name_or_path
            self.shift = config.quantization_config.shift
@@ -312,6 +315,7 @@ def __init__(self, config: Qwen2Config, base_model_prefix: str):
            kv_num_heads=self.num_key_value_heads,
            intermediate_size=self.intermediate_size,
            quant_type=self.quant_type,
+            weightonly_group_size=self.weightonly_group_size,
            activation="swiglu",
            num_layers=config.num_hidden_layers,
            tp_degree=config.tensor_parallel_degree,
@@ -663,7 +667,9 @@ def set_state_dict(self, state_dict):
 
            if self.use_weight_only:
                qkv_weight = paddle.transpose(qkv_weight, perm=[1, 0])
-                qkv_quanted_weight, qkv_weight_scale = weight_quantize(qkv_weight, algo=self.quant_algo)
+                qkv_quanted_weight, qkv_weight_scale = weight_quantize(
+                    qkv_weight, algo=self.quant_algo, group_size=self.weightonly_group_size
+                )
                self.transformer_block.qkv_weights[idx].set_value(qkv_quanted_weight)
                self.transformer_block.qkv_weights_scale[idx].set_value(qkv_weight_scale)
            elif "fp8" in self.quant_type:
@@ -701,7 +707,9 @@ def set_state_dict(self, state_dict):
                paddle.get_default_dtype()
            )
            if self.use_weight_only:
-                linear_quanted_weight, linear_weight_scale = weight_quantize(linear_weight, algo=self.quant_algo)
+                linear_quanted_weight, linear_weight_scale = weight_quantize(
+                    linear_weight, algo=self.quant_algo, group_size=self.weightonly_group_size
+                )
                self.transformer_block.linear_weights[idx].set_value(linear_quanted_weight)
                self.transformer_block.linear_weights_scale[idx].set_value(linear_weight_scale)
            elif "fp8" in self.quant_type:
@@ -758,7 +766,9 @@ def set_state_dict(self, state_dict):
            ffn1_weight = paddle.to_tensor(concated_ffn1_weight).cast(paddle.get_default_dtype())
 
            if self.use_weight_only:
-                ffn1_quanted_weight, ffn1_weight_scale = weight_quantize(ffn1_weight, algo=self.quant_algo)
+                ffn1_quanted_weight, ffn1_weight_scale = weight_quantize(
+                    ffn1_weight, algo=self.quant_algo, group_size=self.weightonly_group_size
+                )
                self.transformer_block.ffn1_weights[idx].set_value(ffn1_quanted_weight)
                self.transformer_block.ffn1_weights_scale[idx].set_value(ffn1_weight_scale)
            elif "fp8" in self.quant_type:
@@ -795,7 +805,9 @@ def set_state_dict(self, state_dict):
                paddle.get_default_dtype()
            )
            if self.use_weight_only:
-                ffn2_quanted_weight, ffn2_weight_scale = weight_quantize(ffn2_weight, algo=self.quant_algo)
+                ffn2_quanted_weight, ffn2_weight_scale = weight_quantize(
+                    ffn2_weight, algo=self.quant_algo, group_size=self.weightonly_group_size
+                )
                self.transformer_block.ffn2_weights[idx].set_value(ffn2_quanted_weight)
                self.transformer_block.ffn2_weights_scale[idx].set_value(ffn2_weight_scale)
            elif "fp8" in self.quant_type:
