Commit fe356c3: add hadamard

1 parent 43ec0ac · commit fe356c3

5 files changed: +193 -42 lines changed


paddlenlp/quantization/hadamard_utils.py
Lines changed: 80 additions & 18 deletions

@@ -12,8 +12,49 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# import paddle
+
+
+# def matmul_hadU(X):
+
+#     input = X.clone().reshape((-1, X.shape[-1], 1))
+#     output = input.clone()
+#     while input.shape[1] > 1:
+#         input = input.reshape((input.shape[0], input.shape[1] // 2, 2, input.shape[2]))
+#         output = output.reshape(input.shape)
+#         output[:, :, 0, :] = input[:, :, 0, :] + input[:, :, 1, :]
+#         output[:, :, 1, :] = input[:, :, 0, :] - input[:, :, 1, :]
+#         output = output.reshape((input.shape[0], input.shape[1], -1))
+#         (input, output) = (output, input)
+#     del output
+
+#     return input.reshape(X.shape)
+
+
+# def random_hadamard_matrix(size, dtype, is_block=False):
+#     if not is_block:
+#         A = paddle.randint(low=0, high=2, shape=[size, size]).astype("float32") * 2 - 1
+#         Q, _ = paddle.linalg.qr(A)
+#         return Q.astype(dtype), 1
+#     else:
+#         num_blocks = size
+#         while not (num_blocks % 2):
+#             num_blocks = num_blocks // 2
+#         block_size = size // num_blocks
+#         Q = paddle.diag(paddle.ones((block_size,), dtype="float32"))
+#         block = matmul_hadU(Q)
+#         large_matrix = paddle.zeros([size, size])
+
+#         for i in range(num_blocks):
+#             start_row = i * block_size
+#             start_col = i * block_size
+#             large_matrix[start_row : start_row + block_size, start_col : start_col + block_size] = block
+#         return large_matrix.cast(dtype), block_size
+
 import paddle

+from paddlenlp.utils import infohub
+

 def matmul_hadU(X):

@@ -31,22 +72,43 @@ def matmul_hadU(X):
     return input.reshape(X.shape)


-def random_hadamard_matrix(size, dtype, is_block=False):
-    if not is_block:
-        A = paddle.randint(low=0, high=2, shape=[size, size]).astype("float32") * 2 - 1
-        Q, _ = paddle.linalg.qr(A)
-        return Q.astype(dtype), 1
+def random_hadamard_matrix(block_size, dtype):
+    Q = paddle.diag(paddle.ones((block_size), dtype=dtype))
+    block = matmul_hadU(Q)
+    return block
+
+
+def create_hadamard_matrix(block_size, dtype):
+    Q = paddle.diag(paddle.ones((block_size), dtype=dtype))
+    block = matmul_hadU(Q)
+    return block
+
+
+def hadamard_matmul(input, side, hadamard_matrix, block_size):
+    # left -> H.T@input    right -> input@H
+    origin_shape = input.shape
+    input = input.reshape([-1, origin_shape[-1]])
+    if side == "left":
+        # H.T@input -> (input.T@H).T
+        input = input.transpose([1, 0])
+    block_num = input.shape[-1] // block_size
+    output = input.reshape([-1, block_num, block_size]) @ hadamard_matrix
+    output = output.reshape([-1, block_num * block_size])
+    if side == "left":
+        output = output.transpose([1, 0])
+    output = output.reshape(origin_shape)
+
+    return output
+
+
+def apply_hadamard_matmul(x, side, block_size):
+    if getattr(infohub, "hadamard") is None:
+        setattr(infohub, "hadamard", {})
+
+    if block_size in infohub.hadamard:
+        hadamard_matrix = infohub.hadamard[block_size]
     else:
-        num_blocks = size
-        while not (num_blocks % 2):
-            num_blocks = num_blocks // 2
-        block_size = size // num_blocks
-        Q = paddle.diag(paddle.ones((block_size,), dtype="float32"))
-        block = matmul_hadU(Q)
-        large_matrix = paddle.zeros([size, size])
-
-        for i in range(num_blocks):
-            start_row = i * block_size
-            start_col = i * block_size
-            large_matrix[start_row : start_row + block_size, start_col : start_col + block_size] = block
-        return large_matrix.cast(dtype), block_size
+        hadamard_matrix = create_hadamard_matrix(block_size, x.dtype)
+        infohub.hadamard[block_size] = hadamard_matrix
+    target_x = hadamard_matmul(x, side, hadamard_matrix, block_size)
+    return target_x, block_size
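
For context on the new helpers above: matmul_hadU applies the unnormalized fast Walsh-Hadamard transform via a pairwise sum/difference butterfly, create_hadamard_matrix materializes the resulting block matrix H, and hadamard_matmul applies that block from the left (H.T @ x) or right (x @ H). A minimal pure-Python sketch, not part of the commit (the helpers hadamard and butterfly are illustrative only), checking that the butterfly matches multiplication by an explicit Sylvester Hadamard matrix:

# Illustrative only, not part of this commit: verify that the sum/difference
# butterfly used by matmul_hadU equals multiplication by the Sylvester
# Hadamard matrix H_n (entries +1/-1, with H_n @ H_n.T == n * I).

def hadamard(n):
    # Explicit H_n for a power-of-two n (Sylvester construction).
    assert n > 0 and n & (n - 1) == 0, "n must be a power of two"
    h = [[1]]
    while len(h) < n:
        h = [row + row for row in h] + [row + [-v for v in row] for row in h]
    return h

def butterfly(x):
    # Same pairwise sum/difference recursion as matmul_hadU, on a plain list.
    out = list(x)
    step = 1
    while step < len(out):
        for i in range(0, len(out), 2 * step):
            for j in range(i, i + step):
                a, b = out[j], out[j + step]
                out[j], out[j + step] = a + b, a - b
        step *= 2
    return out

x = [1.0, 2.0, 3.0, 4.0]
H = hadamard(4)
explicit = [sum(H[r][c] * x[c] for c in range(4)) for r in range(4)]
assert butterfly(x) == explicit == [10.0, -2.0, -4.0, 0.0]

Because the transform is unnormalized, H @ H.T equals block_size times the identity, which is why apply_hadamard_matmul returns block_size as its second value alongside the transformed tensor.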

paddlenlp/quantization/qat_utils.py
Lines changed: 90 additions & 11 deletions

@@ -19,7 +19,7 @@

 from paddlenlp.utils import infohub

-from .hadamard_utils import random_hadamard_matrix
+from .hadamard_utils import apply_hadamard_matmul, random_hadamard_matrix

 try:
     from transformer_engine import transformer_engine_paddle as tex

@@ -35,6 +35,13 @@
 except ImportError:
     USE_FP8_GEMM = False

+QMIN_QMAX_MAPPING = {
+    "a8w8linear_activation": (-128, 127),
+    "a8w4linear_activation": (-128, 127),
+    "a8w8linear_weight": (-128, 127),
+    "a8w4linear_weight": (-8, 7),
+}
+

 def quantize_tensorwise(x, quantization_config=None, bit_length=8, state=0, training=False, act_scale=None):
     qmax = (1 << (bit_length - 1)) - 1

@@ -154,16 +161,87 @@ def dequantize_channelwise(w_int8, scale, apply_hadamard=False):
     return w


-def a8w8_forward(
-    x, w_int8, w_scale=None, bias=None, dtype=None, quantization_config=None, state=0, training=False, act_scale=None
+def quantize(
+    x,
+    weight_quantize_algo,
+    tensor_type,
+    quantization_config,
+    apply_hadamard=False,
+    side="right",
+    act_scale=None,
+    state=0,
+    training=False,
+    group=None,
+):
+    if apply_hadamard:
+        target_x, hadamard_scale = apply_hadamard_matmul(x, side, quantization_config.hadamard_block_size)
+    else:
+        target_x = x
+        hadamard_scale = 1
+    qmin, qmax = QMIN_QMAX_MAPPING[weight_quantize_algo + "_" + tensor_type]
+    if tensor_type == "activation":
+        if act_scale is not None:
+            if training:
+                scale = paddle.max(paddle.abs(target_x)) / qmax
+                if state < quantization_config.apply_online_actscale_step:
+                    act_scale.set_value((state * act_scale + scale) / (state + 1))
+                else:
+                    act_scale.set_value(
+                        (1 - quantization_config.moving_rate) * act_scale + quantization_config.moving_rate * scale
+                    )
+                    scale = act_scale
+            else:
+                # scale = act_scale
+                scale = paddle.max(paddle.abs(target_x)) / qmax
+        else:
+            scale = paddle.max(paddle.abs(target_x)) / qmax
+        if weight_quantize_algo in ["a8w8linear", "a8w4linear"]:
+            quant_x = paddle.clip((target_x / scale).round(), qmin, qmax).astype("int8")
+        else:
+            raise NotImplementedError(f"Unknown {weight_quantize_algo}.")
+    elif tensor_type == "weight":
+        if weight_quantize_algo in ["a8w8linear", "a8w4linear"]:
+            # channelwise
+            scale = paddle.max(paddle.abs(target_x), axis=0, keepdim=True) / qmax
+            if group is not None:
+                paddle.distributed.all_reduce(scale, op=paddle.distributed.ReduceOp.MAX, group=group, sync_op=True)
+            quant_x = paddle.clip((target_x / scale).round(), qmin, qmax).astype("int8").T
+            scale.stop_gradient = True
+            scale = scale.squeeze(0) / hadamard_scale
+        else:
+            raise NotImplementedError(f"Unknown {weight_quantize_algo}.")
+    else:
+        raise NotImplementedError(f"Unknown {tensor_type}.")
+    return quant_x, scale
+
+
+def int8_forward(
+    x,
+    quant_w,
+    scale_w,
+    weight_quantize_algo,
+    bias=None,
+    quantization_config=None,
+    state=0,
+    training=False,
+    act_scale=None,
 ):
-    x_int8, x_scale = quantize_tensorwise(
-        x, quantization_config, bit_length=8, state=state, training=training, act_scale=act_scale
+    quant_x, scale_x = quantize(
+        x=x,
+        weight_quantize_algo=weight_quantize_algo,
+        tensor_type="activation",
+        quantization_config=quantization_config,
+        apply_hadamard=quantization_config.apply_hadamard,
+        side="right",
+        act_scale=act_scale,
+        state=state,
+        training=training,
     )
-    out = paddle.matmul(x_int8, w_int8.T).astype(dtype) * (x_scale * w_scale.unsqueeze(0))
+
+    out = paddle.matmul(quant_x, quant_w.T).astype(scale_w.dtype) * (scale_x * scale_w)
     if bias is not None:
         out += bias
-    return out, x_int8, x_scale
+    return out, quant_x, scale_x


 def a8w8_backward(ctx, x, grad_output, quant_weight, quant_scale, quant_x, x_scale):
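
The new quantize() above applies a single abs-max rule for both activations and weights: scale = max(|x|) / qmax, then clip(round(x / scale), qmin, qmax), with (qmin, qmax) looked up from QMIN_QMAX_MAPPING by algorithm and tensor type. A plain-Python sketch of that arithmetic, illustrative only and not part of the commit (fake_quant is a hypothetical helper; real tensors go through the paddle ops shown above):

# Illustrative only, not part of this commit: the abs-max quantization rule
# used by quantize(), written with plain Python floats.

QMIN_QMAX_MAPPING = {
    "a8w8linear_activation": (-128, 127),
    "a8w4linear_activation": (-128, 127),
    "a8w8linear_weight": (-128, 127),
    "a8w4linear_weight": (-8, 7),
}

def fake_quant(values, weight_quantize_algo, tensor_type):
    qmin, qmax = QMIN_QMAX_MAPPING[weight_quantize_algo + "_" + tensor_type]
    scale = max(abs(v) for v in values) / qmax                       # abs-max scale
    quant = [min(max(round(v / scale), qmin), qmax) for v in values]  # clip(round(x / scale))
    dequant = [q * scale for q in quant]                              # what rescaling recovers
    return quant, dequant

w = [0.5, -1.25, 0.02, 1.0]
print(fake_quant(w, "a8w8linear", "weight"))  # int8 weights: qmax = 127
print(fake_quant(w, "a8w4linear", "weight"))  # int4 weights: qmax = 7, much coarser grid

int8_forward then multiplies the integer matmul result back by scale_x * scale_w to return to the floating-point domain.
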
@@ -352,6 +430,7 @@ def forward(
         state,
         training,
         act_scale,
+        weight_quantize_algo,
     ):
         quant_x, x_scale = None, None
         if quantization_config.weight_quantize_algo in ["fp8linear"]:

@@ -367,12 +446,12 @@
                 act_scale=act_scale,
             )
         else:
-            output, quant_x, x_scale = a8w8_forward(
+            output, quant_x, x_scale = int8_forward(
                 x,
-                quant_weight,
-                w_scale=quant_scale,
+                quant_w=quant_weight,
+                scale_w=quant_scale,
+                weight_quantize_algo=weight_quantize_algo,
                 bias=bias,
-                dtype=dtype,
                 quantization_config=quantization_config,
                 state=state,
                 training=training,
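
A note on the activation branch of quantize() shown earlier: during training, act_scale is first filled with a running average of the per-step abs-max scales and afterwards tracked with an exponential moving average. A plain-float sketch of that schedule, illustrative only (update_act_scale is hypothetical, and the default values used for apply_online_actscale_step and moving_rate are placeholders, not the config's actual defaults):

# Illustrative only, not part of this commit: the act_scale schedule from
# quantize()'s training path, with placeholder default values.

def update_act_scale(act_scale, batch_scale, state, apply_online_actscale_step=20, moving_rate=0.01):
    if state < apply_online_actscale_step:
        # warm-up: running average of the per-step abs-max scales
        return (state * act_scale + batch_scale) / (state + 1)
    # afterwards: exponential moving average controlled by moving_rate
    return (1 - moving_rate) * act_scale + moving_rate * batch_scale

act_scale = 1.0  # matches the paddle.ones([]) initialization added in this commit
for state, batch_scale in enumerate([0.8, 1.2, 1.0, 0.9]):
    act_scale = update_act_scale(act_scale, batch_scale, state, apply_online_actscale_step=2)
print(act_scale)  # averages the first two steps, then drifts slowly toward later scales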

paddlenlp/quantization/quantization_config.py
Lines changed: 2 additions & 0 deletions

@@ -65,6 +65,7 @@ def __init__(
         ignore_modules=None,
         group_size=-1,
         apply_hadamard=False,
+        hadamard_block_size=32,
         quant_input_grad=False,
         quant_weight_grad=False,
         skip_first_act_scale_step=20,

@@ -139,6 +140,7 @@ def __init__(
         self.ignore_modules = ignore_modules
         self.group_size = group_size
         self.apply_hadamard = apply_hadamard
+        self.hadamard_block_size = hadamard_block_size
         self.quant_input_grad = quant_input_grad
         self.quant_weight_grad = quant_weight_grad
         self.skip_first_act_scale_step = skip_first_act_scale_step
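
The new hadamard_block_size option defaults to 32 and is read by quantize() as quantization_config.hadamard_block_size. A hedged usage sketch, assuming the constructor shown above belongs to QuantizationConfig and also accepts weight_quantize_algo (its use as an attribute elsewhere in this commit suggests so); other constructor arguments are omitted:

# Hedged sketch: class name, import path and the weight_quantize_algo keyword
# are assumptions based on the surrounding code, not shown in this diff.
from paddlenlp.quantization.quantization_config import QuantizationConfig

qcfg = QuantizationConfig(
    weight_quantize_algo="a8w4linear",  # int8 activations, int4 weights
    apply_hadamard=True,                # rotate tensors before quantizing
    hadamard_block_size=32,             # new in this commit, default 32
)
assert qcfg.apply_hadamard and qcfg.hadamard_block_size == 32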

paddlenlp/quantization/quantization_linear.py
Lines changed: 10 additions & 1 deletion

@@ -212,7 +212,16 @@ def quant_weight_linear(
         state, training, act_scale = act_state

         return QATFunc.apply(
-            x, quant_weight, bias, quant_scale, quantization_config, dtype, state, training, act_scale
+            x,
+            quant_weight,
+            bias,
+            quant_scale,
+            quantization_config,
+            dtype,
+            state,
+            training,
+            act_scale,
+            weight_quantize_algo,
        )
     else:
         return QuantizationLinearFunc.apply(

paddlenlp/quantization/quantization_utils.py
Lines changed: 11 additions & 12 deletions

@@ -34,7 +34,7 @@
     qlora_weight_quantize = None

 from ..utils.log import logger
-from .qat_utils import fp8_quantize_tensorwise, quantize_channelwise
+from .qat_utils import fp8_quantize_tensorwise, quantize
 from .quantization_linear import (
     ColumnParallelQuantizationLinear,
     QuantizationLinear,

@@ -155,18 +155,17 @@ def convert_to_weight_quantize_state_dict(state_dict, name, quantization_config,
     if weight_name in state_dict:
         # gpu weight_quantize will fix in future
         target_weight = state_dict.pop(weight_name).cast(dtype).cuda()
-        if weight_quantize_algo in ["a8w8linear"]:
-            quant_weight, quant_scale = quantize_channelwise(
-                target_weight, quantization_config.apply_hadamard, bit_length=8
-            )
-            act_scale = paddle.zeros([], dtype="bfloat16").cuda()
-            act_scale.stop_gradient = True
-            state_dict[act_scale_name] = act_scale
-        elif weight_quantize_algo in ["a8w4linear"]:
-            quant_weight, quant_scale = quantize_channelwise(
-                target_weight, quantization_config.apply_hadamard, bit_length=4
+
+        if weight_quantize_algo in ["a8w8linear", "a8w4linear"]:
+            quant_weight, quant_scale = quantize(
+                target_weight,
+                weight_quantize_algo,
+                "weight",
+                quantization_config,
+                apply_hadamard=quantization_config.apply_hadamard,
+                side="left",
             )
-            act_scale = paddle.zeros([], dtype="bfloat16").cuda()
+            act_scale = paddle.ones([], dtype=dtype).cuda()
             act_scale.stop_gradient = True
             state_dict[act_scale_name] = act_scale
         elif weight_quantize_algo in ["fp8linear"]:
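
Why the weight path above passes side="left" while int8_forward quantizes activations with side="right": with an unnormalized Hadamard block H, (x @ H) @ (H.T @ W) equals block_size * (x @ W), and quantize() divides the weight scale by the returned hadamard_scale to absorb that factor. A small pure-Python check, illustrative only (matmul and transpose are throwaway helpers, not repo code):

# Illustrative only, not part of this commit: rotating activations from the
# right and weights from the left cancels up to a factor of block_size,
# because H @ H.T == block_size * I for an unnormalized Hadamard block.

H = [[1, 1], [1, -1]]            # 2x2 Sylvester Hadamard block
x = [[1.0, 2.0]]                 # one activation row
W = [[0.5, -1.0], [0.25, 2.0]]   # a 2x2 weight matrix

def matmul(A, B):
    return [[sum(a * b for a, b in zip(row, col)) for col in zip(*B)] for row in A]

def transpose(A):
    return [list(col) for col in zip(*A)]

rotated = matmul(matmul(x, H), matmul(transpose(H), W))  # (x @ H) @ (H.T @ W)
plain = matmul(x, W)
block_size = 2
assert rotated == [[block_size * v for v in row] for row in plain]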
