@@ -34,6 +34,8 @@
 )
 from torchao.quantization.prototype.qat.linear import (
     FakeQuantizedLinear,
+    Int8DynActInt4WeightQATLinear,
+    Int4WeightOnlyQATLinear,
 )
 from torchao.quantization.prototype.qat.utils import (
     _choose_qparams_per_token_asymmetric,
@@ -66,6 +68,10 @@
     TORCH_VERSION_AT_LEAST_2_5,
 )
 
+from torchao.quantization.GPTQ import (
+    _replace_linear_8da4w,
+    _replace_linear_int4,
+)
 
 # TODO: put this in a common test utils file
 _CUDA_IS_AVAILABLE = torch.cuda.is_available()
@@ -854,6 +860,48 @@ def linear_forward_4w(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
         fq_out = fq_linear(x)
         baseline_out = linear_forward_4w(x2, fq_linear.weight)
         torch.testing.assert_close(baseline_out, fq_out, atol=0, rtol=0)
+
+    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower")
+    def test_replace_linear_8da4w(self):
+        module = torch.nn.ModuleList([
+            torch.nn.Linear(in_features=256, out_features=50, bias=True)
+        ])
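+        # Positional args: groupsize=256, padding_allowed=False, precision, scales_precision.
+        # A linear with bias is expected to be left untouched rather than swapped.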
+        _replace_linear_8da4w(module, 256, False, torch.float32, torch.float32, Int8DynActInt4WeightQATLinear, copy_weights=True)
+        assert not isinstance(module[0], Int8DynActInt4WeightQATLinear) and isinstance(module[0], torch.nn.Linear)
+        module = torch.nn.ModuleList([
+            torch.nn.Linear(in_features=256, out_features=50, bias=False)
+        ])
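+        # Without bias, the swap to the QAT linear should go through.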
+        _replace_linear_8da4w(module, 256, False, torch.float32, torch.float32, Int8DynActInt4WeightQATLinear, copy_weights=True)
+        assert isinstance(module[0], Int8DynActInt4WeightQATLinear)
+
+    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower")
+    def test_replace_linear_int4(self):
+        module = torch.nn.ModuleList([
+            torch.nn.Linear(in_features=256, out_features=50, bias=True)
+        ])
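+        # The two positional ints are groupsize=256 and inner_k_tiles=8; as above,
+        # a linear with bias should not be replaced.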
+        _replace_linear_int4(
+            module,
+            256,
+            8,
+            padding_allowed=True,
+            precision=torch.bfloat16,
+            scales_precision=torch.bfloat16,
+            linear_class=Int4WeightOnlyQATLinear,
+            copy_weights=True)
+        assert not isinstance(module[0], Int4WeightOnlyQATLinear) and isinstance(module[0], torch.nn.Linear)
+        module = torch.nn.ModuleList([
+            torch.nn.Linear(in_features=256, out_features=50, bias=False)
+        ])
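+        # bias=False: the linear should be swapped for the QAT variant.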
+        _replace_linear_int4(
+            module,
+            256,
+            8,
+            padding_allowed=True,
+            precision=torch.bfloat16,
+            scales_precision=torch.bfloat16,
+            linear_class=Int4WeightOnlyQATLinear,
+            copy_weights=True)
+        assert isinstance(module[0], Int4WeightOnlyQATLinear)
 
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower")
     def test_fake_quantized_embedding_4w(self):
@@ -891,4 +939,4 @@ def embedding_forward_4w(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
 
 
 if __name__ == "__main__":
-    unittest.main()
+    unittest.main()