Skip to content

Commit 2f217ff

Browse files
committed
Quantization: support FP4 quantized models on AMD CDNA2/CDNA3 GPUs
1 parent 3c91d66 commit 2f217ff

File tree

2 files changed

+34
-43
lines changed

2 files changed

+34
-43
lines changed

vllm/model_executor/layers/quantization/petit.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ class PetitNvFp4Config(QuantizationConfig):
4242
def __init__(
4343
self,
4444
is_checkpoint_nvfp4_serialized: bool = False,
45-
kv_cache_quant_algo: str = None,
46-
group_size: int = None,
47-
exclude_modules: list[str] = None,
45+
kv_cache_quant_algo: Optional[str] = None,
46+
group_size: Optional[int] = None,
47+
exclude_modules: Optional[list[str]] = None,
4848
) -> None:
4949
self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
5050
if is_checkpoint_nvfp4_serialized:
@@ -87,10 +87,12 @@ def from_config(cls, config: dict[str, Any]) -> "PetitNvFp4Config":
8787
exclude_modules = quant_config.get("exclude_modules", None)
8888
if not (group_size and kv_cache_quant_algo and (exclude_modules is not None)):
8989
logger.warning(
90-
f"group_size: {group_size},"
91-
f"kv_cache_quant_algo: {kv_cache_quant_algo},"
92-
f"exclude_modules: {exclude_modules}"
90+
"group_size: %s, kv_cache_quant_algo: %s, exclude_modules: %s",
91+
group_size,
92+
kv_cache_quant_algo,
93+
exclude_modules,
9394
)
95+
9496
raise ValueError(
9597
"NVFP4 quantization requires group size and "
9698
"kv_cache_quant_algo specified in "

vllm/model_executor/layers/quantization/utils/petit_utils.py

Lines changed: 26 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3,61 +3,51 @@
33

44
try:
55
from petit_kernel import mul_nvfp4_a16, process_nvfp4_scales, repack_nvfp4
6+
_PETIT_AVAILABLE = True
67
except ImportError:
8+
_PETIT_AVAILABLE = False
79

8-
def _check_petit_nvfp4_supported(
9-
quant_method: str, group_size: Optional[int]
10-
) -> tuple[bool, Optional[str]]:
11-
return (
12-
False,
13-
"Petit is not installed. Please install it with `pip install petit-kernel`.",
14-
)
15-
16-
def prepare_nvfp4_layer_for_petit(layer: torch.nn.Module) -> None:
17-
raise ValueError(
18-
"Petit is not installed. Please install it with `pip install petit-kernel`."
19-
)
20-
21-
def apply_petit_nvfp4_linear(
22-
input: torch.Tensor,
23-
weight: torch.Tensor,
24-
weight_scale: torch.Tensor,
25-
weight_scale_2: torch.Tensor,
26-
size_n: int,
27-
size_k: int,
28-
bias: Optional[torch.Tensor] = None,
29-
) -> torch.Tensor:
30-
raise ValueError(
31-
"Petit is not installed. Please install it with `pip install petit-kernel`."
32-
)
10+
_PETIT_INSTALL_MSG = (
11+
"Petit is not installed. Please install it with "
12+
"`pip install petit-kernel`."
13+
)
3314

15+
def _require_petit() -> None:
16+
if not _PETIT_AVAILABLE:
17+
# Single, shared error-raising point; avoids duplicated code and overlong lines
18+
raise ImportError(_PETIT_INSTALL_MSG)
3419

3520
def _check_petit_nvfp4_supported(
3621
quant_method: str, group_size: Optional[int]
3722
) -> tuple[bool, Optional[str]]:
3823
if quant_method != "NVFP4":
3924
return (
4025
False,
41-
"Petit currently only supports: NVFP4"
42-
" quantizations in sglang. Please check the "
43-
"`hf_quant_config.json` file for your model's "
44-
"quant configuration.",
26+
(
27+
"Petit currently only supports: NVFP4 quantizations in sglang. "
28+
"Please check the `hf_quant_config.json` file for your model's "
29+
"quant configuration."
30+
),
4531
)
4632
if group_size is not None and group_size != 16:
4733
return (
4834
False,
49-
"Petit currently only supports: group_size=16" " quantizations.",
35+
"Petit currently only supports: group_size=16 quantizations.",
5036
)
5137
return (True, None)
5238

53-
54-
def verify_petit_nvfp4_supported(quant_method: str, group_size: Optional[int]) -> None:
39+
def verify_petit_nvfp4_supported(
40+
quant_method: str, group_size: Optional[int]
41+
) -> None:
5542
supported, error_msg = _check_petit_nvfp4_supported(quant_method, group_size)
5643
if not supported:
44+
# Narrow Optional[str] so mypy does not flag the raise below
45+
assert error_msg is not None
5746
raise ValueError(error_msg)
5847

59-
6048
def prepare_nvfp4_layer_for_petit(layer: torch.nn.Module) -> None:
49+
_require_petit()  # raise uniformly here when petit is not installed
50+
6151
# Repack weights to petit format
6252
part_size_n = layer.output_size_per_partition
6353
part_size_k = layer.input_size_per_partition
@@ -71,9 +61,6 @@ def prepare_nvfp4_layer_for_petit(layer: torch.nn.Module) -> None:
7161
)
7262
layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
7363

74-
return
75-
76-
7764
def apply_petit_nvfp4_linear(
7865
input: torch.Tensor,
7966
weight: torch.Tensor,
@@ -83,6 +70,8 @@ def apply_petit_nvfp4_linear(
8370
size_k: int,
8471
bias: Optional[torch.Tensor] = None,
8572
) -> torch.Tensor:
73+
_require_petit()  # raise uniformly here when petit is not installed
74+
8675
reshaped_x = input.reshape(-1, input.shape[-1])
8776
out_shape = input.shape[:-1] + (size_n,)
8877

@@ -100,4 +89,4 @@ def apply_petit_nvfp4_linear(
10089
if bias is not None:
10190
output.add_(bias) # In-place add
10291

103-
return output.reshape(out_shape)
92+
return output.reshape(out_shape)

0 commit comments

Comments
 (0)