
Commit 2855577

feat(quantization): support FP4 quantized models on AMD CDNA2/CDNA3 GPUs
Signed-off-by: feng <fengli1702@gmail.com>
1 parent e5ebeeb commit 2855577

8 files changed: +376 additions, −3 deletions

requirements/rocm.txt

Lines changed: 1 addition & 1 deletion
@@ -17,4 +17,4 @@ setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
 runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0
-conch-triton-kernels==1.2.1
+conch-triton-kernels==1.2.1

setup.py

Lines changed: 2 additions & 0 deletions
@@ -666,6 +666,8 @@ def _read_requirements(filename: str) -> list[str]:
         "video": [],  # Kept for backwards compatibility
         # FlashInfer should be updated together with the Dockerfile
         "flashinfer": ["flashinfer-python==0.2.10"],
+        # Optional deps for AMD FP4 quantization support
+        "petit-kernel": ["petit-kernel"],
     },
     cmdclass=cmdclass,
     package_data=package_data,
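
With this extra in place, the Petit kernels become an opt-in dependency: assuming the petit-kernel package is published on PyPI under that name, it can be pulled in at install time with pip install "vllm[petit-kernel]".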

vllm/config.py

Lines changed: 3 additions & 1 deletion
@@ -1126,7 +1126,8 @@ def _verify_quantization(self) -> None:
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
             "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
-            "quark", "modelopt_fp4", "bitblas", "gptq_bitblas", "inc"
+            "quark", "modelopt_fp4", "bitblas", "gptq_bitblas", "inc",
+            "petit_nvfp4",
         ]
         if self.quantization is not None:
             self.quantization = cast(me_quant.QuantizationMethods,

@@ -1159,6 +1160,7 @@ def _verify_quantization(self) -> None:
                 "moe_wna16",
                 "modelopt",
                 "modelopt_fp4",
+                "petit_nvfp4",
             ]
             quantization_methods = [
                 q for q in supported_quantization if q not in overrides

vllm/model_executor/layers/linear.py

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@
     "HQQMarlinMethod",
     "QuarkLinearMethod",
     "ModelOptNvFp4LinearMethod",
+    "PetitNvFp4LinearMethod",
 ]


vllm/model_executor/layers/quantization/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -38,6 +38,7 @@
     "rtn",
     "inc",
     "mxfp4",
+    "petit_nvfp4",
 ]
 QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods))

@@ -118,6 +119,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
     from .rtn import RTNConfig
     from .torchao import TorchAOConfig
     from .tpu_int8 import Int8TpuConfig
+    from .petit import PetitNvFp4Config

     method_to_config: dict[str, type[QuantizationConfig]] = {
         "aqlm": AQLMConfig,

@@ -151,6 +153,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
         "rtn": RTNConfig,
         "inc": INCConfig,
         "mxfp4": Mxfp4Config,
+        "petit_nvfp4": PetitNvFp4Config,
     }
     # Update the `method_to_config` with customized quantization methods.
     method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG)
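
A minimal sketch of how the new method plugs into the existing registry, assuming the optional petit-kernel dependency (and the petit_utils helpers added elsewhere in this change) are importable; nothing below is part of the diff itself:

from vllm.model_executor.layers.quantization import get_quantization_config

# "petit_nvfp4" now resolves to PetitNvFp4Config through method_to_config.
config_cls = get_quantization_config("petit_nvfp4")
print(config_cls.get_name())              # "petit_nvfp4"
print(config_cls.get_config_filenames())  # ["hf_quant_config.json"]
print(config_cls.get_min_capability())    # 90 (gfx90a / gfx942)
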
vllm/model_executor/layers/quantization/petit.py

Lines changed: 262 additions & 0 deletions
@@ -0,0 +1,262 @@
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py

from typing import Any, Callable, Dict, List, Optional

import regex as re
import torch
from torch.nn import Module
from torch.nn.parameter import Parameter

from vllm.logger import init_logger
from vllm.model_executor.layers.linear import (
    LinearBase,
    LinearMethodBase,
    UnquantizedLinearMethod,
)
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig,
    QuantizeMethodBase,
)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.quantization.utils.petit_utils import (
    apply_petit_nvfp4_linear,
    prepare_nvfp4_layer_for_petit,
    verify_petit_nvfp4_supported,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
from vllm.model_executor.parameter import (
    ModelWeightParameter,
    PerTensorScaleParameter,
)

# Initialize logger for the module
logger = init_logger(__name__)


# Configuration class for NVFP4 quantized models produced by the ModelOpt
# quantization tool.
class PetitNvFp4Config(QuantizationConfig):
    """Config class for Petit FP4."""

    def __init__(
        self,
        is_checkpoint_nvfp4_serialized: bool = False,
        kv_cache_quant_algo: Optional[str] = None,
        group_size: Optional[int] = None,
        exclude_modules: Optional[List[str]] = None,
    ) -> None:
        self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
        if is_checkpoint_nvfp4_serialized:
            logger.warning(
                "Detected nvfp4 checkpoint. Please note that the "
                "format is experimental and subject to change."
            )
        self.group_size = group_size
        self.kv_cache_quant_algo = kv_cache_quant_algo
        self.exclude_modules = exclude_modules

    @classmethod
    def get_name(cls) -> QuantizationMethods:
        return "petit_nvfp4"

    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        return [torch.bfloat16, torch.half]

    @classmethod
    def get_min_capability(cls) -> int:
        # Petit supports the gfx90a (CDNA2) and gfx942 (CDNA3) GPUs
        return 90

    @classmethod
    def get_config_filenames(cls) -> List[str]:
        return ["hf_quant_config.json"]

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "PetitNvFp4Config":
        quant_config = cls.get_from_keys(config, ["quantization"])
        quant_method = quant_config["quant_algo"]
        group_size = quant_config.get("group_size", None)
        verify_petit_nvfp4_supported(quant_method, group_size)

        is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
        kv_cache_quant_algo = quant_config["kv_cache_quant_algo"]
        if not kv_cache_quant_algo:
            kv_cache_quant_algo = "auto"
        exclude_modules = quant_config.get("exclude_modules", None)
        if not (group_size and kv_cache_quant_algo and (exclude_modules is not None)):
            logger.warning(
                f"group_size: {group_size}, "
                f"kv_cache_quant_algo: {kv_cache_quant_algo}, "
                f"exclude_modules: {exclude_modules}"
            )
            raise ValueError(
                "NVFP4 quantization requires group_size, "
                "kv_cache_quant_algo, and exclude_modules to be "
                "specified in hf_quant_config.json"
            )
        return cls(
            is_checkpoint_nvfp4_serialized,
            kv_cache_quant_algo,
            group_size,
            exclude_modules,
        )

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
        qc = hf_quant_cfg.get("quantization", hf_quant_cfg)
        algo = (qc.get("quant_algo") or qc.get("quant_method") or "").upper()
        if algo in ("NVFP4", "MODELOPT_FP4", "MODELOPT"):
            return cls.get_name()  # "petit_nvfp4"
        return None

    @classmethod
    def is_petit_nvfp4_compatible(cls, quant_config: Dict[str, Any]) -> bool:
        qc = quant_config.get("quantization", quant_config)
        algo = (qc.get("quant_algo") or qc.get("quant_method") or "").upper()
        return algo == "NVFP4"

    def is_layer_excluded(self, prefix: str, exclude_modules: list):
        for pattern in exclude_modules:
            regex_str = pattern.replace(".", r"\.").replace("*", r".*")
            if re.fullmatch(regex_str, prefix):
                return True
        return False

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["QuantizeMethodBase"]:
        from vllm.attention.layer import Attention  # Avoid circular import

        if isinstance(layer, LinearBase):
            if is_layer_skipped(prefix, self.exclude_modules) or self.is_layer_excluded(
                prefix, self.exclude_modules
            ):
                return UnquantizedLinearMethod()
            return PetitNvFp4LinearMethod(self)
        elif isinstance(layer, Attention):
            return PetitFp8KVCacheMethod(self)
        return None

    def get_scaled_act_names(self) -> List[str]:
        return []


class PetitFp8KVCacheMethod(BaseKVCacheMethod):
    """
    Supports loading kv-cache scaling factors from FP8 checkpoints.
    """

    def __init__(self, quant_config: PetitNvFp4Config):
        super().__init__(quant_config)


class PetitNvFp4LinearMethod(LinearMethodBase):
    """Linear method for NVFP4.

    Supports loading NVFP4 checkpoints with the following structure:

    | Tensor Name    | datatype      | shape       |
    |----------------|---------------|-------------|
    | input_scale    | torch.float32 | scalar      |
    | weight         | NVFP4(SE2M1)  | [1, X, Y/2] |
    | weight_scale   | FP8-E4M3      | [X, Y]      |
    | weight_scale_2 | torch.float32 | scalar      |

    The weights are quantized per block of 16 elements.

    Args:
        quant_config: The ModelOpt quantization config.
    """

    def __init__(self, quant_config: PetitNvFp4Config):
        self.quant_config = quant_config

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: List[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        del input_size, output_size
        if not self.quant_config.is_checkpoint_nvfp4_serialized:
            raise ValueError(
                "NVFP4 quantization was selected, "
                "dynamic quantization is not supported."
            )

        output_size_per_partition = sum(output_partition_sizes)
        weight_loader = extra_weight_attrs.get("weight_loader")

        layer.logical_widths = output_partition_sizes

        layer.input_size_per_partition = input_size_per_partition
        layer.output_size_per_partition = output_size_per_partition
        if input_size_per_partition % 16 != 0:
            raise ValueError(
                "Unsupported model: the input feature size is not a multiple of 16"
            )

        weight_dtype = (
            torch.float8_e4m3fn
            if self.quant_config.is_checkpoint_nvfp4_serialized
            else params_dtype
        )

        weight = ModelWeightParameter(
            data=torch.empty(
                # Two FP4 values are packed into one uint8 along the input dimension
                output_size_per_partition,
                input_size_per_partition // 2,
                dtype=torch.uint8,
            ),
            input_dim=1,
            output_dim=0,
            weight_loader=weight_loader,
        )
        layer.register_parameter("weight", weight)

        input_scale = PerTensorScaleParameter(
            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
            weight_loader=weight_loader,
        )

        layer.register_parameter("input_scale", input_scale)

        weight_scale_2 = PerTensorScaleParameter(
            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
            weight_loader=weight_loader,
        )
        layer.register_parameter("weight_scale_2", weight_scale_2)

        weight_scale = ModelWeightParameter(
            data=torch.empty(
                output_size_per_partition,
                input_size_per_partition // self.quant_config.group_size,
                dtype=weight_dtype,
            ),
            input_dim=1,
            output_dim=0,
            weight_loader=weight_loader,
        )

        layer.register_parameter("weight_scale", weight_scale)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        input_scale_2 = layer.input_scale.max().to(torch.float32)
        weight_scale_2 = layer.weight_scale_2.max().to(torch.float32)
        layer.input_scale = Parameter(input_scale_2, requires_grad=False)
        layer.weight_scale_2 = Parameter(weight_scale_2, requires_grad=False)
        layer.alpha = Parameter(
            layer.input_scale * layer.weight_scale_2, requires_grad=False
        )

        prepare_nvfp4_layer_for_petit(layer)
        del layer.input_scale

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        return apply_petit_nvfp4_linear(
            input=x,
            weight=layer.weight,
            weight_scale=layer.weight_scale,
            weight_scale_2=layer.weight_scale_2,
            size_n=layer.output_size_per_partition,
            size_k=layer.input_size_per_partition,
            bias=bias,
        )
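
For context, a sketch of how the new config is exercised end to end. The dictionary mirrors the "quantization" block that PetitNvFp4Config.from_config reads from hf_quant_config.json (quant_algo, kv_cache_quant_algo, group_size, exclude_modules); the values and the model name are illustrative placeholders, not part of this commit, and running it assumes a ROCm build of vLLM on a gfx90a/gfx942 GPU with the optional petit-kernel package installed:

from vllm import LLM, SamplingParams
from vllm.model_executor.layers.quantization.petit import PetitNvFp4Config

# Illustrative hf_quant_config.json contents for an NVFP4-serialized checkpoint.
hf_quant_config = {
    "quantization": {
        "quant_algo": "NVFP4",
        "kv_cache_quant_algo": "auto",
        "group_size": 16,
        "exclude_modules": ["lm_head"],
    }
}
cfg = PetitNvFp4Config.from_config(hf_quant_config)
assert cfg.is_checkpoint_nvfp4_serialized and cfg.group_size == 16

# Serving such a checkpoint through the new method; the model name is a placeholder.
llm = LLM(model="<path-to-nvfp4-checkpoint>", quantization="petit_nvfp4")
out = llm.generate(["Hello"], SamplingParams(max_tokens=16))
print(out[0].outputs[0].text)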

0 commit comments