Changes shown from 1 commit; 28 commits in total:
710b1ff  fix (SunMarc, Nov 5, 2025)
f72f96d  fixes for more models torch_bc (ArthurZucker, Nov 5, 2025)
e341529  nits and fixes (ArthurZucker, Nov 5, 2025)
0e51dec  last update (ArthurZucker, Nov 5, 2025)
0f022b5  Revert "tied weight first shot to the fiiiixxxxxx" (ArthurZucker, Nov 5, 2025)
1dabb4c  here we go again (ArthurZucker, Nov 5, 2025)
0c2b667  an attempt (ArthurZucker, Nov 6, 2025)
c48e1ed  up? (ArthurZucker, Nov 6, 2025)
d223635  nits (ArthurZucker, Nov 6, 2025)
bdbc01a  Fix bnb loading ! (SunMarc, Nov 6, 2025)
399388d  rm print (SunMarc, Nov 6, 2025)
acbeeae  Merge branch 'refactor-weight-loading' into fix-bnb (SunMarc, Nov 6, 2025)
e16da23  rm import (SunMarc, Nov 7, 2025)
386e259  update (SunMarc, Nov 7, 2025)
9788014  Merge remote-tracking branch 'upstream/refactor-weight-loading' into … (SunMarc, Nov 7, 2025)
72eff97  Update src/transformers/core_model_loading.py (SunMarc, Nov 7, 2025)
d841a04  Fix loadedparam (SunMarc, Nov 7, 2025)
e235eed  Merge remote-tracking branch 'upstream/fix-bnb' into fix-bnb (SunMarc, Nov 7, 2025)
e4df752  rm report (SunMarc, Nov 7, 2025)
3e69622  Fix tests single gpu (SunMarc, Nov 7, 2025)
a052513  should fix it (SunMarc, Nov 7, 2025)
db4fe31  Merge branch 'refactor-weight-loading' into fix-bnb (SunMarc, Nov 10, 2025)
9fa1b7a  guard needed for compressed-tensors (SunMarc, Nov 10, 2025)
ea5822d  Merge branch 'refactor-weight-loading' into fix-bnb (SunMarc, Nov 10, 2025)
5881d8e  deal with buffers (SunMarc, Nov 10, 2025)
3651460  Merge branch 'refactor-weight-loading' into fix-bnb (SunMarc, Nov 10, 2025)
00b0044  Merge branch 'refactor-weight-loading' into fix-bnb (SunMarc, Nov 10, 2025)
7d8df52  fix (SunMarc, Nov 10, 2025)
24 changes: 13 additions & 11 deletions src/transformers/core_model_loading.py
@@ -357,11 +357,17 @@ def set_param_for_module(
    missing_keys: MutableSet[str],
    misc: MutableMapping[str, Any],
    distributed_operation: Optional[TensorParallelLayer],
    hf_quantizer,
):
    with log_to_misc(layer_name, misc, layer_name):
        module_path, _, param_name = layer_name.rpartition(".")
        module_obj = model.get_submodule(module_path) if module_path else model
        param_value = param_value[0] if isinstance(param_value, list) else param_value[...]
        if isinstance(param_value, list):
            param_value = param_value[0]
        elif isinstance(param_value, torch.nn.Parameter):
            pass
Collaborator:

I am guessing that's for BNB? Can't we force it to return data instead of a param?

Member Author:

If you do tensor[...], the nn.Parameter becomes a plain tensor, so the later isinstance check no longer holds. But yes, this is for bnb: we need to return an nn.Params4bit, which is a subclass of torch.nn.Parameter, and I wanted to keep hf_quantizer-related logic out of this function as much as possible.
        else:
            param_value = param_value[...]
        ref = meta_model_state_dict.get(layer_name, empty_param)
        use_dtensor = hasattr(distributed_operation, "use_dtensor") and distributed_operation.use_dtensor
        if not isinstance(param_value, torch.nn.Parameter):
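Note: the behavior discussed in the review thread above can be checked in isolation (illustrative snippet, not part of the diff; the last check assumes bitsandbytes is installed):

```python
import torch

p = torch.nn.Parameter(torch.randn(4, 4))

# Indexing with [...] materializes a plain Tensor, so a later
# isinstance(..., torch.nn.Parameter) check no longer matches.
print(type(p[...]))                                   # <class 'torch.Tensor'>
print(isinstance(p[...], torch.nn.Parameter))         # False

# Params4bit subclasses torch.nn.Parameter, which is why Parameter inputs are
# passed through untouched above instead of being indexed.
import bitsandbytes as bnb
print(issubclass(bnb.nn.Params4bit, torch.nn.Parameter))  # True
```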
@@ -395,7 +401,7 @@ def convert_and_load_state_dict_in_model(
    state_dict,
    weight_mapping,
    tp_plan,
    quantizer,
    hf_quantizer,
    dtype=None,
    device_map=None,
    dtype_plan=None,
@@ -460,14 +466,9 @@ def convert_and_load_state_dict_in_model(
            if empty_param is None:
                unexpected_keys.add(t)
                continue

            if quantizer is not None and quantizer.param_needs_quantization(model, t):
                if quantizer.__class__.__name__ == "FineGrainedFP8HfQuantizer":
                    from .integrations.finegrained_fp8 import Fp8Quantize

                    converter.quantization_operation = Fp8Quantize()  # TODO support other methods
                else:
                    raise ValueError("This quantization method is gonna be supported SOOOON")

            if hf_quantizer is not None and hf_quantizer.param_needs_quantization(model, t):
                converter.quantization_operation = hf_quantizer.get_quantize_ops()
            else:
                _dtype = dtype
                matched_dtype_pattern = match_glob(t, dtype_policy_alt, dtype_policy_by_group_name)
@@ -532,7 +533,7 @@ def convert_and_load_state_dict_in_model(
                    with log_to_misc(layer_name, misc, op=op):
                        realized_value.update(
                            op.convert(
                                {k: realized_value.pop(k)}, quant_config=quantizer.quantization_config
                                {k: realized_value.pop(k)}, quant_config=hf_quantizer.quantization_config, model=model
                            )
                        )

@@ -549,6 +550,7 @@
                    missing_keys,
                    misc,
                    converter.distributed_operation,
                    hf_quantizer
                )
            except SkipLayer:
                continue
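Note: taken together, the changes above replace the FP8-only special case with a generic dispatch through the quantizer. A rough sketch of the resulting flow (simplified paraphrase; the wrapper function below is hypothetical, while the attribute and method names follow the diff):

```python
# Hypothetical condensation of the dispatch added above: the quantizer decides
# whether a checkpoint key needs quantization and returns a ConversionOps
# instance; its convert() may expand one key into several quantized tensors.
def realize_one_key(model, hf_quantizer, key, tensor):
    realized = {key: tensor}
    if hf_quantizer is not None and hf_quantizer.param_needs_quantization(model, key):
        op = hf_quantizer.get_quantize_ops()   # e.g. Bnb4bitQuantize() or Fp8Quantize()
        realized = op.convert(
            {key: realized.pop(key)},
            quant_config=hf_quantizer.quantization_config,
            model=model,
        )
    # each resulting key/value pair is then handed to set_param_for_module(...)
    return realized
```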
6 changes: 4 additions & 2 deletions src/transformers/integrations/__init__.py
@@ -36,6 +36,7 @@
        "get_keys_to_not_convert",
        "replace_with_bnb_linear",
        "validate_bnb_backend_availability",
        "Bnb4bitQuantize",
    ],
    "deepspeed": [
        "HfDeepSpeedConfig",
@@ -51,7 +52,7 @@
    ],
    "eetq": ["replace_with_eetq_linear"],
    "fbgemm_fp8": ["FbgemmFp8Linear", "FbgemmFp8Llama4TextExperts", "replace_with_fbgemm_fp8_linear"],
    "finegrained_fp8": ["FP8Linear", "replace_with_fp8_linear"],
    "finegrained_fp8": ["FP8Linear", "replace_with_fp8_linear", "Fp8Quantize"],
    "fsdp": ["is_fsdp_enabled", "is_fsdp_managed_module"],
    "ggml": [
        "GGUF_CONFIG_MAPPING",
@@ -181,6 +182,7 @@
        get_keys_to_not_convert,
        replace_with_bnb_linear,
        validate_bnb_backend_availability,
        Bnb4bitQuantize,
    )
    from .deepspeed import (
        HfDeepSpeedConfig,
@@ -196,7 +198,7 @@
    )
    from .eetq import replace_with_eetq_linear
    from .fbgemm_fp8 import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts, replace_with_fbgemm_fp8_linear
    from .finegrained_fp8 import FP8Linear, replace_with_fp8_linear
    from .finegrained_fp8 import FP8Linear, replace_with_fp8_linear, Fp8Quantize
    from .fsdp import is_fsdp_enabled, is_fsdp_managed_module
    from .ggml import (
        GGUF_CONFIG_MAPPING,
10 changes: 10 additions & 0 deletions src/transformers/integrations/bitsandbytes.py
@@ -1,5 +1,6 @@
import inspect
from inspect import signature
from typing import Optional

from ..utils import (
    get_available_devices,
@@ -26,7 +27,16 @@

logger = logging.get_logger(__name__)

from ..core_model_loading import ConversionOps

class Bnb4bitQuantize(ConversionOps):
    def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = None, **kwargs) -> dict[str, torch.Tensor]:
        target_key, value = tuple(input_dict.items())[0]
        value = value[0] if isinstance(value, list) else value
        old_value = model.get_parameter_or_buffer(target_key)
        new_value = bnb.nn.Params4bit(value, **old_value.__dict__).to(value.device)
        return {target_key : new_value}

def _replace_with_bnb_linear(
    model,
    modules_to_not_convert=None,
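Note: a hedged usage sketch of the new op (not runnable as-is: `model` is assumed to be a transformers model already prepared for bnb 4-bit so the hypothetical key resolves to an existing Params4bit, and a CUDA device is assumed for the actual quantization step):

```python
import torch
from transformers.integrations.bitsandbytes import Bnb4bitQuantize

op = Bnb4bitQuantize()
dense = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")  # stand-in checkpoint tensor

# convert() reuses the quantization attributes (quant_type, blocksize, ...) of the
# existing parameter found on `model`, re-wraps the loaded tensor as a Params4bit,
# and the .to(value.device) call performs the 4-bit quantization on the GPU.
out = op.convert({"model.layers.0.mlp.down_proj.weight": dense}, model=model)
# -> {"model.layers.0.mlp.down_proj.weight": <bnb.nn.Params4bit on the tensor's device>}
```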
15 changes: 3 additions & 12 deletions src/transformers/integrations/finegrained_fp8.py
@@ -568,15 +568,7 @@ def replace_with_fp8_linear(
        )

    return model


class QuantizationOp(ConversionOps):
    """Base class for quantization operations."""

    pass


class Fp8Quantize(QuantizationOp):
class Fp8Quantize(ConversionOps):
    """
    A quantization operation that creates two tensors, weight and scale out of a weight.
    """
@@ -587,7 +579,7 @@ def __init__(self, block_size: Optional[tuple[int, int]] = None):
        self.block_size = block_size
        self.reverse_op = Fp8Dequantize

    def convert(self, input_dict: torch.Tensor, *, quant_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    def convert(self, input_dict: torch.Tensor, quant_config: Optional[dict[str, Any]]= None, **kwargs) -> dict[str, torch.Tensor]:
        # Unpack single key/value (value may be wrapped in a list)
        target_keys, value = tuple(input_dict.items())[0]
        value = value[0] if isinstance(value, list) else value
@@ -655,8 +647,7 @@ def convert(self, input_dict: torch.Tensor, *, quant_config: dict[str, Any]) ->
            scale_key: inv_scales,
        }


class Fp8Dequantize(QuantizationOp):
class Fp8Dequantize(ConversionOps):
    """Inverse operation of :class:`Fp8Quantize`. Takes a pair (weight, scale) and reconstructs the fp32 tensor."""

    def __init__(self, block_size: Optional[tuple[int, int]] = None):
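Note: for intuition about what Fp8Quantize/Fp8Dequantize compute, here is a minimal blockwise FP8 round trip. It is an independent sketch of the general technique (assumed 128x128 blocks, float8_e4m3fn storage, PyTorch >= 2.1), not the exact implementation:

```python
import torch

FP8_MAX = 448.0  # largest finite value representable in float8_e4m3fn

def quantize_block_fp8(w: torch.Tensor, block: int = 128):
    # Per (block x block) tile: scale so the largest magnitude maps to FP8_MAX,
    # cast to float8, and keep the inverse scale for dequantization.
    rows, cols = w.shape
    tiles = w.reshape(rows // block, block, cols // block, block)
    amax = tiles.abs().amax(dim=(1, 3), keepdim=True).clamp(min=1e-12)
    scale = FP8_MAX / amax
    q = (tiles * scale).to(torch.float8_e4m3fn).reshape(rows, cols)
    inv_scale = (1.0 / scale).squeeze(3).squeeze(1)  # shape (rows//block, cols//block)
    return q, inv_scale

def dequantize_block_fp8(q: torch.Tensor, inv_scale: torch.Tensor, block: int = 128):
    rows, cols = q.shape
    tiles = q.to(torch.float32).reshape(rows // block, block, cols // block, block)
    return (tiles * inv_scale[:, None, :, None]).reshape(rows, cols)

w = torch.randn(256, 256)
q, s = quantize_block_fp8(w)
print((dequantize_block_fp8(q, s) - w).abs().max())  # error bounded by fp8 rounding
```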
9 changes: 7 additions & 2 deletions src/transformers/modeling_utils.py
@@ -4733,8 +4733,13 @@ def _initialize_missing_keys(self, missing_keys: list[str], is_quantized: bool)
        for key in self.state_dict():
            # If it's part of the keys that will be loaded, mark it as already initialized
            if key not in missing_keys:
                param_or_buffer = self.get_parameter_or_buffer(key)
                param_or_buffer._is_hf_initialized = True
                # some quantization methods save in the state_dict tensors that are not stored as buffer or parameters
                try:
                    param_or_buffer = self.get_parameter_or_buffer(key)
                    param_or_buffer._is_hf_initialized = True
                except AttributeError as e:
                    if not is_quantized:
                        raise e

        def set_is_initialized_for_modules(module):
            # A module is already initialized if and only if all its children are also already initialized, and all
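Note: the guard exists for checkpoint keys that only quantized models carry. A sketch with a hypothetical bnb-4bit metadata key (`model` stands in for the loaded quantized model):

```python
# Hypothetical bnb-4bit metadata key: it appears in the model's state_dict but is
# neither an nn.Parameter nor a buffer, so get_parameter_or_buffer() raises
# AttributeError, which is exactly what the new try/except tolerates.
key = "model.layers.0.mlp.down_proj.weight.quant_state.bitsandbytes__nf4"
try:
    model.get_parameter_or_buffer(key)
except AttributeError:
    pass  # expected for quantized checkpoints; a non-quantized model re-raises
```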
8 changes: 6 additions & 2 deletions src/transformers/quantizers/base.py
@@ -417,8 +417,12 @@ def _convert_model_for_quantization(self, model):
                parent_module._modules[name] = MODULES_TO_PATCH_FOR_QUANTIZATION[module_class_name]["module_name"](
                    model.config.get_text_config()
                )



    def get_quantize_ops(self):
        raise NotImplementedError(
            f"{self.quantization_config.quant_method} is not available yet and will be supported soon."
        )

class SequentialLlama4TextExperts(ModuleList):
    """
    A module that implements a compressed version of a list of expert modules.
4 changes: 4 additions & 0 deletions src/transformers/quantizers/quantizer_bnb_4bit.py
@@ -305,3 +305,7 @@ def _dequantize(self, model):
            model, self.modules_to_not_convert, quantization_config=self.quantization_config
        )
        return model

    def get_quantize_ops(self):
        from ..integrations.bitsandbytes import Bnb4bitQuantize
        return Bnb4bitQuantize()
4 changes: 4 additions & 0 deletions src/transformers/quantizers/quantizer_finegrained_fp8.py
@@ -226,3 +226,7 @@ def is_trainable(self) -> bool:
    def get_accelerator_warm_up_factor(self):
        # Pre-processing is done cleanly, so we can allocate everything here
        return 2

    def get_quantize_ops(self):
        from ..integrations import Fp8Quantize
        return Fp8Quantize()
2 changes: 1 addition & 1 deletion tests/utils/test_core_model_loading_helpers.py
@@ -157,7 +157,7 @@ def test_moe_and_qkv_conversion(self):
        ]

        missing, unexpected, mismatch, misc = convert_and_load_state_dict_in_model(
            model, state_dict, weight_mapping, tp_plan=None, quantizer=None
            model, state_dict, weight_mapping, tp_plan=None, hf_quantizer=None
        )

        self.assertEqual(missing, set())