From d78fb98c9524fef30295d9c7688d57889d474cd9 Mon Sep 17 00:00:00 2001
From: luozixin2 <lzx@diffulex.local>
Date: Sun, 15 Mar 2026 06:28:11 +0000
Subject: [PATCH] feat(quantization): rewrite quantize_model.py with full
 algorithm support

- Support quant_method: rtn, gptq, awq, gptq_marlin, awq_marlin
- Implement RTN quantization (no calibration)
- Implement GPTQ with Hessian-based error compensation
- Implement AWQ with activation-aware scaling
- Add Marlin repack for gptq_marlin and awq_marlin
- Correct pack functions for GPTQ and AWQ formats
- Full calibration data pipeline for diffusion models
- CLI interface with all parameters
- AutoModel support for DiffusionLM

Bug fixes:
- Add missing shutil import
- Fix load_file usage (no context manager)
- Fix attention_mask dtype to bool
- Fix AutoModel import
---
 diffulex/extensions/quantization/__init__.py  |    8 +-
 diffulex/extensions/quantization/bootstrap.py |  217 ++-
 .../quantization/kernels/__init__.py          |   10 +-
 .../kernels/kernel_availability.py            |   11 +-
 .../kernels/triton_kernels/__init__.py        |   15 +-
 .../chunked_prefill_attn_unified_fp8.py       |  361 ++++
 .../quantization/kernels/vllm_kernels.py      |   57 +-
 .../extensions/quantization/kv_cache_patch.py |  217 ++-
 .../extensions/quantization/layer_patch.py    |   24 +-
 .../quantization/linear_plan_builder.py       |    6 +
 .../extensions/quantization/linear_plans.py   |   15 +-
 .../extensions/quantization/loader_patch.py   |  341 +++-
 .../extensions/quantization/quantize_model.py | 1500 ++++++++++-------
 .../strategies/kv_cache_fp8_running_max.py    |   55 +-
 .../strategies/linear_awq_marlin_w4a16.py     |    2 +
 .../strategies/linear_awq_w4a16.py            |    2 +
 .../strategies/linear_fp8_w8a16.py            |    2 +-
 .../strategies/linear_fp8_w8a8.py             |    2 +-
 .../strategies/linear_gptq_marlin_w4a16.py    |    2 +
 .../strategies/linear_gptq_marlin_w8a16.py    |    2 +
 .../strategies/linear_gptq_wxa16.py           |   43 +-
 .../strategies/linear_int8_w8a16.py           |    2 +-
 .../strategies/linear_int8_w8a8.py            |  101 +-
 .../strategies/linear_w4a8_cutlass.py         |    2 +
 24 files changed, 2026 insertions(+), 971 deletions(-)
 create mode 100644 diffulex/extensions/quantization/kernels/triton_kernels/chunked_prefill_attn_unified_fp8.py

diff --git a/diffulex/extensions/quantization/__init__.py b/diffulex/extensions/quantization/__init__.py
index 01771a36..771de3a5 100644
--- a/diffulex/extensions/quantization/__init__.py
+++ b/diffulex/extensions/quantization/__init__.py
@@ -78,8 +78,7 @@
     VllmCutlassW4A8,
     VllmFp8LinearOp,
     # Triton kernels
-    Fp8KVAttentionKernel,
-    fp8_kv_attention_forward,
+    chunked_prefill_attn_unified_fp8,
     _HAS_TRITON_KERNELS,
 )
 
@@ -170,6 +169,8 @@
 # Offline quantization
 from .quantize_model import quantize_model
 
+
+
 __all__ = [
     # Bootstrap
     "enable",
@@ -198,8 +199,7 @@
     "VllmAllSparkW8A16",
     "VllmCutlassW4A8",
     "VllmFp8LinearOp",
-    "Fp8KVAttentionKernel",
-    "fp8_kv_attention_forward",
+    "chunked_prefill_attn_unified_fp8",
     "_HAS_TRITON_KERNELS",
     
     # Configuration
diff --git a/diffulex/extensions/quantization/bootstrap.py b/diffulex/extensions/quantization/bootstrap.py
index c12bcb55..2ac74b92 100644
--- a/diffulex/extensions/quantization/bootstrap.py
+++ b/diffulex/extensions/quantization/bootstrap.py
@@ -67,8 +67,20 @@ def enable(config: Optional[Dict[str, Any]] = None,
     
     _quant_config = config
     
+    # Setup import hooks first (before any diffulex imports)
+    _setup_import_hooks()
+    
     # Import and initialize all components
     try:
+        # 0. Import and patch loader first (before auto_model imports it)
+        # This ensures load_model is patched before auto_model binds to it
+        try:
+            import diffulex.utils.loader
+            from .loader_patch import patch_loader
+            patch_loader()
+        except Exception:
+            pass
+        
         # 1. Import strategies to register them
         from . import strategies  # noqa: F401
         
@@ -121,7 +133,59 @@ def enable(config: Optional[Dict[str, Any]] = None,
         # 5. Setup import hooks for post-import patching
         _setup_import_hooks()
         
+        # 6. Explicitly import and patch Attention class for FP8 kernel
+        # Attention is lazily imported via __getattr__, so we need to trigger the import
+        try:
+            from diffulex.attention import Attention
+            from .kv_cache_patch import patch_attention_class
+            patch_attention_class()
+        except Exception:
+            pass
+        
+        # 7. Patch already-imported model runners (e.g., d2f imported before hook was set)
+        #    Also patch ModelRunnerBase class directly to ensure allocate_kv_cache uses quantization
+        try:
+            from .kv_cache_patch import patch_model_runner, patch_allocate_kv_cache_method
+            import sys
+            
+            # Explicitly import and patch ModelRunnerBase class
+            # This must be done before any runner instances are created
+            try:
+                from diffulex.engine.model_runner import ModelRunnerBase
+                patch_allocate_kv_cache_method(ModelRunnerBase)
+                print(f"[Quantization] Patched ModelRunnerBase.allocate_kv_cache")
+            except ImportError as e:
+                print(f"[Quantization] Warning: Could not patch ModelRunnerBase: {e}")
+            
+            # Also patch instance __init__ for any runtime setup
+            for mod_name in list(sys.modules.keys()):
+                if 'model_runner' in mod_name:
+                    mod = sys.modules[mod_name]
+                    for attr_name in dir(mod):
+                        attr = getattr(mod, attr_name)
+                        if isinstance(attr, type) and 'runner' in attr_name.lower():
+                            # Patch the class
+                            original_init = attr.__init__
+                            def make_patched_init(orig_init):
+                                def patched_init(self, *args, **kwargs):
+                                    orig_init(self, *args, **kwargs)
+                                    patch_model_runner(self)
+                                return patched_init
+                            attr.__init__ = make_patched_init(original_init)
+        except Exception:
+            pass
+        
         _is_enabled = True
+        
+        # Eager patch loader if it was already imported before hook was set
+        try:
+            import sys
+            if 'diffulex.utils.loader' in sys.modules:
+                from .loader_patch import patch_loader
+                patch_loader()
+        except Exception:
+            pass
+        
         return True
         
     except Exception as e:
@@ -198,20 +262,24 @@ def _post_import_patch(module_name: str, module):
     
     This handles patching of modules that are imported after enable() is called.
     """
-    # Patch model runner
-    if 'model_runner' in module_name or hasattr(module, 'ModelRunner'):
+    # Patch model runner classes
+    if 'model_runner' in module_name or module_name.endswith('.d2f'):
         try:
             from .kv_cache_patch import patch_model_runner
             
-            # Patch class
-            if hasattr(module, 'ModelRunner'):
-                original_init = module.ModelRunner.__init__
-                
-                def patched_init(self, *args, **kwargs):
-                    original_init(self, *args, **kwargs)
-                    patch_model_runner(self)
-                
-                module.ModelRunner.__init__ = patched_init
+            # Patch all runner classes in the module
+            for attr_name in dir(module):
+                attr = getattr(module, attr_name)
+                if isinstance(attr, type) and 'runner' in attr_name.lower():
+                    original_init = attr.__init__
+                    
+                    def make_patched_init(orig_init):
+                        def patched_init(self, *args, **kwargs):
+                            orig_init(self, *args, **kwargs)
+                            patch_model_runner(self)
+                        return patched_init
+                    
+                    attr.__init__ = make_patched_init(original_init)
         except Exception:
             pass
     
@@ -231,11 +299,24 @@ def patched_init(self, *args, **kwargs):
         except Exception:
             pass
     
-    # Patch loader
-    if 'loader' in module_name:
+    # Patch Attention class for FP8 custom kernel
+    print(f"[_post_import_patch] Checking module: {module_name}, 'attn_impl' in name: {'attn_impl' in module_name}")
+    if 'attn_impl' in module_name or module_name == 'diffulex.attention.attn_impl':
+        print(f"[_post_import_patch] Patching Attention for {module_name}")
         try:
-            from .loader_patch import patch_loader
-            patch_loader()
+            from .kv_cache_patch import patch_attention_class
+            patch_attention_class()
+            print(f"[_post_import_patch] Patching Attention succeeded for {module_name}")
+        except Exception as e:
+            print(f"[_post_import_patch] Patching Attention failed: {e}")
+    
+    # Patch loader when diffulex.utils is imported (loader is a submodule)
+    if module_name == 'diffulex.utils':
+        try:
+            # Check if loader submodule exists
+            if hasattr(module, 'loader'):
+                from .loader_patch import patch_loader
+                patch_loader()
         except Exception:
             pass
     
@@ -379,96 +460,52 @@ def patched_init(self, *args, **kwargs):
 
 def _quantize_model_weights(model_wrapper):
     """
-    Quantize all linear layer weights in the model.
+    Verify offline quantized weights are properly loaded.
     
-    This is called once after model loading to pre-quantize weights.
+    Raises error if user specified GPTQ/AWQ but weights are not loaded.
     """
-    from .context import get_linear_strategy
     from .layer_mixin import LinearQuantizationMixin
+    from .context import get_linear_strategy
     
-    # Check if already quantized (avoid duplicate quantization in multi-worker setup)
     if getattr(model_wrapper, '_weights_quantized', False):
         return
+    model_wrapper._weights_quantized = True
     
-    # Get model runner
-    model_runner = getattr(model_wrapper, 'model_runner', None)
-    if model_runner is None:
-        return
-    
-    model = getattr(model_runner, 'model', None)
-    if model is None:
+    # Get current quantization config
+    global _quant_config
+    if _quant_config is None:
         return
     
-    # Get current quantization config
     weight_method = _quant_config.get('weights', {}).get('method', 'bf16')
     
-    # Skip if not online quantization
-    if weight_method in ['bf16', 'none']:
+    # Skip if not offline quantization
+    if weight_method not in ['gptq_w4a16', 'gptq_w8a16', 'awq_w4a16', 'gptq_marlin_w4a16', 'awq_marlin_w4a16']:
         return
     
-    # Skip if offline quantization (GPTQ/AWQ) - those are already quantized
-    if any(fmt in weight_method.lower() for fmt in ['gptq', 'awq', 'marlin']):
+    # Get model
+    model_runner = getattr(model_wrapper, 'model_runner', None)
+    if model_runner is None:
         return
-    
-    # Mark as quantized to avoid duplicate work
-    model_wrapper._weights_quantized = True
-    
-    print(f"[Quantization] Pre-quantizing model weights to {weight_method}...")
-    
-    # Get strategy
-    strategy = get_linear_strategy('attn')  # Use attn strategy for all
-    if strategy is None:
+    model = getattr(model_runner, 'model', None)
+    if model is None:
         return
     
-    quantized_count = 0
-    total_saved_bytes = 0
-    
-    # Iterate through all modules
+    # Check offline quantized layers
+    offline_count = 0
+    total_count = 0
     for name, module in model.named_modules():
-        # Check if this is a quantized linear layer
         if isinstance(module, LinearQuantizationMixin):
-            # Skip if already quantized
-            if module.has_quantized_weight() or module.has_offline_quantized_weight():
-                continue
-            
-            # Quantize weight
-            try:
-                weight = module.weight
-                if weight is None or weight.dtype != torch.bfloat16:
-                    continue
-                
-                original_size = weight.numel() * weight.element_size()
-                
-                # Use strategy to quantize weight
-                q_weight, w_meta = strategy.quantize_weight_for_kernel(weight)
-                w_scale = w_meta.get('scale')
-                w_zero = w_meta.get('zero_point')
-                
-                # Store quantized weight
-                module.set_quantized_weight(q_weight, w_scale, w_zero)
-                
-                # Delete original weight to save memory
-                if hasattr(module, 'weight'):
-                    delattr(module, 'weight')
-                    if 'weight' in module._parameters:
-                        del module._parameters['weight']
-                
-                quantized_size = q_weight.numel() * q_weight.element_size()
-                total_saved_bytes += (original_size - quantized_size)
-                quantized_count += 1
-                
-            except Exception as e:
-                # Log but continue
-                print(f"[Quantization] Warning: Failed to quantize {name}: {e}")
-                continue
-    
-    if quantized_count > 0:
-        saved_mb = total_saved_bytes / (1024 ** 2)
-        print(f"[Quantization] Pre-quantized {quantized_count} layers to {weight_method}")
-        print(f"[Quantization] Estimated memory saved: {saved_mb:.1f} MB")
-        
-        # Force CUDA synchronization to get accurate memory stats
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
-            mem_allocated = torch.cuda.memory_allocated() / 1024**3
-            print(f"[Quantization] Current GPU memory: {mem_allocated:.2f} GB")
+            total_count += 1
+            if module.has_offline_quantized_weight():
+                offline_count += 1
+    
+    if offline_count == 0 and total_count > 0:
+        raise RuntimeError(
+            f"Quantization mismatch: weight_quant_method='{weight_method}' specified, "
+            f"but no offline quantized weights found in model. "
+            f"Please ensure you're loading a {weight_method.upper()} quantized model, "
+            f"or set weight_quant_method='bf16' for non-quantized models."
+        )
+    
+    if offline_count > 0:
+        print(f"[Quantization] {offline_count}/{total_count} layers using {weight_method}")
diff --git a/diffulex/extensions/quantization/kernels/__init__.py b/diffulex/extensions/quantization/kernels/__init__.py
index 12c9f6ac..d1b12f72 100644
--- a/diffulex/extensions/quantization/kernels/__init__.py
+++ b/diffulex/extensions/quantization/kernels/__init__.py
@@ -38,14 +38,13 @@
 # Import custom Triton kernels
 try:
     from .triton_kernels import (
-        Fp8KVAttentionKernel,
-        fp8_kv_attention_forward,
+        chunked_prefill_attn_unified_fp8,
+        _HAS_FP8_UNIFIED_KERNEL,
     )
     _HAS_TRITON_KERNELS = True
 except ImportError:
     _HAS_TRITON_KERNELS = False
-    Fp8KVAttentionKernel = None
-    fp8_kv_attention_forward = None
+    chunked_prefill_attn_unified_fp8 = None
 
 __all__ = [
     # Registry
@@ -70,7 +69,6 @@
     "VllmCutlassW4A8",
     "VllmFp8LinearOp",
     # Triton kernels
-    "Fp8KVAttentionKernel",
-    "fp8_kv_attention_forward",
+    "chunked_prefill_attn_unified_fp8",
     "_HAS_TRITON_KERNELS",
 ]
diff --git a/diffulex/extensions/quantization/kernels/kernel_availability.py b/diffulex/extensions/quantization/kernels/kernel_availability.py
index cb5cfabe..5a29acf4 100644
--- a/diffulex/extensions/quantization/kernels/kernel_availability.py
+++ b/diffulex/extensions/quantization/kernels/kernel_availability.py
@@ -7,6 +7,7 @@
 import warnings
 import os
 from typing import Set, Optional
+import torch
 
 # Track which warnings have been issued to avoid spamming
 _issued_warnings: Set[str] = set()
@@ -27,7 +28,7 @@ def is_strict_mode() -> bool:
 
 
 def check_vllm_op_available(op_name: str) -> bool:
-    """Check if a vLLM custom op is available."""
+    """Check if a vLLM custom op is available via vllm._custom_ops."""
     try:
         import vllm._custom_ops as ops
         return hasattr(ops, op_name)
@@ -35,6 +36,14 @@ def check_vllm_op_available(op_name: str) -> bool:
         return False
 
 
+def check_torch_c_op_available(op_name: str) -> bool:
+    """Check if a vLLM custom op is available via torch.ops._C."""
+    try:
+        return hasattr(torch.ops._C, op_name)
+    except (ImportError, AttributeError):
+        return False
+
+
 def check_kernel_available(kernel_name: str, op_checker: Optional[callable] = None) -> bool:
     """
     Check if a kernel is available.
diff --git a/diffulex/extensions/quantization/kernels/triton_kernels/__init__.py b/diffulex/extensions/quantization/kernels/triton_kernels/__init__.py
index 3996c3ca..39fbaa9e 100644
--- a/diffulex/extensions/quantization/kernels/triton_kernels/__init__.py
+++ b/diffulex/extensions/quantization/kernels/triton_kernels/__init__.py
@@ -4,17 +4,16 @@
 Pure Triton implementations for operations not covered by vLLM kernels.
 """
 
+# Unified FP8 kernel (Stage 1 + Stage 2)
 try:
-    from .fp8_kv_attention import (
-        Fp8KVAttentionKernel,
-        fp8_kv_attention_forward,
+    from .chunked_prefill_attn_unified_fp8 import (
+        chunked_prefill_attn_unified_fp8,
     )
-    _HAS_FP8_KERNEL = True
+    _HAS_FP8_UNIFIED_KERNEL = True
 except ImportError:
-    _HAS_FP8_KERNEL = False
+    _HAS_FP8_UNIFIED_KERNEL = False
 
 __all__ = [
-    "Fp8KVAttentionKernel",
-    "fp8_kv_attention_forward",
-    "_HAS_FP8_KERNEL",
+    "chunked_prefill_attn_unified_fp8",
+    "_HAS_FP8_UNIFIED_KERNEL",
 ]
diff --git a/diffulex/extensions/quantization/kernels/triton_kernels/chunked_prefill_attn_unified_fp8.py b/diffulex/extensions/quantization/kernels/triton_kernels/chunked_prefill_attn_unified_fp8.py
new file mode 100644
index 00000000..c6b9e3df
--- /dev/null
+++ b/diffulex/extensions/quantization/kernels/triton_kernels/chunked_prefill_attn_unified_fp8.py
@@ -0,0 +1,361 @@
+"""
+FP8 KV Cache Unified Attention Triton Kernel
+
+Extended version of chunked_prefill_attn_unified that supports FP8 quantized KV cache.
+- Stage 1: Attention against cached FP8 KV (dequantized to BF16 on-the-fly)
+- Stage 2: Attention against new BF16 KV (unchanged from original)
+
+This kernel maintains the same interface as the original unified kernel,
+only adding k_scale and v_scale parameters for FP8 dequantization.
+"""
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _chunked_prefill_attn_unified_fp8_kernel(
+    q_ptr,
+    k_ptr,
+    v_ptr,
+    o_ptr,
+    k_cache_ptr,          # fp8 cache
+    v_cache_ptr,          # fp8 cache
+    k_scale_ptr,          # fp32 scalar ptr
+    v_scale_ptr,          # fp32 scalar ptr
+    page_tables_ptr,
+    status_table_ptr,
+    context_lens_ptr,
+    cu_seqlens_q_ptr,
+    valid_slices_ptr,
+    prefix_lens_ptr,
+    padded_prefix_lens_ptr,
+    softmax_scale,  # fp32 scalar
+    # q/k/v/o strides
+    q_stride_s,
+    q_stride_h,
+    q_stride_d,
+    kv_stride_s,
+    kv_stride_h,
+    kv_stride_d,
+    o_stride_s,
+    o_stride_h,
+    o_stride_d,
+    # cache strides: [npages, psz, kvh, d]
+    k_cache_stride_npages,
+    k_cache_stride_psz,
+    k_cache_stride_h,
+    k_cache_stride_d,
+    v_cache_stride_npages,
+    v_cache_stride_psz,
+    v_cache_stride_h,
+    v_cache_stride_d,
+    # page_tables strides
+    page_tables_stride_nreqs,
+    page_tables_stride_pages,
+    # misc
+    NUM_GROUPS: tl.constexpr,
+    HEAD_DIM: tl.constexpr,
+    HEAD_DIM_PADDED: tl.constexpr,
+    PAGE_SIZE: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    DLLM_BLOCK_SIZE: tl.constexpr,
+    IS_BLOCK_CAUSAL: tl.constexpr,
+    IS_PREFIX_FULL: tl.constexpr,
+):
+    """
+    Unified attention kernel with FP8 KV cache support.
+    
+    Stage 1: Load FP8 K/V from cache, dequantize to BF16, compute attention
+    Stage 2: Load BF16 K/V from current step, compute attention
+    """
+    req_id = tl.program_id(0)
+    head_id = tl.program_id(1)
+    q_block_id = tl.program_id(2)
+
+    kv_head_id = head_id // NUM_GROUPS
+
+    # Load metadata
+    status = tl.load(status_table_ptr + req_id).to(tl.int32)
+    context_len = tl.load(context_lens_ptr + req_id).to(tl.int32)
+    q_start = tl.load(cu_seqlens_q_ptr + req_id).to(tl.int32)
+    q_end = tl.load(cu_seqlens_q_ptr + req_id + 1).to(tl.int32)
+    valid_slice = tl.load(valid_slices_ptr + req_id).to(tl.int32)
+    prefix_len = tl.load(prefix_lens_ptr + req_id).to(tl.int32)
+    padded_prefix_len = tl.load(padded_prefix_lens_ptr + req_id).to(tl.int32)
+
+    q_len = q_end - q_start
+    valid_q_len = valid_slice - q_start
+    valid_kv_len = valid_q_len
+    new_len = q_len
+
+    # Setup Q loading
+    offs_q_block = q_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_d = tl.arange(0, HEAD_DIM_PADDED)
+    mask_q_block = offs_q_block < valid_q_len
+    mask_d = offs_d < HEAD_DIM
+
+    offs_q = (
+        (q_start + offs_q_block[:, None]) * q_stride_s
+        + head_id * q_stride_h
+        + offs_d[None, :] * q_stride_d
+    )
+    q = tl.load(
+        q_ptr + offs_q,
+        mask=mask_q_block[:, None] & mask_d[None, :],
+        other=0.0,
+    ).to(tl.bfloat16)
+
+    # Load FP8 dequantization scales (global per-tensor)
+    k_scale = tl.load(k_scale_ptr).to(tl.float32)
+    v_scale = tl.load(v_scale_ptr).to(tl.float32)
+
+    # Flash attention accumulators
+    m = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
+    l = tl.zeros([BLOCK_M], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, HEAD_DIM_PADDED], dtype=tl.float32)
+
+    # ---------------------------------------------------------
+    # Stage 1: Attention against FP8 KV Cache (dequant to BF16)
+    # ---------------------------------------------------------
+    offs_kv_cache_block = tl.arange(0, BLOCK_N)
+    mask_kv_cache_block = offs_kv_cache_block < PAGE_SIZE
+    num_pages = (context_len + PAGE_SIZE - 1) // PAGE_SIZE
+
+    for page_rel_id in range(0, num_pages):
+        page_abs_id = tl.load(
+            page_tables_ptr + req_id * page_tables_stride_nreqs + page_rel_id * page_tables_stride_pages
+        ).to(tl.int32)
+
+        page_token_ids = offs_kv_cache_block + page_rel_id * PAGE_SIZE
+        page_token_valid_map = (page_abs_id >= 0) & (page_token_ids < context_len) & mask_kv_cache_block
+
+        # Load K from FP8 cache
+        k_offs = (
+            page_abs_id * k_cache_stride_npages
+            + offs_kv_cache_block[:, None] * k_cache_stride_psz
+            + kv_head_id * k_cache_stride_h
+            + offs_d[None, :] * k_cache_stride_d
+        )
+
+        # Load FP8, dequantize to BF16: BF16 = FP8 * scale
+        # NOTE: Use FP32 intermediate to avoid cvt.bf16.f16 (requires sm_90+)
+        k_fp8 = tl.load(
+            k_cache_ptr + k_offs,
+            mask=page_token_valid_map[:, None] & mask_d[None, :],
+            other=0.0,
+        )
+        k = (k_fp8.to(tl.float32) * k_scale).to(tl.bfloat16)
+
+        # Compute attention scores
+        scores = tl.dot(q, tl.trans(k)).to(tl.float32) * softmax_scale
+        scores = tl.where(mask_q_block[:, None] & page_token_valid_map[None, :], scores, float("-inf"))
+
+        # Online softmax update
+        m_new = tl.maximum(m, tl.max(scores, axis=1))
+        p = tl.exp(scores - m_new[:, None])
+        l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1)
+        alpha = tl.exp(m - m_new)
+        acc *= alpha[:, None]
+
+        # Load V from FP8 cache
+        v_offs = (
+            page_abs_id * v_cache_stride_npages
+            + offs_kv_cache_block[:, None] * v_cache_stride_psz
+            + kv_head_id * v_cache_stride_h
+            + offs_d[None, :] * v_cache_stride_d
+        )
+
+        # Load FP8, dequantize to BF16
+        v_fp8 = tl.load(
+            v_cache_ptr + v_offs,
+            mask=page_token_valid_map[:, None] & mask_d[None, :],
+            other=0.0,
+        )
+        v = (v_fp8.to(tl.float32) * v_scale).to(tl.bfloat16)
+
+        # Accumulate attention output
+        acc += tl.dot(p.to(tl.bfloat16), v).to(tl.float32)
+        m = m_new
+        l = l_new
+
+    # ---------------------------------------------------------
+    # Stage 2: Attention against new KV (BF16, unchanged)
+    # ---------------------------------------------------------
+    kv_start = q_start
+    full_range = tl.cdiv(valid_kv_len, BLOCK_N)
+    block_causal_range = tl.minimum(
+        tl.cdiv(valid_q_len + (q_block_id + 1) * BLOCK_M, BLOCK_N),
+        tl.cdiv(valid_kv_len, BLOCK_N),
+    )
+
+    if IS_BLOCK_CAUSAL and not IS_PREFIX_FULL:
+        loop_range = block_causal_range
+    elif IS_BLOCK_CAUSAL and IS_PREFIX_FULL:
+        is_prefilling = status == 0
+        if is_prefilling:
+            loop_range = full_range
+        else:
+            loop_range = block_causal_range
+    else:
+        loop_range = full_range
+
+    for kv_block_id in range(0, loop_range):
+        kv_block_start = kv_block_id * BLOCK_N
+        offs_kv_block = kv_block_start + tl.arange(0, BLOCK_N)
+        kv_token_valid_map = (offs_kv_block < new_len) & (offs_kv_block < valid_q_len)
+
+        # Load K (BF16, from current step)
+        k_offs = (
+            (kv_start + offs_kv_block[None, :]) * kv_stride_s
+            + kv_head_id * kv_stride_h
+            + offs_d[:, None] * kv_stride_d
+        )
+        k = tl.load(
+            k_ptr + k_offs,
+            mask=kv_token_valid_map[None, :] & mask_d[:, None],
+            other=0.0,
+        ).to(tl.bfloat16)
+
+        # Compute attention scores
+        scores = tl.dot(q, k).to(tl.float32) * softmax_scale
+        score_valid_mask = mask_q_block[:, None] & kv_token_valid_map[None, :]
+
+        # Apply causal mask
+        if IS_BLOCK_CAUSAL and not IS_PREFIX_FULL:
+            score_block_mask = ((offs_q_block // DLLM_BLOCK_SIZE + 1) * DLLM_BLOCK_SIZE)[:, None] > offs_kv_block[None, :]
+            score_mask = score_valid_mask & score_block_mask
+        elif IS_BLOCK_CAUSAL and IS_PREFIX_FULL:
+            if is_prefilling:
+                score_pure_prefix_mask = (offs_q_block < prefix_len)[:, None] & (offs_kv_block < prefix_len)[None, :]
+                score_padded_causal_mask = (
+                    ((offs_q_block >= prefix_len) & (offs_q_block < padded_prefix_len))[:, None]
+                    & (offs_kv_block < padded_prefix_len)[None, :]
+                )
+                score_block_mask = ((offs_q_block // DLLM_BLOCK_SIZE + 1) * DLLM_BLOCK_SIZE)[:, None] > offs_kv_block[None, :]
+                score_block_mask_extend_only = score_block_mask & (offs_q_block >= padded_prefix_len)[:, None]
+                score_mask = score_pure_prefix_mask | score_padded_causal_mask | score_block_mask_extend_only
+            else:
+                score_block_mask = ((offs_q_block // DLLM_BLOCK_SIZE + 1) * DLLM_BLOCK_SIZE)[:, None] > offs_kv_block[None, :]
+                score_mask = score_valid_mask & score_block_mask
+        else:
+            score_mask = score_valid_mask
+
+        scores = tl.where(score_mask, scores, float("-inf"))
+
+        # Online softmax update
+        m_new = tl.maximum(m, tl.max(scores, axis=1))
+        p = tl.exp(scores - m_new[:, None])
+        l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1)
+        alpha = tl.exp(m - m_new)
+        acc *= alpha[:, None]
+
+        # Load V (BF16, from current step)
+        v_offs = (
+            (kv_start + offs_kv_block[:, None]) * kv_stride_s
+            + kv_head_id * kv_stride_h
+            + offs_d[None, :] * kv_stride_d
+        )
+        v = tl.load(
+            v_ptr + v_offs,
+            mask=kv_token_valid_map[:, None] & mask_d[None, :],
+            other=0.0,
+        ).to(tl.bfloat16)
+
+        acc += tl.dot(p.to(tl.bfloat16), v).to(tl.float32)
+        m = m_new
+        l = l_new
+
+    # Normalize and store output
+    out = acc / l[:, None]
+    o_offs = (
+        (q_start + offs_q_block[:, None]) * o_stride_s
+        + head_id * o_stride_h
+        + offs_d[None, :] * o_stride_d
+    )
+    tl.store(
+        o_ptr + o_offs,
+        out.to(tl.bfloat16),
+        mask=mask_q_block[:, None] & mask_d[None, :],
+    )
+
+
+def chunked_prefill_attn_unified_fp8(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    k_cache: torch.Tensor,   # fp8
+    v_cache: torch.Tensor,   # fp8
+    k_scale: torch.Tensor,   # scalar fp32 tensor
+    v_scale: torch.Tensor,   # scalar fp32 tensor
+    attn_metadata,
+):
+    """
+    FP8 KV Cache Unified Attention Forward.
+    
+    Args:
+        q: Query tensor [total_seqlen, num_heads, head_dim] (BF16)
+        k: Key tensor [total_seqlen, num_kv_heads, head_dim] (BF16) - current step
+        v: Value tensor [total_seqlen, num_kv_heads, head_dim] (BF16) - current step
+        k_cache: Key cache [num_pages, page_size, num_kv_heads, head_dim] (FP8)
+        v_cache: Value cache [num_pages, page_size, num_kv_heads, head_dim] (FP8)
+        k_scale: Per-tensor K scale (scalar float32)
+        v_scale: Per-tensor V scale (scalar float32)
+        attn_metadata: Attention metadata object
+        
+    Returns:
+        Output tensor [total_seqlen, num_heads, head_dim] (BF16)
+    """
+    o = torch.empty_like(q)
+    num_heads = q.shape[1]
+    num_kv_heads = k.shape[1]
+    num_groups = num_heads // num_kv_heads
+
+    head_dim = q.shape[-1]
+    head_dim_padded = 1 << (head_dim - 1).bit_length()
+    softmax_scale = 1.0 / (head_dim ** 0.5)
+    page_size = k_cache.shape[1]
+    num_reqs = attn_metadata.cu_seqlens_q.shape[0] - 1
+
+    # Block sizes (tuned for RTX 4090)
+    BLOCK_M = 64
+    BLOCK_N = 32  # Reduced for shared memory
+
+    grid = (num_reqs, num_heads, triton.cdiv(int(attn_metadata.max_seqlen_q), BLOCK_M))
+
+    _chunked_prefill_attn_unified_fp8_kernel[grid](
+        q,
+        k,
+        v,
+        o,
+        k_cache,
+        v_cache,
+        k_scale,
+        v_scale,
+        attn_metadata.page_tables,
+        attn_metadata.status_table,
+        attn_metadata.context_lens,
+        attn_metadata.cu_seqlens_q,
+        attn_metadata.valid_slices,
+        attn_metadata.prefix_lens,
+        attn_metadata.padded_prefix_lens,
+        softmax_scale,
+        *q.stride(),
+        *k.stride(),
+        *o.stride(),
+        *k_cache.stride(),
+        *v_cache.stride(),
+        *attn_metadata.page_tables.stride(),
+        NUM_GROUPS=num_groups,
+        HEAD_DIM=head_dim,
+        HEAD_DIM_PADDED=head_dim_padded,
+        PAGE_SIZE=page_size,
+        DLLM_BLOCK_SIZE=attn_metadata.block_size,
+        IS_BLOCK_CAUSAL=attn_metadata.is_block_causal,
+        IS_PREFIX_FULL=attn_metadata.is_prefix_full,
+        BLOCK_M=BLOCK_M,
+        BLOCK_N=BLOCK_N,
+    )
+    return o
diff --git a/diffulex/extensions/quantization/kernels/vllm_kernels.py b/diffulex/extensions/quantization/kernels/vllm_kernels.py
index 2cf0c322..3421a208 100644
--- a/diffulex/extensions/quantization/kernels/vllm_kernels.py
+++ b/diffulex/extensions/quantization/kernels/vllm_kernels.py
@@ -9,7 +9,7 @@
 
 from .kernel_registry import LinearKernel, KVCacheKernel
 from .kernel_registry import register_kernel as _register
-from .kernel_availability import check_vllm_op_available, warn_kernel_unavailable
+from .kernel_availability import check_vllm_op_available, check_torch_c_op_available, warn_kernel_unavailable
 
 
 class VllmKernelBase:
@@ -47,7 +47,7 @@ def __init__(self):
 
 @_register("vllm_gptq_gemm")
 class VllmGPTQGemm(VllmKernelBase, LinearKernel):
-    """GPTQ GEMM kernel (W2/W3/W4/W8)."""
+    """GPTQ GEMM kernel (W2/W3/W4/W8) via vllm._custom_ops."""
     
     name = "vllm_gptq_gemm"
     description = "GPTQ GEMM for 2/3/4/8-bit weights"
@@ -60,6 +60,59 @@ def forward(self, x: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor,
         return self._op(x, qweight, qzeros, scales, g_idx, is_shuffled, bits)
 
 
+@_register("torch_c_gptq_gemm")
+class TorchCGPTQGemm(LinearKernel):
+    """GPTQ GEMM kernel via torch.ops._C (vLLM's torch library interface)."""
+    
+    name = "torch_c_gptq_gemm"
+    description = "GPTQ GEMM via torch.ops._C for 2/3/4/8-bit weights"
+    
+    _op: Optional[Callable] = None
+    _checked: bool = False
+    
+    @classmethod
+    def is_available(cls) -> bool:
+        """Check if torch.ops._C.gptq_gemm is available."""
+        if not cls._checked:
+            cls._op = None
+            if check_torch_c_op_available("gptq_gemm"):
+                try:
+                    cls._op = torch.ops._C.gptq_gemm
+                except (ImportError, AttributeError):
+                    pass
+            cls._checked = True
+        return cls._op is not None
+    
+    @classmethod
+    def get_missing_reason(cls) -> Optional[str]:
+        """Get reason why kernel is unavailable."""
+        if cls.is_available():
+            return None
+        return "torch.ops._C.gptq_gemm not available. Install vLLM with CUDA support."
+    
+    def __init__(self):
+        if not self.is_available():
+            raise RuntimeError(f"{self.__class__.__name__} is not available")
+    
+    def forward(self, x: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor,
+                scales: torch.Tensor, g_idx: torch.Tensor,
+                use_exllama: bool, use_v2_format: bool,
+                bits: int) -> torch.Tensor:
+        """Execute GPTQ GEMM via torch.ops._C.
+        
+        Args:
+            x: Input tensor (FP16)
+            qweight: Quantized weight tensor (int32)
+            qzeros: Quantized zeros tensor (int32)
+            scales: Scales tensor (FP16/FP32)
+            g_idx: Group indices (int), empty tensor if not used
+            use_exllama: Whether to use Exllama format
+            use_v2_format: Whether to use v2 format
+            bits: Quantization bits (2, 3, 4, or 8)
+        """
+        return self._op(x, qweight, qzeros, scales, g_idx, use_exllama, use_v2_format, bits)
+
+
 @_register("vllm_awq_gemm")
 class VllmAWQGemm(VllmKernelBase, LinearKernel):
     """AWQ GEMM kernel (W4)."""
diff --git a/diffulex/extensions/quantization/kv_cache_patch.py b/diffulex/extensions/quantization/kv_cache_patch.py
index 0cd5e3f5..c5545215 100644
--- a/diffulex/extensions/quantization/kv_cache_patch.py
+++ b/diffulex/extensions/quantization/kv_cache_patch.py
@@ -7,14 +7,25 @@
 
 import torch
 from typing import Optional, Tuple, Any, Dict
+import logging
 
 from .context import get_kv_cache_strategy
 
-# Import custom FP8 Triton kernel
+logger = logging.getLogger(__name__)
+
+# Import custom FP8 Triton kernel (new unified version with Stage 1 + Stage 2)
 try:
-    from .kernels.triton_kernels import fp8_kv_attention_forward
-    _HAS_FP8_TRITON_KERNEL = True
-except ImportError:
+    from .kernels.triton_kernels import chunked_prefill_attn_unified_fp8
+    # Enable Triton kernel for all CUDA devices (including RTX 4090 sm_89)
+    # NOTE: Kernel uses FP32 intermediate to avoid cvt.bf16.f16 (sm_90+ requirement)
+    import torch
+    if torch.cuda.is_available():
+        _HAS_FP8_TRITON_KERNEL = True
+        print(f"[Quantization] FP8 unified Triton kernel enabled for device capability {torch.cuda.get_device_capability()}")
+    else:
+        _HAS_FP8_TRITON_KERNEL = False
+except ImportError as e:
+    print(f"[Quantization] FP8 unified Triton kernel not available: {e}")
     _HAS_FP8_TRITON_KERNEL = False
 
 
@@ -179,37 +190,79 @@ def use_fp8_triton_kernel() -> bool:
     return _HAS_FP8_TRITON_KERNEL
 
 
-def run_fp8_kv_attention(
-    q: torch.Tensor,
-    k_cache: torch.Tensor,
-    v_cache: torch.Tensor,
-    k_scale: torch.Tensor,
-    v_scale: torch.Tensor,
-    page_tables: torch.Tensor,
-    context_lens: torch.Tensor,
-    cu_seqlens_q: torch.Tensor,
-    softmax_scale: float,
-    is_e4m3: bool = True,
-) -> Optional[torch.Tensor]:
+# Model Runner patching
+def patch_allocate_kv_cache_method(model_runner_class):
     """
-    Run FP8 KV attention using custom Triton kernel.
+    Patch ModelRunnerBase.allocate_kv_cache class method to support quantization.
     
-    Returns None if kernel is not available.
+    This must be called before any model runner instance is created.
     """
-    if not _HAS_FP8_TRITON_KERNEL:
-        return None
+    if hasattr(model_runner_class, '_allocate_kv_cache_patched'):
+        return
+    model_runner_class._allocate_kv_cache_patched = True
     
-    try:
-        return fp8_kv_attention_forward(
-            q, k_cache, v_cache, k_scale, v_scale,
-            page_tables, context_lens, cu_seqlens_q,
-            softmax_scale, is_e4m3
-        )
-    except Exception:
-        return None
+    # Store original allocate_kv_cache
+    original_allocate = model_runner_class.allocate_kv_cache
+    
+    def allocate_kv_cache_with_quant(self):
+        """Allocate KV cache with quantization support."""
+        # Get quantization strategy before allocation
+        strategy = get_kv_cache_strategy()
+        
+        if strategy is not None:
+            # Store strategy for later use
+            self._kv_cache_strategy = strategy
+            self.kv_cache_dtype = getattr(strategy, 'name', 'bf16')
+            
+            # Determine storage dtype from strategy
+            storage_dtype_info = strategy.get_storage_dtype(self)
+            if isinstance(storage_dtype_info, tuple):
+                storage_dtype = storage_dtype_info[0]
+            else:
+                storage_dtype = storage_dtype_info
+            
+            # Temporarily override dtype for allocation
+            original_dtype = getattr(self, 'default_dtype', torch.bfloat16)
+            self._original_dtype_for_kv = original_dtype
+            self.default_dtype = storage_dtype if storage_dtype != torch.float8_e4m3fn else torch.bfloat16
+            
+            # Store FP8 info if needed
+            if storage_dtype == torch.float8_e4m3fn:
+                self._kv_cache_storage_dtype = torch.float8_e4m3fn
+                self._kv_cache_compute_dtype = torch.bfloat16
+        
+        # Call original allocation
+        result = original_allocate(self)
+        
+        # Restore dtype and convert allocated cache if needed
+        if hasattr(self, '_original_dtype_for_kv'):
+            self.default_dtype = self._original_dtype_for_kv
+            delattr(self, '_original_dtype_for_kv')
+        
+        # If FP8 was requested, convert the allocated cache
+        if hasattr(self, '_kv_cache_storage_dtype') and self._kv_cache_storage_dtype == torch.float8_e4m3fn:
+            # Convert allocated caches to FP8
+            if hasattr(self, 'kv_cache') and self.kv_cache is not None:
+                self.kv_cache = self.kv_cache.to(torch.float8_e4m3fn)
+                
+                # Update attention module references for unified layout
+                # Find attention modules and re-assign their cache views
+                attn_modules = [m for m in self.model.modules() if hasattr(m, 'k_cache') and hasattr(m, 'v_cache')]
+                for layer_id, m in enumerate(attn_modules):
+                    m.k_cache = self.kv_cache[0, layer_id]
+                    m.v_cache = self.kv_cache[1, layer_id]
+            
+            if hasattr(self, 'k_cache') and self.k_cache is not None:
+                self.k_cache = self.k_cache.to(torch.float8_e4m3fn)
+                self.v_cache = self.v_cache.to(torch.float8_e4m3fn)
+            
+            logger.info(f"KV cache allocated with dtype: {torch.float8_e4m3fn}")
+        
+        return result
+    
+    model_runner_class.allocate_kv_cache = allocate_kv_cache_with_quant
 
 
-# Model Runner patching
 def patch_model_runner(model_runner):
     """
     Patch ModelRunner with KV cache quantization support.
@@ -221,8 +274,8 @@ def patch_model_runner(model_runner):
     
     model_runner._kv_quant_patched = True
     
-    # Store original allocate_kv_cache
-    if hasattr(model_runner, 'allocate_kv_cache'):
+    # Store original allocate_kv_cache (instance level)
+    if hasattr(model_runner, 'allocate_kv_cache') and not hasattr(model_runner.__class__, '_allocate_kv_cache_patched'):
         original_allocate = model_runner.allocate_kv_cache
         
         def allocate_kv_cache_with_quant(*args, **kwargs):
@@ -247,7 +300,17 @@ def get_kv_cache_with_dequant(*args, **kwargs):
             # Dequantize if needed
             if result is not None and model_runner._kv_cache_strategy is not None:
                 k, v = result
-                k, v = model_runner._kv_cache_strategy.dequantize_kv_for_compute(k, v)
+                
+                # For FP8 with custom kernel, skip dequantization - kernel handles it
+                strategy = model_runner._kv_cache_strategy
+                if hasattr(strategy, 'name') and 'fp8' in strategy.name.lower():
+                    has_kernel = getattr(strategy, 'has_triton_kernel', lambda: False)()
+                    if has_kernel:
+                        # Skip dequantization - kernel will handle it
+                        return result
+                
+                # Dequantize for non-FP8 or when kernel not available
+                k, v = strategy.dequantize_kv_for_compute(k, v)
                 result = (k, v)
             
             return result
@@ -290,3 +353,93 @@ def _init_runner_kv_quantization(model_runner):
             num_layers, max_num_seqs, max_seq_len, num_heads,
             dtype=torch.float32, device=device
         )
+
+
+# Attention class patching
+def patch_attention_class():
+    """
+    Patch Attention class to use custom FP8 Triton kernel when available.
+    This is called during extension initialization.
+    """
+    import warnings
+    
+    try:
+        from diffulex.attention.attn_impl import Attention
+        from diffulex_kernel import chunked_prefill_attn_unified
+    except ImportError:
+        return
+    
+    # Store original forward
+    if hasattr(Attention, '_original_forward'):
+        return  # Already patched
+    
+    Attention._original_forward = Attention.forward
+    
+    def forward_with_fp8_kernel(self, q, k, v, mask=None):
+        """Forward that uses custom FP8 unified kernel when available."""
+        from einops import rearrange
+        from diffulex.attention import fetch_attn_metadata
+        from diffulex_kernel import (
+            store_kv_cache_distinct_layout,
+            store_kv_cache_unified_layout,
+            chunked_prefill_attn_unified,
+        )
+        from .context import get_kv_cache_strategy
+        
+        # Reshape
+        q = rearrange(q, "s (nh hd) -> s nh hd", nh=self.num_heads, hd=self.head_dim)
+        k = rearrange(k, "s (nkvh hd) -> s nkvh hd", nkvh=self.num_kv_heads, hd=self.head_dim)
+        v = rearrange(v, "s (nkvh hd) -> s nkvh hd", nkvh=self.num_kv_heads, hd=self.head_dim)
+        
+        attn_metadata = fetch_attn_metadata()
+        k_cache, v_cache = self.k_cache, self.v_cache
+        is_unified_layout = attn_metadata.kv_cache_layout == "unified"
+        
+        # Store KV cache
+        if k_cache.numel() and v_cache.numel():
+            if attn_metadata.need_kv_cache_store:
+                store_kv_cache = store_kv_cache_unified_layout if is_unified_layout else store_kv_cache_distinct_layout
+                store_kv_cache(k, v, k_cache, v_cache, attn_metadata.slot_mapping, attn_metadata)
+        
+        # Try to use custom FP8 Triton kernel through strategy layer
+        strategy = get_kv_cache_strategy()
+        if strategy is not None and k_cache.dtype == torch.float8_e4m3fn and _HAS_FP8_TRITON_KERNEL:
+            try:
+                if hasattr(strategy, 'has_triton_kernel') and strategy.has_triton_kernel():
+                    # Get scales from strategy (per-tensor running max)
+                    # For now use default scales (1.0) as scale management needs integration
+                    num_reqs = len(attn_metadata.context_lens)
+                    k_scale = torch.tensor(1.0, dtype=torch.float32, device=q.device)
+                    v_scale = torch.tensor(1.0, dtype=torch.float32, device=q.device)
+                    
+                    # Call unified FP8 kernel through strategy
+                    # This handles both Stage 1 (cached FP8 KV) and Stage 2 (new BF16 KV)
+                    o = strategy.triton_attention(
+                        q=q,
+                        k=k,  # New K (BF16) for Stage 2
+                        v=v,  # New V (BF16) for Stage 2
+                        k_cache=k_cache,  # Cached K (FP8) for Stage 1
+                        v_cache=v_cache,  # Cached V (FP8) for Stage 1
+                        attn_metadata=attn_metadata,
+                        k_scale=k_scale,
+                        v_scale=v_scale,
+                    )
+                    if o is not None:
+                        return rearrange(o, "s nh hd -> s (nh hd)").contiguous()
+            except Exception as e:
+                import logging
+                logger = logging.getLogger(__name__)
+                logger.debug(f"FP8 unified kernel failed: {e}")
+                pass  # Fallback to standard kernel
+        
+        # Standard kernel with on-the-fly dequantization
+        if k_cache.dtype == torch.float8_e4m3fn:
+            k_cache_bf16 = k_cache.to(torch.bfloat16)
+            v_cache_bf16 = v_cache.to(torch.bfloat16)
+        else:
+            k_cache_bf16, v_cache_bf16 = k_cache, v_cache
+            
+        o = chunked_prefill_attn_unified(q, k, v, k_cache_bf16, v_cache_bf16, attn_metadata)
+        return rearrange(o, "s nh hd -> s (nh hd)").contiguous()
+    
+    Attention.forward = forward_with_fp8_kernel
diff --git a/diffulex/extensions/quantization/layer_patch.py b/diffulex/extensions/quantization/layer_patch.py
index 988889de..8de87271 100644
--- a/diffulex/extensions/quantization/layer_patch.py
+++ b/diffulex/extensions/quantization/layer_patch.py
@@ -101,11 +101,17 @@ def _quantized_weight_loader(self, param: nn.Parameter, loaded_weight: torch.Ten
                 return  # No quantization enabled
             
             # Step 3: Check if we should quantize this layer
-            # Skip if already quantized or if it's not a weight we want to quantize
+            # Skip if already quantized (online or offline)
             if self.has_quantized_weight() or self.has_offline_quantized_weight():
                 return
             
-            # Step 4: Quantize weight immediately after loading
+            # Step 4: Skip offline quantization strategies
+            # Offline quantized weights (GPTQ/AWQ) are loaded via buffers, not weight param
+            # They are processed in _post_process_loaded_weights after loading
+            if hasattr(strategy, 'is_offline_quantized') and strategy.is_offline_quantized:
+                return
+            
+            # Step 5: Online quantization for BF16/FP16/FP32 weights
             try:
                 # Get the loaded weight data
                 weight = param.data
@@ -114,17 +120,21 @@ def _quantized_weight_loader(self, param: nn.Parameter, loaded_weight: torch.Ten
                 if weight.dtype not in [torch.bfloat16, torch.float16, torch.float32]:
                     return
                 
-                # Quantize weight
                 q_weight, w_meta = strategy.quantize_weight_for_kernel(weight)
-                w_scale = w_meta.get("scale")
+                w_scale = w_meta.get("scales") if w_meta.get("scales") is not None else w_meta.get("scale")
                 w_zero = w_meta.get("zero_point")
                 
-                # Step 5: Store quantized weight
+                # Check if actual quantization happened (has scale means quantized)
+                if w_scale is None:
+                    # No quantization (e.g., BF16 strategy), keep original weight
+                    return
+                
+                # Step 6: Store quantized weight
                 self.set_quantized_weight(q_weight, w_scale, w_zero)
                 
-                # Step 6: Delete original weight to save memory
+                # Step 7: Delete original weight to save memory
                 # Replace param.data with empty tensor to free memory
-                # The actual data is now stored in quant_weight_int8 buffer
+                # The actual data is now stored in quant_weight buffer
                 param.data = torch.empty(0, dtype=weight.dtype, device=weight.device)
                 
                 # Remove from parameters (convert to buffer or just delete)
diff --git a/diffulex/extensions/quantization/linear_plan_builder.py b/diffulex/extensions/quantization/linear_plan_builder.py
index 232ab63a..be3e0135 100644
--- a/diffulex/extensions/quantization/linear_plan_builder.py
+++ b/diffulex/extensions/quantization/linear_plan_builder.py
@@ -84,6 +84,12 @@ def _build_online_plan(layer: torch.nn.Module, example_x: torch.Tensor,
     if hasattr(layer, '_quant_strategy'):
         strategy = layer._quant_strategy
     
+    # If strategy not set on layer, get from context
+    if strategy is None:
+        from .context import get_linear_strategy
+        quant_kind = getattr(layer, 'quant_kind', 'other')
+        strategy = get_linear_strategy(quant_kind)
+    
     if strategy is None:
         return _build_bf16_plan(layer, example_x, bias)
     
diff --git a/diffulex/extensions/quantization/linear_plans.py b/diffulex/extensions/quantization/linear_plans.py
index ccc97f61..7ffa6b68 100644
--- a/diffulex/extensions/quantization/linear_plans.py
+++ b/diffulex/extensions/quantization/linear_plans.py
@@ -136,15 +136,12 @@ def __call__(self, x: torch.Tensor) -> torch.Tensor:
         return self._strategy.linear_forward(
             x, None, self._bias,
             quant_kind=self._quant_kind,
-            gptq_qweight=self._qweight,
-            gptq_qzeros=self._qzeros,
-            gptq_scales=self._scales,
-            gptq_g_idx=self._g_idx,
-            weight_bits=self._bits,
-            use_v2_format=False,
-            out_features=self._out_features,
-            in_features=self._in_features,
-            group_size=self._group_size,
+            qweight=self._qweight,
+            qzeros=self._qzeros,
+            scales=self._scales,
+            g_idx=self._g_idx,
+            bits=self._bits,
+            is_shuffled=self._is_shuffled,
         )
     
     def get_signature(self) -> ForwardPlanSig:
diff --git a/diffulex/extensions/quantization/loader_patch.py b/diffulex/extensions/quantization/loader_patch.py
index 77571dc4..66535029 100644
--- a/diffulex/extensions/quantization/loader_patch.py
+++ b/diffulex/extensions/quantization/loader_patch.py
@@ -46,61 +46,122 @@ def patch_loader():
         if _loader_patched:  # Check again after acquiring lock
             return True
         
-        try:
-            import diffulex.utils.loader as loader_module
-        except ImportError:
-            _patch_lock = False
-            return False
-    
-    # Patch load_checkpoint function
-    if hasattr(loader_module, 'load_checkpoint'):
-        _original_load_checkpoint = loader_module.load_checkpoint
+        # Use sys.modules to avoid re-triggering import hooks
+        import sys
+        loader_module = sys.modules.get('diffulex.utils.loader')
+        if loader_module is None:
+            try:
+                import diffulex.utils.loader as loader_module
+            except ImportError:
+                _patch_lock = False
+                return False
         
-        def quantized_load_checkpoint(checkpoint_path: str, *args, **kwargs):
-            """Load checkpoint with quantization detection."""
-            state_dict = _original_load_checkpoint(checkpoint_path, *args, **kwargs)
+        # Patch load_checkpoint function
+        if hasattr(loader_module, 'load_checkpoint'):
+            _original_load_checkpoint = loader_module.load_checkpoint
             
-            # Detect quantization format
-            quant_config = _detect_quantization_config(state_dict)
+            def quantized_load_checkpoint(checkpoint_path: str, *args, **kwargs):
+                """Load checkpoint with GPTQ/AWQ weights support."""
+                state_dict = _original_load_checkpoint(checkpoint_path, *args, **kwargs)
+                
+                # Check for separate GPTQ/AWQ weights file
+                # This merges pre-quantized weights without auto-detecting format
+                # User must explicitly specify weight_quant_method when loading
+                gptq_state_dict = _load_gptq_weights_file(checkpoint_path)
+                if gptq_state_dict is not None:
+                    # Merge GPTQ weights into state_dict (keys are already converted)
+                    state_dict.update(gptq_state_dict)
+                
+                # Note: No auto-detection per user requirement.
+                # User must explicitly set weight_quant_method="gptq_w4a16" etc.
+                state_dict['_quantization_config'] = None
+                
+                return state_dict
             
-            if quant_config is not None:
-                # Process weights for quantization
-                state_dict = _process_quantized_weights(state_dict, quant_config)
-            
-            # Attach quantization config to state dict
-            state_dict['_quantization_config'] = quant_config
-            
-            return state_dict
+            loader_module.load_checkpoint = quantized_load_checkpoint
         
-        loader_module.load_checkpoint = quantized_load_checkpoint
-    
-    # Patch load_model function
-    if hasattr(loader_module, 'load_model'):
-        _original_load_model = loader_module.load_model
-        
-        def quantized_load_model(model: torch.nn.Module, checkpoint_path: str,
-                                *args, config=None, **kwargs):
-            """Load model with quantization support."""
-            # Check if config has quantization settings
-            quant_config = _get_quant_config_from_model_config(config)
-            
-            if quant_config is not None:
-                # Initialize quantization for model
-                _init_model_quantization(model, quant_config)
+        # Patch load_model function
+        if hasattr(loader_module, 'load_model'):
+            _original_load_model = loader_module.load_model
             
-            # Call original load
-            result = _original_load_model(model, checkpoint_path, *args, config=config, **kwargs)
+            def quantized_load_model(model: torch.nn.Module, config, *args, **kwargs):
+                """Load model with quantization support."""
+                import os
+                
+                # Check if config has quantization settings
+                quant_config = _get_quant_config_from_model_config(config)
+                
+                if quant_config is not None:
+                    # Initialize quantization for model
+                    _init_model_quantization(model, quant_config)
+                
+                # Call original load (loads standard weights)
+                result = _original_load_model(model, config, *args, **kwargs)
+                
+                # Load GPTQ/AWQ weights if needed
+                if quant_config is not None and 'gptq' in quant_config.get('format', '').lower():
+                    model_path = config.model if hasattr(config, 'model') else str(config)
+                    gptq_state_dict = _load_gptq_weights_file(model_path)
+                    if gptq_state_dict is not None:
+                        # Group GPTQ weights by layer
+                        layer_weights = {}
+                        for key, tensor in gptq_state_dict.items():
+                            # Parse key like "model.layers.0.mlp.down_proj.gptq_qweight"
+                            parts = key.rsplit('.', 1)
+                            if len(parts) == 2:
+                                module_name, buffer_name = parts
+                                if module_name not in layer_weights:
+                                    layer_weights[module_name] = {}
+                                layer_weights[module_name][buffer_name] = tensor
+                        
+                        # Set offline quantized weights for each layer
+                        bits = quant_config.get('bits', 4)
+                        group_size = quant_config.get('group_size', 128)
+                        for module_name, buffers in layer_weights.items():
+                            try:
+                                module = model.get_submodule(module_name)
+                                if hasattr(module, 'set_offline_quantized_weight'):
+                                    # Map buffer names from gptq_* to standard names
+                                    qweight = buffers.get('gptq_qweight')
+                                    qzeros = buffers.get('gptq_qzeros')
+                                    scales = buffers.get('gptq_scales')
+                                    g_idx = buffers.get('gptq_g_idx')
+                                    
+                                    if qweight is not None and qzeros is not None and scales is not None:
+                                        # Move tensors to model's device before setting
+                                        device = next(module.parameters()).device if list(module.parameters()) else 'cpu'
+                                        qweight = qweight.to(device)
+                                        qzeros = qzeros.to(device)
+                                        scales = scales.to(device)
+                                        if g_idx is not None and g_idx.numel() > 0:
+                                            g_idx = g_idx.to(device)
+                                        
+                                        module.set_offline_quantized_weight(
+                                            qweight=qweight,
+                                            qzeros=qzeros,
+                                            scales=scales,
+                                            g_idx=g_idx if g_idx is not None and g_idx.numel() > 0 else None,
+                                            bits=bits,
+                                            group_size=group_size,
+                                            format_type='gptq'
+                                        )
+                            except Exception:
+                                pass
+                
+                # Post-process loaded weights (prepare/shuffle if needed)
+                if quant_config is not None:
+                    _post_process_loaded_weights(model, quant_config)
+                
+                return result
             
-            # Post-process loaded weights
-            if quant_config is not None:
-                _post_process_loaded_weights(model, quant_config)
-            
-            return result
+            loader_module.load_model = quantized_load_model
         
-        loader_module.load_model = quantized_load_model
-    
-    _loader_patched = True
-    _patch_lock = False
+        _loader_patched = True
+        _patch_lock = False
+        return True
+    except Exception:
+        _patch_lock = False
+        return False
     return True
 
 
@@ -152,6 +213,74 @@ def _detect_quantization_config(state_dict: Dict[str, torch.Tensor]) -> Optional
     return config
 
 
+def _load_gptq_weights_file(checkpoint_path: str) -> Optional[Dict[str, torch.Tensor]]:
+    """
+    Load separate GPTQ/AWQ weights file if exists.
+    
+    Some models store quantized weights in separate files like:
+    - model_quantized_gptq.safetensors
+    - model_quantized_awq.safetensors
+    
+    Returns:
+        State dict with quantized weights (keys converted to match layer buffer names),
+        or None if not found
+    """
+    import os
+    
+    # Get checkpoint directory
+    checkpoint_dir = os.path.dirname(checkpoint_path) if os.path.isfile(checkpoint_path) else checkpoint_path
+    
+    # Possible quantized weights file names
+    quant_file_names = [
+        'model_quantized_gptq.safetensors',
+        'model_quantized_awq.safetensors',
+        'model_quantized.safetensors',
+        'gptq_model.safetensors',
+        'awq_model.safetensors',
+    ]
+    
+    for file_name in quant_file_names:
+        quant_path = os.path.join(checkpoint_dir, file_name)
+        if os.path.exists(quant_path):
+            try:
+                # Load quantized weights
+                if quant_path.endswith('.safetensors'):
+                    from safetensors.torch import load_file
+                    state_dict = load_file(quant_path)
+                else:
+                    state_dict = torch.load(quant_path, map_location='cpu')
+                
+                # Convert to regular dict if needed
+                if not isinstance(state_dict, dict):
+                    state_dict = dict(state_dict)
+                
+                # Convert keys to match layer buffer names
+                # e.g., "model.layers.0.self_attn.q_proj.qweight" -> "model.layers.0.self_attn.q_proj.gptq_qweight"
+                converted_state_dict = {}
+                for key, value in state_dict.items():
+                    new_key = key
+                    # GPTQ format: convert qweight/qzeros/scales/g_idx to gptq_*
+                    if '.qweight' in key and not key.endswith('.gptq_qweight'):
+                        new_key = key.replace('.qweight', '.gptq_qweight')
+                    elif '.qzeros' in key and not key.endswith('.gptq_qzeros'):
+                        new_key = key.replace('.qzeros', '.gptq_qzeros')
+                    elif '.g_idx' in key and not key.endswith('.gptq_g_idx'):
+                        new_key = key.replace('.g_idx', '.gptq_g_idx')
+                    elif key.endswith('.scales') and not key.endswith('.gptq_scales'):
+                        # Be careful not to match other scales, only exact .scales suffix
+                        parts = key.split('.')
+                        if parts[-1] == 'scales':
+                            new_key = key[:-7] + '.gptq_scales'  # Replace .scales with .gptq_scales
+                    converted_state_dict[new_key] = value
+                
+                return converted_state_dict
+            except Exception:
+                # If loading fails, try next file
+                continue
+    
+    return None
+
+
 def _infer_bits_from_qweight(qweight: torch.Tensor) -> int:
     """Infer quantization bits from qweight tensor."""
     # qweight is int32 packed, each element contains 32/bits weights
@@ -228,44 +357,68 @@ def _process_quantized_weights(state_dict: Dict[str, torch.Tensor],
 
 
 def _get_quant_config_from_model_config(config) -> Optional[Dict[str, Any]]:
-    """Extract quantization config from model config."""
-    if config is None:
-        return None
-    
-    # Check for quantization config attributes
-    quant_config = getattr(config, 'quantization_config', None)
-    if quant_config is not None:
-        return quant_config
-    
-    # Check for individual attributes
-    weight_quant = getattr(config, 'weight_quant_method', None)
-    if weight_quant is not None and weight_quant != 'bf16':
-        return {
-            'format': weight_quant,
-            'bits': 4 if '4' in weight_quant else 8,
-            'group_size': getattr(config, 'quant_group_size', 128),
-        }
+    """Extract quantization config from model config or global config."""
+    # First check model config
+    if config is not None:
+        # Check for quantization config attributes
+        quant_config = getattr(config, 'quantization_config', None)
+        if quant_config is not None:
+            return quant_config
+        
+        # Check for individual attributes
+        weight_quant = getattr(config, 'weight_quant_method', None)
+        if weight_quant is not None and weight_quant != 'bf16':
+            return {
+                'format': weight_quant,
+                'bits': 4 if '4' in weight_quant else 8,
+                'group_size': getattr(config, 'quant_group_size', 128),
+            }
+    
+    # Then check global quantization config (set via quantization.enable())
+    try:
+        from .bootstrap import get_config
+        global_config = get_config()
+        if global_config is not None:
+            weights_config = global_config.get('weights', {})
+            method = weights_config.get('method', 'bf16')
+            if method != 'bf16':
+                return {
+                    'format': method,
+                    'bits': 4 if '4' in method else 8,
+                    'group_size': weights_config.get('group_size', 128),
+                }
+    except Exception:
+        pass
     
     return None
 
 
 def _init_model_quantization(model: torch.nn.Module, quant_config: Dict[str, Any]):
     """Initialize quantization for model layers."""
-    # Create appropriate strategies
+    # Create appropriate strategies based on format
     fmt = quant_config.get('format', 'bf16')
-    
-    if 'gptq' in fmt.lower():
-        from .strategies.linear_gptq_w4a16 import GPTQW4A16LinearStrategy
-        strategy = GPTQW4A16LinearStrategy(
-            bits=quant_config.get('bits', 4),
-            group_size=quant_config.get('group_size', 128)
-        )
+    bits = quant_config.get('bits', 4)
+    group_size = quant_config.get('group_size', 128)
+    
+    if 'gptq_w4a16' in fmt.lower():
+        from .strategies.linear_gptq_wxa16 import GPTQW4A16LinearStrategy
+        strategy = GPTQW4A16LinearStrategy(group_size=group_size)
+    elif 'gptq_w8a16' in fmt.lower():
+        from .strategies.linear_gptq_wxa16 import GPTQW8A16LinearStrategy
+        strategy = GPTQW8A16LinearStrategy(group_size=group_size)
+    elif 'gptq_w2a16' in fmt.lower():
+        from .strategies.linear_gptq_wxa16 import GPTQW2A16LinearStrategy
+        strategy = GPTQW2A16LinearStrategy(group_size=group_size)
+    elif 'gptq_w3a16' in fmt.lower():
+        from .strategies.linear_gptq_wxa16 import GPTQW3A16LinearStrategy
+        strategy = GPTQW3A16LinearStrategy(group_size=group_size)
+    elif 'gptq' in fmt.lower():
+        # Default to w4a16 for generic gptq
+        from .strategies.linear_gptq_wxa16 import GPTQW4A16LinearStrategy
+        strategy = GPTQW4A16LinearStrategy(group_size=group_size)
     elif 'awq' in fmt.lower():
         from .strategies.linear_awq_w4a16 import AWQW4A16LinearStrategy
-        strategy = AWQW4A16LinearStrategy(
-            bits=quant_config.get('bits', 4),
-            group_size=quant_config.get('group_size', 128)
-        )
+        strategy = AWQW4A16LinearStrategy(bits=bits, group_size=group_size)
     else:
         return
     
@@ -273,6 +426,21 @@ def _init_model_quantization(model: torch.nn.Module, quant_config: Dict[str, Any
     from .context import set_linear_strategy
     for kind in ['attn', 'mlp', 'other']:
         set_linear_strategy(kind, strategy)
+    
+    # Initialize quantization for each linear layer
+    # This sets quant_kind so weight loader knows how to handle the weights
+    for name, module in model.named_modules():
+        # Check if this is a linear layer that supports quantization
+        if hasattr(module, 'init_quantization'):
+            # Determine quant_kind based on layer name
+            quant_kind = 'other'
+            if any(x in name for x in ['q_proj', 'k_proj', 'v_proj', 'o_proj']):
+                quant_kind = 'attn'
+            elif any(x in name for x in ['gate_proj', 'up_proj', 'down_proj']):
+                quant_kind = 'mlp'
+            
+            module.init_quantization(quant_kind)
+            module._quant_strategy = strategy
 
 
 def _post_process_loaded_weights(model: torch.nn.Module, quant_config: Dict[str, Any]):
@@ -282,6 +450,25 @@ def _post_process_loaded_weights(model: torch.nn.Module, quant_config: Dict[str,
     This prepares offline quantized weights for use.
     """
     fmt = quant_config.get('format', '')
+    bits = quant_config.get('bits', 4)
+    group_size = quant_config.get('group_size', 128)
+    
+    # Process GPTQ weights: set offline quantized state from loaded buffers
+    if 'gptq' in fmt.lower():
+        for name, module in model.named_modules():
+            if hasattr(module, 'set_offline_quantized_weight'):
+                # Check if GPTQ buffers were loaded
+                if hasattr(module, 'gptq_qweight'):
+                    g_idx = getattr(module, 'gptq_g_idx', None)
+                    module.set_offline_quantized_weight(
+                        qweight=module.gptq_qweight,
+                        qzeros=module.gptq_qzeros,
+                        scales=module.gptq_scales,
+                        g_idx=g_idx,
+                        bits=bits,
+                        group_size=group_size,
+                        format_type='gptq'
+                    )
     
     # Prepare GPTQ weights (shuffle if needed)
     if 'gptq' in fmt.lower():
diff --git a/diffulex/extensions/quantization/quantize_model.py b/diffulex/extensions/quantization/quantize_model.py
index 3b7331b1..068733cd 100644
--- a/diffulex/extensions/quantization/quantize_model.py
+++ b/diffulex/extensions/quantization/quantize_model.py
@@ -1,772 +1,1001 @@
 #!/usr/bin/env python3
-"""离线量化脚本：将模型权重量化为 vLLM 标准 GPTQ/AWQ 格式
+"""
+离线量化脚本：将 AutoModelForDiffusionLM 模型权重量化为 GPTQ/AWQ/Marlin 格式
 
-支持两种量化格式（对齐 vLLM 权重格式）：
-- GPTQ: qweight/qzeros 为 int32 packed，scales 为 fp16，g_idx 可选（常见 desc_act=False 时为空）
-- GPTQ_MARLIN: 导出 Marlin-ready 的 GPTQ 权重布局（qweight 已 repack，scales 已 permute，zp 为空）
-- AWQ : qweight/qzeros 为 int32 packed，scales 为 fp16
+支持方法:
+- rtn: Round-To-Nearest，无校准，快速
+- gptq: Hessian-based GPTQ，需校准
+- awq: Activation-aware AWQ，需校准  
+- gptq_marlin: GPTQ + Marlin重排，需校准，强制sym=True
+- awq_marlin: AWQ + Marlin重排，需校准
 
 使用方法:
     python -m diffulex.extensions.quantization.quantize_model \
         --model-path /path/to/model \
         --output-path /path/to/output \
-        --quant-format gptq_marlin \
-        --group-size 128 \
+        --quant-method gptq \
         --bits 4 \
-        --quant-method auto \
-        --calib-text-file /path/to/calib.txt \
-        --calib-num-samples 128 \
-        --calib-seq-len 512
-
-说明:
-- `quant-method=simple`：沿用当前"直接分组量化/舍入"的旧实现（不需要校准数据，不是真 GPTQ/AWQ）。
-- `quant-method=auto`：使用 `auto-gptq` / `awq(autoawq)` 做真正的校准量化，然后导出为 vLLM/Diffulex 可加载的权重格式。
+        --group-size 128 \
+        --calib-text-file /path/to/calib.txt
 """
 
 from __future__ import annotations
 
 import argparse
-import os
+import gc
 import json
+import os
 import random
 import shutil
 from pathlib import Path
-from typing import Optional
+from typing import Dict, List, Optional, Tuple, Any
 
 import torch
 import torch.nn as nn
+from safetensors.torch import load_file, save_file
 from tqdm import tqdm
-from safetensors.torch import save_file
-
-from transformers import AutoConfig, AutoTokenizer
-from safetensors import safe_open
-from glob import glob
+from transformers import AutoConfig, AutoTokenizer, AutoModel
 
 
 def _require_vllm():
+    """导入vLLM量化工具函数。"""
     try:
-        from vllm.scalar_type import scalar_types  # type: ignore
-        from vllm.model_executor.layers.quantization.utils.quant_utils import (  # type: ignore
-            awq_pack,
-            gptq_pack,
-            pack_cols,
+        from vllm.scalar_type import scalar_types
+        from vllm.model_executor.layers.quantization.utils.quantize_utils import (
             quantize_weights,
+            pack_cols,
         )
-    except Exception as e:  # pragma: no cover
-        raise RuntimeError(
-            "离线 GPTQ/AWQ 打包已切换到 vLLM 标准格式，需要可 import 的 vLLM。"
-        ) from e
-    return scalar_types, quantize_weights, gptq_pack, awq_pack, pack_cols
+        return scalar_types, quantize_weights, pack_cols
+    except Exception as e:
+        raise RuntimeError("需要vLLM来执行量化打包操作") from e
 
 
 def _require_vllm_marlin():
-    # Marlin 预处理依赖 CUDA custom ops
+    """导入vLLM Marlin相关函数。"""
     try:
-        from vllm import _custom_ops as ops  # type: ignore
-        from vllm.model_executor.layers.quantization.utils.marlin_utils import (  # type: ignore
+        import vllm._custom_ops as ops
+        from vllm.model_executor.layers.quantization.utils.marlin_utils import (
             marlin_permute_scales,
         )
-    except Exception as e:  # pragma: no cover
-        raise RuntimeError(
-            "导出 gptq_marlin 格式需要可 import 的 vLLM Marlin（含 CUDA custom ops）。"
-        ) from e
-    return ops, marlin_permute_scales
+        return ops, marlin_permute_scales
+    except Exception as e:
+        raise RuntimeError("需要vLLM Marlin支持（含CUDA custom ops）") from e
 
 
-def _require_auto_gptq():
-    try:
-        from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig  # type: ignore
-    except Exception as e:  # pragma: no cover
-        raise RuntimeError(
-            "未能导入 auto-gptq。请确认已在当前 .venv 安装（例如：BUILD_CUDA_EXT=0 pip install auto-gptq）。"
-        ) from e
-    return AutoGPTQForCausalLM, BaseQuantizeConfig
-
+# =============================================================================
+# Pack 函数
+# =============================================================================
 
-def _require_awq():
-    try:
-        from awq import AutoAWQForCausalLM  # type: ignore
-    except Exception as e:  # pragma: no cover
-        raise RuntimeError(
-            "未能导入 awq（autoawq 的导入名是 `awq`）。"
-        ) from e
-    return AutoAWQForCausalLM
-
-
-def _load_calib_texts(
-    calib_text_file: str, *, num_samples: int, seed: int
-) -> list[str]:
-    p = Path(calib_text_file)
-    if not p.exists():
-        raise FileNotFoundError(f"calib_text_file 不存在: {calib_text_file}")
-    lines = [ln.strip() for ln in p.read_text(encoding="utf-8", errors="ignore").splitlines()]
-    lines = [ln for ln in lines if ln]
-    if not lines:
-        raise ValueError(f"calib_text_file 为空: {calib_text_file}")
-    if num_samples <= 0:
-        raise ValueError(f"calib_num_samples 必须 > 0, got {num_samples}")
-    if len(lines) <= num_samples:
-        return lines[:num_samples]
-    rng = random.Random(seed)
-    return rng.sample(lines, k=num_samples)
-
-
-def _build_autogptq_examples(
-    tokenizer, texts: list[str], *, seq_len: int
-) -> list[dict[str, torch.Tensor]]:
-    if seq_len <= 0:
-        raise ValueError(f"calib_seq_len 必须 > 0, got {seq_len}")
-
-    # AutoGPTQ 会自行 collate/pad；这里用 fixed max_length 保持输入一致。
-    examples: list[dict[str, torch.Tensor]] = []
-    for t in texts:
-        enc = tokenizer(
-            t,
-            return_tensors="pt",
-            truncation=True,
-            max_length=seq_len,
-            padding="max_length",
-        )
-        examples.append(
-            {
-                "input_ids": enc["input_ids"],
-                "attention_mask": enc.get("attention_mask", torch.ones_like(enc["input_ids"])),
-            }
-        )
-    return examples
-
-
-@torch.inference_mode()
-def _quantize_to_vllm_gptq(
-    weight: torch.Tensor, *, group_size: int, bits: int, use_v2_format: bool = False
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Quantize and pack weights into vLLM GPTQ checkpoint format.
-
-    Input:
-      weight: fp32 [N, K] (PyTorch Linear weight)
-    Output (vLLM format):
-      qweight: int32 [K/pack, N]
-      qzeros : int32 [K/group, N/pack]   (GPTQ v1 stores (zeros - 1); v2 stores zeros)
-      scales : fp16  [K/group, N]
-      g_idx  : int32 empty tensor (desc_act=False)
+def gptq_pack(qweight_int: torch.Tensor, bits: int) -> torch.Tensor:
     """
-    scalar_types, quantize_weights, gptq_pack, _, pack_cols = _require_vllm()
-    # vLLM GPTQConfig mentions 2/3/4/8, but the standard vLLM int32 packing
-    # used by `gptq_pack/pack_cols` requires 32 % bits == 0.
-    # So we support 2/4/8 here; 3-bit would need a different packing scheme.
-    if bits not in (2, 4, 8):
-        raise ValueError(
-            f"GPTQ bits 仅支持 2/4/8（vLLM 标准 int32 pack 要求 32%bits==0），当前 bits={bits}"
-        )
-
-    # vLLM operates on (K, N)
-    w = weight.T.contiguous()
-    size_k, size_n = w.shape
-    group_size_norm = size_k if group_size == -1 else group_size
-    if group_size_norm <= 0 or size_k % group_size_norm != 0:
-        raise ValueError(f"Invalid group_size={group_size} for in_features={size_k}")
-
-    if bits == 2:
-        quant_type = scalar_types.uint2b2
-    elif bits == 4:
-        quant_type = scalar_types.uint4b8
-    else:  # bits == 8
-        quant_type = scalar_types.uint8b128
+    将[K, N]的int张量pack成[K//pack, N]的int32张量（GPTQ格式）。
+    
+    Pack规则（小端序）：
+    qweight[k//pack, n] = sum(qweight_int[k+i, n] << (bits*i) for i in range(pack))
+    """
+    pack_factor = 32 // bits
+    size_k, size_n = qweight_int.shape
+    assert size_k % pack_factor == 0, f"K={size_k} must be divisible by pack_factor={pack_factor}"
+    
+    qweight = torch.zeros((size_k // pack_factor, size_n), 
+                          dtype=torch.int32, device=qweight_int.device)
+    
+    for i in range(pack_factor):
+        qweight |= (qweight_int[i::pack_factor].to(torch.int32) << (bits * i))
+    
+    return qweight
 
-    _, w_q, w_s, _ = quantize_weights(w, quant_type, group_size_norm, zero_points=False)
 
+def awq_pack(qweight_int: torch.Tensor, bits: int) -> torch.Tensor:
+    """
+    将[K, N]的int张量pack成[K, N//pack]的int32张量（AWQ格式）。
+    
+    Pack规则：
+    qweight[k, n//pack] = sum(qweight_int[k, n+i] << (bits*i) for i in range(pack))
+    """
     pack_factor = 32 // bits
-    qweight = gptq_pack(w_q, bits, size_k, size_n).contiguous()  # [K/pack, N]
-
-    num_groups = size_k // group_size_norm
-    zeros = torch.full(
-        (num_groups, size_n),
-        int(getattr(quant_type, "bias", 0)),
-        dtype=torch.int32,
-        device=w.device,
-    )
-    # GPTQ v1 stores zeros-1 in the checkpoint.
-    zeros_to_store = zeros if use_v2_format else (zeros - 1)
-    qzeros = pack_cols(zeros_to_store, bits, num_groups, size_n).contiguous()  # [K/group, N/pack]
+    size_k, size_n = qweight_int.shape
+    assert size_n % pack_factor == 0, f"N={size_n} must be divisible by pack_factor={pack_factor}"
+    
+    qweight = torch.zeros((size_k, size_n // pack_factor),
+                          dtype=torch.int32, device=qweight_int.device)
+    
+    for i in range(pack_factor):
+        qweight |= (qweight_int[:, i::pack_factor].to(torch.int32) << (bits * i))
+    
+    return qweight
 
-    scales = w_s.to(torch.float16).contiguous()  # [K/group, N]
-    g_idx = torch.empty((0,), dtype=torch.int32, device=w.device)
-    return qweight, qzeros, scales, g_idx
 
+def gptq_pack_zeros(zeros_int: torch.Tensor, bits: int) -> torch.Tensor:
+    """
+    Pack zeros tensor [num_groups, N] to [num_groups, N//pack].
+    GPTQ v1 format stores (zeros - 1).
+    """
+    pack_factor = 32 // bits
+    num_groups, size_n = zeros_int.shape
+    assert size_n % pack_factor == 0
+    
+    # v1 format: store zeros - 1
+    zeros_v1 = zeros_int - 1
+    
+    qzeros = torch.zeros((num_groups, size_n // pack_factor),
+                         dtype=torch.int32, device=zeros_int.device)
+    
+    for i in range(pack_factor):
+        qzeros |= (zeros_v1[:, i::pack_factor].to(torch.int32) << (bits * i))
+    
+    return qzeros
 
-@torch.inference_mode()
-def _quantize_to_vllm_gptq_marlin(
-    weight: torch.Tensor, *, group_size: int, bits: int
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Quantize weights and export marlin-ready GPTQ layout.
 
-    该导出格式对齐 vLLM `MarlinLinearKernel.process_weights_after_loading` 的结果：
-    - qweight: 已执行 `gptq_marlin_repack`
-    - scales : 已执行 `marlin_permute_scales`
-    - qzeros : 置空（Marlin GPTQ symmetric 路径不使用 runtime zp）
-    - g_idx  : 空（desc_act=False）
+# =============================================================================
+# Marlin 重排
+# =============================================================================
 
-    注意：需要在 CUDA 上执行（`gptq_marlin_repack` 为 CUDA op）。
+def repack_gptq_to_marlin(
+    qweight: torch.Tensor,      # GPTQ packed: [K//pack, N]
+    scales: torch.Tensor,       # [K//group, N]
+    bits: int,
+    size_k: int,
+    size_n: int,
+    group_size: int,
+    device: str = "cuda",
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    将GPTQ格式权重重排为Marlin格式。
+    
+    Returns:
+        marlin_qweight: [K//16, N*16//32]
+        marlin_scales: [K//group, N]
+        marlin_workspace: [N*16//32]
     """
-    if weight.device.type != "cuda":
-        raise ValueError("gptq_marlin 导出需要 device=cuda（Marlin repack 为 CUDA op）")
-
     ops, marlin_permute_scales = _require_vllm_marlin()
-
-    # 先按 vLLM 标准 GPTQ（symmetric, zero_points=False）量化并打包
-    qweight, _qzeros, scales, g_idx = _quantize_to_vllm_gptq(
-        weight, group_size=group_size, bits=bits, use_v2_format=False
-    )
-
-    # vLLM GPTQ packing 的 shape 基于 w=(K,N)；这里 size_k=in_features, size_n=out_features
-    size_k = weight.shape[1]
-    size_n = weight.shape[0]
-    group_size_norm = size_k if group_size == -1 else group_size
-
-    # desc_act=False 时 perm 为空
-    empty_perm = torch.empty((0,), dtype=torch.int32, device=weight.device)
-
+    
+    qweight = qweight.to(device)
+    scales = scales.to(device)
+    
+    # 空perm（desc_act=False）
+    empty_perm = torch.empty((0,), dtype=torch.int32, device=device)
+    
+    # 重排权重
     marlin_qweight = ops.gptq_marlin_repack(
         qweight.contiguous(),
         perm=empty_perm,
         size_k=size_k,
         size_n=size_n,
         num_bits=bits,
-        is_a_8bit=False,
+        is_a_8bit=(bits == 8),
     ).contiguous()
-
+    
+    # Permute scales
     marlin_scales = marlin_permute_scales(
-        scales.contiguous(),
+        scales.contiguous().to(torch.float16),
         size_k=size_k,
         size_n=size_n,
-        group_size=group_size_norm,
-        is_a_8bit=False,
+        group_size=group_size,
+        is_a_8bit=(bits == 8),
     ).contiguous()
+    
+    # 创建工作区
+    marlin_workspace = torch.zeros(
+        marlin_qweight.shape[1] * 32 // bits,
+        dtype=torch.int32,
+        device=device,
+    )
+    
+    return marlin_qweight.cpu(), marlin_scales.cpu(), marlin_workspace.cpu()
 
-    # Marlin GPTQ symmetric 不使用 runtime zero points，导出空 qzeros 保持一致性
-    marlin_qzeros = torch.empty((0,), dtype=torch.int32, device=weight.device)
-    marlin_g_idx = g_idx  # already empty
 
-    return marlin_qweight, marlin_qzeros, marlin_scales, marlin_g_idx
+def repack_awq_to_marlin(
+    qweight: torch.Tensor,      # AWQ packed: [K, N//pack]
+    scales: torch.Tensor,       # [K//group, N]
+    bits: int,
+    size_k: int,
+    size_n: int,
+    group_size: int,
+    device: str = "cuda",
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    将AWQ格式权重重排为Marlin格式。
+    
+    注意：AWQ需要先unpack到标准格式，再调用gptq_marlin_repack。
+    """
+    ops, marlin_permute_scales = _require_vllm_marlin()
+    
+    # AWQ unpack (简化实现，实际需要完整的unpack逻辑)
+    # 这里假设AWQ的pack是可逆的
+    pack_factor = 32 // bits
+    qweight_unpacked = torch.zeros((size_k, size_n), dtype=torch.int32, device=qweight.device)
+    
+    for i in range(pack_factor):
+        mask = (2 ** bits - 1) << (bits * i)
+        qweight_unpacked[:, i::pack_factor] = ((qweight >> (bits * i)) & ((1 << bits) - 1)).to(torch.int32)
+    
+    # 转为GPTQ格式再repack
+    qweight_gptq = gptq_pack(qweight_unpacked, bits)
+    
+    return repack_gptq_to_marlin(qweight_gptq, scales, bits, size_k, size_n, group_size, device)
 
 
-@torch.inference_mode()
-def _quantize_to_vllm_awq(
-    weight: torch.Tensor, *, group_size: int, bits: int
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Quantize and pack weights into vLLM AWQ checkpoint format.
+# =============================================================================
+# 量化算法
+# =============================================================================
 
-    Input:
-      weight: fp32 [N, K]
-    Output (vLLM format):
-      qweight: int32 [K, N/pack]
-      qzeros : int32 [K/group, N/pack]
-      scales : fp16  [K/group, N]
+def quantize_rtn(
+    weight: torch.Tensor,
+    bits: int,
+    group_size: int,
+    sym: bool,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Round-To-Nearest 量化。
+    
+    Returns:
+        qweight: packed weight
+        qzeros: packed zeros  
+        scales: scales
+        g_idx: empty tensor
     """
-    scalar_types, quantize_weights, _, awq_pack, _ = _require_vllm()
-    if bits != 4:
-        raise ValueError(f"AWQ 目前仅支持 4-bit，当前 bits={bits}")
-
-    w = weight.T.contiguous()
-    size_k, size_n = w.shape
-    group_size_norm = size_k if group_size == -1 else group_size
-    if group_size_norm <= 0 or size_k % group_size_norm != 0:
-        raise ValueError(f"Invalid group_size={group_size} for in_features={size_k}")
-
-    quant_type = scalar_types.uint4
-    _, w_q, w_s, w_zp = quantize_weights(w, quant_type, group_size_norm, zero_points=True)
-    if w_zp is None:
-        raise RuntimeError("AWQ zero_points=True 但未生成 zero points，vLLM 量化返回异常。")
-
-    qweight = awq_pack(w_q, bits, size_k, size_n).contiguous()  # [K, N/pack]
-    num_groups = size_k // group_size_norm
-    qzeros = awq_pack(w_zp.to(torch.int32), bits, num_groups, size_n).contiguous()  # [K/group, N/pack]
-    scales = w_s.to(torch.float16).contiguous()  # [K/group, N]
-    return qweight, qzeros, scales
-
-
-@torch.inference_mode()
-def _export_autogptq_to_vllm_weights(
-    *,
-    gptq_base_model: nn.Module,
-    quant_format: str,
-    target_modules: Optional[list[str]],
-    desc_act: bool,
+    weight = weight.float()
+    out_features, in_features = weight.shape
+    
+    if group_size == -1:
+        group_size = in_features
+    
+    num_groups = (in_features + group_size - 1) // group_size
+    max_q = 2 ** bits - 1
+    
+    # 存储量化结果
+    q_full = torch.zeros_like(weight, dtype=torch.int32)
+    scales_list = []
+    zeros_list = []
+    
+    for g in range(num_groups):
+        start = g * group_size
+        end = min(start + group_size, in_features)
+        w_group = weight[:, start:end]
+        
+        if sym:
+            # 对称量化
+            w_max = w_group.abs().max(dim=1, keepdim=True)[0]
+            scale = w_max / (2 ** (bits - 1) - 1)
+            scale = torch.clamp(scale, min=1e-5)
+            zero = torch.zeros(out_features, 1, dtype=torch.int32)
+            
+            q = torch.round(w_group / scale).to(torch.int32)
+            q = torch.clamp(q, -(2 ** (bits - 1)), 2 ** (bits - 1) - 1)
+            q = q + (2 ** (bits - 1))  # 映射到[0, max_q]
+        else:
+            # 非对称量化
+            w_min = w_group.min(dim=1, keepdim=True)[0]
+            w_max = w_group.max(dim=1, keepdim=True)[0]
+            scale = (w_max - w_min) / max_q
+            scale = torch.clamp(scale, min=1e-5)
+            zero = torch.round(-w_min / scale).to(torch.int32)
+            
+            q = torch.round(w_group / scale + zero).to(torch.int32)
+            q = torch.clamp(q, 0, max_q)
+        
+        q_full[:, start:end] = q
+        scales_list.append(scale.squeeze(1))
+        zeros_list.append(zero.squeeze(1))
+    
+    # 合并scales和zeros
+    scales = torch.stack(scales_list, dim=0).to(torch.float16)  # [num_groups, out_features]
+    zeros = torch.stack(zeros_list, dim=0)  # [num_groups, out_features]
+    
+    # Pack
+    qweight = gptq_pack(q_full.T.contiguous(), bits)  # [K//pack, N]
+    qzeros = gptq_pack_zeros(zeros, bits)  # [num_groups, N//pack]
+    g_idx = torch.empty((0,), dtype=torch.int32)
+    
+    return qweight, qzeros, scales, g_idx
+
+
+def quantize_gptq(
+    layer: nn.Linear,
+    calibration_inputs: List[torch.Tensor],
     bits: int,
     group_size: int,
-) -> dict[str, torch.Tensor]:
+    sym: bool = True,
+    damp_percent: float = 0.01,
+    device: str = "cuda",
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
-    从 auto-gptq 的量化后模型中抽取 qweight/qzeros/scales/g_idx，并按 vLLM/Diffulex 的命名导出。
-    - quant_format == "gptq": 直接导出 QuantLinear 的 buffers。
-    - quant_format == "gptq_marlin": 在导出前使用 vLLM Marlin 的 repack/permute，且导出空 qzeros/g_idx。
+    GPTQ算法量化。
+    
+    基于Hessian的逐列量化误差补偿。
     """
-    quantized_weights: dict[str, torch.Tensor] = {}
-
-    if quant_format not in ("gptq", "gptq_marlin"):
-        raise ValueError(f"Unexpected quant_format for auto-gptq export: {quant_format}")
-
-    if quant_format == "gptq_marlin":
-        if not torch.cuda.is_available():
-            raise RuntimeError("导出 gptq_marlin 需要 CUDA（vLLM Marlin repack 为 CUDA op）。")
-        ops, marlin_permute_scales = _require_vllm_marlin()
-
-    for module_name, module in gptq_base_model.named_modules():
-        # AutoGPTQ 的 QuantLinear（triton/cuda）会有这些 buffer
-        if not (hasattr(module, "qweight") and hasattr(module, "qzeros") and hasattr(module, "scales")):
-            continue
-
-        # 过滤：保持和旧脚本一致，默认不量化 lm_head
-        if "lm_head" in module_name:
-            continue
-        if target_modules and not any(t in module_name for t in target_modules):
-            continue
-
-        qweight = getattr(module, "qweight")
-        qzeros = getattr(module, "qzeros")
-        scales = getattr(module, "scales")
-        g_idx = getattr(module, "g_idx", None)
-
-        if not isinstance(qweight, torch.Tensor) or not isinstance(qzeros, torch.Tensor) or not isinstance(scales, torch.Tensor):
-            continue
-
-        if quant_format == "gptq":
-            quantized_weights[f"{module_name}.qweight"] = qweight.detach().cpu().contiguous()
-            quantized_weights[f"{module_name}.qzeros"] = qzeros.detach().cpu().contiguous()
-            quantized_weights[f"{module_name}.scales"] = scales.detach().cpu().contiguous()
-            if desc_act and isinstance(g_idx, torch.Tensor) and g_idx.numel() > 0:
-                quantized_weights[f"{module_name}.g_idx"] = g_idx.detach().to(dtype=torch.int32).cpu().contiguous()
-            else:
-                quantized_weights[f"{module_name}.g_idx"] = torch.empty((0,), dtype=torch.int32)
-            continue
-
-        # gptq_marlin 导出：用 vLLM 的 repack/permute 变成 Marlin-ready layout
-        in_features = int(getattr(module, "infeatures", 0))
-        out_features = int(getattr(module, "outfeatures", 0))
-        if in_features <= 0 or out_features <= 0:
-            # fallback：从张量形状推断（qweight shape: [K/pack, N]）
-            out_features = int(qweight.shape[1])
-            pack = 32 // bits
-            in_features = int(qweight.shape[0] * pack)
-
-        group_size_norm = in_features if group_size == -1 else group_size
-        empty_perm = torch.empty((0,), dtype=torch.int32, device="cuda")
-
-        qweight_cuda = qweight.contiguous().to(device="cuda")
-        scales_cuda = scales.contiguous().to(device="cuda", dtype=torch.float16)
-
-        marlin_qweight = ops.gptq_marlin_repack(
-            qweight_cuda,
-            perm=empty_perm,
-            size_k=in_features,
-            size_n=out_features,
-            num_bits=bits,
-            is_a_8bit=(bits == 8),
-        ).contiguous()
-        marlin_scales = marlin_permute_scales(
-            scales_cuda,
-            size_k=in_features,
-            size_n=out_features,
-            group_size=group_size_norm,
-            is_a_8bit=(bits == 8),
-        ).contiguous()
-
-        quantized_weights[f"{module_name}.qweight"] = marlin_qweight.detach().cpu().contiguous()
-        quantized_weights[f"{module_name}.qzeros"] = torch.empty((0,), dtype=torch.int32)
-        quantized_weights[f"{module_name}.scales"] = marlin_scales.detach().cpu().contiguous()
-        quantized_weights[f"{module_name}.g_idx"] = torch.empty((0,), dtype=torch.int32)
-
-    return quantized_weights
-
-
-@torch.inference_mode()
-def _export_awq_to_vllm_weights(
-    *,
-    awq_base_model: nn.Module,
-    target_modules: Optional[list[str]],
-) -> dict[str, torch.Tensor]:
+    layer = layer.to(device)
+    layer.eval()
+    
+    weight = layer.weight.data.float()  # [out_features, in_features]
+    out_features, in_features = weight.shape
+    
+    if group_size == -1:
+        group_size = in_features
+    
+    # 1. 收集校准数据
+    print(f"  Collecting calibration data for {layer.name if hasattr(layer, 'name') else 'layer'}...")
+    inputs_list = []
+    for x in calibration_inputs:
+        x = x.to(device)
+        if x.dim() == 3:
+            x = x.reshape(-1, x.shape[-1])
+        with torch.no_grad():
+            # 前向传播获取输入
+            inputs_list.append(x.cpu())
+    
+    # 2. 计算Hessian
+    H = torch.zeros((in_features, in_features), dtype=torch.float32, device=device)
+    num_samples = 0
+    for x in inputs_list:
+        x = x.to(device).float()
+        H.addmm_(x.T, x)
+        num_samples += x.shape[0]
+    H /= num_samples
+    
+    # 3. 添加阻尼
+    damp = damp_percent * torch.mean(torch.diag(H))
+    H += damp * torch.eye(in_features, device=device, dtype=H.dtype)
+    
+    # 4. Cholesky分解
+    try:
+        L = torch.linalg.cholesky(H)
+    except RuntimeError:
+        print("  Warning: Cholesky failed, falling back to RTN")
+        return quantize_rtn(weight, bits, group_size if group_size != in_features else -1, sym)
+    
+    # 5. 逐列量化
+    W = weight.clone().to(device)  # [out_features, in_features]
+    Q = torch.zeros_like(W, dtype=torch.int32)
+    
+    num_groups = (in_features + group_size - 1) // group_size
+    scales_list = []
+    zeros_list = []
+    
+    # 预计算所有组的量化参数
+    for g in range(num_groups):
+        start = g * group_size
+        end = min(start + group_size, in_features)
+        w_group = W[:, start:end]
+        
+        if sym:
+            w_max = w_group.abs().max(dim=1)[0]
+            scale = w_max / (2 ** (bits - 1) - 1)
+            scale = torch.clamp(scale, min=1e-5)
+            zero = torch.zeros(out_features, dtype=torch.int32, device=device)
+        else:
+            w_min = w_group.min(dim=1)[0]
+            w_max = w_group.max(dim=1)[0]
+            scale = (w_max - w_min) / (2 ** bits - 1)
+            scale = torch.clamp(scale, min=1e-5)
+            zero = torch.round(-w_min / scale).to(torch.int32)
+        
+        scales_list.append(scale)
+        zeros_list.append(zero)
+    
+    # 合并为tensor
+    scales_all = torch.stack(scales_list, dim=0).to(torch.float16)  # [num_groups, out_features]
+    zeros_all = torch.stack(zeros_list, dim=0)  # [num_groups, out_features]
+    
+    max_q = 2 ** bits - 1
+    
+    # 逐列处理
+    for i in range(in_features):
+        g = i // group_size
+        scale = scales_all[g]
+        zero = zeros_all[g]
+        
+        w_col = W[:, i]
+        
+        if sym:
+            q_col = torch.round(w_col / scale).to(torch.int32)
+            q_col = torch.clamp(q_col, -(2 ** (bits - 1)), 2 ** (bits - 1) - 1)
+            q_col = q_col + (2 ** (bits - 1))  # 映射到[0, max_q]
+            w_q = (q_col.float() - (2 ** (bits - 1))) * scale
+        else:
+            q_col = torch.round(w_col / scale + zero).to(torch.int32)
+            q_col = torch.clamp(q_col, 0, max_q)
+            w_q = (q_col.float() - zero.float()) * scale
+        
+        Q[:, i] = q_col
+        
+        # 误差补偿
+        err = w_col - w_q
+        if i < in_features - 1:
+            Li = L[i, i]
+            if Li > 1e-8:
+                W[:, i+1:] -= err.unsqueeze(1) * (L[i, i+1:] / Li).unsqueeze(0)
+    
+    # Pack
+    qweight = gptq_pack(Q.T.contiguous(), bits)
+    qzeros = gptq_pack_zeros(zeros_all, bits)
+    g_idx = torch.empty((0,), dtype=torch.int32)
+    
+    return qweight.cpu(), qzeros.cpu(), scales_all.cpu(), g_idx
+
+
+def quantize_awq(
+    layer: nn.Linear,
+    calibration_inputs: List[torch.Tensor],
+    bits: int,
+    group_size: int,
+    device: str = "cuda",
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
-    从 awq(pack 后)模型中抽取 qweight/qzeros/scales，并按 vLLM/Diffulex 的命名导出。
+    AWQ算法量化（简化实现）。
+    
+    AWQ使用激活感知来缩放权重，保护重要channel。
+    这里实现一个基础版本，完整的AWQ需要更复杂的逻辑。
     """
-    quantized_weights: dict[str, torch.Tensor] = {}
-    for module_name, module in awq_base_model.named_modules():
-        if not (hasattr(module, "qweight") and hasattr(module, "qzeros") and hasattr(module, "scales")):
-            continue
-        if "lm_head" in module_name:
-            continue
-        if target_modules and not any(t in module_name for t in target_modules):
-            continue
-
-        qweight = getattr(module, "qweight")
-        qzeros = getattr(module, "qzeros")
-        scales = getattr(module, "scales")
-        if not isinstance(qweight, torch.Tensor) or not isinstance(qzeros, torch.Tensor) or not isinstance(scales, torch.Tensor):
-            continue
-
-        quantized_weights[f"{module_name}.qweight"] = qweight.detach().cpu().contiguous()
-        quantized_weights[f"{module_name}.qzeros"] = qzeros.detach().cpu().contiguous()
-        quantized_weights[f"{module_name}.scales"] = scales.detach().cpu().contiguous()
-    return quantized_weights
+    layer = layer.to(device)
+    layer.eval()
+    
+    weight = layer.weight.data.float()
+    out_features, in_features = weight.shape
+    
+    if group_size == -1:
+        group_size = in_features
+    
+    # 收集激活数据
+    print(f"  Collecting activation data for AWQ...")
+    activations = []
+    for x in calibration_inputs:
+        x = x.to(device)
+        if x.dim() == 3:
+            x = x.reshape(-1, x.shape[-1])
+        activations.append(x.abs().mean(dim=0).cpu())
+    
+    # 计算channel-wise激活幅度
+    act_scale = torch.stack(activations, dim=0).mean(dim=0).to(device)  # [in_features]
+    
+    # 简单的AWQ：根据激活幅度调整权重
+    # 重要的channel（激活大）分配更多精度
+    weight_scaled = weight * act_scale.unsqueeze(0)
+    
+    # 量化（AWQ固定非对称）
+    num_groups = (in_features + group_size - 1) // group_size
+    max_q = 2 ** bits - 1
+    
+    Q = torch.zeros_like(weight, dtype=torch.int32)
+    scales_list = []
+    zeros_list = []
+    
+    for g in range(num_groups):
+        start = g * group_size
+        end = min(start + group_size, in_features)
+        w_group = weight[:, start:end]
+        
+        w_min = w_group.min(dim=1)[0]
+        w_max = w_group.max(dim=1)[0]
+        scale = (w_max - w_min) / max_q
+        scale = torch.clamp(scale, min=1e-5)
+        zero = torch.round(-w_min / scale).to(torch.int32)
+        
+        q = torch.round(w_group / scale.unsqueeze(1) + zero.unsqueeze(1)).to(torch.int32)
+        q = torch.clamp(q, 0, max_q)
+        
+        Q[:, start:end] = q
+        scales_list.append(scale)
+        zeros_list.append(zero)
+    
+    scales = torch.stack(scales_list, dim=0).to(torch.float16)
+    zeros = torch.stack(zeros_list, dim=0)
+    
+    # AWQ pack（列方向）
+    qweight = awq_pack(Q.T.contiguous(), bits)
+    qzeros = awq_pack(zeros, bits)
+    
+    return qweight.cpu(), qzeros.cpu(), scales.cpu()
 
 
+# =============================================================================
+# 校准数据处理
+# =============================================================================
+
+def build_calibration_data(
+    model_path: str,
+    calib_text_file: str,
+    num_samples: int,
+    seq_len: int,
+    batch_size: int,
+    seed: int = 0,
+) -> List[Dict[str, torch.Tensor]]:
+    """
+    从文本文件构建校准数据。
+    
+    对于Diffusion模型，校准数据需要包含：
+    - input_ids
+    - attention_mask
+    - 可能需要timestep（由模型类型决定）
+    """
+    random.seed(seed)
+    
+    # 读取文本
+    with open(calib_text_file, 'r', encoding='utf-8') as f:
+        lines = [line.strip() for line in f if line.strip()]
+    
+    if len(lines) < num_samples:
+        print(f"Warning: only {len(lines)} samples available, requested {num_samples}")
+        num_samples = len(lines)
+    
+    lines = random.sample(lines, num_samples)
+    
+    # 加载tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    
+    calib_data = []
+    
+    for i in range(0, len(lines), batch_size):
+        batch_lines = lines[i:i+batch_size]
+        
+        # Tokenize
+        encoded = tokenizer(
+            batch_lines,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=seq_len,
+        )
+        
+        calib_data.append({
+            'input_ids': encoded['input_ids'],
+            'attention_mask': encoded['attention_mask'].bool(),
+        })
+    
+    return calib_data
+
+
+def collect_layer_inputs(
+    model: nn.Module,
+    calib_data: List[Dict[str, torch.Tensor]],
+    target_modules: Optional[List[str]] = None,
+    device: str = "cuda",
+) -> Dict[str, List[torch.Tensor]]:
+    """
+    收集每层线性层的输入数据用于量化。
+    
+    Returns:
+        Dict mapping layer name to list of input tensors
+    """
+    model = model.to(device)
+    model.eval()
+    
+    layer_inputs: Dict[str, List[torch.Tensor]] = {}
+    handles = []
+    
+    def make_hook(name):
+        def hook(module, input, output):
+            if isinstance(input, tuple):
+                x = input[0]
+            else:
+                x = input
+            
+            # 只保存必要的部分
+            if name not in layer_inputs:
+                layer_inputs[name] = []
+            
+            # 处理不同输入形状
+            if x.dim() == 3:
+                # [batch, seq, hidden] -> 保存每个token
+                layer_inputs[name].append(x.detach().cpu())
+            elif x.dim() == 2:
+                layer_inputs[name].append(x.detach().cpu())
+        return hook
+    
+    # 注册hooks
+    for name, module in model.named_modules():
+        if isinstance(module, nn.Linear):
+            if target_modules and not any(t in name for t in target_modules):
+                continue
+            handle = module.register_forward_hook(make_hook(name))
+            handles.append(handle)
+            # 保存名字到模块
+            module.name = name
+    
+    # 运行校准
+    print(f"Running calibration with {len(calib_data)} batches...")
+    with torch.no_grad():
+        for batch in tqdm(calib_data, desc="Calibration"):
+            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v 
+                    for k, v in batch.items()}
+            try:
+                model(**batch)
+            except Exception as e:
+                print(f"  Warning: calibration step failed: {e}")
+                continue
+    
+    # 移除hooks
+    for handle in handles:
+        handle.remove()
+    
+    return layer_inputs
+
+
+# =============================================================================
+# 主量化函数
+# =============================================================================
+
 def quantize_model(
     model_path: str,
     output_path: str,
-    quant_format: str = "gptq",
-    group_size: int = 128,
+    quant_method: str = "rtn",
     bits: int = 4,
-    target_modules: Optional[list[str]] = None,
-    device: str = "cpu",
-    quant_method: str = "auto",
+    group_size: int = 128,
+    target_modules: Optional[List[str]] = None,
+    device: str = "cuda",
+    # 校准相关
     calib_text_file: Optional[str] = None,
     calib_num_samples: int = 128,
     calib_seq_len: int = 512,
     calib_batch_size: int = 1,
     calib_seed: int = 0,
-    # GPTQ config
+    # GPTQ特定
     desc_act: bool = False,
-    sym: bool = True,
     damp_percent: float = 0.01,
-    true_sequential: bool = True,
-    use_triton: bool = True,
+    # AWQ特定
+    awq_version: str = "GEMM",
 ) -> None:
-    """Quantize model weights to GPTQ/AWQ format.
+    """
+    量化模型并保存为Diffulex可加载的格式。
     
     Args:
-        model_path: Path to input model directory (containing safetensors files)
-        output_path: Path to output directory (will create if not exists)
-        quant_format: "gptq" or "awq"
-        group_size: Group size for quantization (default: 128)
-        bits: Number of bits per weight (default: 4)
-        target_modules: List of module name patterns to quantize (e.g., ["q_proj", "k_proj"]).
-                       If None, quantizes all linear layers.
-        device: Device to use for quantization ("cpu" or "cuda")
-        quant_method: "auto"（真 GPTQ/AWQ，需校准数据）或 "simple"（旧实现，无校准）
-        calib_text_file: 校准文本文件（每行一条样本）
+        model_path: 输入模型路径（HF格式）
+        output_path: 输出目录路径
+        quant_method: 量化方法
+            - "rtn": Round-To-Nearest，无校准
+            - "gptq": GPTQ算法，需校准
+            - "awq": AWQ算法，需校准
+            - "gptq_marlin": GPTQ + Marlin重排，需校准，强制sym=True
+            - "awq_marlin": AWQ + Marlin重排，需校准
+        bits: 量化位数（2, 4, 8）
+        group_size: 量化组大小（-1表示per-channel）
+        target_modules: 要量化的模块名模式，None表示所有Linear
+        device: 计算设备
+        calib_text_file: 校准文本文件路径（每行一个样本）
+        calib_num_samples: 校准样本数
+        calib_seq_len: 校准序列长度
+        calib_batch_size: 校准batch size
+        calib_seed: 随机种子
+        desc_act: GPTQ是否使用act-order（暂不支持True）
+        damp_percent: GPTQ Hessian阻尼系数
+        awq_version: AWQ版本（"GEMM"或"GEMV"）
     """
-    if quant_format not in ["gptq", "gptq_marlin", "awq"]:
-        raise ValueError(
-            f"Unsupported quant_format: {quant_format}. Must be 'gptq', 'gptq_marlin' or 'awq'"
-        )
-    if quant_method not in ["auto", "simple"]:
-        raise ValueError("quant_method must be 'auto' or 'simple'")
-
-    # Marlin GPTQ 强约束：对称量化 + 不使用 act-order
-    if quant_format == "gptq_marlin":
-        desc_act = False
-        sym = True
-    
     output_path = Path(output_path)
     output_path.mkdir(parents=True, exist_ok=True)
     
-    # Load model config (for tokenizer special tokens, etc.)
-    _ = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-
-    quantized_weights: dict[str, torch.Tensor] = {}
+    # 验证参数
+    if quant_method not in ["rtn", "gptq", "awq", "gptq_marlin", "awq_marlin"]:
+        raise ValueError(f"Unknown quant_method: {quant_method}")
+    
+    if bits not in [2, 3, 4, 8]:
+        raise ValueError(f"bits must be in [2, 3, 4, 8], got {bits}")
+    
+    # Marlin强制约束
+    is_marlin = "marlin" in quant_method
+    if is_marlin:
+        if bits not in [4, 8]:
+            raise ValueError(f"Marlin only supports 4-bit or 8-bit, got {bits}")
+        if group_size not in [128, -1]:
+            print(f"Warning: Marlin prefers group_size=128, got {group_size}")
+        print(f"Marlin mode: forcing symmetric quantization")
+        sym = True
+    else:
+        sym = quant_method == "gptq"  # GPTQ默认对称，AWQ默认非对称
+    
+    if quant_method != "rtn" and calib_text_file is None:
+        raise ValueError(f"{quant_method} requires calib_text_file")
+    
+    if desc_act:
+        raise NotImplementedError("desc_act=True is not yet supported")
+    
+    # 加载模型配置
+    print(f"Loading model config from {model_path}...")
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    
+    # 确定输出格式
+    if quant_method == "gptq_marlin":
+        output_format = "gptq_marlin"
+    elif quant_method == "awq_marlin":
+        output_format = "awq_marlin"
+    elif quant_method == "awq":
+        output_format = "awq"
+    else:
+        output_format = "gptq"
+    
+    # 元数据
     metadata = {
-        "quant_format": quant_format,
         "quant_method": quant_method,
-        "group_size": group_size,
         "bits": bits,
+        "group_size": group_size,
+        "sym": sym,
+        "desc_act": desc_act,
         "quantized_modules": [],
     }
-
-    # ----------------------------
-    # 真 GPTQ/AWQ（需要校准数据）
-    # ----------------------------
-    if quant_method == "auto":
-        if calib_text_file is None:
-            raise ValueError("quant_method=auto 需要提供 --calib-text-file")
-
-        texts = _load_calib_texts(calib_text_file, num_samples=calib_num_samples, seed=calib_seed)
-        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=True)
-        if tokenizer.pad_token_id is None:
-            tokenizer.pad_token = tokenizer.eos_token
-
-        if quant_format in ("gptq", "gptq_marlin"):
-            if quant_format == "gptq_marlin" and device != "cuda":
-                raise ValueError("导出 gptq_marlin 需要 --device cuda")
-
-            AutoGPTQForCausalLM, BaseQuantizeConfig = _require_auto_gptq()
-            examples = _build_autogptq_examples(tokenizer, texts, seq_len=calib_seq_len)
-
-            qcfg = BaseQuantizeConfig(
-                bits=int(bits),
-                group_size=int(group_size),
-                damp_percent=float(damp_percent),
-                desc_act=bool(desc_act),
-                sym=bool(sym),
-                true_sequential=bool(true_sequential),
-            )
-
-            model_init_kwargs = {
-                "trust_remote_code": True,
-            }
-            # 让 AutoGPTQ 自己用 accelerate 做 device_map；CPU 模式下走默认加载。
-            if device == "cuda":
-                model_init_kwargs["device_map"] = "auto"
-                model_init_kwargs["torch_dtype"] = torch.float16
-
-            gptq_model = AutoGPTQForCausalLM.from_pretrained(
-                model_path,
-                qcfg,
-                **model_init_kwargs,
-            )
-            gptq_model.quantize(
-                examples,
-                batch_size=int(calib_batch_size),
-                use_triton=bool(use_triton),
-                cache_examples_on_gpu=(device == "cuda"),
-            )
-
-            quantized_weights = _export_autogptq_to_vllm_weights(
-                gptq_base_model=gptq_model.model,
-                quant_format=quant_format,
-                target_modules=target_modules,
-                desc_act=bool(desc_act),
-                bits=int(bits),
-                group_size=int(group_size),
-            )
-
-        else:  # awq
-            if bits != 4:
-                raise ValueError(f"AWQ 目前仅支持 4-bit，当前 bits={bits}")
-            AutoAWQForCausalLM = _require_awq()
-
-            awq_model = AutoAWQForCausalLM.from_pretrained(
-                model_path,
-                trust_remote_code=True,
-                safetensors=True,
-                device_map="auto" if device == "cuda" else None,
-                torch_dtype="auto",
-            )
-
-            awq_model.quantize(
-                tokenizer=tokenizer,
-                quant_config={
-                    "zero_point": True,
-                    "q_group_size": int(group_size),
-                    "w_bit": int(bits),
-                    "version": "GEMM",
-                },
-                calib_data=texts,
-                max_calib_samples=int(calib_num_samples),
-                max_calib_seq_len=int(calib_seq_len),
-            )
-            awq_model.pack()
-
-            quantized_weights = _export_awq_to_vllm_weights(
-                awq_base_model=awq_model.model,
-                target_modules=target_modules,
-            )
-
-    # ----------------------------
-    # 旧实现（无校准，不是真 GPTQ/AWQ）
-    # ----------------------------
-    else:
-        safetensors_files = list(glob(os.path.join(model_path, "*.safetensors")))
+    
+    quantized_weights: Dict[str, torch.Tensor] = {}
+    
+    # RTN方法：不需要加载完整模型
+    if quant_method == "rtn":
+        print("Loading model weights...")
+        # 加载safetensors
+        from glob import glob
+        safetensors_files = sorted(glob(os.path.join(model_path, "*.safetensors")))
+        
         if not safetensors_files:
             raise ValueError(f"No safetensors files found in {model_path}")
-
-        print(f"Found {len(safetensors_files)} safetensors files")
-
-        all_weight_keys: list[str] = []
-        for file in safetensors_files:
-            with safe_open(file, "pt", device) as f:
-                all_weight_keys.extend(f.keys())
-
-        linear_weight_keys: list[str] = []
-        for key in all_weight_keys:
-            if any(skip in key for skip in [".bias", ".norm", ".embed", ".lm_head"]):
-                continue
+        
+        # 收集所有key
+        all_keys = []
+        for f in safetensors_files:
+            st = load_file(f, device="cpu")
+            all_keys.extend(list(st.keys()))
+        
+        # 筛选线性层权重
+        linear_keys = []
+        for key in all_keys:
             if not key.endswith(".weight"):
                 continue
-            if target_modules and not any(target in key for target in target_modules):
+            if any(skip in key for skip in [".norm", "norm.", "embed", "lm_head"]):
                 continue
-            linear_weight_keys.append(key)
-
-        print(f"Found {len(linear_weight_keys)} linear layer weights to quantize")
-
-        for key in tqdm(linear_weight_keys, desc="Quantizing weights (simple)"):
+            if target_modules and not any(t in key for t in target_modules):
+                continue
+            linear_keys.append(key)
+        
+        print(f"Quantizing {len(linear_keys)} layers with RTN...")
+        
+        for key in tqdm(linear_keys, desc="RTN Quantize"):
+            # 加载权重
             weight = None
-            for file in safetensors_files:
-                with safe_open(file, "pt", device) as f:
-                    if key in f.keys():
-                        weight = f.get_tensor(key)
+            for f in safetensors_files:
+                try:
+                    st = load_file(f, device="cpu")
+                    if key in st:
+                        weight = st[key]
                         break
-
-            if weight is None:
-                print(f"Warning: Could not load weight for {key}")
-                continue
-            if weight.dim() != 2:
-                print(f"Skipping {key}: not a 2D weight (shape: {weight.shape})")
+                except:
+                    continue
+            
+            if weight is None or weight.dim() != 2:
                 continue
-
+            
             out_features, in_features = weight.shape
-            weight_fp32 = weight.to(torch.float32).to(device)
+            
+            # RTN量化
+            qweight, qzeros, scales, g_idx = quantize_rtn(
+                weight, bits, group_size, sym
+            )
+            
             prefix = key[:-7]  # Remove ".weight"
-
-            if quant_format == "gptq":
-                qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq(
-                    weight_fp32, group_size=group_size, bits=bits, use_v2_format=False
+            
+            if is_marlin:
+                # Marlin重排
+                marlin_qw, marlin_sc, marlin_ws = repack_gptq_to_marlin(
+                    qweight, scales, bits, in_features, out_features,
+                    group_size if group_size != -1 else in_features, device
                 )
-                quantized_weights[f"{prefix}.qweight"] = qweight.cpu()
-                quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu()
-                quantized_weights[f"{prefix}.scales"] = scales.cpu()
-                quantized_weights[f"{prefix}.g_idx"] = g_idx.cpu()
-
-            elif quant_format == "gptq_marlin":
-                qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq_marlin(
-                    weight_fp32, group_size=group_size, bits=bits
+                quantized_weights[f"{prefix}.marlin_qweight"] = marlin_qw
+                quantized_weights[f"{prefix}.marlin_scales"] = marlin_sc
+                quantized_weights[f"{prefix}.marlin_workspace"] = marlin_ws
+            else:
+                quantized_weights[f"{prefix}.qweight"] = qweight
+                quantized_weights[f"{prefix}.qzeros"] = qzeros
+                quantized_weights[f"{prefix}.scales"] = scales
+                quantized_weights[f"{prefix}.g_idx"] = g_idx
+            
+            metadata["quantized_modules"].append({
+                "name": prefix,
+                "in_features": in_features,
+                "out_features": out_features,
+                "group_size": group_size,
+                "bits": bits,
+            })
+    
+    else:
+        # GPTQ/AWQ方法：需要加载模型和校准
+        print(f"Building calibration data...")
+        calib_data = build_calibration_data(
+            model_path, calib_text_file, calib_num_samples,
+            calib_seq_len, calib_batch_size, calib_seed
+        )
+        
+        print(f"Loading model for {quant_method} quantization...")
+        model = AutoModel.from_pretrained(
+            model_path,
+            torch_dtype=torch.float32,
+            trust_remote_code=True,
+            device_map="cpu",  # 先在CPU加载
+        )
+        
+        print("Collecting layer inputs...")
+        layer_inputs = collect_layer_inputs(
+            model, calib_data, target_modules, device
+        )
+        
+        # 量化每个层
+        print(f"Quantizing with {quant_method}...")
+        for name, module in tqdm(list(model.named_modules()), desc="Quantize"):
+            if not isinstance(module, nn.Linear):
+                continue
+            if target_modules and not any(t in name for t in target_modules):
+                continue
+            if name not in layer_inputs:
+                continue
+            
+            weight = module.weight.data
+            out_features, in_features = weight.shape
+            
+            inputs = layer_inputs[name]
+            
+            if quant_method in ["gptq", "gptq_marlin"]:
+                qweight, qzeros, scales, g_idx = quantize_gptq(
+                    module, inputs, bits, group_size, sym=True,
+                    damp_percent=damp_percent, device=device
                 )
-                quantized_weights[f"{prefix}.qweight"] = qweight.cpu()
-                quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu()
-                quantized_weights[f"{prefix}.scales"] = scales.cpu()
-                quantized_weights[f"{prefix}.g_idx"] = g_idx.cpu()
-
-            else:  # awq
-                qweight, qzeros, scales = _quantize_to_vllm_awq(
-                    weight_fp32, group_size=group_size, bits=bits
+                
+                if is_marlin:
+                    marlin_qw, marlin_sc, marlin_ws = repack_gptq_to_marlin(
+                        qweight, scales, bits, in_features, out_features,
+                        group_size if group_size != -1 else in_features, device
+                    )
+                    quantized_weights[f"{name}.marlin_qweight"] = marlin_qw
+                    quantized_weights[f"{name}.marlin_scales"] = marlin_sc
+                    quantized_weights[f"{name}.marlin_workspace"] = marlin_ws
+                else:
+                    quantized_weights[f"{name}.qweight"] = qweight
+                    quantized_weights[f"{name}.qzeros"] = qzeros
+                    quantized_weights[f"{name}.scales"] = scales
+                    quantized_weights[f"{name}.g_idx"] = g_idx
+            
+            elif quant_method in ["awq", "awq_marlin"]:
+                qweight, qzeros, scales = quantize_awq(
+                    module, inputs, bits, group_size, device
                 )
-                quantized_weights[f"{prefix}.qweight"] = qweight.cpu()
-                quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu()
-                quantized_weights[f"{prefix}.scales"] = scales.cpu()
-
-            metadata["quantized_modules"].append(
-                {
-                    "name": prefix,
-                    "out_features": int(out_features),
-                    "in_features": int(in_features),
-                    "group_size": group_size,
-                    "bits": bits,
-                }
-            )
-
+                
+                if is_marlin:
+                    marlin_qw, marlin_sc, marlin_ws = repack_awq_to_marlin(
+                        qweight, scales, bits, in_features, out_features,
+                        group_size if group_size != -1 else in_features, device
+                    )
+                    quantized_weights[f"{name}.marlin_qweight"] = marlin_qw
+                    quantized_weights[f"{name}.marlin_scales"] = marlin_sc
+                    quantized_weights[f"{name}.marlin_workspace"] = marlin_ws
+                else:
+                    quantized_weights[f"{name}.qweight"] = qweight
+                    quantized_weights[f"{name}.qzeros"] = qzeros
+                    quantized_weights[f"{name}.scales"] = scales
+            
+            metadata["quantized_modules"].append({
+                "name": name,
+                "in_features": in_features,
+                "out_features": out_features,
+                "group_size": group_size,
+                "bits": bits,
+            })
+            
+            # 清理内存
+            del layer_inputs[name]
+            gc.collect()
             if device == "cuda":
                 torch.cuda.empty_cache()
     
-    # Copy all model files (config, tokenizer, etc.) to output directory
-    print(f"\nCopying model files to {output_path}...")
-    model_path_obj = Path(model_path)
+    # 保存文件
+    print("\nSaving quantized model...")
     
-    # First, copy original safetensors files (for non-quantized layers like lm_head, embeddings, etc.)
-    print("  Copying original safetensors files (for non-quantized layers)...")
+    # 复制原始模型文件
+    model_path_obj = Path(model_path)
     for file in model_path_obj.glob("*.safetensors"):
-        dest_file = output_path / file.name
-        shutil.copy2(file, dest_file)
-        print(f"    Copied {file.name}")
+        if "quantized" not in file.name:
+            shutil.copy2(file, output_path / file.name)
     
-    # Copy other non-safetensors files
     for file in model_path_obj.iterdir():
         if file.is_file() and not file.name.endswith('.safetensors'):
-            dest_file = output_path / file.name
-            shutil.copy2(file, dest_file)
-            print(f"  Copied {file.name}")
+            shutil.copy2(file, output_path / file.name)
     
-    # Save quantized weights to safetensors (this will add quantized weights to the directory)
-    output_file = output_path / f"model_quantized_{quant_format}.safetensors"
-    print(f"\nSaving quantized weights to {output_file}...")
+    # 保存量化权重
+    output_file = output_path / f"model_quantized_{output_format}.safetensors"
     save_file(quantized_weights, output_file)
+    print(f"  Saved: {output_file}")
     
-    # Save metadata
-    metadata_file = output_path / f"quantization_metadata_{quant_format}.json"
+    # 保存元数据
+    metadata_file = output_path / f"quantization_metadata_{output_format}.json"
     with open(metadata_file, "w") as f:
         json.dump(metadata, f, indent=2)
-
-    # vLLM/Diffulex 会读取 quantize_config.json 识别量化类型与超参
-    if quant_format in ("gptq", "gptq_marlin", "awq"):
-        if quant_format == "gptq_marlin":
-            cfg_desc_act = False
-            cfg_sym = True
-            cfg_ckpt = "gptq_marlin"
-        elif quant_format == "gptq":
-            cfg_desc_act = bool(desc_act)
-            cfg_sym = bool(sym)
-            cfg_ckpt = "gptq"
-        else:  # awq
-            cfg_desc_act = False
-            cfg_sym = False
-            cfg_ckpt = "awq"
-
+    print(f"  Saved: {metadata_file}")
+    
+    # 保存quantize_config.json
+    if output_format == "gptq_marlin":
+        quantize_cfg = {
+            "bits": bits,
+            "group_size": group_size,
+            "desc_act": False,
+            "sym": True,
+            "lm_head": False,
+            "checkpoint_format": "gptq_marlin",
+        }
+    elif output_format == "awq_marlin":
+        quantize_cfg = {
+            "bits": bits,
+            "group_size": group_size,
+            "desc_act": False,
+            "sym": False,
+            "lm_head": False,
+            "checkpoint_format": "awq_marlin",
+        }
+    elif output_format == "awq":
+        quantize_cfg = {
+            "bits": bits,
+            "group_size": group_size,
+            "desc_act": False,
+            "sym": False,
+            "lm_head": False,
+            "checkpoint_format": "awq",
+            "version": awq_version,
+        }
+    else:  # gptq
         quantize_cfg = {
-            "bits": int(bits),
-            "group_size": int(group_size),
-            "desc_act": bool(cfg_desc_act),
-            "sym": bool(cfg_sym),
+            "bits": bits,
+            "group_size": group_size,
+            "desc_act": desc_act,
+            "sym": sym,
             "lm_head": False,
-            "checkpoint_format": cfg_ckpt,
+            "checkpoint_format": "gptq",
         }
-        with open(output_path / "quantize_config.json", "w", encoding="utf-8") as f:
-            json.dump(quantize_cfg, f, indent=2)
+    
+    cfg_file = output_path / "quantize_config.json"
+    with open(cfg_file, "w", encoding="utf-8") as f:
+        json.dump(quantize_cfg, f, indent=2)
+    print(f"  Saved: {cfg_file}")
     
     print(f"\n✓ Quantization complete!")
-    print(f"  - Quant method: {quant_method}")
-    print(f"  - Output directory: {output_path}")
-    print(f"  - Quantized weights file: {output_file}")
-    print(f"  - Metadata file: {metadata_file}")
-    print(f"\n  You can now use this directory directly as model path:")
-    print(f"    --model-path {output_path}")
+    print(f"  Method: {quant_method}")
+    print(f"  Format: {output_format}")
+    print(f"  Quantized {len(metadata['quantized_modules'])} layers")
+    print(f"  Output: {output_path}")
+
 
+# =============================================================================
+# 命令行接口
+# =============================================================================
 
 def main():
     parser = argparse.ArgumentParser(
-        description="离线量化模型权重为 GPTQ/AWQ 格式",
+        description="量化 AutoModelForDiffusionLM 模型",
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
     parser.add_argument("--model-path", type=str, required=True, help="输入模型路径")
     parser.add_argument("--output-path", type=str, required=True, help="输出路径")
-    parser.add_argument(
-        "--quant-format",
-        type=str,
-        choices=["gptq", "gptq_marlin", "awq"],
-        default="gptq",
-        help="量化格式: gptq / gptq_marlin / awq",
-    )
-    parser.add_argument("--group-size", type=int, default=128, help="量化组大小 (默认: 128)")
-    parser.add_argument("--bits", type=int, default=4, help="每个权重的位数 (默认: 4)")
-    parser.add_argument("--target-modules", type=str, help="要量化的模块名称模式（逗号分隔），例如: q_proj,k_proj,v_proj")
-    parser.add_argument("--device", type=str, choices=["cpu", "cuda"], default="cpu", help="量化设备 (默认: cpu)")
     parser.add_argument(
         "--quant-method",
         type=str,
-        choices=["auto", "simple"],
-        default="auto",
-        help="量化方法: auto(真 GPTQ/AWQ, 需要校准数据) / simple(旧实现, 无校准)",
-    )
-    parser.add_argument("--calib-text-file", type=str, default=None, help="校准文本文件（每行一条样本）")
-    parser.add_argument("--calib-num-samples", type=int, default=128, help="校准样本数 (默认: 128)")
-    parser.add_argument("--calib-seq-len", type=int, default=512, help="校准序列长度 (默认: 512)")
-    parser.add_argument("--calib-batch-size", type=int, default=1, help="校准 batch size (默认: 1)")
-    parser.add_argument("--calib-seed", type=int, default=0, help="校准采样随机种子 (默认: 0)")
-    parser.add_argument("--desc-act", action="store_true", help="GPTQ act-order(desc_act) (默认: False)")
-    parser.add_argument("--sym", dest="sym", action="store_true", default=True, help="GPTQ symmetric quant (默认: True)")
-    parser.add_argument("--no-sym", dest="sym", action="store_false", help="关闭 GPTQ symmetric quant")
-    parser.add_argument("--damp-percent", type=float, default=0.01, help="GPTQ damp_percent (默认: 0.01)")
-    parser.add_argument(
-        "--true-sequential",
-        dest="true_sequential",
-        action="store_true",
-        default=True,
-        help="GPTQ true_sequential (默认: True)",
-    )
-    parser.add_argument(
-        "--no-true-sequential",
-        dest="true_sequential",
-        action="store_false",
-        help="关闭 GPTQ true_sequential",
-    )
-    parser.add_argument(
-        "--use-triton",
-        dest="use_triton",
-        action="store_true",
-        default=True,
-        help="AutoGPTQ 使用 Triton backend (默认: True)",
+        choices=["rtn", "gptq", "awq", "gptq_marlin", "awq_marlin"],
+        default="rtn",
+        help="量化方法: rtn(快速)/gptq(高精度)/awq/gptq_marlin(高性能)/awq_marlin",
     )
+    parser.add_argument("--bits", type=int, default=4, choices=[2, 3, 4, 8], help="量化位数")
+    parser.add_argument("--group-size", type=int, default=128, help="量化组大小")
     parser.add_argument(
-        "--no-triton",
-        dest="use_triton",
-        action="store_false",
-        help="关闭 AutoGPTQ Triton backend（可能回退到 CUDA extension）",
+        "--target-modules",
+        type=str,
+        help="要量化的模块名模式（逗号分隔），例如: q_proj,k_proj,v_proj"
     )
+    parser.add_argument("--device", type=str, default="cuda", help="计算设备")
+    
+    # 校准参数
+    parser.add_argument("--calib-text-file", type=str, help="校准文本文件（每行一条）")
+    parser.add_argument("--calib-num-samples", type=int, default=128, help="校准样本数")
+    parser.add_argument("--calib-seq-len", type=int, default=512, help="校准序列长度")
+    parser.add_argument("--calib-batch-size", type=int, default=1, help="校准batch size")
+    parser.add_argument("--calib-seed", type=int, default=0, help="随机种子")
+    
+    # GPTQ参数
+    parser.add_argument("--desc-act", action="store_true", help="使用act-order（暂不支持）")
+    parser.add_argument("--damp-percent", type=float, default=0.01, help="Hessian阻尼系数")
+    
+    # AWQ参数
+    parser.add_argument("--awq-version", type=str, default="GEMM", choices=["GEMM", "GEMV"])
     
     args = parser.parse_args()
     
@@ -777,22 +1006,19 @@ def main():
     quantize_model(
         model_path=args.model_path,
         output_path=args.output_path,
-        quant_format=args.quant_format,
-        group_size=args.group_size,
+        quant_method=args.quant_method,
         bits=args.bits,
+        group_size=args.group_size,
         target_modules=target_modules,
         device=args.device,
-        quant_method=args.quant_method,
         calib_text_file=args.calib_text_file,
         calib_num_samples=args.calib_num_samples,
         calib_seq_len=args.calib_seq_len,
         calib_batch_size=args.calib_batch_size,
         calib_seed=args.calib_seed,
-        desc_act=bool(args.desc_act),
-        sym=bool(args.sym),
-        damp_percent=float(args.damp_percent),
-        true_sequential=bool(args.true_sequential),
-        use_triton=bool(args.use_triton),
+        desc_act=args.desc_act,
+        damp_percent=args.damp_percent,
+        awq_version=args.awq_version,
     )
 
 
diff --git a/diffulex/extensions/quantization/strategies/kv_cache_fp8_running_max.py b/diffulex/extensions/quantization/strategies/kv_cache_fp8_running_max.py
index 39715f3d..1de3c578 100644
--- a/diffulex/extensions/quantization/strategies/kv_cache_fp8_running_max.py
+++ b/diffulex/extensions/quantization/strategies/kv_cache_fp8_running_max.py
@@ -13,9 +13,9 @@
 from ..strategy import KVCacheQuantizationStrategy
 from ..registry import register_kv_cache_strategy
 
-# Try to import custom FP8 Triton kernel
+# Try to import custom FP8 Triton kernels
 try:
-    from ..kernels.triton_kernels import fp8_kv_attention_forward
+    from ..kernels.triton_kernels import chunked_prefill_attn_unified_fp8
     _HAS_FP8_TRITON_KERNEL = True
 except ImportError:
     _HAS_FP8_TRITON_KERNEL = False
@@ -161,51 +161,56 @@ def has_triton_kernel(self) -> bool:
     def triton_attention(
         self,
         q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
         k_cache: torch.Tensor,
         v_cache: torch.Tensor,
-        k_scale: torch.Tensor,
-        v_scale: torch.Tensor,
-        page_tables: torch.Tensor,
-        context_lens: torch.Tensor,
-        cu_seqlens_q: torch.Tensor,
-        softmax_scale: float,
+        attn_metadata,
+        k_scale: Optional[torch.Tensor] = None,
+        v_scale: Optional[torch.Tensor] = None,
     ) -> Optional[torch.Tensor]:
         """
-        Compute attention using custom FP8 Triton kernel.
+        Compute attention using unified FP8 Triton kernel.
         
-        This avoids explicit dequantization by doing it on-the-fly in the kernel.
+        This kernel handles both:
+        - Stage 1: Attention against cached FP8 KV (dequantized on-the-fly)
+        - Stage 2: Attention against new BF16 KV
         
         Args:
-            q: Query tensor [total_seqlen, num_heads, head_dim]
+            q: Query tensor [total_seqlen, num_heads, head_dim] (BF16)
+            k: Key tensor [total_seqlen, num_kv_heads, head_dim] (BF16) - current step
+            v: Value tensor [total_seqlen, num_kv_heads, head_dim] (BF16) - current step
             k_cache: Key cache in FP8 [num_pages, page_size, num_kv_heads, head_dim]
             v_cache: Value cache in FP8 [num_pages, page_size, num_kv_heads, head_dim]
-            k_scale: Per-request K scales
-            v_scale: Per-request V scales
-            page_tables: Page table mapping
-            context_lens: Context lengths per request
-            cu_seqlens_q: Cumulative sequence lengths
-            softmax_scale: Softmax scaling factor
+            attn_metadata: Attention metadata object
+            k_scale: Per-tensor K scale (scalar float32)
+            v_scale: Per-tensor V scale (scalar float32)
             
         Returns:
-            Attention output or None if kernel fails
+            Attention output [total_seqlen, num_heads, head_dim] or None if kernel fails
         """
         if not _HAS_FP8_TRITON_KERNEL:
             return None
         
+        if k_scale is None or v_scale is None:
+            raise ValueError("FP8 KV cache Triton kernel requires k_scale and v_scale")
+        
         try:
-            return fp8_kv_attention_forward(
+            return chunked_prefill_attn_unified_fp8(
                 q=q,
+                k=k,
+                v=v,
                 k_cache=k_cache,
                 v_cache=v_cache,
                 k_scale=k_scale,
                 v_scale=v_scale,
-                page_tables=page_tables,
-                context_lens=context_lens,
-                cu_seqlens_q=cu_seqlens_q,
-                softmax_scale=softmax_scale,
-                is_e4m3=(self.fp8_dtype == torch.float8_e4m3fn),
+                attn_metadata=attn_metadata,
             )
-        except Exception:
+        except Exception as e:
+            # Fallback to None to trigger dequantization path
+            import logging
+            logger = logging.getLogger(__name__)
+            logger.debug(f"FP8 Triton kernel failed: {e}")
             return None
 
 
diff --git a/diffulex/extensions/quantization/strategies/linear_awq_marlin_w4a16.py b/diffulex/extensions/quantization/strategies/linear_awq_marlin_w4a16.py
index 5bcb4f09..aaa9678e 100644
--- a/diffulex/extensions/quantization/strategies/linear_awq_marlin_w4a16.py
+++ b/diffulex/extensions/quantization/strategies/linear_awq_marlin_w4a16.py
@@ -21,6 +21,8 @@ class AWQMarlinW4A16LinearStrategy(LinearQuantizationStrategy):
     Uses pre-repacked Marlin format weights from AWQ checkpoints.
     """
     
+    is_offline_quantized = True
+    
     def __init__(self, bits: int = 4, group_size: int = 128):
         self.bits = bits
         self.group_size = group_size
diff --git a/diffulex/extensions/quantization/strategies/linear_awq_w4a16.py b/diffulex/extensions/quantization/strategies/linear_awq_w4a16.py
index a4c756c1..db8af038 100644
--- a/diffulex/extensions/quantization/strategies/linear_awq_w4a16.py
+++ b/diffulex/extensions/quantization/strategies/linear_awq_w4a16.py
@@ -24,6 +24,8 @@ class AWQW4A16LinearStrategy(LinearQuantizationStrategy):
     - scales: float16/bfloat16 scales
     """
     
+    is_offline_quantized = True
+    
     def __init__(self, bits: int = 4, group_size: int = 128):
         self.bits = bits
         self.group_size = group_size
diff --git a/diffulex/extensions/quantization/strategies/linear_fp8_w8a16.py b/diffulex/extensions/quantization/strategies/linear_fp8_w8a16.py
index 3feb7d83..a5e9cbcc 100644
--- a/diffulex/extensions/quantization/strategies/linear_fp8_w8a16.py
+++ b/diffulex/extensions/quantization/strategies/linear_fp8_w8a16.py
@@ -109,7 +109,7 @@ def quantize_weight_for_kernel(
         if device is not None:
             q_fp8 = q_fp8.to(device=device)
             meta["scales"] = meta["scales"].to(device=device)
-        return q_fp8, meta["scales"]
+        return q_fp8, meta
     
     def quantize_act_for_kernel(self, x: torch.Tensor,
                                 cache_key: Optional[str] = None) -> Tuple[torch.Tensor, Any]:
diff --git a/diffulex/extensions/quantization/strategies/linear_fp8_w8a8.py b/diffulex/extensions/quantization/strategies/linear_fp8_w8a8.py
index d8bcf03b..4a57a3da 100644
--- a/diffulex/extensions/quantization/strategies/linear_fp8_w8a8.py
+++ b/diffulex/extensions/quantization/strategies/linear_fp8_w8a8.py
@@ -107,7 +107,7 @@ def quantize_weight_for_kernel(
         if device is not None:
             q_fp8 = q_fp8.to(device=device)
             meta["scales"] = meta["scales"].to(device=device)
-        return q_fp8, meta["scales"]
+        return q_fp8, meta
     
     def quantize_act_for_kernel(self, x: torch.Tensor,
                                 cache_key: Optional[str] = None) -> Tuple[torch.Tensor, Any]:
diff --git a/diffulex/extensions/quantization/strategies/linear_gptq_marlin_w4a16.py b/diffulex/extensions/quantization/strategies/linear_gptq_marlin_w4a16.py
index 643f44b9..dbb7299b 100644
--- a/diffulex/extensions/quantization/strategies/linear_gptq_marlin_w4a16.py
+++ b/diffulex/extensions/quantization/strategies/linear_gptq_marlin_w4a16.py
@@ -21,6 +21,8 @@ class GPTQMarlinW4A16LinearStrategy(LinearQuantizationStrategy):
     Uses pre-repacked Marlin format weights for optimal performance.
     """
     
+    is_offline_quantized = True
+    
     def __init__(self, bits: int = 4, group_size: int = 128):
         self.bits = bits
         self.group_size = group_size
diff --git a/diffulex/extensions/quantization/strategies/linear_gptq_marlin_w8a16.py b/diffulex/extensions/quantization/strategies/linear_gptq_marlin_w8a16.py
index 9e8bf1e2..631fb96a 100644
--- a/diffulex/extensions/quantization/strategies/linear_gptq_marlin_w8a16.py
+++ b/diffulex/extensions/quantization/strategies/linear_gptq_marlin_w8a16.py
@@ -21,6 +21,8 @@ class GPTQMarlinW8A16LinearStrategy(LinearQuantizationStrategy):
     Uses pre-repacked Marlin format weights for optimal performance.
     """
     
+    is_offline_quantized = True
+    
     def __init__(self, group_size: int = 128):
         self.bits = 8
         self.group_size = group_size
diff --git a/diffulex/extensions/quantization/strategies/linear_gptq_wxa16.py b/diffulex/extensions/quantization/strategies/linear_gptq_wxa16.py
index 1f1a9522..fd9486ca 100644
--- a/diffulex/extensions/quantization/strategies/linear_gptq_wxa16.py
+++ b/diffulex/extensions/quantization/strategies/linear_gptq_wxa16.py
@@ -2,7 +2,7 @@
 GPTQ W*x*A16 Linear Strategy - Unified implementation for all bit widths.
 
 Supports 2-bit, 3-bit, 4-bit, and 8-bit weight quantization with BF16 activation.
-Uses vLLM's gptq_gemm op for optimized inference.
+Uses vLLM's gptq_gemm op via torch.ops._C for optimized inference.
 """
 
 import torch
@@ -11,7 +11,7 @@
 
 from ..strategy import LinearQuantizationStrategy
 from ..registry import register_linear_strategy
-from ..kernels.kernel_availability import warn_kernel_unavailable, check_vllm_op_available
+from ..kernels.kernel_availability import warn_kernel_unavailable, check_torch_c_op_available
 
 
 def _unpack_gptq_weights(qweight: torch.Tensor, bits: int, 
@@ -87,6 +87,7 @@ class GPTQWxa16LinearStrategy(LinearQuantizationStrategy):
     """
     
     SUPPORTED_BITS = [2, 3, 4, 8]
+    is_offline_quantized = True  # Mark as offline quantization
     
     def __init__(self, bits: int = 4, group_size: int = 128, desc_act: bool = False):
         if bits not in self.SUPPORTED_BITS:
@@ -96,16 +97,14 @@ def __init__(self, bits: int = 4, group_size: int = 128, desc_act: bool = False)
         self.group_size = group_size
         self.desc_act = desc_act
         
-        # Check for vLLM GPTQ ops
+        # Check for vLLM GPTQ ops via torch.ops._C
         self.gptq_gemm = None
         self.shuffle_weights = None
         self._kernel_warned = False
+        self._empty_g_idx_cache: Dict[int, torch.Tensor] = {}
         
-        if check_vllm_op_available('gptq_gemm'):
-            import vllm._custom_ops as ops
-            self.gptq_gemm = ops.gptq_gemm
-            if hasattr(ops, 'gptq_shuffle'):
-                self.shuffle_weights = ops.gptq_shuffle
+        if check_torch_c_op_available('gptq_gemm'):
+            self.gptq_gemm = torch.ops._C.gptq_gemm
         # Note: Warning is deferred to first forward call to avoid spam during import
     
     @property
@@ -169,19 +168,37 @@ def linear_forward(self, x: torch.Tensor, weight: torch.Tensor,
         if scales is None:
             raise ValueError("GPTQ forward requires 'scales' buffer")
         
-        # Use vLLM GPTQ GEMM if available
+        # Use vLLM GPTQ GEMM via torch.ops._C if available
         if self.gptq_gemm is not None:
             try:
-                x_2d = x.reshape(-1, x.shape[-1])
+                # vLLM expects FP16 activations
+                x_in = x if x.dtype == torch.float16 else x.to(dtype=torch.float16)
+                x_2d = x_in.reshape(-1, x_in.shape[-1]) if x_in.dim() != 2 else x_in
+                if not x_2d.is_contiguous():
+                    x_2d = x_2d.contiguous()
                 
+                # Handle g_idx - use cached empty tensor per device
+                device = x.device
+                dev_key = int(device.index) if device.type == "cuda" and device.index is not None else -1
+                if g_idx is None or g_idx.numel() == 0:
+                    empty = self._empty_g_idx_cache.get(dev_key)
+                    if empty is None or empty.device != device:
+                        empty = torch.empty((0,), device=device, dtype=torch.int)
+                        self._empty_g_idx_cache[dev_key] = empty
+                    g_idx_t = empty
+                else:
+                    g_idx_t = g_idx if (g_idx.device == device and g_idx.dtype == torch.int) else g_idx.to(device=device, dtype=torch.int)
+                
+                # Call torch.ops._C.gptq_gemm (vLLM style)
                 output = self.gptq_gemm(
                     x_2d,
                     qweight,
                     qzeros,
                     scales,
-                    g_idx if g_idx is not None else torch.empty(0, dtype=torch.int32, device=x.device),
-                    is_shuffled,
-                    bits
+                    g_idx_t,
+                    True,   # use_exllama
+                    False,  # use_v2_format
+                    bits,
                 )
                 
                 output_shape = list(x.shape[:-1]) + [scales.shape[1]]
diff --git a/diffulex/extensions/quantization/strategies/linear_int8_w8a16.py b/diffulex/extensions/quantization/strategies/linear_int8_w8a16.py
index 31c33916..8f7e2d48 100644
--- a/diffulex/extensions/quantization/strategies/linear_int8_w8a16.py
+++ b/diffulex/extensions/quantization/strategies/linear_int8_w8a16.py
@@ -173,7 +173,7 @@ def quantize_weight_for_kernel(
         
         # Store scales as 1D
         s_1d = s_reorder_1xn.reshape(-1).to(dtype=torch.bfloat16)
-        return q_reorder.contiguous(), s_1d.contiguous()
+        return q_reorder.contiguous(), {"scales": s_1d.contiguous()}
     
     def quantize_act_for_kernel(self, x: torch.Tensor,
                                 cache_key: Optional[str] = None) -> Tuple[torch.Tensor, Any]:
diff --git a/diffulex/extensions/quantization/strategies/linear_int8_w8a8.py b/diffulex/extensions/quantization/strategies/linear_int8_w8a8.py
index e8f692e9..105cedc4 100644
--- a/diffulex/extensions/quantization/strategies/linear_int8_w8a8.py
+++ b/diffulex/extensions/quantization/strategies/linear_int8_w8a8.py
@@ -1,44 +1,41 @@
 """
-INT8 W8A8 Linear Strategy - vLLM-aligned high-performance implementation.
+INT8 W8A8 Linear Strategy - Pre-quantization only
 
-Key optimizations (from feat/kv-cache-fp8-support):
-1. Activation quantization: vllm._custom_ops.scaled_int8_quant (CUDA kernel, dynamic per-token)
-2. GEMM: vllm._custom_ops.cutlass_scaled_mm (CUTLASS, no fallback)
-3. Weight layout: stored as K×N (transposed), matching CUTLASS requirements
-
-No dequantize fallback - forces CUTLASS path for performance.
+Weight is pre-quantized during model loading. No runtime weight quantization.
 """
 
+from typing import Any, Optional, Tuple
+
 import torch
-import torch.nn.functional as F
-from typing import Any, Dict, Optional, Tuple
+from torch import nn
 
-from ..strategy import LinearQuantizationStrategy
+from .linear_bf16 import BF16LinearStrategy
 from ..registry import register_linear_strategy
 
-
+# vLLM custom ops for fast INT8 W8A8
 try:
     from vllm import _custom_ops as _vllm_ops
-except Exception:
+except ImportError:
     _vllm_ops = None
 
 
 @register_linear_strategy("int8", "int8")
-class INT8W8A8LinearStrategy(LinearQuantizationStrategy):
+class INT8W8A8LinearStrategy(BF16LinearStrategy):
     """
-    INT8 W8A8 linear quantization using vLLM's optimized CUDA kernels.
+    INT8 W8A8 quantization using vLLM's CUTLASS kernels.
+    
+    - Weight: per-channel symmetric int8, pre-quantized during loading
+    - Activation: dynamic per-token int8 quantization
+    - Kernel: vLLM's cutlass_scaled_mm
     
-    Weight layout: stored as [K, N] int8 (transposed from original [N, K])
-    Scale layout: [1, N] float32 for broadcasting with per-token activation scales
+    NOTE: This strategy requires PRE-QUANTIZED weights. It will fail if
+    called with unquantized (BF16) weights.
     """
     
-    def __init__(self):
-        # Cache: id(weight) -> (qweight_int8 [K,N], w_scales_fp32 [1,N])
-        self._weight_cache: Dict[int, Tuple[torch.Tensor, torch.Tensor]] = {}
+    name = "int8_w8a8"
     
-    @property
-    def name(self) -> str:
-        return "int8_w8a8"
+    def __init__(self):
+        super().__init__()
     
     @property
     def linear_weight_format(self) -> str:
@@ -48,25 +45,16 @@ def linear_weight_format(self) -> str:
     def linear_act_format(self) -> str:
         return "int8"
     
-    def get_storage_dtype(self, device: torch.device) -> Tuple[torch.dtype, int]:
-        return (torch.int8, 1)
-    
-    def get_scale_shape(self, original_shape: Tuple[int, ...], **kwargs: Any) -> Tuple[int, ...]:
-        """Return scale shape for weight quantization."""
-        if len(original_shape) != 2:
-            raise ValueError(f"Expected 2D weight [N,K], got {original_shape}")
-        return (original_shape[0],)  # Per-output-channel: [N]
-    
-    def quantize(self, weight: torch.Tensor, **kwargs: Any) -> Tuple[torch.Tensor, Any]:
+    def quantize(self, weight: torch.Tensor) -> Tuple[torch.Tensor, dict]:
         """
-        Quantize weight to INT8.
+        Quantize weight to INT8 (per-channel symmetric).
         
         Args:
-            weight: [N, K] float tensor
+            weight: [N, K] BF16/FP16 weight tensor
             
         Returns:
-            qweight: [K, N] int8 (transposed for CUTLASS)
-            metadata: {"scales": [1, N] float32}
+            qweight: [K, N] int8 (column-major for CUTLASS)
+            meta: {"scales": [1, N] float32}
         """
         if weight.dim() != 2:
             raise ValueError(f"Expected 2D weight [N,K], got shape={tuple(weight.shape)}")
@@ -95,19 +83,19 @@ def quantize_weight_for_kernel(
         *,
         device: Optional[torch.device] = None,
         **_: Any,
-    ) -> Tuple[torch.Tensor, Any]:
+    ) -> Tuple[torch.Tensor, dict]:
         """
         Quantize weight for kernel consumption.
         
         Returns:
             qweight: [K, N] int8 on target device
-            scales: [1, N] float32 on target device
+            meta: {"scales": [1, N] float32 on target device}
         """
         q_kn, meta = self.quantize(weight)
         if device is not None:
             q_kn = q_kn.to(device=device)
             meta["scales"] = meta["scales"].to(device=device)
-        return q_kn, meta["scales"]
+        return q_kn, meta
     
     def quantize_act_for_kernel(self, x: torch.Tensor,
                                 cache_key: Optional[str] = None) -> Tuple[torch.Tensor, Any]:
@@ -156,33 +144,32 @@ def linear_forward(
         INT8 W8A8 linear forward using vLLM's cutlass_scaled_mm.
         
         Args:
-            x: Input tensor [..., K]
-            weight: Quantized weight [K, N] int8, or original weight
+            x: Input tensor [..., K] (BF16/FP16)
+            weight: Pre-quantized weight [K, N] int8
             bias: Optional bias [N]
-            quant_scales: Weight scales [1, N] float32 (if weight is already quantized)
+            quant_scales: Weight scales [1, N] float32
             
         Returns:
-            output: [..., N]
+            output: [..., N] (BF16/FP16)
+            
+        Raises:
+            RuntimeError: If weight is not pre-quantized (dtype != int8)
         """
         if _vllm_ops is None:
             raise RuntimeError(
                 "vLLM custom ops are required for W8A8 (scaled_int8_quant / cutlass_scaled_mm)"
             )
         
-        # Get quantized weight and scales
-        if weight is not None and weight.dtype == torch.int8 and quant_scales is not None:
-            # Already quantized (from load-time quantization)
-            qweight = weight
-            w_scales = quant_scales
-        else:
-            # Need to quantize on-the-fly (cache by weight id)
-            wid = id(weight)
-            cached = self._weight_cache.get(wid)
-            if cached is None or cached[0].device != x.device:
-                qweight, w_scales = self.quantize_weight_for_kernel(weight, device=x.device)
-                self._weight_cache[wid] = (qweight, w_scales)
-            else:
-                qweight, w_scales = cached
+        # STRICT: Only accept pre-quantized weights
+        if weight is None or weight.dtype != torch.int8 or quant_scales is None:
+            raise RuntimeError(
+                f"INT8 W8A8 requires pre-quantized weight (dtype=int8) and scales, "
+                f"got weight.dtype={weight.dtype if weight is not None else None}, "
+                f"quant_scales={quant_scales is not None}"
+            )
+        
+        qweight = weight
+        w_scales = quant_scales
         
         # Reshape input: [..., K] -> [M, K]
         orig_shape = x.shape
diff --git a/diffulex/extensions/quantization/strategies/linear_w4a8_cutlass.py b/diffulex/extensions/quantization/strategies/linear_w4a8_cutlass.py
index 4dfafd2a..0decb6c8 100644
--- a/diffulex/extensions/quantization/strategies/linear_w4a8_cutlass.py
+++ b/diffulex/extensions/quantization/strategies/linear_w4a8_cutlass.py
@@ -61,6 +61,8 @@ class CutlassW4A8LinearStrategy(LinearQuantizationStrategy):
     - out_features % 128 == 0
     """
     
+    is_offline_quantized = True
+    
     def __init__(self, group_size: int = 128):
         if group_size != 128:
             raise ValueError(f"CutlassW4A8 only supports group_size=128, got {group_size}")