Commit 1d2e23e

tmp code
1 parent 9d6808e commit 1d2e23e

File tree

6 files changed (+517, -17 lines)

tests/singlecard/test_offline_inference.py

Lines changed: 3 additions & 2 deletions
@@ -45,6 +45,7 @@
 
 QUANTIZATION_MODELS = [
     "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8",
+    "vllm-ascend/Qwen2.5-0.5B-Instruct-fa3"
 ]
 
 
@@ -71,7 +72,7 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
 @pytest.mark.parametrize("max_tokens", [5])
 def test_quantization_models(model: str, max_tokens: int) -> None:
     prompt = "The following numbers of the sequence " + ", ".join(
-        str(i) for i in range(1024)) + " are:"
+        str(i) for i in range(256)) + " are:"
     example_prompts = [prompt]
 
     # NOTE: Using quantized model repo id from modelscope encounters an issue,
@@ -80,7 +81,7 @@ def test_quantization_models(model: str, max_tokens: int) -> None:
     model_path = snapshot_download(model)
 
     with VllmRunner(model_path,
-                    max_model_len=8192,
+                    max_model_len=4096,
                     enforce_eager=True,
                     dtype="auto",
                     gpu_memory_utilization=0.7,
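
The two tweaks above shrink the test prompt from 1024 to 256 numbers and halve max_model_len to 4096, presumably so the prompt comfortably fits the smaller context. A quick standalone sketch (build_prompt is a hypothetical helper name, not part of the test file) to compare the raw prompt sizes:

# Rough size comparison of the test prompt before and after this change.
# Character counts only; actual token counts depend on the model tokenizer.
def build_prompt(n: int) -> str:
    return ("The following numbers of the sequence " +
            ", ".join(str(i) for i in range(n)) + " are:")


for n in (1024, 256):
    print(n, len(build_prompt(n)))  # roughly 5.1k chars vs 1.2k chars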

vllm_ascend/quantization/faquant.py

Lines changed: 217 additions & 0 deletions
@@ -0,0 +1,217 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import List

import torch
import torch_npu

from .quant_utils import (SRC_DTYPE_TO_ACL_DTYPE, TYPE_QUANT_QKV_ONLINE,
                          quant_per_tensor)


class AscendFAQuantAttentionMethod:
    """Attention method for Ascend FAQuant (quantized flash attention)."""

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def get_quant_param() -> List[str]:
        return [
            "fa_q.scale", "fa_q.offset", "fa_k.scale", "fa_k.offset",
            "fa_v.scale", "fa_v.offset"
        ]

    @staticmethod
    def get_extra_module_names() -> List[str]:
        return ["fa_q", "fa_k", "fa_v"]

    @staticmethod
    def process_weights_after_loading(layer):
        fa_qscale = layer.fa_q.scale
        fa_kscale = layer.fa_k.scale
        fa_vscale = layer.fa_v.scale
        # Expand the per-head scales/offsets so quant_per_tensor can apply
        # them elementwise to the flattened Q/K/V tensors.
        repeated_query_scale = layer.fa_q.scale.repeat(1, 64)
        layer.fa_qscale = torch.nn.Parameter(repeated_query_scale,
                                             requires_grad=False)
        repeated_query_offset = layer.fa_q.offset.repeat(1, 64)
        layer.fa_qoffset = torch.nn.Parameter(repeated_query_offset,
                                              requires_grad=False)
        repeated_fa_kscale = layer.fa_k.scale.repeat(1, 64)
        layer.fa_kscale = torch.nn.Parameter(repeated_fa_kscale,
                                             requires_grad=False)
        repeated_fa_koffset = layer.fa_k.offset.repeat(1, 64)
        layer.fa_koffset = torch.nn.Parameter(repeated_fa_koffset,
                                              requires_grad=False)
        repeated_fa_vscale = layer.fa_v.scale.repeat(1, 64)
        layer.fa_vscale = torch.nn.Parameter(repeated_fa_vscale,
                                             requires_grad=False)
        repeated_fa_voffset = layer.fa_v.offset.repeat(1, 64)
        layer.fa_voffset = torch.nn.Parameter(repeated_fa_voffset,
                                              requires_grad=False)

        if fa_kscale.shape[0] <= 0:
            raise ValueError(
                "Expected size of fa_kscale in dimension 0 to be greater "
                f"than 0, but got {fa_kscale.shape[0]}.")
        # Map the per-KV-head K/V scales to per-query-head scales (GQA) for
        # the quantized paged attention kernel.
        gqa_size = fa_qscale.shape[0] // fa_kscale.shape[0]
        fa3_k_scale, fa3_v_scale = fa_kscale.repeat(1, gqa_size).view(
            -1, 1), fa_vscale.repeat(1, gqa_size).view(-1, 1)
        qk_scale = torch.nn.Parameter(torch.squeeze(
            fa_qscale * fa3_k_scale).to(torch.float),
                                      requires_grad=False)
        layer.register_parameter("qk_scale", qk_scale)
        fa3_v_scale = torch.nn.Parameter(
            torch.squeeze(fa3_v_scale).contiguous().to(torch.float),
            requires_grad=False)
        layer.register_parameter("fa3_v_scale", fa3_v_scale)

    @classmethod
    def apply(cls, layer: torch.nn.Module, query: torch.Tensor,
              key: torch.Tensor, value: torch.Tensor, *extra_args,
              **optional_args) -> torch.Tensor:
        # Cache tensors and attention metadata are passed positionally.
        key_cache, value_cache, scale, block_tables, \
            is_prefill, mask, slots, output = extra_args
        seq_lens_tensor_cpu = optional_args.get("seq_lens_tensor_cpu", None)

        query_shape = query.shape
        key_shape = key.shape
        value_shape = value.shape

        query = query.view(query.shape[0], -1)
        key = key.view(key.shape[0], -1)
        value = value.view(value.shape[0], -1)

        if is_prefill:
            if key_cache is not None:
                # Prefill: quantize K/V to int8 and write them into the
                # paged KV cache for later decode steps.
                key_int8 = quant_per_tensor(key, layer.fa_kscale,
                                            layer.fa_koffset, True)
                value_int8 = quant_per_tensor(value, layer.fa_vscale,
                                              layer.fa_voffset, True)
                key_int8 = key_int8.view(key_shape)
                value_int8 = value_int8.view(value_shape)
                query = query.view(query_shape)
                torch_npu._npu_reshape_and_cache(key_int8, value_int8,
                                                 key_cache, value_cache, slots)
            if mask is None:
                raise ValueError(
                    "attn_metadata.attn_mask is None. Please check.")
            # Prefill attention itself runs on the unquantized Q/K/V.
            if output is not None:
                key = key.view(key_shape)
                value = value.view(value_shape)
                query = query.view(query_shape)
                output = output.view(query.shape)
                torch_npu._npu_flash_attention(query,
                                               key,
                                               value,
                                               mask,
                                               torch.tensor(
                                                   seq_lens_tensor_cpu,
                                                   dtype=torch.int32),
                                               scale,
                                               layer.num_heads,
                                               layer.num_kv_heads,
                                               out=output)
            else:
                key = key.view(key_shape)
                value = value.view(value_shape)
                query = query.view(query_shape)
                output = torch.empty_like(query,
                                          dtype=query.dtype).to(query.device)
                torch_npu._npu_flash_attention(query,
                                               key,
                                               value,
                                               mask,
                                               torch.tensor(
                                                   seq_lens_tensor_cpu,
                                                   dtype=torch.int32),
                                               scale,
                                               layer.num_heads,
                                               layer.num_kv_heads,
                                               out=output)

        else:
            if key_cache is None:
                raise ValueError(
                    "KV cache can't be None in the decoding phase. Please check."
                )
            # Decode: quantize Q/K/V, update the paged KV cache, and run the
            # quantized paged attention kernel.
            query_int8 = quant_per_tensor(query, layer.fa_qscale,
                                          layer.fa_qoffset, True)
            key_int8 = quant_per_tensor(key, layer.fa_kscale, layer.fa_koffset,
                                        True)
            value_int8 = quant_per_tensor(value, layer.fa_vscale,
                                          layer.fa_voffset, True)
            key_int8 = key_int8.view(key_shape)
            value_int8 = value_int8.view(value_shape)
            query = query.view(query_shape)
            query_int8 = query_int8.view(query_shape)
            torch_npu._npu_reshape_and_cache(key_int8, value_int8, key_cache,
                                             value_cache, slots)
            if output is not None:
                output = output.view(query.shape)
                torch_npu._npu_paged_attention_quant(
                    query_int8, key_cache, value_cache, layer.num_kv_heads,
                    layer.num_heads, scale, block_tables,
                    torch.tensor(seq_lens_tensor_cpu, dtype=torch.int32),
                    TYPE_QUANT_QKV_ONLINE, SRC_DTYPE_TO_ACL_DTYPE[query.dtype],
                    layer.qk_scale, layer.fa3_v_scale, output)
            else:
                output = torch.empty_like(query,
                                          dtype=query.dtype).to(query.device)
                torch_npu._npu_paged_attention_quant(
                    query_int8, key_cache, value_cache, layer.num_kv_heads,
                    layer.num_heads, scale, block_tables,
                    torch.tensor(seq_lens_tensor_cpu, dtype=torch.int32),
                    TYPE_QUANT_QKV_ONLINE, SRC_DTYPE_TO_ACL_DTYPE[query.dtype],
                    layer.qk_scale, layer.fa3_v_scale, output)

        # Merge the head dimensions back into a single hidden dimension.
        output = torch.flatten(output, start_dim=-2)
        return output

    @classmethod
    def create_weights(cls, layer: torch.nn.Module) -> None:
        extra_module_names = cls.get_extra_module_names()
        for name in extra_module_names:
            setattr(layer, name, torch.nn.Module())

        params_dtype = torch.get_default_dtype()

        # Per-head scale/offset placeholders for Q/K/V; the real values are
        # loaded from the quantized checkpoint.
        params_dict = {}

        params_dict["fa_q.scale"] = torch.empty((layer.num_heads, 1),
                                                dtype=params_dtype)
        params_dict["fa_q.offset"] = torch.empty((layer.num_heads, 1),
                                                 dtype=torch.int8)
        params_dict["fa_k.scale"] = torch.empty((layer.num_kv_heads, 1),
                                                dtype=params_dtype)
        params_dict["fa_k.offset"] = torch.empty((layer.num_kv_heads, 1),
                                                 dtype=torch.int8)
        params_dict["fa_v.scale"] = torch.empty((layer.num_kv_heads, 1),
                                                dtype=params_dtype)
        params_dict["fa_v.offset"] = torch.empty((layer.num_kv_heads, 1),
                                                 dtype=torch.int8)

        for name, weight in params_dict.items():
            module_name, weight_name = name.split('.')
            module = getattr(layer, module_name)
            module.register_parameter(
                weight_name, torch.nn.Parameter(weight, requires_grad=False))
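
quant_per_tensor, TYPE_QUANT_QKV_ONLINE, and SRC_DTYPE_TO_ACL_DTYPE come from quant_utils.py, which is not part of the hunks shown here. As a rough mental model only, the sketch below pairs a hypothetical per-tensor int8 quantization helper (quant_per_tensor_sketch is an assumed stand-in, not the real Ascend implementation) with the same GQA scale expansion that process_weights_after_loading performs; the head counts and head size are made up for illustration.

import torch

# Hypothetical stand-in for quant_utils.quant_per_tensor: asymmetric int8
# quantization with a scale and zero-point offset (assumption only; the real
# Ascend kernel may differ in rounding, broadcasting, and layout).
def quant_per_tensor_sketch(x: torch.Tensor, scale: torch.Tensor,
                            offset: torch.Tensor) -> torch.Tensor:
    q = torch.round(x / scale + offset)
    return q.clamp(-128, 127).to(torch.int8)


# Hypothetical GQA setup: 8 query heads sharing 2 KV heads, head size 64.
num_heads, num_kv_heads, head_dim = 8, 2, 64
fa_qscale = torch.rand(num_heads, 1)
fa_kscale = torch.rand(num_kv_heads, 1)
fa_vscale = torch.rand(num_kv_heads, 1)

# Same expansion as process_weights_after_loading: repeat the per-KV-head
# scales across the GQA group so each query head gets a matching scale.
gqa_size = fa_qscale.shape[0] // fa_kscale.shape[0]        # 4 query heads per KV head
fa3_k_scale = fa_kscale.repeat(1, gqa_size).view(-1, 1)    # (2, 1) -> (8, 1)
fa3_v_scale = fa_vscale.repeat(1, gqa_size).view(-1, 1)
qk_scale = torch.squeeze(fa_qscale * fa3_k_scale).to(torch.float)  # shape (8,)

# Quantize a fake flattened key tensor the way apply() flattens K before caching.
key = torch.randn(4, num_kv_heads * head_dim)              # 4 tokens
key_int8 = quant_per_tensor_sketch(key,
                                   fa_kscale.repeat(1, head_dim).view(1, -1),
                                   torch.zeros(1))
print(key_int8.dtype, fa3_k_scale.shape, qk_scale.shape)

Printing the shapes shows the (num_kv_heads, 1) K scale becoming a (num_heads, 1) fa3_k_scale and a flat (num_heads,) qk_scale, matching the shapes registered on the layer above.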

vllm_ascend/quantization/quant_config.py

Lines changed: 9 additions & 9 deletions
@@ -42,7 +42,7 @@
 from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
 from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD
 
-from .quantizer import AscendQuantizer
+from .quantizer import VLLMAscendQuantizer
 
 
 @register_quantization_config(ASCEND_QUATIZATION_METHOD)
@@ -151,7 +151,7 @@ def get_scaled_act_names(self) -> List[str]:
 class AscendLinearMethod(LinearMethodBase):
     """Linear method for Ascend quantization.
 
-    This class calls AscendQuantizer to search a specific quantization
+    This class calls VLLMAscendQuantizer to search a specific quantization
     implementations supported on ascend hardware for linear methods.
 
     Args:
@@ -160,7 +160,7 @@ class AscendLinearMethod(LinearMethodBase):
 
     def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                  packed_modules_mapping: Dict[str, Any]) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
+        self.quantizer = VLLMAscendQuantizer.get_quantizer(
             quant_config.quant_description, prefix, packed_modules_mapping)
         self.quant_method = self.quantizer.build_linear_method()
 
@@ -232,15 +232,15 @@ def apply(
 class AscendKVCacheMethod(BaseKVCacheMethod):
     """KVCache method for Ascend quantization.
 
-    This class calls AscendQuantizer to search a specific quantization
+    This class calls VLLMAscendQuantizer to search a specific quantization
     implementations supported on ascend hardware for kvcache methods.
 
     Args:
         quant_config: The Ascend quantization config.
     """
 
     def __init__(self, quant_config: AscendQuantConfig, prefix: str) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
+        self.quantizer = VLLMAscendQuantizer.get_quantizer(
             quant_config.quant_description, prefix)
         self.quant_method = self.quantizer.build_attention_method()
 
@@ -285,7 +285,7 @@ def apply(self,
 class AscendFusedMoEMethod(FusedMoEMethodBase):
     """FusedMoE method for Ascend quantization.
 
-    This class calls AscendQuantizer to search a specific quantization
+    This class calls VLLMAscendQuantizer to search a specific quantization
     implementations supported on ascend hardware for kvcache methods.
 
     Args:
@@ -294,7 +294,7 @@ class AscendFusedMoEMethod(FusedMoEMethodBase):
 
     def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                  packed_modules_mapping: Dict[str, Any]):
-        self.quantizer = AscendQuantizer.get_quantizer(
+        self.quantizer = VLLMAscendQuantizer.get_quantizer(
             quant_config.quant_description, prefix, packed_modules_mapping)
         self.quant_method = self.quantizer.build_moe_method()
 
@@ -365,7 +365,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
 class AscendEmbeddingMethod(AscendLinearMethod):
     """Embedding method for Ascend quantization.
 
-    This class calls AscendQuantizer to search a specific quantization
+    This class calls VLLMAscendQuantizer to search a specific quantization
     implementations supported on ascend hardware for Embedding methods.
 
     Args:
@@ -374,6 +374,6 @@ class AscendEmbeddingMethod(AscendLinearMethod):
 
     def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                  packed_modules_mapping: Dict[str, Any]) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
+        self.quantizer = VLLMAscendQuantizer.get_quantizer(
             quant_config.quant_description, prefix, packed_modules_mapping)
         self.quant_method = self.quantizer.build_linear_method()
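
All of the wrapper classes touched in this file follow the same pattern after the rename: ask VLLMAscendQuantizer.get_quantizer() for a quantizer matching the layer's quant description and prefix, then build the concrete linear/attention/MoE method from it. Below is a minimal sketch of that dispatch shape using toy stand-ins; ToyQuantizer and ToyLinearMethod are hypothetical, and only the get_quantizer()/build_*_method() call pattern mirrors this diff.

from typing import Any, Dict, Optional


class ToyLinearMethod:
    """Hypothetical stand-in for a concrete quantized linear method."""

    def apply(self, x):
        # A real method would run the quantized matmul on the NPU here.
        return x


class ToyQuantizer:
    """Hypothetical stand-in for VLLMAscendQuantizer."""

    @staticmethod
    def get_quantizer(quant_description: Dict[str, Any],
                      prefix: str,
                      packed_modules_mapping: Optional[Dict[str, Any]] = None):
        # A real quantizer would inspect quant_description for this prefix
        # (e.g. W8A8 vs. FAQuant) and return a matching implementation.
        return ToyQuantizer()

    def build_linear_method(self):
        return ToyLinearMethod()


quantizer = ToyQuantizer.get_quantizer({"quant_method": "W8A8"},
                                       prefix="model.layers.0.mlp.down_proj",
                                       packed_modules_mapping={})
quant_method = quantizer.build_linear_method()
print(type(quant_method).__name__)  # ToyLinearMethod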
