@@ -9,6 +9,7 @@
 import pytest
 import torch
 import torch.nn as nn
+import torch_npu
 
 from vllm_ascend.utils import enable_custom_op
 
@@ -198,3 +199,72 @@ def test_rotary_embedding_quant_with_leading_dim(
                                ref_key,
                                atol=DEFAULT_ATOL,
                                rtol=DEFAULT_RTOL)
+
+
+# Test npu_apply_rotary_pos_emb with head_size == rotary_dim == 128 and is_neox_style=True.
+@pytest.mark.parametrize("is_neox_style", [True])
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("seq_len", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", [128])
+@pytest.mark.parametrize("rotary_dim", [128])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", DEVICES)
+@torch.inference_mode()
+def test_npu_apply_rotary_pos_emb_with_head_size_equals_rotary_dim(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: int = 10000,
+) -> None:
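+    """Check torch_npu.npu_apply_rotary_pos_emb against forward_native."""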
+    torch.set_default_device(device)
+    torch.manual_seed(seed)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    rope = RotaryEmbedding(head_size, rotary_dim, max_position, base,
+                           is_neox_style, dtype)
+    rope = rope.to(dtype=dtype)
+    num_tokens = batch_size * seq_len
+    positions = torch.randint(0, max_position, (num_tokens, ))
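+    # Pack random q/k/v into one tensor; only query and key are used here.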
+    qkv_tensor = torch.randn(1,
+                             num_tokens,
+                             num_heads,
+                             head_size * 3,
+                             dtype=dtype)
+    query, key, _ = qkv_tensor.split(
+        [head_size, head_size, head_size],
+        dim=-1,
+    )
+
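+    # Reference rotation from the native (pure PyTorch) implementation.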
+    ref_query, ref_key = rope.forward_native(positions, query, key)
+    cos_sin = rope.cos_sin_cache.index_select(0, positions)
+    last_dim = cos_sin.size()[-1]
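+    # The cache packs [cos | sin] halves; repeat each to the full rotary width.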
+    cos, sin = cos_sin.reshape(-1, 2, last_dim // 2).repeat(1, 1,
+                                                            2).chunk(2, dim=-2)
+    # View as BSNH (batch, seq, num_heads, head_size) so cos/sin broadcast over heads.
+    cos, sin = cos.view(1, -1, 1, last_dim).contiguous(), sin.view(
+        1, -1, 1, last_dim).contiguous()
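+    # The NPU op rotates query and key in place.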
+    torch_npu.npu_apply_rotary_pos_emb(query, key, cos, sin)
+
+    # Compare the results.
+    torch.testing.assert_close(query.view(ref_query.size()),
+                               ref_query,
+                               atol=DEFAULT_ATOL,
+                               rtol=DEFAULT_RTOL)
+    torch.testing.assert_close(key.view(ref_key.size()),
+                               ref_key,
+                               atol=DEFAULT_ATOL,
+                               rtol=DEFAULT_RTOL)