 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
-from .inductor_pass import enable_fake_mode
 from .fusion import empty_bf16, empty_i64
+from .inductor_pass import enable_fake_mode
 from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
 
 logger = init_logger(__name__)
@@ -93,10 +93,12 @@ def pattern(
             v = operator.getitem(split_tuple, 2)
 
             # Q path: view -> (optional contiguous) -> RMS -> view back to q.shape
-            # q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
+            # q_by_head= q.view(*q.shape[:-1],q.shape[-1]// self.head_dim,self.head_dim)
             # q_out = torch.empty_like(q_by_head)
             # q_by_head_contiguous = q_by_head.contiguous()
-            q_by_head = VIEW_OP(q, (*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim))
+            q_by_head = VIEW_OP(
+                q, (*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
+            )
             q_out = EMPTY_LIKE_OP(q_by_head)
             q_by_head_contiguous = CONTIGUOUS_OP(q_by_head)
 
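Note on the traced Q path above: the unfused reference reshapes the flat q projection into per-head vectors, RMS-normalizes each head, and views back to the original shape. Below is a minimal eager-mode sketch of that shape manipulation, with made-up sizes and plain torch ops standing in for VIEW_OP and the RMS custom op (illustrative only, not the pass's code):

import torch

# Hypothetical sizes; the real pass reads head_dim and q_size from each attention layer.
T, num_heads, head_dim = 5, 8, 64
eps = 1e-6
q = torch.randn(T, num_heads * head_dim, dtype=torch.bfloat16)
weight = torch.ones(head_dim, dtype=torch.bfloat16)

# view -> RMS-normalize each head -> view back to q.shape
q_by_head = q.view(*q.shape[:-1], q.shape[-1] // head_dim, head_dim)
variance = q_by_head.float().pow(2).mean(dim=-1, keepdim=True)
q_normed_by_head = (q_by_head.float() * torch.rsqrt(variance + eps)).to(q.dtype) * weight
q_flat = q_normed_by_head.view(q.shape)
assert q_flat.shape == q.shape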
@@ -113,10 +115,12 @@ def pattern(
             q_flat = VIEW_OP(q_normed_by_head, q.shape)
 
             # K path: view -> (optional contiguous) -> RMS -> view back to k.shape
-            # k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
+            # k_by_head= k.view(*k.shape[:-1],k.shape[-1]// self.head_dim,self.head_dim)
             # k_out = torch.empty_like(k_by_head)
             # k_by_head_contiguous = k_by_head.contiguous()
-            k_by_head = VIEW_OP(k, (*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim))
+            k_by_head = VIEW_OP(
+                k, (*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
+            )
             k_out = EMPTY_LIKE_OP(k_by_head)
             k_by_head_contiguous = CONTIGUOUS_OP(k_by_head)
             kn = auto_functionalized(
@@ -130,7 +134,7 @@ def pattern(
 
             # k_flat = k_normed_by_head.view(k.shape)
             k_flat = VIEW_OP(k_normed_by_head, k.shape)
-
+
             # RoPE: apply to flattened q/k
             rope = auto_functionalized(
                 self.rope_op,
@@ -143,7 +147,6 @@ def pattern(
             )
             return rope[1], rope[2], v
 
-
         def replacement(
             qkv: torch.Tensor,
             positions: torch.Tensor,
@@ -155,7 +158,7 @@ def replacement(
             pos_flat = RESHAPE_OP(positions, [-1])
 
             # Run fused op (mutates qkv)
-            auto_functionalized(
+            result = auto_functionalized(
                 FUSED_QK_ROPE_OP,
                 qkv=qkv,
                 num_heads_q=self.num_heads,
@@ -169,18 +172,19 @@ def replacement(
                 is_neox=self.is_neox,
                 position_ids=pos_flat,
             )
+            result_qkv = result[1]
 
             # Split back to q,k,v and return
             split_tuple = SPLIT_SIZES_OP(
-                qkv, [self.q_size, self.kv_size, self.kv_size], -1
+                result_qkv, [self.q_size, self.kv_size, self.kv_size], -1
             )
             return (
                 operator.getitem(split_tuple, 0),
                 operator.getitem(split_tuple, 1),
                 operator.getitem(split_tuple, 2),
             )
 
-        # Sample inputs to help pattern tracing (sizes don't have to be exact at runtime)
+        # Sample inputs to help pattern tracing
         T = 5
         qkv = empty_bf16(T, self.q_size + 2 * self.kv_size)
         positions = empty_i64(T)
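The replacement above keeps the same output interface as the pattern: the fused op mutates qkv (exposed as result[1] by auto_functionalized), and the result is split back into q, k, v along the last dimension. A tiny sketch of that split with made-up sizes, using torch.split in place of the graph-level SPLIT_SIZES_OP:

import torch

# Hypothetical per-shard sizes; the pass uses the layer's q_size and kv_size.
q_size, kv_size, T = 512, 128, 5
qkv = torch.randn(T, q_size + 2 * kv_size)
q, k, v = torch.split(qkv, [q_size, kv_size, kv_size], dim=-1)
assert q.shape == (T, q_size) and k.shape == (T, kv_size) and v.shape == (T, kv_size)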
@@ -229,9 +233,7 @@ def __init__(self, config: VllmConfig):
         )
 
         if not current_platform.is_cuda_alike():
-            logger.debug(
-                "QK Norm+RoPE fusion not enabled: unsupported platform"
-            )
+            logger.debug("QK Norm+RoPE fusion not enabled: unsupported platform")
             return
 
         # Register a pattern per attention layer, as sizes differ by shard
@@ -255,7 +257,8 @@ def __init__(self, config: VllmConfig):
                 ).register(self.patterns)
             except Exception as e:
                 logger.debug(
-                    "Skipping QkNormRopePattern registration with eps=%s is_neox=%s: %s",
+                    "Skipping QkNormRopePattern register with eps=%s "
+                    "is_neox=%s: %s",
                     epsilon,
                     neox,
                     e,