Commit be4dc1a

add Qwen3 and fix lint

Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
1 parent bcebcc6 commit be4dc1a

File tree: 6 files changed, +21 -18 lines

vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py

Lines changed: 7 additions & 5 deletions
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+
 import torch
-from typing import Optional
+
 from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
     calculate_tile_tokens_dim,
@@ -23,16 +24,17 @@ def flashinfer_fused_moe_blockscale_fp8(
     w2_weight_scale_inv: torch.Tensor,
     global_num_experts: int,
     top_k: int,
-    num_expert_group: Optional[int],
-    topk_group: Optional[int],
+    num_expert_group: int | None,
+    topk_group: int | None,
     intermediate_size: int,
     expert_offset: int,
     local_num_experts: int,
     block_shape: list[int],
-    routed_scaling: Optional[float] = 1.0,
+    routed_scaling: float | None = 1.0,
     routing_method_type: int = 2,
 ) -> torch.Tensor:
     from vllm.utils.flashinfer import flashinfer_trtllm_fp8_block_scale_moe
+
     topk_group = topk_group if topk_group is not None else 0
     assert top_k <= global_num_experts
     assert top_k <= 10
@@ -49,7 +51,7 @@ def flashinfer_fused_moe_blockscale_fp8(
     # NOTE: scales of hidden states have to be transposed!
     a_sf_t = a_sf.t().contiguous()
     return flashinfer_trtllm_fp8_block_scale_moe(
-        routing_logits=routing_logits,
+        routing_logits=routing_logits,
         routing_bias=routing_bias,
         hidden_states=a_q,
         hidden_states_scale=a_sf_t,
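The signature changes in this file swap typing.Optional for the PEP 604 union syntax available on Python 3.10+. A minimal sketch of the equivalence, using a hypothetical helper rather than code from this file:

def clamp_topk_group(topk_group: int | None) -> int:
    # Same None handling as the wrapper above: treat a missing group count as 0.
    return topk_group if topk_group is not None else 0

# Pre-3.10 spelling of the identical annotation:
# from typing import Optional
# def clamp_topk_group(topk_group: Optional[int]) -> int: ...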

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 12 additions & 10 deletions
@@ -160,14 +160,14 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
         logger.info_once("Using DeepGEMM backend for FP8 MoE")
         return Fp8MoeBackend.DEEPGEMM
 
-    # # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
-    # if (
-    #     current_platform.is_cuda()
-    #     and current_platform.is_device_capability(100)
-    #     and block_quant
-    # ):
-    #     logger.info_once("Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE")
-    #     return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
+    # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
+    if (
+        current_platform.is_cuda()
+        and current_platform.is_device_capability(100)
+        and block_quant
+    ):
+        logger.info_once("Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE")
+        return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
 
     # default to Triton
     logger.info_once("Using Triton backend for FP8 MoE")
@@ -1230,7 +1230,9 @@ def apply(
         )
         routing_method_type = getattr(layer, "routing_method_type", 2)
         return torch.ops.vllm.flashinfer_fused_moe_blockscale_fp8(
-            routing_logits=router_logits.to(torch.float32) if routing_method_type == 2 else router_logits,
+            routing_logits=router_logits.to(torch.float32)
+            if routing_method_type == 2
+            else router_logits,
             routing_bias=e_score_correction_bias,
             x=x,
             w13_weight=layer.w13_weight,
@@ -1244,7 +1246,7 @@ def apply(
             intermediate_size=layer.intermediate_size_per_partition,
             expert_offset=layer.ep_rank * layer.local_num_experts,
             local_num_experts=layer.local_num_experts,
-            block_shape=self.weight_block_size,
+            block_shape=self.weight_block_size,
             routed_scaling=routed_scaling_factor,
             routing_method_type=routing_method_type,
         )
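The reflowed routing_logits argument in the first hunk of apply() encodes a small dispatch: the logits are cast to float32 only when routing_method_type is 2, otherwise they pass through unchanged. A stand-alone restatement of that expression; the helper name is illustrative and not part of vLLM:

import torch

def prepare_routing_logits(
    router_logits: torch.Tensor, routing_method_type: int
) -> torch.Tensor:
    # Mirrors the call-site expression: cast only for routing method type 2.
    return (
        router_logits.to(torch.float32)
        if routing_method_type == 2
        else router_logits
    )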

vllm/model_executor/layers/quantization/modelopt.py

Lines changed: 0 additions & 1 deletion
@@ -1349,7 +1349,6 @@ def prepare_static_weights_for_trtllm_fp4_moe(
     ):
         from flashinfer import nvfp4_block_scale_interleave
         from flashinfer.fused_moe.core import (
-            get_w2_permute_indices_with_cache,
             _maybe_get_cached_w3_w1_permute_indices,
             get_w2_permute_indices_with_cache,
         )
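The modelopt.py change only drops a name that was listed twice inside a single from-import, which linters report as a redundant redefinition. A minimal repro of the pattern, using a standard-library module purely for illustration:

# Before: the same name appears twice in one import statement.
from math import (
    floor,
    sqrt,
    sqrt,  # duplicate; the linter flags this redefinition
)

# After: each imported name appears once.
from math import floor, sqrt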

vllm/model_executor/layers/quantization/utils/flashinfer_utils.py

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ class FlashinferMoeBackend(Enum):
 
 def calculate_tile_tokens_dim(num_tokens, top_k, num_experts):
     from flashinfer import next_positive_power_of_2
+
     # FlashInfer 0.2.10 has issues with larger tile sizes. Set to 8 for now.
     # TODO: Revert this to dynamic calculation once a new version of FlashInfer
     # with the necessary kernels is released.
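For context on the comment being edited here: the function currently pins the tile size to 8 because of the FlashInfer 0.2.10 limitation. The sketch below shows roughly what the dynamic calculation mentioned in the TODO could look like; the exact formula and clamping range are assumptions, and next_positive_power_of_2 is reimplemented locally so the snippet stands alone:

def calculate_tile_tokens_dim_sketch(
    num_tokens: int, top_k: int, num_experts: int
) -> int:
    def next_positive_power_of_2(x: int) -> int:
        # Local stand-in for flashinfer.next_positive_power_of_2.
        return 1 if x < 1 else 1 << (x - 1).bit_length()

    # Estimate how many tokens land on each expert, round up to a power of
    # two, and clamp to a kernel-friendly range (assumed bounds).
    tokens_per_expert = max(1, (num_tokens * top_k) // num_experts)
    return min(max(next_positive_power_of_2(tokens_per_expert), 8), 64)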

vllm/model_executor/models/qwen3_moe.py

Lines changed: 0 additions & 1 deletion
@@ -44,7 +44,6 @@
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
-
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,

vllm/model_executor/models/qwen3_next.py

Lines changed: 1 addition & 1 deletion
@@ -885,7 +885,7 @@ def forward(
         residual: torch.Tensor | None,
         positions: torch.Tensor = None,
         **kwargs: object,
-    ):
+    ):
         if residual is None:
             residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
