
Commit bcebcc6

update work
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
1 parent 6edee14 commit bcebcc6

File tree: 5 files changed (+36 −15 lines)

vllm/model_executor/layers/fused_moe/config.py

Lines changed: 4 additions & 2 deletions
@@ -104,10 +104,12 @@ class RoutingMethodType(IntEnum):
     DeepSeekV3 = (2,)
     # Llama4: Top1 -> Sigmoid
     Llama4 = (3,)
-    # Qwen3: Softmax -> TopK -> Renormalize
+    # RenormalizeNaive: Softmax -> TopK -> Renormalize
     RenormalizeNaive = (4,)
+    # TopK: TopK (no softmax)
+    TopK = (5,)
     # Unspecified
-    Unspecified = 5.0
+    Unspecified = 6.0
 
 
 @dataclass
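
For orientation, here is a minimal PyTorch sketch of what the two routing comments above describe: RenormalizeNaive applies Softmax -> TopK -> Renormalize, while TopK selects experts from the raw logits with no softmax. The helper names are hypothetical and this is only an illustrative reading of the enum comments, not the fused-MoE kernel implementation.

    import torch

    def route_renormalize_naive(router_logits: torch.Tensor, top_k: int):
        # Softmax over experts, keep the top-k probabilities, then renormalize
        # the selected weights so they sum to 1 per token.
        probs = torch.softmax(router_logits, dim=-1)
        topk_weights, topk_ids = torch.topk(probs, top_k, dim=-1)
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
        return topk_weights, topk_ids

    def route_topk(router_logits: torch.Tensor, top_k: int):
        # Plain TopK on the raw logits, no softmax (RoutingMethodType.TopK above).
        return torch.topk(router_logits, top_k, dim=-1)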

vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ def flashinfer_fused_moe_blockscale_fp8(
     # NOTE: scales of hidden states have to be transposed!
     a_sf_t = a_sf.t().contiguous()
     return flashinfer_trtllm_fp8_block_scale_moe(
-        routing_logits=routing_logits,
+        routing_logits=routing_logits,
         routing_bias=routing_bias,
         hidden_states=a_q,
         hidden_states_scale=a_sf_t,

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 12 additions & 9 deletions
@@ -160,14 +160,14 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
         logger.info_once("Using DeepGEMM backend for FP8 MoE")
         return Fp8MoeBackend.DEEPGEMM
 
-    # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
-    if (
-        current_platform.is_cuda()
-        and current_platform.is_device_capability(100)
-        and block_quant
-    ):
-        logger.info_once("Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE")
-        return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
+    # # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
+    # if (
+    #     current_platform.is_cuda()
+    #     and current_platform.is_device_capability(100)
+    #     and block_quant
+    # ):
+    #     logger.info_once("Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE")
+    #     return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
 
     # default to Triton
     logger.info_once("Using Triton backend for FP8 MoE")

@@ -1294,7 +1294,10 @@ def apply(
         # can override fused_experts or cutlass but not rocm or marlin.
         #
         topk_weights, topk_ids, zero_expert_result = select_result
-
+        # if (topk_ids.shape[0] <100):
+        #     print("=== MoE Routing Results ===")
+        #     print(f"topk_ids: {topk_ids}")
+        #     print(f"topk_weights: {topk_weights}")
         if self.rocm_aiter_moe_enabled:
             from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
                 rocm_aiter_fused_experts,
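
For context, a condensed sketch of the backend-selection order this hunk leaves in place: DeepGEMM when available, and with the CUTLASS branch commented out, SM100 block-quant cases fall through to Triton. The enum below is a simplified stand-in; the TRITON member name and the deep_gemm_available flag are assumptions for illustration, not vLLM's exact API.

    from enum import Enum

    class Fp8MoeBackendSketch(Enum):  # simplified stand-in for vLLM's Fp8MoeBackend
        DEEPGEMM = "deepgemm"
        CUTLASS_BLOCK_SCALED_GROUPED_GEMM = "cutlass"
        TRITON = "triton"

    def pick_fp8_moe_backend(deep_gemm_available: bool, is_sm100: bool, block_quant: bool):
        if deep_gemm_available:
            return Fp8MoeBackendSketch.DEEPGEMM
        # The CUTLASS branch is commented out above, so SM100 + block-quant
        # no longer short-circuits here and falls through to the default.
        # if is_sm100 and block_quant:
        #     return Fp8MoeBackendSketch.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
        return Fp8MoeBackendSketch.TRITON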

vllm/model_executor/models/qwen3_moe.py

Lines changed: 3 additions & 0 deletions
@@ -43,6 +43,8 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
+
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,

@@ -171,6 +173,7 @@ def __init__(
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
             is_sequence_parallel=self.is_sequence_parallel,
+            routing_method_type=RoutingMethodType.RenormalizeNaive,
         )
 
         self.gate = ReplicatedLinear(
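
Tying this to the config.py change, a hypothetical dispatch table pairing the enum members with the toy routing helpers sketched earlier; FusedMoE does its routing inside the fused kernels, so this mapping is purely illustrative.

    from vllm.model_executor.layers.fused_moe.config import RoutingMethodType

    # Hypothetical mapping onto the toy routing helpers from the config.py sketch above.
    ROUTING_FNS = {
        RoutingMethodType.RenormalizeNaive: route_renormalize_naive,  # Softmax -> TopK -> Renormalize
        RoutingMethodType.TopK: route_topk,                           # plain TopK, no softmax
    }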

vllm/model_executor/models/qwen3_next.py

Lines changed: 16 additions & 3 deletions
@@ -101,6 +101,7 @@ class Qwen3NextSparseMoeBlock(nn.Module):
     def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
+        self.prefix_print = prefix
         config = vllm_config.model_config.hf_config
         parallel_config = vllm_config.parallel_config
         quant_config = vllm_config.quant_config

@@ -172,18 +173,18 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
             is_sequence_parallel=self.is_sequence_parallel,
-            routing_method_type=RoutingMethodType.Renormalize,
+            routing_method_type=RoutingMethodType.RenormalizeNaive,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
         orig_shape = hidden_states.shape
         num_tokens, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
-
         if self.is_sequence_parallel:
             hidden_states = sequence_parallel_chunk(hidden_states)
 
+<<<<<<< HEAD
         if self.experts.is_internal_router:
             # In this case, the gate/router runs inside the FusedMoE class
             final_hidden_states = self.experts(

@@ -195,8 +196,19 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             final_hidden_states = self.experts(
                 hidden_states=hidden_states, router_logits=router_logits
             )
+=======
+        # print(self.prefix_print)
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
+>>>>>>> 9d88f1762 (update work)
 
         if self.shared_expert is not None:
+            # if ("model.layers.0." in self.prefix_print or "model.layers.1." in self.prefix_print or "model.layers.47." in self.prefix_print):
+            #     print(f"shared_expert: {final_hidden_states[0]}")
+            #     print(f"routed_expert: {final_hidden_states[1]}")
             final_hidden_states = final_hidden_states[0] + final_hidden_states[1]
 
         if self.is_sequence_parallel:

@@ -873,7 +885,7 @@ def forward(
         residual: torch.Tensor | None,
         positions: torch.Tensor = None,
         **kwargs: object,
-    ):
+    ):
         if residual is None:
             residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)

@@ -1004,6 +1016,7 @@ def forward(
                 {"hidden_states": hidden_states, "residual": residual}
             )
         hidden_states, _ = self.norm(hidden_states, residual)
+        # print("="*60)
         return hidden_states
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
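
For readability, a minimal sketch of the two forward paths that the conflicting hunks encode: on the HEAD side the gate/router runs inside the experts module, while the incoming side computes router_logits with an explicit gate and passes them in. The class below is a simplified stand-in, not the Qwen3NextSparseMoeBlock implementation.

    import torch
    import torch.nn as nn

    class SparseMoeForwardSketch(nn.Module):
        """Simplified stand-in for the two routing paths in the hunk above."""

        def __init__(self, gate: nn.Module, experts: nn.Module, internal_router: bool):
            super().__init__()
            self.gate = gate              # projects hidden states to per-expert logits
            self.experts = experts        # fused-MoE-like module
            self.internal_router = internal_router

        def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
            if self.internal_router:
                # HEAD side: the gate/router runs inside the experts module.
                return self.experts(hidden_states=hidden_states)
            # Incoming side: compute router_logits explicitly and pass them along.
            router_logits = self.gate(hidden_states)  # (num_tokens, n_experts)
            return self.experts(hidden_states=hidden_states, router_logits=router_logits)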
