2 changes: 2 additions & 0 deletions vllm_ascend/envs.py
@@ -66,6 +66,8 @@
lambda: os.getenv("C_COMPILER", None),
"VLLM_VERSION":
lambda: os.getenv("VLLM_VERSION", None),
"VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH":
lambda: int(os.getenv("VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH", "8192")),
}

# end-env-vars-definition
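For context, a minimal sketch of how the new setting behaves when read (the 8192-token default comes from the entry above; the override value below is only an example):

import os

# With the variable unset, the lambda in envs.py falls back to 8192 tokens.
print(int(os.getenv("VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH", "8192")))

# An override set before vLLM-Ascend reads its envs changes the chunk length
# used by fused_experts; "4096" here is an arbitrary illustration.
os.environ["VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH"] = "4096"
print(int(os.getenv("VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH", "8192")))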
28 changes: 19 additions & 9 deletions vllm_ascend/quantization/w8a8_dynamic.py
@@ -27,6 +27,7 @@
from vllm_ascend.ops.fused_moe import select_experts

VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2
VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH: int = envs_ascend.VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH


def apply_mlp(hidden_states_wrapper: List[torch.Tensor],
@@ -636,15 +637,24 @@ def apply(
expert_map=expert_map,
moe_all_to_all_group_name=self.moe_all_to_all_group_name)
elif dp_size == 1:
return fused_experts(hidden_states=x,
w1=layer.w13_weight,
w1_scale=layer.w13_weight_scale,
w2=layer.w2_weight,
w2_scale=layer.w2_weight_scale,
topk_weights=topk_weights,
topk_ids=topk_ids,
top_k=top_k,
expert_map=expert_map)
x_list = x.split(VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH)
topk_weights_list = topk_weights.split(
VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH)
topk_ids_list = topk_ids.split(VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH)
final_hidden_states_list = []
for i in range(len(x_list)):
final_hidden_states = fused_experts(
hidden_states=x_list[i],
w1=layer.w13_weight,
w1_scale=layer.w13_weight_scale,
w2=layer.w2_weight,
w2_scale=layer.w2_weight_scale,
topk_weights=topk_weights_list[i],
topk_ids=topk_ids_list[i],
top_k=top_k,
expert_map=expert_map)
final_hidden_states_list.append(final_hidden_states)
return torch.concat(final_hidden_states_list)
else:
return fused_experts_with_all2all(hidden_states=x,
w1=layer.w13_weight,

Collaborator comment (on the fused_experts call above): Do you have any assessment of this PR's performance? It might bring a substantial performance regression, since this change can lead to repeated weight accesses from HBM.

Collaborator comment: Maybe the env var VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH could be changed to something like ENABLE_FUSED_EXPERTS_SEQ_SPLIT to keep backward compatibility.
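To illustrate the splitting scheme above in isolation (a shape-level sketch, not the vLLM-Ascend kernel path: dummy_fused_experts below is a hypothetical stand-in for the real fused_experts call), chunking along the token dimension and concatenating the partial outputs preserves the result shape:

import torch

SEQ_SPLIT_LENGTH = 8192  # mirrors the VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH default

def dummy_fused_experts(hidden_states, topk_weights, topk_ids):
    # Hypothetical stand-in: the real kernel dispatches tokens to experts;
    # here we only produce an output with the same shape as the input.
    return hidden_states * topk_weights.sum(dim=-1, keepdim=True)

def chunked_fused_experts(hidden_states, topk_weights, topk_ids,
                          split_length=SEQ_SPLIT_LENGTH):
    # Split every per-token tensor into chunks of at most split_length rows,
    # run the expert computation chunk by chunk, then stitch the pieces back.
    outputs = []
    for h, w, ids in zip(hidden_states.split(split_length),
                         topk_weights.split(split_length),
                         topk_ids.split(split_length)):
        outputs.append(dummy_fused_experts(h, w, ids))
    return torch.concat(outputs)

x = torch.randn(20000, 128)            # 20000 tokens -> chunks of 8192/8192/3616
weights = torch.rand(20000, 8)
ids = torch.randint(0, 64, (20000, 8))
assert chunked_fused_experts(x, weights, ids).shape == x.shape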
9 changes: 5 additions & 4 deletions vllm_ascend/worker/model_runner_v1.py
@@ -219,10 +219,11 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
device="cpu",
pin_memory=True)

self.inputs_embeds = torch.zeros(
(self.max_num_tokens, self.hidden_size),
dtype=self.dtype,
device=self.device)
if self.is_multimodal_model:
self.inputs_embeds = torch.zeros(
(self.max_num_tokens, self.hidden_size),
dtype=self.dtype,
device=self.device)

# OPTIMIZATION: Cache the tensors rather than creating them every step.
self.arange_np: npt.NDArray[np.int32] = np.arange(max(
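A back-of-the-envelope look at what skipping the preallocation saves for text-only models (the sizes below are made-up examples, not values taken from the diff):

import torch

max_num_tokens, hidden_size = 8192, 4096   # example sizes only
dtype = torch.bfloat16

# This buffer used to be allocated unconditionally; after the change it is
# created only when is_multimodal_model is True.
saved_bytes = max_num_tokens * hidden_size * torch.finfo(dtype).bits // 8
print(f"skipped preallocation: {saved_bytes / 2**20:.0f} MiB")  # 64 MiB here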