From f7f00f3299e10ca345aece5cfcedcf3c0dac7279 Mon Sep 17 00:00:00 2001
From: ApsarasX
Date: Thu, 15 May 2025 15:26:54 +0000
Subject: [PATCH 1/2] [Perf] Reduce memory usage by splitting tokens in fused_experts

Signed-off-by: ApsarasX
---
 vllm_ascend/envs.py                      |  2 ++
 vllm_ascend/quantization/w8a8_dynamic.py | 28 ++++++++++++++++--------
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 8e1cc1c162..aba1d18e6f 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -66,6 +66,8 @@
     lambda: os.getenv("C_COMPILER", None),
     "VLLM_VERSION":
     lambda: os.getenv("VLLM_VERSION", None),
+    "VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH":
+    lambda: int(os.getenv("VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH", "8192")),
 }
 
 # end-env-vars-definition

diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py
index 5d2b442cf1..0d6c6d8f66 100644
--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -27,6 +27,7 @@
 from vllm_ascend.ops.fused_moe import select_experts
 
 VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2
+VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH: int = envs_ascend.VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH
 
 
 def apply_mlp(hidden_states_wrapper: List[torch.Tensor],
@@ -636,15 +637,24 @@ def apply(
                 expert_map=expert_map,
                 moe_all_to_all_group_name=self.moe_all_to_all_group_name)
         elif dp_size == 1:
-            return fused_experts(hidden_states=x,
-                                 w1=layer.w13_weight,
-                                 w1_scale=layer.w13_weight_scale,
-                                 w2=layer.w2_weight,
-                                 w2_scale=layer.w2_weight_scale,
-                                 topk_weights=topk_weights,
-                                 topk_ids=topk_ids,
-                                 top_k=top_k,
-                                 expert_map=expert_map)
+            x_list = x.split(VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH)
+            topk_weights_list = topk_weights.split(
+                VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH)
+            topk_ids_list = topk_ids.split(VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH)
+            final_hidden_states_list = []
+            for i in range(len(x_list)):
+                final_hidden_states = fused_experts(
+                    hidden_states=x_list[i],
+                    w1=layer.w13_weight,
+                    w1_scale=layer.w13_weight_scale,
+                    w2=layer.w2_weight,
+                    w2_scale=layer.w2_weight_scale,
+                    topk_weights=topk_weights_list[i],
+                    topk_ids=topk_ids_list[i],
+                    top_k=top_k,
+                    expert_map=expert_map)
+                final_hidden_states_list.append(final_hidden_states)
+            return torch.concat(final_hidden_states_list)
         else:
             return fused_experts_with_all2all(hidden_states=x,
                                               w1=layer.w13_weight,

From ab355abf0cc962fef94cba0bd3978233bfdbb523 Mon Sep 17 00:00:00 2001
From: ApsarasX
Date: Fri, 16 May 2025 06:05:34 +0000
Subject: [PATCH 2/2] [Perf] Reduce memory usage by avoiding unused tensor

Signed-off-by: ApsarasX
---
 vllm_ascend/worker/model_runner_v1.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 18037439c4..26faafc381 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -219,10 +219,11 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
                                          device="cpu",
                                          pin_memory=True)
 
-        self.inputs_embeds = torch.zeros(
-            (self.max_num_tokens, self.hidden_size),
-            dtype=self.dtype,
-            device=self.device)
+        if self.is_multimodal_model:
+            self.inputs_embeds = torch.zeros(
+                (self.max_num_tokens, self.hidden_size),
+                dtype=self.dtype,
+                device=self.device)
 
         # OPTIMIZATION: Cache the tensors rather than creating them every step.
         self.arange_np: npt.NDArray[np.int32] = np.arange(max(
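
PATCH 1/2 caps peak activation memory in the dp_size == 1 path by dispatching
fused_experts over token chunks of at most VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH
rows (default 8192) and concatenating the partial outputs. Below is a minimal
standalone sketch of that chunking pattern; chunked_dispatch and the toy
expert_fn are illustrative stand-ins, not code taken from the patch.

import torch

SPLIT_LENGTH = 8192  # mirrors the VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH default


def chunked_dispatch(hidden_states: torch.Tensor,
                     topk_weights: torch.Tensor,
                     topk_ids: torch.Tensor,
                     expert_fn,
                     split_length: int = SPLIT_LENGTH) -> torch.Tensor:
    """Run expert_fn over token chunks of at most split_length rows.

    Peak activation memory inside expert_fn scales with the chunk size
    instead of the full batch, at the cost of one launch per chunk.
    """
    outputs = []
    # Tensor.split slices along dim 0 and returns views, so the inputs are
    # not copied here; only the per-chunk activations are materialized.
    for h, w, ids in zip(hidden_states.split(split_length),
                         topk_weights.split(split_length),
                         topk_ids.split(split_length)):
        outputs.append(expert_fn(h, w, ids))
    return torch.cat(outputs)


if __name__ == "__main__":
    # Toy expert_fn: scale each token by the sum of its routing weights.
    def expert_fn(h, w, ids):
        return h * w.sum(dim=-1, keepdim=True)

    x = torch.randn(20000, 128)
    weights = torch.rand(20000, 8)
    ids = torch.randint(0, 64, (20000, 8))
    out = chunked_dispatch(x, weights, ids, expert_fn)
    assert out.shape == x.shape  # processed as 8192 + 8192 + 3616 tokens

With the default split length of 8192 and, for example, a 7168-wide hidden
state in bfloat16, each chunk's input stays around 112 MiB no matter how many
tokens the batch holds; only the expert workspaces for one chunk are live at a
time.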
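
PATCH 2/2 allocates the persistent inputs_embeds buffer only when the model is
multimodal, since text-only models never feed precomputed embeddings through
it. A minimal sketch of that conditional-allocation pattern follows; ToyRunner
and its constructor arguments are hypothetical and not the real model runner.

import torch


class ToyRunner:
    """Illustrates conditionally allocating a persistent embeddings buffer."""

    def __init__(self,
                 max_num_tokens: int,
                 hidden_size: int,
                 is_multimodal_model: bool,
                 dtype: torch.dtype = torch.bfloat16,
                 device: str = "cpu"):
        self.is_multimodal_model = is_multimodal_model
        # inputs_embeds is only read when precomputed (multimodal) embeddings
        # are passed to the model, so text-only runners skip the allocation
        # and save max_num_tokens * hidden_size * itemsize bytes of device
        # memory.
        if self.is_multimodal_model:
            self.inputs_embeds = torch.zeros((max_num_tokens, hidden_size),
                                             dtype=dtype,
                                             device=device)


# For example, a 32768-token buffer with hidden size 7168 in bfloat16 is
# 32768 * 7168 * 2 bytes = 448 MiB that a text-only model no longer pays for.
text_runner = ToyRunner(32768, 7168, is_multimodal_model=False)
assert not hasattr(text_runner, "inputs_embeds")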