From f7f00f3299e10ca345aece5cfcedcf3c0dac7279 Mon Sep 17 00:00:00 2001
From: ApsarasX
Date: Thu, 15 May 2025 15:26:54 +0000
Subject: [PATCH 1/2] [Perf] Reduce memory usage by splitting tokens in fused_experts

Signed-off-by: ApsarasX
---
 vllm_ascend/envs.py                      |  2 ++
 vllm_ascend/quantization/w8a8_dynamic.py | 28 ++++++++++++++++--------
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 8e1cc1c162..aba1d18e6f 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -66,6 +66,8 @@
     lambda: os.getenv("C_COMPILER", None),
     "VLLM_VERSION":
     lambda: os.getenv("VLLM_VERSION", None),
+    "VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH":
+    lambda: int(os.getenv("VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH", "8192")),
 }
 
 # end-env-vars-definition

diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py
index 5d2b442cf1..0d6c6d8f66 100644
--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -27,6 +27,7 @@
 from vllm_ascend.ops.fused_moe import select_experts
 
 VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2
+VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH: int = envs_ascend.VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH
 
 
 def apply_mlp(hidden_states_wrapper: List[torch.Tensor],
@@ -636,15 +637,24 @@ def apply(
                 expert_map=expert_map,
                 moe_all_to_all_group_name=self.moe_all_to_all_group_name)
         elif dp_size == 1:
-            return fused_experts(hidden_states=x,
-                                 w1=layer.w13_weight,
-                                 w1_scale=layer.w13_weight_scale,
-                                 w2=layer.w2_weight,
-                                 w2_scale=layer.w2_weight_scale,
-                                 topk_weights=topk_weights,
-                                 topk_ids=topk_ids,
-                                 top_k=top_k,
-                                 expert_map=expert_map)
+            x_list = x.split(VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH)
+            topk_weights_list = topk_weights.split(
+                VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH)
+            topk_ids_list = topk_ids.split(VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH)
+            final_hidden_states_list = []
+            for i in range(len(x_list)):
+                final_hidden_states = fused_experts(
+                    hidden_states=x_list[i],
+                    w1=layer.w13_weight,
+                    w1_scale=layer.w13_weight_scale,
+                    w2=layer.w2_weight,
+                    w2_scale=layer.w2_weight_scale,
+                    topk_weights=topk_weights_list[i],
+                    topk_ids=topk_ids_list[i],
+                    top_k=top_k,
+                    expert_map=expert_map)
+                final_hidden_states_list.append(final_hidden_states)
+            return torch.concat(final_hidden_states_list)
         else:
             return fused_experts_with_all2all(hidden_states=x,
                                               w1=layer.w13_weight,

From ab355abf0cc962fef94cba0bd3978233bfdbb523 Mon Sep 17 00:00:00 2001
From: ApsarasX
Date: Fri, 16 May 2025 06:05:34 +0000
Subject: [PATCH 2/2] [Perf] Reduce memory usage by avoiding unused tensor

Signed-off-by: ApsarasX
---
 vllm_ascend/worker/model_runner_v1.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 18037439c4..26faafc381 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -219,10 +219,11 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
                                          device="cpu",
                                          pin_memory=True)
 
-        self.inputs_embeds = torch.zeros(
-            (self.max_num_tokens, self.hidden_size),
-            dtype=self.dtype,
-            device=self.device)
+        if self.is_multimodal_model:
+            self.inputs_embeds = torch.zeros(
+                (self.max_num_tokens, self.hidden_size),
+                dtype=self.dtype,
+                device=self.device)
 
         # OPTIMIZATION: Cache the tensors rather than creating them every step.
         self.arange_np: npt.NDArray[np.int32] = np.arange(max(
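
PATCH 1/2 caps peak activation memory in the dp_size == 1 path by dispatching
fused_experts over token chunks of at most VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH
rows (default 8192) and concatenating the partial outputs. Below is a minimal
standalone sketch of that chunking pattern; chunked_dispatch and the toy
expert_fn are illustrative stand-ins, not code taken from the patch.

import torch

SPLIT_LENGTH = 8192  # mirrors the VLLM_FUSED_EXPERTS_SEQ_SPLIT_LENGTH default


def chunked_dispatch(hidden_states: torch.Tensor,
                     topk_weights: torch.Tensor,
                     topk_ids: torch.Tensor,
                     expert_fn,
                     split_length: int = SPLIT_LENGTH) -> torch.Tensor:
    """Run expert_fn over token chunks of at most split_length rows.

    Peak activation memory inside expert_fn scales with the chunk size
    instead of the full batch, at the cost of one launch per chunk.
    """
    outputs = []
    # Tensor.split slices along dim 0 and returns views, so the inputs are
    # not copied here; only the per-chunk activations are materialized.
    for h, w, ids in zip(hidden_states.split(split_length),
                         topk_weights.split(split_length),
                         topk_ids.split(split_length)):
        outputs.append(expert_fn(h, w, ids))
    return torch.cat(outputs)


if __name__ == "__main__":
    # Toy expert_fn: scale each token by the sum of its routing weights.
    def expert_fn(h, w, ids):
        return h * w.sum(dim=-1, keepdim=True)

    x = torch.randn(20000, 128)
    weights = torch.rand(20000, 8)
    ids = torch.randint(0, 64, (20000, 8))
    out = chunked_dispatch(x, weights, ids, expert_fn)
    assert out.shape == x.shape  # processed as 8192 + 8192 + 3616 tokens

With the default split length of 8192 and, for example, a 7168-wide hidden
state in bfloat16, each chunk's input stays around 112 MiB no matter how many
tokens the batch holds; only the expert workspaces for one chunk are live at a
time.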
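
PATCH 2/2 allocates the persistent inputs_embeds buffer only when the model is
multimodal, since text-only models never feed precomputed embeddings through
it. A minimal sketch of that conditional-allocation pattern follows; ToyRunner
and its constructor arguments are hypothetical and not the real model runner.

import torch


class ToyRunner:
    """Illustrates conditionally allocating a persistent embeddings buffer."""

    def __init__(self,
                 max_num_tokens: int,
                 hidden_size: int,
                 is_multimodal_model: bool,
                 dtype: torch.dtype = torch.bfloat16,
                 device: str = "cpu"):
        self.is_multimodal_model = is_multimodal_model
        # inputs_embeds is only read when precomputed (multimodal) embeddings
        # are passed to the model, so text-only runners skip the allocation
        # and save max_num_tokens * hidden_size * itemsize bytes of device
        # memory.
        if self.is_multimodal_model:
            self.inputs_embeds = torch.zeros((max_num_tokens, hidden_size),
                                             dtype=dtype,
                                             device=device)


# For example, a 32768-token buffer with hidden size 7168 in bfloat16 is
# 32768 * 7168 * 2 bytes = 448 MiB that a text-only model no longer pays for.
text_runner = ToyRunner(32768, 7168, is_multimodal_model=False)
assert not hasattr(text_runner, "inputs_embeds")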