diff --git a/vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py b/vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py
index bc0d4fb506..bf3488031e 100644
--- a/vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py
+++ b/vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py
@@ -186,6 +186,11 @@ def finalize(self, hidden_states: torch.Tensor,
                                 self.moe_config.tp_group.device_group)
             hidden_states = torch.cat(self.split_hidden_states, dim=0)
 
+            # TODO: This is a quick bugfix for the memory explosion issue in eager mode.
+            # If the cache is not cleared after `self.split_hidden_states` is created,
+            # it can lead to memory explosion in eager mode.
+            del self.split_hidden_states
+
             # Unpad if necessary
             if self.num_tokens < hidden_states.shape[0]:
                 hidden_states = hidden_states[:self.num_tokens]
@@ -270,6 +275,11 @@ def finalize(self, hidden_states: torch.Tensor,
                                 self.moe_config.tp_group.device_group)
             hidden_states = torch.cat(self.split_hidden_states, dim=0)
 
+            # TODO: This is a quick bugfix for the memory explosion issue in eager mode.
+            # If the cache is not cleared after `self.split_hidden_states` is created,
+            # it can lead to memory explosion in eager mode.
+            del self.split_hidden_states
+
             if self.num_tokens < hidden_states.shape[0]:
                 hidden_states = hidden_states[:self.num_tokens]
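For context, here is a minimal sketch of the reference-lifetime problem the patch addresses. The `FinalizeSketch` class and its methods are hypothetical stand-ins, not the vllm-ascend API: as long as the instance keeps `self.split_hidden_states` pointing at the gathered shards, eager mode cannot free them even after `torch.cat` has copied their contents into a new tensor.

```python
# Minimal sketch (hypothetical class, not vllm-ascend code): why `del`
# matters after torch.cat in eager mode.
import torch


class FinalizeSketch:
    """Toy stand-in for the prepare/finalize object patched above."""

    def prepare(self, hidden_states: torch.Tensor, tp_size: int) -> None:
        # Cache one output buffer per TP rank, mirroring how the real code
        # sets up `self.split_hidden_states` before the all-gather.
        self.split_hidden_states = [
            torch.empty_like(hidden_states) for _ in range(tp_size)
        ]

    def finalize(self) -> torch.Tensor:
        # torch.cat copies the shards into a fresh tensor, but the shard
        # tensors stay alive for as long as `self.split_hidden_states`
        # still references them.
        hidden_states = torch.cat(self.split_hidden_states, dim=0)
        # Dropping the attribute releases the shards back to the allocator,
        # which is the essence of the fix in the diff above.
        del self.split_hidden_states
        return hidden_states


if __name__ == "__main__":
    sketch = FinalizeSketch()
    sketch.prepare(torch.randn(4, 8), tp_size=2)
    out = sketch.finalize()
    assert out.shape == (8, 8)
```

Since `finalize` runs on every forward pass, each retained shard list would otherwise pin an extra full copy of the gathered activations, which matches the memory-explosion symptom the TODO describes.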