[long_seq_optim] Optimize all_gather: replace manual dist.all_gather + torch.cat with get_tp_group()/get_cp_group().all_gather

LookAround0301 · LookAround0301 · commit 16489060cf0d · 2025-08-31T15:38:00.000+08:00
diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py
@@ -712,18 +712,12 @@ def forward(
                     hidden_states_or_q_c, 0)
                 kv_no_split = get_tp_group().all_gather(kv_no_split, 0)
 
-            # kv_c_k_pe = self.kv_a_proj_with_mqa(hidden_states)[0]
             if self.enable_sp and is_prefill:
-                chunk_kv_no_split = [torch.empty_like(kv_no_split) for _ in range(self.sp_size)]
-                dist.all_gather(list(chunk_kv_no_split), kv_no_split, self.sp_group)
-                kv_no_split = torch.cat(chunk_kv_no_split, dim=0)
+                kv_no_split = get_tp_group().all_gather(kv_no_split, 0)
                 kv_no_split = kv_no_split[:original_len]
 
-                chunk_hidden_states_or_q_c = [torch.empty_like(hidden_states_or_q_c) for _ in range(self.sp_size)]
-                dist.all_gather(list(chunk_hidden_states_or_q_c), hidden_states_or_q_c, self.sp_group)
-                hidden_states_or_q_c = torch.cat(chunk_hidden_states_or_q_c, dim=0)
+                hidden_states_or_q_c = get_tp_group().all_gather(hidden_states_or_q_c, 0)
                 hidden_states_or_q_c = hidden_states_or_q_c[:original_len]
-            # kv_c, k_pe = kv_c_k_pe.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
 
             kv_c, k_pe = kv_no_split.split(
                 [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
@@ -1038,14 +1032,10 @@ def forward(
 
         hidden_states, _ = self.norm(hidden_states, residual)
         if self.enable_sp and is_prefill:
-            chunk_hidden_states = [torch.empty_like(hidden_states) for _ in range(self.sp_size)]
-            dist.all_gather(list(chunk_hidden_states), hidden_states, self.sp_group)
-            hidden_states = torch.cat(chunk_hidden_states, dim=0)
+            hidden_states = get_tp_group().all_gather(hidden_states, 0)
             hidden_states = hidden_states[:original_len]
         if self.cp_size > 1 and is_prefill:
-            chunk_hidden_states = [torch.empty_like(hidden_states) for _ in range(self.cp_size)]
-            dist.all_gather(list(chunk_hidden_states), hidden_states, self.cp_group)
-            hidden_states = torch.cat(chunk_hidden_states, dim=0)
+            hidden_states = get_cp_group().all_gather(hidden_states, 0)
             hidden_states = torch.index_select(hidden_states, 0, attn_metadata.prefill.cp_kv_recover_idx)
         return hidden_states