@@ -3,13 +3,16 @@
                     TypeVar)
 
 import torch
+import torch.distributed as dist
 import torch_npu
 from torch import nn
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadata,
                                               MLAAttentionImpl)
 from vllm.config import VllmConfig, get_current_vllm_config
-from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group
+from vllm.distributed import (get_dp_group,
+                              get_tensor_model_parallel_world_size,
+                              get_tp_group)
 from vllm.model_executor.layers.linear import (LinearBase, ReplicatedLinear,
                                                UnquantizedLinearMethod)
 from vllm.utils import cdiv, round_down
@@ -21,21 +24,19 @@
     maybe_save_kv_layer_to_connector,
     split_decodes_and_prefills,
     wait_for_kv_layer_from_connector)
+from vllm_ascend.distributed.parallel_state import (
+    get_mla_dp_rebalancing_o_shared_group, get_mla_dp_rebalancing_world_group)
+from vllm_ascend.mla_dp_rebalancing import get_mla_dp_rebalancing_context
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
-from vllm_ascend.utils import npu_prefetch
+from vllm_ascend.torchair.ops.shared_weight_layer import (
+    post_process_after_loading_for_shared_weight_series,
+    reach_layer_for_shared_weight_series,
+    register_layer_to_shared_weight_series)
+from vllm_ascend.utils import dispose_tensor, npu_prefetch
 from vllm_ascend.worker.npu_input_batch import InputBatch
 
-from vllm_ascend.distributed.parallel_state import get_mla_dp_rebalancing_o_shared_group, get_mla_dp_rebalancing_world_group
-from vllm_ascend.torchair.ops.shared_weight_layer import (post_process_after_loading_for_shared_weight_series,
-                                                          reach_layer_for_shared_weight_series,
-                                                          register_layer_to_shared_weight_series)
-from vllm_ascend.utils import dispose_tensor
-from vllm_ascend.mla_dp_rebalancing import get_mla_dp_rebalancing_context
-import torch.distributed as dist
-from vllm.distributed import get_dp_group
-
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
 
@@ -510,7 +511,7 @@ def __init__(
         self.prefill_mask = None
 
         self.speculative_config = vllm_config.speculative_config
-
+
         if ascend_config.enable_mla_prefill_dp_rebalancing:
             # Dispose tensor from the original o_proj
             for attr_name in dir(self.o_proj):
@@ -519,19 +520,21 @@ def __init__(
                     dispose_tensor(attr_value)
             # Construct the new o_proj using ReplicatedLinear
             config = vllm_config.model_config.hf_config
-            new_o_proj = ReplicatedLinear(config.num_attention_heads * config.v_head_dim,
-                                          config.hidden_size,
-                                          bias=False,
-                                          quant_config=vllm_config.quant_config,
-                                          prefix=self.o_proj.prefix)
+            new_o_proj = ReplicatedLinear(
+                config.num_attention_heads * config.v_head_dim,
+                config.hidden_size,
+                bias=False,
+                quant_config=vllm_config.quant_config,
+                prefix=self.o_proj.prefix)
             # Replace the o_proj with the new one
             self.o_proj.__class__ = new_o_proj.__class__
             self.o_proj.__dict__ = new_o_proj.__dict__
             # Register the o_proj into shared weight series to cut down memory usage
-            register_layer_to_shared_weight_series(series_name="o_proj",
-                                                   group=get_mla_dp_rebalancing_o_shared_group(),
-                                                   layer=self.o_proj,
-                                                   prefetch_step=1)
+            register_layer_to_shared_weight_series(
+                series_name="o_proj",
+                group=get_mla_dp_rebalancing_o_shared_group(),
+                layer=self.o_proj,
+                prefetch_step=1)
 
     def _v_up_proj(self, x):
         # Convert from (B, N, L) to (N, B, L)
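
Reviewer note: the hunk above replaces a live o_proj in place by grafting a new module's class and attribute dict onto the old object. Below is a minimal, self-contained sketch of that pattern using plain torch.nn.Linear stand-ins instead of vLLM's parallel linear layers and dispose_tensor; all names here are illustrative only, not part of the PR.

import torch
from torch import nn


class RowParallelStandIn(nn.Linear):
    """Stand-in for the original tensor-parallel o_proj."""


class ReplicatedStandIn(nn.Linear):
    """Stand-in for the ReplicatedLinear that replaces it."""


old_proj = RowParallelStandIn(16, 8, bias=False)
holder = {"o_proj": old_proj}  # something else already holds a reference

# Free the old weights first (rough stand-in for dispose_tensor on each
# tensor-valued attribute found via dir()/getattr()).
for attr_name in dir(old_proj):
    attr_value = getattr(old_proj, attr_name)
    if isinstance(attr_value, torch.Tensor):
        attr_value.data = torch.empty(0)

# Build the replacement and graft its class and state onto the old object,
# so every existing reference (e.g. holder["o_proj"]) sees the new layer.
new_proj = ReplicatedStandIn(16, 8, bias=False)
old_proj.__class__ = new_proj.__class__
old_proj.__dict__ = new_proj.__dict__

assert isinstance(holder["o_proj"], ReplicatedStandIn)
print(holder["o_proj"](torch.randn(2, 16)).shape)  # torch.Size([2, 8])

The point of the swap (rather than simply assigning self.o_proj = new_o_proj) is that any code already holding a reference to the original module keeps working without being rewired.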
@@ -991,18 +994,21 @@ def _forward_prefill_with_dp_rebalancing(
         # 1. Perform q_a_proj and q_a_layernorm to obtain q_c
         # 2. Perform kv_a_proj_with_mqa to obtain kv_no_split
         npu_prefetch(self.q_a_proj.weight,
-                         hidden_states,
-                         enabled=self.enable_prefetch)
+                     hidden_states,
+                     enabled=self.enable_prefetch)
         sp_ckq = self.q_a_proj(device_sp_hidden_states)[0]
         sp_hidden_states_or_q_c = self.q_a_layernorm(sp_ckq)
         sp_kv_no_split = self.kv_a_proj_with_mqa(device_sp_hidden_states)[0]
         # Rearrange down_proj outputs across DP.
-        sp_down_proj_output = torch.cat([sp_hidden_states_or_q_c, sp_kv_no_split], dim=1)
+        sp_down_proj_output = torch.cat(
+            [sp_hidden_states_or_q_c, sp_kv_no_split], dim=1)
         sp_world_group = get_mla_dp_rebalancing_world_group()
-        global_sp_down_proj_output = sp_world_group.all_gather(sp_down_proj_output, 0)
+        global_sp_down_proj_output = sp_world_group.all_gather(
+            sp_down_proj_output, 0)
         local_dp = context.local_dp
-        dp_ori_down_proj_output = global_sp_down_proj_output[context.start_token_of_dp[local_dp]:
-                                                             context.end_token_of_dp[local_dp]]
+        dp_ori_down_proj_output = global_sp_down_proj_output[
+            context.start_token_of_dp[local_dp]:context.
+            end_token_of_dp[local_dp]]
         prefill_q_c, prefill_kv_no_split = dp_ori_down_proj_output.split(
             [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
             dim=-1)
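
Reviewer note: the rebalancing step above all-gathers every device's sequence-parallel slice of the down-projection output and then slices back, per DP rank, the token range that rank originally owned. A single-process sketch of that access pattern, with the all_gather simulated by torch.cat and all shapes/ranges made up:

import torch

num_devices = 4        # devices in the rebalancing world group
tokens_per_device = 3  # padded sequence-parallel slice length per device
hidden = 8             # q_lora_rank + kv_lora_rank + qk_rope_head_dim, say

# Each device holds an equally sized slice after rebalancing.
per_device = [torch.full((tokens_per_device, hidden), float(r))
              for r in range(num_devices)]

# all_gather along dim 0 -> every device sees the full, globally ordered output.
global_output = torch.cat(per_device, dim=0)  # [num_devices * tokens_per_device, hidden]

# Original (unbalanced) token ownership per DP rank, as tracked by the context.
start_token_of_dp = [0, 5]   # DP rank 0 owned tokens [0, 5), rank 1 owned [5, 12)
end_token_of_dp = [5, 12]

local_dp = 1
dp_slice = global_output[start_token_of_dp[local_dp]:end_token_of_dp[local_dp]]
print(dp_slice.shape)  # torch.Size([7, 8]) -- rank 1's original tokens restored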
@@ -1048,14 +1054,15 @@ def _forward_prefill_with_dp_rebalancing(
         tp_size = get_tp_group().world_size
         total_receive_len = context.local_device_total_receive_len
         sp_o_proj_input = torch.empty(
-                [total_receive_len * tp_size, self.num_heads * self.v_head_dim],
-                dtype=output_prefill.dtype,
-                device=output_prefill.device)
+            [total_receive_len * tp_size, self.num_heads * self.v_head_dim],
+            dtype=output_prefill.dtype,
+            device=output_prefill.device)
         if get_dp_group().world_size == 1:
             if output_prefill.shape[0] < context.num_padded_global_tokens:
                 output_prefill = nn.functional.pad(
                     output_prefill,
-                    (0, 0, 0, context.num_padded_global_tokens - output_prefill.shape[0]))
+                    (0, 0, 0, context.num_padded_global_tokens -
+                     output_prefill.shape[0]))
             dist.all_to_all_single(
                 output=sp_o_proj_input,
                 input=output_prefill,
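
Reviewer note: for readers unfamiliar with dist.all_to_all_single, here is a minimal single-process sketch of its equal-split semantics (the actual call above runs on the rebalancing world group's device group and may also pass split sizes; world size and shapes below are made up):

import torch

world_size = 4
rows_per_chunk, hidden = 2, 3

# Rank r's input: chunk i (rows_per_chunk rows) is the slice destined for rank i.
inputs = [torch.full((world_size * rows_per_chunk, hidden), float(r))
          for r in range(world_size)]

# all_to_all_single: rank r's output is the concatenation, in sender order,
# of chunk r taken from every rank's input (including its own).
outputs = []
for r in range(world_size):
    received = [inputs[src].chunk(world_size, dim=0)[r]
                for src in range(world_size)]
    outputs.append(torch.cat(received, dim=0))

print(outputs[0].shape)  # torch.Size([8, 3]): rows_per_chunk rows from each of 4 ranks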
@@ -1070,8 +1077,7 @@ def _forward_prefill_with_dp_rebalancing(
                 group=sp_world_group.device_group,
             )
             sp_o_proj_input = sp_o_proj_input.reshape(
-                total_receive_len,
-                tp_size * self.num_heads * self.v_head_dim)
+                total_receive_len, tp_size * self.num_heads * self.v_head_dim)
         if total_receive_len < context.num_tokens_per_device:
             sp_o_proj_input = nn.functional.pad(
                 sp_o_proj_input,
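
Reviewer note: the (0, 0, 0, n) argument to nn.functional.pad used in this function is easy to misread. Pad pairs are applied from the last dimension backwards, so this leaves the hidden dimension untouched and appends n zero rows on the token dimension. A small standalone check with arbitrary shapes:

import torch
from torch import nn

x = torch.randn(5, 16)                        # [tokens, hidden]
padded = nn.functional.pad(x, (0, 0, 0, 3))   # last dim: (0, 0); token dim: (0, 3)
print(padded.shape)                           # torch.Size([8, 16])
assert torch.equal(padded[5:], torch.zeros(3, 16))  # 3 zero rows appended at the end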
@@ -1094,7 +1100,8 @@ def forward(
         if get_ascend_config().enable_mla_prefill_dp_rebalancing:
             reach_layer_for_shared_weight_series(self.o_proj)
             if get_mla_dp_rebalancing_context() is not None:
-                output[...] = self._forward_prefill_with_dp_rebalancing(layer_name, hidden_states, kv_cache, attn_metadata)
+                output[...] = self._forward_prefill_with_dp_rebalancing(
+                    layer_name, hidden_states, kv_cache, attn_metadata)
                 return output
         if attn_metadata is None:
             # Profiling run.
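
Reviewer note: output[...] = ... above writes the result into the caller-provided buffer in place rather than rebinding the local name, so the caller's tensor is updated. A quick illustration with a hypothetical helper:

import torch

def fill(out: torch.Tensor) -> None:
    # In-place copy into the caller's buffer; same storage, no new tensor bound.
    out[...] = torch.ones_like(out)

buf = torch.zeros(2, 3)
fill(buf)
print(buf)  # all ones -- the caller observes the update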