
Commit c202a1d
Author: lt (committed)
merge expert load 1227
2 parents: ef40e69 + e82ee11

File tree: 4 files changed (+190, -27 lines)


vllm_ascend/ops/expert_load_balancer.py (91 additions & 8 deletions)

@@ -1,20 +1,78 @@
 import json
 import random
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import torch
 
+from vllm_ascend.ascend_config import get_ascend_config
 
-class ExpertLoadBalancer(object):
 
-    def __init__(self, expert_map_path, global_expert_num):
-        self.expert_map_path = expert_map_path
+class ExpertLoadBalancer:
+    """
+    ExpertLoadBalancer is a singleton class responsible for managing and
+    recording the mapping and load balancing of experts across multiple layers
+    and devices in a distributed Mixture-of-Experts (MoE) model.
+    """
+
+    _instance = None
+    """The singleton instance of ExpertLoadBalancer."""
+
+    def __init__(self, expert_map_path: Optional[str], global_expert_num: int):
+        """
+        This method should only be called once, and it raises an exception if
+        an instance already exists.
+
+        Args:
+            expert_map_path (str): Path to the expert map file. If None, only
+                used for recording expert load.
+            global_expert_num (int): Total number of global experts.
+        Raises:
+            Exception: If an instance of ExpertLoadBalancer already exists.
+        """
+
+        if ExpertLoadBalancer._instance is not None:
+            raise Exception(
+                "This class is a singleton, cannot be instantiated "
+                "more than once.")
+
         self.global_expert_num = global_expert_num
-        self.expert_map_tensor, self.layers_num, self.ranks_num = (
-            self._expert_file_to_tensor())
 
-    def _expert_file_to_tensor(self):
-        with open(self.expert_map_path, "r") as f:
+        # If expert_map_path is not provided, we only record the expert load.
+        if expert_map_path is not None:
+            self.expert_map_tensor, self.layers_num, self.ranks_num = (
+                self._expert_file_to_tensor(expert_map_path))
+        else:
+            self.expert_map_tensor = None
+            # TODO: change the num layer source
+            self.layers_num = 58
+            self.ranks_num = None
+
+        self._torchair_graph_enabled = \
+            get_ascend_config().torchair_graph_config.enabled
+
+        self._all_layers_logical_expert_load_record = \
+            torch.zeros((self.layers_num, self.global_expert_num),
+                        dtype=torch.int64,
+                        device=torch.npu.current_device())
+        # Always enable expert load recording if torchair graph is enabled.
+        self._recording = self._torchair_graph_enabled
+
+    @staticmethod
+    def get_instance():
+        if ExpertLoadBalancer._instance is None:
+            raise ValueError(
+                "ExpertLoadBalancer instance has not been initialized.")
+        return ExpertLoadBalancer._instance
+
+    @staticmethod
+    def init_instance(expert_map_path: Optional[str], global_expert_num: int):
+        """Initialize the singleton instance of ExpertLoadBalancer."""
+        ExpertLoadBalancer._instance = ExpertLoadBalancer(
+            expert_map_path, global_expert_num)
+        return ExpertLoadBalancer._instance
+
+    def _expert_file_to_tensor(self, expert_map_path: str):
+        with open(expert_map_path, "r") as f:
             data = json.load(f)
             layers_num = data["moe_layer_count"]
             gpus_num = data["layer_list"][0]["device_count"]

@@ -97,3 +155,28 @@ def get_global_redundant_expert_num(self):
             len(self.expert_map_tensor[0][0]) * self.ranks_num -
             self.global_expert_num)
         return global_redundant_expert_num
+
+    def accumulate_expert_distribution_record(self, layer_id: int,
+                                              topk_ids: torch.Tensor):
+        if not self._recording:
+            return
+        flattened_topk_ids = topk_ids.flatten().to(torch.int64)
+        ones = torch.ones_like(flattened_topk_ids)
+        self._all_layers_logical_expert_load_record[layer_id].scatter_add_(
+            0, flattened_topk_ids, ones)
+
+    def start_expert_distribution_record(self):
+        """Start recording the expert distribution."""
+        self._all_layers_logical_expert_load_record.zero_()
+        self._recording = True
+
+    def stop_expert_distribution_record(self):
+        """Stop recording the expert distribution."""
+        # If torchair graph is not enabled, we do not turn off the recording.
+        self._recording = self._torchair_graph_enabled
+
+    def export_local_expert_distribution_record(self):
+        """Export the local expert distribution record and reset it."""
+        local_record = self._all_layers_logical_expert_load_record.clone()
+        self._all_layers_logical_expert_load_record.zero_()
+        return local_record
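For orientation, the recording lifecycle these additions introduce can be exercised roughly as follows. This is a minimal sketch, assuming an Ascend environment with vllm-ascend installed and the ascend config already initialized (the constructor calls get_ascend_config()); the expert count of 256 and the random topk_ids are placeholders, not values taken from this commit.

import torch

from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer

# One-time singleton setup; in this commit the worker does this in
# _init_worker_distributed_environment (see worker_v1.py below).
balancer = ExpertLoadBalancer.init_instance(expert_map_path=None,
                                            global_expert_num=256)

balancer.start_expert_distribution_record()

# Each MoE layer feeds its routing decisions into a per-layer counter;
# the record tensor has shape (layers_num, global_expert_num).
topk_ids = torch.randint(0, 256, (4, 8), device="npu")  # stand-in routing output
balancer.accumulate_expert_distribution_record(layer_id=0, topk_ids=topk_ids)

# Snapshot the per-layer, per-expert token counts and reset them.
record = ExpertLoadBalancer.get_instance().export_local_expert_distribution_record()
balancer.stop_expert_distribution_record()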

vllm_ascend/ops/fused_moe.py (45 additions & 16 deletions)

@@ -945,15 +945,15 @@ def apply(
                 top_k=top_k,
                 expert_map=expert_map,
                 moe_all_to_all_group_name=self.moe_all_to_all_group_name,
-                shared_experts=shared_experts)
+                shared_experts=shared_experts), topk_ids
         elif fused_moe_state == FusedMoEState.AllGather:
             return fused_experts(hidden_states=x,
                                  w1=layer.w13_weight,
                                  w2=layer.w2_weight,
                                  topk_weights=topk_weights,
                                  topk_ids=topk_ids,
                                  top_k=top_k,
-                                 expert_map=expert_map)
+                                 expert_map=expert_map), topk_ids
         elif MOE_ALL2ALL_BUFFER:
             return fused_experts_with_all2all_buffer(
                 hidden_states=x,
@@ -965,16 +965,17 @@ def apply(
                 max_model_len=self.max_model_len,
                 global_batch_size=self.global_batch_size,
                 expert_map=expert_map,
-                ep_group=get_ep_group())
+                ep_group=get_ep_group()), topk_ids
         else:
-            return fused_experts_with_all2all(hidden_states=x,
-                                              w1=layer.w13_weight,
-                                              w2=layer.w2_weight,
-                                              topk_weights=topk_weights,
-                                              topk_ids=topk_ids,
-                                              top_k=top_k,
-                                              expert_map=expert_map,
-                                              ep_group=get_ep_group())
+            return fused_experts_with_all2all(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                top_k=top_k,
+                expert_map=expert_map,
+                ep_group=get_ep_group()), topk_ids
 
 
 class AscendFusedMoE(FusedMoE):
@@ -1048,22 +1049,26 @@ def __init__(
         self.log2phy = None
         self.global_redundant_expert_num = 0
 
+        self.expert_load_balancer = ExpertLoadBalancer.get_instance()
         ascend_config = get_ascend_config()
         expert_map_path = ascend_config.expert_map_path
         self.dynamic_eplb = ascend_config.dynamic_eplb
         if expert_map_path and os.path.exists(expert_map_path):
+            # only support in MC2 and graph mode
+            if not (VLLM_ENABLE_MC2
+                    and ascend_config.torchair_graph_config.enabled):
+                raise NotImplementedError(
+                    "EPLB is only supported in MC2 and graph mode")
             # moe expert load balance
-            expert_load_balancer = ExpertLoadBalancer(expert_map_path,
-                                                      self.global_num_experts)
             self.local_num_experts, self.expert_map = \
-                expert_load_balancer.get_rank_placement_map(
+                self.expert_load_balancer.get_rank_placement_map(
                     self.moe_instance_id,
                     get_ep_group().rank_in_group)
-            self.log2phy = expert_load_balancer.get_rank_log2phy_map(
+            self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
                 self.moe_instance_id,
                 get_ep_group().rank_in_group)
             self.global_redundant_expert_num = \
-                expert_load_balancer.get_global_redundant_expert_num()
+                self.expert_load_balancer.get_global_redundant_expert_num()
         else:
             # Create a tensor of size num_experts filled with -1
             self.local_num_experts, self.expert_map = determine_expert_map(
@@ -1199,6 +1204,9 @@ def forward(self,
         if self.dynamic_eplb == True:
             self.calculate_moe_load()
 
+        self.expert_load_balancer.accumulate_expert_distribution_record(
+            self.moe_instance_id, self.topk_ids)
+
         if tp_size > 1 and fused_moe_state != FusedMoEState.AllGather:
             dist.all_gather(list(chunk_hidden_states), e_hidden_states,
                             self.tp_group)
@@ -1216,6 +1224,27 @@ def forward(self,
             dispose_tensor(e_hidden_states)
         else:
             final_hidden_states = e_hidden_states
+        self.expert_load_balancer.accumulate_expert_distribution_record(
+            self.moe_instance_id, topk_ids)
+
+        if self.dp_size > 1:
+            if VLLM_ENABLE_MC2 and not is_prefill:
+                ...
+            elif self.torchair_graph_enabled:
+                if USING_LCCL_COM:  # type: ignore
+                    e_hidden_states = dist._functional_collectives.reduce_scatter_tensor(
+                        e_hidden_states,
+                        "sum",
+                        scatter_dim=0,
+                        group=get_dp_group().device_group)
+                elif self.torchair_graph_enabled and not is_prefill:
+                    e_hidden_states = dist._functional_collectives.reduce_scatter_tensor(
+                        e_hidden_states,
+                        "sum",
+                        scatter_dim=0,
+                        group=get_dp_group().device_group)
+                else:
+                    e_hidden_states = get_ep_group().combine(e_hidden_states)
 
         if tp_size > 1 and fused_moe_state == FusedMoEState.AllGather:
             final_hidden_states = tensor_model_parallel_all_reduce(

vllm_ascend/quantization/w8a8_dynamic.py (1 addition & 1 deletion)

@@ -617,7 +617,7 @@ def apply(
         global_redundant_expert_num: int = 0,
         shared_experts: Optional[Any] = None,
         **kwargs,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         assert router_logits.shape[
             1] == global_num_experts, "Number of global experts mismatch"
 
vllm_ascend/worker/worker_v1.py (53 additions & 2 deletions)

@@ -17,6 +17,8 @@
 # Adapted from vllm-project/vllm/vllm/worker/gpu_worker.py
 #
 
+import time
+from pathlib import Path
 from typing import Optional
 
 import torch
@@ -38,9 +40,13 @@
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.worker.worker_base import WorkerBase
 
-from vllm_ascend.ascend_config import init_ascend_config
+import vllm_ascend.envs as envs_ascend
+from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config
 from vllm_ascend.device_allocator.camem import CaMemAllocator
-from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
+from vllm_ascend.distributed.parallel_state import (get_ep_group,
+                                                    get_etp_group,
+                                                    init_ascend_model_parallel)
+from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import try_register_lib
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
@@ -233,6 +239,33 @@ def profile(self, is_start: bool = True):
         else:
             self.profiler.stop()
 
+    def expert_distribution_record(self, is_start: bool = True):
+        assert envs.VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR is not None, \
+            "VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR is not set. " \
+            "Please set it to enable expert distribution recording."
+
+        if is_start:
+            logger.info("Starting expert distribution record.")
+            self.expert_load_balancer.start_expert_distribution_record()
+        else:
+            logger.info("Stopping expert distribution record.")
+            self.expert_load_balancer.stop_expert_distribution_record()
+
+    def dump_expert_distribution_record(self):
+        assert envs.VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR is not None, \
+            "VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR is not set. " \
+            "Please set it to enable expert distribution recording."
+
+        logger.info("Dumping expert distribution record.")
+        local_expert_distribution = \
+            self.expert_load_balancer.export_local_expert_distribution_record()
+
+        ep_rank = get_ep_group().rank_in_group
+        etp_rank = get_etp_group().rank_in_group
+        _dump_to_file(
+            f"expert_distribution_recorder_{time.time()}_{ep_rank}_{etp_rank}.pt",
+            local_expert_distribution)
+
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_runner.add_lora(lora_request)
 
@@ -277,6 +310,15 @@ def _init_worker_distributed_environment(self) -> None:
         )
         ensure_kv_transfer_initialized(self.vllm_config)
 
+        # Initialize the expert load balancer.
+        if self.vllm_config.model_config.is_deepseek_mla:
+            expert_map_path = get_ascend_config().expert_map_path
+            num_logical_experts = \
+                self.vllm_config.model_config.hf_config.n_routed_experts
+            self.expert_load_balancer = ExpertLoadBalancer.init_instance(
+                expert_map_path=expert_map_path,
+                global_expert_num=num_logical_experts)
+
     def _init_profiler(self):
         # Torch profiler. Enabled and configured through env vars:
         # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
@@ -310,3 +352,12 @@ def _init_profiler(self):
                     torch_profiler_trace_dir))
         else:
             return None
+
+
+def _dump_to_file(name, data):
+    save_dir = Path(envs.VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR)
+    path_output = save_dir / name
+    logger.info(f"Write expert distribution to {path_output}")
+    if not save_dir.exists():
+        save_dir.mkdir(parents=True, exist_ok=True)
+    torch.save(data, str(path_output))
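dump_expert_distribution_record() above simply torch.save()s the per-worker (layers_num, global_expert_num) count tensor into VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR, one file per EP/ETP rank. A hedged post-processing sketch follows; the directory path and the summary logic are illustrative, not part of the commit.

from pathlib import Path

import torch

# Placeholder for the directory pointed to by VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR.
record_dir = Path("/tmp/expert_distribution")
for dump in sorted(record_dir.glob("expert_distribution_recorder_*.pt")):
    record = torch.load(dump, map_location="cpu")  # shape: (layers_num, global_expert_num)
    per_expert = record.sum(dim=0)                 # tokens routed to each logical expert
    print(dump.name, "hottest expert:", int(per_expert.argmax()),
          "tokens:", int(per_expert.max()))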
