Commit f74b5d4

refine logic
Signed-off-by: Angazenn <supperccell@163.com>
1 parent 7c32e60 commit f74b5d4

File tree

1 file changed: +23 -10 lines changed


vllm_ascend/worker/model_runner_v1.py

Lines changed: 23 additions & 10 deletions
@@ -1786,16 +1786,18 @@ def _pool(
         )
 
     def _initialize_mc2(self):
-        """Initialization of mc2-related parameters."""
+        """Initialize MC2-related parameters and verify their validity."""
 
-        self.mc2_tokens_capacity = 0
         self.reserved_mc2_mask = None
 
         # For models that contain no MoE modules, we simply skip the
-        # initialization of mc2.
+        # initialization of MC2.
         if not is_moe_model(self.vllm_config):
+            self.mc2_tokens_capacity = 0
             return
 
+        # For MoE models, we first assume that the model will use MC2 and
+        # compute self.mc2_tokens_capacity.
         # NOTE: To be clear, we need to make sure that during graph capture, the number of
         # tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes,
         # the max number of tokens in graph is min(max_num_seqs * 2, 512).
@@ -1806,11 +1808,19 @@ def _initialize_mc2(self):
         tp_size = self.parallel_config.tensor_parallel_size
         # Use integer arithmetic for ceiling division.
         num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size
-        mc2_tokens_capacity = num_tokens_per_tp_rank * tp_size
+        # A larger number of input tokens for MC2 introduces much higher
+        # HBM consumption. Therefore, self.mc2_tokens_capacity is set to the
+        # maximum possible number of input tokens in graph or decode cases.
+        self.mc2_tokens_capacity = num_tokens_per_tp_rank * tp_size
 
-        # Additional check for MC2 restrictions on specific hardwares.
-        if self._select_moe_comm_method(
-                mc2_tokens_capacity) == MoECommType.MC2:
+        # We then check whether MC2 is actually needed and
+        # verify the validity of self.mc2_tokens_capacity on
+        # different hardware.
+        if self._select_moe_comm_method(self.mc2_tokens_capacity,
+                                        with_prefill=False) == MoECommType.MC2:
+
+            # MC2 will be applied at runtime, so we check whether the
+            # number of input tokens exceeds the MC2 limit.
 
             soc_version = get_ascend_soc_version()
             limit = None
@@ -1826,14 +1836,17 @@ def _initialize_mc2(self):
                 f"(current: {self.max_num_reqs}) or increase `tp_size` (current: {tp_size})."
             )
 
-            # Only set these parameters if mc2 is actually needed
-            # and the above check is passed.
-            self.mc2_tokens_capacity = mc2_tokens_capacity
+            # All checks have passed, so we finally initialize self.reserved_mc2_mask.
             self.reserved_mc2_mask = torch.zeros(
                 self.mc2_tokens_capacity,
                 dtype=torch.bool,
                 device=self.device,
             )
+        else:
+            # MC2 is not needed for this MoE model on certain hardware
+            # (such as a single A2 node), so self.mc2_tokens_capacity
+            # falls back to 0.
+            self.mc2_tokens_capacity = 0
 
     def _select_moe_comm_method(self, num_tokens: int,
                                 with_prefill: bool) -> MoECommType:
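
To make the capacity arithmetic in the middle hunk concrete, here is a small self-contained sketch of the ceiling-division rounding described in the new comments. The values of max_num_tokens and tp_size are illustrative assumptions, not taken from the commit.

# Illustrative sketch of the rounding in _initialize_mc2: the per-TP-rank
# token count is rounded up so that every rank receives an equal share.
max_num_tokens = 512  # e.g. min(max_num_seqs * 2, 512) during graph capture
tp_size = 3           # hypothetical tensor-parallel size

# Ceiling division using integer arithmetic, as in the diff.
num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size  # 171
mc2_tokens_capacity = num_tokens_per_tp_rank * tp_size              # 513

# Rounding up means the capacity can slightly exceed max_num_tokens, which
# is why the new comments stress the HBM cost of a larger capacity.
assert (num_tokens_per_tp_rank, mc2_tokens_capacity) == (171, 513)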

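For readers skimming the diff, the refactored function now has three outcomes: skip MC2 entirely for non-MoE models, validate the capacity and allocate the mask when MC2 is selected, or fall back to a capacity of 0 otherwise. The sketch below is a minimal stand-alone rendering of that control flow under stated assumptions, not the actual vllm_ascend code: is_moe, comm_type, and limit are stand-ins for is_moe_model, _select_moe_comm_method, and the per-SoC limit, and a plain bool list replaces the torch mask.

from enum import Enum
from typing import List, Optional, Tuple


class MoECommType(Enum):
    MC2 = "mc2"
    ALLGATHER = "allgather"


def initialize_mc2(is_moe: bool, comm_type: MoECommType, capacity: int,
                   limit: Optional[int]) -> Tuple[int, Optional[List[bool]]]:
    """Returns (mc2_tokens_capacity, reserved_mc2_mask)."""
    # Outcome 1: non-MoE models never need MC2.
    if not is_moe:
        return 0, None

    # Outcome 2: MC2 is selected, so validate the capacity against the
    # hardware limit and allocate the reserved mask.
    if comm_type == MoECommType.MC2:
        if limit is not None and capacity > limit:
            raise ValueError(
                f"capacity {capacity} exceeds the MC2 limit {limit}; "
                "decrease max_num_seqs or increase tp_size.")
        return capacity, [False] * capacity  # torch.zeros(..., bool) in the real code

    # Outcome 3: MoE model, but MC2 is not selected on this hardware;
    # the capacity falls back to 0.
    return 0, None

For example, initialize_mc2(True, MoECommType.ALLGATHER, 513, None) returns (0, None), matching the new else branch in the diff.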