 from typing import Optional, Union

 import torch
+import torch.distributed as dist
+import torch.nn.functional as F
 from torch import nn
 from transformers import Qwen3Config
 from vllm.attention import AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import get_pp_group
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_gather)
+from vllm.model_executor.layers.linear import (ReplicatedLinear,
+                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
+from vllm.model_executor.models.qwen2 import Qwen2MLP as Qwen3MLP
 from vllm.model_executor.models.qwen2 import Qwen2Model
 from vllm.model_executor.models.qwen3 import Qwen3Attention, Qwen3MLP
 from vllm.model_executor.models.utils import (AutoWeightsLoader,
                                               PPMissingLayer, maybe_prefix)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

+from vllm_ascend import envs
 from vllm_ascend.ops.layernorm import AddRMSNormW8A8Quant

 
+def pad(tensor, x):
+    length = tensor.size(0)
+    pad_size = (x - (length % x)) % x
+    if pad_size > 0:
+        return F.pad(tensor, (0, 0, 0, pad_size)), pad_size
+    return tensor, pad_size
+
+
+def unpad(tensor, pad_size):
+    if pad_size > 0:
+        return tensor[:-pad_size, :]
+    return tensor
+
+
+class CustomQwen3MLP(Qwen3MLP):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(hidden_size=hidden_size,
+                         intermediate_size=intermediate_size,
+                         hidden_act=hidden_act,
+                         quant_config=quant_config,
+                         prefix=prefix)
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.enable_fc = envs.VLLM_ENABLE_FC
+        if self.enable_fc:
+            # if flashcomm2 enabled, replace Linear+AllReduce with All2All+Linear
+            self.down_proj = ReplicatedLinear(
+                intermediate_size,
+                hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.down_proj",
+            )
+        else:
+            self.down_proj = RowParallelLinear(
+                intermediate_size,
+                hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.down_proj",
+            )
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        pad_size = 0
+        if self.enable_fc:
+            # pad input because AllGather requires token_num to be divisible by tp_size
+            x, pad_size = pad(x, self.tp_size)
+            output = torch.empty(x.shape, dtype=x.dtype, device=x.device)
+            dist.all_to_all_single(output, x)
+            x = output.reshape(self.tp_size, -1, output.size(-1)) \
+                .transpose(0, 1) \
+                .reshape(-1, output.size(-1) * self.tp_size)
+        x, _ = self.down_proj(x)
+        return x, pad_size
+
+
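The reshape chain after dist.all_to_all_single is the heart of the flashcomm2 layout change: the All2All turns a tensor that is sharded over the hidden dimension (each TP rank holding its own heads or intermediate channels) into one that is sharded over the token dimension but carries the full hidden width, which is why down_proj and o_proj can be a ReplicatedLinear with no trailing AllReduce. A minimal single-process sketch of that equivalence (toy shapes, illustrative only, not part of the patch):

    import torch

    tp, T, D = 4, 8, 16          # TP world size, token count, hidden width
    d_local = D // tp
    A = torch.randn(T, D)        # the "full" activation no single rank materializes
    # Rank i's column-parallel shard, e.g. its slice of attention heads.
    shards = [A[:, i * d_local:(i + 1) * d_local] for i in range(tp)]

    for rank in range(tp):
        # Emulate dist.all_to_all_single on `rank`: it receives row-chunk `rank`
        # of every peer's shard, concatenated along dim 0.
        recv = torch.cat([s[rank * T // tp:(rank + 1) * T // tp] for s in shards], dim=0)
        # The same reshape/transpose used in CustomQwen3MLP.forward and the attention path.
        x = recv.reshape(tp, -1, d_local).transpose(0, 1).reshape(-1, d_local * tp)
        # `rank` now holds the full hidden width for its 1/tp slice of tokens,
        # so a replicated down_proj / o_proj needs no AllReduce afterwards.
        assert torch.equal(x, A[rank * T // tp:(rank + 1) * T // tp])

The price is the padding above plus a later AllGather (pre_attention_process / pre_mlp_process below) to restore the full token dimension before the next column-parallel projection.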
 class CustomQwen3Attention(Qwen3Attention):

     def __init__(self,
@@ -52,6 +126,25 @@ def __init__(self,
                          rope_scaling=rope_scaling,
                          prefix=prefix,
                          attn_type=attn_type)
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.enable_fc = envs.VLLM_ENABLE_FC
+        if self.enable_fc:
+            self.o_proj = ReplicatedLinear(
+                self.total_num_heads * self.head_dim,
+                hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.o_proj",
+            )
+        else:
+            self.o_proj = RowParallelLinear(
+                self.total_num_heads * self.head_dim,
+                hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.o_proj",
+            )

     def forward(
         self,
@@ -78,8 +171,19 @@ def forward(
                                sin=sin,
                                skip_index_select=True)
         attn_output = self.attn(q, k, v)
+        pad_size = 0
+        if self.enable_fc:
+            # pad input because AllGather requires token_num to be divisible by tp_size
+            attn_output, pad_size = pad(attn_output, self.tp_size)
+            output = torch.empty(attn_output.shape,
+                                 dtype=attn_output.dtype,
+                                 device=attn_output.device)
+            dist.all_to_all_single(output, attn_output)
+            attn_output = output.reshape(self.tp_size, -1, output.size(-1)) \
+                .transpose(0, 1) \
+                .reshape(-1, output.size(-1) * self.tp_size)
         output, _ = self.o_proj(attn_output)
-        return output
+        return output, pad_size

 
 class CustomQwen3DecoderLayer(nn.Module):
@@ -93,6 +197,9 @@ def __init__(
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.enable_fc = envs.VLLM_ENABLE_FC
         # Requires transformers > 4.32.0
         rope_theta = getattr(config, "rope_theta", 1000000)
         rope_scaling = getattr(config, "rope_scaling", None)
@@ -121,7 +228,7 @@ def __init__(
             prefix=f"{prefix}.self_attn",
             attn_type=attn_type,
         )
-        self.mlp = Qwen3MLP(
+        self.mlp = CustomQwen3MLP(
             hidden_size=self.hidden_size,
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
@@ -185,6 +292,57 @@ def forward(
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual

+    def pre_attention_process(self, hidden_states, residual, pad_size=0):
+        hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = tensor_model_parallel_all_gather(hidden_states, 0)
+        hidden_states = unpad(hidden_states, pad_size)
+        return hidden_states, residual
+
+    def pre_mlp_process(self, hidden_states, residual, pad_size=0):
+        token_num = hidden_states.size(0)
+        if token_num != residual.size(0):
+            if pad_size > 0:
+                residual = F.pad(residual, (0, 0, 0, pad_size))
+            split_size_list = [token_num] * self.tp_size
+            residual = torch.split(residual, split_size_list)[self.tp_rank]
+
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
+        hidden_states = tensor_model_parallel_all_gather(hidden_states, 0)
+        hidden_states = unpad(hidden_states, pad_size)
+        return hidden_states, residual
+
+    def forward(self,
+                positions: torch.Tensor,
+                hidden_states: torch.Tensor,
+                residual: Optional[torch.Tensor],
+                pad_size: int = 0) -> tuple[torch.Tensor, torch.Tensor, int]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            if self.enable_fc:
+                hidden_states, residual = self.pre_attention_process(
+                    hidden_states, residual, pad_size)
+            else:
+                hidden_states, residual = self.input_layernorm(
+                    hidden_states, residual)
+        hidden_states, pad_size = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        # Fully Connected
+        if self.enable_fc:
+            hidden_states, residual = self.pre_mlp_process(
+                hidden_states, residual, pad_size)
+        else:
+            hidden_states, residual = self.post_attention_layernorm(
+                hidden_states, residual)
+        hidden_states, pad_size = self.mlp(hidden_states)
+        return hidden_states, residual, pad_size
+
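pre_mlp_process reconciles two layouts: after the flashcomm2 attention path, hidden_states holds only this rank's 1/tp slice of the padded tokens, while residual may still hold the whole sequence, so the residual is padded and sliced to the matching token range before the fused add + RMSNorm. A small shape-level sketch of that alignment (toy values, illustrative only, not part of the patch):

    import torch
    import torch.nn.functional as F

    tp, tp_rank, T, D = 4, 2, 10, 8
    residual = torch.randn(T, D)                 # full sequence
    pad_size = (tp - T % tp) % tp                # 2, so the padded length is 12
    token_num = (T + pad_size) // tp             # 3 tokens kept per rank
    hidden_states = torch.randn(token_num, D)    # this rank's slice after the All2All

    residual = F.pad(residual, (0, 0, 0, pad_size))
    residual = torch.split(residual, [token_num] * tp)[tp_rank]
    # Row k of residual now refers to the same (padded) token as row k of hidden_states.
    assert residual.shape == hidden_states.shape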
 
 ALL_DECODER_LAYER_TYPES = {
     "attention": CustomQwen3DecoderLayer,
@@ -207,6 +365,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                          prefix=prefix,
                          decoder_layer_type=CustomQwen3DecoderLayer)
         self.cos_sin_cache = self.layers[0].self_attn.rotary_emb.cos_sin_cache
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.enable_fc = envs.VLLM_ENABLE_FC

     def forward(
         self,
@@ -235,14 +396,17 @@ def forward(
         cos, sin = cos.view(1, -1, 1, last_dim).contiguous(), sin.view(
             1, -1, 1, last_dim).contiguous()

+        pad_size = 0
         for layer in self.layers[self.start_layer:self.end_layer]:
-            hidden_states, residual = layer(
-                positions,
-                cos,
-                sin,
-                hidden_states,
-                residual,
-            )
+            hidden_states, residual, pad_size = layer(positions, hidden_states,
+                                                      residual, pad_size)
+        if self.enable_fc:
+            hidden_states = tensor_model_parallel_all_gather(hidden_states, 0)
+            residual = tensor_model_parallel_all_gather(residual, 0)
+            if pad_size > 0:
+                hidden_states = hidden_states[:-pad_size]
+                residual = residual[:-pad_size]
+
         if not get_pp_group().is_last_rank:
             return IntermediateTensors({
                 "hidden_states": hidden_states,