
Commit 9168047

[main] flashcomm_v1 & RS & AG optim in Dense Model
Signed-off-by: rjg-lyh <1318825571@qq.com>
1 parent dfc7eb3 commit 9168047

8 files changed: +453 −2 lines changed

vllm_ascend/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -23,5 +23,10 @@ def register():
 
 
 def register_model():
+    import vllm.envs as envs
+    import vllm_ascend.envs as envs_ascend
     from .models import register_model
+    if envs.VLLM_USE_V1 and \
+            envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM != 0:
+        import vllm_ascend.patch.platform.patch_main.patch_decorator
     register_model()

vllm_ascend/ascend_forward_context.py

Lines changed: 29 additions & 0 deletions
@@ -110,6 +110,35 @@ def set_ascend_forward_context(
         # NOTE: This cannot be set using set_forward_context
         # due to multiple warmups before actual capturing
         forward_context.capturing = False
+
+        # set this for rope forward_oot using
+        forward_context.is_first_layer = True
+
+        # set for flashcomm_v1
+        flashcomm_v1_enabled = False
+        matmul_rs_enabled = False
+        ag_matmal_enabled = False
+        pad_size = 0
+        from vllm_ascend.attention.attention_v1 import AscendAttentionState
+        if envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM == 1 and \
+                attn_metadata is not None and \
+                attn_metadata.attn_state != AscendAttentionState.DecodeOnly:
+            flashcomm_v1_enabled = True
+        if flashcomm_v1_enabled and \
+                envs_ascend.VLLM_ASCEND_ENABLE_LCOC_MATMUL_RS == 1:
+            matmul_rs_enabled = True
+        if flashcomm_v1_enabled and \
+                envs_ascend.VLLM_ASCEND_ENABLE_LCOC_AG_MATMUL == 1:
+            ag_matmal_enabled = True
+        if flashcomm_v1_enabled:
+            # num_tokens = hidden_states.size(0)
+            tp_world_size = get_tensor_model_parallel_world_size()
+            pad_size = (tp_world_size -
+                        (num_tokens % tp_world_size)) % tp_world_size
+        forward_context.pad_size = pad_size
+        forward_context.flashcomm_v1_enabled = flashcomm_v1_enabled
+        forward_context.matmul_rs_enabled = matmul_rs_enabled
+        forward_context.ag_matmal_enabled = ag_matmal_enabled
 
         if num_tokens is None and attn_metadata is not None:
             num_tokens = attn_metadata.num_actual_tokens
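
The padding arithmetic above rounds num_tokens up to the next multiple of the tensor-parallel world size, so that the token dimension later splits evenly across ranks for reduce-scatter/all-gather. A minimal standalone sketch of just that arithmetic (the function name and example values are illustrative, not part of the commit):

def compute_pad_size(num_tokens: int, tp_world_size: int) -> int:
    # Extra rows needed so that (num_tokens + pad) % tp_world_size == 0.
    return (tp_world_size - (num_tokens % tp_world_size)) % tp_world_size

assert compute_pad_size(37, 8) == 3   # 37 + 3 = 40 tokens -> 5 per rank at TP=8
assert compute_pad_size(40, 8) == 0   # already divisible, no padding needed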

vllm_ascend/envs.py

Lines changed: 12 additions & 0 deletions
@@ -136,11 +136,23 @@
     # this feature is supported in A2, and eager mode will get better performance.
     "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE":
     lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", '0'))),
+    # FlashComm optimization: Enable v1 and v2 by setting this flag to 1 or 2 respectively
+    "VLLM_ASCEND_ENABLE_FLASHCOMM":
+    lambda: int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM", '0')),
+    # LcocMatmulReduceScatter optimization
+    "VLLM_ASCEND_ENABLE_LCOC_MATMUL_RS":
+    lambda: int(os.getenv("VLLM_ASCEND_ENABLE_LCOC_MATMUL_RS", '0')),
+    # LcocAllGatherMatmul optimization
+    "VLLM_ASCEND_ENABLE_LCOC_AG_MATMUL":
+    lambda: int(os.getenv("VLLM_ASCEND_ENABLE_LCOC_AG_MATMUL", '0')),
     # Whether to enable the alltoall_seq flag, this provides a basic framework on the basis of alltoall for easy expansion.
     # 0: default, normal init.
     # 1: enable moe all2all seq.
     "VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ":
     lambda: bool(int(os.getenv('VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ', '0'))),
+    # Whether to enable dense model and general optimizations for better performance.
+    "VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE":
+    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE", '0'))),
     # Whether to enable mlp optimize when tensor parallel is enabled.
     # this feature in eager mode will get better performance.
     "VLLM_ASCEND_ENABLE_MLP_OPTIMIZE":

vllm_ascend/ops/layernorm.py

Lines changed: 24 additions & 0 deletions
@@ -18,6 +18,10 @@
 from typing import Optional, Tuple, Union
 
 import torch
+import torch.nn.functional as F
+from vllm.distributed import (get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size)
+from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.layernorm import RMSNorm
 
 
@@ -44,6 +48,16 @@ def forward(
         import torch_npu
 
         if residual is not None:
+            forward_context = get_forward_context()
+            flashcomm_v1_enabled = forward_context.flashcomm_v1_enabled
+            if x.size(0) != residual.size(0) and \
+                    flashcomm_v1_enabled:
+                pad_size = forward_context.pad_size
+                tp_size = get_tensor_model_parallel_world_size()
+                tp_rank = get_tensor_model_parallel_rank()
+                from vllm_ascend.utils import maybe_pad_and_chunk_tensor
+                residual = maybe_pad_and_chunk_tensor(residual, pad_size, tp_size, tp_rank, 0)
+            assert x.size(0) == residual.size(0)
             x, _, residual = torch_npu.npu_add_rms_norm_quant(
                 x,
                 residual,
@@ -69,6 +83,16 @@ def forward_oot(
 
         from vllm_ascend.utils import is_310p
         if residual is not None:
+            forward_context = get_forward_context()
+            flashcomm_v1_enabled = forward_context.flashcomm_v1_enabled
+            if x.size(0) != residual.size(0) and \
+                    flashcomm_v1_enabled:
+                pad_size = forward_context.pad_size
+                tp_size = get_tensor_model_parallel_world_size()
+                tp_rank = get_tensor_model_parallel_rank()
+                from vllm_ascend.utils import maybe_pad_and_chunk_tensor
+                residual = maybe_pad_and_chunk_tensor(residual, pad_size, tp_size, tp_rank, 0)
+            assert x.size(0) == residual.size(0)
             if is_310p():
                 orig_dtype = residual.dtype
                 x = x + residual.to(x.dtype)
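
maybe_pad_and_chunk_tensor is imported from vllm_ascend.utils, whose hunk is not shown in this excerpt. Judging from the call sites above, it brings the residual onto the same token layout as the reduce-scattered hidden states: pad dim 0 to a multiple of tp_size, then keep only this rank's chunk. A hedged sketch under that assumption (2-D (num_tokens, hidden) input; not the commit's actual implementation):

import torch
import torch.nn.functional as F

def maybe_pad_and_chunk_tensor(x: torch.Tensor, pad_size: int, tp_size: int,
                               tp_rank: int, dim: int = 0) -> torch.Tensor:
    # Sketch assumes dim == 0 and a 2-D tensor.
    if pad_size > 0:
        # (0, 0, 0, pad_size) leaves the hidden dim alone and appends pad_size rows.
        x = F.pad(x, (0, 0, 0, pad_size))
    # Split the padded token dim evenly across TP ranks and keep this rank's slice.
    return torch.chunk(x, tp_size, dim=dim)[tp_rank]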

vllm_ascend/ops/linear.py

Lines changed: 153 additions & 1 deletion
@@ -18,7 +18,9 @@
 from typing import Optional, Union
 
 import torch
+import torch.nn.functional as F
 from torch.nn.parameter import Parameter
+import torch_npu
 from vllm.distributed import (divide, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               split_tensor_along_last_dim,
@@ -28,14 +30,19 @@
                                                ColumnParallelLinear,
                                                LinearBase,
                                                MergedColumnParallelLinear,
-                                               RowParallelLinear)
+                                               QKVParallelLinear,
+                                               RowParallelLinear,
+                                               UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization.base_config import \
     QuantizationConfig
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.forward_context import get_forward_context
 
 from vllm_ascend.distributed.parallel_state import (
     get_mlp_tensor_model_parallel_rank,
     get_mlp_tensor_model_parallel_world_size, get_mlp_tp_group)
+from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod, quant_per_tensor
+from vllm_ascend.utils import all_gather_and_maybe_unpad, maybe_pad_and_reduce_scatter
 
 
 class AscendMlpColumnParallelLinear(ColumnParallelLinear):
@@ -307,3 +314,148 @@ def forward(
         if not self.return_bias:
             return output
         return output, output_bias
+
+
+class AscendDenseMergedColumnParallelLinear(MergedColumnParallelLinear):
+    """Linear layer with column parallelism.
+
+    Implemented multiple optimization projects for dense models, such as FlashComm and
+    communication-computation fusion.
+    """
+
+    def forward(
+        self,
+        input_: torch.Tensor
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        bias = self.bias if not self.skip_bias_add else None
+
+        # Matrix multiply.
+        assert self.quant_method is not None
+        forward_context = get_forward_context()
+        flashcomm_v1_enabled = forward_context.flashcomm_v1_enabled
+        ag_matmal_enabled = forward_context.ag_matmal_enabled
+        pad_size = forward_context.pad_size
+        if not flashcomm_v1_enabled:
+            output_parallel = self.quant_method.apply(self, input_, bias)
+        # fp or bf
+        elif ag_matmal_enabled and isinstance(self.quant_method, UnquantizedLinearMethod):
+            raise NotImplementedError("AllGather_MatMul with UnquantizedLinearMethod is not implemented yet.")
+        # w8a8 quant
+        elif ag_matmal_enabled and isinstance(self.quant_method.quant_method, AscendW8A8LinearMethod):
+            raise NotImplementedError("AllGather_MatMul with AscendW8A8LinearMethod is not implemented yet.")
+        else:
+            input_ = all_gather_and_maybe_unpad(input_, pad_size, 0)
+            output_parallel = self.quant_method.apply(self, input_, bias)
+
+        if self.gather_output:
+            # All-gather across the partitions.
+            output = tensor_model_parallel_all_gather(output_parallel)
+        else:
+            output = output_parallel
+        output_bias = self.bias if self.skip_bias_add else None
+        if not self.return_bias:
+            return output
+        return output, output_bias
+
+
+class AscendDenseQKVParallelLinear(QKVParallelLinear):
+    """Linear layer with column parallelism.
+
+    Implemented multiple optimization projects for dense models, such as FlashComm and
+    communication-computation fusion.
+    """
+
+    def forward(
+        self,
+        input_: torch.Tensor
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        bias = self.bias if not self.skip_bias_add else None
+
+        # Matrix multiply.
+        assert self.quant_method is not None
+        forward_context = get_forward_context()
+        layer_num = self.prefix.split('.')[2]
+        if layer_num == '0':
+            flashcomm_v1_enabled = False
+        else:
+            flashcomm_v1_enabled = forward_context.flashcomm_v1_enabled
+        ag_matmal_enabled = forward_context.ag_matmal_enabled
+        pad_size = forward_context.pad_size
+        if not flashcomm_v1_enabled:
+            output_parallel = self.quant_method.apply(self, input_, bias)
+        # fp or bf
+        elif ag_matmal_enabled and isinstance(self.quant_method, UnquantizedLinearMethod):
+            raise NotImplementedError("AllGather_MatMul with UnquantizedLinearMethod is not implemented yet.")
+        # w8a8 quant
+        elif ag_matmal_enabled and isinstance(self.quant_method.quant_method, AscendW8A8LinearMethod):
+            raise NotImplementedError("AllGather_MatMul with AscendW8A8LinearMethod is not implemented yet.")
+        else:
+            input_ = all_gather_and_maybe_unpad(input_, pad_size, 0)
+            output_parallel = self.quant_method.apply(self, input_, bias)
+
+        if self.gather_output:
+            # All-gather across the partitions.
+            output = tensor_model_parallel_all_gather(output_parallel)
+        else:
+            output = output_parallel
+        output_bias = self.bias if self.skip_bias_add else None
+        if not self.return_bias:
+            return output
+        return output, output_bias
+
+
+class AscendDenseRowParallelLinear(RowParallelLinear):
+    """Linear layer with row parallelism.
+
+    Implemented multiple optimization projects for dense models, such as FlashComm and
+    communication-computation fusion.
+    """
+
+    def forward(
+        self,
+        input_: torch.Tensor
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        tp_rank = get_tensor_model_parallel_rank()
+        forward_context = get_forward_context()
+        flashcomm_v1_enabled = forward_context.flashcomm_v1_enabled
+        matmul_rs_enabled = forward_context.matmul_rs_enabled
+        pad_size = forward_context.pad_size
+        if self.input_is_parallel:
+            input_parallel = input_
+        else:
+            tp_rank = get_tensor_model_parallel_rank()
+            splitted_input = split_tensor_along_last_dim(
+                input_, num_partitions=self.tp_size)
+            input_parallel = splitted_input[tp_rank].contiguous()
+
+        # Matrix multiply.
+        assert self.quant_method is not None
+        # Only fuse bias add into GEMM for rank 0 (this ensures that
+        # bias will not get added more than once in TP>1 case)
+        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
+        if self.tp_size == 1 or not self.reduce_results:
+            output = self.quant_method.apply(self,
+                                             input_parallel,
+                                             bias=bias_)
+        elif not flashcomm_v1_enabled:
+            output_parallel = self.quant_method.apply(self,
+                                                      input_parallel,
+                                                      bias=bias_)
+            output = tensor_model_parallel_all_reduce(output_parallel)
+        # fp or bf
+        elif matmul_rs_enabled and isinstance(self.quant_method, UnquantizedLinearMethod):
+            raise NotImplementedError("Matmul_ReduceScatter with UnquantizedLinearMethod is not implemented yet.")
+        # w8a8 quant
+        elif matmul_rs_enabled and isinstance(self.quant_method.quant_method, AscendW8A8LinearMethod):
+            raise NotImplementedError("Matmul_ReduceScatter with AscendW8A8LinearMethod is not implemented yet.")
+        else:
+            output_parallel = self.quant_method.apply(self,
+                                                      input_parallel,
+                                                      bias=bias_)
+            output = maybe_pad_and_reduce_scatter(output_parallel, pad_size, 0)
+
+        output_bias = self.bias if self.skip_bias_add else None
+
+        if not self.return_bias:
+            return output
+        return output, output_bias
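
all_gather_and_maybe_unpad and maybe_pad_and_reduce_scatter are likewise imported from vllm_ascend.utils and not shown in this excerpt. The call sites imply the FlashComm v1 pattern: a row-parallel layer pads the token dimension and reduce-scatters it across TP ranks instead of all-reducing, while the next column-parallel layer all-gathers the tokens back and drops the padding before its matmul (the first decoder layer's QKV projection is exempted by the layer index parsed from self.prefix). A hedged sketch of the two helpers under those assumptions (dim 0, 2-D tensors; not the commit's actual utils code):

import torch
import torch.distributed as dist
import torch.nn.functional as F
from vllm.distributed import (get_tensor_model_parallel_world_size, get_tp_group,
                              tensor_model_parallel_all_gather)


def maybe_pad_and_reduce_scatter(x: torch.Tensor, pad_size: int,
                                 dim: int = 0) -> torch.Tensor:
    # Sketch assumes dim == 0 and a 2-D (num_tokens, hidden) tensor.
    if pad_size > 0:
        x = F.pad(x, (0, 0, 0, pad_size))  # pad tokens so they divide evenly by TP size
    tp_size = get_tensor_model_parallel_world_size()
    out = torch.empty_like(x.chunk(tp_size, dim=0)[0])
    # Sum partial outputs across TP ranks while scattering contiguous token chunks.
    dist.reduce_scatter_tensor(out, x.contiguous(), group=get_tp_group().device_group)
    return out


def all_gather_and_maybe_unpad(x: torch.Tensor, pad_size: int,
                               dim: int = 0) -> torch.Tensor:
    # Gather every rank's token chunk back into the full padded sequence ...
    x = tensor_model_parallel_all_gather(x, dim)
    # ... then drop the padding rows appended before the reduce-scatter.
    return x[:-pad_size] if pad_size > 0 else x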
