Commit 793892f
feat(graph): Enable ACL graph capture for MLA decode
Adds support for capturing the Multi-head Latent Attention (MLA) decode operation into an ACL graph. This improves performance by compiling the attention kernel for single-token decoding.

Key changes include:
- Implementing the graph capture logic for the MLA kernel, including workspace management and parameter updates.
- Modifying the rotary embedding (RoPE) handling to use pre-allocated tensors, which is a requirement for graph capture.
- Adding a `build_for_graph_capture` method to the MLA metadata builder to create dummy metadata during the graph compilation phase.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
1 parent d01fd1d commit 793892f
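
The diffs below follow a capture-then-patch pattern for the fused MLA attention kernel: during graph capture, each layer's call is recorded as a task group with a cached workspace and an external event; at replay time, `update_mla_attn_params` patches fresh arguments into the recorded task groups. The sketch below condenses that capture-side pattern from the mla_v1.py and acl_graph.py changes. It is illustrative only; the helper name `record_fused_attention` and the trimmed argument packing are assumptions, not code from this commit.

```python
# Illustrative sketch of the capture-then-patch pattern used in this commit.
# `record_fused_attention` and its signature are assumptions for readability.
import torch
import torch_npu


def record_fused_attention(graph_params, num_tokens, q_nope, k_nope, **kwargs):
    """Record one fused-attention call into the ACL graph being captured."""
    stream = torch_npu.npu.current_stream()

    # The replay-time updater records this event after patching new arguments;
    # the captured graph waits on it before launching the kernel.
    event = torch.npu.ExternalEvent()
    event.wait(stream)
    event.reset(stream)
    graph_params.events[num_tokens].append(event)

    # The workspace depends only on the padded shape, so it is queried once
    # per captured batch size and reused on every replay.
    workspace = graph_params.workspaces.get(num_tokens)
    if workspace is None:
        workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(
            q_nope, k_nope, k_nope, **kwargs)
        graph_params.workspaces[num_tokens] = workspace

    attn_output = torch.empty_like(q_nope)
    softmax_lse = torch.empty(num_tokens, dtype=q_nope.dtype, device=q_nope.device)

    # Recording the call as a task group yields a handle that
    # update_mla_attn_params later uses (graph_task_update_begin/end)
    # to swap in fresh arguments without re-capturing the graph.
    torch.npu.graph_task_group_begin(stream)
    torch_npu.npu_fused_infer_attention_score.out(
        q_nope, k_nope, k_nope, **kwargs,
        workspace=workspace, out=[attn_output, softmax_lse])
    handle = torch.npu.graph_task_group_end(stream)
    graph_params.handles[num_tokens].append(handle)
    return attn_output
```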

File tree

vllm_ascend/attention/attention_v1.py
vllm_ascend/attention/mla_v1.py
vllm_ascend/attention/utils.py
vllm_ascend/compilation/acl_graph.py
vllm_ascend/platform.py
vllm_ascend/worker/model_runner_v1.py

6 files changed: +181 −34 lines changed

vllm_ascend/attention/attention_v1.py

Lines changed: 1 addition & 0 deletions
@@ -244,6 +244,7 @@ def build_for_graph_capture(
         self,
         common_attn_metadata: AscendCommonAttentionMetadata,
         attn_state: AscendAttentionState = AscendAttentionState.DecodeOnly,
+        model: Optional[nn.Module] = None,
     ):
         if attn_state == AscendAttentionState.DecodeOnly:
             attn_metadata = self.build(

vllm_ascend/attention/mla_v1.py

Lines changed: 95 additions & 23 deletions
@@ -10,6 +10,7 @@
                                               MLAAttentionImpl)
 from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group
+from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.layers.linear import (LinearBase,
                                                UnquantizedLinearMethod)
 from vllm.utils import cdiv, round_down
@@ -21,6 +22,7 @@
                                           maybe_save_kv_layer_to_connector,
                                           split_decodes_and_prefills,
                                           wait_for_kv_layer_from_connector)
+from vllm_ascend.compilation.acl_graph import get_graph_params
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
@@ -169,7 +171,7 @@ def split_metadata_for_multistream(
 class AscendMLAMetadataBuilder:
     # Does this backend/builder support ACL Graphs for attention (default: no).
     aclgraph_support: ClassVar[AttentionCGSupport] = \
-        AttentionCGSupport.NEVER
+        AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
     """
     NOTE: Please read the comment at the top of the file before trying to
     understand this class
@@ -314,6 +316,8 @@ def build(
                 self.model_config.dtype)  # type: ignore
             self.sin_cache = self.sin_cache.to(  # type: ignore
                 self.model_config.dtype)  # type: ignore
+        cos = common_attn_metadata.cos
+        sin = common_attn_metadata.sin

         query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
         query_lens = query_seq_lens_cpu[:num_reqs]
@@ -395,9 +399,12 @@ def build(
             block_table = block_table[:num_decodes, ...]
             seq_lens_list = seq_lens.tolist()

-            cos = self.cos_cache[input_positions].unsqueeze(  # type: ignore
+            assert self.cos_cache is not None
+            assert self.sin_cache is not None
+
+            cos[:num_decodes, ...] = self.cos_cache[input_positions].unsqueeze(
                 1).unsqueeze(2)
-            sin = self.sin_cache[input_positions].unsqueeze(  # type: ignore
+            sin[:num_decodes, ...] = self.sin_cache[input_positions].unsqueeze(
                 1).unsqueeze(2)

             decode_metadata = AscendMLADecodeMetadata(
@@ -408,8 +415,8 @@ def build(
                 max_seq_lens=max_seq_lens,
                 attn_mask=common_attn_metadata.spec_attn_mask,
                 actual_seq_lengths_q=actual_seq_lengths_q,
-                sin=sin,
-                cos=cos)
+                sin=sin[:num_decodes, ...],
+                cos=cos[:num_decodes, ...])

         return self.metadata_cls(  # type: ignore
             num_actual_tokens=num_actual_tokens,
@@ -429,6 +436,26 @@ def build(
             enable_dbo_across_dp=common_attn_metadata.enable_dbo_across_dp,
         )

+    def build_for_graph_capture(
+        self,
+        common_attn_metadata: AscendCommonAttentionMetadata,
+        attn_state: AscendAttentionState = AscendAttentionState.DecodeOnly,
+        model: Optional[nn.Module] = None,
+    ):
+        if attn_state == AscendAttentionState.DecodeOnly:
+            attn_metadata = self.build(
+                common_prefix_len=0,
+                common_attn_metadata=common_attn_metadata,
+                model=model,
+            )
+        else:
+            raise NotImplementedError(
+                "Currently we only support building dummy metadata for DecodeOnly state"
+            )
+
+        attn_metadata.attn_state = attn_state
+        return attn_metadata
+

 class DecodeMLAPreprocessResult(NamedTuple):
     ql_nope: Optional[torch.Tensor] = None
@@ -832,24 +859,69 @@ def _forward_decode(
             sparse_mode = 0
             spec_attn_mask = None

-        attn_output, _ = torch_npu.npu_fused_infer_attention_score(
-            q_nope,
-            k_nope,
-            k_nope,
-            query_rope=q_pe,
-            key_rope=k_pe,
-            num_heads=self.num_heads,
-            num_key_value_heads=self.num_kv_heads,
-            input_layout=input_layout,
-            atten_mask=spec_attn_mask,
-            sparse_mode=sparse_mode,
-            scale=self.scale,
-            antiquant_mode=0,
-            antiquant_scale=None,
-            block_table=decode_meta.block_table,
-            block_size=block_size,
-            actual_seq_lengths_kv=decode_meta.seq_lens_list,
-            actual_seq_lengths=actual_seq_lengths)
+        common_kwargs = {
+            'query_rope': q_pe,
+            'key_rope': k_pe,
+            'num_heads': self.num_heads,
+            'num_key_value_heads': self.num_kv_heads,
+            'input_layout': input_layout,
+            'atten_mask': spec_attn_mask,
+            'sparse_mode': sparse_mode,
+            'scale': self.scale,
+            'antiquant_mode': 0,
+            'antiquant_scale': None,
+            'block_table': decode_meta.block_table,
+            'block_size': block_size,
+            "actual_seq_lengths": actual_seq_lengths,
+            "actual_seq_lengths_kv": decode_meta.seq_lens_list,
+        }
+        graph_params = get_graph_params()
+        forward_context: ForwardContext = get_forward_context()
+        if forward_context.capturing:
+            stream = torch_npu.npu.current_stream()
+
+            event = torch.npu.ExternalEvent()
+            event.wait(stream)
+            event.reset(stream)
+            graph_params.events[num_tokens].append(event)
+
+            workspace = graph_params.workspaces.get(num_tokens)
+            if workspace is None:
+                workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(
+                    q_nope,
+                    k_nope,
+                    k_nope,
+                    **common_kwargs)
+                graph_params.workspaces[num_tokens] = workspace
+
+            attn_output = torch.empty_like(q_nope)
+            softmax_lse = torch.empty(num_tokens,
+                                      dtype=q_nope.dtype,
+                                      device=q_nope.device)
+
+            graph_params.attn_params[num_tokens].append(
+                (q_nope, k_nope, q_pe, k_pe, self.num_heads, self.num_kv_heads,
+                 input_layout, spec_attn_mask, sparse_mode, self.scale,
+                 decode_meta.block_table, block_size,
+                 decode_meta.seq_lens_list, actual_seq_lengths, workspace,
+                 attn_output, softmax_lse))
+
+            torch.npu.graph_task_group_begin(stream)
+            torch_npu.npu_fused_infer_attention_score.out(
+                q_nope,
+                k_nope,
+                k_nope,
+                **common_kwargs,
+                workspace=workspace,
+                out=[attn_output, softmax_lse])
+            handle = torch.npu.graph_task_group_end(stream)
+            graph_params.handles[num_tokens].append(handle)
+        else:
+            attn_output, _ = torch_npu.npu_fused_infer_attention_score(
                q_nope,
                k_nope,
                k_nope,
                **common_kwargs)

         current_ms_metadata = get_multistream_comm_context()
         if current_ms_metadata is None:

vllm_ascend/attention/utils.py

Lines changed: 4 additions & 0 deletions
@@ -63,6 +63,10 @@ class AscendCommonAttentionMetadata:

     graph_pad_size: int = -1

+    # NOTE: This is a temporary solution for rotary embedding in MLA
+    cos: torch.Tensor = None
+    sin: torch.Tensor = None
+

 def split_decodes_and_prefills(
     common_attn_metadata: AscendCommonAttentionMetadata,
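
The new `cos`/`sin` fields exist because a captured ACL graph replays against fixed tensor addresses: the RoPE values for each decode step must be copied into persistent buffers rather than materialized as fresh tensors inside `build()`. Below is a minimal sketch of that pattern; the buffer sizes are hypothetical placeholders (the real buffers are allocated in the model runner, where the rotary dim of 64 is currently hard-coded, see the model_runner_v1.py diff further down).

```python
# Minimal sketch (illustrative): per-step RoPE values are written into
# persistent buffers so their storage addresses stay stable across graph replays.
import torch

MAX_NUM_REQS, ROPE_DIM = 128, 64  # hypothetical sizes; 64 mirrors the hard-coded value

cos_buf = torch.ones(MAX_NUM_REQS, 1, 1, ROPE_DIM)   # address baked into the captured graph
sin_buf = torch.zeros(MAX_NUM_REQS, 1, 1, ROPE_DIM)


def refresh_rope(cos_cache, sin_cache, input_positions, num_decodes):
    # In-place slice assignment reuses the existing storage instead of
    # allocating new tensors, which would break replay of the captured graph.
    cos_buf[:num_decodes] = cos_cache[input_positions].unsqueeze(1).unsqueeze(2)
    sin_buf[:num_decodes] = sin_cache[input_positions].unsqueeze(1).unsqueeze(2)
    return cos_buf[:num_decodes], sin_buf[:num_decodes]
```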

vllm_ascend/compilation/acl_graph.py

Lines changed: 46 additions & 0 deletions
@@ -229,6 +229,52 @@ def update_attn_params(update_stream, forward_context, runtime_shape):
         event.record(update_stream)


+def update_mla_attn_params(update_stream, forward_context, runtime_shape):
+    graph_params = get_graph_params()
+    # FIXME: Behold! We are using a temporary hack here to update the args
+    # for each layer's attention op in the graph.
+    for key, param, handle, event in zip(
+            forward_context.attn_metadata,
+            graph_params.attn_params[runtime_shape],
+            graph_params.handles[runtime_shape],
+            graph_params.events[runtime_shape],
+    ):
+        (q_nope, k_nope, q_pe, k_pe, num_heads, num_kv_heads, input_layout,
+         spec_attn_mask, sparse_mode, scale, block_table, block_size,
+         seq_lens_list, actual_seq_lengths, workspace, attn_output,
+         softmax_lse) = param
+        seq_lens_list = forward_context.attn_metadata[key].decode.seq_lens_list
+        seq_lens_list = seq_lens_list + [0] * (runtime_shape -
+                                               len(seq_lens_list))
+
+        with torch.npu.stream(update_stream):
+            torch.npu.graph_task_update_begin(update_stream, handle)
+
+            torch_npu.npu_fused_infer_attention_score.out(
+                q_nope,
+                k_nope,
+                k_nope,
+                query_rope=q_pe,
+                key_rope=k_pe,
+                num_heads=num_heads,
+                num_key_value_heads=num_kv_heads,
+                input_layout=input_layout,
+                atten_mask=spec_attn_mask,
+                sparse_mode=sparse_mode,
+                scale=scale,
+                antiquant_mode=0,
+                antiquant_scale=None,
+                block_table=block_table,
+                block_size=block_size,
+                actual_seq_lengths_kv=seq_lens_list,
+                actual_seq_lengths=actual_seq_lengths,
+                workspace=workspace,
+                out=[attn_output, softmax_lse])
+            torch.npu.graph_task_update_end(update_stream)
+
+        event.record(update_stream)
+
+
 @dataclass
 class GraphParams:
     events: dict[int, list[torch.npu.ExternalEvent]]
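
For orientation, the per-shape bookkeeping that `update_mla_attn_params` walks over is organized roughly as sketched below. This is an illustrative reconstruction with simplified field types, not the actual `GraphParams` definition from acl_graph.py.

```python
# Illustrative reconstruction (simplified types): everything is keyed by the
# captured runtime shape (padded decode token count), with one list entry per
# attention layer in capture order.
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Any


@dataclass
class GraphParamsSketch:
    events: dict[int, list[Any]] = field(default_factory=lambda: defaultdict(list))
    workspaces: dict[int, Any] = field(default_factory=dict)
    handles: dict[int, list[Any]] = field(default_factory=lambda: defaultdict(list))
    attn_params: dict[int, list[tuple]] = field(default_factory=lambda: defaultdict(list))
```

The `zip` in `update_mla_attn_params` relies on this layer-ordered layout: the iteration order of `forward_context.attn_metadata` has to match the order in which the layers were recorded during capture.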

vllm_ascend/platform.py

Lines changed: 1 addition & 6 deletions
@@ -211,12 +211,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:

         if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
             compilation_config.level = CompilationLevel.NO_COMPILATION
-        # TODO: Currently MLA does not support FULL_DECODE_ONLY, remove the second condition
-        # after MLA being supported
-        elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE or (
-                compilation_config.cudagraph_mode
-                == CUDAGraphMode.FULL_DECODE_ONLY and model_config is not None
-                and model_config.use_mla):
+        elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE:
             logger.info(
                 "PIECEWISE compilation enabled on NPU. use_inductor not supported - "
                 "using only ACL Graph mode")

vllm_ascend/worker/model_runner_v1.py

Lines changed: 34 additions & 5 deletions
@@ -101,7 +101,8 @@
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
                                                set_graph_params,
-                                               update_attn_params)
+                                               update_attn_params,
+                                               update_mla_attn_params)
 from vllm_ascend.eplb.adaptor.vllm_adaptor import VllmEplbAdaptor
 from vllm_ascend.eplb.core.eplb_device_transfer_loader import \
     D2DExpertWeightLoader
@@ -358,6 +359,20 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.slot_mapping = torch.zeros(self.max_num_tokens,
                                         dtype=torch.int32,
                                         device=self.device)
+        # FIXME: Do not hard code 64 here! And also find a better way to
+        # fix the MLA RoPE issue.
+        self.cos = torch.ones(self.max_num_reqs,
+                              1,
+                              1,
+                              64,
+                              dtype=self.dtype,
+                              device=self.device)
+        self.sin = torch.zeros(self.max_num_reqs,
+                               1,
+                               1,
+                               64,
+                               dtype=self.dtype,
+                               device=self.device)

         self.uses_mrope = self.model_config.uses_mrope
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
@@ -1508,6 +1523,8 @@ def _prepare_inputs(
             max_query_len=max_num_scheduled_tokens,
             graph_pad_size=self.graph_pad_size,
             decode_token_per_req=self.decode_token_per_req,
+            cos=self.cos,
+            sin=self.sin,
         )

         if self.speculative_config and \
@@ -1537,7 +1554,7 @@ def _prepare_inputs(
             attn_metadata_i = builder.build(
                 common_prefix_len=common_prefix_len,
                 common_attn_metadata=common_attn_metadata,
-                model=self.model,
+                model=self.get_model(),
                 **extra_attn_metadata_args)

         if self.vllm_config.model_config.use_mla:
@@ -1572,8 +1589,13 @@ def _generate_process_reqs_hidden_states(self, attn_metadata, with_prefill,

         forward_context = get_forward_context()
         if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL:
-            update_attn_params(self.update_stream, forward_context,
-                               positions.shape[0])
+            if self.vllm_config.model_config.use_mla:
+                # FIXME: Try using `auto_dispatch_capture=True`
+                update_mla_attn_params(self.update_stream, forward_context,
+                                       positions.shape[0])
+            else:
+                update_attn_params(self.update_stream, forward_context,
+                                   positions.shape[0])

         if get_forward_context().flashcomm_v1_enabled:
             hidden_states = tensor_model_parallel_all_gather(hidden_states, 0)
@@ -2274,8 +2296,14 @@ def _build_attention_metadata(self, create_mixed_batch, num_reqs,
                 block_table_tensor=block_table_tensor[:num_reqs],
                 slot_mapping=self.slot_mapping,
                 num_computed_tokens_cpu=num_computed_tokens_cpu,
+                positions=self.positions,
+                attn_mask=self.attn_mask,
+                spec_attn_mask=self.spec_attn_mask,
+                attn_state=self.attn_state,
                 max_query_len=max_query_len,
                 decode_token_per_req=self.decode_token_per_req,
+                cos=self.cos,
+                sin=self.sin,
             )

             for attn_group in self.attn_groups[kv_cache_group_id]:
@@ -2284,7 +2312,8 @@ def _build_attention_metadata(self, create_mixed_batch, num_reqs,
             else:
                 builder = attn_group.get_metadata_builder()
                 attn_metadata_i = builder.build_for_graph_capture(
-                    common_attn_metadata)
+                    common_attn_metadata, AscendAttentionState.DecodeOnly,
+                    self.get_model())
                 for layer_name in kv_cache_group_spec.layer_names:
                     attn_metadata[layer_name] = attn_metadata_i