Commit 408a8ea

Author: lwq (committed)

fix rebase bugs

Signed-off-by: lwq <liwenquan5@huawei.com>

1 parent 8a4479a · commit 408a8ea

File tree: 1 file changed (+3, -2 lines)


vllm_ascend/attention/mla_v1.py

Lines changed: 3 additions & 2 deletions
@@ -4,6 +4,7 @@
 import numpy as np
 import torch
 import torch_npu
+from torch import nn
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadata,
                                               MLAAttentionImpl)
@@ -21,6 +22,7 @@
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
+from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata, npu_stream_switch, npu_wait_tensor)
 from vllm_ascend.utils import npu_prefetch
 from vllm_ascend.worker.npu_input_batch import InputBatch
@@ -211,6 +213,7 @@ def __init__(self,
         self.rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
         self.cos_cache = None
         self.sin_cache = None
+        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
 
     def reorder_batch(self, input_batch: "InputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
@@ -230,10 +233,8 @@ def reorder_batch(self, input_batch: "InputBatch",
             # We treat spec decoding as decode.
             if num_tokens - num_spec_tokens == 1:
                 decodes.append(i)
-                num_decode_tokens += num_tokens
             else:
                 prefills.append(i)
-                num_prefill_tokens += num_tokens
 
         # We hope that this is fairly minimal since decodes
         # should be around for a number of iterations so hopefully they are
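
For context: the last hunk touches the decode/prefill split in reorder_batch. With speculative decoding, a request whose scheduled tokens minus its speculative tokens equal exactly 1 is classified as a decode; everything else is a prefill. The removed num_decode_tokens/num_prefill_tokens accumulations were presumably left over from the rebase (hence "fix rebase bugs"). Below is a minimal, self-contained sketch of that classification loop; the helper name and the input shapes (num_scheduled_tokens, scheduled_spec_decode_tokens) are assumptions for illustration, not a verbatim copy of the file.

# Hypothetical sketch of the decode/prefill split in reorder_batch;
# input names are assumed, only the classification rule comes from the diff.
def split_decodes_and_prefills(req_ids, num_scheduled_tokens,
                               scheduled_spec_decode_tokens):
    decodes, prefills = [], []
    for i, req_id in enumerate(req_ids):
        num_tokens = num_scheduled_tokens[req_id]
        num_spec_tokens = len(scheduled_spec_decode_tokens.get(req_id, ()))
        # We treat spec decoding as decode: discount the speculative
        # tokens before testing for the single new token.
        if num_tokens - num_spec_tokens == 1:
            decodes.append(i)
        else:
            prefills.append(i)
    return decodes, prefills

# Example: "b" schedules 3 tokens, 2 of them speculative, so it still
# counts as a decode; "a" schedules 7 tokens and is a prefill.
decodes, prefills = split_decodes_and_prefills(
    ["a", "b"], {"a": 7, "b": 3}, {"b": [101, 102]})
assert decodes == [1] and prefills == [0]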
