
Commit 31ba211

[V0.9.1] Patch compilation.decorator to support flashcomm_v1 in aclgraph
Signed-off-by: rjg-lyh <1318825571@qq.com>
1 parent 1542a75 commit 31ba211

6 files changed: 194 additions, 34 deletions


vllm_ascend/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -23,6 +23,8 @@ def register():


 def register_model():
+    import vllm  # noqa: F401
+    import vllm_ascend.patch.platform.patch_0_9_1.patch_decorator  # noqa: F401
     # fix pytorch schema check error, remove this line after pytorch
     # is upgraded to 2.7.0
     import vllm_ascend.patch.worker.patch_common.patch_utils  # noqa: F401
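
Note: the patch import sits at the top of register_model() so that it runs before any model module applies @support_torch_compile; the decorator resolves vLLM's private _support_torch_compile hook through the decorators module attribute, so replacing that attribute early reroutes every later decoration. Below is a minimal, self-contained sketch of that import-order effect, using illustrative stand-in names (fake_decorators, upstream_hook, ascend_hook, TinyModel) rather than code from this commit:

import types

# Stand-in for the vllm.compilation.decorators module (hypothetical names).
fake_decorators = types.SimpleNamespace()


def upstream_hook(cls, dynamic_arg_dims):
    cls.compile_backend = "upstream"
    return cls


def ascend_hook(cls, dynamic_arg_dims):
    cls.compile_backend = "ascend"
    return cls


fake_decorators._support_torch_compile = upstream_hook


def support_torch_compile(dynamic_arg_dims):
    # The private hook is looked up when the decorator is applied, so whatever
    # is installed on the module at that moment wins.
    def wrapper(cls):
        return fake_decorators._support_torch_compile(cls, dynamic_arg_dims)
    return wrapper


# The patch module's import-time side effect, in miniature.
fake_decorators._support_torch_compile = ascend_hook


@support_torch_compile(dynamic_arg_dims={"input_ids": 0})
class TinyModel:
    pass


print(TinyModel.compile_backend)  # "ascend"

If the patch were imported after the model modules, the classes would already have been decorated through the upstream hook, which is why both this file and patch_0_9_1/__init__.py pull the import forward.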

vllm_ascend/models/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -8,10 +8,10 @@ def register_model():
     from .deepseek_mtp import CustomDeepSeekMTP  # noqa: F401
     from .deepseek_v2 import CustomDeepseekV2ForCausalLM  # noqa: F401
     from .deepseek_v2 import CustomDeepseekV3ForCausalLM  # noqa: F401
+    from .qwen2 import CustomQwen2ForCausalLM  # noqa: F401
     from .qwen2_5_vl import \
         AscendQwen2_5_VLForConditionalGeneration  # noqa: F401
     from .qwen2_vl import AscendQwen2VLForConditionalGeneration  # noqa: F401
-    from .qwen2 import CustomQwen2ForCausalLM  # noqa: F401
     from .qwen3 import CustomQwen3ForCausalLM  # noqa: F401

     ModelRegistry.register_model(

vllm_ascend/models/qwen2.py

Lines changed: 35 additions & 31 deletions

@@ -1,34 +1,30 @@
 from collections.abc import Iterable
-from typing import Any, Optional, Union
+from typing import Optional, Union

 import torch
-from torch import nn
 import torch.nn.functional as F
+from torch import nn
 from transformers import Qwen2Config
-
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_gather,
+                              tensor_model_parallel_all_reduce,
+                              tensor_model_parallel_reduce_scatter)
+from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
+from vllm.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2Model
+from vllm.model_executor.models.utils import (AutoWeightsLoader,
+                                              PPMissingLayer, maybe_prefix)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
-from vllm.model_executor.models.utils import (AutoWeightsLoader, PPMissingLayer, maybe_prefix)
-
-from vllm.model_executor.models.qwen2 import Qwen2Model, Qwen2DecoderLayer
-from vllm.distributed import (
-    get_pp_group,
-    get_tensor_model_parallel_world_size,
-    get_tensor_model_parallel_rank,
-    tensor_model_parallel_all_gather,
-    tensor_model_parallel_all_reduce,
-    tensor_model_parallel_reduce_scatter)
-from vllm_ascend.attention.attention_v1 import AscendAttentionState
-from vllm.forward_context import get_forward_context
 import vllm_ascend.envs as ascend_envs
+from vllm_ascend.attention.attention_v1 import AscendAttentionState


 def all_gather_and_maybe_unpad(

@@ -40,6 +36,7 @@ def all_gather_and_maybe_unpad(
         return hidden_states[:-pad_size, :]
     return hidden_states

+
 def maybe_pad_and_reduce_scatter(
     hidden_states: torch.Tensor,
     pad_size: int,

@@ -49,6 +46,7 @@ def maybe_pad_and_reduce_scatter(
     hidden_states = tensor_model_parallel_reduce_scatter(hidden_states, 0)
     return hidden_states

+
 class CustomQwen2DecoderLayer(Qwen2DecoderLayer):

     def __init__(

@@ -64,9 +62,9 @@ def __init__(
                          prefix=prefix)
         self.tp_rank = get_tensor_model_parallel_rank()
         self.tp_size = get_tensor_model_parallel_world_size()
-        self.self_attn.o_proj.reduce_results=False
-        self.mlp.down_proj.reduce_results=False
-
+        self.self_attn.o_proj.reduce_results = False
+        self.mlp.down_proj.reduce_results = False
+
     def forward(
         self,
         positions: torch.Tensor,

@@ -81,19 +79,22 @@ def forward(
         if flashcomm_v1_enabled:
             if pad_size > 0:
                 residual = F.pad(residual, (0, 0, 0, pad_size))
-            residual = torch.chunk(residual, self.tp_size, dim=0)[self.tp_rank]
+            residual = torch.chunk(residual, self.tp_size,
+                                   dim=0)[self.tp_rank]
             hidden_states = self.input_layernorm(hidden_states)
         else:
             hidden_states, residual = self.input_layernorm(
                 hidden_states, residual)
         if flashcomm_v1_enabled:
-            hidden_states = all_gather_and_maybe_unpad(hidden_states, pad_size)
+            hidden_states = all_gather_and_maybe_unpad(
+                hidden_states, pad_size)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
         )
         if flashcomm_v1_enabled:
-            hidden_states = maybe_pad_and_reduce_scatter(hidden_states, pad_size)
+            hidden_states = maybe_pad_and_reduce_scatter(
+                hidden_states, pad_size)
         else:
             hidden_states = tensor_model_parallel_all_reduce(hidden_states)
         # Fully Connected

@@ -103,7 +104,8 @@ def forward(
             hidden_states = all_gather_and_maybe_unpad(hidden_states, pad_size)
         hidden_states = self.mlp(hidden_states)
         if flashcomm_v1_enabled:
-            hidden_states = maybe_pad_and_reduce_scatter(hidden_states, pad_size)
+            hidden_states = maybe_pad_and_reduce_scatter(
+                hidden_states, pad_size)
         else:
             hidden_states = tensor_model_parallel_all_reduce(hidden_states)
         return hidden_states, residual

@@ -120,11 +122,12 @@ def forward(
     })
 class CustomQwen2Model(Qwen2Model):

-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 decoder_layer_type: type[nn.Module] = CustomQwen2DecoderLayer):
+    def __init__(
+            self,
+            *,
+            vllm_config: VllmConfig,
+            prefix: str = "",
+            decoder_layer_type: type[nn.Module] = CustomQwen2DecoderLayer):
         super().__init__(vllm_config=vllm_config,
                          prefix=prefix,
                          decoder_layer_type=decoder_layer_type)

@@ -156,7 +159,8 @@ def forward(
             flashcomm_v1_enabled = True
         if flashcomm_v1_enabled:
             num_tokens = hidden_states.size(0)
-            pad_size = (self.tp_size - (num_tokens % self.tp_size)) % self.tp_size
+            pad_size = (self.tp_size -
+                        (num_tokens % self.tp_size)) % self.tp_size
         for layer in self.layers[self.start_layer:self.end_layer]:
             hidden_states, residual = layer(
                 positions,

@@ -201,7 +205,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):

         self.quant_config = quant_config
         self.model = CustomQwen2Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
+                                      prefix=maybe_prefix(prefix, "model"))

         if get_pp_group().is_last_rank:
             if config.tie_word_embeddings:
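
For context on the flashcomm_v1 path edited above: instead of an all-reduce after attention and after the MLP, the layer reduce-scatters hidden states along the token dimension and all-gathers them again before the next projection, which only works when the token count is a multiple of the TP world size; pad_size and the two helpers above supply that guarantee. Below is a minimal single-process sketch of just the pad/chunk/unpad arithmetic (tp_size, num_tokens and hidden_size are made-up values, and the collectives themselves are not simulated):

import torch
import torch.nn.functional as F

# Hypothetical sizes for illustration only.
tp_size = 4
num_tokens = 10          # not divisible by tp_size
hidden_size = 8

hidden_states = torch.randn(num_tokens, hidden_size)

# Same arithmetic as CustomQwen2Model.forward: pad the token dim up to a
# multiple of tp_size (pad_size is 0 when it already divides evenly).
pad_size = (tp_size - (num_tokens % tp_size)) % tp_size
padded = F.pad(hidden_states, (0, 0, 0, pad_size))
assert padded.size(0) % tp_size == 0

# Each TP rank keeps one token shard, mirroring how the residual is chunked
# in CustomQwen2DecoderLayer.forward (rank 0 shown here).
tp_rank = 0
shard = torch.chunk(padded, tp_size, dim=0)[tp_rank]
assert shard.size(0) == padded.size(0) // tp_size

# After the all-gather that reassembles the shards, the padding rows are
# dropped again, as in all_gather_and_maybe_unpad.
gathered = torch.cat(torch.chunk(padded, tp_size, dim=0), dim=0)
unpadded = gathered[:-pad_size, :] if pad_size > 0 else gathered
assert torch.equal(unpadded, hidden_states)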

vllm_ascend/patch/platform/patch_0_9_1/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -14,3 +14,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+
+import vllm_ascend.patch.platform.patch_0_9_1.patch_decorator  # noqa
vllm_ascend/patch/platform/patch_0_9_1/patch_decorator.py

Lines changed: 152 additions & 0 deletions

@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import inspect
+from typing import TypeVar, Union
+from unittest.mock import patch
+
+import torch
+import torch.nn as nn
+from torch._dynamo.symbolic_convert import InliningInstructionTranslator
+from vllm.compilation import decorators
+from vllm.compilation.counter import compilation_counter
+from vllm.compilation.monitor import start_monitoring_torch_compile
+from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
+from vllm.config import CompilationLevel, VllmConfig
+from vllm.forward_context import get_forward_context
+from vllm.logger import init_logger
+from vllm.sequence import IntermediateTensors
+from vllm.utils import supports_dynamo
+
+from vllm_ascend.attention.attention_v1 import AscendAttentionState
+
+logger = init_logger(__name__)
+
+_T = TypeVar("_T", bound=type[nn.Module])
+
+
+def _ascend_support_torch_compile(
+    cls: _T,
+    dynamic_arg_dims: dict[str, Union[int, list[int]]],
+) -> _T:
+    """
+    A decorator to add support for compiling the forward method of a class.
+    """
+    if TorchCompileWrapperWithCustomDispatcher in cls.__bases__:
+        # support decorating multiple times
+        return cls
+
+    # take care of method resolution order
+    # make sure super().__init__ is called on the base class
+    # other than TorchCompileWrapperWithCustomDispatcher
+    cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, )
+
+    old_init = cls.__init__
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs):
+        old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
+        self.vllm_config = vllm_config
+        # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner
+        # will handle the compilation, so we don't need to do anything here.
+        self.do_not_compile = \
+            vllm_config.compilation_config.level in [
+            CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS
+        ] or not supports_dynamo()
+        if self.do_not_compile:
+            return
+        compilation_counter.num_models_seen += 1
+        TorchCompileWrapperWithCustomDispatcher.__init__(
+            self, compilation_level=vllm_config.compilation_config.level)
+
+    cls.__init__ = __init__
+
+    def __call__(self, *args, **kwargs):
+        # torch.compiler.is_compiling() means we are inside the compilation
+        # e.g. TPU has the compilation logic in model runner, so we don't
+        # need to compile the model inside.
+        attn_metadata = get_forward_context().attn_metadata
+        if attn_metadata is not None and attn_metadata.attn_state != AscendAttentionState.DecodeOnly:
+            return self.forward(*args, **kwargs)
+
+        if self.do_not_compile or torch.compiler.is_compiling():
+            return self.forward(*args, **kwargs)
+
+        # the first compilation needs to have dynamic shapes marked
+        if len(self.compiled_codes) < 1:
+            sig = inspect.signature(self.__class__.forward)
+            bound_args = sig.bind(self, *args, **kwargs)
+            bound_args.apply_defaults()
+            for k, dims in dynamic_arg_dims.items():
+                arg = bound_args.arguments.get(k)
+                if arg is not None:
+                    dims = [dims] if isinstance(dims, int) else dims
+                    if isinstance(arg, torch.Tensor):
+                        # In case dims is specified with negative indexing
+                        dims = [
+                            arg.ndim + dim if dim < 0 else dim for dim in dims
+                        ]
+                        torch._dynamo.mark_dynamic(arg, dims)
+                    elif isinstance(arg, IntermediateTensors):
+                        for tensor in arg.tensors.values():
+                            # In case dims is specified with negative indexing
+                            dims = [
+                                tensor.ndim + dim if dim < 0 else dim
+                                for dim in dims
+                            ]
+                            torch._dynamo.mark_dynamic(tensor, dims)
+                    else:
+                        raise ValueError(
+                            "Unsupported dynamic dimensions"
+                            f" {dims} for argument {k} with type {type(arg)}.")
+            # here, it is the starting point of the `torch.compile` process
+            start_monitoring_torch_compile(self.vllm_config)
+            logger.debug("Start compiling function %s",
+                         self.original_code_object)
+
+        # if we don't use custom dispatcher, we can directly call the
+        # compiled function and let torch.compile handle the dispatching,
+        # with the overhead of guard evaluation and recompilation.
+        if len(self.compiled_codes) < 1 or not self.use_custom_dispatcher:
+            # it seems Dynamo reuse the compilation across instances,
+            # while we need to make sure the compiled code is not reused.
+            # we need to control all the compilation of the model.
+            torch._dynamo.eval_frame.remove_from_cache(
+                self.original_code_object)
+
+            # collect all relevant files traced by Dynamo,
+            # so that the compilation cache can trigger re-compilation
+            # properly when any of these files change.
+
+            # 1. the file containing the top-level forward function
+            self.vllm_config.compilation_config.traced_files.add(
+                self.original_code_object.co_filename)
+
+            # 2. every time Dynamo sees a function call, it will inline
+            # the function by calling InliningInstructionTranslator.inline_call
+            # we hijack this function to know all the functions called
+            # during Dynamo tracing, and their corresponding files
+            inline_call = InliningInstructionTranslator.inline_call
+
+            def patched_inline_call(parent, func, args, kwargs):
+                code = func.get_code()
+                self.vllm_config.compilation_config.traced_files.add(
+                    code.co_filename)
+                return inline_call(parent, func, args, kwargs)
+
+            with patch.object(InliningInstructionTranslator, 'inline_call',
+                              patched_inline_call):
+                output = self.compiled_callable(*args, **kwargs)
+            return output
+
+        # usually, capturing the model once is enough, and then we can
+        # dispatch to the compiled code directly, without going through
+        # the Dynamo guard mechanism.
+        with self.dispatch_to_code(0):
+            model_output = self.forward(*args, **kwargs)
+        return model_output
+
+    cls.__call__ = __call__
+    return cls
+
+
+decorators._support_torch_compile = _ascend_support_torch_compile
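
The functional difference from upstream _support_torch_compile is the early fallback at the top of __call__: when attention metadata is present and its state is not AscendAttentionState.DecodeOnly, the model runs self.forward eagerly before any compilation logic, so only decode-only batches reach the compiled / ACL graph path. A toy sketch of that dispatch order (AttnState and the lambdas are illustrative stand-ins, not the real vllm_ascend types):

from enum import Enum


# Illustrative stand-ins; the real state comes from vllm_ascend.attention.attention_v1
# and the metadata from vllm.forward_context.
class AttnState(Enum):
    PrefillNoCache = 0
    DecodeOnly = 1


def dispatch(attn_state, compiled_forward, eager_forward):
    # Mirrors the ordering in the patched __call__: non-decode batches take the
    # eager path before any do_not_compile / is_compiling checks are reached.
    if attn_state is not None and attn_state != AttnState.DecodeOnly:
        return eager_forward()
    return compiled_forward()


print(dispatch(AttnState.PrefillNoCache, lambda: "aclgraph", lambda: "eager"))  # eager
print(dispatch(AttnState.DecodeOnly, lambda: "aclgraph", lambda: "eager"))      # aclgraph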

vllm_ascend/worker/model_runner_v1.py

Lines changed: 2 additions & 2 deletions

@@ -2033,8 +2033,8 @@ def capture_model(self) -> None:
             for num_tokens in reversed(self.aclgraph_batch_sizes):
                 for _ in range(self.vllm_config.compilation_config.
                                cudagraph_num_of_warmups):
-                    self._dummy_run(num_tokens, skip_attn=skip_attn, with_prefill=False)
-                self._dummy_run(num_tokens, skip_attn=skip_attn, with_prefill=False)
+                    self._dummy_run(num_tokens, skip_attn=skip_attn)
+                self._dummy_run(num_tokens, skip_attn=skip_attn)
         else:
             logger.info("Skipping NPU graph capture for eager mode.")
             return
