Merged
2 changes: 1 addition & 1 deletion .github/workflows/format_pr_body.yaml
@@ -36,7 +36,7 @@ jobs:

- name: Get vLLM version
run: |
VLLM_COMMIT=9607d5eb449711b349d4c2bee0a9c94afcc7ed14
VLLM_COMMIT=5aeb9254521023f97aca292b3478aa7ff485ffb2
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

- name: Checkout repository
6 changes: 3 additions & 3 deletions .github/workflows/vllm_ascend_test.yaml
@@ -42,7 +42,7 @@ jobs:
lint:
uses: ./.github/workflows/pre-commit.yml
with:
vllm: 9607d5eb449711b349d4c2bee0a9c94afcc7ed14
vllm: 5aeb9254521023f97aca292b3478aa7ff485ffb2

changes:
runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
VLLM_USE_MODELSCOPE: True
strategy:
matrix:
vllm_version: [9607d5eb449711b349d4c2bee0a9c94afcc7ed14, v0.10.2]
vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
steps:
- name: Install packages
run: |
@@ -138,7 +138,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [9607d5eb449711b349d4c2bee0a9c94afcc7ed14, v0.10.2]
vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_full.yaml
@@ -68,7 +68,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [9607d5eb449711b349d4c2bee0a9c94afcc7ed14, v0.10.2]
vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
1 change: 0 additions & 1 deletion docs/source/developer_guide/modeling/adding_a_new_model.md
@@ -61,7 +61,6 @@ from torch import nn
from vllm.attention import Attention
from vllm.config import VllmConfig
from vllm.sequence import IntermediateTensors
from vllm.model_executor.sampling_metadata import SamplingMetadata

class CustomAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
7 changes: 6 additions & 1 deletion tests/e2e/model_utils.py
@@ -19,7 +19,12 @@

from typing import Dict, List, Optional, Sequence, Tuple, Union

from vllm.sequence import PromptLogprobs, SampleLogprobs
from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.10.2"):
from vllm.sequence import PromptLogprobs, SampleLogprobs
else:
from vllm.logprobs import PromptLogprobs, SampleLogprobs

TokensText = Tuple[List[int], str]
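For context, the gate above relies on vllm_ascend.utils.vllm_version_is. A rough, hypothetical sketch of what such a helper might look like; the real implementation may differ, for example in how it handles dev or rc suffixes:

from vllm import __version__ as VLLM_VERSION

def vllm_version_is(target: str) -> bool:
    # Hypothetical sketch: True only when the installed vLLM exactly matches
    # the pinned release, so main-branch installs fall through to the new
    # import location (vllm.logprobs) shown above.
    return VLLM_VERSION == target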

16 changes: 0 additions & 16 deletions tests/ut/core/test_schedule_config.py
@@ -27,7 +27,6 @@ def setUp(self):
max_model_len=8192,
is_multimodal_model=False,
send_delta_data=False,
scheduler_delay_factor=0,
)

def test_initialize_from_config_with_default(self):
@@ -90,21 +89,6 @@ def test_not_implemented_send_delta_data(self):
str(context.exception),
)

def test_not_implemented_delay_factor(self):
with self.assertRaises(NotImplementedError) as context:
AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config,
AscendSchedulerConfig(
delay_factor=1,
max_num_batched_tokens=2048,
max_model_len=2048,
),
)
self.assertIn(
"currently AscendScheduler doesn't support scheduler_delay_factor",
str(context.exception),
)

def test_no_override(self):
ascend_config = AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config, {})
2 changes: 0 additions & 2 deletions tests/ut/models/test_deepseek_mtp.py
@@ -168,8 +168,6 @@ def setup_mtp(self, mocker: MockerFixture):
mocker.patch(
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__",
return_value=None)
mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
return_value=None)
mocker.patch(
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
return_value=None)
2 changes: 0 additions & 2 deletions tests/ut/torchair/models/test_torchair_deepseek_mtp.py
@@ -165,8 +165,6 @@ def setup_mtp(self, mocker: MockerFixture):
mocker.patch(
"vllm_ascend.torchair.models.torchair_deepseek_mtp.TorchairDeepSeekMultiTokenPredictorLayer.__call__",
return_value=None)
mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
return_value=None)
mocker.patch(
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
return_value=None)
2 changes: 1 addition & 1 deletion vllm_ascend/core/schedule_config.py
@@ -74,7 +74,7 @@ def __post_init__(self) -> None:
if self.send_delta_data:
raise NotImplementedError(
"currently AscendScheduler doesn't support send_delta_data.")
if self.delay_factor > 0:
if getattr(self, "scheduler_delay_factor", 0) > 0:
raise NotImplementedError(
"currently AscendScheduler doesn't support scheduler_delay_factor."
)
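The getattr fallback above (also used in platform.py later in this diff) avoids an AttributeError on newer vLLM, where scheduler_delay_factor was removed from the scheduler config. A small, self-contained illustration with stand-in classes, not the real vLLM configs:

class OldSchedulerConfig:      # field still present on v0.10.2 (stand-in)
    scheduler_delay_factor = 1

class NewSchedulerConfig:      # field removed on newer vLLM (stand-in)
    pass

for cfg in (OldSchedulerConfig(), NewSchedulerConfig()):
    # Direct attribute access would raise AttributeError for the new config;
    # getattr with a default of 0 keeps a single code path for both versions.
    if getattr(cfg, "scheduler_delay_factor", 0) > 0:
        print(f"{type(cfg).__name__}: delay factor requested, reject")
    else:
        print(f"{type(cfg).__name__}: no delay factor, proceed")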
2 changes: 1 addition & 1 deletion vllm_ascend/models/__init__.py
@@ -25,7 +25,7 @@ def register_model():

ModelRegistry.register_model(
"DeepseekV3ForCausalLM",
"vllm_ascend.models.deepseek_v3:CustomDeepseekV3ForCausalLM")
"vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM")

ModelRegistry.register_model(
"DeepSeekMTPModel",
8 changes: 2 additions & 6 deletions vllm_ascend/models/deepseek_mtp.py
@@ -28,15 +28,13 @@
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.models.deepseek_mtp import (
DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer,
SharedHead)
from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer
from vllm.model_executor.models.utils import maybe_prefix
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors


@@ -168,7 +166,7 @@ def forward(
def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
sampling_metadata, # type: ignore
spec_step_idx: int = 0,
) -> torch.Tensor:
current_step_idx = (spec_step_idx % self.num_mtp_layers)
@@ -188,8 +186,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
prefix=maybe_prefix(
prefix, "model"))

self.sampler = get_sampler()

def forward(
self,
input_ids: torch.Tensor,
@@ -204,4 +200,4 @@ def forward(
hidden_states = self.model(input_ids, positions, kv_caches,
attn_metadata, previous_hidden_states,
inputs_embeds, spec_step_idx)
return hidden_states
return hidden_states
4 changes: 4 additions & 0 deletions vllm_ascend/models/deepseek_v2.py
@@ -479,4 +479,8 @@ def load_weights(self, weights: Iterable[tuple[str,
return loaded_params


class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
pass


DeepseekV2DecoderLayer.__init__ = CustomDeepseekV2DecoderLayer.__init__
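With CustomDeepseekV3ForCausalLM now defined in deepseek_v2.py, the registration in vllm_ascend/models/__init__.py (changed above) can point at this module. For reference, a hedged sketch of the out-of-tree registration pattern this relies on; the "module:Class" string must resolve to an importable class:

from vllm import ModelRegistry

# The architecture name from the model's HF config maps to a "module:Class"
# path; the class is only imported when a model with that architecture loads.
ModelRegistry.register_model(
    "DeepseekV3ForCausalLM",
    "vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM")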
27 changes: 0 additions & 27 deletions vllm_ascend/models/deepseek_v3.py
@@ -1,27 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2ForCausalLM


class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
pass
7 changes: 3 additions & 4 deletions vllm_ascend/models/qwen3_next.py
@@ -50,7 +50,6 @@
AutoWeightsLoader, PPMissingLayer, extract_layer_index,
is_pp_missing_parameter, make_empty_intermediate_tensors_factory,
make_layers, maybe_prefix)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
@@ -1079,9 +1078,9 @@ def get_mamba_state_shape_from_config(
use_v1=True)

def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
self,
hidden_states: torch.Tensor,
sampling_metadata, # type: ignore
) -> Optional[torch.Tensor]:
return self.logits_processor(self.lm_head, hidden_states,
sampling_metadata)
12 changes: 12 additions & 0 deletions vllm_ascend/ops/vocab_parallel_embedding.py
@@ -253,3 +253,15 @@ def _get_logits_normal(
logits = logits[..., :self.org_vocab_size]

return logits

def forward(
self,
lm_head: VocabParallelEmbedding,
hidden_states: torch.Tensor,
# keep this for version compatibility
Collaborator: It's better to add a TODO: delete the arg sampling_metadata when dropping v0.10.2.

Collaborator (Author): OK, let's wait for CI to pass, and then I'll push an update.

sampling_metadata=None, # type: ignore
embedding_bias: Optional[torch.Tensor] = None,
) -> Optional[torch.Tensor]:
return super().forward(lm_head,
hidden_states,
embedding_bias=embedding_bias)
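Following the review thread above, a hedged sketch of how this override could look once v0.10.2 support is dropped and the sampling_metadata placeholder is deleted; the class name below is assumed for illustration and is not taken from the diff:

from typing import Optional

import torch
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding

class CustomLogitsProcessor(LogitsProcessor):  # hypothetical class name
    def forward(
        self,
        lm_head: VocabParallelEmbedding,
        hidden_states: torch.Tensor,
        embedding_bias: Optional[torch.Tensor] = None,
    ) -> Optional[torch.Tensor]:
        # No compatibility kwarg left once every supported vLLM has dropped it.
        return super().forward(lm_head,
                               hidden_states,
                               embedding_bias=embedding_bias)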
2 changes: 1 addition & 1 deletion vllm_ascend/platform.py
@@ -142,7 +142,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
"functionality is currently suboptimal.")
if not model_config.is_multimodal_model and \
structured_outputs_config.backend == "auto" and \
not scheduler_config.delay_factor > 0 and \
not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
not scheduler_config.send_delta_data and \
scheduler_config.policy == "fcfs":
ascend_scheduler_config.enabled = True
7 changes: 3 additions & 4 deletions vllm_ascend/torchair/models/qwen2.py
@@ -40,7 +40,6 @@
from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Model
from vllm.model_executor.models.utils import (AutoWeightsLoader,
PPMissingLayer, maybe_prefix)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors

from vllm_ascend.ascend_config import get_ascend_config
@@ -343,9 +342,9 @@ def forward(
return hidden_states

def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
self,
hidden_states: torch.Tensor,
sampling_metadata, # type: ignore
) -> Optional[torch.Tensor]:
logits = self.logits_processor(self.lm_head, hidden_states,
sampling_metadata)
6 changes: 1 addition & 5 deletions vllm_ascend/torchair/models/torchair_deepseek_mtp.py
@@ -27,14 +27,12 @@
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.models.deepseek_mtp import (
DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer,
SharedHead)
from vllm.model_executor.models.utils import maybe_prefix
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors

from vllm_ascend.torchair.models.torchair_deepseek_v2 import \
@@ -172,7 +170,7 @@ def forward(
def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
sampling_metadata, # type: ignore
spec_step_idx: int = 0,
) -> torch.Tensor:
current_step_idx = (spec_step_idx % self.num_mtp_layers)
@@ -199,8 +197,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.model = TorchairDeepSeekMultiTokenPredictor(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"))

self.sampler = get_sampler()

def forward(
self,
input_ids: torch.Tensor,
2 changes: 0 additions & 2 deletions vllm_ascend/torchair/models/torchair_deepseek_v2.py
@@ -52,7 +52,6 @@
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import (
@@ -945,7 +944,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
else:
self.lm_head = PPMissingLayer()
self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = get_sampler()
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)

19 changes: 9 additions & 10 deletions vllm_ascend/torchair/models/torchair_pangu_moe.py
@@ -45,17 +45,16 @@
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.interfaces import SupportsPP
from vllm.model_executor.models.utils import (
extract_layer_index, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs
from vllm.sequence import IntermediateTensors
from vllm.v1.sample.sampler import Sampler

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
@@ -913,7 +912,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
if self.config.tie_word_embeddings:
self.lm_head.weight = self.model.embed_tokens.weight
self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = get_sampler()
self.sampler = Sampler()
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)

@@ -935,19 +934,19 @@ def forward(
return hidden_states

def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
self,
hidden_states: torch.Tensor,
sampling_metadata, # type: ignore
) -> Optional[torch.Tensor]:
logits = self.logits_processor(self.lm_head, hidden_states,
sampling_metadata)
return logits

def sample(
self,
logits: Optional[torch.Tensor],
sampling_metadata: SamplingMetadata,
) -> Optional[SamplerOutput]:
self,
logits: Optional[torch.Tensor],
sampling_metadata, # type: ignore
):
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
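The change above swaps the removed vllm.model_executor.layers.sampler.get_sampler() helper for direct construction of the V1 sampler; a minimal usage sketch, assuming the V1 Sampler keeps the callable interface used in sample():

from vllm.v1.sample.sampler import Sampler

sampler = Sampler()
# next_tokens = sampler(logits, sampling_metadata)  # same call site as before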
