2 changes: 1 addition & 1 deletion .github/workflows/format_pr_body.yaml
@@ -36,7 +36,7 @@ jobs:

      - name: Get vLLM version
        run: |
-          VLLM_COMMIT=c60e6137f0bf2034853919b3a9d705d7e06b93cf
+          VLLM_COMMIT=5aeb9254521023f97aca292b3478aa7ff485ffb2
          echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

      - name: Checkout repository
4 changes: 2 additions & 2 deletions .github/workflows/vllm_ascend_test.yaml
@@ -81,7 +81,7 @@ jobs:
      VLLM_USE_MODELSCOPE: True
    strategy:
      matrix:
-        vllm_version: [c60e6137f0bf2034853919b3a9d705d7e06b93cf, v0.10.2]
+        vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
    steps:
      - name: Install packages
        run: |
@@ -136,7 +136,7 @@ jobs:
    name: e2e-light
    strategy:
      matrix:
-        vllm_version: [c60e6137f0bf2034853919b3a9d705d7e06b93cf, v0.10.2]
+        vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
    # Note (yikun): If CI resource are limited we can split job into two chain jobs
    needs: [lint, changes]
    # only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_full.yaml
@@ -68,7 +68,7 @@ jobs:
    name: e2e-full
    strategy:
      matrix:
-        vllm_version: [c60e6137f0bf2034853919b3a9d705d7e06b93cf, v0.10.2]
+        vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
    needs: [changes]
    if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
    uses: ./.github/workflows/_e2e_test.yaml
4 changes: 1 addition & 3 deletions docs/source/developer_guide/modeling/adding_a_new_model.md
@@ -61,7 +61,6 @@ from torch import nn
from vllm.attention import Attention
from vllm.config import VllmConfig
from vllm.sequence import IntermediateTensors
-from vllm.model_executor.sampling_metadata import SamplingMetadata

class CustomAttention(nn.Module):
    def __init__(self, vllm_config: VllmConfig, prefix: str):
@@ -123,8 +122,7 @@ class CustomModelForCausalLM(nn.Module):
        ...

    def compute_logits(self,
-                      hidden_states: torch.Tensor,
-                      sampling_metadata: SamplingMetadata) -> torch.Tensor:
+                      hidden_states: torch.Tensor) -> torch.Tensor:
        ...

    def load_weights(self,
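For reference, the developer-guide excerpt above now documents the single-argument interface. A minimal sketch of how a custom model fills in the method body (not part of the diff; the `logits_processor` and `lm_head` attribute names are assumptions, following the other models touched in this PR):

    def compute_logits(self,
                       hidden_states: torch.Tensor) -> torch.Tensor:
        # Logits are computed from the hidden states alone; sampling
        # metadata is no longer threaded through this call.
        return self.logits_processor(self.lm_head, hidden_states)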
5 changes: 1 addition & 4 deletions vllm_ascend/models/deepseek_mtp.py
@@ -36,7 +36,6 @@
    SharedHead)
from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer
from vllm.model_executor.models.utils import maybe_prefix
-from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors


@@ -168,14 +167,12 @@ def forward(
    def compute_logits(
        self,
        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
        current_step_idx = (spec_step_idx % self.num_mtp_layers)
        mtp_layer = self.layers_list[current_step_idx]
        logits = self.logits_processor(mtp_layer.shared_head.head,
-                                       mtp_layer.shared_head(hidden_states),
-                                       sampling_metadata)
+                                       mtp_layer.shared_head(hidden_states))
        return logits

5 changes: 1 addition & 4 deletions vllm_ascend/models/qwen3_next.py
@@ -54,7 +54,6 @@
    AutoWeightsLoader, PPMissingLayer, extract_layer_index,
    is_pp_missing_parameter, make_empty_intermediate_tensors_factory,
    make_layers, maybe_prefix)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
@@ -1308,10 +1307,8 @@ def get_mamba_state_shape_from_config(
    def compute_logits(
        self,
        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
-        return self.logits_processor(self.lm_head, hidden_states,
-                                     sampling_metadata)
+        return self.logits_processor(self.lm_head, hidden_states)

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
2 changes: 1 addition & 1 deletion vllm_ascend/spec_decode/eagle_proposer.py
@@ -471,7 +471,7 @@ def _propose(
            hidden_states=self.hidden_states[:num_input_tokens],
        )
        sample_hidden_states = last_hidden_states[last_token_indices]
-        logits = self.model.compute_logits(sample_hidden_states, None)
+        logits = self.model.compute_logits(sample_hidden_states)
        draft_token_ids = logits.argmax(dim=-1)

        # Early exit if there is only one draft token to be generated.
5 changes: 1 addition & 4 deletions vllm_ascend/torchair/models/qwen2.py
@@ -40,7 +40,6 @@
from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Model
from vllm.model_executor.models.utils import (AutoWeightsLoader,
                                              PPMissingLayer, maybe_prefix)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors

from vllm_ascend.ascend_config import get_ascend_config
@@ -345,10 +344,8 @@ def forward(
    def compute_logits(
        self,
        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
        return logits

    def load_weights(self, weights: Iterable[tuple[str,
5 changes: 1 addition & 4 deletions vllm_ascend/torchair/models/torchair_deepseek_mtp.py
@@ -34,7 +34,6 @@
    DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer,
    SharedHead)
from vllm.model_executor.models.utils import maybe_prefix
-from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors

from vllm_ascend.torchair.models.torchair_deepseek_v2 import \
@@ -172,14 +171,12 @@ def forward(
    def compute_logits(
        self,
        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
        current_step_idx = (spec_step_idx % self.num_mtp_layers)
        mtp_layer = self.layers_list[current_step_idx]
        logits = self.logits_processor(mtp_layer.shared_head.head,
-                                       mtp_layer.shared_head(hidden_states),
-                                       sampling_metadata)
+                                       mtp_layer.shared_head(hidden_states))
        return logits

15 changes: 8 additions & 7 deletions vllm_ascend/torchair/models/torchair_pangu_moe.py
@@ -53,7 +53,6 @@
from vllm.model_executor.models.utils import (
    extract_layer_index, is_pp_missing_parameter,
    make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs
from vllm.sequence import IntermediateTensors

@@ -937,18 +936,20 @@ def forward(
    def compute_logits(
        self,
        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
+        # For version compatibility, we still keep this argument here.
+        sampling_metadata  # type: ignore
    ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
-        return logits
+        from vllm_ascend.utils import vllm_version_is
+        if vllm_version_is("0.10.2"):
+            return self.logits_processor(self.lm_head, hidden_states,
+                                         sampling_metadata)
+        return self.logits_processor(self.lm_head, hidden_states)

    def sample(
        self,
        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
+        next_tokens = self.sampler(logits)
        return next_tokens

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
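The PanGu MoE model above is the one place that keeps `sampling_metadata` in its signature, gating on the installed vLLM version instead of dropping the argument outright. A minimal sketch of that pattern (the `None` default is assumed here for illustration; the diff itself leaves the parameter bare):

    from vllm_ascend.utils import vllm_version_is

    def compute_logits(self, hidden_states, sampling_metadata=None):
        # vLLM v0.10.2 still routes sampling_metadata into the logits
        # processor; newer vLLM drops the argument entirely.
        if vllm_version_is("0.10.2"):
            return self.logits_processor(self.lm_head, hidden_states,
                                         sampling_metadata)
        return self.logits_processor(self.lm_head, hidden_states)

Call sites built against vLLM v0.10.2 keep passing the metadata, while call sites on newer vLLM omit it.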
8 changes: 4 additions & 4 deletions vllm_ascend/worker/model_runner_v1.py
@@ -1942,7 +1942,7 @@ def execute_model(
                num_scheduled_tokens_np, finished_sending,
                finished_recving, kv_connector_output)
        sample_hidden_states = hidden_states[logits_indices]
-        logits = self.model.compute_logits(sample_hidden_states, None)
+        logits = self.model.compute_logits(sample_hidden_states)
        if broadcast_pp_output:
            model_output_broadcast_data = {
                "logits": logits.contiguous(),
@@ -2349,7 +2349,7 @@ def _dummy_run(

        def dummy_compute_logits(hidden_states):
            return self.model.compute_logits(
-                hidden_states[dummy_indices], None)
+                hidden_states[dummy_indices])

        with set_ascend_forward_context(
                attn_metadata,
@@ -2418,7 +2418,7 @@ def profile_run(self) -> None:
        logit_indices = np.cumsum(num_scheduled_tokens) - 1
        # TODO: need to rum a dummy sampler for generate task
        hidden_states = hidden_states[logit_indices]
-        output = self.model.compute_logits(hidden_states, None)
+        output = self.model.compute_logits(hidden_states)

        NPUPlatform.synchronize()
        del hidden_states, output
@@ -3241,7 +3241,7 @@ def _get_prompt_logprobs_dict(
        req_idx = self.input_batch.req_id_to_index[req_id]
        offset = self.query_start_loc_np[req_idx].item()
        prompt_hidden_states = hidden_states[offset:offset + num_logits]
-        logits = self.model.compute_logits(prompt_hidden_states, None)
+        logits = self.model.compute_logits(prompt_hidden_states)

        # Get the "target" tokens for each index. For prompt at index i,
        # the token at prompt index i+1 is the "sampled" token we want