From db725cdd16d5b8a52427c26fb7ef19273ca5f7d4 Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Wed, 17 Sep 2025 03:06:06 +0000 Subject: [PATCH] [Bugfix] ngram spec decode attention error and repeat add sampled token ids Signed-off-by: Icey <1790571317@qq.com> --- vllm_ascend/spec_decode/ngram_proposer.py | 8 ++------ vllm_ascend/worker/model_runner_v1.py | 2 ++ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm_ascend/spec_decode/ngram_proposer.py b/vllm_ascend/spec_decode/ngram_proposer.py index 9999f1f36d..4b162fb7b4 100644 --- a/vllm_ascend/spec_decode/ngram_proposer.py +++ b/vllm_ascend/spec_decode/ngram_proposer.py @@ -51,13 +51,9 @@ def generate_token_ids(self, draft_token_ids.append([]) continue - # Add sampled_token_ids to token_ids_cpu. - start_idx = self.runner.input_batch.num_tokens_no_spec[i] - end_idx = start_idx + num_sampled_ids - self.runner.input_batch.token_ids_cpu[ - i, start_idx:end_idx] = sampled_ids + num_tokens = self.runner.input_batch.num_tokens_no_spec[i] drafter_output = self.propose( - self.runner.input_batch.token_ids_cpu[i, :end_idx]) + self.runner.input_batch.token_ids_cpu[i, :num_tokens]) if drafter_output is None or len(drafter_output) == 0: draft_token_ids.append([]) else: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index a409bd3e01..0d9f2d71c8 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1440,6 +1440,8 @@ def _build_attn_state(self, num_reqs, num_scheduled_tokens, if self.drafter and (self.drafter.name == SpecDcodeType.EAGLE or self.drafter.name == SpecDcodeType.EAGLE3): attn_state = AscendAttentionState.ChunkedPrefill + elif self.drafter and self.drafter.name == SpecDcodeType.NGRAM: + attn_state = AscendAttentionState.DecodeOnly else: attn_state = AscendAttentionState.SpecDecoding # splitfuse