From ed2cc0ccff4a3710b3bffb219f43a71599ffcc87 Mon Sep 17 00:00:00 2001
From: AlvisGong
Date: Mon, 15 Sep 2025 14:38:39 +0800
Subject: [PATCH 1/2] apply_top_k_top_p_tpu

Signed-off-by: AlvisGong
---
 vllm_ascend/sample/sampler.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py
index e009e4cd56..3449eaebe4 100644
--- a/vllm_ascend/sample/sampler.py
+++ b/vllm_ascend/sample/sampler.py
@@ -1,7 +1,7 @@
 import torch
 import torch_npu
 from vllm.config import LogprobsMode
-from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample
+from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample, apply_top_k_top_p_tpu
 from vllm.v1.sample.sampler import Sampler
 
 from vllm_ascend.utils import is_310p
@@ -62,13 +62,7 @@ def _apply_top_k_top_p(
         return logits
 
     def forward_native(self, logits, generators, k, p):
-        """Override pytorch native implementation to torch_npu"""
-        logits = self._apply_top_k_top_p(logits, k, p)
-        logits_to_return = None
-        if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
-            logits_to_return = logits
-        elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
-            logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)
-
+        logits = apply_top_k_top_p_tpu(logits, k, p)
         probs = logits.softmax(dim=-1, dtype=torch.float32)
-        return random_sample(probs, generators), logits_to_return
+        return random_sample(probs, generators)
+

From 98d5c434d4a1253540c1a6ca3ea61b3bca3f4660 Mon Sep 17 00:00:00 2001
From: AlvisGong
Date: Mon, 15 Sep 2025 15:08:15 +0800
Subject: [PATCH 2/2] apply_top_k_top_p_tpu

Signed-off-by: AlvisGong
---
 vllm_ascend/sample/sampler.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py
index 3449eaebe4..be9eadf2a4 100644
--- a/vllm_ascend/sample/sampler.py
+++ b/vllm_ascend/sample/sampler.py
@@ -62,7 +62,13 @@ def _apply_top_k_top_p(
         return logits
 
     def forward_native(self, logits, generators, k, p):
+        """Override pytorch native implementation to torch_npu"""
         logits = apply_top_k_top_p_tpu(logits, k, p)
-        probs = logits.softmax(dim=-1, dtype=torch.float32)
-        return random_sample(probs, generators)
+        logits_to_return = None
+        if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
+            logits_to_return = logits
+        elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
+            logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)
+        probs = logits.softmax(dim=-1, dtype=torch.float32)
+        return random_sample(probs, generators), logits_to_return
 
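
Taken together, the two patches swap the NPU sampler's private _apply_top_k_top_p
for the apply_top_k_top_p_tpu helper imported from vLLM, and patch 2 restores the
docstring, the LogprobsMode bookkeeping, and the (sampled tokens, logits_to_return)
tuple that patch 1 had dropped from forward_native. For readers unfamiliar with the
imported helper, below is a minimal sketch of the usual sort-and-mask formulation of
top-k/top-p filtering; the function name and exact masking details here are
illustrative assumptions, not the vLLM source.

from typing import Optional

import torch


def apply_top_k_top_p_sketch(logits: torch.Tensor,
                             k: Optional[torch.Tensor],
                             p: Optional[torch.Tensor]) -> torch.Tensor:
    """Mask logits outside each row's top-k / top-p set with -inf.

    logits: [num_seqs, vocab]; k: [num_seqs] int; p: [num_seqs] in (0, 1].
    """
    if k is None and p is None:
        return logits
    # Sort ascending so the largest logits sit at the end of each row.
    logits_sort, logits_idx = logits.sort(dim=-1, descending=False)

    if k is not None:
        # Index of the k-th largest value in ascending order, per row.
        top_k_index = (logits_sort.size(1) - k.to(torch.long)).unsqueeze(1)
        top_k_value = logits_sort.gather(1, top_k_index)
        # Drop everything strictly below the k-th largest value.
        logits_sort.masked_fill_(logits_sort < top_k_value, float("-inf"))

    if p is not None:
        probs_sort = logits_sort.softmax(dim=-1)
        probs_sum = probs_sort.cumsum(dim=-1)
        # Drop the low-probability prefix whose total mass fits in 1 - p,
        # keeping a suffix with cumulative probability >= p.
        top_p_mask = probs_sum <= 1 - p.unsqueeze(1)
        top_p_mask[:, -1] = False  # always keep the most likely token
        logits_sort.masked_fill_(top_p_mask, float("-inf"))

    # Scatter the filtered values back into the original vocab order.
    return torch.empty_like(logits_sort).scatter_(1, logits_idx, logits_sort)

As in the patched forward_native, the filtered logits are then softmaxed in
float32 before sampling, e.g.:

    filtered = apply_top_k_top_p_sketch(torch.randn(2, 32000),
                                        k=torch.tensor([50, 100]),
                                        p=torch.tensor([0.9, 1.0]))
    probs = filtered.softmax(dim=-1, dtype=torch.float32)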