xx

wangxiaoxin (A) · wangxiaoxin (A) · commit c1c7b3274b58 · 2025-05-29T19:19:26.000+08:00
diff --git a/tests/sample/test_sampler.py b/tests/sample/test_sampler.py
@@ -0,0 +1,111 @@
+# Copyright 2023 The vLLM team.
+
+# Copyright (c) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved.
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/main/vllm/tests/kernels/test_rotary_embedding.py
+
+from typing import Optional, Tuple, Union
+
+import pytest
+import torch
+import torch.nn as nn
+
+from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p # noqa: F401
+from vllm.v1.sample.sampler import Sampler  # noqa: F401
+
+# NOTE(review): IS_NEOX_STYLE, DTYPES, HEAD_SIZES, ROTARY_DIMS, NUM_HEADS and
+# SEQ_LENS are not referenced by the sampler code visible in this file; they
+# look like leftovers from the rotary-embedding test this file was adapted
+# from -- confirm before removing.
+# Only Neox style true scenario is supported for now
+IS_NEOX_STYLE = [True]
+DTYPES = [torch.half]
+HEAD_SIZES = [64, 96, 128, 256]
+ROTARY_DIMS = [None, 32]  # None means rotary dim == head size
+NUM_HEADS = [17]  # Arbitrary values for testing
+BATCH_SIZES = [5]  # Arbitrary values for testing
+SEQ_LENS = [11, 4096]  # Arbitrary values for testing
+SEEDS = [0]
+DEVICES = [f"npu:{0}"]
+# Absolute / relative tolerances passed to torch.testing.assert_close when
+# comparing the outputs of the two implementations.
+DEFAULT_ATOL = 1e-3
+DEFAULT_RTOL = 1e-3
+
+
+def apply_min_p_new(
+    logits: torch.Tensor,
+    min_p: torch.Tensor,
+) -> torch.Tensor:
+    """Filter logits using adaptive (min-p) probability thresholding.
+
+    A token is masked to ``-inf`` when its softmax probability is smaller
+    than ``min_p`` times the largest probability along the last dimension.
+
+    Args:
+        logits: Logits; softmax is taken over the last dimension.
+        min_p: Min-p values. They are unsqueezed on dim 1 so that they
+            broadcast against the per-row maximum probability.
+
+    Returns:
+        ``logits`` itself when every ``min_p`` entry is zero, otherwise a new
+        tensor with the filtered positions replaced by ``-inf``.
+    """
+    # A zero threshold can never mask a token, so skip the work entirely.
+    # Reduce with ``all`` so that a ``min_p`` holding more than one element
+    # does not raise "Boolean value of Tensor with more than one value is
+    # ambiguous".
+    if torch.all(min_p == 0):
+        return logits
+    # Convert logits to probability distribution
+    probability_values = torch.nn.functional.softmax(logits, dim=-1)
+    # Calculate maximum probabilities per sequence
+    max_probabilities = torch.amax(probability_values,
+                                    dim=-1,
+                                    keepdim=True)
+    # Reshape min_p for broadcasting
+    adjusted_min_p = min_p.unsqueeze(1) * max_probabilities
+    # Mask every token whose probability falls below the scaled threshold.
+    # masked_fill is the out-of-place variant, so the input is not modified.
+    logits = logits.masked_fill(probability_values < adjusted_min_p,
+                                -float('inf'))
+    return logits
+
+def apply_top_k_top_p_new(
+    logits: torch.Tensor,
+    k: Optional[torch.Tensor],
+    p: Optional[torch.Tensor],
+) -> torch.Tensor:
+    """Apply top-k and/or top-p filtering to a 2-D tensor of logits.
+
+    Each row is sorted in ascending order. Top-k masks everything strictly
+    below the k-th largest value; top-p then keeps the sorted positions whose
+    cumulative probability exceeds ``1 - p``. The last sorted position (the
+    largest logit) is always kept. Kept logits are copied into a tensor
+    otherwise filled with ``-inf``.
+
+    Args:
+        logits: Tensor of shape ``(batch_size, vocab_size)``.
+        k: Per-row top-k values, or ``None`` to skip top-k.
+        p: Per-row top-p values, or ``None`` to skip top-p.
+
+    Returns:
+        ``logits`` itself when both ``k`` and ``p`` are ``None``; otherwise a
+        new ``(batch_size, vocab_size)`` tensor with filtered positions set
+        to ``-inf``.
+    """
+    if k is None and p is None:
+        # Nothing to filter.
+        return logits
+
+    batch_size, vocab_size = logits.shape
+    logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
+
+    # Apply top-k.
+    if k is not None:
+        # In an ascending sort the k-th largest value sits at index
+        # (vocab_size - k). gather needs int64 indices, hence the cast.
+        boundary_idx = (vocab_size - k.to(torch.long)).unsqueeze(dim=1)
+        boundary = logits_sort.gather(1, boundary_idx)
+        top_k_mask = logits_sort < boundary
+        logits_sort.masked_fill_(top_k_mask, -float("inf"))
+        # Leading sorted columns that top-k removed in every row; they are
+        # skipped in all the work below.
+        cutoff = top_k_mask.sum(dim=-1).min()
+    else:
+        top_k_mask = None
+        cutoff = 0
+
+    # Apply top-p.
+    if p is not None:
+        probs_sort = logits_sort.softmax(dim=-1)[:, cutoff:]
+        probs_sum = probs_sort.cumsum(dim=-1)
+        keep_mask = probs_sum > 1 - p.unsqueeze(dim=1)
+    else:
+        # p is None, so k was given (the both-None case returned above):
+        # keep exactly what survived top-k.
+        keep_mask = ~top_k_mask[:, cutoff:]
+
+    # Always keep the largest logit of every row.
+    keep_mask[:, -1] = True
+
+    # Turn per-row vocabulary indices into indices of the flattened logits.
+    strides = torch.arange(0,
+                           batch_size * vocab_size,
+                           vocab_size,
+                           device=logits.device)
+    flatten_idx = logits_idx[:, cutoff:] + strides.unsqueeze(dim=1)
+    valid_idx = torch.masked_select(flatten_idx, keep_mask)
+    logits_flatten = logits.flatten()
+    valid_logits = torch.index_select(logits_flatten, 0, valid_idx)
+    filtered = torch.empty_like(logits_flatten).fill_(-float("inf"))
+    filtered[valid_idx] = valid_logits
+    return filtered.reshape(batch_size, vocab_size)
+
+# Compare apply_min_p_new against the min-p filter of vLLM's Sampler.
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", DEVICES)
+@torch.inference_mode()
+def test_apply_min_p(
+    batch_size: int,
+    seed: int,
+    device: str,
+) -> None:
+    """Check that apply_min_p_new matches Sampler.apply_min_p.
+
+    Random logits are filtered by both implementations with the same min-p
+    value and the results are compared within DEFAULT_ATOL / DEFAULT_RTOL.
+    """
+    torch.manual_seed(seed)
+    vocab_size = 1024  # Arbitrary value for testing
+    logits = torch.randn(batch_size, vocab_size, device=device)
+    # One min-p value shared by every row (broadcast over the batch).
+    min_p = torch.tensor([0.1], device=device)
+    logits_new = apply_min_p_new(logits.clone(), min_p)
+    # NOTE(review): assumes Sampler() can be built without arguments and that
+    # apply_min_p is an instance method taking (logits, min_p) -- confirm
+    # against the installed vLLM. The clone guards against the reference
+    # filtering its input in place.
+    logits_old = Sampler().apply_min_p(logits.clone(), min_p)
+    # Compare the results.
+    torch.testing.assert_close(logits_new,
+                               logits_old,
+                               atol=DEFAULT_ATOL,
+                               rtol=DEFAULT_RTOL)
+
+# Compare apply_top_k_top_p_new against vLLM's apply_top_k_top_p.
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", DEVICES)
+@torch.inference_mode()
+def test_apply_top_k_top_p(
+    batch_size: int,
+    seed: int,
+    device: str,
+) -> None:
+    """Check that apply_top_k_top_p_new matches apply_top_k_top_p.
+
+    Random logits are filtered by both implementations with the same per-row
+    k and p and the results are compared within DEFAULT_ATOL / DEFAULT_RTOL.
+    """
+    torch.manual_seed(seed)
+    vocab_size = 1024  # Arbitrary value for testing
+    logits = torch.randn(batch_size, vocab_size, device=device)
+    # Per-row k in [1, vocab_size] and p in [0, 1).
+    k = torch.randint(1, vocab_size + 1, (batch_size, ), device=device)
+    p = torch.rand(batch_size, device=device)
+    logits_new = apply_top_k_top_p_new(logits, k, p)
+    logits_old = apply_top_k_top_p(logits, k, p)
+    # Compare the results.
+    torch.testing.assert_close(logits_new,
+                               logits_old,
+                               atol=DEFAULT_ATOL,
+                               rtol=DEFAULT_RTOL)
diff --git a/vllm_ascend/ops/__init__.py b/vllm_ascend/ops/__init__.py
@@ -23,7 +23,6 @@
 import vllm_ascend.ops.layernorm  # noqa
 import vllm_ascend.ops.rotary_embedding  # noqa
 import vllm_ascend.ops.vocab_parallel_embedding  # noqa
-import vllm_ascend.ops.utils  # noqa
 
 
 class dummyFusionOp:
diff --git a/vllm_ascend/patch/worker/patch_common/__init__.py b/vllm_ascend/patch/worker/patch_common/__init__.py
@@ -24,3 +24,4 @@
 import vllm_ascend.patch.worker.patch_common.patch_minicpm  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_multi_step_worker  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_spec_decode_worker  # noqa
+import vllm_ascend.patch.worker.patch_common.patch_sampler  # noqa
diff --git a/vllm_ascend/patch/worker/patch_common/patch_sampler.py b/vllm_ascend/patch/worker/patch_common/patch_sampler.py
@@ -1,6 +1,22 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 
 import torch
-import vllm.v1.sample.sampler as s1
+from vllm.v1.sample.sampler import Sampler
 from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample
 from vllm import envs
 from typing import Callable, Optional
@@ -73,6 +89,6 @@ def topk_topp_forward_native(
     probs = logits.softmax(dim=-1, dtype=torch.float32)
     return random_sample(probs, generators)
 
-s1.apply_min_p = apply_min_p
+# Monkey-patch: rebind Sampler.apply_min_p to the apply_min_p name that is in
+# scope in this module, so min-p filtering goes through that implementation.
+Sampler.apply_min_p = apply_min_p
 if envs.VLLM_ENABLE_TOPK_OPTIMZE:
+    # Only when envs.VLLM_ENABLE_TOPK_OPTIMZE is truthy: replace
+    # TopKTopPSampler.forward_native with topk_topp_forward_native.
     TopKTopPSampler.forward_native = topk_topp_forward_native