
Commit 5c335b1

fix test_min_sampling

1 parent 5b9ffc5 commit 5c335b1

File tree

6 files changed: +119 −112 lines changed

docs/zh/offline_inference.md

Lines changed: 1 addition & 1 deletion
@@ -180,7 +180,7 @@ for output in outputs:
 * temperature(float): controls the randomness of generation; higher values give more random results, lower values more deterministic ones
 * top_p(float): cumulative probability truncation threshold; only the most likely tokens whose cumulative probability reaches this threshold are considered
 * top_k(int): number of highest-probability tokens to sample from; the k most likely tokens are considered
-* min_p(float): minimum probability threshold for a token to be considered (as a ratio of the highest-probability token's probability,set it >0 to filter low-probability tokens and improve generation quality)
+* min_p(float): minimum probability threshold for a token to be considered (as a ratio of the highest-probability token's probability; set it >0 to filter low-probability tokens and improve generation quality)
 * max_tokens(int): limits the maximum number of tokens the model generates (including input and output)
 * min_tokens(int): forces the model to generate at least this many tokens, avoiding ending too early
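
For context, these parameters are passed together at request time. A minimal offline-inference sketch (the `LLM` entry point, model path, and output attribute access are illustrative assumptions, not part of this commit):

    from fastdeploy import LLM, SamplingParams

    # min_p=0.1 keeps only tokens whose probability is at least 10% of the
    # most likely token's probability at each decoding step.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, min_p=0.1, max_tokens=128)

    llm = LLM(model="./models/ERNIE-4.5-0.3B")  # hypothetical local model path
    outputs = llm.generate(["Hello, my name is"], sampling_params)
    for output in outputs:
        print(output.outputs.text)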

fastdeploy/engine/sampling_params.py

Lines changed: 3 additions & 3 deletions
@@ -53,7 +53,7 @@ class SamplingParams:
     top_p: Float that controls the cumulative probability of the top tokens
         to consider. Must be in [0, 1]. Set to 1 to consider all tokens.
     top_k: Int that controls the number of top tokens to consider. Must be a positive integer.
-    min_p:Float that represents the minimum probability for a token to be
+    min_p: Float that represents the minimum probability for a token to be
         considered, relative to the probability of the most likely token.
         Must be in [0, 1]. Set to 0 to disable this.
     seed: Random seed to use for the generation.
@@ -87,7 +87,7 @@ class SamplingParams:
     temperature: float = None
     top_p: float = None
     top_k: int = 0
-    min_p: float=0.0
+    min_p: float = 0.0
     seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = None
     stop_token_ids: Optional[Union[List[List[int]], List[int]]] = None
@@ -186,7 +186,7 @@ def _verify_args(self) -> None:
        if not isinstance(self.top_k, int):
            raise TypeError(
                f"top_k must be an integer, got {type(self.top_k).__name__}")
-        if not 0.0 <=self.min_p <= 1.0:
+        if not 0.0 <= self.min_p <= 1.0:
            raise ValueError(f"min_p must be in [0,1], got {self.min_p}")

        if self.max_tokens is not None and self.max_tokens < 1:
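
Assuming validation runs when the params object is constructed (as the `_verify_args` name suggests), the new range check behaves like this sketch:

    from fastdeploy.engine.sampling_params import SamplingParams

    params = SamplingParams(temperature=0.8, top_p=0.9, min_p=0.2)  # accepted: 0.2 is in [0, 1]

    try:
        SamplingParams(temperature=0.8, top_p=0.9, min_p=1.5)  # rejected: out of range
    except ValueError as err:
        print(err)  # min_p must be in [0,1], got 1.5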

fastdeploy/model_executor/layers/sample/sampler.py

Lines changed: 1 addition & 1 deletion
@@ -269,7 +269,7 @@ def forward_cuda(

        probs = F.softmax(logits)

-        probs= min_p_sampling(probs,sampling_metadata.min_p)
+        probs = min_p_sampling(probs, sampling_metadata.min_p)

        _, next_tokens = top_k_top_p_sampling(probs, sampling_metadata.top_p, sampling_metadata.top_k)
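
Conceptually, `min_p_sampling` rescales the cutoff by each row's top probability before the top-k/top-p stage. A CPU sketch of the same filter (function name illustrative; the logic mirrors `min_p_sampling_cpu` in the test below):

    import paddle

    def min_p_filter(probs: paddle.Tensor, min_p: paddle.Tensor) -> paddle.Tensor:
        # Per-row threshold: min_p * max(probs); tokens below it are zeroed out.
        top = paddle.amax(probs, axis=-1, keepdim=True)
        threshold = top * min_p.reshape([-1, 1])
        return paddle.where(probs < threshold, paddle.zeros_like(probs), probs)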

fastdeploy/worker/gpu_model_runner.py

Lines changed: 1 addition & 1 deletion
@@ -337,7 +337,7 @@ def get_attr_from_request(request, attr, default_value=None):
            request.eos_token_ids, dtype="int64").reshape(-1, 1)
        self.share_inputs["top_p"][idx:idx + 1] = get_attr_from_request(request, "top_p", 0.7)
        self.share_inputs["top_k"][idx:idx + 1] = request.get("top_k", 0)
-        self.share_inputs["min_p"][idx:idx + 1] = request.get("min_p",0.0)
+        self.share_inputs["min_p"][idx:idx + 1] = request.get("min_p", 0.0)

        self.share_inputs["temperature"][idx:idx + 1] = get_attr_from_request(request, "temperature", 0.95)
        self.share_inputs["penalty_score"][idx:idx + 1] = get_attr_from_request(

test/layers/test_min_p.py

Lines changed: 0 additions & 106 deletions
This file was deleted.

test/layers/test_min_sampling.py

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import unittest

import numpy as np
import paddle
import paddle.nn.functional as F

from fastdeploy.model_executor.ops.gpu import min_p_sampling


class TestMinPSampling(unittest.TestCase):

    def setUp(self):
        self.sample_time = 1000000
        self.vocab_size = 1000
        self.min_p_value = 0.5
        self.batch_size = 3
        self.batch_min_p_values = [0.1, 0.0, 0.9]
        self.additional_batch_min_p_values = [0.1, 0.0, 0.3]

    # CPU reference: zero out tokens whose probability is below min_p * max(probs).
    def min_p_sampling_cpu(self, min_p):
        logits = paddle.ones(shape=[1, self.vocab_size], dtype="float32")
        logits[0][0] = 10
        logits[0][1] = 8
        low_prob_tensor = paddle.linspace(2.0, 0.0, self.vocab_size - 2)
        logits[0][2:] = low_prob_tensor

        probs = F.softmax(logits)
        max_probabilities = paddle.amax(probs, axis=-1, keepdim=True)
        adjusted_min_p = max_probabilities * min_p.reshape([-1, 1])
        invalid_token_mask = probs < adjusted_min_p
        probs = paddle.where(invalid_token_mask, paddle.full_like(probs, 0.0), probs)
        return probs

    # Single request (min_p = 0.5) through the FastDeploy GPU op.
    def fastdeploy_min_p_sampling(self, min_p):
        logits = paddle.ones(shape=[1, self.vocab_size], dtype="float32")
        logits[0][0] = 10
        logits[0][1] = 8
        low_prob_tensor = paddle.linspace(2.0, 0.0, self.vocab_size - 2)
        logits[0][2:] = low_prob_tensor

        probs = F.softmax(logits)
        probs = min_p_sampling(probs, min_p)
        return probs

    # Batched requests (e.g. min_p = [0.1, 0.0, 0.9]) through the FastDeploy GPU op.
    def fastdeploy_batch_min_p_sampling(self, batch_size, min_p_values):
        logits = paddle.ones(shape=[batch_size, self.vocab_size], dtype="float32")
        for b in range(batch_size):
            logits[b][0] = 10
            logits[b][1] = 8
            logits[b][2:] = paddle.linspace(2.0, 0.0, self.vocab_size - 2)

        probs = F.softmax(logits, axis=-1)
        min_p_arr = paddle.to_tensor(min_p_values, dtype="float32")

        probs = min_p_sampling(probs, min_p_arr)

        return probs

    def compare_results(self, probs, probs_cpu, atol=1e-6, rtol=1e-6):
        probs_np = probs.numpy()
        probs_cpu_np = probs_cpu.numpy()
        try:
            np.testing.assert_allclose(
                probs_np,
                probs_cpu_np,
                rtol=rtol,
                atol=atol,
            )
            print("The results are the same between fastdeploy_min_p_sampling and min_p_sampling_cpu")
        except AssertionError as e:
            raise AssertionError(
                f"The results differ between fastdeploy_min_p_sampling and min_p_sampling_cpu:\n{str(e)}")

    def test_single_min_p_sampling(self):
        min_p = paddle.to_tensor([self.min_p_value], dtype="float32")
        probs = self.fastdeploy_min_p_sampling(min_p)
        probs_cpu = self.min_p_sampling_cpu(min_p)
        self.compare_results(probs, probs_cpu)

    def test_batch_min_p_sampling(self):
        batch_min_p = paddle.to_tensor(self.batch_min_p_values, dtype="float32")
        batch_probs = self.fastdeploy_batch_min_p_sampling(self.batch_size, batch_min_p)
        batch_probs_cpu = self.min_p_sampling_cpu(batch_min_p)
        self.compare_results(batch_probs, batch_probs_cpu)

    def test_additional_batch_min_p_sampling(self):
        additional_batch_min_p = paddle.to_tensor(self.additional_batch_min_p_values, dtype="float32")
        additional_batch_probs = self.fastdeploy_batch_min_p_sampling(self.batch_size, additional_batch_min_p)
        additional_batch_probs_cpu = self.min_p_sampling_cpu(additional_batch_min_p)
        self.compare_results(additional_batch_probs, additional_batch_probs_cpu)


if __name__ == "__main__":
    if paddle.is_compiled_with_cuda():
        unittest.main()
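
To run the suite locally (standard unittest invocation; it requires a CUDA-enabled Paddle build and the compiled FastDeploy GPU ops, otherwise the `__main__` guard exits without running anything):

    python test/layers/test_min_sampling.py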

0 commit comments