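Rewrite min_p unit test as a comparison against a CPU reference; reshape the min_p threshold in the non-CUDA path; remove the unused Sampler.step counter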

lizexu123 · lizexu123 · commit 5b9ffc522ac6 · 2025-07-18T07:43:53.000Z
diff --git a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
@@ -163,7 +163,7 @@ def min_p_sampling(
     """
     min_p_sampling
     """
-    if paddle.count_nonzero(min_p_arr)==0:
+    if paddle.count_nonzero(min_p_arr) == 0:
         return probs
     else:
         if current_platform.is_cuda():
@@ -172,6 +172,6 @@ def min_p_sampling(
         else:
             max_probabilities = paddle.amax(probs,axis=-1,keepdim=True)
             adjusted_min_p = max_probabilities * min_p_arr
-            invalid_token_mask = probs < adjusted_min_p
+            invalid_token_mask = probs < adjusted_min_p.reshape([-1, 1])
             probs= paddle.where(invalid_token_mask,paddle.full_like(probs,0.0),probs)
         return probs
diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -176,7 +176,6 @@ def __init__(self):
             self.forward = self.forward_cuda
         else:
             raise NotImplementedError()
-        self.step=0
 
         self.processor = SamplerProcessor()
 
@@ -286,7 +285,7 @@ def forward_cuda(
             sampled_token_ids=next_tokens,
             logprobs_tensors=logprobs_tensors,
         )
-        self.step+=1
+
         return sampler_output
 
 
diff --git a/test/layers/test_min_p.py b/test/layers/test_min_p.py
@@ -13,117 +13,48 @@
 # limitations under the License.
 
 
-import matplotlib.pyplot as plt
 import numpy as np
 import paddle
 import paddle.nn.functional as F
-from tqdm import tqdm
 
 from fastdeploy.model_executor.ops.gpu import min_p_sampling
 
 sample_time = 1000000
 vocab_size = 1000
 min_p_value = 0.5
 batch_size = 3
-batch_min_p_values = [0.1, 0.5, 0.9]
-batch_min_p_values2=[0,3,0,0,0.4]
-
-
-def compress(data):
-    new_data = np.array([0, 0, 0], dtype=float)
-    new_data[0] = data[0]
-    new_data[1] = data[1]
-    new_data[2] = np.sum(data[2:])
-    return new_data
-
-
-def plot_bar_chart(data1, data2, data3, title, request_idx=None):
-    plt.figure(figsize=(6, 6))
-    bar_width = 0.2
-    idx = np.arange(len(data1)).astype(float)
-
-    bars1 = plt.bar(idx - bar_width, data1, width=bar_width, color='salmon', label='Original Probability', alpha=0.9)
-    bars2 = plt.bar(idx, data2, width=bar_width, color='skyblue', label='Sampled Probability', alpha=0.9)
-    bars3 = plt.bar(idx + bar_width, data3, width=bar_width, color='orange', label='Normalized Original Probability', alpha=0.9)
-
-    plt.bar_label(bars1, label_type='edge', padding=3, fmt='%.3f', fontsize=5, color='black')
-    plt.bar_label(bars2, label_type='edge', padding=3, fmt='%.3f', fontsize=5, color='red')
-    plt.bar_label(bars3, label_type='edge', padding=3, fmt='%.3f', fontsize=5, color='blue')
-
-    full_title = title if request_idx is None else f"{title} (min_p={batch_min_p_values[request_idx]})"
-    plt.title(full_title, fontsize=14)
-    plt.xlabel("Index", fontsize=12)
-    plt.ylabel("Probability", fontsize=12)
-    plt.ylim(0, 1.1)
-    plt.xlim(-1, 3)
-    plt.xticks(range(0, 3, 1))
-    plt.legend(fontsize=10)
-    plt.grid(axis='y', linestyle='--', alpha=0.5)
-    output_path = f"{title.replace(' ', '_')}{'' if request_idx is None else f'_req{request_idx}'}.png"
-    plt.savefig(output_path, dpi=300, bbox_inches='tight')
-    plt.clf()
-
-def plot_low_prob_curve(low_prob_token_probs, sample_time, title, request_idx=None):
-    plt.figure(figsize=(6, 6))
-    plt.plot(np.arange(0, sample_time), low_prob_token_probs, marker='', linestyle='-', linewidth=1, color='blue')
-    plt.xlabel('Sample Times')
-    plt.ylabel('Probability')
-    full_title = 'Probability of Low-Probability Tokens' if request_idx is None else f"Low-Probability Tokens (min_p={batch_min_p_values[request_idx]})"
-    plt.title(full_title)
-    plt.grid(alpha=0.3)
-    output_path = f"{title.replace(' ', '_')}_low_prob{'' if request_idx is None else f'_req{request_idx}'}.png"
-    plt.savefig(output_path, dpi=300, bbox_inches='tight')
-    plt.clf()
+batch_min_p_values = [0.1, 0.0, 0.9]
+
 
 # min_p:0.5：FastDeploy
-def fastdeploy_min_p_sampling():
+def min_p_sampling_cpu(min_p):
     logits = paddle.ones(shape=[1, vocab_size], dtype="float32")
     logits[0][0] = 10
     logits[0][1] = 8
     low_prob_tensor = paddle.linspace(2.0, 0.0, vocab_size - 2)
     logits[0][2:] = low_prob_tensor
 
-    probs = F.softmax(logits)
-    min_p = paddle.to_tensor([min_p_value], dtype="float32")
-
-    max_prob = probs.max().item()
-    threshold = max_prob * min_p.item()
-    allowed_tokens = paddle.where(probs[0] >= threshold)[0].numpy()
-
-    sample_freq = [0] * vocab_size
-    low_prob_token_times = 0
-    low_prob_token_probs = []
-
-    for i in tqdm(range(sample_time), desc="FastDeploy Sampling"):
-        ids = min_p_sampling(probs, min_p, seed=-1)
-        sample_freq[ids.item()] += 1
-        if ids.item() >= 2:
-            low_prob_token_times += 1
-        low_prob_token_probs.append(low_prob_token_times / (i + 1))
-
-    sample_freq = np.array(sample_freq, dtype=float) / sample_time
-    low_prob_token_probs = np.array(low_prob_token_probs, dtype=float)
+    probs=F.softmax(logits)
+    max_probabilities = paddle.amax(probs, axis=-1, keepdim=True)
+    adjusted_min_p = max_probabilities * min_p.reshape([-1, 1])
+    invalid_token_mask = probs < adjusted_min_p
+    probs = paddle.where(invalid_token_mask,paddle.full_like(probs,0.0), probs)
+    return probs
 
-    ori_data1 = probs.numpy().reshape(-1)
-    data1 = compress(ori_data1)
-    data2 = compress(sample_freq)
-
-    allowed_probs = probs[0, allowed_tokens].numpy()
-    norm_scale = np.sum(allowed_probs)
-    data3 = np.zeros_like(data1)
-    for idx in allowed_tokens:
-        if idx < 2:
-            data3[idx] = ori_data1[idx] / norm_scale
-        else:
-            data3[2] += ori_data1[idx] / norm_scale
-
-    plot_bar_chart(data1, data2, data3, "FastDeploy[min_p_sampling]")
-    plot_low_prob_curve(low_prob_token_probs, sample_time, "FastDeploy[min_p_sampling]")
+# Single request (main passes min_p=0.5): FastDeploy min_p_sampling op
+def fastdeploy_min_p_sampling(min_p):
+    logits = paddle.ones(shape=[1, vocab_size], dtype="float32")
+    logits[0][0] = 10
+    logits[0][1] = 8
+    low_prob_tensor = paddle.linspace(2.0, 0.0, vocab_size - 2)
+    logits[0][2:] = low_prob_tensor
 
-    return data2, data3
+    probs = F.softmax(logits)
+    probs= min_p_sampling(probs, min_p)
+    return probs
 
 
-# batch:[0.1.0,5,0.9]：FastDeploy
+# Batch request, min_p=[0.1, 0.0, 0.9]: FastDeploy min_p_sampling op
 def fastdeploy_batch_min_p_sampling(batch_size, min_p_values):
     logits = paddle.ones(shape=[batch_size, vocab_size], dtype="float32")
     for b in range(batch_size):
@@ -134,68 +65,41 @@ def fastdeploy_batch_min_p_sampling(batch_size, min_p_values):
     probs = F.softmax(logits, axis=-1)
     min_p_arr = paddle.to_tensor(min_p_values, dtype="float32")
 
-    allowed_tokens_list = []
-    for b in range(batch_size):
-        max_prob = probs[b].max().item()
-        threshold = max_prob * min_p_values[b]
-        allowed_tokens = paddle.where(probs[b] >= threshold)[0].numpy()
-        allowed_tokens_list.append(allowed_tokens)
-
-    sample_freq = [np.zeros(vocab_size, dtype=float) for _ in range(batch_size)]
-    low_prob_token_times = [0] * batch_size
-    low_prob_token_probs = [[] for _ in range(batch_size)]
-
-    for i in tqdm(range(sample_time), desc="FastDeploy Batch Sampling"):
-        ids = min_p_sampling(probs, min_p_arr, seed=-1)
-        for b in range(batch_size):
-            sample_freq[b][ids[b].item()] += 1
-            if ids[b].item() >= 2:
-                low_prob_token_times[b] += 1
-            low_prob_token_probs[b].append(low_prob_token_times[b] / (i + 1))
-
-    data2_list = []
-    data3_list = []
-    for b in range(batch_size):
-        sample_freq_b = sample_freq[b] / sample_time
-        low_prob_token_probs[b] = np.array(low_prob_token_probs[b], dtype=float)
-
-        ori_data1 = probs[b].numpy()
-        data1 = compress(ori_data1)
-        data2 = compress(sample_freq_b)
-        data2_list.append(data2)
+    probs = min_p_sampling(probs, min_p_arr)
 
-        allowed_probs = probs[b, allowed_tokens_list[b]].numpy()
-        norm_scale = np.sum(allowed_probs)
-        data3 = np.zeros_like(data1)
-        for idx in allowed_tokens_list[b]:
-            if idx < 2:
-                data3[idx] = ori_data1[idx] / norm_scale
-            else:
-                data3[2] += ori_data1[idx] / norm_scale
-        data3_list.append(data3)
+    return probs
 
-        plot_bar_chart(data1, data2, data3, "FastDeploy[min_p_batch_sampling]", b)
-        plot_low_prob_curve(low_prob_token_probs[b], sample_time, "FastDeploy[min_p_batch_sampling]", b)
+def compare_results(probs,probs_cpu,atol=1e-6,rtol=1e-6):
+    probs_np = probs.numpy()
+    probs_cpu_np = probs_cpu.numpy()
+    try:
+        np.testing.assert_allclose(
+            probs_np,
+            probs_cpu_np,
+            rtol=rtol,
+            atol=atol,
+        )
+        print("The results are same between fastdeploy_min_p_sampling and min_p_sampling_cpu")
+    except AssertionError as e:
+        raise AssertionError(
+            f"The results are different between fastdeploy_min_p_sampling and min_p_sampling_cpu:\n{str(e)}")
 
-    return data2_list, data3_list
 
 
 def main():
+    # Single request, min_p=0.5: FastDeploy op vs. CPU reference
+    min_p = paddle.to_tensor([min_p_value],dtype="float32")
     print("Running single min_p sampling (min_p=0.5)...")
-    data2_fastdeploy, data3_fastdeploy = fastdeploy_min_p_sampling()
-
-    print("\nFastDeploy Single Request Results:")
-    print(f"Sampled Probability: {data2_fastdeploy}")
-    print(f"Theoretical Normalized Probability: {data3_fastdeploy}")
+    probs = fastdeploy_min_p_sampling(min_p)
+    probs_cpu = min_p_sampling_cpu(min_p)
+    compare_results(probs,probs_cpu)
 
-    print("\nRunning batch min_p sampling (min_p=[0.1, 0.5, 0.9])...")
-    data2_fd_batch, data3_fd_batch = fastdeploy_batch_min_p_sampling(batch_size, batch_min_p_values)
+    # Batch request, min_p=[0.1, 0.0, 0.9]: FastDeploy op vs. CPU reference
+    batch_min_p = paddle.to_tensor(batch_min_p_values,dtype="float32")
+    batch_probs = fastdeploy_batch_min_p_sampling(batch_size,batch_min_p)
+    batch_probs_cpu = min_p_sampling_cpu(batch_min_p)
+    compare_results(batch_probs,batch_probs_cpu)
 
-    data2_fd_batch,data3_fd_batch = fastdeploy_batch_min_p_sampling(batch_size,batch_min_p_values2)
-
-    for b in range(batch_size):
-        print(f"\nBatch Request {b} (min_p={batch_min_p_values[b]}):")
-        print(f"FastDeploy - Sampled: {data2_fd_batch[b]}, Normalized: {data3_fd_batch[b]}")
 
 if __name__ == "__main__":
     if paddle.device.is_compiled_with_cuda():