Skip to content

Commit 55eae37

Browse files
committed
native top_p_sampling
1 parent 273efba commit 55eae37

File tree

14 files changed

+91
-10
lines changed

14 files changed

+91
-10
lines changed

fastdeploy/model_executor/layers/activation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def __init__(
6464
super().__init__()
6565

6666
if current_platform.is_cuda() or current_platform.is_xpu(
67-
) or current_platform.is_iluvatar():
67+
) or current_platform.is_iluvatar() or current_platform.is_dcu():
6868
self.forward = self.forward_cuda
6969
elif current_platform.is_gcu():
7070
self.forward = self.forward_gcu

fastdeploy/model_executor/layers/attention/mla_attention_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
open_shm_and_get_meta_signal)
3030
from fastdeploy.platforms import current_platform
3131

32-
if current_platform.is_cuda() and not current_platform.is_dcu():
32+
if current_platform.is_cuda():
3333
from fastdeploy.model_executor.ops.gpu import (decode_mla_write_cache,
3434
multi_head_latent_attention,
3535
prefill_mla_write_cache)

fastdeploy/model_executor/layers/attention/ops/append_attention.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
from fastdeploy.platforms import current_platform
2222

23-
if current_platform.is_cuda() and not current_platform.is_dcu():
23+
if current_platform.is_cuda():
2424
from fastdeploy.model_executor.ops.gpu import \
2525
append_attention as append_attention_gpu
2626

fastdeploy/model_executor/layers/backends/dcu/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,6 @@
1818

1919
from .fused_moe_triton_backends import DCUTritonWeightOnlyMoEMethod
2020
from .weight_only import DCUWeightOnlyLinearMethod
21+
from .top_p_sampling import native_top_p_sampling
2122

22-
__all__ = ['DCUTritonWeightOnlyMoEMethod', 'DCUWeightOnlyLinearMethod']
23+
__all__ = ['DCUTritonWeightOnlyMoEMethod', 'DCUWeightOnlyLinearMethod', "native_top_p_sampling"]
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""
2+
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""
16+
import paddle
17+
18+
19+
def native_top_p_sampling(
    probs: paddle.Tensor,
    top_p: paddle.Tensor
) -> tuple[paddle.Tensor, paddle.Tensor]:
    """Pure-Paddle nucleus (top-p) sampling fallback.

    Used on platforms (e.g. DCU) without the fused ``top_p_sampling`` kernel.
    Zeroes out every token whose descending cumulative probability exceeds
    ``top_p`` (always keeping the highest-probability token of each row),
    then draws one token per row from the surviving mass.

    Args:
        probs: ``[batch, vocab]`` probability tensor.
        top_p: per-row nucleus threshold, broadcastable against ``probs``.

    Returns:
        A ``(None, next_tokens)`` pair; the first slot mirrors the fused
        kernel's signature and carries no value here.
    """
    batch, vocab = probs.shape[0], probs.shape[-1]

    # Rank tokens per row from most to least probable.
    order = paddle.argsort(probs, descending=True)
    probs_desc = paddle.sort(probs, descending=True)
    cum = paddle.cumsum(probs_desc, axis=-1)

    # Mark tokens past the threshold, shifted right by one so the token
    # that first crosses top_p is still kept (and column 0 never drops).
    drop = paddle.cast(cum > top_p, dtype="int64")
    drop = paddle.concat([paddle.zeros_like(drop[:, :1]), drop[:, :-1]], axis=-1)

    # Scatter the sorted-order drop flags back to vocabulary order via
    # flattened per-row offsets.
    row_offset = paddle.arange(batch, dtype="int64").unsqueeze(-1) * vocab
    flat_index = (order + row_offset).flatten()
    flat_drop = drop.flatten()
    scattered = paddle.scatter(flat_drop, flat_index, flat_drop)

    zero_mask = paddle.cast(scattered, "bool").reshape(probs.shape)
    filtered = paddle.where(zero_mask, paddle.full_like(probs, 0.0), probs)

    # multinomial renormalizes the remaining mass per row.
    next_tokens = paddle.multinomial(filtered)

    return None, next_tokens

fastdeploy/model_executor/layers/linear.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ def __init__(
5858
"""
5959
super().__init__()
6060
if current_platform.is_cuda() or current_platform.is_xpu(
61-
) or current_platform.is_iluvatar() or current_platform.is_gcu():
61+
) or current_platform.is_iluvatar() or current_platform.is_gcu(
62+
) or current_platform.is_dcu():
6263
self.forward = self.forward_cuda
6364
else:
6465
raise NotImplementedError

fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
from ..utils import create_and_set_parameter, get_tensor
2828
from .fused_moe_backend_base import MoEMethodBase
2929

30-
if current_platform.is_cuda() and not current_platform.is_dcu():
30+
if current_platform.is_cuda():
3131
from fastdeploy.model_executor.ops.gpu import (moe_expert_dispatch,
3232
moe_expert_reduce, noaux_tc)
3333
elif current_platform.is_iluvatar():

fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,23 @@ def apply_penalty_multi_scores(
5353
min_dec_lens,
5454
eos_token_ids,
5555
)
56+
elif current_platform.is_dcu():
57+
from fastdeploy.model_executor.ops.gpu import \
58+
get_token_penalty_multi_scores
59+
logits = get_token_penalty_multi_scores(
60+
pre_token_ids,
61+
prompt_ids,
62+
prompt_lens,
63+
logits,
64+
repetition_penalties,
65+
frequency_penalties,
66+
presence_penalties,
67+
temperature,
68+
bad_words_token_ids,
69+
step_idx,
70+
min_dec_lens,
71+
eos_token_ids,
72+
)
5673
elif current_platform.is_xpu():
5774
from fastdeploy.model_executor.ops.xpu import \
5875
get_token_penalty_multi_scores

fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,9 @@ def top_k_top_p_sampling(
8282
else:
8383
if current_platform.is_gcu():
8484
_, ids = gcu_top_p_sampling(x, top_p)
85+
elif current_platform.is_dcu():
86+
from fastdeploy.model_executor.layers.backends import native_top_p_sampling
87+
_, ids = native_top_p_sampling(x, top_p)
8588
else:
8689
_, ids = paddle.tensor.top_p_sampling(x,
8790
top_p,

fastdeploy/model_executor/layers/sample/sampler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,8 @@ def __init__(self):
172172
"""
173173
super().__init__()
174174
if current_platform.is_cuda() or current_platform.is_xpu(
175-
) or current_platform.is_iluvatar() or current_platform.is_gcu():
175+
) or current_platform.is_iluvatar() or current_platform.is_gcu(
176+
) or current_platform.is_dcu():
176177
self.forward = self.forward_cuda
177178
else:
178179
raise NotImplementedError()

0 commit comments

Comments
 (0)