Skip to content

Commit 55eae37

Browse files
committed
native top_p_sampling
1 parent 273efba commit 55eae37

File tree

14 files changed

+91
-10
lines changed

14 files changed

+91
-10
lines changed

fastdeploy/model_executor/layers/activation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def __init__(
6464
super().__init__()
6565

6666
if current_platform.is_cuda() or current_platform.is_xpu(
67-
) or current_platform.is_iluvatar():
67+
) or current_platform.is_iluvatar() or current_platform.is_dcu():
6868
self.forward = self.forward_cuda
6969
elif current_platform.is_gcu():
7070
self.forward = self.forward_gcu

fastdeploy/model_executor/layers/attention/mla_attention_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
open_shm_and_get_meta_signal)
3030
from fastdeploy.platforms import current_platform
3131

32-
if current_platform.is_cuda() and not current_platform.is_dcu():
32+
if current_platform.is_cuda():
3333
from fastdeploy.model_executor.ops.gpu import (decode_mla_write_cache,
3434
multi_head_latent_attention,
3535
prefill_mla_write_cache)

fastdeploy/model_executor/layers/attention/ops/append_attention.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
from fastdeploy.platforms import current_platform
2222

23-
if current_platform.is_cuda() and not current_platform.is_dcu():
23+
if current_platform.is_cuda():
2424
from fastdeploy.model_executor.ops.gpu import \
2525
append_attention as append_attention_gpu
2626

fastdeploy/model_executor/layers/backends/dcu/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,6 @@
1818

1919
from .fused_moe_triton_backends import DCUTritonWeightOnlyMoEMethod
2020
from .weight_only import DCUWeightOnlyLinearMethod
21+
from .top_p_sampling import native_top_p_sampling
2122

22-
__all__ = ['DCUTritonWeightOnlyMoEMethod', 'DCUWeightOnlyLinearMethod']
23+
__all__ = ['DCUTritonWeightOnlyMoEMethod', 'DCUWeightOnlyLinearMethod', "native_top_p_sampling"]
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""
2+
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""
16+
import paddle
17+
18+
19+
def native_top_p_sampling(
    probs: paddle.Tensor,
    top_p: paddle.Tensor
) -> tuple[paddle.Tensor, paddle.Tensor]:
    """Pure-Paddle nucleus (top-p) sampling fallback.

    Used on platforms (e.g. DCU) without the fused ``top_p_sampling`` kernel.
    Zeroes out every token whose descending cumulative probability exceeds
    ``top_p`` (always keeping the highest-probability token of each row),
    then draws one token per row from the surviving mass.

    Args:
        probs: ``[batch, vocab]`` probability tensor.
        top_p: per-row nucleus threshold, broadcastable against ``probs``.

    Returns:
        A ``(None, next_tokens)`` pair; the first slot mirrors the fused
        kernel's signature and carries no value here.
    """
    batch, vocab = probs.shape[0], probs.shape[-1]

    # Rank tokens per row from most to least probable.
    order = paddle.argsort(probs, descending=True)
    probs_desc = paddle.sort(probs, descending=True)
    cum = paddle.cumsum(probs_desc, axis=-1)

    # Mark tokens past the threshold, shifted right by one so the token
    # that first crosses top_p is still kept (and column 0 never drops).
    drop = paddle.cast(cum > top_p, dtype="int64")
    drop = paddle.concat([paddle.zeros_like(drop[:, :1]), drop[:, :-1]], axis=-1)

    # Scatter the sorted-order drop flags back to vocabulary order via
    # flattened per-row offsets.
    row_offset = paddle.arange(batch, dtype="int64").unsqueeze(-1) * vocab
    flat_index = (order + row_offset).flatten()
    flat_drop = drop.flatten()
    scattered = paddle.scatter(flat_drop, flat_index, flat_drop)

    zero_mask = paddle.cast(scattered, "bool").reshape(probs.shape)
    filtered = paddle.where(zero_mask, paddle.full_like(probs, 0.0), probs)

    # multinomial renormalizes the remaining mass per row.
    next_tokens = paddle.multinomial(filtered)

    return None, next_tokens

fastdeploy/model_executor/layers/linear.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ def __init__(
5858
"""
5959
super().__init__()
6060
if current_platform.is_cuda() or current_platform.is_xpu(
61-
) or current_platform.is_iluvatar() or current_platform.is_gcu():
61+
) or current_platform.is_iluvatar() or current_platform.is_gcu(
62+
) or current_platform.is_dcu():
6263
self.forward = self.forward_cuda
6364
else:
6465
raise NotImplementedError

fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
from ..utils import create_and_set_parameter, get_tensor
2828
from .fused_moe_backend_base import MoEMethodBase
2929

30-
if current_platform.is_cuda() and not current_platform.is_dcu():
30+
if current_platform.is_cuda():
3131
from fastdeploy.model_executor.ops.gpu import (moe_expert_dispatch,
3232
moe_expert_reduce, noaux_tc)
3333
elif current_platform.is_iluvatar():

fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,23 @@ def apply_penalty_multi_scores(
5353
min_dec_lens,
5454
eos_token_ids,
5555
)
56+
elif current_platform.is_dcu():
57+
from fastdeploy.model_executor.ops.gpu import \
58+
get_token_penalty_multi_scores
59+
logits = get_token_penalty_multi_scores(
60+
pre_token_ids,
61+
prompt_ids,
62+
prompt_lens,
63+
logits,
64+
repetition_penalties,
65+
frequency_penalties,
66+
presence_penalties,
67+
temperature,
68+
bad_words_token_ids,
69+
step_idx,
70+
min_dec_lens,
71+
eos_token_ids,
72+
)
5673
elif current_platform.is_xpu():
5774
from fastdeploy.model_executor.ops.xpu import \
5875
get_token_penalty_multi_scores

fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,9 @@ def top_k_top_p_sampling(
8282
else:
8383
if current_platform.is_gcu():
8484
_, ids = gcu_top_p_sampling(x, top_p)
85+
elif current_platform.is_dcu():
86+
from fastdeploy.model_executor.layers.backends import native_top_p_sampling
87+
_, ids = native_top_p_sampling(x, top_p)
8588
else:
8689
_, ids = paddle.tensor.top_p_sampling(x,
8790
top_p,

fastdeploy/model_executor/layers/sample/sampler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,8 @@ def __init__(self):
172172
"""
173173
super().__init__()
174174
if current_platform.is_cuda() or current_platform.is_xpu(
175-
) or current_platform.is_iluvatar() or current_platform.is_gcu():
175+
) or current_platform.is_iluvatar() or current_platform.is_gcu(
176+
) or current_platform.is_dcu():
176177
self.forward = self.forward_cuda
177178
else:
178179
raise NotImplementedError()

0 commit comments

Comments
 (0)