Commit e6e4928

【RL】Add DAPO (#10380)
* add dapo
* fix dapo
* fix conflict
1 parent d854740 commit e6e4928

8 files changed: +582 -37 lines changed

paddlenlp/datasets/rlhf_datasets/protocol.py

Lines changed: 2 additions & 6 deletions
@@ -219,9 +219,6 @@ class DataProtoItem:
 class DataProto:
     """
     A DataProto is a data structure that aims to provide a standard protocol for data exchange between functions.
-    It contains a batch (TensorDict) and a meta_info (Dict). The batch is a TensorDict https://pytorch.org/tensordict/.
-    TensorDict allows you to manipulate a dictionary of Tensors like a single Tensor. Ideally, the tensors with the
-    same batch size should be put inside batch.
     """

     batch: TensorDict = None
@@ -343,7 +340,7 @@ def to(self, device) -> "DataProto":
         """move the batch to device

         Args:
-            device (torch.device, str): torch device
+            device (paddle.device, str): paddle device

         Returns:
             DataProto: the current DataProto
@@ -466,8 +463,7 @@ def union(self, other: "DataProto") -> "DataProto":
         return self

     def make_iterator(self, mini_batch_size, epochs, seed=None, dataloader_kwargs=None):
-        """Make an iterator from the DataProto. This is built upon that TensorDict can be used as a normal Pytorch
-        dataset. See https://pytorch.org/tensordict/tutorials/data_fashion for more details.
+        """Make an iterator from the DataProto.

         Args:
             mini_batch_size (int): mini-batch size when iterating the dataset. We require that

paddlenlp/datasets/rlhf_datasets/rl_dataset.py

Lines changed: 2 additions & 0 deletions
@@ -42,10 +42,12 @@ def padding_batch_data(samples: list[dict], pad_token_id: int, requires_label: b
     # attention_mask = [np.ones(input_id.shape, dtype=bool) for input_id in input_ids]
     input_dict["input_ids"] = left_padding(input_ids, padding_value=pad_token_id)
     # input_dict["attention_mask"] = left_padding(attention_mask, padding_value=0)
+    input_dict["raw_prompt_len"] = paddle.to_tensor([len(sample["input_ids"]) for sample in samples])

     if requires_label:
         label_ids = [sample["label_ids"] for sample in samples]
         input_dict["label_ids"] = left_padding(label_ids, padding_value=pad_token_id)
+        input_dict["raw_label_ids_len"] = paddle.to_tensor([len(sample["label_ids"]) for sample in samples])

     return input_dict
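The two added fields record the true, unpadded sequence lengths, which left padding would otherwise hide (DAPO's length-based penalty needs them later). A minimal, self-contained sketch of the idea, using made-up token ids instead of the repo's left_padding helper:

import paddle

# Hedged illustration: after left padding, row shapes no longer reveal the original
# prompt lengths, so the raw lengths are stored alongside the padded batch.
pad_token_id = 0
samples = [{"input_ids": [5, 6, 7]}, {"input_ids": [8, 9]}]

max_len = max(len(s["input_ids"]) for s in samples)
padded = paddle.to_tensor(
    [[pad_token_id] * (max_len - len(s["input_ids"])) + s["input_ids"] for s in samples]
)
raw_prompt_len = paddle.to_tensor([len(s["input_ids"]) for s in samples])

print(padded.tolist())          # [[5, 6, 7], [0, 8, 9]]
print(raw_prompt_len.tolist())  # [3, 2], the true lengths survive the left padding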

paddlenlp/rl/algos/penalty.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+
+
+def apply_overlong_penalty(response_length, max_dec_len, overlong_buffer_len, penalty_factor):
+    """
+    Apply length penalty to overlong responses.
+
+    Args:
+        response_length (paddle.Tensor): Tensor of shape (B,) indicating the length of each response.
+        max_dec_len (int): The maximum allowed decoding length.
+        overlong_buffer_len (int): The allowed buffer before applying penalty.
+        penalty_factor (float): The penalty factor to scale the length overflow.
+
+    Returns:
+        paddle.Tensor: A tensor of shape (B,) representing the length penalty for each response.
+    """
+    expected_len = max_dec_len - overlong_buffer_len
+    exceed_len = response_length - expected_len
+
+    reward_penalty = -exceed_len / overlong_buffer_len * penalty_factor
+    # Only apply negative penalty if response exceeds limit, otherwise zero
+    overlong_penalty = paddle.minimum(reward_penalty, paddle.zeros_like(reward_penalty))
+
+    return overlong_penalty
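For intuition, a small worked example of the penalty ramp; the numbers are illustrative and the import simply mirrors the new file's location:

import paddle

from paddlenlp.rl.algos.penalty import apply_overlong_penalty

# Illustrative settings: generation capped at 512 tokens with a 128-token buffer,
# so penalties start once a response exceeds 512 - 128 = 384 tokens.
response_length = paddle.to_tensor([300.0, 400.0, 512.0])
penalty = apply_overlong_penalty(
    response_length, max_dec_len=512, overlong_buffer_len=128, penalty_factor=1.0
)
print(penalty.tolist())  # [0.0, -0.125, -1.0]

Responses at or below 384 tokens are untouched; inside the buffer the penalty grows linearly, reaching -penalty_factor once the response length hits max_dec_len.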

paddlenlp/rl/models/ppo_model_utils.py

Lines changed: 36 additions & 10 deletions
@@ -239,7 +239,7 @@ def create_startend_row_indices(input_ids, pad_token_id=0):


 class RLHFPPOLoss(nn.Layer):
-    def __init__(self, config, clip_range_ratio=0.2):
+    def __init__(self, config, clip_range_ratio=0.2, clip_range_ratio_low=None, clip_range_ratio_high=None):
         """
         Initialize the `ClipRewardRange` object.

@@ -257,6 +257,8 @@ def __init__(self, config, clip_range_ratio=0.2):
         """
         super().__init__()
         self.clip_range_ratio = clip_range_ratio
+        self.clip_range_ratio_low = clip_range_ratio_low
+        self.clip_range_ratio_high = clip_range_ratio_high
         self.config = config

     def actor_loss_fn(
@@ -283,8 +285,8 @@ def actor_loss_fn(
         pg_loss1 = -advantages * ratio
         pg_loss2 = -advantages * paddle.clip(
             ratio,
-            1.0 - self.clip_range_ratio,
-            1.0 + self.clip_range_ratio,
+            1.0 - self.clip_range_ratio_low,
+            1.0 + self.clip_range_ratio_high,
         )
         return paddle.sum(paddle.maximum(pg_loss1, pg_loss2) * mask) / mask.sum()

@@ -361,6 +363,8 @@ def __init__(
         config,
         ptx_coeff=16,
         clip_range_ratio=0.2,
+        clip_range_ratio_low=None,
+        clip_range_ratio_high=None,
         kl_loss_coeff=0.001,
         clip_range_score=10,
         info_buffer=None,
@@ -379,10 +383,14 @@ def __init__(
         self.config = config
         self.ptx_coeff = ptx_coeff
         # if self.config.use_fused_head_and_loss_fn:
-        #     self.ppo_criterion = FusedPPOLoss(config, clip_range_ratio)
+        #     self.ppo_criterion = FusedPPOLoss(config, clip_range_ratio, clip_range_ratio_low, clip_range_ratio_high)
         # else:
-        #     self.ppo_criterion = RLHFPPOLoss(config, clip_range_ratio)
-        self.ppo_criterion = RLHFPPOLoss(config, clip_range_ratio)
+        #     self.ppo_criterion = RLHFPPOLoss(config, clip_range_ratio, clip_range_ratio_low, clip_range_ratio_high)
+        self.clip_range_ratio_low = clip_range_ratio_low if clip_range_ratio_low is not None else clip_range_ratio
+        self.clip_range_ratio_high = clip_range_ratio_high if clip_range_ratio_high is not None else clip_range_ratio
+        self.ppo_criterion = RLHFPPOLoss(
+            config, clip_range_ratio, self.clip_range_ratio_low, self.clip_range_ratio_high
+        )
         self.sft_criterion = PretrainingCriterion(config)
         self.kl_loss_coeff = kl_loss_coeff
         self.clip_range_score = clip_range_score
@@ -449,6 +457,8 @@ def forward(
                 tensor_parallel_output=self.config.tensor_parallel_output,
                 pg_loss_coeff=self.pg_loss_coeff,  # donot use this
                 clip_range_ratio=self.clip_range_ratio,
+                clip_range_ratio_low=self.clip_range_ratio_low,
+                clip_range_ratio_high=self.clip_range_ratio_high,
                 entropy_coeff=self.entropy_coeff,  # donot support this
                 clip_range_score=self.clip_range_score,
                 kl_loss_coeff=self.kl_loss_coeff,
@@ -642,6 +652,8 @@ def forward(
         ref_log_probs: paddle.Tensor,
         advantages: paddle.Tensor,
         clip_range_ratio: float,
+        clip_range_ratio_low: float,
+        clip_range_ratio_high: float,
         clip_range_score: float,
         kl_loss_coeff: float,  # KL loss coefficient
         temperature: float,
@@ -777,7 +789,9 @@ def forward(

             # ratio
             ratio_chunk = paddle.exp(log_probs_chunk - old_log_probs_chunk)
-            clipped_ratio_chunk = paddle.clip(ratio_chunk, min=1.0 - clip_range_ratio, max=1.0 + clip_range_ratio)
+            clipped_ratio_chunk = paddle.clip(
+                ratio_chunk, min=1.0 - clip_range_ratio_low, max=1.0 + clip_range_ratio_high
+            )

             # final loss
             pg_loss1_chunk = -advantages_chunk * ratio_chunk
@@ -913,10 +927,12 @@ def backward(ctx, grad_output, *args):
 class FusedPPOLoss(nn.Layer):
     """Fused PPOLoss"""

-    def __init__(self, config, clip_range_ratio=0.2):
+    def __init__(self, config, clip_range_ratio=0.2, clip_range_ratio_low=None, clip_range_ratio_high=None):
         """Initialize FusedPPOLoss class."""
         super().__init__()
         self.clip_range_ratio = clip_range_ratio
+        self.clip_range_ratio_low = clip_range_ratio_low
+        self.clip_range_ratio_high = clip_range_ratio_high
         self.config = config

     def forward(
@@ -970,6 +986,8 @@ def forward(
             old_log_probs=old_log_probs,
             advantages=reward_advantages,
             clip_range_ratio=self.clip_range_ratio,
+            clip_range_ratio_low=self.clip_range_ratio_low,
+            clip_range_ratio_high=self.clip_range_ratio_high,
         )
         return actor_loss

@@ -994,6 +1012,8 @@ def forward(
         tensor_parallel_output: bool,
         pg_loss_coeff: float,
         clip_range_ratio: float,  # pg loss
+        clip_range_ratio_low: float,
+        clip_range_ratio_high: float,
         entropy_coeff: float,  # entropy loss
         clip_range_score: float,  # clip loss
         kl_loss_coeff: float,  # clip loss
@@ -1092,8 +1112,8 @@ def maybe_transpose(x):
             ratio_chunk = paddle.exp(log_probs_chunk - old_log_prob_chunk)
             clipped_ratio_chunk = paddle.clip(
                 ratio_chunk,
-                min=1.0 - clip_range_ratio,
-                max=1.0 + clip_range_ratio,
+                min=1.0 - clip_range_ratio_low,
+                max=1.0 + clip_range_ratio_high,
             )

             pg_loss1_chunk = -advantages_chunk * ratio_chunk
@@ -1249,6 +1269,8 @@ def actor_fused_pg_entropy_kl_loss(
     tensor_parallel_output: bool = False,
     pg_loss_coeff: float = 1.0,
     clip_range_ratio: float = 0.2,
+    clip_range_ratio_low: float = None,
+    clip_range_ratio_high: float = None,
     entropy_coeff: float = 0.001,
     clip_range_score: float = 10.0,
     kl_loss_coeff: float = 0.001,
@@ -1280,6 +1302,8 @@ def actor_fused_pg_entropy_kl_loss(
         fused_linear=fused_linear,
         loop_chunk_size=loop_chunk_size,
         clip_range_ratio=clip_range_ratio,
+        clip_range_ratio_low=clip_range_ratio_low,
+        clip_range_ratio_high=clip_range_ratio_high,
         clip_range_score=clip_range_score,
         kl_loss_coeff=kl_loss_coeff,
         ignore_index=-100,
@@ -1301,6 +1325,8 @@ def actor_fused_pg_entropy_kl_loss(
         tensor_parallel_output=tensor_parallel_output,
         pg_loss_coeff=pg_loss_coeff,
         clip_range_ratio=clip_range_ratio,  # pg loss
+        clip_range_ratio_low=clip_range_ratio_low,
+        clip_range_ratio_high=clip_range_ratio_high,
         entropy_coeff=entropy_coeff,  # entropy loss
         clip_range_score=clip_range_score,  # clip loss
         kl_loss_coeff=kl_loss_coeff,  # clip loss
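The recurring change in this file replaces PPO's symmetric clip range with separate low and high bounds (DAPO's clip-higher); as the __init__ hunk above shows, both fall back to clip_range_ratio when left as None. A toy, self-contained sketch of the asymmetric clipping, with invented values rather than anything taken from the PR:

import paddle

# DAPO-style decoupled clipping: the importance ratio is clipped to
# [1 - clip_range_ratio_low, 1 + clip_range_ratio_high] instead of a symmetric range.
clip_range_ratio_low, clip_range_ratio_high = 0.2, 0.28

ratio = paddle.to_tensor([0.5, 1.0, 1.5])       # exp(log_prob - old_log_prob)
advantages = paddle.to_tensor([1.0, 1.0, 1.0])  # toy positive advantages

pg_loss1 = -advantages * ratio
pg_loss2 = -advantages * paddle.clip(ratio, 1.0 - clip_range_ratio_low, 1.0 + clip_range_ratio_high)
loss = paddle.maximum(pg_loss1, pg_loss2).mean()

print(loss.item())  # about -0.9267; the third ratio is capped at 1.28 by the higher bound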
