Commit 7c2ff96

Merge branch 'develop' into adapter_flex_checkpoint

2 parents: 1d439b0 + 119ed11

File tree: 49 files changed, +693 -380 lines

.github/workflows/approval.yml

Lines changed: 3 additions & 4 deletions
@@ -23,9 +23,8 @@ jobs:
       - name: Update paddle
         run: |
           wget -q --no-proxy https://xly-devops.bj.bcebos.com/PaddleTest/PaddleNLP/PaddleNLP-develop.tar.gz --no-check-certificate
-          rm -rf PaddleNLP-develop && tar zxf PaddleNLP-develop.tar.gz >/dev/null
-          mv PaddleNLP-develop PaddleNLP && rm -rf PaddleNLP-develop.tar.gz >/dev/null
-          cd PaddleNLP/
+          tar zxf PaddleNLP-develop.tar.gz --strip-components=1 >/dev/null
+          rm -rf PaddleNLP-develop.tar.gz >/dev/null
           git fetch origin pull/${PR_ID}/head
           git checkout -b origin_pr FETCH_HEAD
           git remote add upstream https://github.com/PaddlePaddle/PaddleNLP.git
@@ -44,5 +43,5 @@ jobs:
       - name: Display Required Approvers
         if: steps.check-bypass.outputs.can-skip != 'true'
         run: |
-          cd PaddleNLP/scripts/ci_approval
+          cd scripts/ci_approval
           bash -x run_ci_approval.sh
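
The updated step unpacks the source tree in place rather than extracting, renaming, and entering a PaddleNLP directory. As a rough illustration of what `--strip-components=1` does, here is a minimal Python sketch (a hypothetical helper, not part of the CI script):

import tarfile

def extract_strip_top_level(archive_path, dest="."):
    # Rough equivalent of `tar zxf archive --strip-components=1`:
    # drop the leading path component of every member before extracting.
    with tarfile.open(archive_path, "r:gz") as tf:
        for member in tf.getmembers():
            parts = member.name.split("/", 1)
            if len(parts) < 2 or not parts[1]:
                continue  # skip the top-level directory entry itself
            member.name = parts[1]
            tf.extract(member, path=dest)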

csrc/gpu/unittest/test_get_padding_offset_v2.py

Lines changed: 0 additions & 1 deletion
@@ -64,6 +64,5 @@ def test_get_padding_offset_v2(self):
         assert sum(ref_cu_seqlens_q - cu_seqlens_q) == 0, "Check cu_seqlens_q failed."
         assert sum(ref_cu_seqlens_k - cu_seqlens_k) == 0, "Check cu_seqlens_k failed."

-
 if __name__ == "__main__":
     unittest.main()

llm/tools/preprocess/create_pretraining_data.py

Lines changed: 1 addition & 1 deletion
@@ -176,7 +176,7 @@ def get_whole_word_mask_tokens(tokens, words, max_word_length=6):
             i += 1
             continue

-        # add "##" mark on the middel tokens of Chinese words
+        # add "##" mark on the middle tokens of Chinese words
         # such as ["通过", "利用"] -> ["通", "##过", "利", "##用"]
         has_add = False
         for length in range(max_word_length, 0, -1):
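
The comment fix above concerns whole-word masking for Chinese. As context, a minimal sketch of the "##" marking convention it describes (a simplified, hypothetical helper; the real get_whole_word_mask_tokens also aligns the word list against the tokenizer output):

def mark_word_pieces(words):
    # ["通过", "利用"] -> ["通", "##过", "利", "##用"]
    marked = []
    for word in words:
        for i, ch in enumerate(word):
            marked.append(ch if i == 0 else "##" + ch)
    return marked

print(mark_word_pieces(["通过", "利用"]))  # ['通', '##过', '利', '##用']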

llm/utils/fused_layers.py

Lines changed: 2 additions & 2 deletions
@@ -106,11 +106,11 @@ def sp_async_reducesctter(x_grad):
 def sync_mp_allreduce(task, dist_tensor):
     mp_placement_index = dist_tensor.process_mesh.dim_names.index("mp")
     new_placments = list()
-    for idx, placment in enumerate(dist_tensor.placements):
+    for idx, placement in enumerate(dist_tensor.placements):
         if idx == mp_placement_index:
             new_placments.append(dist.Replicate())
         else:
-            new_placments.append(placment)
+            new_placments.append(placement)
     place = paddle.framework._current_expected_place()
     place = paddle.framework._get_paddle_place(place)

llm/utils/sp_async_reduce_scatter.py

Lines changed: 1 addition & 1 deletion
@@ -172,7 +172,7 @@ def forward_pre_hook(layer, input):
     ipp = id2ipp[id(layer)]


-def forward_post_hook(layer, input, ouput):
+def forward_post_hook(layer, input, output):
     paddle.nn.functional.linear = paddle_nn_functional_linear
     if is_fused_matmul_bias_supported():
         paddle.incubate.nn.functional.fused_linear = paddle_incubate_nn_functional_fused_linear

paddlenlp/experimental/transformers/llama/modeling.py

Lines changed: 137 additions & 46 deletions
Large diffs are not rendered by default.

paddlenlp/trainer/auto_trainer.py

Lines changed: 14 additions & 0 deletions
@@ -16,6 +16,7 @@
 import os
 import random
 import time
+import types
 from typing import Any, Dict, Optional, Union

 import numpy as np
@@ -24,6 +25,7 @@
 import paddle.distributed.auto_parallel.intermediate.parallelize as parallelize
 import paddle.nn as nn
 from paddle.distributed import fleet
+from paddle.distributed.auto_parallel._utils import _patch_grads_for_step
 from paddle.profiler.utils import switch_job_schedule_profiler
 from tqdm.auto import tqdm

@@ -518,6 +520,18 @@ def _inner_training_loop(
             npu_accelerate_plugin(self.optimizer)

         model, dist_loader = self._wrap_for_auto(model, train_dataloader)
+
+        if (
+            dist.in_auto_parallel_align_mode()
+        ):  # When in auto parallel align mode, patching the optimizer step function
+
+            orig_step = (
+                self.optimizer.step.__func__ if hasattr(self.optimizer.step, "__func__") else self.optimizer.step
+            )
+            decorator = _patch_grads_for_step(amp_master_grad=self.args.amp_master_grad)
+            new_step = decorator(orig_step)
+            self.optimizer.__dict__["step"] = types.MethodType(new_step, self.optimizer)
+
         train_dataloader = dist_loader()
         if resume_from_checkpoint is not None:
             self._load_from_checkpoint(resume_from_checkpoint)
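
The block added above rebinds the optimizer's step method on the instance so extra gradient handling can run before each step in align mode. A minimal sketch of that monkey-patching pattern, with a stand-in decorator instead of the real _patch_grads_for_step (all names below are illustrative):

import types

class Optimizer:
    def step(self):
        print("original step")

def make_decorator():
    # Stand-in for _patch_grads_for_step(...): returns a wrapper around the unbound step function.
    def decorator(orig_step):
        def new_step(self, *args, **kwargs):
            print("extra work before step")  # e.g. touch up gradients here
            return orig_step(self, *args, **kwargs)
        return new_step
    return decorator

opt = Optimizer()
orig_step = opt.step.__func__ if hasattr(opt.step, "__func__") else opt.step
new_step = make_decorator()(orig_step)
# Writing into __dict__ shadows the class attribute for this instance only.
opt.__dict__["step"] = types.MethodType(new_step, opt)
opt.step()  # prints "extra work before step", then "original step"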

paddlenlp/trainer/trainer.py

Lines changed: 22 additions & 7 deletions
@@ -45,6 +45,11 @@
 from paddle import framework
 from paddle.distributed.fleet.meta_parallel import PipelineLayer

+try:
+    from paddle.distributed.fleet.meta_parallel import PipelineDatasetPreprocessor
+except:
+    PipelineDatasetPreprocessor = None
+
 try:
     from paddle.base import core
 except:
@@ -2756,22 +2761,32 @@ def training_pipeline_step(self, model: nn.Layer, inputs: Dict[str, Union[paddle
         # for v in self._pp_data_buffer[0].values():
         #     assert isinstance(v, paddle.Tensor), f"Only support tensor as pipeline mode input, got type {type(v)}"

-        with self.autocast_smart_context_manager():
-            inputs = model._prepare_pipeline_inputs_func(self._pp_data_buffer)
-            self._pp_data_buffer = []
-
         model.train()
         if model._dp_comm_overlap or model._sharding_comm_overlap:
             for _, buffers in model._chunk_2_comm_buffers.items():
                 for buffer in buffers:
                     buffer._acc_steps = self.args.gradient_accumulation_steps

-        inputs = model._prepare_training(
-            inputs, self.optimizer, self.lr_scheduler
-        )  # None, None => [optimizer, lr_scheduler]
         model.optimizer = None  # we do not use `PipelineParallel` to handler optimizer step
         model.lr_scheduler = None

+        def _dataset_process_function():
+            # Pass a local function to forward_backward_pipeline instead of the dataset itself.
+            # This prevents the dataset from being passed as a direct argument to forward_backward_pipeline,
+            # which would create additional reference counts that cannot be cleared, leading to GPU memory leaks.
+            with self.autocast_smart_context_manager():
+                inputs = model._prepare_pipeline_inputs_func(self._pp_data_buffer)
+                self._pp_data_buffer = []
+
+            return model._prepare_training(
+                inputs, self.optimizer, self.lr_scheduler
+            )  # None, None => [optimizer, lr_scheduler]
+
+        if PipelineDatasetPreprocessor is None:
+            inputs = _dataset_process_function()
+        else:
+            inputs = PipelineDatasetPreprocessor(_dataset_process_function)
+
         with self.autocast_smart_context_manager():
             loss = model.forward_backward_pipeline(inputs, self.scaler if self.do_grad_scaling else None)
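
The new _dataset_process_function defers input preparation: the pipeline receives a zero-argument callable and materializes the prepared inputs itself, so no extra reference to the data buffer survives the call. A toy, Paddle-independent sketch of the idea (all names are illustrative):

def run_pipeline(get_inputs):
    # Inputs are created only here, inside the callee's frame; once this
    # function returns, no outer reference keeps them alive.
    inputs = get_inputs()
    result = sum(inputs)
    del inputs  # release as early as possible
    return result

buffer = list(range(5))

def prepare():
    global buffer
    data, buffer = buffer, []  # hand over the buffer and clear it
    return data

print(run_pipeline(prepare))  # 10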

paddlenlp/trainer/training_args.py

Lines changed: 6 additions & 1 deletion
@@ -1145,7 +1145,12 @@ class TrainingArguments:
     def __post_init__(self):
         world_size = paddle.distributed.get_world_size()
         if in_auto_parallel_align_mode():
-            self.max_grad_norm = 0.0
+            # self.max_grad_norm = 0.0
+            # The current auto_hybrid_pp has aligned the handling of ClipGradByGlobalNorm with the original dygraph semi-auto parallel and dynamic manual-parallel modes and can correctly handle grad_clip, so it is no longer necessary to set max_grad_norm=0.0.
+            if self.max_grad_norm != 0.0:
+                warnings.warn(
+                    "max_grad_norm is not 0.0, so ClipGradByGlobalNorm will be executed; if you want to disable it, please set max_grad_norm=0.0"
+                )
             os.environ["FLAGS_max_inplace_grad_add"] = "65536"
             os.environ["FLAGS_embedding_deterministic"] = "1"
             os.environ["FLAGS_cudnn_deterministic"] = "1"

paddlenlp/transformers/clipseg/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -340,7 +340,7 @@ def forward(
         attn_weights = nn.functional.softmax(attn_weights, axis=-1)

         if output_attentions:
-            # this operation is a bit akward, but it's required to
+            # this operation is a bit awkward, but it's required to
             # make sure that attn_weights keeps its gradient.
             # In order to do so, attn_weights have to reshaped
             # twice and have to be reused in the following
