
Commit 2ac9ac8 (merge commit, 2 parents: 74878bd + 119ed11)

Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP into masterweight

File tree

17 files changed: 984 additions and 448 deletions

csrc/gpu/unittest/test_get_padding_offset_v2.py

Lines changed: 0 additions & 1 deletion
@@ -64,6 +64,5 @@ def test_get_padding_offset_v2(self):
         assert sum(ref_cu_seqlens_q - cu_seqlens_q) == 0, "Check cu_seqlens_q failed."
         assert sum(ref_cu_seqlens_k - cu_seqlens_k) == 0, "Check cu_seqlens_k failed."
 
-
 if __name__ == "__main__":
     unittest.main()

paddlenlp/experimental/transformers/llama/modeling.py

Lines changed: 137 additions & 46 deletions
Large diffs are not rendered by default.

paddlenlp/trainer/trainer.py

Lines changed: 202 additions & 66 deletions
Large diffs are not rendered by default.

paddlenlp/trainer/trainer_utils.py

Lines changed: 88 additions & 0 deletions
@@ -37,6 +37,10 @@
 import paddle
 import paddle.distributed as dist
 from paddle.distributed import fleet
+from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import (
+    DygraphShardingOptimizer,
+    DygraphShardingOptimizerV2,
+)
 from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
 from paddle.io import IterableDataset
 from paddle.optimizer.lr import LambdaDecay
@@ -1357,3 +1361,87 @@ def set_comm_config(configs, attr, dict_obj):
     set_comm_config("moe_sharding_configs", "check_nccl_config", nccl_config.get("moe_sharding_check", None))
     set_comm_config("default_comm_group_configs", "nccl_config", nccl_config.get("default", None))
     return strategy
+
+
+def init_optimizer(optimizer, model_sharded_state_dict, state_dict_metadata):
+    """
+    Initialize the optimizer's states according to its type.
+
+    For DygraphShardingOptimizer (V1), initializes accumulators for local parameters.
+    For DygraphShardingOptimizerV2, manually initializes master weights and state dict for sharded parameters.
+    For other cases, initializes accumulators for all parameters.
+
+    Args:
+        optimizer: The optimizer instance to be initialized.
+    """
+    optimizer_state_names = [".moment1_0", ".moment2_0", ".beta1_pow_acc_0", ".beta2_pow_acc_0", ".w_0"]
+    inner_opt = getattr(optimizer, "_inner_opt", None)
+    static_to_struct_mapping = {}
+    model_sharded_state_dict = dict(sorted(model_sharded_state_dict.items()))
+    for k, v in model_sharded_state_dict.items():
+        if v.local_tensor.name not in static_to_struct_mapping:
+            static_to_struct_mapping[v.local_tensor.name] = k
+
+    if isinstance(inner_opt, DygraphShardingOptimizer):
+        local_params = optimizer._rank2params[optimizer._sharding_rank]
+        param_list = []
+        for param in local_params:
+            param_name = param.name
+            struct_name = static_to_struct_mapping[param_name]
+            if not any(struct_name + state_name in state_dict_metadata for state_name in optimizer_state_names):
+                continue
+            param_list.append(param)
+        optimizer._create_accumulators(paddle.base.framework.default_main_program().global_block(), param_list)
+        return
+
+    elif isinstance(inner_opt, DygraphShardingOptimizerV2):
+
+        def init_param_optimizer_states(param_iter):
+            master_weights = {}
+            state_dict = {}
+            moments = ("moment1_0", "moment2_0")
+            betas = ("beta1_pow_acc_0", "beta2_pow_acc_0")
+            for static_name, shape, no_need_master_weights in param_iter:
+                if not no_need_master_weights:
+                    master_weights[static_name] = paddle.zeros(shape, dtype="float32")
+                    prefix = f"{static_name}_fp32_master_0_"
+                else:
+                    prefix = f"{static_name}_"
+
+                for moment in moments:
+                    key = f"{prefix}{moment}"
+                    state_dict[key] = paddle.zeros(shape, dtype="float32")
+                for beta in betas:
+                    key = f"{prefix}{beta}"
+                    state_dict[key] = paddle.zeros((1,), dtype="float32")
+            return master_weights, state_dict
+
+        def buffer_params():
+            for buffer in optimizer._comm_buffer_list:
+                for param_name, grad_view in buffer._sharding_param_grad_view.items():
+                    struct_name = static_to_struct_mapping[param_name]
+                    if not any(
+                        struct_name + state_name in state_dict_metadata for state_name in optimizer_state_names
+                    ):
+                        continue
+                    param_begin = grad_view._param_begin
+                    param_end = grad_view._param_end
+                    shape = (param_end - param_begin,)
+                    no_need_master_weights = grad_view._param.dtype == paddle.float32
+
+                    if shape[0] > 0:
+                        yield param_name, shape, no_need_master_weights
+
+        master_weights, state_dict = init_param_optimizer_states(buffer_params())
+        state_dict["master_weights"] = master_weights
+        state_dict["LR_Scheduler"] = {"last_epoch": 1, "last_lr": 5e-06}
+        optimizer.set_state_dict(state_dict)
+        return
+    param_list = []
+    for param in optimizer._parameter_list:
+        param_name = param.name
+        struct_name = static_to_struct_mapping[param_name]
+        if not any(struct_name + state_name in state_dict_metadata for state_name in optimizer_state_names):
+            continue
+        param_list.append(param)
+    optimizer._create_accumulators(paddle.base.framework.default_main_program().global_block(), param_list)
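
The trainer can call this helper right before loading a FlexCheckpoint so that only the optimizer states actually present in the checkpoint get allocated. Below is a minimal usage sketch; how `model_sharded_state_dict` and `state_dict_metadata` are obtained is an assumption about the FlexCheckpoint loading path, and only `init_optimizer` itself comes from this commit.

# Hypothetical call site (not part of the commit).
from paddlenlp.trainer.trainer_utils import init_optimizer

def warm_up_optimizer_states(optimizer, model_sharded_state_dict, state_dict_metadata):
    # model_sharded_state_dict: {structured_name: sharded_tensor}, where each
    # sharded tensor exposes `.local_tensor.name` (the static parameter name);
    # init_optimizer uses it to map static names back to structured names.
    # state_dict_metadata: the keys stored in the checkpoint, e.g.
    # "llama.embed_tokens.weight.moment1_0"; only parameters with at least one
    # matching optimizer-state suffix get accumulators / master weights created.
    init_optimizer(optimizer, model_sharded_state_dict, state_dict_metadata)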

paddlenlp/trainer/training_args.py

Lines changed: 80 additions & 8 deletions
@@ -407,6 +407,12 @@ class TrainingArguments:
             Whether to release gradients during training. Default is `False`.
         ckpt_quant_stage (`str`, *optional*):
             Whether activate checkpoint quantization. O0: deactivate, O1: Int8 compression, O2: Int4 compression. (default: O0).
+        save_checkpoint_format (`str`, *optional*):
+            Specifies the format for saving checkpoints. Options are: None, 'sharding_io', 'unified_checkpoint', 'flex_checkpoint'. (default: None). This setting is ignored if the corresponding switch is configured.
+        load_checkpoint_format (`str`, *optional*):
+            Specifies the format for loading checkpoints. Options are: None, 'sharding_io', 'unified_checkpoint', 'flex_checkpoint'. (default: None). This setting is ignored if the corresponding switch is configured.
+        aoa_config (`Optional[dict[str, list[str]]]`, *optional*):
+            The AoA configuration of FlexCheckpoint, used to describe the mapping between model weights and the checkpoint content. Default is None.
     """
 
     output_dir: str = field(
@@ -941,6 +947,29 @@ class TrainingArguments:
         default=False,
         metadata={"help": "Whether to use async_save instead of paddle.save."},
     )
+    save_checkpoint_format: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Specifies the format used to save checkpoints. "
+                "Available options: 'sharding_io', 'unified_checkpoint', "
+                "'flex_checkpoint'."
+                "This setting is ignored if the corresponding switch is configured."
+            )
+        },
+    )
+
+    load_checkpoint_format: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Specifies the format used to load checkpoints. "
+                "Available options: 'sharding_io', 'unified_checkpoint', "
+                "'flex_checkpoint'."
+                "This setting is ignored if the corresponding switch is configured."
+            )
+        },
+    )
     ordered_save_group_size: int = field(
         default=0,
         metadata={
@@ -1106,6 +1135,13 @@ class TrainingArguments:
         default=None, metadata={"help": "NCCL中通信组的细粒度控制的配置文件路径, 默认值为None, 代表不启用此项配置"}
     )
 
+    aoa_config: Optional[dict[str, list[str]]] = field(
+        default=None,
+        metadata={
+            "help": "The AoA configuration of FlexCheckpoint, used to describe the mapping between model weights and the checkpoint content. Default is None."
+        },
+    )
+
     def __post_init__(self):
         world_size = paddle.distributed.get_world_size()
         if in_auto_parallel_align_mode():
@@ -1210,7 +1246,8 @@ def __post_init__(self):
             raise ValueError("AdamW Mini currently doesn't support tensor parallelism.")
 
         self._post_init_parallel_degree()
-
+        self._post_init_save_checkpoint_format()
+        self._post_init_load_checkpoint_format()
         if self.to_static:
             assert world_size == 1 or self.enable_auto_parallel, (
                 "It's not supported for training in static mode except the following cases : "
@@ -1864,24 +1901,31 @@ def is_context_parallel_supported():
         else:
             if world_size > 1:
                 if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized():
-                    if self.unified_checkpoint:
+                    if self.save_checkpoint_format in [
+                        "unified_checkpoint",
+                        "flex_checkpoint",
+                    ] or self.load_checkpoint_format in ["unified_checkpoint", "flex_checkpoint"]:
                         # DP use hybrid group
                         strategy = fleet.DistributedStrategy()
                         fleet.init(is_collective=True, strategy=strategy)
                     else:
                         paddle.distributed.init_parallel_env()
 
             if (
-                self.unified_checkpoint
+                (
+                    self.save_checkpoint_format == "unified_checkpoint"
+                    or self.load_checkpoint_format == "unified_checkpoint"
+                )
                 and self.sharding_parallel_degree > 0
                 and ShardingOption.FULL_SHARD in self.sharding
             ):
                 logger.warning(
-                    "Unified checkpoint currently do not support sharding stage3, set `unified_checkpoint` to False."
+                    "Unified checkpoint currently do not support sharding stage3, disabling unified_checkpoint format."
                 )
-                self.unified_checkpoint = False
+                self.save_checkpoint_format = None
+                self.load_checkpoint_format = None
 
-            if self.unified_checkpoint:
+            if self.save_checkpoint_format == "unified_checkpoint" or self.load_checkpoint_format == "unified_checkpoint":
                 unified_checkpoint_config = set(self.unified_checkpoint_config.split(" "))
                 if sys.platform.startswith("win") and "async_save" in self.unified_checkpoint_config:
                     raise ValueError("Currently do not support asynchronous saving for Windows system!")
@@ -2134,6 +2178,30 @@ def _post_init_parallel_degree(self):
         if self.use_hybrid_parallel and self.enable_auto_parallel:
             self.use_hybrid_parallel = False
 
+    def _post_init_save_checkpoint_format(self):
+        if self.save_checkpoint_format:
+            valid_modes = ["unified_checkpoint", "sharding_io", "flex_checkpoint"]
+            assert (
+                self.save_checkpoint_format in valid_modes
+            ), f"Invalid save_checkpoint_format: {self.save_checkpoint_format}, Only these formats are allowed: {valid_modes}."
+        else:
+            if self.unified_checkpoint:
+                self.save_checkpoint_format = "unified_checkpoint"
+            elif self.save_sharded_model:
+                self.save_checkpoint_format = "sharding_io"
+
+    def _post_init_load_checkpoint_format(self):
+        if self.load_checkpoint_format:
+            valid_modes = ["unified_checkpoint", "sharding_io", "flex_checkpoint"]
+            assert (
+                self.load_checkpoint_format in valid_modes
+            ), f"Invalid load_checkpoint_format: {self.load_checkpoint_format}, Only these formats are allowed: {valid_modes}."
+        else:
+            if self.unified_checkpoint:
+                self.load_checkpoint_format = "unified_checkpoint"
+            elif self.load_sharded_model:
+                self.load_checkpoint_format = "sharding_io"
+
     def add_moe_comm_group(self):
         hybrid_configs = fleet.fleet._user_defined_strategy.hybrid_configs
         hcg = fleet.get_hybrid_communicate_group()
@@ -2462,6 +2530,8 @@ def should_save_model_state(self):
             return True
         elif self.enable_auto_parallel:
             return True
+        elif self.save_checkpoint_format == "flex_checkpoint":
+            return False
         elif self.use_hybrid_parallel:
             # save on dataset rank 0
             return self.sharding_parallel_rank == 0 and (self.data_parallel_rank == 0 or self.use_expert_parallel)
@@ -2480,14 +2550,16 @@ def should_save_sharding_stage1_model(self):
         if self.enable_auto_parallel:
             return False
         return (
-            ShardingOption.SHARD_OP in self.sharding and self.sharding_parallel_degree > 1 and self.save_sharded_model
+            ShardingOption.SHARD_OP in self.sharding
+            and self.sharding_parallel_degree > 1
+            and self.save_checkpoint_format == "sharding_io"
         )
 
     @property
     def should_load_sharding_stage1_model(self):
         if self.enable_auto_parallel:
             return False
-        return self.load_sharded_model
+        return self.load_checkpoint_format == "sharding_io"
 
     @property
     def should_load_dataset(self):
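
The new save_checkpoint_format / load_checkpoint_format fields coexist with the older unified_checkpoint, save_sharded_model, and load_sharded_model switches: _post_init_save_checkpoint_format and _post_init_load_checkpoint_format validate an explicit value and otherwise derive the format from those switches. Below is a minimal sketch of setting the fields when constructing TrainingArguments directly; the aoa_config value is illustrative only and a single-process run is assumed.

# Hedged sketch, not taken from the commit.
from paddlenlp.trainer import TrainingArguments

args = TrainingArguments(
    output_dir="./checkpoints",
    save_checkpoint_format="flex_checkpoint",  # 'sharding_io' | 'unified_checkpoint' | 'flex_checkpoint'
    load_checkpoint_format="flex_checkpoint",
    aoa_config={"weight_mapping": ["model.embed_tokens.weight <- embedding.weight"]},  # illustrative value
)

# Leaving the formats unset falls back to the legacy switches in __post_init__:
# unified_checkpoint=True resolves both formats to "unified_checkpoint", while
# save_sharded_model / load_sharded_model resolve them to "sharding_io".
legacy = TrainingArguments(output_dir="./checkpoints", unified_checkpoint=True)
print(legacy.save_checkpoint_format, legacy.load_checkpoint_format)  # unified_checkpoint unified_checkpoint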
