PaddlePaddle · ZHUI · May 22, 2025 · Apr 24, 2025 · Apr 25, 2025 · Apr 29, 2025
diff --git a/docs/zh/locale/en/LC_MESSAGES/source/paddlenlp.taskflow.utils.po b/docs/zh/locale/en/LC_MESSAGES/source/paddlenlp.taskflow.utils.po
@@ -276,7 +276,7 @@ msgid "word of current node."
 msgstr ""
 
 #: of paddlenlp.taskflow.utils.BurkhardKellerTree:1
-msgid "Implementataion of BK-Tree"
+msgid "Implementation of BK-Tree"
 msgstr ""
 
 #: of paddlenlp.taskflow.utils.BurkhardKellerTree.add:1
@@ -300,7 +300,7 @@ msgid "similar words."
 msgstr ""
 
 #: of paddlenlp.taskflow.utils.TriedTree:1
-msgid "Implementataion of TriedTree"
+msgid "Implementation of TriedTree"
 msgstr ""
 
 #: of paddlenlp.taskflow.utils.TriedTree.add_word:1

diff --git a/docs/zh/locale/en/LC_MESSAGES/source/paddlenlp.transformers.squeezebert.tokenizer.po b/docs/zh/locale/en/LC_MESSAGES/source/paddlenlp.transformers.squeezebert.tokenizer.po
@@ -50,7 +50,7 @@ msgid ""
 msgstr ""
 
 #: of paddlenlp.transformers.squeezebert.tokenizer.SqueezeBertTokenizer:11
-msgid "The special token for unkown words. Default: \"[UNK]\"."
+msgid "The special token for unknown words. Default: \"[UNK]\"."
 msgstr ""
 
 #: of paddlenlp.transformers.squeezebert.tokenizer.SqueezeBertTokenizer:13

diff --git a/llm/experimental/ernie-3.5-se/modeling.py b/llm/experimental/ernie-3.5-se/modeling.py
@@ -1380,12 +1380,12 @@ def forward(
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         def progressive_seq(x, y):
-            globel_step = int(os.getenv("TRAINER_GLOBAL_STEP", "0"))
-            if globel_step < 500:
+            global_step = int(os.getenv("TRAINER_GLOBAL_STEP", "0"))
+            if global_step < 500:
                 return x[:, :512], y[:, :512]
-            if globel_step < 1000:
+            if global_step < 1000:
                 return x[:, :1024], y[:, :1024]
-            if globel_step < 1500:
+            if global_step < 1500:
                 return x[:, :2048], y[:, :2048]
             return x, y
 

diff --git a/paddlenlp/data/blendable_dataset.py b/paddlenlp/data/blendable_dataset.py
@@ -43,7 +43,7 @@ def __init__(self, datasets, weights, size, share_folder, *, data_cache_path=Non
         assert sum_weights > 0.0
         weights /= sum_weights
 
-        # Build indicies.
+        # Build indices.
         def _build_indices():
             start_time = time.time()
 

diff --git a/paddlenlp/data/causal_dataset.py b/paddlenlp/data/causal_dataset.py
@@ -677,7 +677,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
                 doc_offset += remaining_seq_length + doc_length - 1
                 remaining_seq_length = 0
             else:
-                # Otherwise, start from the begining of the next document.
+                # Otherwise, start from the beginning of the next document.
                 doc_idx_index += 1
                 doc_offset = 0
         # Record the sequence.

diff --git a/paddlenlp/data/indexed_dataset.py b/paddlenlp/data/indexed_dataset.py
@@ -55,7 +55,7 @@
 
 def make_dataset(path, impl, skip_warmup=False):
     if CompatibleIndexedDataset.exists(path):
-        print("Using old dataet (.npy & .npz)")
+        print("Using old dataset (.npy & .npz)")
         return CompatibleIndexedDataset(path)
     elif not IndexedDataset.exists(path):
         print(f"Dataset does not exist: {path}")
@@ -903,7 +903,7 @@
 
         self._path = path
 
-        # All documment ids, extend as 1-D array.
+        # All document ids, extend as 1-D array.
         self._token_ids = np.load(path + "_ids.npy", mmap_mode="r", allow_pickle=True)
         process_data = np.load(path + "_idx.npz")
         self._sizes = process_data["lens"]

diff --git a/paddlenlp/data/tokenizer.py b/paddlenlp/data/tokenizer.py
@@ -58,7 +58,7 @@ def cut(self, sentence, cut_all=False, use_hmm=True):
         The method used to cut the text to tokens.
 
         Args:
-            sentence(str): The text that needs to be cuted.
+            sentence(str): The text that needs to be cut.
             cut_all(bool, optional): Whether to use the full mode. If True,
                 using full mode that gets all the possible words from the
                 sentence, which is fast but not accurate. If False, using
@@ -97,7 +97,7 @@ def encode(self, sentence, cut_all=False, use_hmm=True):
         ids using `vocab`.
 
         Args:
-            sentence(str): The text that needs to be cuted.
+            sentence(str): The text that needs to be cut.
             cut_all(bool, optional): Whether to use the full mode. If True,
                 using full mode that gets all the possible words from the
                 sentence, which is fast but not accurate. If False, using

diff --git a/paddlenlp/data/vocab.py b/paddlenlp/data/vocab.py
@@ -40,7 +40,7 @@ class Vocab(object):
             between tokens and indices to be used. If provided, adjust the tokens
             and indices mapping according to it. If None, counter must be provided.
             Default: None.
-        unk_token (str, optional): Special token for unknow token. If no need,
+        unk_token (str, optional): Special token for unknown token. If no need,
             it also could be None. Default: None.
         pad_token (str, optional): Special token for padding token. If no need,
             it also could be None. Default: None.
@@ -214,7 +214,7 @@ def to_tokens(self, indices):
         for idx in indices:
             if not isinstance(idx, (int, np.integer)):
                 warnings.warn(
-                    "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transfered to `int`. "
+                    "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transferred to `int`. "
                 )
                 idx = int(idx)
 
@@ -382,7 +382,7 @@ def from_dict(cls, token_to_idx, unk_token=None, pad_token=None, bos_token=None,
         Args:
             token_to_idx (dict): A dict describes the mapping relationship between
                 tokens and indices.
-            unk_token (str, optional): The special token for unknow token. If
+            unk_token (str, optional): The special token for unknown token. If
                 no need, it also could be None. Default: None.
             pad_token (str, optional): The special token for padding token. If
                 no need, it also could be None. Default: None.
@@ -440,7 +440,7 @@ def build_vocab(
         **kwargs
     ):
         """
-        Builds the :class:`Vocab` accoring to given iterator and other
+        Builds the :class:`Vocab` according to given iterator and other
         information. Firstly, iterate over the `iterator` to construct a
         :class:`collections.Counter` and used to init the as  :class:`Vocab`.
 
@@ -455,7 +455,7 @@ def build_vocab(
                 relationship between tokens and indices to be used. If provided,
                 adjust the tokens and indices mapping according to it. If None,
                 counter must be provided. Default: None.
-            unk_token (str, optional): The special token for unknow token
+            unk_token (str, optional): The special token for unknown token
                 '<unk>'. If no need, it also could be None. Default: None.
             pad_token (str, optional): The special token for padding token
                 '<pad>'. If no need, it also could be None. Default: None.

diff --git a/paddlenlp/datasets/dataset.py b/paddlenlp/datasets/dataset.py
@@ -448,7 +448,7 @@
                 num_samples += 1
         else:
             if inspect.isgenerator(self.data):
-                warnings.warn("Reciving generator as data source, data can only be iterated once")
+                warnings.warn("Receiving generator as data source, data can only be iterated once")
             for example in self.data:
                 if (not self._filter_pipline or self._filter(self._filter_pipline)) and self._shard_filter(
                     num_samples=num_samples
@@ -580,7 +580,7 @@
                 lock_files.append(lock_file)
             # Must register to all procs to make the lock file can be removed
             # when any proc breaks. Otherwise, the single registered proc may
-            # not receive proper singal send by the parent proc to exit.
+            # not receive proper signal sent by the parent proc to exit.
             atexit.register(lambda: remove_if_exit(lock_files))
             for split in splits:
                 filename = self._get_data(split)

diff --git a/paddlenlp/datasets/hf_datasets/docvqa_zh.py b/paddlenlp/datasets/hf_datasets/docvqa_zh.py
@@ -16,17 +16,17 @@
 
 # Lint as: python3
 
-import os
-import json
 import hashlib
+import json
+import os
 
 import datasets
 
 logger = datasets.logging.get_logger(__name__)
 
 _DESCRIPTION = """\
 The training set from the competition of Insurance DocVQA organized by China Pacific Insurance. \
-The submission is now closed so we split original dataset into three parts for model evluation. \
+The submission is now closed so we split original dataset into three parts for model evaluation. \
 There are 4,187 training images, 500 validation images, and 500 test images.
 """
 

diff --git a/paddlenlp/datasets/rlhf_datasets/protocol.py b/paddlenlp/datasets/rlhf_datasets/protocol.py
@@ -393,7 +393,7 @@ def pop(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_keys=None)
             meta_info_keys (list, optional): a list of keys indicating the meta info to pop
 
         Returns:
-            DataProto: the DataProto with the poped batch_keys and meta_info_keys
+            DataProto: the DataProto with the popped batch_keys and meta_info_keys
         """
         assert batch_keys is not None
         if meta_info_keys is None:

diff --git a/paddlenlp/datasets/rlhf_datasets/rl_dataset.py b/paddlenlp/datasets/rlhf_datasets/rl_dataset.py
@@ -40,7 +40,7 @@ def padding_batch_data(
     input_dict = {}
 
     input_ids = [sample["input_ids"] for sample in samples]
-    # TODO(drownfish19): confim if this is correct
+    # TODO(drownfish19): confirm if this is correct
     # attention_mask = [np.ones(input_id.shape, dtype=bool) for input_id in input_ids]
     input_dict["input_ids"] = left_padding(input_ids, padding_value=pad_token_id, max_length=max_prompt_len)
     # input_dict["attention_mask"] = left_padding(attention_mask, padding_value=0)

diff --git a/paddlenlp/datasets/thucnews.py b/paddlenlp/datasets/thucnews.py
@@ -24,7 +24,7 @@
 class THUCNews(DatasetBuilder):
     """
     A subset of THUCNews dataset. THUCNews is a text classification dataset.
-    See descrition about this subset version at https://github.com/gaussic/text-classification-cnn-rnn#%E6%95%B0%E6%8D%AE%E9%9B%86
+    See description about this subset version at https://github.com/gaussic/text-classification-cnn-rnn#%E6%95%B0%E6%8D%AE%E9%9B%86
     The whole dataset can be downloaded at https://thunlp.oss-cn-qingdao.aliyuncs.com/THUCNews.zip
     """
 

diff --git a/paddlenlp/datasets/xnli_cn.py b/paddlenlp/datasets/xnli_cn.py
@@ -29,7 +29,7 @@ class XNLI_CN(DatasetBuilder):
     XNLI dataset for chinese.
 
     XNLI is an evaluation corpus for language transfer and cross-lingual
-    sentence classification in 15 languages. Here, XNLI only contrains
+    sentence classification in 15 languages. Here, XNLI only contains
     chinese corpus.
 
     For more information, please visit https://github.com/facebookresearch/XNLI

diff --git a/paddlenlp/datasets/zero_padding_dataset.py b/paddlenlp/datasets/zero_padding_dataset.py
@@ -88,7 +88,7 @@ def _pad_batch_records(cls, batch_records):
                 attention_mask = record.get("attention_mask", np.tril(np.ones([seq_length, seq_length], dtype=bool)))
                 batched_features["attention_mask"].append(attention_mask)
             # NOTE: position_ids is optional and not required by every model
-            # We append instead of extend here to accomodate 2D position ids
+            # We append instead of extend here to accommodate 2D position ids
             if "position_ids" in record:
                 batched_features["position_ids"].append(record["position_ids"])
             sequence_sum += seq_length
@@ -98,7 +98,7 @@ def _pad_batch_records(cls, batch_records):
             # convert to 3-D [batch_size(1), seq_length, seq_length]
             batched_features["attention_mask"] = np.expand_dims(block_attention_mask, axis=0)
         if "position_ids" in batched_features:
-            # Accomodate both 1D and 2D position ids
+            # Accommodate both 1D and 2D position ids
             batched_features["position_ids"] = np.concatenate(batched_features["position_ids"], axis=-1).tolist()
         return batched_features
 

diff --git a/paddlenlp/ops/distributed/utils/topo.py b/paddlenlp/ops/distributed/utils/topo.py
@@ -49,8 +49,8 @@
         self.world = GroupInfo(size=world_size, rank=device_rank, world=list(range(0, world_size)))
         worlds = []
         for i in range(len(ranks)):
-            indexs = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :])
-            worlds.append(arr[indexs])
+            indexes = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :])
+            worlds.append(arr[indexes])
 
         for i, key in enumerate(self.order):
             if key == "dp":

diff --git a/paddlenlp/ops/triton_ops/triton_utils.py b/paddlenlp/ops/triton_ops/triton_utils.py
@@ -621,7 +621,7 @@ def decorator(*args, **kwargs):
             op_dict = {"op_name": op_name, "reset_zero_when_tune": ""}
             op_dict["triton_kernel_args"] = ",".join(modified_arg_exclude_constexpr)
             op_dict["key"] = ",".join(self.key_args)
-            # when tunning, we need to reset the out to zero.
+            # when tuning, we need to reset the out to zero.
             if "reset_zero_when_tune" in other_config.keys():
                 op_dict["reset_zero_when_tune"] = other_config["reset_zero_when_tune"]
 

diff --git a/paddlenlp/quantization/quantization_config.py b/paddlenlp/quantization/quantization_config.py
@@ -30,7 +30,7 @@ class QuantizationConfig:
     This is the configuration class to store quantization configuration.
     Args:
         weight_quantize_algo: Weight quantization algorithm.
-        quant_type: Quantization type appplied to weight and activation, weight may still keep in float tensor.
+        quant_type: Quantization type applied to weight and activation, weight may still keep in float tensor.
         shift: Whether the model applied the shift strategy.
         smooth: Whether the model applied the smooth strategy.
         shift_smooth_all_linears: Whether the model applied shift or smooth strategy for all linears.

diff --git a/paddlenlp/rl/trainer/ppo_trainer.py b/paddlenlp/rl/trainer/ppo_trainer.py
@@ -267,7 +267,7 @@ def __init__(
                 "pipeline_parallel_degree": 1,  # workaround for pipeline parallel model check
             },
         ):
-            # just used to create trival attrs might be used in the training
+            # just used to create trivial attrs that might be used in the training
             # process of trainer, while changing some args to avoid model usage
             # in __init__ such as recompute and AMP-O2
             super().__init__(

diff --git a/paddlenlp/taskflow/dialogue.py b/paddlenlp/taskflow/dialogue.py
@@ -155,7 +155,7 @@
                 inputs = [list(self.context)]
                 return inputs
             else:
-                raise ValueError("In the interactive mode, the input data shold be a string")
+                raise ValueError("In the interactive mode, the input data should be a string")
         elif not isinstance(inputs[0], list):
             raise ValueError("If not in the interactive mode, the input data should be a list.")
         return inputs

diff --git a/paddlenlp/taskflow/document_intelligence.py b/paddlenlp/taskflow/document_intelligence.py
@@ -50,7 +50,7 @@
 
 class DocPromptTask(Task):
     """
-    The document intelligence model, give the querys and predict the answers.
+    The document intelligence model, give the queries and predict the answers.
     Args:
         task(string): The name of task.
         model(string): The model name in the task.

diff --git a/paddlenlp/taskflow/task.py b/paddlenlp/taskflow/task.py
@@ -33,7 +33,7 @@
 
 class Task(metaclass=abc.ABCMeta):
     """
-    The meta classs of task in Taskflow. The meta class has the five abstract function,
+    The meta class of task in Taskflow. The meta class has five abstract functions,
         the subclass need to inherit from the meta class.
     Args:
         task(string): The name of task.

diff --git a/paddlenlp/taskflow/text_classification.py b/paddlenlp/taskflow/text_classification.py
@@ -108,7 +108,7 @@ def softmax(x, axis=None):
 
 class TextClassificationTask(Task):
     """
-    The text classfication model to classify text.
+    The text classification model to classify text.
     NOTE: This task is different from all other tasks that it has no out-of-box zero-shot capabilities.
     Instead, it's used as a simple inference pipeline.
 
@@ -122,7 +122,7 @@ class TextClassificationTask(Task):
             multilabel_threshold (float): The probability threshold used for the multi_label setup. Only effective if model = "multi_label". Defaults to 0.5.
             max_length (int): Maximum number of tokens for the model.
             precision (int): Select among ["fp32", "fp16"]. Default to "fp32".
-            plm_model_name (str): Pretrained langugae model name for PromptModel.
+            plm_model_name (str): Pretrained language model name for PromptModel.
             input_spec [list]: Specify the tensor information for each input parameter of the forward function.
             id2label(dict(int,string)): The dictionary to map the predictions from class ids to class names.
             batch_size(int): The sample number of a mini-batch.
@@ -171,7 +171,7 @@ def _construct_input_spec(self):
                     init_class = json.load(fb)["architectures"].pop()
             else:
                 raise IOError(
-                    f"Model configuration file dosen't exist.[task_path] should inclue {LEGACY_CONFIG_NAME} or {CONFIG_NAME}"
+                    f"Model configuration file doesn't exist.[task_path] should include {LEGACY_CONFIG_NAME} or {CONFIG_NAME}"
                 )
 
             if init_class in ["ErnieMForSequenceClassification"]:
@@ -286,7 +286,7 @@ def _run_model(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         """
         Run the task model from the outputs of the `_tokenize` function.
         """
-        # TODO: support hierachical classification
+        # TODO: support hierarchical classification
         outputs = {}
         outputs["text"] = inputs["text"]
         outputs["batch_logits"] = []
@@ -326,7 +326,7 @@ def _postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         """
         This function converts the model logits output to class score and predictions
         """
-        # TODO: support hierachical classification
+        # TODO: support hierarchical classification
         postprocessed_outputs = []
         for logits in inputs["batch_logits"]:
             if self.problem_type == "multi_class":

diff --git a/paddlenlp/taskflow/text_feature_extraction.py b/paddlenlp/taskflow/text_feature_extraction.py
@@ -424,7 +424,7 @@ def _parse_batch(batch_examples, max_seq_len=None):
             )
             return tokenized_inputs
 
-        # Seperates data into some batches.
+        # Separates data into some batches.
         one_batch = []
         self.length_sorted_idx = np.argsort([-text_length(sen) for sen in data])
         sentences_sorted = [data[idx] for idx in self.length_sorted_idx]