Commit 91aaa05

Fix
1 parent c2c41a3 commit 91aaa05

198 files changed, +361 -361 lines changed

docs/zh/locale/en/LC_MESSAGES/source/paddlenlp.taskflow.utils.po

Lines changed: 2 additions & 2 deletions
@@ -276,7 +276,7 @@ msgid "word of current node."
 msgstr ""

 #: of paddlenlp.taskflow.utils.BurkhardKellerTree:1
-msgid "Implementataion of BK-Tree"
+msgid "Implementation of BK-Tree"
 msgstr ""

 #: of paddlenlp.taskflow.utils.BurkhardKellerTree.add:1
@@ -300,7 +300,7 @@ msgid "similar words."
 msgstr ""

 #: of paddlenlp.taskflow.utils.TriedTree:1
-msgid "Implementataion of TriedTree"
+msgid "Implementation of TriedTree"
 msgstr ""

 #: of paddlenlp.taskflow.utils.TriedTree.add_word:1

docs/zh/locale/en/LC_MESSAGES/source/paddlenlp.transformers.squeezebert.tokenizer.po

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ msgid ""
 msgstr ""

 #: of paddlenlp.transformers.squeezebert.tokenizer.SqueezeBertTokenizer:11
-msgid "The special token for unkown words. Default: \"[UNK]\"."
+msgid "The special token for unknown words. Default: \"[UNK]\"."
 msgstr ""

 #: of paddlenlp.transformers.squeezebert.tokenizer.SqueezeBertTokenizer:13

llm/experimental/ernie-3.5-se/modeling.py

Lines changed: 4 additions & 4 deletions
@@ -1380,12 +1380,12 @@ def forward(
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

         def progressive_seq(x, y):
-            globel_step = int(os.getenv("TRAINER_GLOBAL_STEP", "0"))
-            if globel_step < 500:
+            global_step = int(os.getenv("TRAINER_GLOBAL_STEP", "0"))
+            if global_step < 500:
                 return x[:, :512], y[:, :512]
-            if globel_step < 1000:
+            if global_step < 1000:
                 return x[:, :1024], y[:, :1024]
-            if globel_step < 1500:
+            if global_step < 1500:
                 return x[:, :2048], y[:, :2048]
             return x, y
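
The hunk above only renames a misspelled local (globel_step -> global_step); the behavior is unchanged. For readers unfamiliar with the pattern, progressive_seq implements a progressive sequence-length warmup: early in training the inputs and labels are truncated to short sequences, and the full length is only used once the trainer passes step 1500. Below is a minimal standalone sketch of that behavior (the thresholds and slicing follow the diff; the NumPy driver, toy shapes, and step value are illustrative assumptions, not part of the commit):

import os
import numpy as np

def progressive_seq(x, y):
    # Truncate (x, y) along the sequence axis according to the trainer's global step.
    global_step = int(os.getenv("TRAINER_GLOBAL_STEP", "0"))
    if global_step < 500:
        return x[:, :512], y[:, :512]
    if global_step < 1000:
        return x[:, :1024], y[:, :1024]
    if global_step < 1500:
        return x[:, :2048], y[:, :2048]
    return x, y

# Example: at step 100 only the first 512 tokens of each sample are kept.
os.environ["TRAINER_GLOBAL_STEP"] = "100"
input_ids = np.zeros((2, 4096), dtype=np.int64)
labels = np.zeros((2, 4096), dtype=np.int64)
ids_cut, labels_cut = progressive_seq(input_ids, labels)
print(ids_cut.shape, labels_cut.shape)  # (2, 512) (2, 512)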

paddlenlp/data/blendable_dataset.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ def __init__(self, datasets, weights, size, share_folder, *, data_cache_path=Non
         assert sum_weights > 0.0
         weights /= sum_weights

-        # Build indicies.
+        # Build indices.
         def _build_indices():
             start_time = time.time()

paddlenlp/data/causal_dataset.py

Lines changed: 1 addition & 1 deletion
@@ -677,7 +677,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
                 doc_offset += remaining_seq_length + doc_length - 1
                 remaining_seq_length = 0
             else:
-                # Otherwise, start from the begining of the next document.
+                # Otherwise, start from the beginning of the next document.
                 doc_idx_index += 1
                 doc_offset = 0
         # Record the sequence.

paddlenlp/data/indexed_dataset.py

Lines changed: 2 additions & 2 deletions
@@ -55,7 +55,7 @@ def get_available_dataset_impl():

 def make_dataset(path, impl, skip_warmup=False):
     if CompatibleIndexedDataset.exists(path):
-        print("Using old dataet (.npy & .npz)")
+        print("Using old dataset (.npy & .npz)")
         return CompatibleIndexedDataset(path)
     elif not IndexedDataset.exists(path):
         print(f"Dataset does not exist: {path}")
@@ -903,7 +903,7 @@ def __init__(self, path):

         self._path = path

-        # All documment ids, extend as 1-D array.
+        # All document ids, extend as 1-D array.
         self._token_ids = np.load(path + "_ids.npy", mmap_mode="r", allow_pickle=True)
         process_data = np.load(path + "_idx.npz")
         self._sizes = process_data["lens"]

paddlenlp/data/tokenizer.py

Lines changed: 2 additions & 2 deletions
@@ -58,7 +58,7 @@ def cut(self, sentence, cut_all=False, use_hmm=True):
         The method used to cut the text to tokens.

         Args:
-            sentence(str): The text that needs to be cuted.
+            sentence(str): The text that needs to be cut.
             cut_all(bool, optional): Whether to use the full mode. If True,
                 using full mode that gets all the possible words from the
                 sentence, which is fast but not accurate. If False, using
@@ -97,7 +97,7 @@ def encode(self, sentence, cut_all=False, use_hmm=True):
             ids using `vocab`.

         Args:
-            sentence(str): The text that needs to be cuted.
+            sentence(str): The text that needs to be cut.
             cut_all(bool, optional): Whether to use the full mode. If True,
                 using full mode that gets all the possible words from the
                 sentence, which is fast but not accurate. If False, using

paddlenlp/data/vocab.py

Lines changed: 5 additions & 5 deletions
@@ -40,7 +40,7 @@ class Vocab(object):
            between tokens and indices to be used. If provided, adjust the tokens
            and indices mapping according to it. If None, counter must be provided.
            Default: None.
-        unk_token (str, optional): Special token for unknow token. If no need,
+        unk_token (str, optional): Special token for unknown token. If no need,
            it also could be None. Default: None.
        pad_token (str, optional): Special token for padding token. If no need,
            it also could be None. Default: None.
@@ -214,7 +214,7 @@ def to_tokens(self, indices):
            for idx in indices:
                if not isinstance(idx, (int, np.integer)):
                    warnings.warn(
-                        "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transfered to `int`. "
+                        "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transferred to `int`. "
                    )
                    idx = int(idx)

@@ -382,7 +382,7 @@ def from_dict(cls, token_to_idx, unk_token=None, pad_token=None, bos_token=None,
        Args:
            token_to_idx (dict): A dict describes the mapping relationship between
                tokens and indices.
-            unk_token (str, optional): The special token for unknow token. If
+            unk_token (str, optional): The special token for unknown token. If
                no need, it also could be None. Default: None.
            pad_token (str, optional): The special token for padding token. If
                no need, it also could be None. Default: None.
@@ -440,7 +440,7 @@ def build_vocab(
        **kwargs
    ):
        """
-        Builds the :class:`Vocab` accoring to given iterator and other
+        Builds the :class:`Vocab` according to given iterator and other
        information. Firstly, iterate over the `iterator` to construct a
        :class:`collections.Counter` and used to init the as :class:`Vocab`.

@@ -455,7 +455,7 @@ def build_vocab(
            relationship between tokens and indices to be used. If provided,
            adjust the tokens and indices mapping according to it. If None,
            counter must be provided. Default: None.
-        unk_token (str, optional): The special token for unknow token
+        unk_token (str, optional): The special token for unknown token
            '<unk>'. If no need, it also could be None. Default: None.
        pad_token (str, optional): The special token for padding token
            '<pad>'. If no need, it also could be None. Default: None.

paddlenlp/datasets/dataset.py

Lines changed: 2 additions & 2 deletions
@@ -448,7 +448,7 @@ def __iter__(self):
                     num_samples += 1
         else:
             if inspect.isgenerator(self.data):
-                warnings.warn("Reciving generator as data source, data can only be iterated once")
+                warnings.warn("Receiving generator as data source, data can only be iterated once")
             for example in self.data:
                 if (not self._filter_pipline or self._filter(self._filter_pipline)) and self._shard_filter(
                     num_samples=num_samples
@@ -580,7 +580,7 @@ def remove_if_exit(filepath):
             lock_files.append(lock_file)
             # Must register to all procs to make the lock file can be removed
             # when any proc breaks. Otherwise, the single registered proc may
-            # not receive proper singal send by the parent proc to exit.
+            # not receive proper signal send by the parent proc to exit.
             atexit.register(lambda: remove_if_exit(lock_files))
             for split in splits:
                 filename = self._get_data(split)

paddlenlp/datasets/hf_datasets/docvqa_zh.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@

 _DESCRIPTION = """\
 The training set from the competition of Insurance DocVQA organized by China Pacific Insurance. \
-The submission is now closed so we split original dataset into three parts for model evluation. \
+The submission is now closed so we split original dataset into three parts for model evaluation. \
 There are 4,187 training images, 500 validation images, and 500 test images.
 """

paddlenlp/datasets/rlhf_datasets/protocol.py

Lines changed: 1 addition & 1 deletion
@@ -393,7 +393,7 @@ def pop(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_keys=None)
             meta_info_keys (list, optional): a list of keys indicating the meta info to pop

         Returns:
-            DataProto: the DataProto with the poped batch_keys and meta_info_keys
+            DataProto: the DataProto with the popped batch_keys and meta_info_keys
         """
         assert batch_keys is not None
         if meta_info_keys is None:

paddlenlp/datasets/rlhf_datasets/rl_dataset.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def padding_batch_data(
     input_dict = {}

     input_ids = [sample["input_ids"] for sample in samples]
-    # TODO(drownfish19): confim if this is correct
+    # TODO(drownfish19): confirm if this is correct
     # attention_mask = [np.ones(input_id.shape, dtype=bool) for input_id in input_ids]
     input_dict["input_ids"] = left_padding(input_ids, padding_value=pad_token_id, max_length=max_prompt_len)
     # input_dict["attention_mask"] = left_padding(attention_mask, padding_value=0)

paddlenlp/datasets/thucnews.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@
 class THUCNews(DatasetBuilder):
     """
     A subset of THUCNews dataset. THUCNews is a text classification dataset.
-    See descrition about this subset version at https://github.com/gaussic/text-classification-cnn-rnn#%E6%95%B0%E6%8D%AE%E9%9B%86
+    See description about this subset version at https://github.com/gaussic/text-classification-cnn-rnn#%E6%95%B0%E6%8D%AE%E9%9B%86
     The whole dataset can be downloaded at https://thunlp.oss-cn-qingdao.aliyuncs.com/THUCNews.zip
     """

paddlenlp/datasets/xnli_cn.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ class XNLI_CN(DatasetBuilder):
     XNLI dataset for chinese.

     XNLI is an evaluation corpus for language transfer and cross-lingual
-    sentence classification in 15 languages. Here, XNLI only contrains
+    sentence classification in 15 languages. Here, XNLI only contains
     chinese corpus.

     For more information, please visit https://github.com/facebookresearch/XNLI

paddlenlp/datasets/zero_padding_dataset.py

Lines changed: 2 additions & 2 deletions
@@ -88,7 +88,7 @@ def _pad_batch_records(cls, batch_records):
             attention_mask = record.get("attention_mask", np.tril(np.ones([seq_length, seq_length], dtype=bool)))
             batched_features["attention_mask"].append(attention_mask)
             # NOTE: position_ids is optional and not required by every model
-            # We append instead of extend here to accomodate 2D position ids
+            # We append instead of extend here to accommodate 2D position ids
             if "position_ids" in record:
                 batched_features["position_ids"].append(record["position_ids"])
             sequence_sum += seq_length
@@ -98,7 +98,7 @@ def _pad_batch_records(cls, batch_records):
         # convert to 3-D [batch_size(1), seq_length, seq_length]
         batched_features["attention_mask"] = np.expand_dims(block_attention_mask, axis=0)
         if "position_ids" in batched_features:
-            # Accomodate both 1D and 2D position ids
+            # Accommodate both 1D and 2D position ids
             batched_features["position_ids"] = np.concatenate(batched_features["position_ids"], axis=-1).tolist()
         return batched_features

paddlenlp/ops/distributed/utils/topo.py

Lines changed: 2 additions & 2 deletions
@@ -49,8 +49,8 @@ def __init__(
         self.world = GroupInfo(size=world_size, rank=device_rank, world=list(range(0, world_size)))
         worlds = []
         for i in range(len(ranks)):
-            indexs = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :])
-            worlds.append(arr[indexs])
+            indexes = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :])
+            worlds.append(arr[indexes])

         for i, key in enumerate(self.order):
             if key == "dp":
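
This hunk only renames indexs to indexes; the logic is untouched. What the loop does is select, for each parallelism axis, the ranks that share this rank's coordinates on every other axis, which is exactly the communication group along that axis. A small NumPy sketch of the same slicing trick (the 2 x 2 x 2 topology and the coordinates below are made-up values for illustration, not from the source):

import numpy as np

# Global ranks laid out over a 3-D topology (e.g. dp x pp x mp = 2 x 2 x 2).
arr = np.arange(8).reshape(2, 2, 2)
ranks = [1, 0, 1]  # this rank's coordinate on each axis (its global rank is arr[1, 0, 1] == 5)

worlds = []
for i in range(len(ranks)):
    # Fix every coordinate except axis i; slice(None) keeps all ranks along axis i.
    indexes = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :])
    worlds.append(arr[indexes])

print([w.tolist() for w in worlds])  # [[1, 5], [5, 7], [4, 5]], and rank 5 sits in each group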

paddlenlp/ops/triton_ops/triton_utils.py

Lines changed: 1 addition & 1 deletion
@@ -621,7 +621,7 @@ def decorator(*args, **kwargs):
             op_dict = {"op_name": op_name, "reset_zero_when_tune": ""}
             op_dict["triton_kernel_args"] = ",".join(modified_arg_exclude_constexpr)
             op_dict["key"] = ",".join(self.key_args)
-            # when tunning, we need to reset the out to zero.
+            # when tuning, we need to reset the out to zero.
             if "reset_zero_when_tune" in other_config.keys():
                 op_dict["reset_zero_when_tune"] = other_config["reset_zero_when_tune"]

paddlenlp/quantization/quantization_config.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ class QuantizationConfig:
     This is the configuration class to store quantization configuration.
     Args:
         weight_quantize_algo: Weight quantization algorithm.
-        quant_type: Quantization type appplied to weight and activation, weight may still keep in float tensor.
+        quant_type: Quantization type applied to weight and activation, weight may still keep in float tensor.
         shift: Whether the model applied the shift strategy.
         smooth: Whether the model applied the smooth strategy.
         shift_smooth_all_linears: Whether the model applied shift or smooth strategy for all linears.

paddlenlp/rl/trainer/ppo_trainer.py

Lines changed: 1 addition & 1 deletion
@@ -260,7 +260,7 @@ def __init__(
                 "pipeline_parallel_degree": 1, # workaround for pipeline parallel model check
             },
         ):
-            # just used to create trival attrs might be used in the training
+            # just used to create trivial attrs might be used in the training
             # process of trainer, while changing some args to avoid model usage
             # in __init__ such as recompute and AMP-O2
             super().__init__(

paddlenlp/taskflow/dialogue.py

Lines changed: 1 addition & 1 deletion
@@ -155,7 +155,7 @@ def _check_input_text(self, inputs):
                 inputs = [list(self.context)]
                 return inputs
             else:
-                raise ValueError("In the interactive mode, the input data shold be a string")
+                raise ValueError("In the interactive mode, the input data should be a string")
         elif not isinstance(inputs[0], list):
             raise ValueError("If not in the interactive mode, the input data should be a list.")
         return inputs

paddlenlp/taskflow/document_intelligence.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@

 class DocPromptTask(Task):
     """
-    The document intelligence model, give the querys and predict the answers.
+    The document intelligence model, give the queries and predict the answers.
     Args:
         task(string): The name of task.
         model(string): The model name in the task.

paddlenlp/taskflow/task.py

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@

 class Task(metaclass=abc.ABCMeta):
     """
-    The meta classs of task in Taskflow. The meta class has the five abstract function,
+    The meta class of task in Taskflow. The meta class has the five abstract function,
     the subclass need to inherit from the meta class.
     Args:
         task(string): The name of task.

paddlenlp/taskflow/text_classification.py

Lines changed: 5 additions & 5 deletions
@@ -108,7 +108,7 @@ def softmax(x, axis=None):

 class TextClassificationTask(Task):
     """
-    The text classfication model to classify text.
+    The text classification model to classify text.
     NOTE: This task is different from all other tasks that it has no out-of-box zero-shot capabilities.
     Instead, it's used as a simple inference pipeline.

@@ -122,7 +122,7 @@ class TextClassificationTask(Task):
         multilabel_threshold (float): The probability threshold used for the multi_label setup. Only effective if model = "multi_label". Defaults to 0.5.
         max_length (int): Maximum number of tokens for the model.
         precision (int): Select among ["fp32", "fp16"]. Default to "fp32".
-        plm_model_name (str): Pretrained langugae model name for PromptModel.
+        plm_model_name (str): Pretrained language model name for PromptModel.
         input_spec [list]: Specify the tensor information for each input parameter of the forward function.
         id2label(dict(int,string)): The dictionary to map the predictions from class ids to class names.
         batch_size(int): The sample number of a mini-batch.
@@ -171,7 +171,7 @@ def _construct_input_spec(self):
             init_class = json.load(fb)["architectures"].pop()
         else:
             raise IOError(
-                f"Model configuration file dosen't exist.[task_path] should inclue {LEGACY_CONFIG_NAME} or {CONFIG_NAME}"
+                f"Model configuration file doesn't exist.[task_path] should include {LEGACY_CONFIG_NAME} or {CONFIG_NAME}"
             )

         if init_class in ["ErnieMForSequenceClassification"]:
@@ -286,7 +286,7 @@ def _run_model(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         """
         Run the task model from the outputs of the `_tokenize` function.
         """
-        # TODO: support hierachical classification
+        # TODO: support hierarchical classification
         outputs = {}
         outputs["text"] = inputs["text"]
         outputs["batch_logits"] = []
@@ -326,7 +326,7 @@ def _postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         """
         This function converts the model logits output to class score and predictions
         """
-        # TODO: support hierachical classification
+        # TODO: support hierarchical classification
         postprocessed_outputs = []
         for logits in inputs["batch_logits"]:
             if self.problem_type == "multi_class":

paddlenlp/taskflow/text_feature_extraction.py

Lines changed: 1 addition & 1 deletion
@@ -424,7 +424,7 @@ def _parse_batch(batch_examples, max_seq_len=None):
             )
             return tokenized_inputs

-        # Seperates data into some batches.
+        # Separates data into some batches.
         one_batch = []
         self.length_sorted_idx = np.argsort([-text_length(sen) for sen in data])
         sentences_sorted = [data[idx] for idx in self.length_sorted_idx]
