Commit 91aaa05

Fix
1 parent c2c41a3 commit 91aaa05

198 files changed, +361 -361 lines changed

docs/zh/locale/en/LC_MESSAGES/source/paddlenlp.taskflow.utils.po

Lines changed: 2 additions & 2 deletions
@@ -276,7 +276,7 @@ msgid "word of current node."
 msgstr ""

 #: of paddlenlp.taskflow.utils.BurkhardKellerTree:1
-msgid "Implementataion of BK-Tree"
+msgid "Implementation of BK-Tree"
 msgstr ""

 #: of paddlenlp.taskflow.utils.BurkhardKellerTree.add:1
@@ -300,7 +300,7 @@ msgid "similar words."
 msgstr ""

 #: of paddlenlp.taskflow.utils.TriedTree:1
-msgid "Implementataion of TriedTree"
+msgid "Implementation of TriedTree"
 msgstr ""

 #: of paddlenlp.taskflow.utils.TriedTree.add_word:1

docs/zh/locale/en/LC_MESSAGES/source/paddlenlp.transformers.squeezebert.tokenizer.po

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ msgid ""
 msgstr ""

 #: of paddlenlp.transformers.squeezebert.tokenizer.SqueezeBertTokenizer:11
-msgid "The special token for unkown words. Default: \"[UNK]\"."
+msgid "The special token for unknown words. Default: \"[UNK]\"."
 msgstr ""

 #: of paddlenlp.transformers.squeezebert.tokenizer.SqueezeBertTokenizer:13

llm/experimental/ernie-3.5-se/modeling.py

Lines changed: 4 additions & 4 deletions
@@ -1380,12 +1380,12 @@ def forward(
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

         def progressive_seq(x, y):
-            globel_step = int(os.getenv("TRAINER_GLOBAL_STEP", "0"))
-            if globel_step < 500:
+            global_step = int(os.getenv("TRAINER_GLOBAL_STEP", "0"))
+            if global_step < 500:
                 return x[:, :512], y[:, :512]
-            if globel_step < 1000:
+            if global_step < 1000:
                 return x[:, :1024], y[:, :1024]
-            if globel_step < 1500:
+            if global_step < 1500:
                 return x[:, :2048], y[:, :2048]
             return x, y
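
The hunk above only renames a misspelled local (globel_step -> global_step); the behavior is unchanged. For readers unfamiliar with the pattern, progressive_seq implements a progressive sequence-length warmup: early in training the inputs and labels are truncated to short sequences, and the full length is only used once the trainer passes step 1500. Below is a minimal standalone sketch of that behavior (the thresholds and slicing follow the diff; the NumPy driver, toy shapes, and step value are illustrative assumptions, not part of the commit):

import os
import numpy as np

def progressive_seq(x, y):
    # Truncate (x, y) along the sequence axis according to the trainer's global step.
    global_step = int(os.getenv("TRAINER_GLOBAL_STEP", "0"))
    if global_step < 500:
        return x[:, :512], y[:, :512]
    if global_step < 1000:
        return x[:, :1024], y[:, :1024]
    if global_step < 1500:
        return x[:, :2048], y[:, :2048]
    return x, y

# Example: at step 100 only the first 512 tokens of each sample are kept.
os.environ["TRAINER_GLOBAL_STEP"] = "100"
input_ids = np.zeros((2, 4096), dtype=np.int64)
labels = np.zeros((2, 4096), dtype=np.int64)
ids_cut, labels_cut = progressive_seq(input_ids, labels)
print(ids_cut.shape, labels_cut.shape)  # (2, 512) (2, 512)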

paddlenlp/data/blendable_dataset.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ def __init__(self, datasets, weights, size, share_folder, *, data_cache_path=Non
         assert sum_weights > 0.0
         weights /= sum_weights

-        # Build indicies.
+        # Build indices.
         def _build_indices():
             start_time = time.time()

paddlenlp/data/causal_dataset.py

Lines changed: 1 addition & 1 deletion
@@ -677,7 +677,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
                 doc_offset += remaining_seq_length + doc_length - 1
                 remaining_seq_length = 0
             else:
-                # Otherwise, start from the begining of the next document.
+                # Otherwise, start from the beginning of the next document.
                 doc_idx_index += 1
                 doc_offset = 0
         # Record the sequence.

paddlenlp/data/indexed_dataset.py

Lines changed: 2 additions & 2 deletions
@@ -55,7 +55,7 @@ def get_available_dataset_impl():

 def make_dataset(path, impl, skip_warmup=False):
     if CompatibleIndexedDataset.exists(path):
-        print("Using old dataet (.npy & .npz)")
+        print("Using old dataset (.npy & .npz)")
         return CompatibleIndexedDataset(path)
     elif not IndexedDataset.exists(path):
         print(f"Dataset does not exist: {path}")
@@ -903,7 +903,7 @@ def __init__(self, path):

         self._path = path

-        # All documment ids, extend as 1-D array.
+        # All document ids, extend as 1-D array.
         self._token_ids = np.load(path + "_ids.npy", mmap_mode="r", allow_pickle=True)
         process_data = np.load(path + "_idx.npz")
         self._sizes = process_data["lens"]

paddlenlp/data/tokenizer.py

Lines changed: 2 additions & 2 deletions
@@ -58,7 +58,7 @@ def cut(self, sentence, cut_all=False, use_hmm=True):
         The method used to cut the text to tokens.

         Args:
-            sentence(str): The text that needs to be cuted.
+            sentence(str): The text that needs to be cut.
             cut_all(bool, optional): Whether to use the full mode. If True,
                 using full mode that gets all the possible words from the
                 sentence, which is fast but not accurate. If False, using
@@ -97,7 +97,7 @@ def encode(self, sentence, cut_all=False, use_hmm=True):
             ids using `vocab`.

         Args:
-            sentence(str): The text that needs to be cuted.
+            sentence(str): The text that needs to be cut.
             cut_all(bool, optional): Whether to use the full mode. If True,
                 using full mode that gets all the possible words from the
                 sentence, which is fast but not accurate. If False, using

paddlenlp/data/vocab.py

Lines changed: 5 additions & 5 deletions
@@ -40,7 +40,7 @@ class Vocab(object):
            between tokens and indices to be used. If provided, adjust the tokens
            and indices mapping according to it. If None, counter must be provided.
            Default: None.
-        unk_token (str, optional): Special token for unknow token. If no need,
+        unk_token (str, optional): Special token for unknown token. If no need,
            it also could be None. Default: None.
        pad_token (str, optional): Special token for padding token. If no need,
            it also could be None. Default: None.
@@ -214,7 +214,7 @@ def to_tokens(self, indices):
            for idx in indices:
                if not isinstance(idx, (int, np.integer)):
                    warnings.warn(
-                        "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transfered to `int`. "
+                        "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transferred to `int`. "
                    )
                    idx = int(idx)

@@ -382,7 +382,7 @@ def from_dict(cls, token_to_idx, unk_token=None, pad_token=None, bos_token=None,
        Args:
            token_to_idx (dict): A dict describes the mapping relationship between
                tokens and indices.
-            unk_token (str, optional): The special token for unknow token. If
+            unk_token (str, optional): The special token for unknown token. If
                no need, it also could be None. Default: None.
            pad_token (str, optional): The special token for padding token. If
                no need, it also could be None. Default: None.
@@ -440,7 +440,7 @@ def build_vocab(
        **kwargs
    ):
        """
-        Builds the :class:`Vocab` accoring to given iterator and other
+        Builds the :class:`Vocab` according to given iterator and other
        information. Firstly, iterate over the `iterator` to construct a
        :class:`collections.Counter` and used to init the as :class:`Vocab`.

@@ -455,7 +455,7 @@ def build_vocab(
            relationship between tokens and indices to be used. If provided,
            adjust the tokens and indices mapping according to it. If None,
            counter must be provided. Default: None.
-        unk_token (str, optional): The special token for unknow token
+        unk_token (str, optional): The special token for unknown token
            '<unk>'. If no need, it also could be None. Default: None.
        pad_token (str, optional): The special token for padding token
            '<pad>'. If no need, it also could be None. Default: None.

paddlenlp/datasets/dataset.py

Lines changed: 2 additions & 2 deletions
@@ -448,7 +448,7 @@ def __iter__(self):
                     num_samples += 1
         else:
             if inspect.isgenerator(self.data):
-                warnings.warn("Reciving generator as data source, data can only be iterated once")
+                warnings.warn("Receiving generator as data source, data can only be iterated once")
             for example in self.data:
                 if (not self._filter_pipline or self._filter(self._filter_pipline)) and self._shard_filter(
                     num_samples=num_samples
@@ -580,7 +580,7 @@ def remove_if_exit(filepath):
             lock_files.append(lock_file)
             # Must register to all procs to make the lock file can be removed
             # when any proc breaks. Otherwise, the single registered proc may
-            # not receive proper singal send by the parent proc to exit.
+            # not receive proper signal send by the parent proc to exit.
             atexit.register(lambda: remove_if_exit(lock_files))
             for split in splits:
                 filename = self._get_data(split)

paddlenlp/datasets/hf_datasets/docvqa_zh.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@

 _DESCRIPTION = """\
 The training set from the competition of Insurance DocVQA organized by China Pacific Insurance. \
-The submission is now closed so we split original dataset into three parts for model evluation. \
+The submission is now closed so we split original dataset into three parts for model evaluation. \
 There are 4,187 training images, 500 validation images, and 500 test images.
 """

paddlenlp/datasets/rlhf_datasets/protocol.py

Lines changed: 1 addition & 1 deletion
@@ -393,7 +393,7 @@ def pop(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_keys=None)
             meta_info_keys (list, optional): a list of keys indicating the meta info to pop

         Returns:
-            DataProto: the DataProto with the poped batch_keys and meta_info_keys
+            DataProto: the DataProto with the popped batch_keys and meta_info_keys
         """
         assert batch_keys is not None
         if meta_info_keys is None:

paddlenlp/datasets/rlhf_datasets/rl_dataset.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def padding_batch_data(
     input_dict = {}

     input_ids = [sample["input_ids"] for sample in samples]
-    # TODO(drownfish19): confim if this is correct
+    # TODO(drownfish19): confirm if this is correct
     # attention_mask = [np.ones(input_id.shape, dtype=bool) for input_id in input_ids]
     input_dict["input_ids"] = left_padding(input_ids, padding_value=pad_token_id, max_length=max_prompt_len)
     # input_dict["attention_mask"] = left_padding(attention_mask, padding_value=0)

paddlenlp/datasets/thucnews.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@
 class THUCNews(DatasetBuilder):
     """
     A subset of THUCNews dataset. THUCNews is a text classification dataset.
-    See descrition about this subset version at https://github.com/gaussic/text-classification-cnn-rnn#%E6%95%B0%E6%8D%AE%E9%9B%86
+    See description about this subset version at https://github.com/gaussic/text-classification-cnn-rnn#%E6%95%B0%E6%8D%AE%E9%9B%86
     The whole dataset can be downloaded at https://thunlp.oss-cn-qingdao.aliyuncs.com/THUCNews.zip
     """

paddlenlp/datasets/xnli_cn.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ class XNLI_CN(DatasetBuilder):
     XNLI dataset for chinese.

     XNLI is an evaluation corpus for language transfer and cross-lingual
-    sentence classification in 15 languages. Here, XNLI only contrains
+    sentence classification in 15 languages. Here, XNLI only contains
     chinese corpus.

     For more information, please visit https://github.com/facebookresearch/XNLI

paddlenlp/datasets/zero_padding_dataset.py

Lines changed: 2 additions & 2 deletions
@@ -88,7 +88,7 @@ def _pad_batch_records(cls, batch_records):
             attention_mask = record.get("attention_mask", np.tril(np.ones([seq_length, seq_length], dtype=bool)))
             batched_features["attention_mask"].append(attention_mask)
             # NOTE: position_ids is optional and not required by every model
-            # We append instead of extend here to accomodate 2D position ids
+            # We append instead of extend here to accommodate 2D position ids
             if "position_ids" in record:
                 batched_features["position_ids"].append(record["position_ids"])
             sequence_sum += seq_length
@@ -98,7 +98,7 @@ def _pad_batch_records(cls, batch_records):
         # convert to 3-D [batch_size(1), seq_length, seq_length]
         batched_features["attention_mask"] = np.expand_dims(block_attention_mask, axis=0)
         if "position_ids" in batched_features:
-            # Accomodate both 1D and 2D position ids
+            # Accommodate both 1D and 2D position ids
             batched_features["position_ids"] = np.concatenate(batched_features["position_ids"], axis=-1).tolist()
         return batched_features

paddlenlp/ops/distributed/utils/topo.py

Lines changed: 2 additions & 2 deletions
@@ -49,8 +49,8 @@ def __init__(
         self.world = GroupInfo(size=world_size, rank=device_rank, world=list(range(0, world_size)))
         worlds = []
         for i in range(len(ranks)):
-            indexs = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :])
-            worlds.append(arr[indexs])
+            indexes = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :])
+            worlds.append(arr[indexes])

         for i, key in enumerate(self.order):
             if key == "dp":
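
This hunk only renames indexs to indexes; the logic is untouched. What the loop does is select, for each parallelism axis, the ranks that share this rank's coordinates on every other axis, which is exactly the communication group along that axis. A small NumPy sketch of the same slicing trick (the 2 x 2 x 2 topology and the coordinates below are made-up values for illustration, not from the source):

import numpy as np

# Global ranks laid out over a 3-D topology (e.g. dp x pp x mp = 2 x 2 x 2).
arr = np.arange(8).reshape(2, 2, 2)
ranks = [1, 0, 1]  # this rank's coordinate on each axis (its global rank is arr[1, 0, 1] == 5)

worlds = []
for i in range(len(ranks)):
    # Fix every coordinate except axis i; slice(None) keeps all ranks along axis i.
    indexes = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :])
    worlds.append(arr[indexes])

print([w.tolist() for w in worlds])  # [[1, 5], [5, 7], [4, 5]], and rank 5 sits in each group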

paddlenlp/ops/triton_ops/triton_utils.py

Lines changed: 1 addition & 1 deletion
@@ -621,7 +621,7 @@ def decorator(*args, **kwargs):
             op_dict = {"op_name": op_name, "reset_zero_when_tune": ""}
             op_dict["triton_kernel_args"] = ",".join(modified_arg_exclude_constexpr)
             op_dict["key"] = ",".join(self.key_args)
-            # when tunning, we need to reset the out to zero.
+            # when tuning, we need to reset the out to zero.
             if "reset_zero_when_tune" in other_config.keys():
                 op_dict["reset_zero_when_tune"] = other_config["reset_zero_when_tune"]

paddlenlp/quantization/quantization_config.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ class QuantizationConfig:
     This is the configuration class to store quantization configuration.
     Args:
         weight_quantize_algo: Weight quantization algorithm.
-        quant_type: Quantization type appplied to weight and activation, weight may still keep in float tensor.
+        quant_type: Quantization type applied to weight and activation, weight may still keep in float tensor.
         shift: Whether the model applied the shift strategy.
         smooth: Whether the model applied the smooth strategy.
         shift_smooth_all_linears: Whether the model applied shift or smooth strategy for all linears.

paddlenlp/rl/trainer/ppo_trainer.py

Lines changed: 1 addition & 1 deletion
@@ -260,7 +260,7 @@ def __init__(
                 "pipeline_parallel_degree": 1, # workaround for pipeline parallel model check
             },
         ):
-            # just used to create trival attrs might be used in the training
+            # just used to create trivial attrs might be used in the training
             # process of trainer, while changing some args to avoid model usage
             # in __init__ such as recompute and AMP-O2
             super().__init__(

paddlenlp/taskflow/dialogue.py

Lines changed: 1 addition & 1 deletion
@@ -155,7 +155,7 @@ def _check_input_text(self, inputs):
                 inputs = [list(self.context)]
                 return inputs
             else:
-                raise ValueError("In the interactive mode, the input data shold be a string")
+                raise ValueError("In the interactive mode, the input data should be a string")
         elif not isinstance(inputs[0], list):
             raise ValueError("If not in the interactive mode, the input data should be a list.")
         return inputs

paddlenlp/taskflow/document_intelligence.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@

 class DocPromptTask(Task):
     """
-    The document intelligence model, give the querys and predict the answers.
+    The document intelligence model, give the queries and predict the answers.
     Args:
         task(string): The name of task.
         model(string): The model name in the task.

paddlenlp/taskflow/task.py

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@

 class Task(metaclass=abc.ABCMeta):
     """
-    The meta classs of task in Taskflow. The meta class has the five abstract function,
+    The meta class of task in Taskflow. The meta class has the five abstract function,
     the subclass need to inherit from the meta class.
     Args:
         task(string): The name of task.

paddlenlp/taskflow/text_classification.py

Lines changed: 5 additions & 5 deletions
@@ -108,7 +108,7 @@ def softmax(x, axis=None):

 class TextClassificationTask(Task):
     """
-    The text classfication model to classify text.
+    The text classification model to classify text.
     NOTE: This task is different from all other tasks that it has no out-of-box zero-shot capabilities.
     Instead, it's used as a simple inference pipeline.

@@ -122,7 +122,7 @@ class TextClassificationTask(Task):
         multilabel_threshold (float): The probability threshold used for the multi_label setup. Only effective if model = "multi_label". Defaults to 0.5.
         max_length (int): Maximum number of tokens for the model.
         precision (int): Select among ["fp32", "fp16"]. Default to "fp32".
-        plm_model_name (str): Pretrained langugae model name for PromptModel.
+        plm_model_name (str): Pretrained language model name for PromptModel.
         input_spec [list]: Specify the tensor information for each input parameter of the forward function.
         id2label(dict(int,string)): The dictionary to map the predictions from class ids to class names.
         batch_size(int): The sample number of a mini-batch.
@@ -171,7 +171,7 @@ def _construct_input_spec(self):
             init_class = json.load(fb)["architectures"].pop()
         else:
             raise IOError(
-                f"Model configuration file dosen't exist.[task_path] should inclue {LEGACY_CONFIG_NAME} or {CONFIG_NAME}"
+                f"Model configuration file doesn't exist.[task_path] should include {LEGACY_CONFIG_NAME} or {CONFIG_NAME}"
             )

         if init_class in ["ErnieMForSequenceClassification"]:
@@ -286,7 +286,7 @@ def _run_model(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         """
         Run the task model from the outputs of the `_tokenize` function.
         """
-        # TODO: support hierachical classification
+        # TODO: support hierarchical classification
         outputs = {}
         outputs["text"] = inputs["text"]
         outputs["batch_logits"] = []
@@ -326,7 +326,7 @@ def _postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         """
         This function converts the model logits output to class score and predictions
         """
-        # TODO: support hierachical classification
+        # TODO: support hierarchical classification
         postprocessed_outputs = []
         for logits in inputs["batch_logits"]:
             if self.problem_type == "multi_class":

paddlenlp/taskflow/text_feature_extraction.py

Lines changed: 1 addition & 1 deletion
@@ -424,7 +424,7 @@ def _parse_batch(batch_examples, max_seq_len=None):
             )
             return tokenized_inputs

-        # Seperates data into some batches.
+        # Separates data into some batches.
         one_batch = []
         self.length_sorted_idx = np.argsort([-text_length(sen) for sen in data])
         sentences_sorted = [data[idx] for idx in self.length_sorted_idx]
