Skip to content

[Typos] Fix #10494

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ msgid "word of current node."
msgstr ""

#: of paddlenlp.taskflow.utils.BurkhardKellerTree:1
msgid "Implementataion of BK-Tree"
msgid "Implementation of BK-Tree"
msgstr ""

#: of paddlenlp.taskflow.utils.BurkhardKellerTree.add:1
Expand All @@ -300,7 +300,7 @@ msgid "similar words."
msgstr ""

#: of paddlenlp.taskflow.utils.TriedTree:1
msgid "Implementataion of TriedTree"
msgid "Implementation of TriedTree"
msgstr ""

#: of paddlenlp.taskflow.utils.TriedTree.add_word:1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ msgid ""
msgstr ""

#: of paddlenlp.transformers.squeezebert.tokenizer.SqueezeBertTokenizer:11
msgid "The special token for unkown words. Default: \"[UNK]\"."
msgid "The special token for unknown words. Default: \"[UNK]\"."
msgstr ""

#: of paddlenlp.transformers.squeezebert.tokenizer.SqueezeBertTokenizer:13
Expand Down
8 changes: 4 additions & 4 deletions llm/experimental/ernie-3.5-se/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1380,12 +1380,12 @@ def forward(
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

def progressive_seq(x, y):
globel_step = int(os.getenv("TRAINER_GLOBAL_STEP", "0"))
if globel_step < 500:
global_step = int(os.getenv("TRAINER_GLOBAL_STEP", "0"))
if global_step < 500:
return x[:, :512], y[:, :512]
if globel_step < 1000:
if global_step < 1000:
return x[:, :1024], y[:, :1024]
if globel_step < 1500:
if global_step < 1500:
return x[:, :2048], y[:, :2048]
return x, y

Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/data/blendable_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def __init__(self, datasets, weights, size, share_folder, *, data_cache_path=Non
assert sum_weights > 0.0
weights /= sum_weights

# Build indicies.
# Build indices.
def _build_indices():
start_time = time.time()

Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/data/causal_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,7 +677,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
doc_offset += remaining_seq_length + doc_length - 1
remaining_seq_length = 0
else:
# Otherwise, start from the begining of the next document.
# Otherwise, start from the beginning of the next document.
doc_idx_index += 1
doc_offset = 0
# Record the sequence.
Expand Down
4 changes: 2 additions & 2 deletions paddlenlp/data/indexed_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@

def make_dataset(path, impl, skip_warmup=False):
if CompatibleIndexedDataset.exists(path):
print("Using old dataet (.npy & .npz)")
print("Using old dataset (.npy & .npz)")

Check warning on line 58 in paddlenlp/data/indexed_dataset.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/data/indexed_dataset.py#L58

Added line #L58 was not covered by tests
return CompatibleIndexedDataset(path)
elif not IndexedDataset.exists(path):
print(f"Dataset does not exist: {path}")
Expand Down Expand Up @@ -903,7 +903,7 @@

self._path = path

# All documment ids, extend as 1-D array.
# All document ids, extend as 1-D array.
self._token_ids = np.load(path + "_ids.npy", mmap_mode="r", allow_pickle=True)
process_data = np.load(path + "_idx.npz")
self._sizes = process_data["lens"]
Expand Down
4 changes: 2 additions & 2 deletions paddlenlp/data/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def cut(self, sentence, cut_all=False, use_hmm=True):
The method used to cut the text to tokens.

Args:
sentence(str): The text that needs to be cuted.
sentence(str): The text that needs to be cut.
cut_all(bool, optional): Whether to use the full mode. If True,
using full mode that gets all the possible words from the
sentence, which is fast but not accurate. If False, using
Expand Down Expand Up @@ -97,7 +97,7 @@ def encode(self, sentence, cut_all=False, use_hmm=True):
ids using `vocab`.

Args:
sentence(str): The text that needs to be cuted.
sentence(str): The text that needs to be cut.
cut_all(bool, optional): Whether to use the full mode. If True,
using full mode that gets all the possible words from the
sentence, which is fast but not accurate. If False, using
Expand Down
10 changes: 5 additions & 5 deletions paddlenlp/data/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class Vocab(object):
between tokens and indices to be used. If provided, adjust the tokens
and indices mapping according to it. If None, counter must be provided.
Default: None.
unk_token (str, optional): Special token for unknow token. If no need,
unk_token (str, optional): Special token for unknown token. If no need,
it also could be None. Default: None.
pad_token (str, optional): Special token for padding token. If no need,
it also could be None. Default: None.
Expand Down Expand Up @@ -214,7 +214,7 @@ def to_tokens(self, indices):
for idx in indices:
if not isinstance(idx, (int, np.integer)):
warnings.warn(
"The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transfered to `int`. "
"The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transferred to `int`. "
)
idx = int(idx)

Expand Down Expand Up @@ -382,7 +382,7 @@ def from_dict(cls, token_to_idx, unk_token=None, pad_token=None, bos_token=None,
Args:
token_to_idx (dict): A dict describes the mapping relationship between
tokens and indices.
unk_token (str, optional): The special token for unknow token. If
unk_token (str, optional): The special token for unknown token. If
no need, it also could be None. Default: None.
pad_token (str, optional): The special token for padding token. If
no need, it also could be None. Default: None.
Expand Down Expand Up @@ -440,7 +440,7 @@ def build_vocab(
**kwargs
):
"""
Builds the :class:`Vocab` accoring to given iterator and other
Builds the :class:`Vocab` according to given iterator and other
information. Firstly, iterate over the `iterator` to construct a
:class:`collections.Counter` and used to init the as :class:`Vocab`.

Expand All @@ -455,7 +455,7 @@ def build_vocab(
relationship between tokens and indices to be used. If provided,
adjust the tokens and indices mapping according to it. If None,
counter must be provided. Default: None.
unk_token (str, optional): The special token for unknow token
unk_token (str, optional): The special token for unknown token
'<unk>'. If no need, it also could be None. Default: None.
pad_token (str, optional): The special token for padding token
'<pad>'. If no need, it also could be None. Default: None.
Expand Down
4 changes: 2 additions & 2 deletions paddlenlp/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@
num_samples += 1
else:
if inspect.isgenerator(self.data):
warnings.warn("Reciving generator as data source, data can only be iterated once")
warnings.warn("Receiving generator as data source, data can only be iterated once")

Check warning on line 451 in paddlenlp/datasets/dataset.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/datasets/dataset.py#L451

Added line #L451 was not covered by tests
for example in self.data:
if (not self._filter_pipline or self._filter(self._filter_pipline)) and self._shard_filter(
num_samples=num_samples
Expand Down Expand Up @@ -580,7 +580,7 @@
lock_files.append(lock_file)
# Must register to all procs to make the lock file can be removed
# when any proc breaks. Otherwise, the single registered proc may
# not receive proper singal send by the parent proc to exit.
# not receive proper signal send by the parent proc to exit.
atexit.register(lambda: remove_if_exit(lock_files))
for split in splits:
filename = self._get_data(split)
Expand Down
6 changes: 3 additions & 3 deletions paddlenlp/datasets/hf_datasets/docvqa_zh.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@

# Lint as: python3

import os
import json
import hashlib
import json
import os

Check warning on line 21 in paddlenlp/datasets/hf_datasets/docvqa_zh.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/datasets/hf_datasets/docvqa_zh.py#L20-L21

Added lines #L20 - L21 were not covered by tests

import datasets

logger = datasets.logging.get_logger(__name__)

_DESCRIPTION = """\
The training set from the competition of Insurance DocVQA organized by China Pacific Insurance. \
The submission is now closed so we split original dataset into three parts for model evluation. \
The submission is now closed so we split original dataset into three parts for model evaluation. \
There are 4,187 training images, 500 validation images, and 500 test images.
"""

Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/datasets/rlhf_datasets/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ def pop(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_keys=None)
meta_info_keys (list, optional): a list of keys indicating the meta info to pop

Returns:
DataProto: the DataProto with the poped batch_keys and meta_info_keys
DataProto: the DataProto with the popped batch_keys and meta_info_keys
"""
assert batch_keys is not None
if meta_info_keys is None:
Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/datasets/rlhf_datasets/rl_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def padding_batch_data(
input_dict = {}

input_ids = [sample["input_ids"] for sample in samples]
# TODO(drownfish19): confim if this is correct
# TODO(drownfish19): confirm if this is correct
# attention_mask = [np.ones(input_id.shape, dtype=bool) for input_id in input_ids]
input_dict["input_ids"] = left_padding(input_ids, padding_value=pad_token_id, max_length=max_prompt_len)
# input_dict["attention_mask"] = left_padding(attention_mask, padding_value=0)
Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/datasets/thucnews.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
class THUCNews(DatasetBuilder):
"""
A subset of THUCNews dataset. THUCNews is a text classification dataset.
See descrition about this subset version at https://github.com/gaussic/text-classification-cnn-rnn#%E6%95%B0%E6%8D%AE%E9%9B%86
See description about this subset version at https://github.com/gaussic/text-classification-cnn-rnn#%E6%95%B0%E6%8D%AE%E9%9B%86
The whole dataset can be downloaded at https://thunlp.oss-cn-qingdao.aliyuncs.com/THUCNews.zip
"""

Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/datasets/xnli_cn.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class XNLI_CN(DatasetBuilder):
XNLI dataset for chinese.

XNLI is an evaluation corpus for language transfer and cross-lingual
sentence classification in 15 languages. Here, XNLI only contrains
sentence classification in 15 languages. Here, XNLI only contains
chinese corpus.

For more information, please visit https://github.com/facebookresearch/XNLI
Expand Down
4 changes: 2 additions & 2 deletions paddlenlp/datasets/zero_padding_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def _pad_batch_records(cls, batch_records):
attention_mask = record.get("attention_mask", np.tril(np.ones([seq_length, seq_length], dtype=bool)))
batched_features["attention_mask"].append(attention_mask)
# NOTE: position_ids is optional and not required by every model
# We append instead of extend here to accomodate 2D position ids
# We append instead of extend here to accommodate 2D position ids
if "position_ids" in record:
batched_features["position_ids"].append(record["position_ids"])
sequence_sum += seq_length
Expand All @@ -98,7 +98,7 @@ def _pad_batch_records(cls, batch_records):
# convert to 3-D [batch_size(1), seq_length, seq_length]
batched_features["attention_mask"] = np.expand_dims(block_attention_mask, axis=0)
if "position_ids" in batched_features:
# Accomodate both 1D and 2D position ids
# Accommodate both 1D and 2D position ids
batched_features["position_ids"] = np.concatenate(batched_features["position_ids"], axis=-1).tolist()
return batched_features

Expand Down
4 changes: 2 additions & 2 deletions paddlenlp/ops/distributed/utils/topo.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@
self.world = GroupInfo(size=world_size, rank=device_rank, world=list(range(0, world_size)))
worlds = []
for i in range(len(ranks)):
indexs = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :])
worlds.append(arr[indexs])
indexes = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :])
worlds.append(arr[indexes])

Check warning on line 53 in paddlenlp/ops/distributed/utils/topo.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/ops/distributed/utils/topo.py#L52-L53

Added lines #L52 - L53 were not covered by tests

for i, key in enumerate(self.order):
if key == "dp":
Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/ops/triton_ops/triton_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,7 @@ def decorator(*args, **kwargs):
op_dict = {"op_name": op_name, "reset_zero_when_tune": ""}
op_dict["triton_kernel_args"] = ",".join(modified_arg_exclude_constexpr)
op_dict["key"] = ",".join(self.key_args)
# when tunning, we need to reset the out to zero.
# when tuning, we need to reset the out to zero.
if "reset_zero_when_tune" in other_config.keys():
op_dict["reset_zero_when_tune"] = other_config["reset_zero_when_tune"]

Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/quantization/quantization_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class QuantizationConfig:
This is the configuration class to store quantization configuration.
Args:
weight_quantize_algo: Weight quantization algorithm.
quant_type: Quantization type appplied to weight and activation, weight may still keep in float tensor.
quant_type: Quantization type applied to weight and activation, weight may still keep in float tensor.
shift: Whether the model applied the shift strategy.
smooth: Whether the model applied the smooth strategy.
shift_smooth_all_linears: Whether the model applied shift or smooth strategy for all linears.
Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/rl/trainer/ppo_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def __init__(
"pipeline_parallel_degree": 1, # workaround for pipeline parallel model check
},
):
# just used to create trival attrs might be used in the training
# just used to create trivial attrs might be used in the training
# process of trainer, while changing some args to avoid model usage
# in __init__ such as recompute and AMP-O2
super().__init__(
Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/dialogue.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@
inputs = [list(self.context)]
return inputs
else:
raise ValueError("In the interactive mode, the input data shold be a string")
raise ValueError("In the interactive mode, the input data should be a string")

Check warning on line 158 in paddlenlp/taskflow/dialogue.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/taskflow/dialogue.py#L158

Added line #L158 was not covered by tests
elif not isinstance(inputs[0], list):
raise ValueError("If not in the interactive mode, the input data should be a list.")
return inputs
Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/document_intelligence.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@

class DocPromptTask(Task):
"""
The document intelligence model, give the querys and predict the answers.
The document intelligence model, give the queries and predict the answers.
Args:
task(string): The name of task.
model(string): The model name in the task.
Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

class Task(metaclass=abc.ABCMeta):
"""
The meta classs of task in Taskflow. The meta class has the five abstract function,
The meta class of task in Taskflow. The meta class has the five abstract function,
the subclass need to inherit from the meta class.
Args:
task(string): The name of task.
Expand Down
10 changes: 5 additions & 5 deletions paddlenlp/taskflow/text_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def softmax(x, axis=None):

class TextClassificationTask(Task):
"""
The text classfication model to classify text.
The text classification model to classify text.
NOTE: This task is different from all other tasks that it has no out-of-box zero-shot capabilities.
Instead, it's used as a simple inference pipeline.

Expand All @@ -122,7 +122,7 @@ class TextClassificationTask(Task):
multilabel_threshold (float): The probability threshold used for the multi_label setup. Only effective if model = "multi_label". Defaults to 0.5.
max_length (int): Maximum number of tokens for the model.
precision (int): Select among ["fp32", "fp16"]. Default to "fp32".
plm_model_name (str): Pretrained langugae model name for PromptModel.
plm_model_name (str): Pretrained language model name for PromptModel.
input_spec [list]: Specify the tensor information for each input parameter of the forward function.
id2label(dict(int,string)): The dictionary to map the predictions from class ids to class names.
batch_size(int): The sample number of a mini-batch.
Expand Down Expand Up @@ -171,7 +171,7 @@ def _construct_input_spec(self):
init_class = json.load(fb)["architectures"].pop()
else:
raise IOError(
f"Model configuration file dosen't exist.[task_path] should inclue {LEGACY_CONFIG_NAME} or {CONFIG_NAME}"
f"Model configuration file doesn't exist.[task_path] should include {LEGACY_CONFIG_NAME} or {CONFIG_NAME}"
)

if init_class in ["ErnieMForSequenceClassification"]:
Expand Down Expand Up @@ -286,7 +286,7 @@ def _run_model(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
"""
Run the task model from the outputs of the `_tokenize` function.
"""
# TODO: support hierachical classification
# TODO: support hierarchical classification
outputs = {}
outputs["text"] = inputs["text"]
outputs["batch_logits"] = []
Expand Down Expand Up @@ -326,7 +326,7 @@ def _postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
"""
This function converts the model logits output to class score and predictions
"""
# TODO: support hierachical classification
# TODO: support hierarchical classification
postprocessed_outputs = []
for logits in inputs["batch_logits"]:
if self.problem_type == "multi_class":
Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/text_feature_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ def _parse_batch(batch_examples, max_seq_len=None):
)
return tokenized_inputs

# Seperates data into some batches.
# Separates data into some batches.
one_batch = []
self.length_sorted_idx = np.argsort([-text_length(sen) for sen in data])
sentences_sorted = [data[idx] for idx in self.length_sorted_idx]
Expand Down
Loading
Loading