diff --git a/slm/examples/machine_reading_comprehension/SQuAD/README.md b/slm/examples/machine_reading_comprehension/SQuAD/README.md index 1afb2b027e56..60f5aa70e584 100644 --- a/slm/examples/machine_reading_comprehension/SQuAD/README.md +++ b/slm/examples/machine_reading_comprehension/SQuAD/README.md @@ -164,6 +164,7 @@ python -m paddle.distributed.launch --gpus "0" run_squad.py \ 在 Fine-tune 完成后,我们可以使用如下方式导出希望用来预测的模型: +默认模型: ```shell python -u ./export_model.py \ --model_type bert \ @@ -171,6 +172,14 @@ python -u ./export_model.py \ --output_path ./infer_model/model ``` +微调模型: +```shell +python -u ./export_model.py \ + --model_type bert \ + --model_path tmp/squad/model_5000 \ + --output_path ./infer_model/model +``` + 其中参数释义如下: - `model_type` 指示了模型类型,使用 BERT 模型时设置为 bert 即可。 - `model_path` 表示训练模型的保存路径,与训练时的`output_dir`一致。 @@ -192,4 +201,83 @@ python -u deploy/python/predict.py \ - `batch_size` 表示每个预测批次的样本数目。 - `max_seq_length` 表示最大句子长度,超过该长度将被截断,和训练时一致。 +运行结果示例: +``` +{ + "exact": 37.74109323675567, + "f1": 42.348199704946815, + "total": 11873, + "HasAns_exact": 75.59041835357625, + "HasAns_f1": 84.81784330243481, + "HasAns_total": 5928, + "NoAns_exact": 0.0, + "NoAns_f1": 0.0, + "NoAns_total": 5945, + "best_exact": 50.11370336056599, + "best_exact_thresh": 0.0, + "best_f1": 50.11370336056599, + "best_f1_thresh": 0.0 +} +``` + 以上命令将在 SQuAD v1.1的验证集上进行预测。此外,同训练时一样,用户可以通过命令行传入`--version_2_with_negative`控制所需要的 SQuAD 数据集版本。 + +### 其他问题 +#### Q1: 适配 python 3.8的 datasets 3.1.0无法支持当前任务 +如果运行时出现如下问题: + +> File "/home/aistudio/.cache/huggingface/modules/datasets_modules/datasets/squad_v2/dca5ba0e483a42ca20ec41a13e9fb630541d6fcb0ba646da3e8ff9a1f21fcb81/squad_v2.py", line 19, in +> from datasets.tasks import QuestionAnsweringExtractive +> ModuleNotFoundError: No module named 'datasets.tasks' + +那么需要对 datasets 进行版本更换。运行: +```shell +pip install -U "datasets>=2.14.6,<3.0.0" +``` +安装 ```datasets-2.21.0```等版本可以正常运行。 + + +#### Q2: 无法通过运行命令连接 huggingface 获取 SQuAD 数据集 +1. 
手动从[数据集官网](https://rajpurkar.github.io/SQuAD-explorer/)下载 training/dev set 并放在当前目录。 + +2. 将 ```run_squad.py```中的 +```python +if args.version_2_with_negative: + train_examples = load_dataset("squad_v2", split="train", trust_remote_code=True) + dev_examples = load_dataset("squad_v2", split="validation", trust_remote_code=True) +else: + train_examples = load_dataset("squad", split="train", trust_remote_code=True) + dev_examples = load_dataset("squad", split="validation", trust_remote_code=True) +``` +替换为 +```python +datasets = load_dataset( + "squad_v2", + data_files={ + "train": "train-v2.0.json", + "validation": "dev-v2.0.json" + } +) +train_examples = datasets["train"] +dev_examples = datasets["validation"] +``` + +3. 将 ```deploy/python/predict.py```中的 +```python +if args.version_2_with_negative: + raw_dataset = load_dataset("squad_v2", split="validation") +else: + raw_dataset = load_dataset("squad", split="validation") +``` +替换为 +```python +datasets = load_dataset( + "squad_v2", + data_files={ + "train": "train-v2.0.json", + "validation": "dev-v2.0.json" + } +) +raw_dataset = datasets["validation"] +``` +正常运行命令即可。 diff --git a/slm/examples/machine_reading_comprehension/SQuAD/deploy/python/predict.py b/slm/examples/machine_reading_comprehension/SQuAD/deploy/python/predict.py index 3a8281d70d96..eb572818fa5d 100644 --- a/slm/examples/machine_reading_comprehension/SQuAD/deploy/python/predict.py +++ b/slm/examples/machine_reading_comprehension/SQuAD/deploy/python/predict.py @@ -21,6 +21,10 @@ from paddlenlp.data import Dict, Pad from paddlenlp.metrics.squad import compute_prediction, squad_evaluate +from paddlenlp.utils.env import ( + PADDLE_INFERENCE_MODEL_SUFFIX, + PADDLE_INFERENCE_WEIGHTS_SUFFIX, +) sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))) from args import parse_args # noqa: E402 @@ -35,7 +39,10 @@ def __init__(self, predictor, input_handles, output_handles): @classmethod def create_predictor(cls, args): - 
config = paddle.inference.Config(args.model_name_or_path + ".pdmodel", args.model_name_or_path + ".pdiparams") + config = paddle.inference.Config( + args.model_name_or_path + PADDLE_INFERENCE_MODEL_SUFFIX, + args.model_name_or_path + PADDLE_INFERENCE_WEIGHTS_SUFFIX, + ) if args.device == "gpu": # set GPU configs accordingly config.enable_use_gpu(100, 0)