|
1 |
| -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. |
| 1 | +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. |
2 | 2 | #
|
3 | 3 | # Licensed under the Apache License, Version 2.0 (the "License");
|
4 | 4 | # you may not use this file except in compliance with the License.
|
|
12 | 12 | # See the License for the specific language governing permissions and
|
13 | 13 | # limitations under the License.
|
14 | 14 |
|
15 |
| -import distutils.util |
| 15 | +import argparse |
16 | 16 | import os
|
17 | 17 |
|
18 |
| -import fastdeploy as fd |
19 | 18 | import numpy as np
|
| 19 | +import paddle.inference as paddle_infer |
20 | 20 | from PIL import Image
|
| 21 | +from scipy.special import softmax |
21 | 22 |
|
22 | 23 | from paddlenlp.transformers import ErnieViLProcessor
|
23 | 24 | from paddlenlp.utils.env import (
|
|
27 | 28 |
|
28 | 29 |
|
def parse_arguments():
    """Parse command-line options for ERNIE-ViL feature extraction.

    Returns:
        argparse.Namespace: carries ``model_dir``, ``device``, ``batch_size``,
        ``temperature``, ``max_length``, ``encode_type`` and ``image_path``.
    """
    cli = argparse.ArgumentParser()
    # Location of the exported static graph (model + weights files).
    cli.add_argument(
        "--model_dir",
        required=True,
        help="Directory with .json and .pdiparams",
    )
    cli.add_argument(
        "--device",
        default="gpu",
        choices=["gpu", "cpu"],
        help="Device for inference",
    )
    cli.add_argument("--batch_size", type=int, default=1)
    # Logit scale applied before softmax when comparing text/image features.
    cli.add_argument("--temperature", type=float, default=4.3)
    cli.add_argument("--max_length", type=int, default=128)
    cli.add_argument("--encode_type", choices=["text", "image"], default="text")
    cli.add_argument("--image_path", type=str, default="data/datasets/Flickr30k-CN/image/36979.jpg")
    return cli.parse_args()
|
70 | 40 |
|
71 | 41 |
|
class PaddleErnieViLPredictor:
    """Runs one branch (text or image) of ERNIE-ViL 2.0 with Paddle Inference.

    Which branch is loaded is decided by ``args.encode_type`` at construction
    time; a single instance therefore encodes either texts or images, not both.
    """

    def __init__(self, args):
        self.args = args
        # Shared tokenizer/image processor for the zh base model.
        self.processor = ErnieViLProcessor.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh")
        self.predictor, self.input_names, self.output_names = self.load_predictor()

    def load_predictor(self):
        """Build the Paddle Inference predictor for the selected branch.

        Returns:
            tuple: (predictor, input tensor names, output tensor names).
        """
        stem = f"get_{self.args.encode_type}_features"
        model_file = os.path.join(self.args.model_dir, f"{stem}{PADDLE_INFERENCE_MODEL_SUFFIX}")
        params_file = os.path.join(self.args.model_dir, f"{stem}{PADDLE_INFERENCE_WEIGHTS_SUFFIX}")

        config = paddle_infer.Config(model_file, params_file)
        if self.args.device == "gpu":
            # 100 MB initial GPU workspace on device 0.
            config.enable_use_gpu(100, 0)
        else:
            config.disable_gpu()
        config.disable_glog_info()
        config.switch_ir_optim(True)

        engine = paddle_infer.create_predictor(config)
        return engine, engine.get_input_names(), engine.get_output_names()

    def preprocess(self, inputs):
        """Turn raw texts or PIL images into the feed dict for the predictor."""
        if self.args.encode_type == "text":
            # NOTE(review): tokenized sequences are stacked without padding, so
            # all texts in a batch must tokenize to the same length — confirm.
            token_rows = [self.processor(text=t)["input_ids"] for t in inputs]
            return {"input_ids": np.array(token_rows, dtype="int64")}
        planes = [self.processor(images=img)["pixel_values"][0] for img in inputs]
        batch = np.stack(planes)
        return {"pixel_values": batch.astype("float32")}

    def infer(self, input_dict):
        """Copy inputs onto the device, run the graph, return the first output."""
        for feed_name in self.input_names:
            handle = self.predictor.get_input_handle(feed_name)
            handle.copy_from_cpu(input_dict[feed_name])
        self.predictor.run()
        first_out = self.predictor.get_output_handle(self.output_names[0])
        return first_out.copy_to_cpu()

    def predict(self, inputs):
        """End-to-end: preprocess then infer; returns the feature ndarray."""
        return self.infer(self.preprocess(inputs))
|
156 | 91 |
|
157 | 92 |
|
def main():
    """Demo: encode two captions and one image, then print their similarity."""
    args = parse_arguments()

    # Text branch: encode both candidate captions in one batch.
    args.encode_type = "text"
    text_encoder = PaddleErnieViLPredictor(args)
    captions = ["猫的照片", "狗的照片"]
    args.batch_size = len(captions)
    text_feats = text_encoder.predict(captions)

    # Image branch: a separate predictor is built for the image graph.
    args.encode_type = "image"
    args.batch_size = 1
    image_encoder = PaddleErnieViLPredictor(args)
    picture = Image.open(args.image_path).convert("RGB")
    image_feats = image_encoder.predict([picture])

    # L2-normalize both feature sets, then scale by exp(temperature) and
    # softmax over the text axis to get image→text probabilities.
    image_feats = image_feats / np.linalg.norm(image_feats, axis=-1, keepdims=True)
    text_feats = text_feats / np.linalg.norm(text_feats, axis=-1, keepdims=True)

    scores = softmax(np.exp(args.temperature) * np.matmul(text_feats, image_feats.T), axis=0).T
    print("相似度矩阵(image→text):")
    print(scores)
185 | 117 |
|
186 | 118 |
|
187 | 119 | if __name__ == "__main__":
|
|
0 commit comments