Commit 59230ef

[Misc] Consolidate example usage of OpenAI client for multimodal models (#9412)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent cee711f commit 59230ef

File tree

4 files changed: +237 −217 lines changed


docs/source/models/vlm.rst

Lines changed: 1 addition & 1 deletion
@@ -241,7 +241,7 @@ To consume the server, you can use the OpenAI client like in the example below:
     print("Chat completion output:", chat_response.choices[0].message.content)


-A full code example can be found in `examples/openai_vision_api_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_vision_api_client.py>`_.
+A full code example can be found in `examples/openai_api_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_api_client_for_multimodal.py>`_.

 .. note::
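
As a quick orientation for readers following the updated link, a minimal way to try the renamed example is sketched below. The serve command is taken from the new example's docstring; the client invocation assumes it is run from the vLLM repository root, and --chat-type is the flag defined by the example's argument parser.

# Start a single-image-capable server (command from the example's docstring)
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja

# In a second terminal, run the consolidated client example
python examples/openai_api_client_for_multimodal.py --chat-type single-image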

examples/openai_api_client_for_multimodal.py

Lines changed: 236 additions & 0 deletions
@@ -0,0 +1,236 @@
"""An example showing how to use vLLM to serve multimodal models
and run online inference with OpenAI client.

Launch the vLLM server with the following command:

(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja

(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
    --trust-remote-code --limit-mm-per-prompt image=2

(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
"""
import base64

import requests
from openai import OpenAI

from vllm.assets.audio import AudioAsset
from vllm.utils import FlexibleArgumentParser

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id


def encode_base64_content_from_url(content_url: str) -> str:
    """Encode a content retrieved from a remote url to base64 format."""

    with requests.get(content_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode('utf-8')

    return result


# Text-only inference
def run_text_only() -> None:
    chat_completion = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": "What's the capital of France?"
        }],
        model=model,
        max_tokens=64,
    )

    result = chat_completion.choices[0].message.content
    print("Chat completion output:", result)


# Single-image input inference
def run_single_image() -> None:

    ## Use image url in the payload
    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role":
            "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    },
                },
            ],
        }],
        model=model,
        max_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from image url:", result)

    ## Use base64 encoded image in the payload
    image_base64 = encode_base64_content_from_url(image_url)
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[{
            "role":
            "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_base64}"
                    },
                },
            ],
        }],
        model=model,
        max_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded image:", result)


# Multi-image input inference
def run_multi_image() -> None:
    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role":
            "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the animals in these images?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url_duck
                    },
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url_lion
                    },
                },
            ],
        }],
        model=model,
        max_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output:", result)


# Audio input inference
def run_audio() -> None:
    # Any format supported by librosa is supported
    audio_url = AudioAsset("winning_call").url

    # Use audio url in the payload
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role":
            "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?"
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": audio_url
                    },
                },
            ],
        }],
        model=model,
        max_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from audio url:", result)

    audio_base64 = encode_base64_content_from_url(audio_url)
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[{
            "role":
            "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?"
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        # Any format supported by librosa is supported
                        "url": f"data:audio/ogg;base64,{audio_base64}"
                    },
                },
            ],
        }],
        model=model,
        max_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded audio:", result)


example_function_map = {
    "text-only": run_text_only,
    "single-image": run_single_image,
    "multi-image": run_multi_image,
    "audio": run_audio,
}


def main(args) -> None:
    chat_type = args.chat_type
    example_function_map[chat_type]()


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using OpenAI client for online inference with '
        'multimodal language models served with vLLM.')
    parser.add_argument(
        '--chat-type',
        '-c',
        type=str,
        default="single-image",
        choices=["text-only", "single-image", "multi-image", "audio"],
        help='Conversation type with multimodal data.')
    args = parser.parse_args()
    main(args)
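
The same script covers the conversation types that previously had separate example files. A sketch of exercising them, assuming the matching server from the docstring above is already running and the script is invoked from the repository root:

# Multi-image conversation (serve microsoft/Phi-3.5-vision-instruct as shown in the docstring)
python examples/openai_api_client_for_multimodal.py --chat-type multi-image

# Audio conversation (serve fixie-ai/ultravox-v0_3); this path replaces the deleted openai_audio_api_client.py
python examples/openai_api_client_for_multimodal.py -c audio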

examples/openai_audio_api_client.py

Lines changed: 0 additions & 90 deletions
This file was deleted.
