from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm import LLM, SamplingParams
from vllm.config import TaskOption, _get_and_verify_dtype
-from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt
+from vllm.inputs import TextPrompt
from vllm.outputs import RequestOutput
-from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils import is_list_of

-from tests.e2e.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs,
+from tests.e2e.model_utils import (TokensTextLogprobs,
                                    TokensTextLogprobsPromptLogprobs)
# TODO: remove this part after the patch is merged into vllm; if
# we do not explicitly patch here, some of them might be ineffective
@@ -62,7 +60,6 @@
PromptVideoInput = _PromptMultiModalInput[np.ndarray]

_TEST_DIR = os.path.dirname(__file__)
-_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]


def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
@@ -89,13 +86,13 @@ def __init__(
        # Use smaller max model length, otherwise bigger model cannot run due
        # to kv cache size limit.
        max_model_len: int = 1024,
-        dtype: str = "half",
+        dtype: str = "auto",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
        enable_chunked_prefill: bool = False,
        swap_space: int = 4,
-        enforce_eager: Optional[bool] = True,
+        enforce_eager: Optional[bool] = False,
        quantization: Optional[str] = None,
        **kwargs,
    ) -> None:
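For reference, a minimal usage sketch of how a test could rely on the new defaults (`dtype="auto"`, `enforce_eager=False`) instead of overriding them; the import path and model name below are illustrative assumptions, not part of this diff.

```python
# Hypothetical sketch only: exercises VllmRunner with its new defaults.
# The import path and model name are assumptions for illustration.
from tests.e2e.conftest import VllmRunner


def test_generate_greedy_with_default_dtype():
    prompts = ["Hello, my name is"]
    # dtype="auto" follows the checkpoint's configured dtype instead of
    # forcing fp16; enforce_eager=False keeps graph execution enabled.
    with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
                    max_model_len=1024) as runner:
        outputs = runner.generate_greedy(prompts, max_tokens=16)
    assert len(outputs) == len(prompts)
```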
@@ -220,26 +217,6 @@ def generate_w_logprobs(
                if sampling_params.prompt_logprobs is None else
                toks_str_logsprobs_prompt_logprobs)

-    def generate_encoder_decoder_w_logprobs(
-        self,
-        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
-        sampling_params: SamplingParams,
-    ) -> Union[List[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
-        '''
-        Logprobs generation for vLLM encoder/decoder models
-        '''
-
-        assert sampling_params.logprobs is not None
-        req_outputs = self.model.generate(encoder_decoder_prompts,
-                                          sampling_params=sampling_params)
-        toks_str_logsprobs_prompt_logprobs = (
-            self._final_steps_generate_w_logprobs(req_outputs))
-        # Omit prompt logprobs if not required by sampling params
-        return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
-                if sampling_params.prompt_logprobs is None else
-                toks_str_logsprobs_prompt_logprobs)
-
    def generate_greedy(
        self,
        prompts: List[str],
@@ -284,53 +261,6 @@ def generate_greedy_logprobs(
                                        audios=audios,
                                        videos=videos)

-    def generate_encoder_decoder_greedy_logprobs(
-        self,
-        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
-        max_tokens: int,
-        num_logprobs: int,
-        num_prompt_logprobs: Optional[int] = None,
-    ) -> Union[List[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
-        greedy_logprobs_params = SamplingParams(
-            temperature=0.0,
-            max_tokens=max_tokens,
-            logprobs=num_logprobs,
-            prompt_logprobs=(num_prompt_logprobs),
-        )
-        '''
-        Greedy logprobs generation for vLLM encoder/decoder models
-        '''
-
-        return self.generate_encoder_decoder_w_logprobs(
-            encoder_decoder_prompts, greedy_logprobs_params)
-
-    def generate_beam_search(
-        self,
-        prompts: Union[List[str], List[List[int]]],
-        beam_width: int,
-        max_tokens: int,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
-        if is_list_of(prompts, str, check="all"):
-            prompts = [TextPrompt(prompt=prompt) for prompt in prompts]
-        else:
-            prompts = [
-                TokensPrompt(prompt_token_ids=tokens) for tokens in prompts
-            ]
-        outputs = self.model.beam_search(
-            prompts,
-            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
-        returned_outputs = []
-        for output in outputs:
-            token_ids = [x.tokens for x in output.sequences]
-            texts = [x.text for x in output.sequences]
-            returned_outputs.append((token_ids, texts))
-        return returned_outputs
-
-    def classify(self, prompts: List[str]) -> List[List[float]]:
-        req_outputs = self.model.classify(prompts)
-        return [req_output.outputs.probs for req_output in req_outputs]
-
    def encode(
        self,
        prompts: List[str],
@@ -346,14 +276,6 @@ def encode(
        req_outputs = self.model.embed(inputs)
        return [req_output.outputs.embedding for req_output in req_outputs]

-    def score(
-        self,
-        text_1: Union[str, List[str]],
-        text_2: Union[str, List[str]],
-    ) -> List[float]:
-        req_outputs = self.model.score(text_1, text_2)
-        return [req_output.outputs.score for req_output in req_outputs]
-
    def __enter__(self):
        return self

@@ -362,35 +284,6 @@ def __exit__(self, exc_type, exc_value, traceback):
        cleanup_dist_env_and_memory()


-@pytest.fixture(scope="session")
-def vllm_runner():
-    return VllmRunner
-
-
-@pytest.fixture(params=list(PROMPT_TEMPLATES.keys()))
-def prompt_template(request):
-    return PROMPT_TEMPLATES[request.param]
-
-
-def _read_prompts(filename: str) -> list[str]:
-    with open(filename) as f:
-        prompts = f.readlines()
-    return prompts
-
-
-@pytest.fixture
-def example_prompts() -> list[str]:
-    prompts = []
-    for filename in _TEST_PROMPTS:
-        prompts += _read_prompts(filename)
-    return prompts
-
-
-@pytest.fixture(scope="session")
-def ilama_lora_files():
-    return snapshot_download(repo_id="vllm-ascend/ilama-text2sql-spider")
-
-
class HfRunner:

    def get_default_device(self):
@@ -515,5 +408,22 @@ def __exit__(self, exc_type, exc_value, traceback):


@pytest.fixture(scope="session")
-def hf_runner():
-    return HfRunner
+def ilama_lora_files():
+    return snapshot_download(repo_id="vllm-ascend/ilama-text2sql-spider")
+
+
+def qwen_prompt(questions: List[str]) -> List[str]:
+    placeholder = "<|image_pad|>"
+    return [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+             f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+             f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions]
+
+
+PROMPT_TEMPLATES = {
+    "qwen2.5vl": qwen_prompt,
+}
+
+
+@pytest.fixture(params=list(PROMPT_TEMPLATES.keys()))
+def prompt_template(request):
+    return PROMPT_TEMPLATES[request.param]
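Below is a short, hypothetical sketch of how the relocated `prompt_template` fixture and the `qwen_prompt` helper might be consumed in a test; the test name and assertions are illustrative assumptions, not part of this change.

```python
# Hypothetical sketch only: the fixture is parametrized over
# PROMPT_TEMPLATES, so this test runs once per template key
# ("qwen2.5vl" here) and receives the prompt-building function.
def test_prompt_template_wraps_question(prompt_template):
    questions = ["What is shown in this image?"]
    prompts = prompt_template(questions)
    assert len(prompts) == len(questions)
    # The Qwen2.5-VL chat template places the image placeholder between
    # the vision markers and ends with the assistant turn header.
    assert "<|vision_start|><|image_pad|><|vision_end|>" in prompts[0]
    assert prompts[0].endswith("<|im_start|>assistant\n")
```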