 import pytest
 import vllm  # noqa: F401
+from modelscope import snapshot_download  # type: ignore[import-untyped]
 from vllm import SamplingParams
 from vllm.assets.image import ImageAsset
 
 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
-    "vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8",
     "Qwen/Qwen3-0.6B-Base",
 ]
 MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+
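+# W8A8-quantized checkpoint(s) used by test_quantization_models below.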
+QUANTIZATION_MODELS = [
+    "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new",
+]
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 
@@ -59,6 +63,27 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
+@pytest.mark.parametrize("model", QUANTIZATION_MODELS)
+@pytest.mark.parametrize("max_tokens", [5])
+def test_quantization_models(model: str, max_tokens: int) -> None:
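+    # Prompt asks the model to continue a long comma-separated integer
+    # sequence (0-1023).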
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
+
+    # NOTE: Loading the quantized model directly by its ModelScope repo id
+    # currently hits an issue; https://github.com/vllm-project/vllm/pull/19212
+    # fixes it. Once that PR is merged, the explicit download here can be
+    # dropped.
+    model_path = snapshot_download(model)
+
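+    # Greedy decoding on the downloaded checkpoint with the Ascend
+    # quantization backend enabled.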
+    with VllmRunner(model_path,
+                    max_model_len=8192,
+                    enforce_eager=True,
+                    dtype="auto",
+                    gpu_memory_utilization=0.7,
+                    quantization="ascend") as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
 @pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
 def test_multimodal(model, prompt_template, vllm_runner):
     image = ImageAsset("cherry_blossom") \