@@ -31,10 +31,14 @@

 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
-    "vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8",
     "Qwen/Qwen3-0.6B-Base",
 ]
 MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+
+QUANTIZATION_MODELS = [
+    "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new",
+    "vllm-ascend/DeepSeek-V2-Lite-W8A8"
+]
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"

@@ -57,6 +61,21 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
         vllm_model.generate_greedy(example_prompts, max_tokens)


+@pytest.mark.parametrize("model", QUANTIZATION_MODELS)
+@pytest.mark.parametrize("max_tokens", [5])
+def test_quantization_models(model: str, max_tokens: int) -> None:
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
+
+    with VllmRunner(model,
+                    max_model_len=8192,
+                    enforce_eager=False,
+                    gpu_memory_utilization=0.7,
+                    quantization="ascend") as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
 @pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
 @pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "1",
                     reason="qwen2.5_vl is not supported on v1")
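For context, the following is a minimal standalone sketch of what the new test exercises, written against vLLM's public LLM API rather than the repo's VllmRunner test helper (which is assumed to wrap it). The model name, engine arguments, and the long numeric prompt are taken from the diff above; treating generate_greedy as equivalent to temperature=0 sampling is an assumption.

    # Standalone sketch (assumes a vllm-ascend install on an Ascend NPU host).
    from vllm import LLM, SamplingParams

    # Same engine arguments as the new test's VllmRunner call;
    # quantization="ascend" selects the vllm-ascend quantization backend.
    llm = LLM(model="vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new",
              max_model_len=8192,
              enforce_eager=False,
              gpu_memory_utilization=0.7,
              quantization="ascend")

    # The test's long prompt: "The following numbers of the sequence 0, 1, ..., 1023 are:"
    prompt = "The following numbers of the sequence " + ", ".join(
        str(i) for i in range(1024)) + " are:"

    # Greedy decoding: temperature=0 makes the output deterministic,
    # mirroring what the helper's generate_greedy is assumed to do.
    params = SamplingParams(temperature=0, max_tokens=5)
    outputs = llm.generate([prompt], params)
    print(outputs[0].outputs[0].text)

The long prompt plus max_model_len=8192 gives the quantized models a non-trivial context to process, while max_tokens=5 keeps the check fast; the test only asserts that greedy generation completes, not the content of the output.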