Commit 05d5da5

remove old quantization model
Signed-off-by: 22dimensions <waitingwind@foxmail.com>
1 parent f1543d5 commit 05d5da5

2 files changed: +28 −1 lines changed


tests/conftest.py

Lines changed: 2 additions & 0 deletions
@@ -78,6 +78,7 @@ def __init__(
         enable_chunked_prefill: bool = False,
         swap_space: int = 4,
         enforce_eager: Optional[bool] = True,
+        quantization: Optional[str] = None,
         **kwargs,
     ) -> None:
         self.model = LLM(
@@ -94,6 +95,7 @@ def __init__(
             max_model_len=max_model_len,
             block_size=block_size,
             enable_chunked_prefill=enable_chunked_prefill,
+            quantization=quantization,
             **kwargs,
         )
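
The new keyword is forwarded unchanged to vllm.LLM, so a test can request quantized execution through the runner without extra plumbing. For illustration only, a minimal sketch of the equivalent direct call (the checkpoint path is a placeholder; "ascend" is the quantization method exercised by the new test below):

from vllm import LLM, SamplingParams

# Placeholder path to a local W8A8 checkpoint; substitute a real one.
llm = LLM(model="/path/to/Qwen2.5-0.5B-Instruct-W8A8",
          quantization="ascend",  # VllmRunner now forwards `quantization` exactly like this
          max_model_len=8192,
          enforce_eager=True)
outputs = llm.generate(["Hello, world"], SamplingParams(max_tokens=5))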

tests/singlecard/test_offline_inference.py

Lines changed: 26 additions & 1 deletion
@@ -25,6 +25,7 @@
 
 import pytest
 import vllm  # noqa: F401
+from modelscope import snapshot_download  # type: ignore[import-untyped]
 from vllm import SamplingParams
 from vllm.assets.image import ImageAsset
 
@@ -33,10 +34,14 @@
 
 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
-    "vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8",
     "Qwen/Qwen3-0.6B-Base",
 ]
 MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+
+QUANTIZATION_MODELS = [
+    "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new",
+    "vllm-ascend/DeepSeek-V2-Lite-W8A8"
+]
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 
 
@@ -59,6 +64,26 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
+@pytest.mark.parametrize("model", QUANTIZATION_MODELS)
+@pytest.mark.parametrize("max_tokens", [5])
+def test_quantization_models(model: str, max_tokens: int) -> None:
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
+
+    # NOTE: Loading the quantized model by its ModelScope repo id hits an issue;
+    # https://github.com/vllm-project/vllm/pull/19212 fixes it. Once that PR is
+    # merged, downloading the model explicitly will no longer be necessary.
+    model_path = snapshot_download(model)
+
+    with VllmRunner(model_path,
+                    max_model_len=8192,
+                    enforce_eager=True,
+                    gpu_memory_utilization=0.7,
+                    quantization="ascend") as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
 @pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
 def test_multimodal(model, prompt_template, vllm_runner):
     image = ImageAsset("cherry_blossom") \
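
For context, snapshot_download resolves a ModelScope repo id to a local cache directory and returns that path, so VllmRunner loads the weights from disk rather than by repo id, sidestepping the repo-id handling issue noted in the comment above. A minimal sketch of the workaround in isolation (the cache location depends on the local ModelScope configuration):

from modelscope import snapshot_download

# Download the quantized checkpoint (or reuse a cached copy) and return
# the local directory that vLLM can load directly.
model_path = snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new")
print(model_path)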
