Commit 85bb352

22dimensions committed
remove old quantization model
Signed-off-by: 22dimensions <waitingwind@foxmail.com>
1 parent 5903547 commit 85bb352

File tree: 2 files changed (+22 -1 lines changed)

  tests/conftest.py
  tests/singlecard/test_offline_inference.py


tests/conftest.py

Lines changed: 2 additions & 0 deletions

@@ -78,6 +78,7 @@ def __init__(
         enable_chunked_prefill: bool = False,
         swap_space: int = 4,
         enforce_eager: Optional[bool] = True,
+        quantization: Optional[str] = None,
         **kwargs,
     ) -> None:
         self.model = LLM(
@@ -94,6 +95,7 @@ def __init__(
             max_model_len=max_model_len,
             block_size=block_size,
             enable_chunked_prefill=enable_chunked_prefill,
+            quantization=quantization,
             **kwargs,
         )
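The new keyword is forwarded unchanged into vLLM's LLM constructor, so the same configuration can be reproduced outside the test harness. A minimal sketch of equivalent direct usage, assuming a working vllm-ascend install; the prompt and sampling settings here are illustrative, not part of the commit:

    from vllm import LLM, SamplingParams

    # Direct equivalent of what VllmRunner now builds when quantization="ascend"
    # is passed: the argument is handed straight through to vllm.LLM.
    llm = LLM(model="vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new",
              max_model_len=8192,
              enforce_eager=False,
              gpu_memory_utilization=0.7,
              quantization="ascend")

    # Greedy decoding, mirroring what generate_greedy does in the test helper.
    outputs = llm.generate(["The following numbers of the sequence 0, 1, 2 are:"],
                           SamplingParams(temperature=0.0, max_tokens=5))
    print(outputs[0].outputs[0].text)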

tests/singlecard/test_offline_inference.py

Lines changed: 20 additions & 1 deletion

@@ -31,10 +31,14 @@

 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
-    "vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8",
     "Qwen/Qwen3-0.6B-Base",
 ]
 MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+
+QUANTIZATION_MODELS = [
+    "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new",
+    "vllm-ascend/DeepSeek-V2-Lite-W8A8"
+]
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"


@@ -57,6 +61,21 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
         vllm_model.generate_greedy(example_prompts, max_tokens)


+@pytest.mark.parametrize("model", QUANTIZATION_MODELS)
+@pytest.mark.parametrize("max_tokens", [5])
+def test_quantization_models(model: str, max_tokens: int) -> None:
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
+
+    with VllmRunner(model,
+                    max_model_len=8192,
+                    enforce_eager=False,
+                    gpu_memory_utilization=0.7,
+                    quantization="ascend") as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
 @pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
 @pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "1",
                     reason="qwen2.5_vl is not supported on v1")
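The new test is parametrized over both quantized checkpoints. Assuming vllm-ascend and an Ascend NPU device are available, it could be selected on its own via pytest's programmatic entry point; this invocation is a sketch, not something the commit adds:

    # Run only the new quantized-model test; "-s" keeps model output visible.
    import pytest

    pytest.main(["-s", "tests/singlecard/test_offline_inference.py::test_quantization_models"])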
