
Commit ef99fe1

[Test] Clean up duplicate test for ascend scheduler (#1819)
There are some duplicate tests for the ascend scheduler. This PR removes them to make the tests clearer. After this PR, the singlecard e2e run time is reduced from 47 min to 46 min.

- vLLM version: v0.9.2
- vLLM main: vllm-project/vllm@1eb2b9c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent c66b082

File tree

5 files changed: +84 −500 lines

File renamed without changes.

tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py

This file was deleted (0 additions, 42 deletions).

tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py

This file was deleted (0 additions, 60 deletions).

tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler.py renamed to tests/e2e/singlecard/test_ascend_scheduler.py

84 additions, 1 deletion.
@@ -15,14 +15,17 @@
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
 
+from tests.e2e.conftest import VllmRunner
+from tests.e2e.model_utils import check_outputs_equal
 from vllm_ascend.core.scheduler import AscendScheduler
 from vllm_ascend.utils import vllm_version_is
 
 EOS_TOKEN_ID = 50256
+MODEL = "Qwen/Qwen3-0.6B"
 
 
 def create_scheduler(
-    model: str = "Qwen/Qwen2.5-0.5B-Instruct",
+    model: str = MODEL,
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 8192,
     enable_prefix_caching: Optional[bool] = None,
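
The VllmRunner and check_outputs_equal helpers imported above are exercised by the e2e tests appended in the next hunk. As a quick orientation, the following minimal sketch (not part of the commit) shows the additional_config dict those tests pass to turn on the Ascend scheduler; the values are copied from the hunk below, while VllmRunner's keyword arguments are assumed to match tests/e2e/conftest.py.

    # Minimal sketch: enabling the Ascend scheduler via additional_config,
    # mirroring the tests added in the hunk below. VllmRunner comes from the
    # repo's e2e conftest; its exact signature is assumed, not verified here.
    from tests.e2e.conftest import VllmRunner

    ascend_config = {
        "ascend_scheduler_config": {
            "enabled": True,
            # "enable_chunked_prefill": True,  # only set in the chunked-prefill test
        },
    }

    with VllmRunner("Qwen/Qwen3-0.6B",
                    additional_config=ascend_config,
                    enforce_eager=True,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        outputs = vllm_model.model.generate(["Hello my name is Robert and I"])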
@@ -733,3 +736,83 @@ def test_memory_leak():
 
     # Confirm no memory leak.
     assert_scheduler_empty(scheduler)
+
+
+def test_concurrent_partial_prefill():
+    with VllmRunner(MODEL,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True,
+                        },
+                    },
+                    max_num_seqs=3,
+                    max_num_batched_tokens=200,
+                    enforce_eager=True,
+                    max_model_len=2048,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
+                                            3)
+        assert len(outputs) == 3
+        for output in outputs:
+            assert len(output.outputs) == 1
+
+
+def test_prefix_cache_stats_is_recorded():
+    with VllmRunner(MODEL,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True,
+                        },
+                    },
+                    max_num_seqs=3,
+                    max_num_batched_tokens=200,
+                    enforce_eager=True,
+                    max_model_len=2048,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        # 129 tokens make sure the first 128 tokens are cached in a block
+        input_tokens = {"prompt_token_ids": [101] * 129}
+        _ = vllm_model.model.generate([input_tokens])
+        outputs = vllm_model.model.generate([input_tokens])
+        assert outputs[0].num_cached_tokens == 128
+
+
+@pytest.mark.parametrize("max_tokens",
+                         [4])  # cannot align results when max_tokens > 4
+@pytest.mark.parametrize("chunked_prefill_token_size", [16])
+def test_chunked_prefill_with_ascend_scheduler(
+        example_prompts, max_tokens: int,
+        chunked_prefill_token_size: int) -> None:
+    max_num_seqs = chunked_prefill_token_size
+    max_num_batched_tokens = chunked_prefill_token_size
+    with VllmRunner(MODEL,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True,
+                            'enable_chunked_prefill': True,
+                        },
+                    },
+                    max_num_seqs=max_num_seqs,
+                    max_num_batched_tokens=max_num_batched_tokens,
+                    enforce_eager=True,
+                    max_model_len=2048,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        chunked_prefill_output = vllm_model.generate_greedy(
+            example_prompts, max_tokens)
+
+    with VllmRunner(MODEL,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True,
+                        },
+                    },
+                    enforce_eager=True,
+                    max_model_len=2048,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=vllm_output,
+        outputs_1_lst=chunked_prefill_output,
+        name_0="vllm_output",
+        name_1="chunked_prefill_output",
+    )
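
The num_cached_tokens == 128 assertion in test_prefix_cache_stats_is_recorded relies on block-aligned prefix caching: only complete KV-cache blocks can be reused, so a 129-token prompt leaves exactly 128 cacheable tokens on the second run. A tiny sketch of that arithmetic follows; the 128-token block size is an assumption inferred from the test, not stated in the diff.

    # Sketch of the prefix-cache arithmetic assumed by the test above.
    BLOCK_SIZE = 128   # assumed KV-cache block size on this backend
    prompt_len = 129   # prompt used in the test: [101] * 129

    # Only fully filled blocks can be reused as cached prefix, so the reusable
    # token count is the prompt length rounded down to a block boundary.
    cached_tokens = (prompt_len // BLOCK_SIZE) * BLOCK_SIZE
    assert cached_tokens == 128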
