 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
 
+from tests.e2e.conftest import VllmRunner
+from tests.e2e.model_utils import check_outputs_equal
 from vllm_ascend.core.scheduler import AscendScheduler
 from vllm_ascend.utils import vllm_version_is
 
 EOS_TOKEN_ID = 50256
+MODEL = "Qwen/Qwen3-0.6B"
 
 
 def create_scheduler(
-    model: str = "Qwen/Qwen2.5-0.5B-Instruct",
+    model: str = MODEL,
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 8192,
     enable_prefix_caching: Optional[bool] = None,
@@ -733,3 +736,83 @@ def test_memory_leak():
 
     # Confirm no memory leak.
     assert_scheduler_empty(scheduler)
+
+
+def test_concurrent_partial_prefill():
+    with VllmRunner(MODEL,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True,
+                        },
+                    },
+                    max_num_seqs=3,
+                    max_num_batched_tokens=200,
+                    enforce_eager=True,
+                    max_model_len=2048,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
+                                            3)
+        assert len(outputs) == 3
+        for output in outputs:
+            assert len(output.outputs) == 1
+
+
+def test_prefix_cache_stats_is_recorded():
+    with VllmRunner(MODEL,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True,
+                        },
+                    },
+                    max_num_seqs=3,
+                    max_num_batched_tokens=200,
+                    enforce_eager=True,
+                    max_model_len=2048,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        # 129 tokens will make sure the first 128 tokens are cached in a block
+        input_tokens = {"prompt_token_ids": [101] * 129}
+        _ = vllm_model.model.generate([input_tokens])
+        outputs = vllm_model.model.generate([input_tokens])
+        assert outputs[0].num_cached_tokens == 128
+
+
+@pytest.mark.parametrize("max_tokens",
+                         [4])  # cannot align results when max_tokens > 4
+@pytest.mark.parametrize("chunked_prefill_token_size", [16])
+def test_chunked_prefill_with_ascend_scheduler(
+        example_prompts, max_tokens: int,
+        chunked_prefill_token_size: int) -> None:
+    max_num_seqs = chunked_prefill_token_size
+    max_num_batched_tokens = chunked_prefill_token_size
+    with VllmRunner(MODEL,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True,
+                            'enable_chunked_prefill': True,
+                        },
+                    },
+                    max_num_seqs=max_num_seqs,
+                    max_num_batched_tokens=max_num_batched_tokens,
+                    enforce_eager=True,
+                    max_model_len=2048,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        chunked_prefill_output = vllm_model.generate_greedy(
+            example_prompts, max_tokens)
+
+    with VllmRunner(MODEL,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True,
+                        },
+                    },
+                    enforce_eager=True,
+                    max_model_len=2048,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=vllm_output,
+        outputs_1_lst=chunked_prefill_output,
+        name_0="vllm_output",
+        name_1="chunked_prefill_output",
+    )