@@ -16,6 +16,7 @@
 from vllm.v1.structured_output import StructuredOutputManager
 
 from vllm_ascend.core.scheduler import AscendScheduler
+from vllm_ascend.utils import vllm_version_is
 
 EOS_TOKEN_ID = 50256
 
@@ -97,7 +98,11 @@ def create_scheduler(
     )
     kv_cache_config = KVCacheConfig(
         num_blocks=num_blocks,  # A large number of blocks to hold all requests
-        kv_cache_tensors=[],
+        **({
+            "tensors": {}
+        } if vllm_version_is("0.9.0") else {
+            "kv_cache_tensors": []
+        }),
         kv_cache_groups=[
             KVCacheGroupSpec(['layer'],
                              FullAttentionSpec(block_size, 1, 1, torch.float32,
@@ -139,6 +144,9 @@ def create_requests(num_requests: int,
             multi_modal_placeholders=mm_position,
             multi_modal_hashes=None,
             eos_token_id=EOS_TOKEN_ID,
+            **({
+                "arrival_time": 0.0
+            } if vllm_version_is("0.9.0") else {}),
         )
         requests.append(request)
     return requests
@@ -557,6 +565,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
     1. Speculated tokens get scheduled correctly
     2. Spec decoding stats properly count number of draft and accepted tokens
     """
+    if vllm_version_is("0.9.0"):
+        return
     num_spec_tokens = max(1, max(len(t) for t in spec_tokens))
     scheduler = create_scheduler(num_speculative_tokens=num_spec_tokens)
     requests = create_requests(num_requests=len(spec_tokens), num_tokens=1)
@@ -734,11 +744,12 @@ def assert_scheduler_empty(scheduler: AscendScheduler):
     assert len(scheduler.encoder_cache_manager.cached) == 0
 
     # KVCache Manager.
-    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
-               req_to_blocks) == 0
+    if not vllm_version_is("0.9.0"):
+        assert len(scheduler.kv_cache_manager.coordinator.
+                   single_type_managers[0].req_to_blocks) == 0
+        assert len(scheduler.kv_cache_manager.coordinator.
+                   single_type_managers[0].num_cached_block) == 0
     assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0
-    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
-               num_cached_block) == 0
     num_free_blocks = (
         scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks)
     assert num_free_blocks == (
@@ -748,10 +759,6 @@ def assert_scheduler_empty(scheduler: AscendScheduler):
     # value, etc will remain since we lazily evict for prefix cache.
     for block in scheduler.kv_cache_manager.block_pool.blocks:
         assert block.ref_cnt == 0
-    # assert block._block_hash is None
-    # assert (
-    #     len(scheduler.kv_cache_manager.block_pool.cached_block_hash_to_block
-    #     ) == 0)
 
 
 def test_memory_leak():
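
For context: every hunk above keys off `vllm_version_is`, imported from `vllm_ascend.utils`, to select the kwargs and assertions that match the installed vLLM release. Below is a minimal, hypothetical sketch of that gating pattern, assuming the helper simply compares the installed vLLM version string against a target; it is not the vllm_ascend implementation, and `kv_cache_kwargs` is an illustrative name.

```python
from importlib.metadata import version


def vllm_version_is(target: str) -> bool:
    # Hypothetical stand-in for vllm_ascend.utils.vllm_version_is:
    # True when the installed vLLM release string equals the target.
    return version("vllm") == target


# Usage mirroring the diff: pick the KVCacheConfig keyword that the
# installed vLLM expects ("tensors" on 0.9.0, "kv_cache_tensors" afterwards).
kv_cache_kwargs = ({
    "tensors": {}
} if vllm_version_is("0.9.0") else {
    "kv_cache_tensors": []
})
```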