@@ -16,6 +16,7 @@
 from vllm.v1.structured_output import StructuredOutputManager
 
 from vllm_ascend.core.scheduler import AscendScheduler
+from vllm_ascend.utils import vllm_version_is
 
 EOS_TOKEN_ID = 50256
 
@@ -97,7 +98,7 @@ def create_scheduler(
     )
     kv_cache_config = KVCacheConfig(
         num_blocks=num_blocks,  # A large number of blocks to hold all requests
-        kv_cache_tensors=[],
+        **({"tensors": {}} if vllm_version_is("0.9.0") else {"kv_cache_tensors": []}),
         kv_cache_groups=[
             KVCacheGroupSpec(['layer'],
                              FullAttentionSpec(block_size, 1, 1, torch.float32,
@@ -139,6 +140,7 @@ def create_requests(num_requests: int,
             multi_modal_placeholders=mm_position,
             multi_modal_hashes=None,
             eos_token_id=EOS_TOKEN_ID,
+            **({"arrival_time": 0.0} if vllm_version_is("0.9.0") else {}),
         )
         requests.append(request)
     return requests
@@ -557,6 +559,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
     1. Speculated tokens get scheduled correctly
     2. Spec decoding stats properly count number of draft and accepted tokens
     """
+    if vllm_version_is("0.9.0"):
+        return
     num_spec_tokens = max(1, max(len(t) for t in spec_tokens))
     scheduler = create_scheduler(num_speculative_tokens=num_spec_tokens)
     requests = create_requests(num_requests=len(spec_tokens), num_tokens=1)
@@ -734,11 +738,12 @@ def assert_scheduler_empty(scheduler: AscendScheduler):
     assert len(scheduler.encoder_cache_manager.cached) == 0
 
     # KVCache Manager.
-    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
-               req_to_blocks) == 0
+    if not vllm_version_is("0.9.0"):
+        assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
+                   req_to_blocks) == 0
+        assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
+                   num_cached_block) == 0
     assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0
-    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
-               num_cached_block) == 0
     num_free_blocks = (
         scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks)
     assert num_free_blocks == (
@@ -748,10 +753,6 @@ def assert_scheduler_empty(scheduler: AscendScheduler):
     # value, etc will remain since we lazily evict for prefix cache.
     for block in scheduler.kv_cache_manager.block_pool.blocks:
         assert block.ref_cnt == 0
-        # assert block._block_hash is None
-    # assert (
-    #     len(scheduler.kv_cache_manager.block_pool.cached_block_hash_to_block
-    #     ) == 0)
 
 
 def test_memory_leak():
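The `**({...} if vllm_version_is("0.9.0") else {...})` splat used in the hunks above keeps a single call site compatible with both the 0.9.0 and newer vLLM constructor signatures. Below is a minimal, self-contained sketch of that pattern; `installed_version` and `KVCacheConfigStub` are hypothetical stand-ins (the real helper is `vllm_ascend.utils.vllm_version_is` and the real class is vLLM's `KVCacheConfig`).

# Minimal sketch of the version-gated keyword pattern from the diff above.
# `installed_version` and `KVCacheConfigStub` are hypothetical stand-ins.

installed_version = "0.9.1"  # pretend this came from vllm.__version__


def vllm_version_is(target: str) -> bool:
    # Assumed behavior: exact match against the installed vLLM version string.
    return installed_version == target


class KVCacheConfigStub:
    """Accepts either the old `tensors` or the new `kv_cache_tensors` kwarg."""

    def __init__(self, **kwargs):
        self.kwargs = kwargs


# The call site stays identical across versions: the dict splat selects the
# keyword that the installed vLLM release actually understands.
config = KVCacheConfigStub(
    **({"tensors": {}} if vllm_version_is("0.9.0") else {"kv_cache_tensors": []}))
print(config.kwargs)  # -> {'kv_cache_tensors': []} when not on 0.9.0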