@@ -159,7 +159,7 @@ class FlashInferMetadata:
     # (batch_size + 1,). The cumulative subquery lengths of the sequences in
     # the batch, used to index into subquery. E.g., if the subquery length
     # is [4, 6], it is [0, 4, 10].
-    qo_indptr: torch.Tensor
+    qo_indptr_cpu: torch.Tensor
     # An example for paged_kv_indices, paged_kv_indptr:
     # request 1, page indices [0, 5, 8]
     # request 2, page indices [1, 6, 7]
@@ -213,14 +213,6 @@ class FlashInferMetadata:
     decode_wrapper: Optional[BatchDecodeWithPagedKVCacheWrapper] = None
     cascade_wrapper: Optional[MultiLevelCascadeAttentionWrapper] = None

-    # CPU version for FlashInfer planning
-    qo_indptr_cpu: Optional[torch.Tensor] = None
-
-    @property
-    def query_start_loc(self):
-        # The GPUModelRunner expects to be able to access this property.
-        return self.qo_indptr
-
     def __post_init__(self):
         if self.head_dim is not None:
             FlashInferBackend.validate_head_size(self.head_dim)
@@ -396,7 +388,6 @@ def build(self,
             split_decodes_and_prefills(common_attn_metadata)

         page_size = self.kv_cache_spec.block_size
-        qo_indptr = common_attn_metadata.query_start_loc
         max_seq_len = common_attn_metadata.seq_lens_cpu.max()
         seq_lens = common_attn_metadata.seq_lens
         seq_lens_cpu = common_attn_metadata.seq_lens_cpu
@@ -457,7 +448,6 @@ def build(self,
         kv_cache_dtype = self.kv_cache_spec.dtype
         attn_metadata = FlashInferMetadata(
             num_actual_tokens=num_actual_tokens,
-            qo_indptr=qo_indptr,
             qo_indptr_cpu=common_attn_metadata.query_start_loc_cpu,
             paged_kv_indptr_cpu=paged_kv_indptr_cpu,
             paged_kv_indices=paged_kv_indices,
0 commit comments