@@ -159,7 +159,7 @@ class FlashInferMetadata:
     # (batch_size + 1,). The cumulative subquery lengths of the sequences in
     # the batch, used to index into subquery. E.g., if the subquery length
     # is [4, 6], it is [0, 4, 10].
-    qo_indptr: torch.Tensor
+    qo_indptr_cpu: torch.Tensor
     # An example for paged_kv_indices, paged_kv_indptr:
     # request 1, page indices [0, 5, 8]
     # request 2, page indices [1, 6, 7]
@@ -213,14 +213,6 @@ class FlashInferMetadata:
     decode_wrapper: Optional[BatchDecodeWithPagedKVCacheWrapper] = None
     cascade_wrapper: Optional[MultiLevelCascadeAttentionWrapper] = None

-    # CPU version for FlashInfer planning
-    qo_indptr_cpu: Optional[torch.Tensor] = None
-
-    @property
-    def query_start_loc(self):
-        # The GPUModelRunner expects to be able to access this property.
-        return self.qo_indptr
-
     def __post_init__(self):
         if self.head_dim is not None:
             FlashInferBackend.validate_head_size(self.head_dim)
@@ -396,7 +388,6 @@ def build(self,
             split_decodes_and_prefills(common_attn_metadata)

         page_size = self.kv_cache_spec.block_size
-        qo_indptr = common_attn_metadata.query_start_loc
         max_seq_len = common_attn_metadata.seq_lens_cpu.max()
         seq_lens = common_attn_metadata.seq_lens
         seq_lens_cpu = common_attn_metadata.seq_lens_cpu
@@ -457,7 +448,6 @@ def build(self,
         kv_cache_dtype = self.kv_cache_spec.dtype
         attn_metadata = FlashInferMetadata(
             num_actual_tokens=num_actual_tokens,
-            qo_indptr=qo_indptr,
             qo_indptr_cpu=common_attn_metadata.query_start_loc_cpu,
             paged_kv_indptr_cpu=paged_kv_indptr_cpu,
             paged_kv_indices=paged_kv_indices,
0 commit comments