Commit addcaae

v1: Add Whisper model support (encoder-decoder)
This brings Whisper support to V1 to close one of the remaining feature gaps with V0. Most of the changes apply to encoder-decoder models generally, though Whisper is the only encoder-decoder model explicitly tested here and the only one updated to support V1.

**Whisper Model Implementation:**

- Remove the SupportsV0Only interface constraint to enable V1 compatibility
- Update get_multimodal_embeddings() to return the list format required by V1

**Flash Attention Backend:**

- Add encoder attention metadata fields (encoder_seq_start_loc, max_encoder_seq_len, cross_slot_mapping)
- Implement encoder self-attention support without using the KV cache
- Add cross-attention support for encoder-decoder models with proper KV cache handling

**KV Cache Manager:**

- Introduce CrossAttentionManager for handling the cross-attention KV cache in encoder-decoder models
- Add CrossAttentionSpec for cross-attention cache specification with encoder-based sizing
- Implement allocate_slots_for_cross_attn() for static encoder-length-based allocation
- Add cross-attention block allocation logic separate from decoder token growth

**Scheduler:**

- Disable prefix caching for encoder-decoder models
- Implement cross-attention block allocation during request scheduling
- Add cross-attention block tracking in state management

**GPU Model Runner:**

- Add encoder input extraction for audio feature processing
- Implement encoder attention metadata building for both self-attention and cross-attention
- Add cross-attention KV cache group handling with proper slot mapping
- Modify input batch creation to accommodate encoder sequence lengths
- Add encoder input processing in the forward pass with proper device/dtype handling
- Update profiling and memory management for encoder-decoder models

The implementation maintains backward compatibility while adding comprehensive encoder-decoder support, with particular focus on Whisper's audio processing pipeline and the cross-attention between encoder and decoder.

Related to:

- V0 deprecation: #18571
- 2025 Q3 roadmap: #20336

Signed-off-by: Russell Bryant <rbryant@redhat.com>
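As background for the cross-attention cache changes described above, here is a minimal sketch (not vLLM code) of the sizing rule the commit relies on: decoder self-attention KV blocks grow as tokens are generated, while cross-attention KV blocks are a single static allocation sized by the encoder length. The 16-token block size below is an illustrative default, not taken from this commit.

```python
from math import ceil

BLOCK_SIZE = 16  # illustrative paged-KV block size


def decoder_blocks(num_prompt_tokens: int, num_generated_tokens: int) -> int:
    # Decoder self-attention KV keeps growing as decoding proceeds.
    return ceil((num_prompt_tokens + num_generated_tokens) / BLOCK_SIZE)


def cross_attn_blocks(max_encoder_len: int) -> int:
    # Cross-attention KV stores the encoder K/V exactly once, so it can be
    # allocated up front and never grows with decoder tokens.
    return ceil(max_encoder_len / BLOCK_SIZE)


# Whisper's encoder produces at most 1500 positions (max_source_positions),
# so under this block size each request needs a fixed ceil(1500 / 16) = 94
# cross-attention blocks, regardless of output length.
assert cross_attn_blocks(1500) == 94
```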
1 parent f0caa0b commit addcaae

File tree

13 files changed (+541 lines, -78 lines)

vllm/attention/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -14,7 +14,6 @@
     "AttentionMetadata",
     "AttentionType",
     "AttentionMetadataBuilder",
-    "Attention",
     "AttentionState",
     "get_attn_backend",
 ]

vllm/inputs/preprocess.py

Lines changed: 0 additions & 6 deletions

@@ -869,9 +869,6 @@ def preprocess(
     ) -> ProcessorInputs:
         """Preprocess the input prompt."""
         if self.model_config.is_encoder_decoder:
-            assert not return_mm_hashes, (
-                "Multimodal hashes for encoder-decoder models should not be ",
-                "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
             return self._process_encoder_decoder_prompt(
@@ -903,9 +900,6 @@ async def preprocess_async(
         [`preprocess`][vllm.inputs.preprocess.InputPreprocessor.preprocess].
         """
         if self.model_config.is_encoder_decoder:
-            assert not return_mm_hashes, (
-                "Multimodal hashes for encoder-decoder models should not be ",
-                "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
             return await self._process_encoder_decoder_prompt_async(prompt)

vllm/model_executor/models/whisper.py

Lines changed: 4 additions & 5 deletions

@@ -42,7 +42,7 @@
 from vllm.transformers_utils.processor import cached_get_processor
 
 from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
-                         SupportsTranscription, SupportsV0Only)
+                         SupportsTranscription)
 from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors,
                     make_layers)
 
@@ -790,7 +790,7 @@ def _get_prompt_updates(
                                         info=WhisperProcessingInfo,
                                         dummy_inputs=WhisperDummyInputsBuilder)
 class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
-                                      SupportsMultiModal, SupportsV0Only):
+                                      SupportsMultiModal):
     packed_modules_mapping = {
         "self_attn.qkv_proj": [
             "self_attn.q_proj",
@@ -916,10 +916,9 @@ def get_language_model(self) -> torch.nn.Module:
 
     def get_multimodal_embeddings(self,
                                   **kwargs: object) -> MultiModalEmbeddings:
-        # TODO: This method does not obey the interface for SupportsMultiModal.
-        # Refactor this once encoder/decoder support is implemented in V1.
+        # Required as part of SupportsMultiModal interface.
         audio_input = self._parse_and_validate_audio_input(**kwargs)
-        return self.model.get_encoder_outputs(audio_input["input_features"])
+        return [self.model.get_encoder_outputs(audio_input["input_features"])]
 
     def get_input_embeddings(
         self,
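The get_multimodal_embeddings() change above is a return-type change: V1's multimodal path expects a sequence of embeddings, one entry per multimodal item, rather than a bare tensor. A hedged sketch of the distinction (shapes are illustrative, not taken from the model code):

```python
import torch

# Illustrative encoder output for one audio clip:
# (encoder positions, hidden size) -- the exact shape comes from the model.
encoder_out = torch.randn(1500, 1280)

# V0-style return: the tensor itself.
v0_embeddings = encoder_out

# V1-style return, as in the updated method: a list with one entry per
# multimodal item (here, a single audio input).
v1_embeddings = [encoder_out]

for item in v1_embeddings:
    print(item.shape)
```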

vllm/v1/attention/backends/flash_attn.py

Lines changed: 38 additions & 19 deletions

@@ -131,6 +131,15 @@ class FlashAttentionMetadata:
     max_num_splits: int = 0
 
     causal: bool = True
+    # Begin encoder attn & enc/dec cross-attn fields...
+
+    # (batch_size + 1,). The cumulative sequence lengths of the encoder
+    # sequences in the batch, used to index into sequence. E.g., if the sequence
+    # length is [4, 6], it is [0, 4, 10].
+    encoder_seq_start_loc: Optional[torch.Tensor] = None
+    # Maximum sequence length among encoder sequences
+    max_encoder_seq_len: Optional[int] = None
+    cross_slot_mapping: Optional[torch.Tensor] = None
 
 
 def _get_sliding_window_configs(
@@ -209,7 +218,13 @@ def build(self,
         num_reqs = common_attn_metadata.num_reqs
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len
-        max_seq_len = int(common_attn_metadata.seq_lens_cpu.max())
+
+        if (common_attn_metadata.cross_slot_mapping is not None
+                and common_attn_metadata.max_encoder_seq_len is not None):
+            # ENCODER_DECODER cross-attention
+            max_seq_len = common_attn_metadata.max_encoder_seq_len
+        else:
+            max_seq_len = int(common_attn_metadata.seq_lens_cpu.max())
         query_start_loc = common_attn_metadata.query_start_loc
         seq_lens = common_attn_metadata.seq_lens
         seq_lens_cpu = common_attn_metadata.seq_lens_cpu
@@ -329,7 +344,12 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
             suffix_kv_lens=suffix_kv_lens,
             prefix_scheduler_metadata=prefix_scheduler_metadata,
             max_num_splits=max_num_splits,
-            causal=causal)
+            causal=causal,
+            # Encoder/cross-attention fields
+            encoder_seq_start_loc=common_attn_metadata.encoder_seq_start_loc,
+            max_encoder_seq_len=common_attn_metadata.max_encoder_seq_len,
+            cross_slot_mapping=common_attn_metadata.cross_slot_mapping,
+        )
         return attn_metadata
 
     def can_run_in_cudagraph(
@@ -378,13 +398,6 @@ def __init__(
 
         FlashAttentionBackend.validate_head_size(head_size)
 
-        if attn_type not in [
-                AttentionType.DECODER, AttentionType.ENCODER_ONLY
-        ]:
-            raise NotImplementedError("Encoder/decoder cross-attention "
-                                      "is not implemented for "
-                                      "FlashAttentionImpl")
-
         self.attn_type = attn_type
         self.vllm_flash_attn_version = get_flash_attn_version()
         if is_quantized_kv_cache(self.kv_cache_dtype) \
@@ -442,7 +455,7 @@ def forward(
         num_actual_tokens = attn_metadata.num_actual_tokens
 
         # Handle encoder attention differently - no KV cache needed
-        if attn_type in (AttentionType.ENCODER_ONLY, ):
+        if attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER):
            # For encoder attention,
            # we use direct Q, K, V tensors without caching
            return self._forward_encoder_attention(query[:num_actual_tokens],
@@ -454,20 +467,26 @@ def forward(
        # For decoder and cross-attention, use KV cache as before
        key_cache, value_cache = kv_cache.unbind(0)
 
-        if self.kv_sharing_target_layer_name is None:
+        if (self.kv_sharing_target_layer_name is None and (key is not None)
+                and (value is not None)):
            # Reshape the input keys and values and store them in the cache.
            # Skip this if sharing KV cache with an earlier attention layer.
            # NOTE(woosuk): Here, key and value are padded while slot_mapping is
            # not padded. However, we don't need to do key[:num_actual_tokens]
            # and value[:num_actual_tokens] because the reshape_and_cache_flash
            # op uses the slot_mapping's shape to determine the number of
            # actual tokens.
+            if attn_type == AttentionType.ENCODER_DECODER:
+                updated_slot_mapping = attn_metadata.cross_slot_mapping
+            else:
+                updated_slot_mapping = attn_metadata.slot_mapping
+
            reshape_and_cache_flash(
                key,
                value,
                key_cache,
                value_cache,
-                attn_metadata.slot_mapping,
+                updated_slot_mapping,
                self.kv_cache_dtype,
                layer._k_scale,
                layer._v_scale,
@@ -491,7 +510,7 @@ def forward(
            block_table = attn_metadata.block_table
            scheduler_metadata = attn_metadata.scheduler_metadata
 
-            descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1])
+            descale_shape = (cu_seqlens_q.shape[0] - 1, self.num_kv_heads)
 
            flash_attn_varlen_func(
                q=query[:num_actual_tokens],
@@ -510,9 +529,9 @@ def forward(
                softcap=self.logits_soft_cap,
                scheduler_metadata=scheduler_metadata,
                fa_version=self.vllm_flash_attn_version,
-                q_descale=layer._q_scale.expand(descale_shape),
-                k_descale=layer._k_scale.expand(descale_shape),
-                v_descale=layer._v_scale.expand(descale_shape),
+                q_descale=layer._q_scale,
+                k_descale=layer._k_scale,
+                v_descale=layer._v_scale,
                num_splits=attn_metadata.max_num_splits,
            )
            return output
@@ -538,9 +557,9 @@ def forward(
            fa_version=self.vllm_flash_attn_version,
            prefix_scheduler_metadata=attn_metadata.prefix_scheduler_metadata,
            suffix_scheduler_metadata=attn_metadata.scheduler_metadata,
-            q_descale=layer._q_scale.expand(descale_shape),
-            k_descale=layer._k_scale.expand(descale_shape),
-            v_descale=layer._v_scale.expand(descale_shape),
+            q_descale=layer._q_scale.expand(descale_shape),
+            k_descale=layer._k_scale.expand(descale_shape),
+            v_descale=layer._v_scale.expand(descale_shape),
        )
        return output
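The encoder_seq_start_loc field added above follows the usual cumulative-lengths convention for variable-length attention. A small sketch of how such a tensor can be derived from per-request encoder lengths (an illustration, not the metadata builder's code):

```python
import torch

# Per-request encoder sequence lengths for a toy batch of two requests.
encoder_seq_lens = torch.tensor([4, 6], dtype=torch.int32)

# (batch_size + 1,) cumulative start locations -> [0, 4, 10], matching the
# example in the FlashAttentionMetadata comment above.
encoder_seq_start_loc = torch.cat([
    torch.zeros(1, dtype=torch.int32),
    torch.cumsum(encoder_seq_lens, dim=0, dtype=torch.int32),
])

max_encoder_seq_len = int(encoder_seq_lens.max())  # 6

print(encoder_seq_start_loc)  # tensor([ 0,  4, 10], dtype=torch.int32)
```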

vllm/v1/attention/backends/utils.py

Lines changed: 8 additions & 0 deletions

@@ -61,6 +61,14 @@ class CommonAttentionMetadata:
 
     causal: bool
 
+    # Encoder/cross-attention specific fields (optional)
+    encoder_seq_start_loc: Optional[torch.Tensor] = None
+    """(batch_size + 1,), cumulative encoder sequence lengths"""
+    max_encoder_seq_len: Optional[int] = None
+    """Maximum encoder sequence length in batch"""
+    cross_slot_mapping: Optional[torch.Tensor] = None
+    """Slot mapping for cross-attention KV cache"""
+
 
 M = TypeVar("M")

vllm/v1/core/kv_cache_coordinator.py

Lines changed: 22 additions & 9 deletions

@@ -6,7 +6,7 @@
 from vllm.v1.core.block_pool import BlockPool
 from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
 from vllm.v1.core.single_type_kv_cache_manager import (
-    FullAttentionManager, get_manager_for_kv_cache_spec)
+    CrossAttentionManager, FullAttentionManager, get_manager_for_kv_cache_spec)
 from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig
 from vllm.v1.request import Request
 
@@ -43,9 +43,12 @@ def __init__(
         ) for i, kv_cache_group in enumerate(
             self.kv_cache_config.kv_cache_groups))
 
-    def get_num_blocks_to_allocate(
-            self, request_id: str, num_tokens: int,
-            new_computed_blocks: tuple[list[KVCacheBlock], ...]) -> int:
+    def get_num_blocks_to_allocate(self,
+                                   request_id: str,
+                                   num_tokens: int,
+                                   new_computed_blocks: tuple[
+                                       list[KVCacheBlock], ...],
+                                   cross_attn: bool = False) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
 
@@ -61,8 +64,14 @@ def get_num_blocks_to_allocate(
         """
         num_blocks_to_allocate = 0
         for i, manager in enumerate(self.single_type_managers):
-            num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
-                request_id, num_tokens, new_computed_blocks[i])
+            if cross_attn and isinstance(manager, CrossAttentionManager):
+                # For cross-attention, we issue a single static allocation
+                # of blocks based on the number of encoder input tokens.
+                num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
+                    request_id, num_tokens, [])
+            elif not cross_attn:
+                num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
+                    request_id, num_tokens, new_computed_blocks[i])
         return num_blocks_to_allocate
 
     def save_new_computed_blocks(
@@ -80,8 +89,11 @@ def save_new_computed_blocks(
             manager.save_new_computed_blocks(request_id,
                                              new_computed_blocks[i])
 
-    def allocate_new_blocks(self, request_id: str,
-                            num_tokens: int) -> tuple[list[KVCacheBlock], ...]:
+    def allocate_new_blocks(
+            self,
+            request_id: str,
+            num_tokens: int,
+            cross_attn: bool = False) -> tuple[list[KVCacheBlock], ...]:
         """
         Allocate new blocks for the request to give it at least `num_tokens`
         token slots.
@@ -95,7 +107,8 @@ def allocate_new_blocks(self, request_id: str,
             The new allocated blocks.
         """
         return tuple(
-            manager.allocate_new_blocks(request_id, num_tokens)
+            (manager.allocate_new_blocks(request_id, num_tokens) if isinstance(
+                manager, CrossAttentionManager) == cross_attn else [])
             for manager in self.single_type_managers)
 
     def cache_blocks(self, request: Request, block_hashes: list[BlockHash],
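The `isinstance(manager, CrossAttentionManager) == cross_attn` test above routes each allocation to either the cross-attention manager or the other managers, never both. A self-contained toy (toy classes, not the vLLM managers) showing the selection behavior:

```python
class ToyManager:
    def allocate_new_blocks(self, request_id: str, num_tokens: int) -> list[int]:
        return [num_tokens]  # stand-in for real KV cache blocks


class ToyCrossAttentionManager(ToyManager):
    pass


managers = [ToyManager(), ToyCrossAttentionManager()]


def allocate(cross_attn: bool) -> tuple[list[int], ...]:
    # Mirrors the coordinator: a manager participates only when its
    # "is cross-attention" property matches the cross_attn flag.
    return tuple(
        (m.allocate_new_blocks("req-0", 8)
         if isinstance(m, ToyCrossAttentionManager) == cross_attn else [])
        for m in managers)


print(allocate(cross_attn=False))  # ([8], []) -> decoder managers only
print(allocate(cross_attn=True))   # ([], [8]) -> cross-attention manager only
```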

vllm/v1/core/kv_cache_manager.py

Lines changed: 39 additions & 0 deletions

@@ -307,6 +307,45 @@ def allocate_slots(
 
         return KVCacheBlocks(new_blocks)
 
+    def allocate_slots_for_cross_attn(
+        self,
+        request: Request,
+        num_encoder_tokens: int,
+    ) -> Optional[KVCacheBlocks]:
+        """Add slots for cross-attention blocks.
+
+        This is separate from the main `allocate_slots` function because
+        cross-attention blocks are allocated based on the max encoder length,
+        which is a static value. The number of blocks to allocate is not
+        affected by the number of decoder tokens.
+
+        Args:
+            request: The request to allocate slots.
+            num_encoder_tokens: The number of tokens sent to the encoder.
+
+        Returns:
+            A list of new allocated blocks.
+        """
+        if num_encoder_tokens == 0:
+            raise ValueError("num_encoder_tokens must be greater than 0")
+
+        num_blocks_to_allocate = self.coordinator.get_num_blocks_to_allocate(
+            request_id=request.request_id,
+            num_tokens=num_encoder_tokens,
+            new_computed_blocks=tuple(),
+            cross_attn=True,
+        )
+
+        if num_blocks_to_allocate > self.block_pool.get_num_free_blocks():
+            # Cannot allocate new blocks
+            return None
+
+        new_blocks = self.coordinator.allocate_new_blocks(request.request_id,
+                                                          num_encoder_tokens,
+                                                          cross_attn=True)
+
+        return KVCacheBlocks(new_blocks)
+
     def free(self, request: Request) -> None:
         """Free the blocks allocated for the request.
         We free the blocks in reverse order so that he tail blocks are evicted
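A toy model of the allocate_slots_for_cross_attn() contract shown above, under the assumption of a simple free-block counter: the allocation happens once per request, sized by the encoder length, and a None return signals that the request cannot be scheduled yet. This is a sketch, not the KVCacheManager implementation.

```python
from typing import Optional


class ToyCrossAttnAllocator:
    """Toy stand-in for the new allocate_slots_for_cross_attn() behavior."""

    def __init__(self, num_free_blocks: int, block_size: int = 16):
        self.num_free_blocks = num_free_blocks
        self.block_size = block_size

    def allocate_slots_for_cross_attn(
            self, num_encoder_tokens: int) -> Optional[list[int]]:
        if num_encoder_tokens == 0:
            raise ValueError("num_encoder_tokens must be greater than 0")
        needed = -(-num_encoder_tokens // self.block_size)  # ceiling division
        if needed > self.num_free_blocks:
            return None  # caller treats this as "cannot schedule yet"
        self.num_free_blocks -= needed
        return list(range(needed))


allocator = ToyCrossAttnAllocator(num_free_blocks=100)
blocks = allocator.allocate_slots_for_cross_attn(1500)  # Whisper encoder length
print(len(blocks) if blocks is not None else "cannot schedule")  # 94
```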

vllm/v1/core/sched/scheduler.py

Lines changed: 30 additions & 3 deletions

@@ -19,7 +19,7 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
                                                 compute_encoder_budget)
-from vllm.v1.core.kv_cache_manager import KVCacheManager
+from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager
 from vllm.v1.core.sched.interface import SchedulerInterface
 from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                        SchedulerOutput)
@@ -58,6 +58,7 @@ def __init__(
         self.parallel_config = vllm_config.parallel_config
         self.log_stats = log_stats
         self.structured_output_manager = structured_output_manager
+        self.is_encoder_decoder = vllm_config.model_config.is_encoder_decoder
 
         # include_finished_set controls whether a separate set of finished
         # request ids should be included in the EngineCoreOutputs returned
@@ -150,11 +151,17 @@ def __init__(
             self.use_eagle = True
             self.num_lookahead_tokens = self.num_spec_tokens
 
+        enable_caching = self.cache_config.enable_prefix_caching or False
+        if self.is_encoder_decoder:
+            # prefix caching for encoder-decoder models is not currently
+            # supported
+            enable_caching = False
+
         # Create the KV cache manager.
         self.kv_cache_manager = KVCacheManager(
             kv_cache_config=kv_cache_config,
             max_model_len=self.max_model_len,
-            enable_caching=self.cache_config.enable_prefix_caching,
+            enable_caching=enable_caching,
             caching_hash_algo=self.cache_config.prefix_caching_hash_algo,
             use_eagle=self.use_eagle,
             log_stats=self.log_stats,
@@ -399,6 +406,7 @@ def schedule(self) -> SchedulerOutput:
 
             encoder_inputs_to_schedule = None
             new_encoder_budget = encoder_budget
+            new_cross_blocks: Optional[KVCacheBlocks] = None
 
             # KVTransfer: loading remote KV, do not allocate for new work.
             if load_kv_async:
@@ -436,6 +444,22 @@ def schedule(self) -> SchedulerOutput:
                 if num_new_tokens == 0:
                     # The request cannot be scheduled.
                     break
+                if self.is_encoder_decoder:
+                    # For encoder-decoder models, we allocate slots for
+                    # the cross-attention blocks based on the max
+                    # encoder length. This is a single static allocation
+                    # and does not grow with the number of decoder
+                    # tokens.
+                    max_encoder_len = (self.vllm_config.model_config.
+                                       hf_config.max_source_positions)
+                    new_cross_blocks = (self.kv_cache_manager.
+                                        allocate_slots_for_cross_attn(
+                                            request,
+                                            max_encoder_len,
+                                        ))
+                    if new_cross_blocks is None:
+                        # The request cannot be scheduled.
+                        break
 
                 new_blocks = self.kv_cache_manager.allocate_slots(
                     request,
@@ -454,9 +478,12 @@ def schedule(self) -> SchedulerOutput:
                 # This information is used to determine if a load is
                 # needed for this request.
                 if self.connector is not None:
+                    update_blocks = new_computed_blocks + new_blocks
+                    if new_cross_blocks is not None:
+                        update_blocks += new_cross_blocks
                     self.connector.update_state_after_alloc(
                         request,
-                        new_computed_blocks + new_blocks,
+                        update_blocks,
                         num_external_computed_tokens,
                     )
 