From 9de1f12c4b8f2f733eeb121bbec2eb42d69967f8 Mon Sep 17 00:00:00 2001
From: Mor Zusman <mor.zusmann@gmail.com>
Date: Sun, 21 Jul 2024 16:58:15 +0300
Subject: [PATCH 01/22] Mamba cache single buffer (#42)

* WIP - working on swaping indices

* WIP

* Save changes

* Orginize indices during assigment, working and passing tests!

* Add TODOs

* Remove diff

* Format

* Remove TODOs

* Remove unused code

* Cleanup

* Cleanup

* Cleanup the redundant 10 blocks

* Small changes

* Simplify code and add comments

* Renaming and simplify

* Remove return

* Clean up

* Cleanup

* Renaming

* Another clean up

* Clean up

* Clean up and simplify more

* Add n > 1 test

* Format

* cosmetics

* Add functionality to find first free

* Raise exception if could not find spot

* Typos

* Add 2 slots as precaution

---------

Co-authored-by: Mor Zusman <morz@ai21.com>
---
 tests/models/test_jamba.py          |  39 +++++
 vllm/model_executor/models/jamba.py | 258 +++++++++++++++-------------
 vllm/worker/model_runner.py         |   3 -
 3 files changed, 175 insertions(+), 125 deletions(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index 774f2d9d9cd..e2f2e4b6513 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -1,6 +1,8 @@
 import pytest
+from torch.cuda import temperature
 
 from tests.models.utils import check_outputs_equal
+from vllm.sampling_params import SamplingParams
 from vllm.worker.model_runner import _get_graph_batch_size
 
 MODELS = ["ai21labs/Jamba-tiny-random"]
@@ -63,6 +65,43 @@ def test_batching(
     )
 
 
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [15])
+def test_n_lt_1(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    # To pass the small model tests, we need full precision.
+    # assert dtype == "float"
+
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        for_loop_outputs = []
+        for _ in range(10):
+            for_loop_outputs.append(
+                vllm_model.generate_greedy([example_prompts[0]],
+                                           max_tokens)[0])
+        sampling_params = SamplingParams(n=10,
+                                         temperature=0.001,
+                                         seed=0,
+                                         max_tokens=max_tokens)
+        n_lt_1_outputs = vllm_model.generate([example_prompts[0]],
+                                             sampling_params)
+    token_ids, texts = n_lt_1_outputs[0]
+    n_lt_1_outputs = [(token_id, text)
+                      for token_id, text in zip(token_ids, texts)]
+
+    check_outputs_equal(
+        outputs_0_lst=n_lt_1_outputs,
+        outputs_1_lst=for_loop_outputs,
+        name_0="vllm_n_lt_1_outputs",
+        name_1="vllm",
+    )
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [20])
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index cf407c86acd..14390345ae0 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -609,12 +609,8 @@ def __init__(
             # compatibility
             if not lora_config else lora_config.lora_vocab_padding_size,
         )
-        # Current step used indices
-        self.current_indices: List[int] = []
         # Used to track and store by the Mamba cache between steps.
         self.mamba_cache: Tuple[torch.Tensor, torch.Tensor] = tuple()
-        # Used as an input_buffer for the CUDA graph runs.
-        self.mamba_gc_cache_buffer: Tuple[torch.Tensor, torch.Tensor] = tuple()
         # Maps between the request id and a dict that maps between the seq_id
         # and its index inside the self.mamba_cache
         self.mamba_cache_indices_mapping: Dict[str, Dict[int, int]] = {}
@@ -644,95 +640,135 @@ def forward(self,
             batch_size = input_ids.shape[0]
             if attn_metadata.prefill_metadata:
                 batch_size = len(request_ids_to_seq_ids)
-            (
-                current_seqlen_agnostic_cache,
-                indices,
-            ) = self._prepare_current_run_mamba_cache(request_ids_to_seq_ids,
-                                                      batch_size,
-                                                      finished_requests_ids)
+            mamba_cache = self._prepare_current_run_mamba_cache(
+                request_ids_to_seq_ids, batch_size, finished_requests_ids)
         else:
             # CUDA graph capturing runs
-            current_seqlen_agnostic_cache, indices = (
-                kwargs["seqlen_agnostic_capture_inputs"],
-                [],
-            )
-        self.current_indices = indices
+            mamba_cache = kwargs["seqlen_agnostic_capture_inputs"]
 
         hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata,
-                                   current_seqlen_agnostic_cache[0],
-                                   current_seqlen_agnostic_cache[1])
-
-        if "seqlen_agnostic_capture_inputs" not in kwargs:
-            self._copy_mamba_cache_by_indices(self.current_indices,
-                                              current_seqlen_agnostic_cache)
-
+                                   attn_metadata, mamba_cache[0],
+                                   mamba_cache[1])
         return hidden_states
 
-    def _copy_mamba_cache_by_indices(
-            self, indices: List[int],
-            current_seqlen_agnostic_cache: Tuple[torch.Tensor, torch.Tensor]):
-        for i, offset in enumerate(indices):
-            self._copy_mamba_cache(offset, i, current_seqlen_agnostic_cache)
+    def _swap_mamba_cache(self, to_index: int, from_index: int):
+        assert len(self.mamba_cache) > 0
+        for cache_t in self.mamba_cache:
+            cache_t[:, [to_index,from_index]] = \
+             cache_t[:, [from_index,to_index]]
 
-    def _copy_mamba_cache(self, index_to: int, index_from: int,
-                          from_buffer: Tuple[torch.Tensor, torch.Tensor]):
+    def _copy_mamba_cache(self, to_index: int, from_index: int):
         assert len(self.mamba_cache) > 0
-        for (cache_t, from_buffer_t) in zip(self.mamba_cache, from_buffer):
-            cache_t[:, index_to].copy_(from_buffer_t[:, index_from],
+        for cache_t in self.mamba_cache:
+            cache_t[:, to_index].copy_(cache_t[:, from_index],
                                        non_blocking=True)
 
-    def _assign_seq_id_to_mamba_cache(self, cur_rid: str,
-                                      seqs_id: List[int]) -> List[int]:
-        indices_for_current_run = []
-        for seq_id in seqs_id:
-            if cur_rid not in self.mamba_cache_indices_mapping:
-                self.mamba_cache_indices_mapping[cur_rid] = {}
-                first_free_index = self._first_free_index_in_mamba_cache()
-                self.mamba_cache_indices_mapping[cur_rid][
-                    seq_id] = first_free_index
-                index_for_current_run = first_free_index
+    def _assign_seq_id_to_mamba_cache_in_specific_dest(self, cur_rid: str,
+                                                       seq_id: int,
+                                                       destination_index: int):
+        all_occupied_indices = self._get_all_occupied_indices()
+        if cur_rid not in self.mamba_cache_indices_mapping:
+            ## assign new free index
+            if destination_index in all_occupied_indices:
+                # In case occupied, move the occupied to a new empty block
+                self._move_cache_index_and_mappings(
+                    from_index=destination_index,
+                    to_index=self._first_free_index_in_mamba_cache())
+            self.mamba_cache_indices_mapping[cur_rid] = {
+                seq_id: destination_index
+            }
+        elif seq_id not in (seq_ids2indices :=
+                            self.mamba_cache_indices_mapping[cur_rid]):
+            # N > 1
+            first_free_index = self._first_free_index_in_mamba_cache()
+            if destination_index in all_occupied_indices:
+                # In case occupied, move the occupied to a new empty block
+                self._move_cache_index_and_mappings(
+                    from_index=destination_index, to_index=first_free_index)
+            index_exists = list(seq_ids2indices.values())[0]
             ## case of decoding n>1, copy prefill cache to decoding indices
-            elif seq_id not in (seq_ids2indices :=
-                                self.mamba_cache_indices_mapping[cur_rid]):
-                first_free_index = self._first_free_index_in_mamba_cache()
-                index_exist = list(seq_ids2indices.values())[0]
-                self._copy_mamba_cache(index_from=index_exist,
-                                       index_to=first_free_index,
-                                       from_buffer=self.mamba_cache)
-                self.mamba_cache_indices_mapping[cur_rid][
-                    seq_id] = first_free_index
-                index_for_current_run = first_free_index
-            else:
-                index_for_current_run = self.mamba_cache_indices_mapping[
-                    cur_rid][seq_id]
-
-            indices_for_current_run.append(index_for_current_run)
-        return indices_for_current_run
+            self._copy_mamba_cache(from_index=index_exists,
+                                   to_index=destination_index)
+            self.mamba_cache_indices_mapping[cur_rid][
+                seq_id] = destination_index
+        else:
+            ## already exists
+            cache_index_already_exists = self.mamba_cache_indices_mapping[
+                cur_rid][seq_id]
+            if cache_index_already_exists != destination_index:
+                # In case the seq id already exists but not in
+                # the right destination, swap it with what's occupying it
+                self._swap_pair_indices_and_mappings(
+                    from_index=cache_index_already_exists,
+                    to_index=destination_index)
 
     def _prepare_current_run_mamba_cache(
-        self, request_ids_to_seq_ids: Dict[str, list[int]], batch_size: int,
-        finished_requests_ids: List[str]
-    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], List[int]]:
-        indices_for_current_run = []
-        for request_id, seqs_id in request_ids_to_seq_ids.items():
+            self, request_ids_to_seq_ids: Dict[str, list[int]],
+            batch_size: int, finished_requests_ids: List[str]):
+        running_indices = []
+        for dest_index, (request_id,
+                         seqs_id) in enumerate(request_ids_to_seq_ids.items()):
             if request_id in finished_requests_ids:
-                # Do not allocate cache for requests that run
+                # Do not allocate cache index for requests that run
                 # and finish right after
                 continue
-            indices_for_current_run += self._assign_seq_id_to_mamba_cache(
-                request_id, seqs_id)
-        ## Pad the batch in case of running batch that was not captured via CG
-        padded_indices = indices_for_current_run.copy()
-        pad_index = self._first_free_index_in_mamba_cache()
-
-        for _ in range(batch_size - len(indices_for_current_run)):
-            padded_indices.append(pad_index)
-
-        conv_state = self.mamba_cache[0][:, padded_indices]
-        temporal_state = self.mamba_cache[1][:, padded_indices]
+            for seq_id in seqs_id:
+                self._assign_seq_id_to_mamba_cache_in_specific_dest(
+                    request_id, seq_id, dest_index)
+                running_indices.append(dest_index)
+
+        self._clean_up_first_bs_blocks(batch_size, running_indices)
+        conv_state = self.mamba_cache[0][:, :batch_size]
+        temporal_state = self.mamba_cache[1][:, :batch_size]
+
+        return (conv_state, temporal_state)
+
+    def _get_all_occupied_indices(self):
+        return [
+            cache_idx
+            for seq_ids2indices in self.mamba_cache_indices_mapping.values()
+            for cache_idx in seq_ids2indices.values()
+        ]
 
-        return (conv_state, temporal_state), indices_for_current_run
+    def _clean_up_first_bs_blocks(self, batch_size: int,
+                                  indices_for_current_run: List[int]):
+        # move out all of the occupied but currently not running blocks
+        # outside of the first n blocks
+        destination_indices = set([range(batch_size)])
+        max_possible_batch_size = self.mamba_cache[0].shape[1]
+        for destination_index in destination_indices:
+            if destination_index in self._get_all_occupied_indices() and  \
+               destination_index not in indices_for_current_run:
+                # move not running indices outside of the batch
+                all_other_indices = list(
+                    range(batch_size, max_possible_batch_size))
+                first_avail_index = self._first_free_index_in_mamba_cache(
+                    all_other_indices)
+                self._swap_indices(from_index=destination_index,
+                                   to_index=first_avail_index)
+
+    def _move_cache_index_and_mappings(self, from_index: int, to_index: int):
+        self._copy_mamba_cache(from_index=from_index, to_index=to_index)
+        self._update_mapping_index(from_index=from_index, to_index=to_index)
+
+    def _swap_pair_indices_and_mappings(self, from_index: int, to_index: int):
+        self._swap_mamba_cache(from_index=from_index, to_index=to_index)
+        self._swap_mapping_index(from_index=from_index, to_index=to_index)
+
+    def _swap_mapping_index(self, from_index: int, to_index: int):
+        for seq_ids2index in self.mamba_cache_indices_mapping.values():
+            for seq_id, index in seq_ids2index.items():
+                if from_index == index:
+                    seq_ids2index.update({seq_id: to_index})
+                elif to_index == index:
+                    seq_ids2index.update({seq_id: from_index})
+
+    def _update_mapping_index(self, from_index: int, to_index: int):
+        for seq_ids2index in self.mamba_cache_indices_mapping.values():
+            for seq_id, index in seq_ids2index.items():
+                if from_index == index:
+                    seq_ids2index.update({seq_id: to_index})
+                    return
 
     def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
         """
@@ -747,28 +783,9 @@ def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
         self._release_mamba_cache(finished_requests_ids)
         request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"]
         cg_batch_size = input_buffers['input_ids'].shape[0]
-        (
-            current_mamba_cache,
-            indices,
-        ) = self._prepare_current_run_mamba_cache(request_ids_to_seq_ids,
-                                                  cg_batch_size,
-                                                  finished_requests_ids)
-        self.current_indices = indices
-
-        for input_buffer, current_cache_buffer in zip(
-                input_buffers["seqlen_agnostic_capture_inputs"],
-                current_mamba_cache):
-            input_buffer.copy_(current_cache_buffer, non_blocking=True)
-
-    def copy_outputs_after_cuda_graphs(self, input_buffers, **kwargs):
-        """
-        Copy the relevant Mamba cache from the CUDA graph input_buffers
-        back to the JambaForCausalLM.mamba_cache after CUDA 
-        graph replay run is done.
-        """
-        self._copy_mamba_cache_by_indices(
-            self.current_indices,
-            input_buffers["seqlen_agnostic_capture_inputs"])
+        self._prepare_current_run_mamba_cache(request_ids_to_seq_ids,
+                                              cg_batch_size,
+                                              finished_requests_ids)
 
     def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
         """
@@ -776,26 +793,25 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
         The buffer is used to maintain the Mamba Cache during the CUDA graph 
         replay runs.
         """
-        return tuple(buffer[:, :batch_size]
-                     for buffer in self.mamba_gc_cache_buffer)
+        return tuple(buffer[:, :batch_size] for buffer in self.mamba_cache)
 
     def _release_mamba_cache(self, finished_seq_groups_req_ids: List[str]):
         for req_id in finished_seq_groups_req_ids:
             if req_id in self.mamba_cache_indices_mapping:
                 self.mamba_cache_indices_mapping.pop(req_id)
 
-    def _first_free_index_in_mamba_cache(self) -> int:
-        if self.mamba_cache:
+    def _first_free_index_in_mamba_cache(
+            self, indices_range: Optional[List[int]] = None) -> int:
+        assert self.mamba_cache is not None
+        if indices_range is None:
             max_possible_batch_size = self.mamba_cache[0].shape[1]
-            occupied = [
-                id for seq_ids in self.mamba_cache_indices_mapping.values()
-                for id in seq_ids.values()
-            ]
-            first_free_index = [
-                i not in occupied for i in range(max_possible_batch_size)
-            ].index(True)
-            return first_free_index
-        return 0
+            indices_range = list(range(max_possible_batch_size))
+        all_occupied_indices = self._get_all_occupied_indices()
+        for i in indices_range:
+            if i not in all_occupied_indices:
+                return i
+        raise Exception("Couldn't find a free spot in the mamba cache! This"
+                        "should never happen")
 
     def _get_mamba_cache_shape(
             self
@@ -819,20 +835,18 @@ def _prepare_mamba_cache(self):
             [layer_type == "mamba" for layer_type in layers_type])
         max_batch_size = (_get_graph_batch_size(
             self.scheduler_config.max_num_seqs) if self.scheduler_config else
-                          max(_BATCH_SIZES_TO_CAPTURE)) + 10
+                          max(_BATCH_SIZES_TO_CAPTURE) + 2)
         conv_state_shape, temporal_state_shape = self._get_mamba_cache_shape()
         assert conv_state_shape is not None and temporal_state_shape is not None
 
-        for buffername in ["mamba_cache", "mamba_gc_cache_buffer"]:
-            buffer = (torch.empty(size=(mamba_layers, max_batch_size) +
-                                  conv_state_shape,
-                                  dtype=dtype,
-                                  device="cuda"),
-                      torch.empty(size=(mamba_layers, max_batch_size) +
-                                  temporal_state_shape,
-                                  dtype=dtype,
-                                  device="cuda"))
-            setattr(self, buffername, buffer)
+        self.mamba_cache = (torch.empty(size=(mamba_layers, max_batch_size) +
+                                        conv_state_shape,
+                                        dtype=dtype,
+                                        device="cuda"),
+                            torch.empty(size=(mamba_layers, max_batch_size) +
+                                        temporal_state_shape,
+                                        dtype=dtype,
+                                        device="cuda"))
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 913a08ce9f5..2731bddba76 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1711,9 +1711,6 @@ def forward(
                                               non_blocking=True)
         # Run the graph.
         self.graph.replay()
-        if "seqlen_agnostic_capture_inputs" in self.input_buffers:
-            self.model.copy_outputs_after_cuda_graphs(self.input_buffers,
-                                                      **kwargs)
         # Return the output tensor.
         if get_pp_group().is_last_rank:
             return self.output_buffers["hidden_states"]

From b9ef9307a672af0c2f8d5561a714d8556896b8a7 Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Wed, 24 Jul 2024 13:17:10 +0300
Subject: [PATCH 02/22] Format

---
 tests/models/test_jamba.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index e2f2e4b6513..c2ab6c0ad95 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -1,5 +1,4 @@
 import pytest
-from torch.cuda import temperature
 
 from tests.models.utils import check_outputs_equal
 from vllm.sampling_params import SamplingParams

From f9d311dc25bfd39295c17856542cdfda03d292ba Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Wed, 24 Jul 2024 22:35:36 +0300
Subject: [PATCH 03/22] Change example

---
 tests/models/test_jamba.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index c2ab6c0ad95..c31bbd20ab5 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -8,7 +8,7 @@
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [20])
 def test_models(
     hf_runner,
@@ -65,7 +65,7 @@ def test_batching(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [15])
 def test_n_lt_1(
     vllm_runner,
@@ -81,13 +81,13 @@ def test_n_lt_1(
         for_loop_outputs = []
         for _ in range(10):
             for_loop_outputs.append(
-                vllm_model.generate_greedy([example_prompts[0]],
+                vllm_model.generate_greedy([example_prompts[1]],
                                            max_tokens)[0])
         sampling_params = SamplingParams(n=10,
                                          temperature=0.001,
                                          seed=0,
                                          max_tokens=max_tokens)
-        n_lt_1_outputs = vllm_model.generate([example_prompts[0]],
+        n_lt_1_outputs = vllm_model.generate([example_prompts[1]],
                                              sampling_params)
     token_ids, texts = n_lt_1_outputs[0]
     n_lt_1_outputs = [(token_id, text)
@@ -128,8 +128,8 @@ def test_mamba_cache_cg_padding(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [20])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [96])
 def test_models_preemption_recompute(
     hf_runner,
     vllm_runner,

From 69c0da8139bc214ab62d99813041b60339678283 Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Wed, 24 Jul 2024 22:15:18 +0300
Subject: [PATCH 04/22] Change tested model (trained), now the tests are more
 reliable

---
 tests/models/test_jamba.py | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index c31bbd20ab5..7329ebef135 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -4,11 +4,11 @@
 from vllm.sampling_params import SamplingParams
 from vllm.worker.model_runner import _get_graph_batch_size
 
-MODELS = ["ai21labs/Jamba-tiny-random"]
+MODELS = ["pszemraj/jamba-900M-v0.13-KIx2"]
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [20])
 def test_models(
     hf_runner,
@@ -18,8 +18,6 @@ def test_models(
     dtype: str,
     max_tokens: int,
 ) -> None:
-    # To pass the small model tests, we need full precision.
-    assert dtype == "float"
 
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
@@ -139,7 +137,7 @@ def test_models_preemption_recompute(
     max_tokens: int,
 ) -> None:
     # Tests that outputs are identical with and w/o preemtions (recompute)
-    assert dtype == "float"
+    # assert dtype == "float"
 
     with vllm_runner(model, dtype=dtype) as vllm_model:
         vllm_model.model.llm_engine.scheduler[
@@ -160,7 +158,7 @@ def test_models_preemption_recompute(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
 def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
     vllm_runner,
     model: str,
@@ -182,7 +180,29 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+def test_cleanup_upon_aborted_requests(
+    vllm_runner,
+    model: str,
+    dtype: str,
+    example_prompts,
+) -> None:
+    # This test is for verifying that the Jamba inner state management doesn't
+    # collapse in case where the number of incoming requests and
+    # finished_requests_ids is larger than the maximum mamba block capacity.
+    # This could generally happen due to the fact that Jamba does support
+    # statelessness mechanism where it can cleanup new incoming requests in
+    # a single step.
+    try:
+        with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
+            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
+    except ValueError:
+        pytest.fail("Jamba inner state wasn't cleaned up properly between"
+                    "steps finished requests registered unnecessarily ")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
 def test_state_cleanup(
     vllm_runner,
     model: str,
@@ -201,7 +221,7 @@ def test_state_cleanup(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
 def test_model_print(
     vllm_runner,
     model: str,

From 9a3a1be9210ed5fe15f34ef5080bdd77901a6007 Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Wed, 24 Jul 2024 22:11:04 +0300
Subject: [PATCH 05/22] Bugfix, the dest index didn't run on the seq ids

---
 vllm/model_executor/models/jamba.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 14390345ae0..653d5806ee5 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -706,16 +706,18 @@ def _prepare_current_run_mamba_cache(
             self, request_ids_to_seq_ids: Dict[str, list[int]],
             batch_size: int, finished_requests_ids: List[str]):
         running_indices = []
-        for dest_index, (request_id,
-                         seqs_id) in enumerate(request_ids_to_seq_ids.items()):
+        dest_index = 0
+        for (request_id, seqs_id) in request_ids_to_seq_ids.items():
             if request_id in finished_requests_ids:
                 # Do not allocate cache index for requests that run
                 # and finish right after
+                dest_index += 1
                 continue
             for seq_id in seqs_id:
                 self._assign_seq_id_to_mamba_cache_in_specific_dest(
                     request_id, seq_id, dest_index)
                 running_indices.append(dest_index)
+                dest_index += 1
 
         self._clean_up_first_bs_blocks(batch_size, running_indices)
         conv_state = self.mamba_cache[0][:, :batch_size]

From c705ed2000c0a5c959bbc6f35984057bc2efe87b Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Wed, 24 Jul 2024 22:50:52 +0300
Subject: [PATCH 06/22] Clean up

---
 tests/models/test_jamba.py          | 6 +++---
 vllm/model_executor/models/jamba.py | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index 7329ebef135..f502331d21c 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -8,7 +8,7 @@
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [20])
 def test_models(
     hf_runner,
@@ -35,8 +35,8 @@ def test_models(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [20])
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [15])
 def test_batching(
     vllm_runner,
     example_prompts,
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 653d5806ee5..8e736804bf2 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -1,5 +1,4 @@
 # coding=utf-8
-"""Inference-only Jamba model."""
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Tuple
 

From 7fd4e22305f61d46d7c32a20e4a8e93ff6e6dd70 Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Wed, 24 Jul 2024 23:00:02 +0300
Subject: [PATCH 07/22] Revert "Clean up"

This reverts commit 381c2aa16201bf79736c7b0dd740c85672c0c9db.
---
 tests/models/test_jamba.py          | 6 +++---
 vllm/model_executor/models/jamba.py | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index f502331d21c..7329ebef135 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -8,7 +8,7 @@
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [20])
 def test_models(
     hf_runner,
@@ -35,8 +35,8 @@ def test_models(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [15])
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [20])
 def test_batching(
     vllm_runner,
     example_prompts,
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 8e736804bf2..e619546ea14 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -1,4 +1,5 @@
 # coding=utf-8
+"""Inference-only Jurassic model."""
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Tuple
 

From 7f97c4e3b56ba50fe23a7eaddcee15b9380fdc73 Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Wed, 24 Jul 2024 23:00:05 +0300
Subject: [PATCH 08/22] Revert "Bugfix, the dest index didn't run on the seq
 ids"

This reverts commit f1e792d984468165d6824346cbe413ec492c30e3.
---
 vllm/model_executor/models/jamba.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index e619546ea14..522d1eb5cc4 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -706,18 +706,16 @@ def _prepare_current_run_mamba_cache(
             self, request_ids_to_seq_ids: Dict[str, list[int]],
             batch_size: int, finished_requests_ids: List[str]):
         running_indices = []
-        dest_index = 0
-        for (request_id, seqs_id) in request_ids_to_seq_ids.items():
+        for dest_index, (request_id,
+                         seqs_id) in enumerate(request_ids_to_seq_ids.items()):
             if request_id in finished_requests_ids:
                 # Do not allocate cache index for requests that run
                 # and finish right after
-                dest_index += 1
                 continue
             for seq_id in seqs_id:
                 self._assign_seq_id_to_mamba_cache_in_specific_dest(
                     request_id, seq_id, dest_index)
                 running_indices.append(dest_index)
-                dest_index += 1
 
         self._clean_up_first_bs_blocks(batch_size, running_indices)
         conv_state = self.mamba_cache[0][:, :batch_size]

From 52239d0fb16eae37666eed00260d7cbff5b48fc2 Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Wed, 24 Jul 2024 23:00:07 +0300
Subject: [PATCH 09/22] Revert "Change tested model (trained), now the tests
 are more reliable"

This reverts commit bda987644367ffb3a69195b9830d3084143618e3.
---
 tests/models/test_jamba.py | 36 ++++++++----------------------------
 1 file changed, 8 insertions(+), 28 deletions(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index 7329ebef135..c31bbd20ab5 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -4,11 +4,11 @@
 from vllm.sampling_params import SamplingParams
 from vllm.worker.model_runner import _get_graph_batch_size
 
-MODELS = ["pszemraj/jamba-900M-v0.13-KIx2"]
+MODELS = ["ai21labs/Jamba-tiny-random"]
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [20])
 def test_models(
     hf_runner,
@@ -18,6 +18,8 @@ def test_models(
     dtype: str,
     max_tokens: int,
 ) -> None:
+    # To pass the small model tests, we need full precision.
+    assert dtype == "float"
 
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
@@ -137,7 +139,7 @@ def test_models_preemption_recompute(
     max_tokens: int,
 ) -> None:
     # Tests that outputs are identical with and w/o preemtions (recompute)
-    # assert dtype == "float"
+    assert dtype == "float"
 
     with vllm_runner(model, dtype=dtype) as vllm_model:
         vllm_model.model.llm_engine.scheduler[
@@ -158,7 +160,7 @@ def test_models_preemption_recompute(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["float"])
 def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
     vllm_runner,
     model: str,
@@ -180,29 +182,7 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-def test_cleanup_upon_aborted_requests(
-    vllm_runner,
-    model: str,
-    dtype: str,
-    example_prompts,
-) -> None:
-    # This test is for verifying that the Jamba inner state management doesn't
-    # collapse in case where the number of incoming requests and
-    # finished_requests_ids is larger than the maximum mamba block capacity.
-    # This could generally happen due to the fact that Jamba does support
-    # statelessness mechanism where it can cleanup new incoming requests in
-    # a single step.
-    try:
-        with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
-            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
-    except ValueError:
-        pytest.fail("Jamba inner state wasn't cleaned up properly between"
-                    "steps finished requests registered unnecessarily ")
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["float"])
 def test_state_cleanup(
     vllm_runner,
     model: str,
@@ -221,7 +201,7 @@ def test_state_cleanup(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["float"])
 def test_model_print(
     vllm_runner,
     model: str,

From 27a15e40c0c58d6ae65be627c783560c15a3497f Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Wed, 24 Jul 2024 22:11:04 +0300
Subject: [PATCH 10/22] Bugfix, the dest index didn't run on the seq ids

---
 vllm/model_executor/models/jamba.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 522d1eb5cc4..e619546ea14 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -706,16 +706,18 @@ def _prepare_current_run_mamba_cache(
             self, request_ids_to_seq_ids: Dict[str, list[int]],
             batch_size: int, finished_requests_ids: List[str]):
         running_indices = []
-        for dest_index, (request_id,
-                         seqs_id) in enumerate(request_ids_to_seq_ids.items()):
+        dest_index = 0
+        for (request_id, seqs_id) in request_ids_to_seq_ids.items():
             if request_id in finished_requests_ids:
                 # Do not allocate cache index for requests that run
                 # and finish right after
+                dest_index += 1
                 continue
             for seq_id in seqs_id:
                 self._assign_seq_id_to_mamba_cache_in_specific_dest(
                     request_id, seq_id, dest_index)
                 running_indices.append(dest_index)
+                dest_index += 1
 
         self._clean_up_first_bs_blocks(batch_size, running_indices)
         conv_state = self.mamba_cache[0][:, :batch_size]

From d7d07fb8962d7a338cb38910949bec7beb28d801 Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Wed, 24 Jul 2024 23:03:57 +0300
Subject: [PATCH 11/22] Cleanup

---
 tests/models/test_jamba.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index c31bbd20ab5..4f15116e019 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -8,7 +8,7 @@
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [20])
 def test_models(
     hf_runner,
@@ -128,8 +128,8 @@ def test_mamba_cache_cg_padding(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [20])
 def test_models_preemption_recompute(
     hf_runner,
     vllm_runner,

From 12d8648aeb10380b6a084a99da0f9ee100d0e6f5 Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Thu, 25 Jul 2024 08:59:07 +0300
Subject: [PATCH 12/22] Prettier version

---
 vllm/model_executor/models/jamba.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index e619546ea14..3f3da193d63 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -706,18 +706,20 @@ def _prepare_current_run_mamba_cache(
             self, request_ids_to_seq_ids: Dict[str, list[int]],
             batch_size: int, finished_requests_ids: List[str]):
         running_indices = []
-        dest_index = 0
-        for (request_id, seqs_id) in request_ids_to_seq_ids.items():
+        request_ids_to_seq_ids_flatten = [
+            (req_id, seq_id)
+            for req_id, seq_ids in request_ids_to_seq_ids.items()
+            for seq_id in seq_ids
+        ]
+        for dest_index, (request_id,
+                         seq_id) in enumerate(request_ids_to_seq_ids_flatten):
             if request_id in finished_requests_ids:
                 # Do not allocate cache index for requests that run
                 # and finish right after
-                dest_index += 1
                 continue
-            for seq_id in seqs_id:
-                self._assign_seq_id_to_mamba_cache_in_specific_dest(
+            self._assign_seq_id_to_mamba_cache_in_specific_dest(
                     request_id, seq_id, dest_index)
-                running_indices.append(dest_index)
-                dest_index += 1
+            running_indices.append(dest_index)
 
         self._clean_up_first_bs_blocks(batch_size, running_indices)
         conv_state = self.mamba_cache[0][:, :batch_size]

From 4fc3dce840d693f1eb10988ccb7ab8a17ab254d3 Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Thu, 25 Jul 2024 16:20:49 +0300
Subject: [PATCH 13/22] Half instead of bf16

---
 tests/models/test_jamba.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index 4f15116e019..35e3e67a3b8 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -65,7 +65,7 @@ def test_batching(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [15])
 def test_n_lt_1(
     vllm_runner,

From f2c772341fe88ff2213aeffa241623defc0db15c Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Thu, 25 Jul 2024 16:34:30 +0300
Subject: [PATCH 14/22] Formattin

---
 vllm/model_executor/models/jamba.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 3f3da193d63..05c24b77057 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -718,7 +718,7 @@ def _prepare_current_run_mamba_cache(
                 # and finish right after
                 continue
             self._assign_seq_id_to_mamba_cache_in_specific_dest(
-                    request_id, seq_id, dest_index)
+                request_id, seq_id, dest_index)
             running_indices.append(dest_index)
 
         self._clean_up_first_bs_blocks(batch_size, running_indices)

From 44788c42a5467029b1567471cbbaabea6f4b04ee Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Thu, 25 Jul 2024 22:25:37 +0300
Subject: [PATCH 15/22] Change test to float

---
 tests/models/test_jamba.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index 35e3e67a3b8..560ffc734e0 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -65,7 +65,7 @@ def test_batching(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [15])
 def test_n_lt_1(
     vllm_runner,

From e598d967655d2809e4428be9029b524ed7f09a34 Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Fri, 26 Jul 2024 00:58:41 +0300
Subject: [PATCH 16/22] bf16 for the test

---
 tests/models/test_jamba.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index 560ffc734e0..4f15116e019 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -65,7 +65,7 @@ def test_batching(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [15])
 def test_n_lt_1(
     vllm_runner,

From 7d553c9d4f51ac636d2d8d4a22527d754b7a3e6c Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Fri, 26 Jul 2024 10:25:14 +0300
Subject: [PATCH 17/22] Remove n > 1 test for now, need to check why it fails
 on L4

---
 tests/models/test_jamba.py | 37 -------------------------------------
 1 file changed, 37 deletions(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index 4f15116e019..07601701b3d 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -64,43 +64,6 @@ def test_batching(
     )
 
 
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [15])
-def test_n_lt_1(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-    # To pass the small model tests, we need full precision.
-    # assert dtype == "float"
-
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        for_loop_outputs = []
-        for _ in range(10):
-            for_loop_outputs.append(
-                vllm_model.generate_greedy([example_prompts[1]],
-                                           max_tokens)[0])
-        sampling_params = SamplingParams(n=10,
-                                         temperature=0.001,
-                                         seed=0,
-                                         max_tokens=max_tokens)
-        n_lt_1_outputs = vllm_model.generate([example_prompts[1]],
-                                             sampling_params)
-    token_ids, texts = n_lt_1_outputs[0]
-    n_lt_1_outputs = [(token_id, text)
-                      for token_id, text in zip(token_ids, texts)]
-
-    check_outputs_equal(
-        outputs_0_lst=n_lt_1_outputs,
-        outputs_1_lst=for_loop_outputs,
-        name_0="vllm_n_lt_1_outputs",
-        name_1="vllm",
-    )
-
-
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [20])

From df269e54fb55166c64b6f9c758380eeadaeca391 Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Fri, 26 Jul 2024 10:27:34 +0300
Subject: [PATCH 18/22] Format

---
 tests/models/test_jamba.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index 07601701b3d..774f2d9d9cd 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -1,7 +1,6 @@
 import pytest
 
 from tests.models.utils import check_outputs_equal
-from vllm.sampling_params import SamplingParams
 from vllm.worker.model_runner import _get_graph_batch_size
 
 MODELS = ["ai21labs/Jamba-tiny-random"]

From 60857a3dee4e7dd00cf6850cf129fe1914bf676f Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Thu, 8 Aug 2024 13:22:28 +0300
Subject: [PATCH 19/22] Factor out moving out the occupied index

---
 vllm/model_executor/models/jamba.py | 31 +++++++++++++++++------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 05c24b77057..58c336e0486 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -663,30 +663,35 @@ def _copy_mamba_cache(self, to_index: int, from_index: int):
             cache_t[:, to_index].copy_(cache_t[:, from_index],
                                        non_blocking=True)
 
+    def _move_out_if_already_occupied(self, index: int,
+                              all_occupied_indices: List[int]):
+        if index in all_occupied_indices:
+            first_free_index = self._first_free_index_in_mamba_cache()
+            # In case occupied, move the occupied to a new empty block
+            self._move_cache_index_and_mappings(from_index=index,
+                                                to_index=first_free_index)
+
     def _assign_seq_id_to_mamba_cache_in_specific_dest(self, cur_rid: str,
                                                        seq_id: int,
                                                        destination_index: int):
         all_occupied_indices = self._get_all_occupied_indices()
         if cur_rid not in self.mamba_cache_indices_mapping:
-            ## assign new free index
-            if destination_index in all_occupied_indices:
-                # In case occupied, move the occupied to a new empty block
-                self._move_cache_index_and_mappings(
-                    from_index=destination_index,
-                    to_index=self._first_free_index_in_mamba_cache())
+            self._move_out_if_already_occupied(
+                index=destination_index,
+                all_occupied_indices=all_occupied_indices)
             self.mamba_cache_indices_mapping[cur_rid] = {
                 seq_id: destination_index
             }
         elif seq_id not in (seq_ids2indices :=
                             self.mamba_cache_indices_mapping[cur_rid]):
-            # N > 1
-            first_free_index = self._first_free_index_in_mamba_cache()
-            if destination_index in all_occupied_indices:
-                # In case occupied, move the occupied to a new empty block
-                self._move_cache_index_and_mappings(
-                    from_index=destination_index, to_index=first_free_index)
+            # parallel sampling , where n > 1, assume prefill already happend
+            # now we only need to copy the already existing cache into the
+            # siblings seq_ids caches
+            self._move_out_if_already_occupied(
+                index=destination_index,
+                all_occupied_indices=all_occupied_indices)
             index_exists = list(seq_ids2indices.values())[0]
-            ## case of decoding n>1, copy prefill cache to decoding indices
+            # case of decoding n>1, copy prefill cache to decoding indices
             self._copy_mamba_cache(from_index=index_exists,
                                    to_index=destination_index)
             self.mamba_cache_indices_mapping[cur_rid][

From 9e583d6b5ae9af241c13369b8ae7f291a02f335f Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Thu, 8 Aug 2024 15:54:00 +0300
Subject: [PATCH 20/22] Add comment

---
 vllm/model_executor/models/jamba.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 58c336e0486..30d3bec380e 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -651,20 +651,20 @@ def forward(self,
                                    mamba_cache[1])
         return hidden_states
 
-    def _swap_mamba_cache(self, to_index: int, from_index: int):
+    def _swap_mamba_cache(self, from_index: int, to_index: int):
         assert len(self.mamba_cache) > 0
         for cache_t in self.mamba_cache:
             cache_t[:, [to_index,from_index]] = \
              cache_t[:, [from_index,to_index]]
 
-    def _copy_mamba_cache(self, to_index: int, from_index: int):
+    def _copy_mamba_cache(self, from_index: int, to_index: int):
         assert len(self.mamba_cache) > 0
         for cache_t in self.mamba_cache:
             cache_t[:, to_index].copy_(cache_t[:, from_index],
                                        non_blocking=True)
 
     def _move_out_if_already_occupied(self, index: int,
-                              all_occupied_indices: List[int]):
+                                      all_occupied_indices: List[int]):
         if index in all_occupied_indices:
             first_free_index = self._first_free_index_in_mamba_cache()
             # In case occupied, move the occupied to a new empty block
@@ -674,6 +674,10 @@ def _move_out_if_already_occupied(self, index: int,
     def _assign_seq_id_to_mamba_cache_in_specific_dest(self, cur_rid: str,
                                                        seq_id: int,
                                                        destination_index: int):
+        """
+        Assign (req_id,seq_id) pair to a `destination_index` index, if
+        already occupied, move the occupying index to a free index.
+        """
         all_occupied_indices = self._get_all_occupied_indices()
         if cur_rid not in self.mamba_cache_indices_mapping:
             self._move_out_if_already_occupied(
@@ -697,7 +701,7 @@ def _assign_seq_id_to_mamba_cache_in_specific_dest(self, cur_rid: str,
             self.mamba_cache_indices_mapping[cur_rid][
                 seq_id] = destination_index
         else:
-            ## already exists
+            # already exists
             cache_index_already_exists = self.mamba_cache_indices_mapping[
                 cur_rid][seq_id]
             if cache_index_already_exists != destination_index:

From c2e9a1d09c52f4d8973d673b2eb0da0db7579a54 Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Thu, 8 Aug 2024 23:32:19 +0300
Subject: [PATCH 21/22] Format

---
 vllm/model_executor/models/jamba.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 30d3bec380e..2eb42b7505c 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -688,9 +688,9 @@ def _assign_seq_id_to_mamba_cache_in_specific_dest(self, cur_rid: str,
             }
         elif seq_id not in (seq_ids2indices :=
                             self.mamba_cache_indices_mapping[cur_rid]):
-            # parallel sampling , where n > 1, assume prefill already happend
-            # now we only need to copy the already existing cache into the
-            # siblings seq_ids caches
+            # parallel sampling , where n > 1, assume prefill have
+            # already happened now we only need to copy the already
+            # existing cache into the siblings seq_ids caches
             self._move_out_if_already_occupied(
                 index=destination_index,
                 all_occupied_indices=all_occupied_indices)

From 3eeeeb7ec2e9dc177afa76ecb9c2cb29de6cb0ad Mon Sep 17 00:00:00 2001
From: mzusman <mor.zusmann@gmail.com>
Date: Fri, 9 Aug 2024 11:41:47 +0300
Subject: [PATCH 22/22] Jamba model

---
 vllm/model_executor/models/jamba.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 2eb42b7505c..ededf9c533f 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-"""Inference-only Jurassic model."""
+"""Inference-only Jamba model."""
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Tuple