Commit e77dbaf

Add kv copy kernel for between layers

1 parent: a766a66

9 files changed: +231 −105 lines changed

csrc/cache.h

Lines changed: 8 additions & 1 deletion
@@ -15,6 +15,13 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
                  std::vector<torch::Tensor> const& value_caches,
                  const torch::Tensor& block_mapping);
 
+void copy_blocks_between_layers(
+    std::vector<torch::Tensor> const& src_key_caches,
+    std::vector<torch::Tensor> const& src_value_caches,
+    std::vector<torch::Tensor> const& dst_key_caches,
+    std::vector<torch::Tensor> const& dst_value_caches,
+    const torch::Tensor& block_mapping);
+
 void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
                      const torch::Tensor& block_mapping);
 
@@ -45,4 +52,4 @@ void gather_cache(
     torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
     torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
     torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
+    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);

csrc/cache_kernels.cu

Lines changed: 103 additions & 43 deletions
@@ -68,32 +68,42 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
 
 namespace vllm {
 
-// Grid: (num_layers, num_pairs)
+// Grid: (layer_or_pair_idx, num_pairs)
 template <typename scalar_t>
-__global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,
-                                   int64_t* value_cache_ptrs,
-                                   const int64_t* __restrict__ block_mapping,
-                                   const int numel_per_block) {
-  const int layer_idx = blockIdx.x;
+__global__ void unified_copy_blocks_kernel(
+    int64_t* src_key_cache_ptrs, int64_t* src_value_cache_ptrs,
+    int64_t* dst_key_cache_ptrs, int64_t* dst_value_cache_ptrs,
+    const int64_t* __restrict__ block_mapping, const int numel_per_block) {
+  const int layer_or_pair_idx = blockIdx.x;
   const int pair_idx = blockIdx.y;
 
-  scalar_t* key_cache = reinterpret_cast<scalar_t*>(key_cache_ptrs[layer_idx]);
-  scalar_t* value_cache =
-      reinterpret_cast<scalar_t*>(value_cache_ptrs[layer_idx]);
+  scalar_t* src_key_cache =
+      reinterpret_cast<scalar_t*>(src_key_cache_ptrs[layer_or_pair_idx]);
+  scalar_t* src_value_cache =
+      reinterpret_cast<scalar_t*>(src_value_cache_ptrs[layer_or_pair_idx]);
+  scalar_t* dst_key_cache =
+      reinterpret_cast<scalar_t*>(dst_key_cache_ptrs[layer_or_pair_idx]);
+  scalar_t* dst_value_cache =
+      reinterpret_cast<scalar_t*>(dst_value_cache_ptrs[layer_or_pair_idx]);
+
   int64_t src_block_number = block_mapping[2 * pair_idx];
   int64_t dst_block_number = block_mapping[2 * pair_idx + 1];
 
   const int64_t src_block_offset = src_block_number * numel_per_block;
   const int64_t dst_block_offset = dst_block_number * numel_per_block;
+
+  // Copy key cache from source to destination
   for (int i = threadIdx.x; i < numel_per_block; i += blockDim.x) {
     int64_t src_offset = src_block_offset + i;
     int64_t dst_offset = dst_block_offset + i;
-    key_cache[dst_offset] = key_cache[src_offset];
+    dst_key_cache[dst_offset] = src_key_cache[src_offset];
   }
+
+  // Copy value cache from source to destination
   for (int i = threadIdx.x; i < numel_per_block; i += blockDim.x) {
     int64_t src_offset = src_block_offset + i;
     int64_t dst_offset = dst_block_offset + i;
-    value_cache[dst_offset] = value_cache[src_offset];
+    dst_value_cache[dst_offset] = src_value_cache[src_offset];
   }
 }
 
@@ -117,58 +127,108 @@ __global__ void copy_blocks_mla_kernel(
 
 }  // namespace vllm
 
-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
-                 const torch::Tensor& block_mapping) {
-  int num_layers = key_caches.size();
-  TORCH_CHECK(num_layers == value_caches.size());
-  if (num_layers == 0) {
+// Unified implementation function for both copy_blocks and
+// copy_blocks_between_caches
+void copy_blocks_impl(std::vector<torch::Tensor> const& src_key_caches,
+                      std::vector<torch::Tensor> const& src_value_caches,
+                      std::vector<torch::Tensor> const& dst_key_caches,
+                      std::vector<torch::Tensor> const& dst_value_caches,
+                      const torch::Tensor& block_mapping) {
+  int num_src_dst_pairs = src_key_caches.size();
+  TORCH_CHECK(num_src_dst_pairs == src_value_caches.size());
+  TORCH_CHECK(num_src_dst_pairs == dst_key_caches.size());
+  TORCH_CHECK(num_src_dst_pairs == dst_value_caches.size());
+
+  if (num_src_dst_pairs == 0) {
     return;
   }
-  torch::Device cache_device = key_caches[0].device();
+
+  torch::Device cache_device = src_key_caches[0].device();
   TORCH_CHECK(cache_device.is_cuda());
 
-  // Create data structures for the kernel.
-  // Create an array of pointers to the key and value caches.
-  int64_t key_cache_ptrs[num_layers];
-  int64_t value_cache_ptrs[num_layers];
-  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
-    key_cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
-    value_cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
+  // Create arrays of pointers to the source and destination key and value
+  // caches
+  int64_t src_key_cache_ptrs[num_src_dst_pairs];
+  int64_t src_value_cache_ptrs[num_src_dst_pairs];
+  int64_t dst_key_cache_ptrs[num_src_dst_pairs];
+  int64_t dst_value_cache_ptrs[num_src_dst_pairs];
+
+  for (int pair_idx = 0; pair_idx < num_src_dst_pairs; ++pair_idx) {
+    src_key_cache_ptrs[pair_idx] =
+        reinterpret_cast<int64_t>(src_key_caches[pair_idx].data_ptr());
+    src_value_cache_ptrs[pair_idx] =
+        reinterpret_cast<int64_t>(src_value_caches[pair_idx].data_ptr());
+    dst_key_cache_ptrs[pair_idx] =
+        reinterpret_cast<int64_t>(dst_key_caches[pair_idx].data_ptr());
+    dst_value_cache_ptrs[pair_idx] =
+        reinterpret_cast<int64_t>(dst_value_caches[pair_idx].data_ptr());
   }
 
   // block_mapping is a 2D tensor with shape (num_pairs, 2).
   int num_pairs = block_mapping.size(0);
 
-  // Move the data structures to the GPU.
-  // NOTE: This synchronizes the CPU and GPU.
-  torch::Tensor key_cache_ptrs_tensor =
-      torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64)
+  // Move the data structures to the GPU
+  torch::Tensor src_key_cache_ptrs_tensor =
+      torch::from_blob(src_key_cache_ptrs, {num_src_dst_pairs}, torch::kInt64)
+          .to(cache_device);
+  torch::Tensor src_value_cache_ptrs_tensor =
+      torch::from_blob(src_value_cache_ptrs, {num_src_dst_pairs}, torch::kInt64)
+          .to(cache_device);
+  torch::Tensor dst_key_cache_ptrs_tensor =
+      torch::from_blob(dst_key_cache_ptrs, {num_src_dst_pairs}, torch::kInt64)
          .to(cache_device);
-  torch::Tensor value_cache_ptrs_tensor =
-      torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64)
+  torch::Tensor dst_value_cache_ptrs_tensor =
+      torch::from_blob(dst_value_cache_ptrs, {num_src_dst_pairs}, torch::kInt64)
          .to(cache_device);
 
-  // Launch the kernel.
-  const int numel_per_block = key_caches[0][0].numel();
-  dim3 grid(num_layers, num_pairs);
+  // Launch the kernel
+  const int numel_per_block = src_key_caches[0][0].numel();
+  dim3 grid(num_src_dst_pairs, num_pairs);
   dim3 block(std::min(1024, numel_per_block));
   const at::cuda::OptionalCUDAGuard device_guard(cache_device);
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
-      key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
-        vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
-            key_cache_ptrs_tensor.data_ptr<int64_t>(),
-            value_cache_ptrs_tensor.data_ptr<int64_t>(),
+      src_key_caches[0].scalar_type(), "unified_copy_blocks_kernel", ([&] {
+        vllm::unified_copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
+            src_key_cache_ptrs_tensor.data_ptr<int64_t>(),
+            src_value_cache_ptrs_tensor.data_ptr<int64_t>(),
+            dst_key_cache_ptrs_tensor.data_ptr<int64_t>(),
+            dst_value_cache_ptrs_tensor.data_ptr<int64_t>(),
            block_mapping.data_ptr<int64_t>(), numel_per_block);
       }));
 }
 
+// Note: the key_caches and value_caches vectors are constant but
+// not the Tensors they contain. The vectors need to be const refs
+// in order to satisfy pytorch's C++ operator registration code.
+void copy_blocks(std::vector<torch::Tensor> const& key_caches,
+                 std::vector<torch::Tensor> const& value_caches,
+                 const torch::Tensor& block_mapping) {
+  int num_layers = key_caches.size();
+  TORCH_CHECK(num_layers == value_caches.size());
+  if (num_layers == 0) {
+    return;
+  }
+
+  // Call the unified implementation with the same caches for both source and
+  // destination
+  copy_blocks_impl(key_caches, value_caches, key_caches, value_caches,
+                   block_mapping);
+}
+
+// Function to copy blocks between different layers
+void copy_blocks_between_layers(
+    std::vector<torch::Tensor> const& src_key_caches,
+    std::vector<torch::Tensor> const& src_value_caches,
+    std::vector<torch::Tensor> const& dst_key_caches,
+    std::vector<torch::Tensor> const& dst_value_caches,
+    const torch::Tensor& block_mapping) {
+  // Call the unified implementation with separate source and destination
+  // caches
+  copy_blocks_impl(src_key_caches, src_value_caches, dst_key_caches,
+                   dst_value_caches, block_mapping);
+}
+
 // copy blocks kernel for MLA (assumes a joint KV-cache)
 void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
                      const torch::Tensor& block_mapping) {
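
For readers skimming the kernel change, here is a minimal PyTorch sketch of the semantics that copy_blocks_impl / unified_copy_blocks_kernel implement (not the kernel itself): for every (src_block, dst_block) pair in block_mapping, block src_block of each source cache is copied into block dst_block of the corresponding destination cache, and copy_blocks() is simply the case where the source and destination lists are the same tensors. The sketch assumes blocks are indexed along the first cache dimension, which is what numel_per_block = src_key_caches[0][0].numel() suggests.

    import torch

    def copy_blocks_reference(src_key_caches, src_value_caches,
                              dst_key_caches, dst_value_caches,
                              block_mapping):
        # block_mapping: int64 tensor of shape (num_pairs, 2) holding
        # (src_block_idx, dst_block_idx) pairs.
        for src_kc, src_vc, dst_kc, dst_vc in zip(src_key_caches,
                                                  src_value_caches,
                                                  dst_key_caches,
                                                  dst_value_caches):
            for src_block, dst_block in block_mapping.tolist():
                dst_kc[dst_block].copy_(src_kc[src_block])
                dst_vc[dst_block].copy_(src_vc[src_block])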

csrc/torch_bindings.cpp

Lines changed: 9 additions & 0 deletions
@@ -660,6 +660,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
       "copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
   cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla);
 
+  // Copy blocks between different caches
+  cache_ops.def(
+      "copy_blocks_between_layers(Tensor(a!)[] src_key_caches, Tensor(b!)[] "
+      "src_value_caches, "
+      "Tensor(c!)[] dst_key_caches, Tensor(d!)[] dst_value_caches, "
+      "Tensor block_mapping) -> ()");
+  cache_ops.impl("copy_blocks_between_layers", torch::kCUDA,
+                 &copy_blocks_between_layers);
+
   // Reshape the key and value tensors and cache them.
   cache_ops.def(
       "reshape_and_cache(Tensor key, Tensor value,"

examples/offline_inference/spec_decode.py

Lines changed: 2 additions & 2 deletions
@@ -74,9 +74,9 @@ def parse_args():
         action="store_false",
         help="Disable prefill token shift (default: enabled)",
     )
-    parser.add_argument("--target_kv_layer_copy_from", type=int, default=-1)
+    parser.add_argument("--target-kv-layer-copy-from", type=int, default=-1)
     parser.add_argument(
-        "--draft_kv_layer_copy_to",
+        "--draft-kv-layer-copy-to",
         type=str,
        default="",
        help="comma separated list of layer indices to copy to",

tests/kernels/attention/test_cache.py

Lines changed: 34 additions & 0 deletions
@@ -117,6 +117,40 @@ def test_copy_blocks(
                                     cloned_value_caches):
         torch.testing.assert_close(value_cache, cloned_value_cache)
 
+    # Test copy_blocks_between_layers
+    num_source_layers = num_layers // 4
+    source_layers = random.sample(range(num_layers), num_source_layers)
+    target_layers = random.sample(range(num_layers), num_source_layers)
+
+    # Get source and target key/value caches using list comprehension
+    src_key_caches = [key_caches[i] for i in source_layers]
+    src_value_caches = [value_caches[i] for i in source_layers]
+    dst_key_caches = [key_caches[i] for i in target_layers]
+    dst_value_caches = [value_caches[i] for i in target_layers]
+
+    opcheck(torch.ops._C_cache_ops.copy_blocks_between_layers,
+            (src_key_caches, src_value_caches, dst_key_caches,
+             dst_value_caches, block_mapping_tensor),
+            test_utils=DEFAULT_OPCHECK_TEST_UTILS,
+            cond=(head_size == HEAD_SIZES[0]))
+    ops.copy_blocks_between_layers(src_key_caches, src_value_caches,
+                                   dst_key_caches, dst_value_caches,
+                                   block_mapping_tensor)
+    # Run the reference implementation for copy_blocks_between_layers
+    for src, dst in block_mapping:
+        for src_layer, dst_layer in zip(source_layers, target_layers):
+            cloned_key_caches[dst_layer][dst].copy_(
+                cloned_key_caches[src_layer][src])
+            cloned_value_caches[dst_layer][dst].copy_(
+                cloned_value_caches[src_layer][src])
+
+    # Compare the results for copy_blocks_between_layers
+    for src_layer, dst_layer in zip(source_layers, target_layers):
+        torch.testing.assert_close(key_caches[dst_layer],
+                                   cloned_key_caches[dst_layer])
+        torch.testing.assert_close(value_caches[dst_layer],
+                                   cloned_value_caches[dst_layer])
+
 
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)

vllm/_custom_ops.py

Lines changed: 22 additions & 0 deletions
@@ -1655,6 +1655,28 @@ def copy_blocks_mla(kv_caches: list[torch.Tensor],
     torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping)
 
 
+def copy_blocks_between_layers(src_key_caches: list[torch.Tensor],
+                               src_value_caches: list[torch.Tensor],
+                               dst_key_caches: list[torch.Tensor],
+                               dst_value_caches: list[torch.Tensor],
+                               block_mapping: torch.Tensor) -> None:
+    """Copy blocks between different key-value caches across model layers.
+
+    Args:
+        src_key_caches: List of source key cache tensors.
+        src_value_caches: List of source value cache tensors.
+        dst_key_caches: List of destination key cache tensors.
+        dst_value_caches: List of destination value cache tensors.
+        block_mapping: Tensor of shape (num_blocks, 2) containing pairs of
+            (src_block_idx, dst_block_idx) to copy.
+    """
+    torch.ops._C_cache_ops.copy_blocks_between_layers(src_key_caches,
+                                                      src_value_caches,
+                                                      dst_key_caches,
+                                                      dst_value_caches,
+                                                      block_mapping)
+
+
 def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
                 block_mapping: torch.Tensor) -> None:
     torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping)
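
A hedged usage sketch of the new wrapper. The tensor shapes here are made up for illustration; the op requires CUDA tensors whose first dimension indexes blocks and a block layout shared by source and destination caches.

    import torch
    from vllm import _custom_ops as ops

    num_blocks, block_shape = 4, (16, 8, 128)  # hypothetical cache layout
    src_key = [torch.randn(num_blocks, *block_shape, device="cuda")]
    src_value = [torch.randn(num_blocks, *block_shape, device="cuda")]
    dst_key = [torch.zeros_like(src_key[0])]
    dst_value = [torch.zeros_like(src_value[0])]

    # Copy block 0 -> block 2 and block 1 -> block 3 in every src/dst pair.
    block_mapping = torch.tensor([[0, 2], [1, 3]],
                                 dtype=torch.int64, device="cuda")

    ops.copy_blocks_between_layers(src_key, src_value, dst_key, dst_value,
                                   block_mapping)
    assert torch.equal(dst_key[0][2], src_key[0][0])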

vllm/v1/spec_decode/eagle.py

Lines changed: 7 additions & 12 deletions
@@ -16,8 +16,8 @@
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models import supports_multimodal
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
-from vllm.utils import is_pin_memory_available
 from vllm.model_executor.models.utils import extract_layer_index
+from vllm.utils import is_pin_memory_available
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.kv_cache_interface import KVCacheConfig
@@ -109,8 +109,7 @@ def _prepare_adjusted_tensors(
         block_table: torch.Tensor,
         batch_size: int,
         num_tokens: int,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int,
-               torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int]:
         """
         Prepare adjusted tensors for different request types
         (partial prefill, full prefill, full decode).
@@ -130,7 +129,7 @@ def _prepare_adjusted_tensors(
 
         Returns:
             tuple: (target_positions, target_hidden_states, target_slot_mapping,
-                    cu_num_tokens, current_pos, partial_prefill_mask)
+                    cu_num_tokens, current_pos)
 
         Algorithm design:
         - Suppose target tokens are [1,2,3,...N], next token is N+1
@@ -358,7 +357,6 @@ def _prepare_adjusted_tensors(
             target_slot_mapping,
             cu_num_tokens,
             current_pos,
-            partial_prefill_mask,
         )
 
     def propose(
@@ -411,7 +409,6 @@ def propose(
             target_slot_mapping,
             query_start_loc,
             num_tokens,
-            partial_prefill_mask,
         ) = self._prepare_adjusted_tensors(
             target_token_ids,
             target_positions,
@@ -452,19 +449,17 @@ def propose(
             max_num_blocks_per_req = block_table.shape[1]
             segment_indices = torch.arange(len(target_positions),
                                            device=target_positions.device)
-            segment_indices = (
-                segment_indices.unsqueeze(0)
-                >= common_attn_metadata.query_start_loc[:-1].unsqueeze(1)).sum(
-                    dim=0) - 1
+            segment_indices = (segment_indices.unsqueeze(0)
+                               >= common_attn_metadata.query_start_loc[:-1]
+                               .unsqueeze(1)).sum(dim=0) - 1
             # Calculate the block table indices
             block_table_indices = (
                 target_positions // self.block_size +
                 segment_indices * max_num_blocks_per_req)
             block_numbers = block_table.flatten()[block_table_indices]
             block_offsets = target_positions % self.block_size
             common_attn_metadata.slot_mapping = (
-                block_numbers * self.block_size + block_offsets
-            )
+                block_numbers * self.block_size + block_offsets)
 
             # Use the original last token indices
             last_token_indices = common_attn_metadata.query_start_loc[1:] - 1
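
The segment_indices expression reflowed in the last hunk is worth unpacking: query_start_loc marks where each request's tokens begin in the flattened token stream, so comparing every token index against every start offset and summing counts how many request starts the token has passed; subtracting one gives the request (segment) each token belongs to, which then indexes the per-request block table. A toy illustration with made-up numbers:

    import torch

    # Three requests with 3, 2 and 4 tokens in the flattened stream.
    query_start_loc = torch.tensor([0, 3, 5, 9])
    token_idx = torch.arange(int(query_start_loc[-1]))

    # For each token, count how many request starts it has reached; minus 1
    # gives the index of the request (segment) the token belongs to.
    segment_indices = (token_idx.unsqueeze(0)
                       >= query_start_loc[:-1].unsqueeze(1)).sum(dim=0) - 1
    assert segment_indices.tolist() == [0, 0, 0, 1, 1, 2, 2, 2, 2]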
