Infini-AI-Lab · preminstrel · Jan 30, 2025
diff --git a/.gitignore b/.gitignore
@@ -171,4 +171,7 @@ cython_debug/
 .pypirc
 .vscode/
 app/.gradio/
-test*
+test*
+
+
+t.ipynb
diff --git a/README.md b/README.md
@@ -186,6 +186,7 @@ Evaluated on `ananyarn/Algorithm_and_Python_Source_Code`.
 ### 2.1 Install
 ```bash
 conda create -n umbrella python=3.10
+conda activate umbrella
 bash install.sh
 ```
 ### 2.2 CLI Chatbot

diff --git a/configs/chat_config_ar.json b/configs/chat_config_ar.json
@@ -0,0 +1,15 @@
+{
+    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "offload": false,
+    "max_length": 2048,
+    "num_cache_layers": 0,
+    "generation_length": 256,
+    "max_turns": 16,
+    "topk": 32,
+    "temperature": 0.6,
+    "topp": 0.9,
+    "repetition_penalty": 1.05,
+    "exit_layer":16,
+    "engine": "ar",
+    "template": "meta-llama3"
+}
diff --git a/umbrella/attn/cache.py b/umbrella/attn/cache.py
@@ -1,7 +1,9 @@
 from transformers import AutoConfig
 import torch
 import flashinfer
+from flash_attn import flash_attn_with_kvcache
 import math
+
 class KV_Cache:
 
     def __init__(self, 
@@ -16,6 +18,7 @@ def __init__(self,
         self.dtype = dtype
         self.k_cache = torch.zeros(
             config.num_hidden_layers,
+            batch_size,
             max_length,
             config.num_key_value_heads,
             config.hidden_size // config.num_attention_heads,
@@ -25,6 +28,7 @@ def __init__(self,
 
         self.v_cache = torch.zeros(
             config.num_hidden_layers,
+            batch_size,
             max_length,
             config.num_key_value_heads,
             config.hidden_size // config.num_attention_heads,
@@ -40,11 +44,11 @@ def __init__(self,
 
     def gather_kv_incremental(self, indices: torch.LongTensor, offset:int):
 
-        self.k_cache[:,offset:offset + len(indices), :,:] = self.k_cache[:,indices, :,:]
-        self.v_cache[:,offset:offset + len(indices), :,:] = self.v_cache[:,indices, :,:]
+        self.k_cache[:,:,offset:offset + len(indices), :,:] = self.k_cache[:,:,indices, :,:]
+        self.v_cache[:,:,offset:offset + len(indices), :,:] = self.v_cache[:,:,indices, :,:]
 
-        self.k_cache[:,offset + len(indices):, :,:] = 0.0
-        self.v_cache[:,offset + len(indices):, :,:] = 0.0
+        self.k_cache[:,:,offset + len(indices):, :,:] = 0.0
+        self.v_cache[:,:,offset + len(indices):, :,:] = 0.0
 
         self.kv_offset = offset + len(indices)
 
@@ -54,33 +58,39 @@ def update_kv_cache(self,
             new_k_cache :torch.Tensor,
             new_v_cache :torch.Tensor,
             layer_idx :int,
-            storage_ids: torch.LongTensor
+            storage_ids: torch.LongTensor=None
             ):
 
-        new_kv_len = storage_ids.shape[0]
+        new_kv_len = new_k_cache.shape[1] # new_k_cache layout: [bsz, seq, num_kv_heads, head_dim]
         if layer_idx == 0:
             self.kv_offset += new_kv_len
-        self.k_cache[layer_idx][self.kv_offset - new_kv_len:self.kv_offset] = new_k_cache
-        self.v_cache[layer_idx][self.kv_offset - new_kv_len:self.kv_offset] = new_v_cache
-        return self.k_cache[layer_idx][:self.kv_offset], self.v_cache[layer_idx][:self.kv_offset]
+        self.k_cache[layer_idx][:, self.kv_offset - new_kv_len:self.kv_offset] = new_k_cache
+        self.v_cache[layer_idx][:, self.kv_offset - new_kv_len:self.kv_offset] = new_v_cache
+        return self.k_cache[layer_idx][:, :self.kv_offset], self.v_cache[layer_idx][:, :self.kv_offset]
 
     def compute_attention(self, 
         query_states :torch.Tensor,
         key_states :torch.Tensor, 
         value_states :torch.Tensor,
         layer_idx, 
-        storage_ids :torch.Tensor,
-        attention_mask :torch.Tensor):
+        storage_ids :torch.Tensor=None,
+        attention_mask :torch.Tensor=None):
+
+        key_states, value_states = self.update_kv_cache(key_states, value_states, layer_idx, storage_ids)
 
-        key_states, value_states = self.update_kv_cache(key_states[0], value_states[0], layer_idx, storage_ids)
-        hidden_states = flashinfer.single_prefill_with_kv_cache(
-                q = query_states[0],
-                k = key_states,
-                v = value_states,
-                kv_layout="NHD",
-                custom_mask=attention_mask[:,:self.kv_offset],
-                allow_fp16_qk_reduction=True
-            )
+        if attention_mask is not None:
+            hidden_states = flashinfer.single_prefill_with_kv_cache(
+                    q = query_states[0],
+                    k = key_states[0],
+                    v = value_states[0],
+                    kv_layout="NHD",
+                    custom_mask=attention_mask[:,:self.kv_offset],
+                    allow_fp16_qk_reduction=True
+                )
+        else:
+            # No custom mask supplied: use flash-attn's KV-cache kernel with causal masking.
+            # print(query_states.shape, key_states.shape, value_states.shape)
+            hidden_states = flash_attn_with_kvcache(q=query_states, k_cache=key_states, v_cache=value_states, causal=True)
 
         return hidden_states
 
@@ -92,6 +102,9 @@ def clear(self):
     def set_kv_len(self, kv_len :int):
             self.kv_offset = kv_len
 
+    def get_kv_len(self):
+        return self.kv_offset
+
 
 class StaticKV_Cache:
 

diff --git a/umbrella/models/auto_model.py b/umbrella/models/auto_model.py
@@ -60,6 +60,9 @@ class AutoModelLM:
         "meta-llama/Llama-3.3-70B-Instruct": Llama,
         "meta-llama/Llama-3.1-70B-Instruct": Llama,
         "meta-llama/Llama-3.1-8B-Instruct": Llama,
+
+        "gradientai/Llama-3-8B-Instruct-Gradient-1048k": Llama,
+
         "meta-llama/Meta-Llama-3-70B-Instruct": Llama,
         "meta-llama/Meta-Llama-3-8B-Instruct": Llama,
         "meta-llama/Llama-3.2-1B-Instruct": Llama,

diff --git a/umbrella/models/llama.py b/umbrella/models/llama.py
@@ -8,6 +8,7 @@
 from .base import LLMBase
 from .model_utils import apply_rotary_pos_emb, layer_norm, capture_graph
 from tqdm import tqdm
+
 class Llama(LLMBase):
     def __init__(self, 
         model_name: str,
@@ -78,8 +79,8 @@ def layer_compute(self,
             layer_idx :int, 
             hidden_states: torch.FloatTensor, 
             position_ids: torch.LongTensor, 
-            attention_mask: torch.FloatTensor,
-            storage_ids: torch.LongTensor):
+            attention_mask: torch.FloatTensor=None,
+            storage_ids: torch.LongTensor=None):
 
         residual = hidden_states
         bsz, q_len, _ = hidden_states.size()
@@ -118,8 +119,8 @@ def layer_compute(self,
     def inference(self,
             input_ids: torch.LongTensor,
             position_ids: torch.LongTensor,
-            attention_mask: torch.FloatTensor,
-            storage_ids: torch.LongTensor):
+            attention_mask: torch.FloatTensor=None,
+            storage_ids: torch.LongTensor=None):
 
         hidden_states = F.embedding(input_ids, self.embed_tokens)  
         for idx in range(self.num_layers):
-Original file line number
+Diff line change
@@ Expand Up / @@ -171,4 +171,7 @@ cython_debug/ @@
     .pypirc
     .vscode/
     app/.gradio/
-    test*
+    test*
+    t.ipynb