Infini-AI-Lab · 99versatile · Feb 11, 2025 · Feb 11, 2025
diff --git a/examples/generate.py b/examples/generate.py
@@ -6,7 +6,7 @@
 logger = setup_logger()
 import torch
 from umbrella.templates import Prompts, SysPrompts
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM, MistralForCausalLM
 from umbrella.speculation.speculation_utils import make_causal_mask, is_sentence_complete_regex, find_first_element_position
 import argparse
 import time
@@ -30,6 +30,7 @@
 text = system_prompt + text
 
 tokenizer = AutoTokenizer.from_pretrained(args.model)
+
 tokens = tokenizer.encode(text=text, return_tensors="pt").to(DEVICE)
 
 llm = AutoModelLM.from_pretrained(

diff --git a/examples/generate_directly.py b/examples/generate_directly.py
@@ -0,0 +1,44 @@
+# Reference script: greedy generation with the stock Hugging Face
+# Mistral-7B-v0.3 checkpoint, loaded directly through transformers.
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, MistralForCausalLM
+from umbrella.speculation.speculation_utils import make_causal_mask, is_sentence_complete_regex, find_first_element_position
+
+MODEL_NAME = "mistralai/Mistral-7B-v0.3"
+DEVICE = "cuda:0"
+MAX_NEW_TOKENS = 512
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+
+# `attn_implementation` is the documented from_pretrained keyword for
+# choosing the attention backend; "eager" selects the manual
+# (non-fused) attention implementation.
+model = MistralForCausalLM.from_pretrained(
+    MODEL_NAME,
+    torch_dtype=torch.float16,
+    attn_implementation="eager",
+).to(DEVICE)
+
+# NOTE(review): Mistral-7B-v0.3 appears to be the base checkpoint; confirm
+# the [INST] ... [/INST] instruct formatting below is intended.
+# No literal "<s>" in the prompt: the tokenizer is expected to prepend BOS
+# itself when special tokens are enabled (the default), so spelling it out
+# would feed the model two BOS tokens.
+text = "[INST] Tell me what you know about Reinforcement Learning in 100 words.[/INST]"
+
+# Calling the tokenizer (instead of .encode) also yields the attention
+# mask, so generate() does not have to infer it.
+inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
+
+# Greedy decoding. EOS is passed explicitly as the padding id instead of
+# relying on generate()'s fallback when the tokenizer defines no pad token.
+output = model.generate(
+    input_ids=inputs["input_ids"],
+    attention_mask=inputs["attention_mask"],
+    do_sample=False,
+    max_new_tokens=MAX_NEW_TOKENS,
+    pad_token_id=tokenizer.eos_token_id,
+)
+
+# The decoded string contains the prompt followed by the completion.
+print(tokenizer.decode(output[0], skip_special_tokens=True))
diff --git a/umbrella/models/auto_model.py b/umbrella/models/auto_model.py
@@ -1,6 +1,6 @@
 from .llama import Llama, LlamaAwq, LlamaOffload, LlamaAwqOffload, LlamaCudagraph
 from .qwen import Qwen, QwenOffload, QwenAwq, QwenAwqOffload, QwenCudagraph
-from .gemma import Gemma2
+from .mistral import Mistral, MistralAwq, MistralOffload, MistralAwqOffload, MistralCudagraph
 class AutoModelLM:
     """
     自动模型加载器，根据模型类型动态加载对应的类。
@@ -17,6 +17,9 @@ class AutoModelLM:
         "meta-llama/Llama-3.1-8B-Instruct": LlamaOffload,
         "meta-llama/Meta-Llama-3-70B-Instruct": LlamaOffload,
         "meta-llama/Meta-Llama-3-8B-Instruct": LlamaOffload,
+        "deepseek-ai/DeepSeek-R1-Distill-Llama-8B":LlamaOffload,
+        "deepseek-ai/DeepSeek-R1-Distill-Llama-70B":LlamaOffload,
+        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B":QwenOffload,
         "Qwen/Qwen2.5-Coder-72B-Instruct": QwenOffload,
         "Qwen/Qwen2.5-Coder-32B-Instruct": QwenOffload,
         "Qwen/Qwen2.5-Coder-14B-Instruct": QwenOffload,
@@ -47,8 +50,8 @@ class AutoModelLM:
         "Qwen/Qwen2.5-32B-Instruct-AWQ": QwenAwqOffload,
         "Qwen/Qwen2.5-72B-Instruct-AWQ": QwenAwqOffload,
         "KirillR/QwQ-32B-Preview-AWQ": QwenAwqOffload,
-        "casperhansen/deepseek-r1-distill-qwen-32b-awq":QwenAwqOffload
-
+        "casperhansen/deepseek-r1-distill-qwen-32b-awq":QwenAwqOffload,
+        "mistralai/Mistral-7B-v0.3": MistralOffload,   # Mistral 7B added by EJ
     }
 
     _MODEL_MAPPING = {
@@ -73,6 +76,12 @@ class AutoModelLM:
         "Zhuominc/Coder-400M-IT": Llama,
         "Zhuominc/FastCode-500M": Llama,
         "InfiniAILab/CodeDrafter-500M": Llama,
+        "deepseek-ai/DeepSeek-R1-Distill-Llama-8B":Llama,
+        "deepseek-ai/DeepSeek-R1-Distill-Llama-70B":Llama,
+        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B":Qwen,
+        "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B":Qwen,
+        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B":Qwen,
+        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B":Qwen,
         "Qwen/Qwen2.5-Coder-72B-Instruct": Qwen,
         "Qwen/Qwen2.5-Coder-32B-Instruct": Qwen,
         "Qwen/Qwen2.5-Coder-14B-Instruct": Qwen,
@@ -104,8 +113,7 @@ class AutoModelLM:
         "Qwen/Qwen2.5-72B-Instruct-AWQ": QwenAwq,
         "KirillR/QwQ-32B-Preview-AWQ": QwenAwq,
         "casperhansen/deepseek-r1-distill-qwen-32b-awq":QwenAwq,
-        "google/gemma-2-2b-it": Gemma2,
-        "google/gemma-2-2b": Gemma2
+        "mistralai/Mistral-7B-v0.3": Mistral,   # Mistral 7B added by EJ
     }
 
     _CUDAGRAPH_MODEL_MAPPING = {
@@ -122,6 +130,7 @@ class AutoModelLM:
         "Zhuominc/Coder-400M-IT": LlamaCudagraph,
         "Zhuominc/FastCode-500M": LlamaCudagraph,
         "InfiniAILab/CodeDrafter-500M": LlamaCudagraph,
+        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B":QwenCudagraph,
         "Qwen/Qwen2.5-Coder-72B-Instruct": QwenCudagraph,
         "Qwen/Qwen2.5-Coder-32B-Instruct": QwenCudagraph,
         "Qwen/Qwen2.5-Coder-14B-Instruct": QwenCudagraph,
@@ -136,7 +145,8 @@ class AutoModelLM:
         "Qwen/Qwen2.5-14B-Instruct": QwenCudagraph,
         "Qwen/Qwen2.5-32B-Instruct": QwenCudagraph,
         "Qwen/Qwen2.5-72B-Instruct": QwenCudagraph,
-        "Qwen/QwQ-32B-Preview": QwenCudagraph
+        "Qwen/QwQ-32B-Preview": QwenCudagraph, 
+        "mistralai/Mistral-7B-v0.3": MistralCudagraph,   # Mistral 7B added by EJ
     }
 
     @classmethod
@@ -165,4 +175,4 @@ def from_pretrained(cls, model_name, offload=False, cuda_graph=False, **kwargs):
                 raise ValueError(f"Model type '{model_name}' is not supported (offload). "
                                 f"Supported (offload) types: {list(cls._OFFLOAD_MODEL_MAPPING.keys())}")
             model_class = cls._OFFLOAD_MODEL_MAPPING[model_name]
-            return model_class(model_name = model_name, **kwargs)
+            return model_class(model_name = model_name, **kwargs)