@@ -13,7 +13,7 @@
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
 llm_model_map = {
-    "llama2_7b": {
+    "meta-llama/Llama-2-7b-chat-hf": {
         "initializer": stateless_llama.export_transformer_model,
         "hf_model_name": "meta-llama/Llama-2-7b-chat-hf",
         "compile_flags": ["--iree-opt-const-expr-hoisting=False"],
@@ -258,7 +258,8 @@ def format_out(results):
 
         history.append(format_out(token))
         while (
-            format_out(token) != llm_model_map["llama2_7b"]["stop_token"]
+            format_out(token)
+            != llm_model_map["meta-llama/Llama-2-7b-chat-hf"]["stop_token"]
             and len(history) < self.max_tokens
         ):
             dec_time = time.time()
@@ -272,7 +273,10 @@ def format_out(results):
 
             self.prev_token_len = token_len + len(history)
 
-            if format_out(token) == llm_model_map["llama2_7b"]["stop_token"]:
+            if (
+                format_out(token)
+                == llm_model_map["meta-llama/Llama-2-7b-chat-hf"]["stop_token"]
+            ):
                 break
 
         for i in range(len(history)):
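Both reformatted conditions above implement one pattern: keep decoding until the model emits the configured stop token or the history reaches max_tokens. A minimal standalone sketch of that loop shape, assuming a stop-token id of 2 (the usual </s> id for Llama-2-family tokenizers) and using stand-in decode_next/format_out helpers that are not part of this patch:

    STOP_TOKEN = 2    # assumption: </s> id for Llama-2-style tokenizers
    MAX_TOKENS = 8

    def decode_next(token: int) -> int:
        # Stand-in for one model decode step; emits the stop token last.
        return token + 1 if token + 1 < STOP_TOKEN else STOP_TOKEN

    def format_out(token: int) -> int:
        # Stand-in for the result post-processing in the real code.
        return int(token)

    token = 0
    history = [format_out(token)]
    while (
        format_out(token) != STOP_TOKEN
        and len(history) < MAX_TOKENS
    ):
        token = decode_next(token)
        history.append(format_out(token))
        if format_out(token) == STOP_TOKEN:
            break
    print(history)  # [0, 1, 2]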
@@ -306,7 +310,7 @@ def chat_hf(self, prompt):
             self.first_input = False
 
         history.append(int(token))
-        while token != llm_model_map["llama2_7b"]["stop_token"]:
+        while token != llm_model_map["meta-llama/Llama-2-7b-chat-hf"]["stop_token"]:
             dec_time = time.time()
             result = self.hf_mod(token.reshape([1, 1]), past_key_values=pkv)
             history.append(int(token))
@@ -317,7 +321,7 @@ def chat_hf(self, prompt):
 
             self.prev_token_len = token_len + len(history)
 
-            if token == llm_model_map["llama2_7b"]["stop_token"]:
+            if token == llm_model_map["meta-llama/Llama-2-7b-chat-hf"]["stop_token"]:
                 break
         for i in range(len(history)):
             if type(history[i]) != int:
@@ -347,7 +351,11 @@ def llm_chat_api(InputData: dict):
     else:
         print(f"prompt : {InputData['prompt']}")
 
-    model_name = InputData["model"] if "model" in InputData.keys() else "llama2_7b"
+    model_name = (
+        InputData["model"]
+        if "model" in InputData.keys()
+        else "meta-llama/Llama-2-7b-chat-hf"
+    )
     model_path = llm_model_map[model_name]
     device = InputData["device"] if "device" in InputData.keys() else "cpu"
     precision = "fp16"
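With the new default, a request that omits "model" now resolves to the full repo id, so existing callers keep working while explicit callers pass the Hugging Face name. A minimal sketch of the default resolution; the resolve_request helper and the payloads are illustrative only:

    def resolve_request(InputData: dict) -> tuple:
        # Mirrors the defaulting logic in llm_chat_api above.
        model_name = (
            InputData["model"]
            if "model" in InputData.keys()
            else "meta-llama/Llama-2-7b-chat-hf"
        )
        device = InputData["device"] if "device" in InputData.keys() else "cpu"
        return model_name, device

    print(resolve_request({"prompt": "hi"}))
    # ('meta-llama/Llama-2-7b-chat-hf', 'cpu')
    print(resolve_request({"prompt": "hi", "device": "vulkan"}))
    # ('meta-llama/Llama-2-7b-chat-hf', 'vulkan')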