     choices=["vicuna", "llama2_7b", "llama2_13b", "llama2_70b"],
     help="Specify which model to run.",
 )
-parser.add_argument(
-    "--hf_auth_token",
-    type=str,
-    default=None,
-    help="Specify your own huggingface authentication tokens for models like Llama2.",
-)
 parser.add_argument(
     "--cache_vicunas",
     default=False,
@@ -460,10 +454,6 @@ def __init__(

     def get_tokenizer(self):
         kwargs = {}
-        if self.model_name == "llama2":
-            kwargs = {
-                "use_auth_token": "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"
-            }
         tokenizer = AutoTokenizer.from_pretrained(
             self.hf_model_path,
             use_fast=False,
@@ -1217,7 +1207,6 @@ def __init__(
         self,
         model_name,
         hf_model_path="TheBloke/vicuna-7B-1.1-HF",
-        hf_auth_token: str = None,
         max_num_tokens=512,
         device="cpu",
         precision="int8",
@@ -1237,17 +1226,12 @@ def __init__(
             max_num_tokens,
             extra_args_cmd=extra_args_cmd,
         )
-        if "llama2" in self.model_name and hf_auth_token == None:
-            raise ValueError(
-                "HF auth token required. Pass it using --hf_auth_token flag."
-            )
-        self.hf_auth_token = hf_auth_token
         if self.model_name == "llama2_7b":
-            self.hf_model_path = "meta-llama/Llama-2-7b-chat-hf"
+            self.hf_model_path = "daryl149/llama-2-7b-chat-hf"
         elif self.model_name == "llama2_13b":
-            self.hf_model_path = "meta-llama/Llama-2-13b-chat-hf"
+            self.hf_model_path = "daryl149/llama-2-13b-chat-hf"
         elif self.model_name == "llama2_70b":
-            self.hf_model_path = "meta-llama/Llama-2-70b-chat-hf"
+            self.hf_model_path = "daryl149/llama-2-70b-chat-hf"
         print(f"[DEBUG] hf model name: {self.hf_model_path}")
         self.max_sequence_length = 256
         self.device = device
@@ -1276,18 +1260,15 @@ def get_model_path(self, suffix="mlir"):
         )

     def get_tokenizer(self):
-        kwargs = {"use_auth_token": self.hf_auth_token}
         tokenizer = AutoTokenizer.from_pretrained(
             self.hf_model_path,
             use_fast=False,
-            **kwargs,
         )
         return tokenizer

     def get_src_model(self):
         kwargs = {
             "torch_dtype": torch.float,
-            "use_auth_token": self.hf_auth_token,
         }
         vicuna_model = AutoModelForCausalLM.from_pretrained(
             self.hf_model_path,
@@ -1460,8 +1441,6 @@ def compile(self):
                 self.hf_model_path,
                 self.precision,
                 self.weight_group_size,
-                self.model_name,
-                self.hf_auth_token,
             )
             print(f"[DEBUG] generating torchscript graph")
             is_f16 = self.precision in ["fp16", "int4"]
@@ -1553,24 +1532,18 @@ def compile(self):
                     self.hf_model_path,
                     self.precision,
                     self.weight_group_size,
-                    self.model_name,
-                    self.hf_auth_token,
                 )
             elif self.model_name == "llama2_70b":
                 model = SecondVicuna70B(
                     self.hf_model_path,
                     self.precision,
                     self.weight_group_size,
-                    self.model_name,
-                    self.hf_auth_token,
                 )
             else:
                 model = SecondVicuna7B(
                     self.hf_model_path,
                     self.precision,
                     self.weight_group_size,
-                    self.model_name,
-                    self.hf_auth_token,
                 )
             print(f"[DEBUG] generating torchscript graph")
             is_f16 = self.precision in ["fp16", "int4"]
@@ -1714,7 +1687,6 @@ def generate(self, prompt, cli):
             logits = generated_token_op["logits"]
             pkv = generated_token_op["past_key_values"]
             detok = generated_token_op["detok"]
-
             if token == 2:
                 break
             res_tokens.append(token)
@@ -1809,7 +1781,6 @@ def create_prompt(model_name, history):
     )
     vic = UnshardedVicuna(
         model_name=args.model_name,
-        hf_auth_token=args.hf_auth_token,
         device=args.device,
         precision=args.precision,
         vicuna_mlir_path=vic_mlir_path,
@@ -1851,9 +1822,9 @@ def create_prompt(model_name, history):

     model_list = {
         "vicuna": "vicuna=>TheBloke/vicuna-7B-1.1-HF",
-        "llama2_7b": "llama2_7b=>meta-llama/Llama-2-7b-chat-hf",
-        "llama2_13b": "llama2_13b=>meta-llama/Llama-2-13b-chat-hf",
-        "llama2_70b": "llama2_70b=>meta-llama/Llama-2-70b-chat-hf",
+        "llama2_7b": "llama2_7b=>daryl149/llama-2-7b-chat-hf",
+        "llama2_13b": "llama2_7b=>daryl149/llama-2-13b-chat-hf",
+        "llama2_70b": "llama2_7b=>daryl149/llama-2-70b-chat-hf",
     }
     while True:
         # TODO: Add break condition from user input
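The net effect of the diff is that the Llama 2 checkpoints are pulled from the public daryl149 mirrors, so no Hugging Face authentication token is threaded through the pipeline anymore. As a rough illustration (a minimal sketch that reuses only the transformers calls already visible in the diff, with the 7B mirror path picked as an example), loading the tokenizer and source model now reduces to:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Public mirror path used by the patch; no use_auth_token argument is needed.
hf_model_path = "daryl149/llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(
    hf_model_path,
    use_fast=False,
)
model = AutoModelForCausalLM.from_pretrained(
    hf_model_path,
    torch_dtype=torch.float,
)

This also removes the hardcoded token that was previously embedded in get_tokenizer, so nothing secret remains in the source.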