Enable SQLCoder2 (defog/sqlcoder2) as a selectable model

pramitchoudhary · web-flow · commit 81510e3d24fa · 2023-10-12T18:29:33.000-07:00
diff --git a/sidekick/configs/prompt_template.py b/sidekick/configs/prompt_template.py
@@ -76,3 +76,32 @@
 -- Using reference for TABLES '{table_name}' {context}; {question_txt}?
 
 SELECT"""
+
+# https://colab.research.google.com/drive/13BIKsqHnPOBcQ-ba2p77L5saiepTIwu0#scrollTo=0eI-VpCkf-fN
+STARCODER2_PROMPT = """
+### Instructions:
+Your task is convert a question into a SQL query, given a sqlite database schema.
+Only use the column names from the CREATE TABLE statement.
+Adhere to these rules:
+- **Deliberately go through the question and database schema word by word** to appropriately answer the question
+- **Use Table Aliases** to prevent ambiguity. For example, `SELECT table1.col1, table2.col1 FROM table1 JOIN table2 ON table1.id = table2.id`.
+- When creating a ratio, always cast the numerator as float
+- Use COUNT(1) instead of COUNT(*)
+- If the question is asking for a rate, use COUNT to compute percentage
+- Avoid overly complex SQL queries
+- Avoid using the WITH statement
+- Don't use aggregate and window function together
+- Prefer NOT EXISTS to LEFT JOIN ON null id
+- When using DESC keep NULLs at the end
+- If JSONB format found in Table schema, do pattern matching on keywords from the question and use SQL functions such as ->> or ->
+
+### Input:
+For SQL TABLE '{table_name}' with sample question/answer pairs,\n({sample_queries}), create a SQL query to answer the following question:\n{question_txt}.
+This query will run on a database whose schema is represented in this string:
+CREATE TABLE '{table_name}' ({column_info}
+);
+
+-- Table '{table_name}', {context}, has sample values ({data_info_detailed})
+
+### Response:
+SELECT"""
diff --git a/sidekick/prompter.py b/sidekick/prompter.py
@@ -335,6 +335,7 @@ def query_api(
     table_info_path: str,
     sample_queries_path: str,
     table_name: str,
+    model_name: str = "h2ogpt-sql-nsql-llama-2-7B",
     is_regenerate: bool = False,
     is_regen_with_options: bool = False,
     is_command: bool = False,
@@ -365,7 +366,7 @@ def query_api(
     logger.info(f"Table in use: {table_names}")
     # Check if env.toml file exists
     api_key = None
-    if model_name != "h2ogpt-sql":
+    if "h2ogpt-sql" not in model_name:
         api_key = env_settings["MODEL_INFO"]["OPENAI_API_KEY"]
         if api_key is None or api_key == "":
             if os.getenv("OPENAI_API_KEY") is None or os.getenv("OPENAI_API_KEY") == "":
@@ -414,6 +415,7 @@ def query_api(
         sql_g = SQLGenerator(
             db_url,
             api_key,
+            model_name=model_name,
             job_path=base_path,
             data_input_path=table_info_path,
             sample_queries_path=sample_queries_path,
diff --git a/sidekick/query.py b/sidekick/query.py
@@ -17,7 +17,7 @@
 from llama_index.indices.struct_store import SQLContextContainerBuilder
 from sidekick.configs.prompt_template import (DEBUGGING_PROMPT,
                                               NSQL_QUERY_PROMPT, QUERY_PROMPT,
-                                              TASK_PROMPT)
+                                              STARCODER2_PROMPT, TASK_PROMPT)
 from sidekick.logger import logger
 from sidekick.utils import (_check_file_info, filter_samples, is_resource_low,
                             load_causal_lm_model, load_embedding_model,
@@ -32,7 +32,7 @@ def __new__(
         cls,
         db_url: str,
         openai_key: str = None,
-        model_name="NumbersStation/nsql-llama-2-7B",
+        model_name="h2ogpt-sql-nsql-llama-2-7B",
         data_input_path: str = "./table_info.jsonl",
         sample_queries_path: str = "./samples.csv",
         job_path: str = "./",
@@ -65,7 +65,7 @@ def __init__(
         self,
         db_url: str,
         openai_key: str = None,
-        model_name="NumbersStation/nsql-llama-2-7B",
+        model_name="h2ogpt-sql-nsql-llama-2-7B",
         data_input_path: str = "./table_info.jsonl",
         sample_queries_path: str = "./samples.csv",
         job_path: str = "./",
@@ -281,7 +281,7 @@ def generate_sql(
             context_queries = self.content_queries
             self.context_builder = SQLContextContainerBuilder(self.sql_database, context_dict=table_context_dict)
 
-            if model_name != "h2ogpt-sql":
+            if "h2ogpt-sql" not in model_name:
                 _tasks = self.task_formatter(self._tasks)
 
                 # TODO: The need to pass data info again could be eliminated if Task generation becomes more consistent and accurate.
@@ -427,7 +427,11 @@ def generate_sql(
                 logger.debug(f"Relevant sample column values: {data_samples_list}")
                 _table_name = ", ".join(table_names)
 
-                query = NSQL_QUERY_PROMPT.format(
+                query_prompt_format = STARCODER2_PROMPT
+                if model_name == "h2ogpt-sql-nsql-llama-2-7B":
+                    query_prompt_format = NSQL_QUERY_PROMPT
+
+                query = query_prompt_format.format(
                     table_name=_table_name,
                     column_info=_column_info,
                     data_info_detailed=data_samples_list,
@@ -449,7 +453,7 @@ def generate_sql(
                 # 3. Maybe positional interpolation --> https://arxiv.org/abs/2306.15595
                 if int(input_length) > 4000:
                     logger.info("Input length is greater than 1748, removing column description from the prompt")
-                    query = NSQL_QUERY_PROMPT.format(
+                    query = query_prompt_format.format(
                         table_name=_table_name,
                         column_info=_column_info,
                         data_info_detailed="",
diff --git a/sidekick/schema_generator.py b/sidekick/schema_generator.py
@@ -9,10 +9,12 @@ def generate_schema(data_path, output_path):
     schema = df.dtypes.to_dict()
     schema_list = []
     special_characters = {" ": "_", ":": "_", "/": "_", "-": "_"}
+    syntax_names = ["default"]
 
     for key, value in schema.items():
         new_key = "".join(special_characters[s] if s in special_characters.keys() else s for s in key)
-
+        if new_key.lower() in syntax_names:
+            new_key = new_key + "_col"
         if value == "object":
             value = "TEXT"
             unique_values = df[key].dropna().unique().tolist()
diff --git a/sidekick/utils.py b/sidekick/utils.py
@@ -193,7 +193,7 @@ def read_sample_pairs(input_path: str, model_name: str = "h2ogpt-sql"):
     df = df.reset_index(drop=True)
 
     # NSQL format
-    if model_name != "h2ogpt-sql":
+    if "h2ogpt-sql" not in model_name:
         # Open AI format
         # Convert frame to below format
         # [
@@ -281,7 +281,7 @@ def is_resource_low():
 
 
 def load_causal_lm_model(
-    model_name: str,
+    model_type: str,
     cache_path: str,
     device: str,
     load_in_8bit: bool = False,
@@ -290,7 +290,13 @@ def load_causal_lm_model(
     re_generate: bool = False,
 ):
     try:
-        # Load h2oGPT.NSQL model
+        model_choices_map = {
+            "h2ogpt-sql-nsql-llama-2-7B": "NumbersStation/nsql-llama-2-7B",
+            "h2ogpt-sql-sqlcoder2": "defog/sqlcoder2",
+        }
+        model_name = model_choices_map[model_type]
+        logger.info(f"Loading model: {model_name}")
+        # Load h2oGPT.SQL model
         device = {"": 0} if torch.cuda.is_available() else "cpu" if device == "auto" else device
         total_memory = int(torch.cuda.get_device_properties(0).total_memory / 1024**3)
         free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)
@@ -402,7 +408,7 @@ def check_vulnerability(input_query: str):
         r'\b(SELECT\s+\*\s+FROM\s+\w+\s+WHERE\s+\w+\s*=\s*[\'"].*?[\'"]\s*;?\s*--)',
         r'\b(INSERT\s+INTO\s+\w+\s+\(\s*\w+\s*,\s*\w+\s*\)\s+VALUES\s*\(\s*[\'"].*?[\'"]\s*,\s*[\'"].*?[\'"]\s*\)\s*;?\s*--)',
         r"\b(DROP\s+TABLE|ALTER\s+TABLE|admin\'--)",  # DROP TABLE/ALTER TABLE
-        r"(?:'|\”|--|#|‘\s*OR\s*‘1|‘\s*OR\s*\d+\s*--\s*-|\"\s*OR\s*\"\" = \"|\"\s*OR\s*\d+\s*=\s*\d+\s*--\s*-|’\s*OR\s*''\s*=\s*‘|‘=‘|'=0--+|OR\s*\d+\s*=\s*\d+|‘\s*OR\s*‘x’=‘x’|AND\s*id\s*IS\s*NULL;\s*--|‘’’’’’’’’’’’’UNION\s*SELECT\s*‘\d+|%00|/\*.*?\*/|\+|\|\||%|@\w+|@@\w+)",
+        r"(?:'|\”|--|#|‘\s*OR\s*‘1|‘\s*OR\s*\d+\s*--\s*-|\"\s*OR\s*\"\" = \"|\"\s*OR\s*\d+\s*=\s*\d+\s*--\s*-|’\s*OR\s*''\s*=\s*‘|‘=‘|'=0--+|OR\s*\d+\s*=\s*\d+|‘\s*OR\s*‘x’=‘x’|AND\s*id\s*IS\s*NULL;\s*--|‘’’’’’’’’’’’’UNION\s*SELECT\s*‘\d+|%00|/\*.*?\*/|\+|\|\||@\w+|@@\w+)",
         r"AND\s[01]|AND\s(true|false)|[01]-((true|false))",
         r"\d+'\s*ORDER\s*BY\s*\d+--\+|\d+'\s*GROUP\s*BY\s*(\d+,)*\d+--\+|'\s*GROUP\s*BY\s*columnnames\s*having\s*1=1\s*--",
         r"\bUNION\b\s+\b(?:ALL\s+)?\bSELECT\b\s+[A-Za-z0-9]+",  # Union Based
diff --git a/start.py b/start.py
@@ -8,7 +8,10 @@
 
 logging.info(f"Download model...")
 base_path = (Path(__file__).parent).resolve()
+# Model 1:
 snapshot_download(repo_id="NumbersStation/nsql-llama-2-7B", cache_dir=f"{base_path}/models/")
+# Model 2:
+snapshot_download(repo_id="defog/sqlcoder2", cache_dir=f"{base_path}/models/")
 logging.info(f"Download embedding model...")
 snapshot_download(repo_id="BAAI/bge-base-en", cache_dir=f"{base_path}/models/sentence_transformers/")
 
diff --git a/ui/app.py b/ui/app.py
@@ -86,6 +86,11 @@ async def chat(q: Q):
                 original_name = meta_data[table].get("original_name", q.user.original_name)
                 table_names.append(ui.choice(table, f"{original_name}"))
 
+    model_choices = [
+        ui.choice("h2ogpt-sql-nsql-llama-2-7B", "h2ogpt-sql-nsql-llama-2-7B"),
+        ui.choice("h2ogpt-sql-sqlcoder2", "h2ogpt-sql-sqlcoder2"),
+    ]
+    q.user.model_choice_dropdown = "h2ogpt-sql-sqlcoder2"
     add_card(
         q,
         "background_card",
@@ -111,7 +116,15 @@ async def chat(q: Q):
                     choices=table_names,
                     value=q.user.table_name if q.user.table_name else None,
                     trigger=True,
-                )
+                ),
+                ui.dropdown(
+                    name="model_choice_dropdown",
+                    label="Model Choice",
+                    required=True,
+                    choices=model_choices,
+                    value=q.user.model_choice_dropdown if q.user.model_choice_dropdown else None,
+                    trigger=True,
+                ),
             ],
         ),
     )
@@ -209,6 +222,7 @@ async def chatbot(q: Q):
                     sample_queries_path=q.user.sample_qna_path,
                     table_info_path=q.user.table_info_path,
                     table_name=q.user.table_name,
+                    model_name=q.user.model_choice_dropdown,
                     is_regenerate=True,
                     is_regen_with_options=False,
                 )
@@ -227,6 +241,7 @@ async def chatbot(q: Q):
                     sample_queries_path=q.user.sample_qna_path,
                     table_info_path=q.user.table_info_path,
                     table_name=q.user.table_name,
+                    model_name=q.user.model_choice_dropdown,
                     is_regenerate=False,
                     is_regen_with_options=True,
                 )
@@ -248,6 +263,7 @@ async def chatbot(q: Q):
                 sample_queries_path=q.user.sample_qna_path,
                 table_info_path=q.user.table_info_path,
                 table_name=q.user.table_name,
+                model_name=q.user.model_choice_dropdown,
             )
             llm_response = "\n".join(llm_response)
     except (MemoryError, RuntimeError) as e:
@@ -567,12 +583,23 @@ async def on_event(q: Q):
     elif q.args.regenerate:
         q.args.chatbot = "regenerate"
 
-    if q.args.table_dropdown and not q.args.chatbot:
+    if q.args.table_dropdown and not q.args.chatbot and q.user.table_name != q.args.table_dropdown:
         logging.info(f"User selected table: {q.args.table_dropdown}")
         await submit_table(q)
         q.args.chatbot = f"Table {q.args.table_dropdown} selected"
         # Refresh response is triggered when user selects a table via dropdown
         event_handled = True
+    if (
+        q.args.model_choice_dropdown
+        and not q.args.chatbot
+        and q.user.model_choice_dropdown != q.args.model_choice_dropdown
+    ):
+        logging.info(f"User selected model type: {q.args.model_choice_dropdown}")
+        q.user.model_choice_dropdown = q.args.model_choice_dropdown
+        q.page["select_tables"].model_choice_dropdown.value = q.user.model_choice_dropdown
+        q.args.chatbot = f"Model {q.args.model_choice_dropdown} selected"
+        # Refresh response is triggered when user selects a model via dropdown
+        event_handled = True
 
     if q.args.save_conversation or (q.args.chatbot and "save the qna pair:" in q.args.chatbot.lower()):
         question = q.client.query