
Commit abe06ef

Pass Db dialect properly
2 parents 9fec65a + 7bdd709

File tree

9 files changed: +1172 −73 lines


app.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ title = "SQL-Sidekick"
 description = "QnA with tabular data using NLQ"
 LongDescription = "about.md"
 Tags = ["DATA_SCIENCE", "MACHINE_LEARNING", "NLP"]
-Version = "0.1.7"
+Version = "0.1.8"

 [Runtime]
 MemoryLimit = "64Gi"

examples/notebooks/databricks_db.ipynb

Lines changed: 1065 additions & 0 deletions
Large diffs are not rendered by default.
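
The notebook itself is not rendered above. As a rough, hypothetical illustration of the flow it adds, the sketch below pulls a table's column metadata from Databricks and shapes it into the list of {"Column Name": ..., "Column Type": ...} records that the new table_schema argument of db_setup (see sidekick/prompter.py below) accepts. The hostname, HTTP path, token, and table name are placeholders, and the actual notebook contents may differ.

# Hypothetical sketch: requires the databricks-sql-connector package and real
# workspace credentials; the values below are placeholders, not from the commit.
from databricks import sql

with sql.connect(
    server_hostname="<workspace-hostname>",
    http_path="<warehouse-http-path>",
    access_token="<personal-access-token>",
) as conn, conn.cursor() as cursor:
    cursor.execute("DESCRIBE TABLE my_table")  # my_table is a placeholder name
    rows = cursor.fetchall()

# Shape the result into the jsonl-style records sidekick expects,
# e.g. {"Column Name": "id", "Column Type": "uuid PRIMARY KEY"}.
table_schema = [
    {"Column Name": row[0], "Column Type": row[1]}
    for row in rows
    if row[0] and not row[0].startswith("#")  # skip DESCRIBE section headers
]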

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sql-sidekick"
-version = "0.1.7"
+version = "0.1.8"
 license = "Proprietary"
 description = "An AI assistant for SQL"
 authors = [

sidekick/configs/prompt_template.py

Lines changed: 7 additions & 7 deletions
@@ -26,11 +26,11 @@
 # Reference: https://arxiv.org/pdf/2005.14165.pdf
 QUERY_PROMPT = """
 ### System: Act as a SQL Expert
-# For table {_table_name}, given an input *Question*, only generate syntactically correct SQL queries.
+# For table {_table_name}, given an input *Question*, only generate syntactically correct {dialect} SQL queries.
 # Let's work it out in a detailed step by step way using the reasoning from *Tasks* section.
 # Pick the SQL query which has the highest average log probability if more than one result is likely to answer the
 candidate *Question*.
-### {_dialect} SQL tables
+### {dialect} SQL tables
 ### *Data:* \nFor table {_table_name} schema info is mentioned below,\n{_data_info}
 ### *History*:\n{_sample_queries}
 ### *Question*: For table {_table_name}, {_question}
@@ -52,7 +52,7 @@
 """

 DEBUGGING_PROMPT = {
-    "system_prompt": "Act as a SQL expert for {_dialect} code",
+    "system_prompt": "Act as a SQL expert for {dialect} code",
     "user_prompt": """
     ### Fix syntax errors for provided incorrect SQL Query.
     # Add ``` as prefix and ``` as suffix to generated SQL
@@ -63,7 +63,7 @@
 }

 NSQL_QUERY_PROMPT = """
-For SQL TABLE '{table_name}' with sample question/answer pairs,\n({sample_queries})
+For {dialect} SQL TABLE '{table_name}' with sample question/answer pairs,\n({sample_queries})

 CREATE TABLE '{table_name}'({column_info}
 )
@@ -72,7 +72,7 @@



--- Using valid and syntactically correct SQLite query, answer the following questions (check for typos, grammatical and spelling errors and fix them) with the information for '{table_name}' provided above; for final SQL only use column names from the CREATE TABLE (Do not query for columns that do not exist).
+-- Using valid and syntactically correct {dialect} SQL syntax, answer the following questions (check for typos, grammatical and spelling errors and fix them) with the information for '{table_name}' provided above; for final SQL only use column names from the CREATE TABLE (Do not query for columns that do not exist).


 -- Using reference for TABLES '{table_name}' {context}; {question_txt}?
@@ -82,7 +82,7 @@
 # https://colab.research.google.com/drive/13BIKsqHnPOBcQ-ba2p77L5saiepTIwu0#scrollTo=0eI-VpCkf-fN
 STARCODER2_PROMPT = """
 ### Instructions:
-Your task is convert a question into a valid SQLite SQL query, given a sqlite database schema. Let's work this out step by step to be sure we have the right answer.
+Your task is convert a question into a valid {dialect} syntax SQL query, given a {dialect} database schema. Let's work this out step by step to be sure we have the right answer.
 Only use the column names from the CREATE TABLE statement.
 Adhere to these rules:
 - **Deliberately go through the question and database schema word by word** to appropriately answer the question
@@ -101,7 +101,7 @@


 ### Input:
-For SQL TABLE '{table_name}' with sample question/answer pairs,\n({sample_queries}), create a valid SQL (dialect:SQLite) query to answer the following question:\n{question_txt}.
+For SQL TABLE '{table_name}' with sample question/answer pairs,\n({sample_queries}), create a valid SQL (dialect:{dialect}) query to answer the following question:\n{question_txt}.
 This query will run on a database whose schema is represented in this string:
 CREATE TABLE '{table_name}' ({column_info}
 );
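
Net effect of these template edits: the target dialect is injected when the prompt is formatted instead of being hard-coded to SQLite, and the placeholder is renamed from {_dialect} to {dialect}. A minimal illustration of the substitution, mirroring how sidekick/query.py fills it further below (the dialect value is an arbitrary example):

# Minimal illustration; "databricks" is just an example value for the placeholder.
from sidekick.configs.prompt_template import DEBUGGING_PROMPT

system_prompt = DEBUGGING_PROMPT["system_prompt"].format(dialect="databricks")
print(system_prompt)  # Act as a SQL expert for databricks code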

sidekick/db_config.py

Lines changed: 58 additions & 45 deletions
@@ -39,7 +39,6 @@ def __init__(
         self.base_path = base_path
         self.column_names = []
         if dialect == "sqlite":
-            logger.debug(f"Creating SQLite DB: sqlite:///{base_path}/db/sqlite/{db_name}.db")
             self._url = f"sqlite:///{base_path}/db/sqlite/{db_name}.db"
         else:
             self._url = f"{self.dialect}://{self.user_name}:{self.password}@{self.hostname}:{self.port}/"
@@ -86,63 +85,77 @@ def create_db(self):
             logger.debug("Error Occurred:", error)
             return None, error

-    def _extract_schema_info(self, schema_info_path=None):
+
+    def _parser(self, file_handle=None, schema_info=None):
+        sample_values = []
+        res = []
+        _lines = file_handle if file_handle else schema_info
+        for line in _lines:
+            data = json.loads(line) if isinstance(line, str) and line.strip() else line
+            if "Column Name" in data and "Column Type" in data:
+                col_name = data["Column Name"]
+                self.column_names.append(col_name)
+                col_type = data["Column Type"]
+                if col_type.lower() == "text":
+                    col_type = col_type + " COLLATE NOCASE"
+                # if column has sample values, save in cache for future use.
+                if "Sample Values" in data:
+                    _sample_values = data["Sample Values"]
+                    _ds = data_samples_template.format(
+                        column_name=col_name,
+                        comma_separated_sample_values=",".join(
+                            str(_sample_val) for _sample_val in _sample_values
+                        ),
+                    )
+                    sample_values.append(_ds)
+                _new_samples = f"{col_name} {col_type}"
+                res.append(_new_samples)
+        return res, sample_values
+
+
+    def _extract_schema_info(self, schema=None, schema_path=None):
         # From jsonl format
         # E.g. {"Column Name": "id", "Column Type": "uuid PRIMARY KEY"}
-        if schema_info_path is None:
-            table_info_file = f"{self.base_path}/var/lib/tmp/data/table_context.json"
-            if Path(table_info_file).exists():
-                with open(table_info_file, "w") as outfile:
-                    schema_info_path = json.load(outfile)["schema_info_path"]
         res = []
         sample_values = []
        try:
-            logger.debug(f"Schema path: {schema_info_path}")
-            if Path(schema_info_path).exists():
-                with open(schema_info_path, "r") as in_file:
-                    for line in in_file:
-                        if line.strip():
-                            data = json.loads(line)
-                            if "Column Name" in data and "Column Type" in data:
-                                col_name = data["Column Name"]
-                                self.column_names.append(col_name)
-                                col_type = data["Column Type"]
-                                if col_type.lower() == "text":
-                                    col_type = col_type + " COLLATE NOCASE"
-                                # if column has sample values, save in cache for future use.
-                                if "Sample Values" in data:
-                                    _sample_values = data["Sample Values"]
-                                    _ds = data_samples_template.format(
-                                        column_name=col_name,
-                                        comma_separated_sample_values=",".join(
-                                            str(_sample_val) for _sample_val in _sample_values
-                                        ),
-                                    )
-                                    sample_values.append(_ds)
-                                _new_samples = f"{col_name} {col_type}"
-                                res.append(_new_samples)
-            if len(sample_values) > 0:
-                # cache it for future use
-                with open(
-                    f"{self.base_path}/var/lib/tmp/data/{self._table_name}_column_values.json", "w"
-                ) as outfile:
-                    json.dump(sample_values, outfile, indent=2, sort_keys=False)
+            if schema is not None:
+                logger.debug(f"Using passed schema information.")
+                res, sample_values = self._parser(schema_info=schema)
+            else:
+                if schema_path is None:
+                    table_info_file = f"{self.base_path}/var/lib/tmp/data/table_context.json"
+                    if Path(table_info_file).exists():
+                        with open(table_info_file, "w") as outfile:
+                            schema_path = json.load(outfile)["schema_info_path"]
+                if Path(schema_path).exists():
+                    logger.debug(f"Using schema information from: {schema_path}")
+                    with open(schema_path, "r") as in_file:
+                        res, sample_values = self._parser(file_handle=in_file)
+            if len(sample_values) > 0:
+                # cache it for future use
+                with open(
+                    f"{self.base_path}/var/lib/tmp/data/{self._table_name}_column_values.json", "w"
+                ) as outfile:
+                    json.dump(sample_values, outfile, indent=2, sort_keys=False)
         except ValueError as ve:
             logger.error(f"Error in reading table context file: {ve}")
             pass
         return res

-    def create_table(self, schema_info_path: str=None, schema_info=None):
+    def create_table(self, schema_info_path=None, schema_info=None):
         try:
             engine = create_engine(self._url, isolation_level="AUTOCOMMIT")
             self._engine = engine
-            if self.schema_info is None:
-                if schema_info is not None:
-                    self.schema_info = schema_info
-                else:
-                    # If schema information is not provided, extract from the template.
-                    self.schema_info = """,\n""".join(self._extract_schema_info(schema_info_path)).strip()
-            logger.debug(f"Schema info used for creating table:\n {self.schema_info}")
+            if self.schema_info is None and schema_info_path:
+                # If schema information is not provided, extract from the template.
+                self.schema_info = """,\n""".join(self._extract_schema_info(schema_path=schema_info_path)).strip()
+            else:
+                self.schema_info = """,\n""".join(self._extract_schema_info(schema=schema_info)).strip()
+
+            logger.debug(f"Schema info used for creating table:\n {self.schema_info}")
+            # Currently, multiple tables is not supported.
+            # TODO https://github.com/h2oai/sql-sidekick/issues/62
             create_syntax = f"""
                 CREATE TABLE IF NOT EXISTS {self.table_name} (
                 {self.schema_info}
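
For reference, the refactored _parser consumes either a jsonl file handle or an already-parsed list of dicts, which is what lets callers skip the intermediate schema file entirely. A small sketch of the two input shapes it handles (the column names below are illustrative, not from the commit):

import json

# Illustrative records only; keys follow the jsonl format documented above.
schema_records = [
    {"Column Name": "id", "Column Type": "uuid PRIMARY KEY"},
    {"Column Name": "age", "Column Type": "integer"},
    {"Column Name": "gender", "Column Type": "text", "Sample Values": ["male", "female"]},
]

# 1) In-memory form: pass the list through, e.g. db_obj.create_table(schema_info=schema_records)
# 2) File form: write one JSON object per line and pass schema_info_path instead.
with open("table_info.jsonl", "w") as out_file:
    for record in schema_records:
        out_file.write(json.dumps(record) + "\n")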

sidekick/prompter.py

Lines changed: 23 additions & 11 deletions
@@ -3,6 +3,7 @@
 import json
 import os
 from pathlib import Path
+from typing import Optional

 import click
 import openai
@@ -212,12 +213,13 @@ def db_setup(
     user_name: str,
     password: str,
     port: int,
-    table_info_path: str,
-    table_samples_path: str,
     table_name: str,
+    table_info_path: Optional[str] = None,
+    table_schema: Optional[list] = None,
+    table_samples_path: Optional[str] = None,
     add_sample: bool=True,
     is_command: bool = False,
-    local_base_path: str = None
+    local_base_path: Optional[str] = None
 ):
     """Creates context for the new Database"""
     click.echo(f" Information supplied:\n {db_name}, {hostname}, {user_name}, {password}, {port}")
@@ -264,7 +266,7 @@ def db_setup(
        else:
            break

-    if table_info_path is None:
+    if table_info_path is None and table_schema is None:
         logger.debug(f"Retrieve meta information for table {table_name}")
         table_info_path = _get_table_info(path, table_name)
         logger.debug(f"Updated table info path: {table_info_path}")
@@ -274,7 +276,11 @@ def db_setup(
     click.echo(f"Table name: {table_value}")
     # set table name
     db_obj.table_name = table_value.lower().replace(" ", "_")
-    res, err = db_obj.create_table(table_info_path)
+    if table_schema:
+        res, err = db_obj.create_table(schema_info=table_schema)
+    else:
+        if table_info_path:
+            res, err = db_obj.create_table(schema_info_path=table_info_path)

     update_table_info(path, table_info_path, db_obj.table_name)
     # Check if table exists; pending --> and doesn't have any rows
@@ -407,11 +413,13 @@ def ask(
     sample_queries_path: str,
     table_name: str,
     model_name: str = "h2ogpt-sql-nsql-llama-2-7B",
+    db_dialect = "sqlite",
+    execute_db_dialect="sqlite",
     is_regenerate: bool = False,
     is_regen_with_options: bool = False,
     is_command: bool = False,
     execute_query: bool = True,
-    local_base_path = None
+    local_base_path = None,
 ):
     """Asks question and returns SQL."""
     results = []
@@ -438,6 +446,7 @@ def ask(
     with open(f"{path}/table_context.json", "w") as outfile:
         json.dump(table_context, outfile, indent=4, sort_keys=False)
     logger.info(f"Table in use: {table_names}")
+    logger.info(f"SQL dialect for generation: {db_dialect}")
     # Check if env.toml file exists
     api_key = os.getenv("OPENAI_API_KEY", None)
     if (model_name == 'gpt-3.5-turbo-0301' or model_name == 'gpt-3.5-turbo-1106') and api_key is None:
@@ -477,16 +486,18 @@ def ask(
     passwd = env_settings["LOCAL_DB_CONFIG"]["PASSWORD"]
     db_name = env_settings["LOCAL_DB_CONFIG"]["DB_NAME"]

-    if db_dialect == "sqlite":
+    if execute_db_dialect.lower() == "sqlite":
         db_url = f"sqlite:///{base_path}/db/sqlite/{db_name}.db"
-    else:
-        db_url = f"{db_dialect}+psycopg2://{user_name}:{passwd}@{host_name}/{db_name}".format(
+    elif execute_db_dialect.lower() == "postgresql":
+        db_url = f"{execute_db_dialect}+psycopg2://{user_name}:{passwd}@{host_name}/{db_name}".format(
             user_name, passwd, host_name, db_name
         )
+    else:
+        db_url = None

     if table_info_path is None:
         table_info_path = _get_table_info(path, table_name)
-    logger.debug(f"Table info path: {table_info_path}")
+        logger.debug(f"Table info path: {table_info_path}")

     sql_g = SQLGenerator(
         db_url,
@@ -497,6 +508,7 @@ def ask(
         sample_queries_path=sample_queries_path,
         is_regenerate_with_options=is_regen_with_options,
         is_regenerate=is_regenerate,
+        db_dialect=db_dialect
     )
     if "h2ogpt-sql" not in model_name and not _execute_sql(question):
         sql_g._tasks = sql_g.generate_tasks(table_names, question)
@@ -531,7 +543,7 @@ def ask(
     _check_cond = question.strip().lower().split("execute sql:")
     if len(_check_cond) > 1:
         question = question.strip().lower().split("execute sql:")[1].strip()
-    res, alt_res = sql_g.generate_sql(table_names, question, model_name=model_name, _dialect=db_dialect)
+    res, alt_res = sql_g.generate_sql(table_names, question, model_name=model_name)
     logger.info(f"Input query: {question}")
     logger.info(f"Generated response:\n\n{res}")

sidekick/query.py

Lines changed: 9 additions & 6 deletions
@@ -39,6 +39,7 @@ def __new__(
         model_name="h2ogpt-sql-nsql-llama-2-7B",
         data_input_path: str = "./table_info.jsonl",
         sample_queries_path: str = "./samples.csv",
+        db_dialect = "sqlite",
         job_path: str = "./",
         device: str = "auto",
         is_regenerate: bool = False,
@@ -98,12 +99,14 @@ def __init__(
         sample_queries_path: str = "./samples.csv",
         job_path: str = "./",
         device: str = "cpu",
+        db_dialect = "sqlite",
         is_regenerate: bool = False,
         is_regenerate_with_options: bool = False,
     ):
         self.db_url = db_url
         self.engine = create_engine(db_url) if db_url else None
         self.sql_database = SQLDatabase(self.engine) if self.engine else None
+        self.dialect = db_dialect
         self.context_builder = None
         self.data_input_path = _check_file_info(data_input_path)
         self.sample_queries_path = sample_queries_path
@@ -218,7 +221,7 @@ def _query_tasks(self, question_str, data_info, sample_queries, table_name: list
         return res

     def generate_response(
-        self, sql_index, input_prompt, attempt_fix_on_error: bool = True, _dialect: str = "sqlite"
+        self, sql_index, input_prompt, attempt_fix_on_error: bool = True
     ):
         try:
             _sql_index = sql_index.as_query_engine()
@@ -234,7 +237,7 @@ def generate_response(
             # Attempt to heal with simple feedback
             # Reference: Teaching Large Language Models to Self-Debug, https://arxiv.org/abs/2304.05128
             logger.info(f"Attempting to fix syntax error ...,\n {se}")
-            system_prompt = DEBUGGING_PROMPT["system_prompt"].format(_dialect=_dialect)
+            system_prompt = DEBUGGING_PROMPT["system_prompt"].format(dialect=self.dialect)
             user_prompt = DEBUGGING_PROMPT["user_prompt"].format(ex_traceback=ex_traceback, qry_txt=qry_txt)
             # Role and content
             query_msg = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
@@ -304,7 +307,6 @@ def generate_sql(
         self,
         table_names: list,
         input_question: str,
-        _dialect: str = "sqlite",
         model_name: str = "h2ogpt-sql-nsql-llama-2-7B",
     ):
         # TODO: Update needed to support multiple tables
@@ -328,7 +330,7 @@ def generate_sql(

             # TODO: The need to pass data info again could be eliminated if Task generation becomes more consistent and accurate.
             query_str = QUERY_PROMPT.format(
-                _dialect=_dialect,
+                dialect=self.dialect,
                 _data_info=self._data_info,
                 _question=input_question,
                 _table_name=table_names,
@@ -368,7 +370,7 @@ def generate_sql(
                 )
             else:
                 res = str(result).split("Explanation:", 1)[0].strip()
-                res = sqlglot.transpile(res, identify=True, read=_dialect)[0]
+                res = sqlglot.transpile(res, identify=True, write=self.dialect)[0]
                 result = res
         except (sqlglot.errors.ParseError, ValueError, RuntimeError) as e:
             logger.info("We did the best we could, there might be still be some error:\n")
@@ -488,6 +490,7 @@ def generate_sql(
                 sample_queries=qna_samples,
                 context=contextual_context_val,
                 question_txt=input_question,
+                dialect=self.dialect
             )

             logger.debug(f"Query Text:\n {query}")
@@ -649,7 +652,7 @@ def generate_sql(
             # Reference ticket: https://github.com/tobymao/sqlglot/issues/2011
             result = res
             try:
-                result = sqlglot.transpile(res, identify=True, write=_dialect)[0]
+                result = sqlglot.transpile(res, identify=True, write=self.dialect)[0]
             except (sqlglot.errors.ParseError, ValueError, RuntimeError) as e:
                 logger.info("We did the best we could, there might be still be some error:\n")
                 logger.info(f"Realized query so far:\n {res}")

0 commit comments
