Skip to content

Commit 25a9570

Browse files
Partial logic for basic UI workflow + More improvements (#10)
* Initial demo UI - courtesy Megan/narasimhard * Update version and add h2o-wave dependency * UI-related changes * A few more updates and corrections * Save the right generated SQL * Add pandasql dependency
1 parent b33db8d commit 25a9570

File tree

9 files changed

+298
-65
lines changed

9 files changed

+298
-65
lines changed

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "sql-sidekick"
3-
version = "0.0.3"
3+
version = "0.0.4"
44
license = "Proprietary"
55
description = "An AI assistant for SQL"
66
authors = [
@@ -36,6 +36,8 @@ transformers = "^4.29.0"
3636
sentence-transformers = "^2.2.2"
3737
torch = "^2.0.1"
3838
sqlalchemy-utils = "^0.41.1"
39+
h2o-wave = "0.26.1"
40+
pandasql = "0.7.3"
3941

4042
[tool.poetry.scripts]
4143
sql-sidekick = "sidekick.prompter:cli"

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ fsspec==2023.5.0 ; python_full_version >= "3.8.16" and python_version < "3.10"
1414
gptcache==0.1.29.1 ; python_full_version >= "3.8.16" and python_version < "3.10"
1515
greenlet==2.0.2 ; python_full_version >= "3.8.16" and platform_machine == "aarch64" and python_version < "3.10" or python_full_version >= "3.8.16" and platform_machine == "ppc64le" and python_version < "3.10" or python_full_version >= "3.8.16" and platform_machine == "x86_64" and python_version < "3.10" or python_full_version >= "3.8.16" and platform_machine == "amd64" and python_version < "3.10" or python_full_version >= "3.8.16" and platform_machine == "AMD64" and python_version < "3.10" or python_full_version >= "3.8.16" and platform_machine == "win32" and python_version < "3.10" or python_full_version >= "3.8.16" and platform_machine == "WIN32" and python_version < "3.10"
1616
huggingface-hub==0.15.1 ; python_full_version >= "3.8.16" and python_version < "3.10"
17+
h2o-wave==0.26.1 ; python_full_version >= "3.8.16" and python_version < "3.10"
1718
idna==3.4 ; python_full_version >= "3.8.16" and python_version < "3.10"
1819
jinja2==3.1.2 ; python_full_version >= "3.8.16" and python_version < "3.10"
1920
joblib==1.2.0 ; python_full_version >= "3.8.16" and python_version < "3.10"

sidekick/configs/.env.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
[OPENAI]
22
OPENAI_API_KEY = ""
3-
MODEL_NAME = "gpt-3.5-turbo-0301" # Others: e.g. text-davinci-003
3+
MODEL_NAME = "gpt-3.5-turbo-0301" # Others: e.g. gpt-4, gpt-4-32k, text-davinci-003
44

55
[LOCAL_DB_CONFIG]
66
HOST_NAME = "localhost"
77
USER_NAME = "postgres"
88
PASSWORD = "abc"
99
DB_NAME = "querydb"
10+
PORT = "5432"
1011

1112
[LOGGING]
1213
LOG-LEVEL = "INFO"

sidekick/db_config.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
# create db with supplied info
22
import json
33
from pathlib import Path
4-
import pandas as pd
54

5+
import pandas as pd
66
import psycopg2 as pg
77
import sqlalchemy
8-
from psycopg2.extras import Json
98
from pandasql import sqldf
9+
from psycopg2.extras import Json
1010
from sidekick.logger import logger
1111
from sqlalchemy import create_engine
1212
from sqlalchemy_utils import database_exists
@@ -133,19 +133,17 @@ def add_samples(self, data_csv_path=None):
133133
conn_str = f"{self.dialect}://{self.user_name}:{self.password}@{self.hostname}:{self.port}/{self.db_name}"
134134
try:
135135
df = pd.read_csv(data_csv_path, infer_datetime_format=True)
136-
engine = create_engine(conn_str, isolation_level='AUTOCOMMIT')
136+
engine = create_engine(conn_str, isolation_level="AUTOCOMMIT")
137137

138-
sample_query = f'SELECT COUNT(*) AS ROWS FROM {self.table_name} LIMIT 1'
138+
sample_query = f"SELECT COUNT(*) AS ROWS FROM {self.table_name} LIMIT 1"
139139
num_rows_bef = pd.read_sql_query(sample_query, engine)
140140

141141
# Write rows to database
142-
res = df.to_sql(self.table_name, engine, if_exists='append', index=False)
142+
df.to_sql(self.table_name, engine, if_exists="append", index=False)
143143

144144
# Fetch the number of rows from the table
145145
num_rows_aft = pd.read_sql_query(sample_query, engine)
146-
147146
logger.info(f"Number of rows inserted: {num_rows_aft.iloc[0, 0] - num_rows_bef.iloc[0, 0]}")
148-
149147
engine.dispose()
150148

151149
except Exception as e:
@@ -154,24 +152,25 @@ def add_samples(self, data_csv_path=None):
154152
engine.dispose()
155153

156154
def execute_query_db(self, query=None, n_rows=100):
155+
output = []
157156
try:
158157
if query:
159158
# Create an engine
160-
conn_str = f"{self.dialect}://{self.user_name}:{self.password}@{self.hostname}:{self.port}/{self.db_name}"
159+
conn_str = (
160+
f"{self.dialect}://{self.user_name}:{self.password}@{self.hostname}:{self.port}/{self.db_name}"
161+
)
161162
engine = create_engine(conn_str)
162163

163164
# Create a connection
164165
connection = engine.connect()
165-
166166
result = connection.execute(query)
167167

168168
# Process the query results
169169
cnt = 0
170-
logger.info("Here are the results from the queries: ")
171170
for row in result:
172171
if cnt <= n_rows:
173172
# Access row data using row[column_name]
174-
logger.info(row)
173+
output.append(row)
175174
cnt += 1
176175
else:
177176
break
@@ -182,6 +181,7 @@ def execute_query_db(self, query=None, n_rows=100):
182181
engine.dispose()
183182
else:
184183
logger.info("Query Empty or None!")
184+
return output
185185
except Exception as e:
186186
logger.info(f"Error occurred : {format(e)}")
187187
finally:

sidekick/prompter.py

Lines changed: 44 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,19 @@
99
from colorama import Fore as F
1010
from colorama import Style
1111
from loguru import logger
12+
from pandasql import sqldf
1213
from sidekick.db_config import DBConfig
1314
from sidekick.memory import EntityMemory
1415
from sidekick.query import SQLGenerator
15-
from sidekick.utils import save_query, setup_dir, extract_table_names, execute_query_pd
16+
from sidekick.utils import (execute_query_pd, extract_table_names, save_query,
17+
setup_dir)
1618

1719
# Load the config file and initialize required paths
1820
base_path = (Path(__file__).parent / "../").resolve()
1921
env_settings = toml.load(f"{base_path}/sidekick/configs/.env.toml")
2022
db_dialect = env_settings["DB-DIALECT"]["DB_TYPE"]
2123
os.environ["TOKENIZERS_PARALLELISM"] = "False"
22-
__version__ = "0.0.3"
24+
__version__ = "0.0.4"
2325

2426

2527
def color(fore="", back="", text=None):
@@ -51,8 +53,9 @@ def enter_table_name():
5153
val = input(color(F.GREEN, "", "Would you like to create a table for the database? (y/n): "))
5254
return val
5355

56+
5457
def enter_file_path(table: str):
55-
val = input(color(F.GREEN, "", f"Please input the CSV file path to table: {table} : "))
58+
val = input(color(F.GREEN, "", f"Please input the CSV file path to table {table} : "))
5659
return val
5760

5861

@@ -80,12 +83,12 @@ def _get_table_info(cache_path: str):
8083
else:
8184
table_info_path = click.prompt("Enter table info path")
8285
table_metadata["schema_info_path"] = table_info_path
83-
with open(f"{cache_path}/table_context.json", "a") as outfile:
86+
with open(f"{cache_path}/table_context.json", "w") as outfile:
8487
json.dump(table_metadata, outfile, indent=4, sort_keys=False)
8588
else:
8689
table_info_path = click.prompt("Enter table info path")
8790
table_metadata = {"schema_info_path": table_info_path}
88-
with open(f"{cache_path}/table_context.json", "a") as outfile:
91+
with open(f"{cache_path}/table_context.json", "w") as outfile:
8992
json.dump(table_metadata, outfile, indent=4, sort_keys=False)
9093
return table_info_path
9194

@@ -104,6 +107,7 @@ def update_table_info(cache_path: str, table_info_path: str = None, table_name:
104107
if table_info_path:
105108
table_metadata = {"schema_info_path": table_info_path}
106109

110+
table_metadata["data_table_map"] = {}
107111
with open(f"{cache_path}/table_context.json", "w") as outfile:
108112
json.dump(table_metadata, outfile, indent=4, sort_keys=False)
109113

@@ -335,16 +339,10 @@ def query(question: str, table_info_path: str, sample_queries: str):
335339
logger.info(f"Input query: {question}")
336340
logger.info(f"Generated response:\n\n{res}")
337341

338-
save_sql = click.prompt("Would you like to save the generated SQL (y/n)?")
339-
if save_sql.lower() == "y" or save_sql.lower() == "yes":
340-
# Persist for future use
341-
_val = updated_sql if updated_sql else res
342-
save_query(base_path, query=question, response=_val)
343-
344342
exe_sql = click.prompt("Would you like to execute the generated SQL (y/n)?")
345343
if exe_sql.lower() == "y" or exe_sql.lower() == "yes":
346-
# For the time being, the default option is Pandas, but the user can be asked to select Database or Panadas DF later.
347-
option = "pandas" # or DB
344+
# For the time being, the default option is Pandas, but the user can be asked to select Database or pandas DF later.
345+
option = "DB" # or DB
348346
_val = updated_sql if updated_sql else res
349347
if option == "DB":
350348
hostname = env_settings["LOCAL_DB_CONFIG"]["HOST_NAME"]
@@ -354,30 +352,44 @@ def query(question: str, table_info_path: str, sample_queries: str):
354352
db_name = env_settings["LOCAL_DB_CONFIG"]["DB_NAME"]
355353

356354
db_obj = DBConfig(db_name, hostname, user_name, password, port, base_path=base_path)
357-
db_obj.execute_query(query=_val)
355+
output_res = db_obj.execute_query_db(query=_val)
356+
click.echo(f"The query results are:\n {output_res}")
358357
elif option == "pandas":
359358
tables = extract_table_names(_val)
360359
tables_path = dict()
361-
for table in tables:
362-
while True:
363-
val = enter_file_path(table)
364-
if not os.path.isfile(val):
365-
click.echo("In-correct Path. Please enter again! Yes(y) or no(n)")
366-
# val = enter_file_path(table)
360+
if Path(f"{path}/table_context.json").exists():
361+
f = open(f"{path}/table_context.json", "r")
362+
table_metadata = json.load(f)
363+
for table in tables:
364+
# Check if the local table_path exists in the cache
365+
if table not in table_metadata["data_table_map"].keys():
366+
val = enter_file_path(table)
367+
if not os.path.isfile(val):
368+
click.echo("In-correct Path. Please enter again! Yes(y) or no(n)")
369+
else:
370+
tables_path[table] = val
371+
table_metadata["data_table_map"][table] = val
372+
break
367373
else:
368-
tables_path[table] = val
369-
break
370-
371-
assert len(tables) == len(tables_path)
372-
373-
res = execute_query_pd(query=_val, tables_path=tables_path, n_rows=100)
374-
375-
logger.info("The query results are:")
376-
logger.info(res)
377-
378-
else:
379-
click.echo("Exiting...")
374+
tables_path[table] = table_metadata["data_table_map"][table]
375+
assert len(tables) == len(tables_path)
376+
with open(f"{path}/table_context.json", "w") as outfile:
377+
json.dump(table_metadata, outfile, indent=4, sort_keys=False)
378+
try:
379+
res = execute_query_pd(query=_val, tables_path=tables_path, n_rows=100)
380+
click.echo(f"The query results are:\n {res}")
381+
except sqldf.PandaSQLException as e:
382+
logger.error(f"Error in executing the query: {e}")
383+
click.echo("Error in executing the query. Validate generate SQL and try again.")
384+
click.echo("No result to display.")
380385

386+
save_sql = click.prompt("Would you like to save the generated SQL (y/n)?")
387+
if save_sql.lower() == "y" or save_sql.lower() == "yes":
388+
# Persist for future use
389+
_val = updated_sql if updated_sql else res
390+
save_query(base_path, query=question, response=_val)
391+
else:
392+
click.echo("Exiting...")
381393

382394

383395
if __name__ == "__main__":

sidekick/query.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,9 @@
88
import sqlglot
99
import toml
1010
from langchain import OpenAI
11-
from llama_index import (GPTSimpleVectorIndex, GPTSQLStructStoreIndex,
12-
LLMPredictor, ServiceContext, SQLDatabase)
11+
from llama_index import GPTSimpleVectorIndex, GPTSQLStructStoreIndex, LLMPredictor, ServiceContext, SQLDatabase
1312
from llama_index.indices.struct_store import SQLContextContainerBuilder
14-
from sidekick.configs.prompt_template import (DEBUGGING_PROMPT, QUERY_PROMPT,
15-
TASK_PROMPT)
13+
from sidekick.configs.prompt_template import DEBUGGING_PROMPT, QUERY_PROMPT, TASK_PROMPT
1614
from sidekick.logger import logger
1715
from sidekick.utils import csv_parser, filter_samples, remove_duplicates
1816
from sqlalchemy import create_engine
@@ -186,14 +184,16 @@ def generate_tasks(self, table_names: list, input_question: str):
186184
data = json.loads(line)
187185
data_info += "\n" + json.dumps(data)
188186
self._data_info = data_info
189-
task_list = self._query_tasks(input_question, data_info, _queries.lower(), table_names)
187+
task_list = self._query_tasks(input_question, data_info, _queries, table_names)
190188
with open(f"{self.path}/var/lib/tmp/data/tasks.txt", "w") as f:
191189
f.write(task_list)
192190
return task_list
193191
except Exception as se:
194192
raise se
195193

196-
def generate_sql(self, table_name: list, input_question: str, _dialect: str = "postgres", model_name: str = 'gpt-3.5-turbo-0301'):
194+
def generate_sql(
195+
self, table_name: list, input_question: str, _dialect: str = "postgres", model_name: str = "gpt-3.5-turbo-0301"
196+
):
197197
_tasks = self.task_formatter(self._tasks)
198198
context_file = f"{self.path}/var/lib/tmp/data/context.json"
199199
additional_context = json.load(open(context_file, "r")) if Path(context_file).exists() else {}
@@ -203,10 +203,10 @@ def generate_sql(self, table_name: list, input_question: str, _dialect: str = "p
203203
query_str = QUERY_PROMPT.format(
204204
_dialect=_dialect,
205205
_data_info=self._data_info,
206-
_question=input_question.lower(),
206+
_question=input_question,
207207
_table_name=table_name,
208208
_sample_queries=context_queries,
209-
_tasks=_tasks.lower(),
209+
_tasks=_tasks,
210210
)
211211

212212
table_context_dict = {str(table_name[0]).lower(): str(additional_context).lower()}
@@ -230,6 +230,8 @@ def generate_sql(self, table_name: list, input_question: str, _dialect: str = "p
230230
res = (
231231
str(res).split("```", 1)[1].split(";", 1)[0].strip().replace("```", "").replace("sql\n", "").strip()
232232
)
233+
else:
234+
res = str(res).split("Explanation:", 1)[0].strip()
233235
sqlglot.transpile(res)
234236
except (sqlglot.errors.ParseError, ValueError, RuntimeError) as e:
235237
logger.info("We did the best we could, there might be still be some error:\n")

sidekick/utils.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
import json
22
import os
3+
import re
34
from pathlib import Path
45
from typing import Optional
56

67
import numpy as np
78
import pandas as pd
89
from pandasql import sqldf
9-
import re
1010
from sentence_transformers import SentenceTransformer
1111
from sidekick.logger import logger
1212
from sklearn.metrics.pairwise import cosine_similarity
@@ -109,6 +109,7 @@ def csv_parser(input_path: str):
109109
res = df.apply(lambda row: f"# query: {row['query']}\n# answer: {row['answer']}", axis=1).to_list()
110110
return res
111111

112+
112113
def extract_table_names(query: str):
113114
"""
114115
Extracts table names from a SQL query.
@@ -119,16 +120,16 @@ def extract_table_names(query: str):
119120
Returns:
120121
list: A list of table names.
121122
"""
122-
table_names = re.findall(r'\bFROM\s+(\w+)', query, re.IGNORECASE)
123-
table_names += re.findall(r'\bJOIN\s+(\w+)', query, re.IGNORECASE)
124-
table_names += re.findall(r'\bUPDATE\s+(\w+)', query, re.IGNORECASE)
125-
table_names += re.findall(r'\bINTO\s+(\w+)', query, re.IGNORECASE)
123+
table_names = re.findall(r"\bFROM\s+(\w+)", query, re.IGNORECASE)
124+
table_names += re.findall(r"\bJOIN\s+(\w+)", query, re.IGNORECASE)
125+
table_names += re.findall(r"\bUPDATE\s+(\w+)", query, re.IGNORECASE)
126+
table_names += re.findall(r"\bINTO\s+(\w+)", query, re.IGNORECASE)
126127

127-
# Below keywords may not be relevant for the project but adding for sake for completness
128-
table_names += re.findall(r'\bINSERT\s+INTO\s+(\w+)', query, re.IGNORECASE)
129-
table_names += re.findall(r'\bDELETE\s+FROM\s+(\w+)', query, re.IGNORECASE)
128+
# Below keywords may not be relevant for the project but adding for sake for completeness
129+
table_names += re.findall(r"\bINSERT\s+INTO\s+(\w+)", query, re.IGNORECASE)
130+
table_names += re.findall(r"\bDELETE\s+FROM\s+(\w+)", query, re.IGNORECASE)
131+
return np.unique(table_names).tolist()
130132

131-
return table_names
132133

133134
def execute_query_pd(query=None, tables_path=None, n_rows=100):
134135
"""
@@ -142,8 +143,9 @@ def execute_query_pd(query=None, tables_path=None, n_rows=100):
142143
pandas DataFrame: The result of the SQL query.
143144
"""
144145
for table in tables_path:
145-
locals()[f"{table}"] = pd.read_csv(tables_path[table])
146+
if not table in locals():
147+
# Update the local namespace with the table name, pandas object
148+
locals()[f"{table}"] = pd.read_csv(tables_path[table])
146149

147150
res_df = sqldf(query, locals())
148-
149-
return res_df
151+
return res_df

ui/.app_config.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[WAVE_UI]
2+
TITLE = "SideKick Assistant UI"
3+
SUB_TITLE = "Get answers to your questions"

0 commit comments

Comments
 (0)