Handle case when column names start with a number #44

pramitchoudhary · pramitchoudhary · commit 3b313be41c6b · 2023-11-01T21:25:35.000-07:00
diff --git a/sidekick/db_config.py b/sidekick/db_config.py
@@ -175,7 +175,7 @@ def add_samples(self, data_csv_path=None):
                 logger.debug(f"Inserting chunk: {idx}")
                 chunk.columns = self.column_names
                 # Make sure column names in the data-frame match the schema
-                chunk.to_sql(self.table_name, engine, if_exists="replace", index=False, method="multi")
+                chunk.to_sql(self.table_name, engine, if_exists="append", index=False, method="multi")
 
             logger.info(f"Data inserted into table: {self.table_name}")
             # Fetch the number of rows from the table
diff --git a/sidekick/query.py b/sidekick/query.py
@@ -12,17 +12,27 @@
 import torch
 import torch.nn.functional as F
 from langchain import OpenAI
-from llama_index import (GPTSimpleVectorIndex, GPTSQLStructStoreIndex,
-                         LLMPredictor, ServiceContext, SQLDatabase)
+from llama_index import GPTSimpleVectorIndex, GPTSQLStructStoreIndex, LLMPredictor, ServiceContext, SQLDatabase
 from llama_index.indices.struct_store import SQLContextContainerBuilder
-from sidekick.configs.prompt_template import (DEBUGGING_PROMPT,
-                                              NSQL_QUERY_PROMPT, QUERY_PROMPT,
-                                              STARCODER2_PROMPT, TASK_PROMPT)
+from sidekick.configs.prompt_template import (
+    DEBUGGING_PROMPT,
+    NSQL_QUERY_PROMPT,
+    QUERY_PROMPT,
+    STARCODER2_PROMPT,
+    TASK_PROMPT,
+)
 from sidekick.logger import logger
-from sidekick.utils import (_check_file_info, is_resource_low,
-                            load_causal_lm_model, load_embedding_model,
-                            make_dir, re_rank, read_sample_pairs,
-                            remove_duplicates, semantic_search)
+from sidekick.utils import (
+    _check_file_info,
+    is_resource_low,
+    load_causal_lm_model,
+    load_embedding_model,
+    make_dir,
+    re_rank,
+    read_sample_pairs,
+    remove_duplicates,
+    semantic_search,
+)
 from sqlalchemy import create_engine
 
 
diff --git a/sidekick/schema_generator.py b/sidekick/schema_generator.py
@@ -1,23 +1,30 @@
 import json
 import re
 import pandas as pd
+import random
 
 
 def generate_schema(data_path, output_path):
     df = pd.read_csv(data_path)
     # Extract the schema information
     schema = df.dtypes.to_dict()
     schema_list = []
-    special_characters = {" ": "_", ":": "_", "/": "_", "-": "_"}
+    special_characters = {" ": "_", ":": "_", "/": "_", "-": "_", "(": "", ")": "", ".": "_"}
     syntax_names = ["default"]
 
     for key, value in schema.items():
         new_key = "".join(special_characters[s] if s in special_characters.keys() else s for s in key)
         if new_key.lower() in syntax_names:
             new_key = new_key + "_col"
+        if new_key[0].isdigit():
+            _temp = "".join((new_key[:0], "Digit_", new_key[1:]))
+            new_key = _temp
         if value == "object":
             value = "TEXT"
             unique_values = df[key].dropna().unique().tolist()
+            if len(unique_values) > 10:
+                # Randomly sample 10 values
+                unique_values = random.sample(unique_values, k=10)
             if not bool(re.search(r"[A-Za-z]", unique_values[0])):
                 schema_list.append({"Column Name": new_key, "Column Type": value})
             else: