Skip to content

Commit 3b313be

Browse files
Handle case when column names start with a number #44
1 parent fb2be6a commit 3b313be

File tree

3 files changed

+28
-11
lines changed

3 files changed

+28
-11
lines changed

sidekick/db_config.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -175,7 +175,7 @@ def add_samples(self, data_csv_path=None):
175175
logger.debug(f"Inserting chunk: {idx}")
176176
chunk.columns = self.column_names
177177
# Make sure column names in the data-frame match the schema
178-
chunk.to_sql(self.table_name, engine, if_exists="replace", index=False, method="multi")
178+
chunk.to_sql(self.table_name, engine, if_exists="append", index=False, method="multi")
179179

180180
logger.info(f"Data inserted into table: {self.table_name}")
181181
# Fetch the number of rows from the table

sidekick/query.py

Lines changed: 19 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -12,17 +12,27 @@
1212
import torch
1313
import torch.nn.functional as F
1414
from langchain import OpenAI
15-
from llama_index import (GPTSimpleVectorIndex, GPTSQLStructStoreIndex,
16-
LLMPredictor, ServiceContext, SQLDatabase)
15+
from llama_index import GPTSimpleVectorIndex, GPTSQLStructStoreIndex, LLMPredictor, ServiceContext, SQLDatabase
1716
from llama_index.indices.struct_store import SQLContextContainerBuilder
18-
from sidekick.configs.prompt_template import (DEBUGGING_PROMPT,
19-
NSQL_QUERY_PROMPT, QUERY_PROMPT,
20-
STARCODER2_PROMPT, TASK_PROMPT)
17+
from sidekick.configs.prompt_template import (
18+
DEBUGGING_PROMPT,
19+
NSQL_QUERY_PROMPT,
20+
QUERY_PROMPT,
21+
STARCODER2_PROMPT,
22+
TASK_PROMPT,
23+
)
2124
from sidekick.logger import logger
22-
from sidekick.utils import (_check_file_info, is_resource_low,
23-
load_causal_lm_model, load_embedding_model,
24-
make_dir, re_rank, read_sample_pairs,
25-
remove_duplicates, semantic_search)
25+
from sidekick.utils import (
26+
_check_file_info,
27+
is_resource_low,
28+
load_causal_lm_model,
29+
load_embedding_model,
30+
make_dir,
31+
re_rank,
32+
read_sample_pairs,
33+
remove_duplicates,
34+
semantic_search,
35+
)
2636
from sqlalchemy import create_engine
2737

2838

sidekick/schema_generator.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,23 +1,30 @@
11
import json
22
import re
33
import pandas as pd
4+
import random
45

56

67
def generate_schema(data_path, output_path):
78
df = pd.read_csv(data_path)
89
# Extract the schema information
910
schema = df.dtypes.to_dict()
1011
schema_list = []
11-
special_characters = {" ": "_", ":": "_", "/": "_", "-": "_"}
12+
special_characters = {" ": "_", ":": "_", "/": "_", "-": "_", "(": "", ")": "", ".": "_"}
1213
syntax_names = ["default"]
1314

1415
for key, value in schema.items():
1516
new_key = "".join(special_characters[s] if s in special_characters.keys() else s for s in key)
1617
if new_key.lower() in syntax_names:
1718
new_key = new_key + "_col"
19+
if new_key[0].isdigit():
20+
_temp = "".join((new_key[:0], "Digit_", new_key[1:]))
21+
new_key = _temp
1822
if value == "object":
1923
value = "TEXT"
2024
unique_values = df[key].dropna().unique().tolist()
25+
if len(unique_values) > 10:
26+
# Randomly sample 10 values
27+
unique_values = random.sample(unique_values, k=10)
2128
if not bool(re.search(r"[A-Za-z]", unique_values[0])):
2229
schema_list.append({"Column Name": new_key, "Column Type": value})
2330
else:

0 commit comments

Comments
 (0)