@@ -120,13 +120,29 @@ def extract_columns_sql_query(self, entity: EntityItem) -> str:
120
120
def extract_distinct_values_sql_query(
    self, entity: EntityItem, column: ColumnItem
) -> str:
    """Build the SQL statement that lists the distinct values of a column.

    Subclasses may override this method if a different statement is needed.

    Args:
        entity (EntityItem): The entity whose ``entity`` attribute is used as
            the FROM target of the statement.
        column (ColumnItem): The column whose ``name`` attribute is selected
            and used for ordering.

    Returns:
        str: A ``SELECT DISTINCT`` statement ordered by the column, descending.
    """
    column_name = column.name
    source_name = entity.entity

    # Assembled from adjacent f-string pieces; the resulting text is a single
    # line with single spaces between clauses.
    return (
        f"SELECT DISTINCT {column_name} "
        f"FROM {source_name} "
        f"ORDER BY {column_name} DESC;"
    )
125
133
126
134
async def query_entities (
127
135
self , sql_query : str , cast_to : any = None
128
136
) -> list [EntityItem ]:
129
- """A method to query a database for entities. Can be sub-classed if needed."""
137
+ """A method to query a database for entities. Can be sub-classed if needed.
138
+
139
+ Args:
140
+ sql_query (str): The SQL query to run.
141
+ cast_to (any, optional): The class to cast the results to. Defaults to None.
142
+
143
+ Returns:
144
+ list[EntityItem]: The list of entities.
145
+ """
130
146
connection_string = os .environ ["Text2Sql__DatabaseConnectionString" ]
131
147
132
148
logging .info (f"Running query: { sql_query } " )
@@ -147,8 +163,11 @@ async def query_entities(
147
163
148
164
return results
149
165
150
- async def extract_entities_with_descriptions (self ):
151
- """A method to extract entities with descriptions from a database."""
166
+ async def extract_entities_with_descriptions (self ) -> list [EntityItem ]:
167
+ """A method to extract entities with descriptions from a database.
168
+
169
+ Returns:
170
+ list[EntityItem]: The list of entities."""
152
171
table_entities = await self .query_entities (
153
172
self .extract_table_entities_sql_query , cast_to = EntityItem
154
173
)
@@ -177,7 +196,12 @@ async def extract_entities_with_descriptions(self):
177
196
async def extract_column_distinct_values (
178
197
self , entity : EntityItem , column : ColumnItem
179
198
):
180
- """A method to extract distinct values from a column in a database."""
199
+ """A method to extract distinct values from a column in a database.
200
+
201
+ Args:
202
+ entity (EntityItem): The entity to extract distinct values from.
203
+ column (ColumnItem): The column to extract distinct values from.
204
+ """
181
205
182
206
try :
183
207
distinct_values = await self .query_entities (
@@ -187,6 +211,7 @@ async def extract_column_distinct_values(
187
211
column .distinct_values = []
188
212
for value in distinct_values :
189
213
if value [column .name ] is not None :
214
+ # Remove any whitespace characters
190
215
if isinstance (value [column .name ], str ):
191
216
column .distinct_values .append (
192
217
re .sub (r"[\t\n\r\f\v]+" , "" , value [column .name ])
@@ -197,15 +222,18 @@ async def extract_column_distinct_values(
197
222
logging .error (f"Error extracting values for { column .name } " )
198
223
logging .error (e )
199
224
225
+ # Handle large set of distinct values
200
226
if column .distinct_values is not None and len (column .distinct_values ) > 5 :
201
227
column .sample_values = random .sample (column .distinct_values , 5 )
202
228
elif column .distinct_values is not None :
203
229
column .sample_values = column .distinct_values
204
230
205
231
async def generate_column_description (self , entity : EntityItem , column : ColumnItem ):
206
- """A method to generate a description for a column in a database."""
232
+ """A method to generate a description for a column in a database.
207
233
208
- # TODO: Avoid sending all values if cardinality it too high
234
+ Args:
235
+ entity (EntityItem): The entity the column belongs to.
236
+ column (ColumnItem): The column to generate a description for."""
209
237
210
238
column_description_system_prompt = """You are an expert in SQL Entity analysis. You must generate a brief description for this SQL Column. This description will be used to generate a SQL query with the correct values. Make sure to include a description of the data contained in this column.
211
239
@@ -245,7 +273,13 @@ async def generate_column_description(self, entity: EntityItem, column: ColumnIt
245
273
async def extract_columns_with_definitions (
246
274
self , entity : EntityItem
247
275
) -> list [ColumnItem ]:
248
- """A method to extract column information from a database."""
276
+ """A method to extract column information from a database.
277
+
278
+ Args:
279
+ entity (EntityItem): The entity to extract columns from.
280
+
281
+ Returns:
282
+ list[ColumnItem]: The list of columns."""
249
283
250
284
columns = await self .query_entities (
251
285
self .extract_columns_sql_query (entity ), cast_to = ColumnItem
@@ -270,8 +304,15 @@ async def extract_columns_with_definitions(
270
304
271
305
return columns
272
306
273
- async def send_request_to_llm (self , system_prompt , input ):
274
- """A method to use GPT to generate a description for an entity."""
307
+ async def send_request_to_llm (self , system_prompt : str , input : str ):
308
+ """A method to use GPT to generate a description for an entity.
309
+
310
+ Args:
311
+ system_prompt (str): The system prompt to use.
312
+ input (str): The input to use.
313
+
314
+ Returns:
315
+ str: The generated description."""
275
316
276
317
MAX_TOKENS = 2000
277
318
@@ -324,7 +365,10 @@ async def send_request_to_llm(self, system_prompt, input):
324
365
return response .choices [0 ].message .content
325
366
326
367
async def generate_entity_description (self , entity : EntityItem ):
327
- """A method to generate a description for an entity."""
368
+ """A method to generate a description for an entity.
369
+
370
+ Args:
371
+ entity (EntityItem): The entity to generate a description for."""
328
372
name_system_prompt = """You are an expert in SQL Entity analysis. You must generate a human readable name for this SQL Entity. This name will be used to select the most appropriate SQL entity to answer a given question. E.g. 'Sales Data', 'Customer Information', 'Product Catalog'."""
329
373
330
374
name_input = f"""Provide a human readable name for the {
@@ -358,8 +402,14 @@ async def generate_entity_description(self, entity: EntityItem):
358
402
logging .info (f"Description for { entity .entity } : { description } " )
359
403
entity .description = description
360
404
361
- async def build_entity_entry (self , entity : EntityItem ):
362
- """A method to build an entity entry."""
405
+ async def build_entity_entry (self , entity : EntityItem ) -> EntityItem :
406
+ """A method to build an entity entry.
407
+
408
+ Args:
409
+ entity (EntityItem): The entity to build an entry for.
410
+
411
+ Returns:
412
+ EntityItem: The entity entry."""
363
413
364
414
logging .info (f"Building entity entry for { entity .entity } " )
365
415
0 commit comments