Skip to content

Commit 041af68

Browse files
Add Snowflake Metadata Collection Script (#49)
1 parent 2440e73 commit 041af68

File tree

6 files changed

+197
-45
lines changed

6 files changed

+197
-45
lines changed

text_2_sql/data_dictionary/.env

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,9 @@ OpenAI__ApiVersion=<openAIApiVersion>
66
Text2Sql__DatabaseEngine=<databaseEngine>
77
Text2Sql__DatabaseName=<databaseName>
88
Text2Sql__DatabaseConnectionString=<databaseConnectionString>
9+
Text2Sql__Snowflake__User=<snowflakeUser if using Snowflake Data Source>
10+
Text2Sql__Snowflake__Password=<snowflakePassword if using Snowflake Data Source>
11+
Text2Sql__Snowflake__Account=<snowflakeAccount if using Snowflake Data Source>
12+
Text2Sql__Snowflake__Warehouse=<snowflakeWarehouse if using Snowflake Data Source>
913
IdentityType=<identityType> # system_assigned or user_assigned or key
10-
ClientId=<clientId if using user assigned identity>
14+
ClientId=<clientId if using user assigned identity>

text_2_sql/data_dictionary/README.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,13 @@ A full data dictionary must be built for all the views / tables you which to exp
9393

9494
Manually creating the `entities.json` is a time-consuming exercise. To speed up generation, a mixture of SQL queries and an LLM can be used to generate an initial version. Existing comments and descriptions in the database can be combined with sample values to generate the necessary descriptions. Manual input can then be used to tweak it for the use case and make any improvements.
9595

96-
`data_dictionary_creator.py` contains a utility class that handles the automatic generation and selection of schemas from the source SQL database. It must be subclassed to the appropriate engine.
97-
98-
`sql_server_data_dictionary_creator.py` contains a subclassed version of `data_dictionary_creator.py` that implements the SQL Server specific functionality to extract the entities.
96+
`data_dictionary_creator.py` contains a utility class that handles the automatic generation and selection of schemas from the source SQL database. It must be subclassed to the appropriate engine to handle engine specific queries and connection details.
9997

10098
See `./generated_samples/` for an example output of the script. This can then be automatically indexed with the provided indexer for the **Vector-Based Approach**.
99+
100+
Pre-built scripts are provided for the following databases:
101+
102+
- **Microsoft SQL Server:** `sql_server_data_dictionary_creator.py`
103+
- **Snowflake:** `snowflake_data_dictionary_creator.py`
104+
105+
If there is no pre-built script for your database engine, take one of the above as a starting point and adjust it.

text_2_sql/data_dictionary/data_dictionary_creator.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ def __init__(
161161
self,
162162
entities: list[str] = None,
163163
excluded_entities: list[str] = None,
164+
excluded_schemas: list[str] = None,
164165
single_file: bool = False,
165166
generate_definitions: bool = True,
166167
):
@@ -169,12 +170,14 @@ def __init__(
169170
Args:
170171
entities (list[str], optional): A list of entities to extract. Defaults to None. If None, all entities are extracted.
171172
excluded_entities (list[str], optional): A list of entities to exclude. Defaults to None.
173+
excluded_schemas (list[str], optional): A list of schemas to exclude. Defaults to None.
172174
single_file (bool, optional): A flag to indicate if the data dictionary should be saved to a single file. Defaults to False.
173175
generate_definitions (bool, optional): A flag to indicate if definitions should be generated. Defaults to True.
174176
"""
175177

176178
self.entities = entities
177179
self.excluded_entities = excluded_entities
180+
self.excluded_schemas = excluded_schemas
178181
self.single_file = single_file
179182
self.generate_definitions = generate_definitions
180183

@@ -381,6 +384,7 @@ async def extract_entities_with_definitions(self) -> list[EntityItem]:
381384
entity
382385
for entity in all_entities
383386
if entity.entity not in self.excluded_entities
387+
and entity.entity_schema not in self.excluded_schemas
384388
]
385389

386390
# Add warehouse and database to entities

text_2_sql/data_dictionary/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ azure-identity
33
python-dotenv
44
pydantic
55
openai
6+
snowflake-connector-python
67
networkx
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
from data_dictionary_creator import DataDictionaryCreator, EntityItem
4+
import asyncio
5+
import snowflake.connector
6+
import logging
7+
import os
8+
9+
10+
class SnowflakeDataDictionaryCreator(DataDictionaryCreator):
    """A class to extract data dictionary information from a Snowflake database."""

    def __init__(
        self,
        entities: list[str] = None,
        excluded_entities: list[str] = None,
        single_file: bool = False,
    ):
        """A method to initialize the SnowflakeDataDictionaryCreator class.

        Args:
            entities (list[str], optional): A list of entities to extract. Defaults to None. If None, all entities are extracted.
            excluded_entities (list[str], optional): A list of entities to exclude. Defaults to None.
            single_file (bool, optional): A flag to indicate if the data dictionary should be saved to a single file. Defaults to False.
        """
        if excluded_entities is None:
            excluded_entities = []

        # Never document Snowflake's built-in metadata schema itself.
        excluded_schemas = ["INFORMATION_SCHEMA"]

        # __init__ must not return a value, so the base initialiser is simply called.
        super().__init__(entities, excluded_entities, excluded_schemas, single_file)

    @staticmethod
    def _sql_literal(value) -> str:
        """Escape a value so it can be embedded in a single-quoted SQL string literal."""
        # Doubling the quote is the standard SQL escape; without it an identifier
        # containing an apostrophe would terminate the literal early.
        return str(value).replace("'", "''")

    @property
    def extract_table_entities_sql_query(self) -> str:
        """A property to extract table entities from a Snowflake database."""
        return """SELECT
            t.TABLE_NAME AS Entity,
            t.TABLE_SCHEMA AS EntitySchema,
            t.COMMENT AS Definition
        FROM
            INFORMATION_SCHEMA.TABLES t"""

    @property
    def extract_view_entities_sql_query(self) -> str:
        """A property to extract view entities from a Snowflake database."""
        return """SELECT
            v.TABLE_NAME AS Entity,
            v.TABLE_SCHEMA AS EntitySchema,
            v.COMMENT AS Definition
        FROM
            INFORMATION_SCHEMA.VIEWS v"""

    def extract_columns_sql_query(self, entity: EntityItem) -> str:
        """A method to build the query that extracts column information for one entity from a Snowflake database.

        Args:
            entity (EntityItem): The entity whose columns should be listed.

        Returns:
            str: The SQL query, filtered to the entity's schema and name.
        """
        # NOTE(review): this reads `entity.name` while other code on this page reads
        # `entity.entity` - confirm both attributes exist on EntityItem.
        schema_literal = self._sql_literal(entity.entity_schema)
        name_literal = self._sql_literal(entity.name)
        return f"""SELECT
            COLUMN_NAME AS Name,
            DATA_TYPE AS Type,
            COMMENT AS Definition
        FROM
            INFORMATION_SCHEMA.COLUMNS
        WHERE
            TABLE_SCHEMA = '{schema_literal}'
            AND TABLE_NAME = '{name_literal}';"""

    @property
    def extract_entity_relationships_sql_query(self) -> str:
        """A property to extract entity relationships from a Snowflake database."""
        return """SELECT
            tc.table_schema AS EntitySchema,
            tc.table_name AS Entity,
            rc.unique_constraint_schema AS ForeignEntitySchema,
            rc.unique_constraint_name AS ForeignEntityConstraint,
            rc.constraint_name AS ForeignKeyConstraint
        FROM
            information_schema.referential_constraints rc
        JOIN
            information_schema.table_constraints tc
            ON rc.constraint_schema = tc.constraint_schema
            AND rc.constraint_name = tc.constraint_name
        WHERE
            tc.constraint_type = 'FOREIGN KEY'
        ORDER BY
            EntitySchema, Entity, ForeignEntitySchema, ForeignEntityConstraint;
        """

    async def query_entities(
        self, sql_query: str, cast_to: any = None
    ) -> list[EntityItem]:
        """A method to query a database for entities using Snowflake Connector. Overrides the base class method.

        Args:
            sql_query (str): The SQL query to run.
            cast_to (any, optional): The class to cast the results to; it must provide `from_sql_row(row, columns)`. Defaults to None.

        Returns:
            list[EntityItem]: The list of entities. When `cast_to` is not given, each row is returned as a dict keyed by column name instead.

        Raises:
            KeyError: If one of the required Snowflake environment variables is not set.
        """
        logging.info("Running query: %s", sql_query)

        # Create a connection to Snowflake, without specifying a schema. Connecting is
        # blocking network I/O, so it runs in a worker thread like the query itself.
        conn = await asyncio.to_thread(
            snowflake.connector.connect,
            user=os.environ["Text2Sql__Snowflake__User"],
            password=os.environ["Text2Sql__Snowflake__Password"],
            account=os.environ["Text2Sql__Snowflake__Account"],
            warehouse=os.environ["Text2Sql__Snowflake__Warehouse"],
            database=os.environ["Text2Sql__DatabaseName"],
        )

        try:
            cursor = conn.cursor()
            # The cursor gets its own try/finally so that a failure while creating it
            # cannot trigger a NameError in cleanup and still closes the connection.
            try:
                await asyncio.to_thread(cursor.execute, sql_query)

                # The first element of each description entry is the column name.
                columns = [col[0] for col in cursor.description]

                rows = await asyncio.to_thread(cursor.fetchall)
            finally:
                cursor.close()
        finally:
            conn.close()

        if cast_to:
            return [cast_to.from_sql_row(row, columns) for row in rows]
        return [dict(zip(columns, row)) for row in rows]
136+
137+
138+
if __name__ == "__main__":
    # Script entry point: build the data dictionary with the default settings
    # (all entities, one output file per entity) and drive the coroutine to completion.
    creator = SnowflakeDataDictionaryCreator()
    asyncio.run(creator.create_data_dictionary())

text_2_sql/data_dictionary/sql_sever_data_dictionary_creator.py

Lines changed: 39 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,8 @@ def __init__(
2222
if excluded_entities is None:
2323
excluded_entities = []
2424

25-
excluded_entities.extend(
26-
["dbo.BuildVersion", "dbo.ErrorLog", "sys.database_firewall_rules"]
27-
)
28-
super().__init__(entities, excluded_entities, single_file)
25+
excluded_schemas = ["dbo", "sys"]
26+
super().__init__(entities, excluded_entities, excluded_schemas, single_file)
2927
self.database = os.environ["Text2Sql__DatabaseName"]
3028

3129
"""A class to extract data dictionary information from a SQL Server database."""
@@ -34,53 +32,53 @@ def __init__(
3432
def extract_table_entities_sql_query(self) -> str:
3533
"""A property to extract table entities from a SQL Server database."""
3634
return """SELECT
37-
t.TABLE_NAME AS Entity,
38-
t.TABLE_SCHEMA AS EntitySchema,
39-
CAST(ep.value AS NVARCHAR(500)) AS Definition
40-
FROM
41-
INFORMATION_SCHEMA.TABLES t
42-
LEFT JOIN
43-
sys.extended_properties ep
44-
ON ep.major_id = OBJECT_ID(t.TABLE_SCHEMA + '.' + t.TABLE_NAME)
45-
AND ep.minor_id = 0
46-
AND ep.class = 1
47-
AND ep.name = 'MS_Description'
48-
WHERE
49-
t.TABLE_TYPE = 'BASE TABLE';"""
35+
t.TABLE_NAME AS Entity,
36+
t.TABLE_SCHEMA AS EntitySchema,
37+
CAST(ep.value AS NVARCHAR(500)) AS Definition
38+
FROM
39+
INFORMATION_SCHEMA.TABLES t
40+
LEFT JOIN
41+
sys.extended_properties ep
42+
ON ep.major_id = OBJECT_ID(t.TABLE_SCHEMA + '.' + t.TABLE_NAME)
43+
AND ep.minor_id = 0
44+
AND ep.class = 1
45+
AND ep.name = 'MS_Description'
46+
WHERE
47+
t.TABLE_TYPE = 'BASE TABLE';"""
5048

5149
@property
5250
def extract_view_entities_sql_query(self) -> str:
5351
"""A property to extract view entities from a SQL Server database."""
5452
return """SELECT
55-
v.TABLE_NAME AS Entity,
56-
v.TABLE_SCHEMA AS EntitySchema,
57-
CAST(ep.value AS NVARCHAR(500)) AS Definition
58-
FROM
59-
INFORMATION_SCHEMA.VIEWS v
60-
LEFT JOIN
61-
sys.extended_properties ep
62-
ON ep.major_id = OBJECT_ID(v.TABLE_SCHEMA + '.' + v.TABLE_NAME)
63-
AND ep.minor_id = 0
64-
AND ep.class = 1
53+
v.TABLE_NAME AS Entity,
54+
v.TABLE_SCHEMA AS EntitySchema,
55+
CAST(ep.value AS NVARCHAR(500)) AS Definition
56+
FROM
57+
INFORMATION_SCHEMA.VIEWS v
58+
LEFT JOIN
59+
sys.extended_properties ep
60+
ON ep.major_id = OBJECT_ID(v.TABLE_SCHEMA + '.' + v.TABLE_NAME)
61+
AND ep.minor_id = 0
62+
AND ep.class = 1
6563
AND ep.name = 'MS_Description';"""
6664

6765
def extract_columns_sql_query(self, entity: EntityItem) -> str:
6866
"""A property to extract column information from a SQL Server database."""
6967
return f"""SELECT
70-
c.COLUMN_NAME AS Name,
71-
c.DATA_TYPE AS DataType,
72-
CAST(ep.value AS NVARCHAR(500)) AS Definition
73-
FROM
74-
INFORMATION_SCHEMA.COLUMNS c
75-
LEFT JOIN
76-
sys.extended_properties ep
77-
ON ep.major_id = OBJECT_ID(c.TABLE_SCHEMA + '.' + c.TABLE_NAME)
78-
AND ep.minor_id = c.ORDINAL_POSITION
79-
AND ep.class = 1
80-
AND ep.name = 'MS_Description'
81-
WHERE
82-
c.TABLE_SCHEMA = '{entity.entity_schema}'
83-
AND c.TABLE_NAME = '{entity.name}';"""
68+
c.COLUMN_NAME AS Name,
69+
c.DATA_TYPE AS DataType,
70+
CAST(ep.value AS NVARCHAR(500)) AS Definition
71+
FROM
72+
INFORMATION_SCHEMA.COLUMNS c
73+
LEFT JOIN
74+
sys.extended_properties ep
75+
ON ep.major_id = OBJECT_ID(c.TABLE_SCHEMA + '.' + c.TABLE_NAME)
76+
AND ep.minor_id = c.ORDINAL_POSITION
77+
AND ep.class = 1
78+
AND ep.name = 'MS_Description'
79+
WHERE
80+
c.TABLE_SCHEMA = '{entity.entity_schema}'
81+
AND c.TABLE_NAME = '{entity.name}';"""
8482

8583
@property
8684
def extract_entity_relationships_sql_query(self) -> str:

0 commit comments

Comments
 (0)