Skip to content

Commit 785594c

Browse files
Schema Store Improvements (#50)
* Update schema store * Updated query store * Update query cache setup * Update the query cache * Update vector store layout * Update skillset ingestion * Add graph traversal * Store warehouse and entity info * Update env * Bug fixes * Add entity relationship graphs * Update schema store index * Update outputs * Update schema store and graph creation * Update outputs * Reduce path duplicates * Update
1 parent d035ba2 commit 785594c

26 files changed

+2620
-1876
lines changed

deploy_ai_search/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,13 @@ The associated scripts in this portion of the repository contains pre-built scri
1515

1616
## Steps for Text2SQL Index Deployment
1717

18-
### Entity Schema Index
18+
### Schema Store Index
1919

2020
1. Update the `.env` file with the associated values. Not all values are required; which ones you need depends on whether you are using System / User Assigned Identities or key-based authentication.
21-
2. Adjust `text_2_sql.py` with any changes to the index / indexer. The `get_skills()` method implements the skills pipeline. Make any adjustments here in the skills needed to enrich the data source.
21+
2. Adjust `text_2_sql_schema_store.py` with any changes to the index / indexer. The `get_skills()` method implements the skills pipeline. Make any adjustments here in the skills needed to enrich the data source.
2222
3. Run `deploy.py` with the following args:
2323

24-
- `index_type text_2_sql`. This selects the `Text2SQLAISearch` sub class.
24+
- `index_type text_2_sql_schema_store`. This selects the `Text2SqlSchemaStoreAISearch` sub class.
2525
- `rebuild`. Whether to delete and rebuild the index.
2626
- `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want to deploy a test version before overwriting the main version.
2727
- `single_data_dictionary`. Optional parameter that controls whether you will be uploading a single data dictionary, or a data dictionary file per entity. By default, this is set to False.

deploy_ai_search/ai_search.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ def __init__(
4848
"""
4949

5050
if not hasattr(self, "indexer_type"):
51-
self.indexer_type = None # Needed to help mypy understand that indexer_type is defined in the child class
51+
# Needed to help mypy understand that indexer_type is defined in the child class
52+
self.indexer_type = None
5253
raise ValueError("indexer_type is not defined in the child class.")
5354

5455
if rebuild is not None:
@@ -126,13 +127,14 @@ def get_index_fields(self) -> list[SearchableField]:
126127
Returns:
127128
list[SearchableField]: The index fields"""
128129

129-
@abstractmethod
130130
def get_semantic_search(self) -> SemanticSearch:
131131
"""Get the semantic search configuration for the indexer.
132132
133133
Returns:
134134
SemanticSearch: The semantic search configuration"""
135135

136+
return None
137+
136138
def get_skills(self) -> list:
137139
"""Get the skillset for the indexer.
138140

deploy_ai_search/deploy.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Licensed under the MIT License.
33
import argparse
44
from rag_documents import RagDocumentsAISearch
5-
from text_2_sql import Text2SqlAISearch
5+
from text_2_sql_schema_store import Text2SqlSchemaStoreAISearch
66
from text_2_sql_query_cache import Text2SqlQueryCacheAISearch
77
import logging
88

@@ -20,8 +20,8 @@ def deploy_config(arguments: argparse.Namespace):
2020
rebuild=arguments.rebuild,
2121
enable_page_by_chunking=arguments.enable_page_chunking,
2222
)
23-
elif arguments.index_type == "text_2_sql":
24-
index_config = Text2SqlAISearch(
23+
elif arguments.index_type == "text_2_sql_schema_store":
24+
index_config = Text2SqlSchemaStoreAISearch(
2525
suffix=arguments.suffix,
2626
rebuild=arguments.rebuild,
2727
single_data_dictionary=arguments.single_data_dictionary,

deploy_ai_search/environment.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class IndexerType(Enum):
1212
"""The type of the indexer"""
1313

1414
RAG_DOCUMENTS = "rag-documents"
15-
TEXT_2_SQL = "text-2-sql"
15+
TEXT_2_SQL_SCHEMA_STORE = "text-2-sql-schema-store"
1616
TEXT_2_SQL_QUERY_CACHE = "text-2-sql-query-cache"
1717

1818

deploy_ai_search/rag_documents.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ def get_indexer(self) -> SearchIndexer:
281281
indexer_parameters = IndexingParameters(
282282
batch_size=batch_size,
283283
configuration=IndexingParametersConfiguration(
284-
data_to_extract=BlobIndexerDataToExtract.ALL_METADATA,
284+
data_to_extract=BlobIndexerDataToExtract.STORAGE_METADATA,
285285
query_timeout=None,
286286
execution_environment=execution_environment,
287287
fail_on_unprocessable_document=False,

deploy_ai_search/text_2_sql_query_cache.py

Lines changed: 32 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,6 @@
55
SearchFieldDataType,
66
SearchField,
77
SearchableField,
8-
SemanticField,
9-
SemanticPrioritizedFields,
10-
SemanticConfiguration,
11-
SemanticSearch,
128
SimpleField,
139
ComplexField,
1410
)
@@ -52,42 +48,52 @@ def get_index_fields(self) -> list[SearchableField]:
5248
vector_search_dimensions=self.environment.open_ai_embedding_dimensions,
5349
vector_search_profile_name=self.vector_search_profile_name,
5450
),
55-
SearchableField(
56-
name="Query", type=SearchFieldDataType.String, filterable=True
57-
),
5851
ComplexField(
59-
name="Schemas",
52+
name="SqlQueryDecomposition",
6053
collection=True,
6154
fields=[
6255
SearchableField(
63-
name="Entity",
56+
name="SqlQuery",
6457
type=SearchFieldDataType.String,
6558
filterable=True,
6659
),
6760
ComplexField(
68-
name="Columns",
61+
name="Schemas",
6962
collection=True,
7063
fields=[
7164
SearchableField(
72-
name="Name", type=SearchFieldDataType.String
73-
),
74-
SearchableField(
75-
name="Definition", type=SearchFieldDataType.String
76-
),
77-
SearchableField(
78-
name="Type", type=SearchFieldDataType.String
79-
),
80-
SearchableField(
81-
name="AllowedValues",
65+
name="Entity",
8266
type=SearchFieldDataType.String,
83-
collection=True,
84-
searchable=False,
67+
filterable=True,
8568
),
86-
SearchableField(
87-
name="SampleValues",
88-
type=SearchFieldDataType.String,
69+
ComplexField(
70+
name="Columns",
8971
collection=True,
90-
searchable=False,
72+
fields=[
73+
SearchableField(
74+
name="Name",
75+
type=SearchFieldDataType.String,
76+
),
77+
SearchableField(
78+
name="Definition",
79+
type=SearchFieldDataType.String,
80+
),
81+
SearchableField(
82+
name="DataType", type=SearchFieldDataType.String
83+
),
84+
SearchableField(
85+
name="AllowedValues",
86+
type=SearchFieldDataType.String,
87+
collection=True,
88+
searchable=False,
89+
),
90+
SearchableField(
91+
name="SampleValues",
92+
type=SearchFieldDataType.String,
93+
collection=True,
94+
searchable=False,
95+
),
96+
],
9197
),
9298
],
9399
),
@@ -101,23 +107,3 @@ def get_index_fields(self) -> list[SearchableField]:
101107
]
102108

103109
return fields
104-
105-
def get_semantic_search(self) -> SemanticSearch:
106-
"""This function returns the semantic search configuration for sql index
107-
108-
Returns:
109-
SemanticSearch: The semantic search configuration"""
110-
111-
semantic_config = SemanticConfiguration(
112-
name=self.semantic_config_name,
113-
prioritized_fields=SemanticPrioritizedFields(
114-
title_field=SemanticField(field_name="Question"),
115-
keywords_fields=[
116-
SemanticField(field_name="Query"),
117-
],
118-
),
119-
)
120-
121-
semantic_search = SemanticSearch(configurations=[semantic_config])
122-
123-
return semantic_search

deploy_ai_search/text_2_sql.py renamed to deploy_ai_search/text_2_sql_schema_store.py

Lines changed: 81 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
)
2727

2828

29-
class Text2SqlAISearch(AISearch):
29+
class Text2SqlSchemaStoreAISearch(AISearch):
3030
"""This class is used to deploy the sql index."""
3131

3232
def __init__(
@@ -41,7 +41,7 @@ def __init__(
4141
suffix (str, optional): The suffix for the indexer. Defaults to None. If a suffix is provided, it is assumed to be a test indexer.
4242
rebuild (bool, optional): Whether to rebuild the index. Defaults to False.
4343
"""
44-
self.indexer_type = IndexerType.TEXT_2_SQL
44+
self.indexer_type = IndexerType.TEXT_2_SQL_SCHEMA_STORE
4545
super().__init__(suffix, rebuild)
4646

4747
if single_data_dictionary:
@@ -62,34 +62,43 @@ def get_index_fields(self) -> list[SearchableField]:
6262
key=True,
6363
analyzer_name="keyword",
6464
),
65+
SearchableField(
66+
name="EntityName", type=SearchFieldDataType.String, filterable=True
67+
),
6568
SearchableField(
6669
name="Entity",
6770
type=SearchFieldDataType.String,
6871
analyzer_name="keyword",
6972
),
7073
SearchableField(
71-
name="EntityName", type=SearchFieldDataType.String, filterable=True
74+
name="Database",
75+
type=SearchFieldDataType.String,
7276
),
7377
SearchableField(
74-
name="Description",
78+
name="Warehouse",
79+
type=SearchFieldDataType.String,
80+
),
81+
SearchableField(
82+
name="Definition",
7583
type=SearchFieldDataType.String,
7684
sortable=False,
7785
filterable=False,
7886
facetable=False,
7987
),
8088
SearchField(
81-
name="DescriptionEmbedding",
89+
name="DefinitionEmbedding",
8290
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
8391
vector_search_dimensions=self.environment.open_ai_embedding_dimensions,
8492
vector_search_profile_name=self.vector_search_profile_name,
93+
hidden=True,
8594
),
8695
ComplexField(
8796
name="Columns",
8897
collection=True,
8998
fields=[
9099
SearchableField(name="Name", type=SearchFieldDataType.String),
91100
SearchableField(name="Definition", type=SearchFieldDataType.String),
92-
SearchableField(name="Type", type=SearchFieldDataType.String),
101+
SearchableField(name="DataType", type=SearchFieldDataType.String),
93102
SearchableField(
94103
name="AllowedValues",
95104
type=SearchFieldDataType.String,
@@ -102,6 +111,11 @@ def get_index_fields(self) -> list[SearchableField]:
102111
collection=True,
103112
searchable=False,
104113
),
114+
SearchableField(
115+
name="JoinableEntities",
116+
type=SearchFieldDataType.String,
117+
collection=True,
118+
),
105119
],
106120
),
107121
SearchableField(
@@ -111,6 +125,40 @@ def get_index_fields(self) -> list[SearchableField]:
111125
hidden=True,
112126
# This is needed to enable semantic searching against the column names as complex field types are not used.
113127
),
128+
SearchableField(
129+
name="ColumnDefinitions",
130+
type=SearchFieldDataType.String,
131+
collection=True,
132+
hidden=True,
133+
# This is needed to enable semantic searching against the column definitions as complex field types are not used.
134+
),
135+
ComplexField(
136+
name="EntityRelationships",
137+
collection=True,
138+
fields=[
139+
SearchableField(
140+
name="ForeignEntity",
141+
type=SearchFieldDataType.String,
142+
),
143+
ComplexField(
144+
name="ForeignKeys",
145+
collection=True,
146+
fields=[
147+
SearchableField(
148+
name="Column", type=SearchFieldDataType.String
149+
),
150+
SearchableField(
151+
name="ForeignColumn", type=SearchFieldDataType.String
152+
),
153+
],
154+
),
155+
],
156+
),
157+
SearchableField(
158+
name="CompleteEntityRelationshipsGraph",
159+
type=SearchFieldDataType.String,
160+
collection=True,
161+
),
114162
SimpleField(
115163
name="DateLastModified",
116164
type=SearchFieldDataType.DateTimeOffset,
@@ -131,7 +179,8 @@ def get_semantic_search(self) -> SemanticSearch:
131179
prioritized_fields=SemanticPrioritizedFields(
132180
title_field=SemanticField(field_name="EntityName"),
133181
content_fields=[
134-
SemanticField(field_name="Description"),
182+
SemanticField(field_name="Definition"),
183+
SemanticField(field_name="ColumnDefinitions"),
135184
],
136185
keywords_fields=[
137186
SemanticField(field_name="ColumnNames"),
@@ -151,7 +200,7 @@ def get_skills(self) -> list:
151200
list: The skillsets used in the indexer"""
152201

153202
embedding_skill = self.get_vector_skill(
154-
"/document", "/document/Description", target_name="DescriptionEmbedding"
203+
"/document", "/document/Definition", target_name="DefinitionEmbedding"
155204
)
156205

157206
skills = [embedding_skill]
@@ -222,12 +271,20 @@ def get_indexer(self) -> SearchIndexer:
222271
target_field_name="EntityName",
223272
),
224273
FieldMapping(
225-
source_field_name="/document/Description",
226-
target_field_name="Description",
274+
source_field_name="/document/Database",
275+
target_field_name="Database",
227276
),
228277
FieldMapping(
229-
source_field_name="/document/DescriptionEmbedding",
230-
target_field_name="DescriptionEmbedding",
278+
source_field_name="/document/Warehouse",
279+
target_field_name="Warehouse",
280+
),
281+
FieldMapping(
282+
source_field_name="/document/Definition",
283+
target_field_name="Definition",
284+
),
285+
FieldMapping(
286+
source_field_name="/document/DefinitionEmbedding",
287+
target_field_name="DefinitionEmbedding",
231288
),
232289
FieldMapping(
233290
source_field_name="/document/Columns",
@@ -237,6 +294,18 @@ def get_indexer(self) -> SearchIndexer:
237294
source_field_name="/document/Columns/*/Name",
238295
target_field_name="ColumnNames",
239296
),
297+
FieldMapping(
298+
source_field_name="/document/Columns/*/Definition",
299+
target_field_name="ColumnDefinitions",
300+
),
301+
FieldMapping(
302+
source_field_name="/document/EntityRelationships",
303+
target_field_name="EntityRelationships",
304+
),
305+
FieldMapping(
306+
source_field_name="/document/CompleteEntityRelationshipsGraph/*",
307+
target_field_name="CompleteEntityRelationshipsGraph",
308+
),
240309
FieldMapping(
241310
source_field_name="/document/DateLastModified",
242311
target_field_name="DateLastModified",

text_2_sql/data_dictionary/.env

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,7 @@ OpenAI__Endpoint=<openAIEndpoint>
44
OpenAI__ApiKey=<openAIKey if using non managed identity>
55
OpenAI__ApiVersion=<openAIApiVersion>
66
Text2Sql__DatabaseEngine=<databaseEngine>
7-
Text2Sql__UseQueryCache=<whether to use the query cache first or not>
8-
Text2Sql__PreRunQueryCache=<whether to pre-run the top result from the query cache or not>
97
Text2Sql__DatabaseName=<databaseName>
108
Text2Sql__DatabaseConnectionString=<databaseConnectionString>
11-
AIService__AzureSearchOptions__Endpoint=<searchServiceEndpoint>
12-
AIService__AzureSearchOptions__Key=<searchServiceKey if not using identity>
13-
AIService__AzureSearchOptions__RagDocuments__Index=<ragDocumentsIndexName>
14-
AIService__AzureSearchOptions__Text2Sql__Index=<text2SQLIndexName>
15-
AIService__AzureSearchOptions__Text2SqlQueryCache__Index=<text2SQLIndexName>
16-
AIService__AzureSearchOptions__RagDocuments__SemanticConfig=<ragDocumentsSemanticConfig>
17-
AIService__AzureSearchOptions__Text2Sql__SemanticConfig=<text2SQLSemanticConfig>
18-
AIService__AzureSearchOptions__Text2SqlQueryCache__SemanticConfig=<text2SQLSemanticConfig>
199
IdentityType=<identityType> # system_assigned or user_assigned or key
2010
ClientId=<clientId if using user assigned identity>

0 commit comments

Comments
 (0)