Commit ee5be48

Data Dictionary Updates (#81)
1 parent d66b628 commit ee5be48

23 files changed (+669, -3907 lines)

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+uv.lock
 
 # PyInstaller
 # Usually these files are written by a python script from a template

adi_function_app/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ dependencies = [
     "pymupdf>=1.24.14",
     "aiohttp>=3.11.9",
     "pillow>=11.0.0",
-    "numpy>=2.1.3",
+    "numpy<2.0.0",
     "spacy>=3.7.5",
     "tiktoken>=0.8.0",
     "en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1.tar.gz"

deploy_ai_search/.env

Lines changed: 0 additions & 24 deletions
This file was deleted.

deploy_ai_search/pyproject.toml

Lines changed: 15 additions & 0 deletions
@@ -10,4 +10,19 @@ dependencies = [
     "azure-search-documents==11.6.0b8",
     "azure-storage-blob>=12.24.0",
     "python-dotenv>=1.0.1",
+    "text_2_sql_core",
 ]
+
+[dependency-groups]
+dev = [
+    "black>=24.10.0",
+    "ipykernel>=6.29.5",
+    "jupyter>=1.1.1",
+    "pre-commit>=4.0.1",
+    "pygments>=2.18.0",
+    "ruff>=0.8.1",
+    "python-dotenv>=1.0.1",
+]
+
+[tool.uv.sources]
+text_2_sql_core = { workspace = true }

deploy_ai_search/README.md renamed to deploy_ai_search/src/deploy_ai_search/README.md

Lines changed: 10 additions & 0 deletions
@@ -26,6 +26,16 @@ The associated scripts in this portion of the repository contains pre-built scri
 - `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want deploy a test version, before overwriting the main version.
 - `single_data_dictionary_file`. Optional parameter that controls whether you will be uploading a single data dictionary, or a data dictionary file per entity. By default, this is set to False.
 
+### Column Value Store Index
+
+1. Update `.env` file with the associated values. Not all values are required dependent on whether you are using System / User Assigned Identities or a Key based authentication.
+2. Adjust `text_2_sql_column_value_store.py` with any changes to the index / indexer.
+3. Run `deploy.py` with the following args:
+
+- `index_type text_2_sql_column_value_store`. This selects the `Text2SQLColumnValueStoreAISearch` sub class.
+- `rebuild`. Whether to delete and rebuild the index.
+- `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want deploy a test version, before overwriting the main version.
+
 ### Query Cache Index
 
 1. Update `.env` file with the associated values. Not all values are required dependent on whether you are using System / User Assigned Identities or a Key based authentication.

deploy_ai_search/deploy.py renamed to deploy_ai_search/src/deploy_ai_search/deploy.py

Lines changed: 6 additions & 0 deletions
@@ -4,6 +4,7 @@
 from rag_documents import RagDocumentsAISearch
 from text_2_sql_schema_store import Text2SqlSchemaStoreAISearch
 from text_2_sql_query_cache import Text2SqlQueryCacheAISearch
+from text_2_sql_column_value_store import Text2SqlColumnValueStoreAISearch
 import logging
 
 logging.basicConfig(level=logging.INFO)
@@ -33,6 +34,11 @@ def deploy_config(arguments: argparse.Namespace):
             single_query_cache_file=arguments.single_query_cache_file,
             enable_query_cache_indexer=arguments.enable_query_cache_indexer,
         )
+    elif arguments.index_type == "text_2_sql_column_value_store":
+        index_config = Text2SqlColumnValueStoreAISearch(
+            suffix=arguments.suffix,
+            rebuild=arguments.rebuild,
+        )
     else:
         raise ValueError("Invalid Indexer Type")
 
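
The README steps above describe driving this dispatch from the command line. As a rough illustration only (not part of the commit), the same branch can be exercised programmatically with an argparse-style namespace carrying the three attributes the new branch reads; the direct import below assumes the script layout under `deploy_ai_search/src/deploy_ai_search`:

# Minimal sketch, assuming deploy.py exposes deploy_config as shown in the hunk above.
from argparse import Namespace

from deploy import deploy_config  # hypothetical direct import of the script's function

args = Namespace(
    index_type="text_2_sql_column_value_store",  # selects Text2SqlColumnValueStoreAISearch
    rebuild=True,   # delete and rebuild the index
    suffix="test",  # optional; marks the deployment as a test
)
deploy_config(args)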

deploy_ai_search/environment.py renamed to deploy_ai_search/src/deploy_ai_search/environment.py

Lines changed: 10 additions & 1 deletion
@@ -14,6 +14,7 @@ class IndexerType(Enum):
     RAG_DOCUMENTS = "rag-documents"
     TEXT_2_SQL_SCHEMA_STORE = "text-2-sql-schema-store"
     TEXT_2_SQL_QUERY_CACHE = "text-2-sql-query-cache"
+    TEXT_2_SQL_COLUMN_VALUE_STORE = "text-2-sql-column-value-store"
 
 
 class IdentityType(Enum):
@@ -172,10 +173,18 @@ def storage_account_blob_container_name(self) -> str:
         This function returns azure blob container name
         """
 
-        return os.environ.get(
+        container = os.environ.get(
             f"StorageAccount__{self.normalised_indexer_type}__Container"
         )
 
+        if container is None:
+            raise ValueError(
+                f"""Populate environment variable 'StorageAccount__{
+                    self.normalised_indexer_type}__Container' with container name."""
+            )
+
+        return container
+
     @property
     def function_app_end_point(self) -> str:
         """
deploy_ai_search/src/deploy_ai_search/text_2_sql_column_value_store.py

Lines changed: 227 additions & 0 deletions (new file)

@@ -0,0 +1,227 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from azure.search.documents.indexes.models import (
    SearchFieldDataType,
    SearchableField,
    SearchIndexer,
    FieldMapping,
    SimpleField,
    IndexingParameters,
    IndexingParametersConfiguration,
    BlobIndexerDataToExtract,
    IndexerExecutionEnvironment,
    BlobIndexerParsingMode,
    FieldMappingFunction,
)
from ai_search import AISearch
from environment import (
    IndexerType,
)
import os
from text_2_sql_core.utils.database import DatabaseEngine


class Text2SqlColumnValueStoreAISearch(AISearch):
    """This class is used to deploy the sql index."""

    def __init__(
        self,
        suffix: str | None = None,
        rebuild: bool | None = False,
    ):
        """Initialize the Text2SqlAISearch class. This class implements the deployment of the sql index.

        Args:
            suffix (str, optional): The suffix for the indexer. Defaults to None. If an suffix is provided, it is assumed to be a test indexer.
            rebuild (bool, optional): Whether to rebuild the index. Defaults to False.
        """
        self.indexer_type = IndexerType.TEXT_2_SQL_COLUMN_VALUE_STORE
        super().__init__(suffix, rebuild)

        self.database_engine = DatabaseEngine[
            os.environ["Text2Sql__DatabaseEngine"].upper()
        ]

        self.parsing_mode = BlobIndexerParsingMode.JSON_LINES

    @property
    def excluded_fields_for_database_engine(self):
        """A method to get the excluded fields for the database engine."""

        all_engine_specific_fields = ["Warehouse", "Database", "Catalog"]
        if self.database_engine == DatabaseEngine.SNOWFLAKE:
            engine_specific_fields = ["Warehouse", "Database"]
        elif self.database_engine == DatabaseEngine.TSQL:
            engine_specific_fields = ["Database"]
        elif self.database_engine == DatabaseEngine.DATABRICKS:
            engine_specific_fields = ["Catalog"]

        return [
            field
            for field in all_engine_specific_fields
            if field not in engine_specific_fields
        ]

    def get_index_fields(self) -> list[SearchableField]:
        """This function returns the index fields for sql index.

        Returns:
            list[SearchableField]: The index fields for sql index"""

        fields = [
            SimpleField(
                name="Id",
                type=SearchFieldDataType.String,
                key=True,
                analyzer_name="keyword",
            ),
            SimpleField(
                name="Entity",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Database",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Warehouse",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Catalog",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Column",
                type=SearchFieldDataType.String,
            ),
            SearchableField(
                name="Value",
                type=SearchFieldDataType.String,
                hidden=False,
            ),
            SimpleField(
                name="Synonyms", type=SearchFieldDataType.String, collection=True
            ),
            SimpleField(
                name="DateLastModified",
                type=SearchFieldDataType.DateTimeOffset,
                filterable=True,
            ),
        ]

        # Remove fields that are not supported by the database engine
        fields = [
            field
            for field in fields
            if field.name not in self.excluded_fields_for_database_engine
        ]

        return fields

    def get_skills(self) -> list:
        """Get the skillset for the indexer.

        Returns:
            list: The skillsets used in the indexer"""

        skills = []

        return skills

    def get_indexer(self) -> SearchIndexer:
        """This function returns the indexer for sql.

        Returns:
            SearchIndexer: The indexer for sql"""

        # Only place on schedule if it is not a test deployment
        if self.test:
            schedule = None
            batch_size = 4
        else:
            schedule = {"interval": "PT24H"}
            batch_size = 16

        if self.environment.use_private_endpoint:
            execution_environment = IndexerExecutionEnvironment.PRIVATE
        else:
            execution_environment = IndexerExecutionEnvironment.STANDARD

        indexer_parameters = IndexingParameters(
            batch_size=batch_size,
            configuration=IndexingParametersConfiguration(
                data_to_extract=BlobIndexerDataToExtract.CONTENT_AND_METADATA,
                query_timeout=None,
                execution_environment=execution_environment,
                fail_on_unprocessable_document=False,
                fail_on_unsupported_content_type=False,
                index_storage_metadata_only_for_oversized_documents=True,
                indexed_file_name_extensions=".jsonl",
                parsing_mode=self.parsing_mode,
            ),
            max_failed_items=5,
        )

        indexer = SearchIndexer(
            name=self.indexer_name,
            description="Indexer to column values",
            target_index_name=self.index_name,
            data_source_name=self.data_source_name,
            schedule=schedule,
            field_mappings=[
                FieldMapping(
                    source_field_name="metadata_storage_last_modified",
                    target_field_name="DateLastModified",
                )
            ],
            output_field_mappings=[
                FieldMapping(
                    source_field_name="/document/Id",
                    target_field_name="Id",
                    mapping_function=FieldMappingFunction(
                        name="base64Encode",
                        parameters={"useHttpServerUtilityUrlTokenEncode": False},
                    ),
                ),
                FieldMapping(
                    source_field_name="/document/Entity", target_field_name="Entity"
                ),
                FieldMapping(
                    source_field_name="/document/Database",
                    target_field_name="Database",
                ),
                FieldMapping(
                    source_field_name="/document/Warehouse",
                    target_field_name="Warehouse",
                ),
                FieldMapping(
                    source_field_name="/document/Column",
                    target_field_name="Column",
                ),
                FieldMapping(
                    source_field_name="/document/Value",
                    target_field_name="Value",
                ),
                FieldMapping(
                    source_field_name="/document/Synonyms",
                    target_field_name="Synonyms",
                ),
                FieldMapping(
                    source_field_name="/document/DateLastModified",
                    target_field_name="DateLastModified",
                ),
            ],
            parameters=indexer_parameters,
        )

        # Remove fields that are not supported by the database engine
        indexer.output_field_mappings = [
            field_mapping
            for field_mapping in indexer.output_field_mappings
            if field_mapping.target_field_name
            not in self.excluded_fields_for_database_engine
        ]

        return indexer
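
Since the indexer is configured with BlobIndexerParsingMode.JSON_LINES and indexed_file_name_extensions=".jsonl", each blob it ingests is a JSON Lines file whose per-line objects carry the fields mapped above. A minimal sketch of producing one such line (field names follow the index definition; the entity, column and values are made-up examples, and the engine-specific fields retained depend on Text2Sql__DatabaseEngine: Database for T-SQL, Warehouse and Database for Snowflake, Catalog for Databricks):

import json

# Hypothetical column value record for the column value store (illustrative values only).
sample_record = {
    "Id": "SalesLT.Product.Color.Red",  # passed through the base64Encode mapping to form the index key
    "Entity": "SalesLT.Product",
    "Database": "AdventureWorksLT",
    "Column": "Color",
    "Value": "Red",
    "Synonyms": ["Crimson", "Scarlet"],
}

# One JSON object per line, matching the JSON_LINES parsing mode.
with open("column_value_store.jsonl", "a", encoding="utf-8") as handle:
    handle.write(json.dumps(sample_record) + "\n")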
