Skip to content

Data Dictionary Updates #81

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
uv.lock

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down
2 changes: 1 addition & 1 deletion adi_function_app/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ dependencies = [
"pymupdf>=1.24.14",
"aiohttp>=3.11.9",
"pillow>=11.0.0",
"numpy>=2.1.3",
"numpy<2.0.0",
"spacy>=3.7.5",
"tiktoken>=0.8.0",
"en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1.tar.gz"
Expand Down
24 changes: 0 additions & 24 deletions deploy_ai_search/.env

This file was deleted.

15 changes: 15 additions & 0 deletions deploy_ai_search/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,19 @@ dependencies = [
"azure-search-documents==11.6.0b8",
"azure-storage-blob>=12.24.0",
"python-dotenv>=1.0.1",
"text_2_sql_core",
]

[dependency-groups]
dev = [
"black>=24.10.0",
"ipykernel>=6.29.5",
"jupyter>=1.1.1",
"pre-commit>=4.0.1",
"pygments>=2.18.0",
"ruff>=0.8.1",
"python-dotenv>=1.0.1",
]

[tool.uv.sources]
text_2_sql_core = { workspace = true }
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ The associated scripts in this portion of the repository contains pre-built scri
- `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want to deploy a test version before overwriting the main version.
- `single_data_dictionary_file`. Optional parameter that controls whether you will be uploading a single data dictionary, or a data dictionary file per entity. By default, this is set to False.

### Column Value Store Index

1. Update the `.env` file with the associated values. Not all values are required, depending on whether you are using System / User Assigned Identities or key-based authentication.
2. Adjust `text_2_sql_column_value_store.py` with any changes to the index / indexer.
3. Run `deploy.py` with the following args:

- `index_type text_2_sql_column_value_store`. This selects the `Text2SQLColumnValueStoreAISearch` sub class.
- `rebuild`. Whether to delete and rebuild the index.
- `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want to deploy a test version before overwriting the main version.

### Query Cache Index

1. Update `.env` file with the associated values. Not all values are required dependent on whether you are using System / User Assigned Identities or a Key based authentication.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from rag_documents import RagDocumentsAISearch
from text_2_sql_schema_store import Text2SqlSchemaStoreAISearch
from text_2_sql_query_cache import Text2SqlQueryCacheAISearch
from text_2_sql_column_value_store import Text2SqlColumnValueStoreAISearch
import logging

logging.basicConfig(level=logging.INFO)
Expand Down Expand Up @@ -33,6 +34,11 @@ def deploy_config(arguments: argparse.Namespace):
single_query_cache_file=arguments.single_query_cache_file,
enable_query_cache_indexer=arguments.enable_query_cache_indexer,
)
elif arguments.index_type == "text_2_sql_column_value_store":
index_config = Text2SqlColumnValueStoreAISearch(
suffix=arguments.suffix,
rebuild=arguments.rebuild,
)
else:
raise ValueError("Invalid Indexer Type")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class IndexerType(Enum):
RAG_DOCUMENTS = "rag-documents"
TEXT_2_SQL_SCHEMA_STORE = "text-2-sql-schema-store"
TEXT_2_SQL_QUERY_CACHE = "text-2-sql-query-cache"
TEXT_2_SQL_COLUMN_VALUE_STORE = "text-2-sql-column-value-store"


class IdentityType(Enum):
Expand Down Expand Up @@ -172,10 +173,18 @@ def storage_account_blob_container_name(self) -> str:
This function returns azure blob container name
"""

return os.environ.get(
container = os.environ.get(
f"StorageAccount__{self.normalised_indexer_type}__Container"
)

if container is None:
raise ValueError(
f"""Populate environment variable 'StorageAccount__{
self.normalised_indexer_type}__Container' with container name."""
)

return container

@property
def function_app_end_point(self) -> str:
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from azure.search.documents.indexes.models import (
SearchFieldDataType,
SearchableField,
SearchIndexer,
FieldMapping,
SimpleField,
IndexingParameters,
IndexingParametersConfiguration,
BlobIndexerDataToExtract,
IndexerExecutionEnvironment,
BlobIndexerParsingMode,
FieldMappingFunction,
)
from ai_search import AISearch
from environment import (
IndexerType,
)
import os
from text_2_sql_core.utils.database import DatabaseEngine


class Text2SqlColumnValueStoreAISearch(AISearch):
    """Deploys the text-2-sql column value store index and indexer.

    The index stores distinct column values (plus synonyms) per entity so the
    text-2-sql engine can ground generated SQL in real data values. Source
    documents are JSON Lines blobs; one record per column value.
    """

    def __init__(
        self,
        suffix: str | None = None,
        rebuild: bool | None = False,
    ):
        """Initialize the Text2SqlColumnValueStoreAISearch class. This class implements the deployment of the sql index.

        Args:
            suffix (str, optional): The suffix for the indexer. Defaults to None. If a suffix is provided, it is assumed to be a test indexer.
            rebuild (bool, optional): Whether to rebuild the index. Defaults to False.

        Raises:
            KeyError: If 'Text2Sql__DatabaseEngine' is unset or does not name
                a valid DatabaseEngine member.
        """
        self.indexer_type = IndexerType.TEXT_2_SQL_COLUMN_VALUE_STORE
        super().__init__(suffix, rebuild)

        # The engine determines which engine-specific fields (Warehouse /
        # Database / Catalog) are retained in the index and indexer mappings.
        self.database_engine = DatabaseEngine[
            os.environ["Text2Sql__DatabaseEngine"].upper()
        ]

        # Data dictionary value files are written as JSON Lines.
        self.parsing_mode = BlobIndexerParsingMode.JSON_LINES

    @property
    def excluded_fields_for_database_engine(self):
        """A method to get the excluded fields for the database engine.

        Returns:
            list[str]: Engine-specific field names that do NOT apply to the
                configured database engine and should be stripped from the
                index fields and indexer mappings.
        """

        all_engine_specific_fields = ["Warehouse", "Database", "Catalog"]
        if self.database_engine == DatabaseEngine.SNOWFLAKE:
            engine_specific_fields = ["Warehouse", "Database"]
        elif self.database_engine == DatabaseEngine.TSQL:
            engine_specific_fields = ["Database"]
        elif self.database_engine == DatabaseEngine.DATABRICKS:
            engine_specific_fields = ["Catalog"]
        else:
            # Fix: previously an unrecognised engine left
            # `engine_specific_fields` unbound and raised NameError here.
            # Keep no engine-specific fields for unknown engines instead.
            engine_specific_fields = []

        return [
            field
            for field in all_engine_specific_fields
            if field not in engine_specific_fields
        ]

    def get_index_fields(self) -> list[SearchableField]:
        """This function returns the index fields for sql index.

        Returns:
            list[SearchableField]: The index fields for sql index"""

        fields = [
            # Base64-encoded document key (see the base64Encode mapping
            # function in get_indexer).
            SimpleField(
                name="Id",
                type=SearchFieldDataType.String,
                key=True,
                analyzer_name="keyword",
            ),
            SimpleField(
                name="Entity",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Database",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Warehouse",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Catalog",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Column",
                type=SearchFieldDataType.String,
            ),
            # The actual column value is the only searchable field.
            SearchableField(
                name="Value",
                type=SearchFieldDataType.String,
                hidden=False,
            ),
            SimpleField(
                name="Synonyms", type=SearchFieldDataType.String, collection=True
            ),
            SimpleField(
                name="DateLastModified",
                type=SearchFieldDataType.DateTimeOffset,
                filterable=True,
            ),
        ]

        # Remove fields that are not supported by the database engine
        fields = [
            field
            for field in fields
            if field.name not in self.excluded_fields_for_database_engine
        ]

        return fields

    def get_skills(self) -> list:
        """Get the skillset for the indexer.

        Returns:
            list: The skillsets used in the indexer (none — values are
                indexed verbatim, no enrichment is required)."""

        skills = []

        return skills

    def get_indexer(self) -> SearchIndexer:
        """This function returns the indexer for sql.

        Returns:
            SearchIndexer: The indexer for sql"""

        # Only place on schedule if it is not a test deployment
        if self.test:
            schedule = None
            batch_size = 4
        else:
            schedule = {"interval": "PT24H"}
            batch_size = 16

        if self.environment.use_private_endpoint:
            execution_environment = IndexerExecutionEnvironment.PRIVATE
        else:
            execution_environment = IndexerExecutionEnvironment.STANDARD

        indexer_parameters = IndexingParameters(
            batch_size=batch_size,
            configuration=IndexingParametersConfiguration(
                data_to_extract=BlobIndexerDataToExtract.CONTENT_AND_METADATA,
                query_timeout=None,
                execution_environment=execution_environment,
                # Best-effort ingestion: skip bad documents rather than
                # failing the whole run, but cap total failures at 5.
                fail_on_unprocessable_document=False,
                fail_on_unsupported_content_type=False,
                index_storage_metadata_only_for_oversized_documents=True,
                indexed_file_name_extensions=".jsonl",
                parsing_mode=self.parsing_mode,
            ),
            max_failed_items=5,
        )

        indexer = SearchIndexer(
            name=self.indexer_name,
            description="Indexer to column values",
            target_index_name=self.index_name,
            data_source_name=self.data_source_name,
            schedule=schedule,
            field_mappings=[
                FieldMapping(
                    source_field_name="metadata_storage_last_modified",
                    target_field_name="DateLastModified",
                )
            ],
            output_field_mappings=[
                FieldMapping(
                    source_field_name="/document/Id",
                    target_field_name="Id",
                    mapping_function=FieldMappingFunction(
                        name="base64Encode",
                        parameters={"useHttpServerUtilityUrlTokenEncode": False},
                    ),
                ),
                FieldMapping(
                    source_field_name="/document/Entity", target_field_name="Entity"
                ),
                FieldMapping(
                    source_field_name="/document/Database",
                    target_field_name="Database",
                ),
                FieldMapping(
                    source_field_name="/document/Warehouse",
                    target_field_name="Warehouse",
                ),
                # Fix: Catalog was declared as an index field (kept for
                # Databricks) but had no mapping, so it was never populated.
                # It is filtered out below for engines that exclude it.
                FieldMapping(
                    source_field_name="/document/Catalog",
                    target_field_name="Catalog",
                ),
                FieldMapping(
                    source_field_name="/document/Column",
                    target_field_name="Column",
                ),
                FieldMapping(
                    source_field_name="/document/Value",
                    target_field_name="Value",
                ),
                FieldMapping(
                    source_field_name="/document/Synonyms",
                    target_field_name="Synonyms",
                ),
                FieldMapping(
                    source_field_name="/document/DateLastModified",
                    target_field_name="DateLastModified",
                ),
            ],
            parameters=indexer_parameters,
        )

        # Remove fields that are not supported by the database engine
        indexer.output_field_mappings = [
            field_mapping
            for field_mapping in indexer.output_field_mappings
            if field_mapping.target_field_name
            not in self.excluded_fields_for_database_engine
        ]

        return indexer
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,6 @@ def get_index_fields(self) -> list[SearchableField]:
SearchableField(
name="DataType", type=SearchFieldDataType.String
),
SearchableField(
name="AllowedValues",
type=SearchFieldDataType.String,
collection=True,
searchable=False,
),
SearchableField(
name="SampleValues",
type=SearchFieldDataType.String,
Expand Down
Loading
Loading