Skip to content

Data Dictionary Updates #81

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
uv.lock

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down
2 changes: 1 addition & 1 deletion adi_function_app/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ dependencies = [
"pymupdf>=1.24.14",
"aiohttp>=3.11.9",
"pillow>=11.0.0",
"numpy>=2.1.3",
"numpy<2.0.0",
"spacy>=3.7.5",
"tiktoken>=0.8.0",
"en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1.tar.gz"
Expand Down
24 changes: 0 additions & 24 deletions deploy_ai_search/.env

This file was deleted.

15 changes: 15 additions & 0 deletions deploy_ai_search/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,19 @@ dependencies = [
"azure-search-documents==11.6.0b8",
"azure-storage-blob>=12.24.0",
"python-dotenv>=1.0.1",
"text_2_sql_core",
]

[dependency-groups]
dev = [
"black>=24.10.0",
"ipykernel>=6.29.5",
"jupyter>=1.1.1",
"pre-commit>=4.0.1",
"pygments>=2.18.0",
"ruff>=0.8.1",
"python-dotenv>=1.0.1",
]

[tool.uv.sources]
text_2_sql_core = { workspace = true }
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ The associated scripts in this portion of the repository contains pre-built scri
- `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want to deploy a test version before overwriting the main version.
- `single_data_dictionary_file`. Optional parameter that controls whether you will be uploading a single data dictionary, or a data dictionary file per entity. By default, this is set to False.

### Column Value Store Index

1. Update the `.env` file with the associated values. Not all values are required, depending on whether you are using System / User Assigned Identities or key-based authentication.
2. Adjust `text_2_sql_column_value_store.py` with any changes to the index / indexer.
3. Run `deploy.py` with the following args:

- `index_type text_2_sql_column_value_store`. This selects the `Text2SQLColumnValueStoreAISearch` sub class.
- `rebuild`. Whether to delete and rebuild the index.
- `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want to deploy a test version before overwriting the main version.

### Query Cache Index

1. Update `.env` file with the associated values. Not all values are required dependent on whether you are using System / User Assigned Identities or a Key based authentication.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from rag_documents import RagDocumentsAISearch
from text_2_sql_schema_store import Text2SqlSchemaStoreAISearch
from text_2_sql_query_cache import Text2SqlQueryCacheAISearch
from text_2_sql_column_value_store import Text2SqlColumnValueStoreAISearch
import logging

logging.basicConfig(level=logging.INFO)
Expand Down Expand Up @@ -33,6 +34,11 @@ def deploy_config(arguments: argparse.Namespace):
single_query_cache_file=arguments.single_query_cache_file,
enable_query_cache_indexer=arguments.enable_query_cache_indexer,
)
elif arguments.index_type == "text_2_sql_column_value_store":
index_config = Text2SqlColumnValueStoreAISearch(
suffix=arguments.suffix,
rebuild=arguments.rebuild,
)
else:
raise ValueError("Invalid Indexer Type")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class IndexerType(Enum):
RAG_DOCUMENTS = "rag-documents"
TEXT_2_SQL_SCHEMA_STORE = "text-2-sql-schema-store"
TEXT_2_SQL_QUERY_CACHE = "text-2-sql-query-cache"
TEXT_2_SQL_COLUMN_VALUE_STORE = "text-2-sql-column-value-store"


class IdentityType(Enum):
Expand Down Expand Up @@ -172,10 +173,18 @@ def storage_account_blob_container_name(self) -> str:
This function returns azure blob container name
"""

return os.environ.get(
container = os.environ.get(
f"StorageAccount__{self.normalised_indexer_type}__Container"
)

if container is None:
raise ValueError(
f"""Populate environment variable 'StorageAccount__{
self.normalised_indexer_type}__Container' with container name."""
)

return container

@property
def function_app_end_point(self) -> str:
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from azure.search.documents.indexes.models import (
SearchFieldDataType,
SearchableField,
SearchIndexer,
FieldMapping,
SimpleField,
IndexingParameters,
IndexingParametersConfiguration,
BlobIndexerDataToExtract,
IndexerExecutionEnvironment,
BlobIndexerParsingMode,
FieldMappingFunction,
)
from ai_search import AISearch
from environment import (
IndexerType,
)
import os
from text_2_sql_core.utils.database import DatabaseEngine


class Text2SqlColumnValueStoreAISearch(AISearch):
    """Deploys the text-2-sql column value store index and indexer.

    The index stores distinct column values (plus synonyms) per entity so the
    text-2-sql engine can ground generated SQL in real data values. Source
    documents are JSON Lines blobs; one record per column value.
    """

    def __init__(
        self,
        suffix: str | None = None,
        rebuild: bool | None = False,
    ):
        """Initialize the Text2SqlColumnValueStoreAISearch class. This class implements the deployment of the sql index.

        Args:
            suffix (str, optional): The suffix for the indexer. Defaults to None. If a suffix is provided, it is assumed to be a test indexer.
            rebuild (bool, optional): Whether to rebuild the index. Defaults to False.

        Raises:
            KeyError: If 'Text2Sql__DatabaseEngine' is unset or does not name
                a valid DatabaseEngine member.
        """
        self.indexer_type = IndexerType.TEXT_2_SQL_COLUMN_VALUE_STORE
        super().__init__(suffix, rebuild)

        # The engine determines which engine-specific fields (Warehouse /
        # Database / Catalog) are retained in the index and indexer mappings.
        self.database_engine = DatabaseEngine[
            os.environ["Text2Sql__DatabaseEngine"].upper()
        ]

        # Data dictionary value files are written as JSON Lines.
        self.parsing_mode = BlobIndexerParsingMode.JSON_LINES

    @property
    def excluded_fields_for_database_engine(self):
        """A method to get the excluded fields for the database engine.

        Returns:
            list[str]: Engine-specific field names that do NOT apply to the
                configured database engine and should be stripped from the
                index fields and indexer mappings.
        """

        all_engine_specific_fields = ["Warehouse", "Database", "Catalog"]
        if self.database_engine == DatabaseEngine.SNOWFLAKE:
            engine_specific_fields = ["Warehouse", "Database"]
        elif self.database_engine == DatabaseEngine.TSQL:
            engine_specific_fields = ["Database"]
        elif self.database_engine == DatabaseEngine.DATABRICKS:
            engine_specific_fields = ["Catalog"]
        else:
            # Fix: previously an unrecognised engine left
            # `engine_specific_fields` unbound and raised NameError here.
            # Keep no engine-specific fields for unknown engines instead.
            engine_specific_fields = []

        return [
            field
            for field in all_engine_specific_fields
            if field not in engine_specific_fields
        ]

    def get_index_fields(self) -> list[SearchableField]:
        """This function returns the index fields for sql index.

        Returns:
            list[SearchableField]: The index fields for sql index"""

        fields = [
            # Base64-encoded document key (see the base64Encode mapping
            # function in get_indexer).
            SimpleField(
                name="Id",
                type=SearchFieldDataType.String,
                key=True,
                analyzer_name="keyword",
            ),
            SimpleField(
                name="Entity",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Database",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Warehouse",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Catalog",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Column",
                type=SearchFieldDataType.String,
            ),
            # The actual column value is the only searchable field.
            SearchableField(
                name="Value",
                type=SearchFieldDataType.String,
                hidden=False,
            ),
            SimpleField(
                name="Synonyms", type=SearchFieldDataType.String, collection=True
            ),
            SimpleField(
                name="DateLastModified",
                type=SearchFieldDataType.DateTimeOffset,
                filterable=True,
            ),
        ]

        # Remove fields that are not supported by the database engine
        fields = [
            field
            for field in fields
            if field.name not in self.excluded_fields_for_database_engine
        ]

        return fields

    def get_skills(self) -> list:
        """Get the skillset for the indexer.

        Returns:
            list: The skillsets used in the indexer (none — values are
                indexed verbatim, no enrichment is required)."""

        skills = []

        return skills

    def get_indexer(self) -> SearchIndexer:
        """This function returns the indexer for sql.

        Returns:
            SearchIndexer: The indexer for sql"""

        # Only place on schedule if it is not a test deployment
        if self.test:
            schedule = None
            batch_size = 4
        else:
            schedule = {"interval": "PT24H"}
            batch_size = 16

        if self.environment.use_private_endpoint:
            execution_environment = IndexerExecutionEnvironment.PRIVATE
        else:
            execution_environment = IndexerExecutionEnvironment.STANDARD

        indexer_parameters = IndexingParameters(
            batch_size=batch_size,
            configuration=IndexingParametersConfiguration(
                data_to_extract=BlobIndexerDataToExtract.CONTENT_AND_METADATA,
                query_timeout=None,
                execution_environment=execution_environment,
                # Best-effort ingestion: skip bad documents rather than
                # failing the whole run, but cap total failures at 5.
                fail_on_unprocessable_document=False,
                fail_on_unsupported_content_type=False,
                index_storage_metadata_only_for_oversized_documents=True,
                indexed_file_name_extensions=".jsonl",
                parsing_mode=self.parsing_mode,
            ),
            max_failed_items=5,
        )

        indexer = SearchIndexer(
            name=self.indexer_name,
            description="Indexer to column values",
            target_index_name=self.index_name,
            data_source_name=self.data_source_name,
            schedule=schedule,
            field_mappings=[
                FieldMapping(
                    source_field_name="metadata_storage_last_modified",
                    target_field_name="DateLastModified",
                )
            ],
            output_field_mappings=[
                FieldMapping(
                    source_field_name="/document/Id",
                    target_field_name="Id",
                    mapping_function=FieldMappingFunction(
                        name="base64Encode",
                        parameters={"useHttpServerUtilityUrlTokenEncode": False},
                    ),
                ),
                FieldMapping(
                    source_field_name="/document/Entity", target_field_name="Entity"
                ),
                FieldMapping(
                    source_field_name="/document/Database",
                    target_field_name="Database",
                ),
                FieldMapping(
                    source_field_name="/document/Warehouse",
                    target_field_name="Warehouse",
                ),
                # Fix: Catalog was declared as an index field (kept for
                # Databricks) but had no mapping, so it was never populated.
                # It is filtered out below for engines that exclude it.
                FieldMapping(
                    source_field_name="/document/Catalog",
                    target_field_name="Catalog",
                ),
                FieldMapping(
                    source_field_name="/document/Column",
                    target_field_name="Column",
                ),
                FieldMapping(
                    source_field_name="/document/Value",
                    target_field_name="Value",
                ),
                FieldMapping(
                    source_field_name="/document/Synonyms",
                    target_field_name="Synonyms",
                ),
                FieldMapping(
                    source_field_name="/document/DateLastModified",
                    target_field_name="DateLastModified",
                ),
            ],
            parameters=indexer_parameters,
        )

        # Remove fields that are not supported by the database engine
        indexer.output_field_mappings = [
            field_mapping
            for field_mapping in indexer.output_field_mappings
            if field_mapping.target_field_name
            not in self.excluded_fields_for_database_engine
        ]

        return indexer
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,6 @@ def get_index_fields(self) -> list[SearchableField]:
SearchableField(
name="DataType", type=SearchFieldDataType.String
),
SearchableField(
name="AllowedValues",
type=SearchFieldDataType.String,
collection=True,
searchable=False,
),
SearchableField(
name="SampleValues",
type=SearchFieldDataType.String,
Expand Down
Loading
Loading