
Commit 1c8ed08

Add optional query cache indexer (#66)

* Add query cache indexer
* Remove output
* Update parameters

1 parent c0f7416 commit 1c8ed08

File tree

5 files changed: +152 −10 lines

deploy_ai_search/.env

Lines changed: 1 addition & 0 deletions

```diff
@@ -15,6 +15,7 @@ StorageAccount__FQEndpoint=<Fully qualified endpoint in form ResourceId=resource
 StorageAccount__ConnectionString=<connectionString if using non managed identity. In format: DefaultEndpointsProtocol=https;AccountName=<STG NAME>;AccountKey=<ACCOUNT KEY>;EndpointSuffix=core.windows.net>
 StorageAccount__RagDocuments__Container=<containerName>
 StorageAccount__Text2SqlSchemaStore__Container=<containerName>
+StorageAccount__Text2SqlQueryCache__Container=<containerName>
 OpenAI__ApiKey=<openAIKey if using non managed identity>
 OpenAI__Endpoint=<openAIEndpoint>
 OpenAI__EmbeddingModel=<openAIEmbeddingModelName>
```

deploy_ai_search/README.md

Lines changed: 4 additions & 2 deletions

```diff
@@ -24,17 +24,19 @@ The associated scripts in this portion of the repository contains pre-built scri
 - `index_type text_2_sql_schema_store`. This selects the `Text2SQLSchemaStoreAISearch` sub class.
 - `rebuild`. Whether to delete and rebuild the index.
 - `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want to deploy a test version before overwriting the main version.
-- `single_data_dictionary`. Optional parameter that controls whether you will be uploading a single data dictionary, or a data dictionary file per entity. By default, this is set to False.
+- `single_data_dictionary_file`. Optional parameter that controls whether you will be uploading a single data dictionary, or a data dictionary file per entity. By default, this is set to False.
 
 ### Query Cache Index
 
 1. Update the `.env` file with the associated values. Not all values are required, depending on whether you are using System / User Assigned Identities or key-based authentication.
-2. Adjust `text_2_sql_query_cache.py` with any changes to the index. **There is no provided indexer or skillset for this cache; it is expected that application code will write directly to it. See the details in the Text2SQL README for different cache strategies.**
+2. Adjust `text_2_sql_query_cache.py` with any changes to the index. **An optional indexer and skillset are provided for this cache. You may instead have the application code write directly to it. See the details in the Text2SQL README for different cache strategies.**
 3. Run `deploy.py` with the following args:
 
 - `index_type text_2_sql_query_cache`. This selects the `Text2SQLQueryCacheAISearch` sub class.
 - `rebuild`. Whether to delete and rebuild the index.
 - `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want to deploy a test version before overwriting the main version.
+- `enable_query_cache_indexer`. Optional parameter that will enable the query cache indexer. Defaults to False.
+- `single_query_cache_file`. Optional parameter that controls whether you will be uploading a single query cache file, or a cache file per question. By default, this is set to False.
 
 ## ai_search.py & environment.py
```
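If you choose the direct-write cache strategy instead of the optional indexer, populating the cache is a plain document upload against the index defined in `text_2_sql_query_cache.py`. A minimal sketch, assuming key-based authentication and the `azure-search-documents` SDK; the field names follow the index definition, while the index name, endpoint, and embedding value are hypothetical placeholders:

```python
# Minimal sketch of the "application writes directly to the cache" strategy.
# Field names come from text_2_sql_query_cache.py; endpoint, index name and
# key are illustrative assumptions, not values from this commit.
import base64
from datetime import datetime, timezone

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

search_client = SearchClient(
    endpoint="https://<search-service>.search.windows.net",  # hypothetical
    index_name="text-2-sql-query-cache-index",  # hypothetical name
    credential=AzureKeyCredential("<search-admin-key>"),
)

question = "How many orders were placed last month?"
cache_entry = {
    # URL-safe base64 key; approximates the indexer's base64Encode mapping.
    "Id": base64.urlsafe_b64encode(question.encode()).decode(),
    "Question": question,
    "QuestionEmbedding": [0.0] * 1536,  # replace with a real embedding of the question
    "SqlQueryDecomposition": [
        {"SubQuestion": question, "SqlQuery": "SELECT COUNT(*) FROM Orders ..."}
    ],
    "DateLastModified": datetime.now(timezone.utc).isoformat(),
}

# Upsert so repeated questions refresh the existing cache entry.
search_client.merge_or_upload_documents(documents=[cache_entry])
```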

deploy_ai_search/deploy.py

Lines changed: 18 additions & 3 deletions

```diff
@@ -24,11 +24,14 @@ def deploy_config(arguments: argparse.Namespace):
         index_config = Text2SqlSchemaStoreAISearch(
             suffix=arguments.suffix,
             rebuild=arguments.rebuild,
-            single_data_dictionary=arguments.single_data_dictionary,
+            single_data_dictionary_file=arguments.single_data_dictionary_file,
         )
     elif arguments.index_type == "text_2_sql_query_cache":
         index_config = Text2SqlQueryCacheAISearch(
-            suffix=arguments.suffix, rebuild=arguments.rebuild
+            suffix=arguments.suffix,
+            rebuild=arguments.rebuild,
+            single_query_cache_file=arguments.single_query_cache_file,
+            enable_query_cache_indexer=arguments.enable_query_cache_indexer,
         )
     else:
         raise ValueError("Invalid Indexer Type")
@@ -60,11 +63,23 @@ def deploy_config(arguments: argparse.Namespace):
         help="Whether want to enable chunking by page in adi skill, if no value is passed considered False",
     )
     parser.add_argument(
-        "--single_data_dictionary",
+        "--single_data_dictionary_file",
         type=bool,
         required=False,
         help="Whether or not a single data dictionary file should be uploaded, or one per entity",
     )
+    parser.add_argument(
+        "--single_query_cache_file",
+        type=bool,
+        required=False,
+        help="Whether or not a single cache file should be uploaded, or one per question",
+    )
+    parser.add_argument(
+        "--enable_query_cache_indexer",
+        type=bool,
+        required=False,
+        help="Whether or not the sql query cache indexer should be enabled",
+    )
     parser.add_argument(
         "--suffix",
         type=str,
```
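Note that the boolean args above use argparse's `type=bool`, which does not parse strings: any non-empty value, including the literal `False`, is truthy, so `--enable_query_cache_indexer False` still enables the indexer. A minimal sketch of the conventional alternative (not part of this commit), using `action="store_true"`:

```python
# Sketch: a boolean flag that behaves as expected on the command line.
# argparse's type=bool calls bool("False"), which is True for any
# non-empty string; store_true avoids parsing a value at all.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable_query_cache_indexer",
    action="store_true",  # present => True, absent => False
    help="Whether or not the sql query cache indexer should be enabled",
)

args = parser.parse_args(["--enable_query_cache_indexer"])
assert args.enable_query_cache_indexer is True

args = parser.parse_args([])
assert args.enable_query_cache_indexer is False
```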

deploy_ai_search/text_2_sql_query_cache.py

Lines changed: 127 additions & 3 deletions

```diff
@@ -5,12 +5,20 @@
     SearchFieldDataType,
     SearchField,
     SearchableField,
-    SimpleField,
-    ComplexField,
     SemanticField,
     SemanticPrioritizedFields,
     SemanticConfiguration,
     SemanticSearch,
+    SearchIndexer,
+    FieldMapping,
+    SimpleField,
+    ComplexField,
+    IndexingParameters,
+    IndexingParametersConfiguration,
+    BlobIndexerDataToExtract,
+    IndexerExecutionEnvironment,
+    BlobIndexerParsingMode,
+    FieldMappingFunction,
 )
 from ai_search import AISearch
 from environment import (
@@ -21,16 +29,30 @@
 class Text2SqlQueryCacheAISearch(AISearch):
     """This class is used to deploy the sql index."""
 
-    def __init__(self, suffix: str | None = None, rebuild: bool | None = False):
+    def __init__(
+        self,
+        suffix: str | None = None,
+        rebuild: bool | None = False,
+        single_query_cache_file: bool | None = False,
+        enable_query_cache_indexer: bool | None = False,
+    ):
         """Initialize the Text2SqlAISearch class. This class implements the deployment of the sql index.
 
         Args:
             suffix (str, optional): The suffix for the indexer. Defaults to None. If a suffix is provided, it is assumed to be a test indexer.
             rebuild (bool, optional): Whether to rebuild the index. Defaults to False.
+            single_query_cache_file (bool, optional): Whether to use a single cache file. Defaults to False. Only applies if the cache indexer is enabled.
+            enable_query_cache_indexer (bool, optional): Whether to enable the cache indexer. Defaults to False.
         """
         self.indexer_type = IndexerType.TEXT_2_SQL_QUERY_CACHE
+        self.enable_query_cache_indexer = enable_query_cache_indexer
         super().__init__(suffix, rebuild)
 
+        if single_query_cache_file:
+            self.parsing_mode = BlobIndexerParsingMode.JSON_ARRAY
+        else:
+            self.parsing_mode = BlobIndexerParsingMode.JSON
+
     def get_index_fields(self) -> list[SearchableField]:
         """This function returns the index fields for sql index.
@@ -56,6 +78,11 @@ def get_index_fields(self) -> list[SearchableField]:
                 name="SqlQueryDecomposition",
                 collection=True,
                 fields=[
+                    SearchableField(
+                        name="SubQuestion",
+                        type=SearchFieldDataType.String,
+                        filterable=True,
+                    ),
                     SearchableField(
                         name="SqlQuery",
                         type=SearchFieldDataType.String,
@@ -130,3 +157,100 @@ def get_semantic_search(self) -> SemanticSearch:
         semantic_search = SemanticSearch(configurations=[semantic_config])
 
         return semantic_search
+
+    def get_skills(self) -> list:
+        """Get the skillset for the indexer.
+
+        Returns:
+            list: The skillsets used in the indexer"""
+
+        if self.enable_query_cache_indexer is False:
+            return []
+
+        embedding_skill = self.get_vector_skill(
+            "/document", "/document/Question", target_name="QuestionEmbedding"
+        )
+
+        skills = [embedding_skill]
+
+        return skills
+
+    def get_indexer(self) -> SearchIndexer:
+        """This function returns the indexer for sql.
+
+        Returns:
+            SearchIndexer: The indexer for sql"""
+
+        if self.enable_query_cache_indexer is False:
+            return None
+
+        # Only place on a schedule if it is not a test deployment
+        if self.test:
+            schedule = None
+            batch_size = 4
+        else:
+            schedule = {"interval": "PT24H"}
+            batch_size = 16
+
+        if self.environment.use_private_endpoint:
+            execution_environment = IndexerExecutionEnvironment.PRIVATE
+        else:
+            execution_environment = IndexerExecutionEnvironment.STANDARD
+
+        indexer_parameters = IndexingParameters(
+            batch_size=batch_size,
+            configuration=IndexingParametersConfiguration(
+                data_to_extract=BlobIndexerDataToExtract.CONTENT_AND_METADATA,
+                query_timeout=None,
+                execution_environment=execution_environment,
+                fail_on_unprocessable_document=False,
+                fail_on_unsupported_content_type=False,
+                index_storage_metadata_only_for_oversized_documents=True,
+                indexed_file_name_extensions=".json",
+                parsing_mode=self.parsing_mode,
+            ),
+            max_failed_items=5,
+        )
+
+        indexer = SearchIndexer(
+            name=self.indexer_name,
+            description="Indexer to index sql entities and generate embeddings",
+            skillset_name=self.skillset_name,
+            target_index_name=self.index_name,
+            data_source_name=self.data_source_name,
+            schedule=schedule,
+            field_mappings=[
+                FieldMapping(
+                    source_field_name="metadata_storage_last_modified",
+                    target_field_name="DateLastModified",
+                )
+            ],
+            output_field_mappings=[
+                FieldMapping(
+                    source_field_name="/document/Question",
+                    target_field_name="Id",
+                    mapping_function=FieldMappingFunction(
+                        name="base64Encode",
+                        parameters={"useHttpServerUtilityUrlTokenEncode": False},
+                    ),
+                ),
+                FieldMapping(
+                    source_field_name="/document/Question", target_field_name="Question"
+                ),
+                FieldMapping(
+                    source_field_name="/document/QuestionEmbedding",
+                    target_field_name="QuestionEmbedding",
+                ),
+                FieldMapping(
+                    source_field_name="/document/SqlQueryDecomposition",
+                    target_field_name="SqlQueryDecomposition",
+                ),
+                FieldMapping(
+                    source_field_name="/document/DateLastModified",
+                    target_field_name="DateLastModified",
+                ),
+            ],
+            parameters=indexer_parameters,
+        )
+
+        return indexer
```
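Once the cache is populated, whether by this optional indexer or by direct writes, a lookup is a vector query over `QuestionEmbedding`. A minimal sketch, assuming the `azure-search-documents` SDK; field names come from the index above, while the client setup and embedding value are illustrative assumptions:

```python
# Sketch: checking the query cache for a semantically similar question.
# Field names follow the index definition; endpoint, index name and the
# pre-computed embedding are hypothetical placeholders.
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery

search_client = SearchClient(
    endpoint="https://<search-service>.search.windows.net",  # hypothetical
    index_name="text-2-sql-query-cache-index",  # hypothetical
    credential=AzureKeyCredential("<search-query-key>"),
)

question_embedding = [0.0] * 1536  # replace with a real embedding of the user question

results = search_client.search(
    search_text=None,
    vector_queries=[
        VectorizedQuery(
            vector=question_embedding,
            k_nearest_neighbors=3,
            fields="QuestionEmbedding",
        )
    ],
    select=["Question", "SqlQueryDecomposition"],
    top=3,
)

for result in results:
    # Each hit carries the cached sub-questions and their SQL queries.
    print(result["Question"], result["SqlQueryDecomposition"])
```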

deploy_ai_search/text_2_sql_schema_store.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -43,7 +43,7 @@ def __init__(
         self,
         suffix: str | None = None,
         rebuild: bool | None = False,
-        single_data_dictionary: bool | None = False,
+        single_data_dictionary_file: bool | None = False,
     ):
         """Initialize the Text2SqlAISearch class. This class implements the deployment of the sql index.
 
@@ -57,7 +57,7 @@ def __init__(
         ]
         super().__init__(suffix, rebuild)
 
-        if single_data_dictionary:
+        if single_data_dictionary_file:
             self.parsing_mode = BlobIndexerParsingMode.JSON_ARRAY
         else:
             self.parsing_mode = BlobIndexerParsingMode.JSON
```
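Both classes use the same `parsing_mode` switch: `BlobIndexerParsingMode.JSON` indexes each blob as a single search document, while `BlobIndexerParsingMode.JSON_ARRAY` emits one document per element of a JSON array. A sketch of the two blob shapes this implies, with illustrative field values:

```python
# Sketch of the blob shapes implied by the two parsing modes.
# JSON mode: one file per entity/question, each holding a single object.
per_entity_blob = {
    "Entity": "Orders",  # illustrative data dictionary entry
    "Description": "Sales orders placed by customers",
}

# JSON_ARRAY mode (single_data_dictionary_file / single_query_cache_file = True):
# one file holding an array; the indexer emits one document per element.
single_file_blob = [
    {"Entity": "Orders", "Description": "Sales orders placed by customers"},
    {"Entity": "Customers", "Description": "Customer master records"},
]
```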
