From c1fc4af39bbb3104530a13638b121c64ae206283 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Thu, 28 Nov 2024 12:44:31 +0000 Subject: [PATCH 1/3] Add query cache indexer --- deploy_ai_search/.env | 1 + deploy_ai_search/README.md | 6 +- deploy_ai_search/deploy.py | 21 ++- deploy_ai_search/text_2_sql_query_cache.py | 138 +++++++++++++++++++- deploy_ai_search/text_2_sql_schema_store.py | 4 +- 5 files changed, 160 insertions(+), 10 deletions(-) diff --git a/deploy_ai_search/.env b/deploy_ai_search/.env index af194da..7ff4c19 100644 --- a/deploy_ai_search/.env +++ b/deploy_ai_search/.env @@ -15,6 +15,7 @@ StorageAccount__FQEndpoint=;AccountKey=;EndpointSuffix=core.windows.net> StorageAccount__RagDocuments__Container= StorageAccount__Text2SqlSchemaStore__Container= +StorageAccount__Text2SqlQueryCache__Container= OpenAI__ApiKey= OpenAI__Endpoint= OpenAI__EmbeddingModel= diff --git a/deploy_ai_search/README.md b/deploy_ai_search/README.md index 049b046..36b2254 100644 --- a/deploy_ai_search/README.md +++ b/deploy_ai_search/README.md @@ -24,17 +24,19 @@ The associated scripts in this portion of the repository contains pre-built scri - `index_type text_2_sql_schema_store`. This selects the `Text2SQLSchemaStoreAISearch` sub class. - `rebuild`. Whether to delete and rebuild the index. - `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want deploy a test version, before overwriting the main version. - - `single_data_dictionary`. Optional parameter that controls whether you will be uploading a single data dictionary, or a data dictionary file per entity. By default, this is set to False. + - `single_data_dictionary_file`. Optional parameter that controls whether you will be uploading a single data dictionary, or a data dictionary file per entity. By default, this is set to False. ### Query Cache Index 1. Update `.env` file with the associated values. 
Not all values are required dependent on whether you are using System / User Assigned Identities or a Key based authentication. -2. Adjust `text_2_sql_query_cache.py` with any changes to the index. **There is no provided indexer or skillset for this cache, it is expected that application code will write directly to it. See the details in the Text2SQL README for different cache strategies.** +2. Adjust `text_2_sql_query_cache.py` with any changes to the index. **There is an optional provided indexer or skillset for this cache. You may instead want the application code to write directly to it. See the details in the Text2SQL README for different cache strategies.** 3. Run `deploy.py` with the following args: - `index_type text_2_sql_query_cache`. This selects the `Text2SQLQueryCacheAISearch` sub class. - `rebuild`. Whether to delete and rebuild the index. - `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want deploy a test version, before overwriting the main version. + - `enable_query_cache_indexer`. Optional parameter that will enable the query cache indexer. Defaults to False. + - `single_query_cache_file`. Optional parameter that controls whether you will be uploading a single query cache file, or a cache file per question. By default, this is set to False. 
## ai_search.py & environment.py diff --git a/deploy_ai_search/deploy.py b/deploy_ai_search/deploy.py index 5ff2483..b1d073d 100644 --- a/deploy_ai_search/deploy.py +++ b/deploy_ai_search/deploy.py @@ -24,11 +24,14 @@ def deploy_config(arguments: argparse.Namespace): index_config = Text2SqlSchemaStoreAISearch( suffix=arguments.suffix, rebuild=arguments.rebuild, - single_data_dictionary=arguments.single_data_dictionary, + single_data_dictionary_file=arguments.single_data_dictionary_file, ) elif arguments.index_type == "text_2_sql_query_cache": index_config = Text2SqlQueryCacheAISearch( - suffix=arguments.suffix, rebuild=arguments.rebuild + suffix=arguments.suffix, + rebuild=arguments.rebuild, + single_cache_file=arguments.single_cache_file, + enable_cache_indexer=arguments.enable_cache_indexer, ) else: raise ValueError("Invalid Indexer Type") @@ -60,11 +63,23 @@ def deploy_config(arguments: argparse.Namespace): help="Whether want to enable chunking by page in adi skill, if no value is passed considered False", ) parser.add_argument( - "--single_data_dictionary", + "--single_data_dictionary_file", type=bool, required=False, help="Whether or not a single data dictionary file should be uploaded, or one per entity", ) + parser.add_argument( + "--single_cache_file", + type=bool, + required=False, + help="Whether or not a single cache file should be uploaded, or one per question", + ) + parser.add_argument( + "--enable_cache_indexer", + type=bool, + required=False, + help="Whether or not the sql query cache indexer should be enabled", + ) parser.add_argument( "--suffix", type=str, diff --git a/deploy_ai_search/text_2_sql_query_cache.py b/deploy_ai_search/text_2_sql_query_cache.py index 4c685cf..76ed39b 100644 --- a/deploy_ai_search/text_2_sql_query_cache.py +++ b/deploy_ai_search/text_2_sql_query_cache.py @@ -5,12 +5,20 @@ SearchFieldDataType, SearchField, SearchableField, - SimpleField, - ComplexField, SemanticField, SemanticPrioritizedFields, SemanticConfiguration, 
SemanticSearch, + SearchIndexer, + FieldMapping, + SimpleField, + ComplexField, + IndexingParameters, + IndexingParametersConfiguration, + BlobIndexerDataToExtract, + IndexerExecutionEnvironment, + BlobIndexerParsingMode, + FieldMappingFunction, ) from ai_search import AISearch from environment import ( @@ -21,16 +29,30 @@ class Text2SqlQueryCacheAISearch(AISearch): """This class is used to deploy the sql index.""" - def __init__(self, suffix: str | None = None, rebuild: bool | None = False): + def __init__( + self, + suffix: str | None = None, + rebuild: bool | None = False, + single_cache_file: bool | None = False, + enable_cache_indexer: bool | None = False, + ): """Initialize the Text2SqlAISearch class. This class implements the deployment of the sql index. Args: suffix (str, optional): The suffix for the indexer. Defaults to None. If an suffix is provided, it is assumed to be a test indexer. rebuild (bool, optional): Whether to rebuild the index. Defaults to False. + single_cache_file (bool, optional): Whether to use a single cache file. Defaults to False. Only applies if the cache indexer is enabled. + enable_cache_indexer (bool, optional): Whether to enable cache indexer. Defaults to False. """ self.indexer_type = IndexerType.TEXT_2_SQL_QUERY_CACHE + self.enable_cache_indexer = enable_cache_indexer super().__init__(suffix, rebuild) + if single_cache_file: + self.parsing_mode = BlobIndexerParsingMode.JSON_ARRAY + else: + self.parsing_mode = BlobIndexerParsingMode.JSON + def get_index_fields(self) -> list[SearchableField]: """This function returns the index fields for sql index. 
@@ -56,6 +78,11 @@ def get_index_fields(self) -> list[SearchableField]: name="SqlQueryDecomposition", collection=True, fields=[ + SearchableField( + name="SubQuestion", + type=SearchFieldDataType.String, + filterable=True, + ), SearchableField( name="SqlQuery", type=SearchFieldDataType.String, @@ -130,3 +157,108 @@ def get_semantic_search(self) -> SemanticSearch: semantic_search = SemanticSearch(configurations=[semantic_config]) return semantic_search + + def get_skills(self) -> list: + """Get the skillset for the indexer. + + Returns: + list: The skillsets used in the indexer""" + + if self.enable_cache_indexer is False: + return [] + + embedding_skill = self.get_vector_skill( + "/document", "/document/Question", target_name="QuestionEmbedding" + ) + + skills = [embedding_skill] + + return skills + + def get_indexer(self) -> SearchIndexer: + """This function returns the indexer for sql. + + Returns: + SearchIndexer: The indexer for sql""" + + if self.enable_cache_indexer is False: + return None + + # Only place on schedule if it is not a test deployment + if self.test: + schedule = None + batch_size = 4 + else: + schedule = {"interval": "PT24H"} + batch_size = 16 + + if self.environment.use_private_endpoint: + execution_environment = IndexerExecutionEnvironment.PRIVATE + else: + execution_environment = IndexerExecutionEnvironment.STANDARD + + indexer_parameters = IndexingParameters( + batch_size=batch_size, + configuration=IndexingParametersConfiguration( + data_to_extract=BlobIndexerDataToExtract.CONTENT_AND_METADATA, + query_timeout=None, + execution_environment=execution_environment, + fail_on_unprocessable_document=False, + fail_on_unsupported_content_type=False, + index_storage_metadata_only_for_oversized_documents=True, + indexed_file_name_extensions=".json", + parsing_mode=self.parsing_mode, + ), + max_failed_items=5, + ) + + indexer = SearchIndexer( + name=self.indexer_name, + description="Indexer to sql entities and generate embeddings", + 
skillset_name=self.skillset_name, + target_index_name=self.index_name, + data_source_name=self.data_source_name, + schedule=schedule, + field_mappings=[ + FieldMapping( + source_field_name="metadata_storage_last_modified", + target_field_name="DateLastModified", + ) + ], + output_field_mappings=[ + FieldMapping( + source_field_name="/document/Question", + target_field_name="Id", + mapping_function=FieldMappingFunction( + name="base64Encode", + parameters={"useHttpServerUtilityUrlTokenEncode": False}, + ), + ), + FieldMapping( + source_field_name="/document/Question", target_field_name="Question" + ), + FieldMapping( + source_field_name="/document/QuestionEmbedding", + target_field_name="QuestionEmbedding", + ), + FieldMapping( + source_field_name="/document/SqlQueryDecomposition", + target_field_name="SqlQueryDecomposition", + ), + FieldMapping( + source_field_name="/document/DateLastModified", + target_field_name="DateLastModified", + ), + ], + parameters=indexer_parameters, + ) + + # Remove fields that are not supported by the database engine + indexer.output_field_mappings = [ + field_mapping + for field_mapping in indexer.output_field_mappings + if field_mapping.target_field_name + not in self.excluded_fields_for_database_engine + ] + + return indexer diff --git a/deploy_ai_search/text_2_sql_schema_store.py b/deploy_ai_search/text_2_sql_schema_store.py index 3c7b50c..db84a72 100644 --- a/deploy_ai_search/text_2_sql_schema_store.py +++ b/deploy_ai_search/text_2_sql_schema_store.py @@ -43,7 +43,7 @@ def __init__( self, suffix: str | None = None, rebuild: bool | None = False, - single_data_dictionary: bool | None = False, + single_data_dictionary_file: bool | None = False, ): """Initialize the Text2SqlAISearch class. This class implements the deployment of the sql index. 
@@ -57,7 +57,7 @@ def __init__( ] super().__init__(suffix, rebuild) - if single_data_dictionary: + if single_data_dictionary_file: self.parsing_mode = BlobIndexerParsingMode.JSON_ARRAY else: self.parsing_mode = BlobIndexerParsingMode.JSON From c8c1f5d364285e8f1cb4548a543755a62f89df83 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Thu, 28 Nov 2024 12:46:23 +0000 Subject: [PATCH 2/3] Remove output --- deploy_ai_search/text_2_sql_query_cache.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/deploy_ai_search/text_2_sql_query_cache.py b/deploy_ai_search/text_2_sql_query_cache.py index 76ed39b..56bf6a2 100644 --- a/deploy_ai_search/text_2_sql_query_cache.py +++ b/deploy_ai_search/text_2_sql_query_cache.py @@ -253,12 +253,4 @@ def get_indexer(self) -> SearchIndexer: parameters=indexer_parameters, ) - # Remove fields that are not supported by the database engine - indexer.output_field_mappings = [ - field_mapping - for field_mapping in indexer.output_field_mappings - if field_mapping.target_field_name - not in self.excluded_fields_for_database_engine - ] - return indexer From 2c3ab580366d853809a72a92d109c82b11e7ec6d Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Thu, 28 Nov 2024 12:48:22 +0000 Subject: [PATCH 3/3] Update parameters --- deploy_ai_search/deploy.py | 8 ++++---- deploy_ai_search/text_2_sql_query_cache.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/deploy_ai_search/deploy.py b/deploy_ai_search/deploy.py index b1d073d..0a6fb9b 100644 --- a/deploy_ai_search/deploy.py +++ b/deploy_ai_search/deploy.py @@ -30,8 +30,8 @@ def deploy_config(arguments: argparse.Namespace): index_config = Text2SqlQueryCacheAISearch( suffix=arguments.suffix, rebuild=arguments.rebuild, - single_cache_file=arguments.single_cache_file, - enable_cache_indexer=arguments.enable_cache_indexer, + single_query_cache_file=arguments.single_query_cache_file, + enable_query_cache_indexer=arguments.enable_query_cache_indexer, ) else: raise 
ValueError("Invalid Indexer Type") @@ -69,13 +69,13 @@ def deploy_config(arguments: argparse.Namespace): help="Whether or not a single data dictionary file should be uploaded, or one per entity", ) parser.add_argument( - "--single_cache_file", + "--single_query_cache_file", type=bool, required=False, help="Whether or not a single cache file should be uploaded, or one per question", ) parser.add_argument( - "--enable_cache_indexer", + "--enable_query_cache_indexer", type=bool, required=False, help="Whether or not the sql query cache indexer should be enabled", diff --git a/deploy_ai_search/text_2_sql_query_cache.py b/deploy_ai_search/text_2_sql_query_cache.py index 56bf6a2..4b27be5 100644 --- a/deploy_ai_search/text_2_sql_query_cache.py +++ b/deploy_ai_search/text_2_sql_query_cache.py @@ -33,22 +33,22 @@ def __init__( self, suffix: str | None = None, rebuild: bool | None = False, - single_cache_file: bool | None = False, - enable_cache_indexer: bool | None = False, + single_query_cache_file: bool | None = False, + enable_query_cache_indexer: bool | None = False, ): """Initialize the Text2SqlAISearch class. This class implements the deployment of the sql index. Args: suffix (str, optional): The suffix for the indexer. Defaults to None. If an suffix is provided, it is assumed to be a test indexer. rebuild (bool, optional): Whether to rebuild the index. Defaults to False. - single_cache_file (bool, optional): Whether to use a single cache file. Defaults to False. Only applies if the cache indexer is enabled. - enable_cache_indexer (bool, optional): Whether to enable cache indexer. Defaults to False. + single_query_cache_file (bool, optional): Whether to use a single cache file. Defaults to False. Only applies if the cache indexer is enabled. + enable_query_cache_indexer (bool, optional): Whether to enable cache indexer. Defaults to False. 
""" self.indexer_type = IndexerType.TEXT_2_SQL_QUERY_CACHE - self.enable_cache_indexer = enable_cache_indexer + self.enable_query_cache_indexer = enable_query_cache_indexer super().__init__(suffix, rebuild) - if single_cache_file: + if single_query_cache_file: self.parsing_mode = BlobIndexerParsingMode.JSON_ARRAY else: self.parsing_mode = BlobIndexerParsingMode.JSON @@ -164,7 +164,7 @@ def get_skills(self) -> list: Returns: list: The skillsets used in the indexer""" - if self.enable_cache_indexer is False: + if self.enable_query_cache_indexer is False: return [] embedding_skill = self.get_vector_skill( @@ -181,7 +181,7 @@ def get_indexer(self) -> SearchIndexer: Returns: SearchIndexer: The indexer for sql""" - if self.enable_cache_indexer is False: + if self.enable_query_cache_indexer is False: return None # Only place on schedule if it is not a test deployment