Commit 27e5251

Update ai search deploy
1 parent 6049d7b commit 27e5251

6 files changed: +1266 −13 lines changed

deploy_ai_search/README.md

Lines changed: 4 additions & 4 deletions
@@ -2,14 +2,14 @@

 The associated scripts in this portion of the repository contain pre-built scripts to deploy the skillsets needed for both Text2SQL and Image Processing.

-## Steps for Rag Documents Index Deployment (For Image Processing)
+## Steps for Image Processing Index Deployment (For Image Processing)

 1. Update `.env` file with the associated values. Not all values are required, depending on whether you are using System / User Assigned Identities or key based authentication.
-2. Adjust `rag_documents.py` with any changes to the index / indexer. The `get_skills()` method implements the skills pipeline. Make any adjustments here in the skills needed to enrich the data source.
+2. Adjust `image_processing.py` with any changes to the index / indexer. The `get_skills()` method implements the skills pipeline. Make any adjustments here in the skills needed to enrich the data source.
 3. Run `deploy.py` with the following args:

-   - `index_type rag`. This selects the `RagDocumentsAISearch` sub class.
-   - `enable_page_chunking True`. This determines whether page wise chunking is applied in ADI, or whether the inbuilt skill is used for TextSplit. **Page wise analysis in ADI is recommended to avoid splitting tables / figures across multiple chunks, when the chunking is performed.**
+   - `index_type image_processing`. This selects the `ImageProcessingAISearch` sub class.
+   - `enable_page_chunking True`. This determines whether page wise chunking is applied in ADI, or whether the inbuilt skill is used for TextSplit. This suits documents that are inherently page-wise, e.g. pptx files.
    - `rebuild`. Whether to delete and rebuild the index.
    - `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want to deploy a test version before overwriting the main version.
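For readers wiring this up locally, here is a minimal sketch of how the arguments described in the README hunk above might be parsed. The actual flag definitions live in `deploy.py` and are not part of this diff, so the flag syntax, boolean handling, and defaults shown here are assumptions rather than the repository's real parser:

```python
# Illustrative sketch only; the real parser in deploy.py may differ in flag
# names, boolean handling, and defaults.
import argparse


def parse_bool(value: str) -> bool:
    """Interpret 'True'/'False' style command line values."""
    return value.strip().lower() in ("true", "1", "yes")


parser = argparse.ArgumentParser(description="Deploy an AI Search index and indexer.")
parser.add_argument("--index_type", required=True,
                    help="e.g. 'image_processing' to select the ImageProcessingAISearch sub class")
parser.add_argument("--enable_page_chunking", type=parse_bool, default=False,
                    help="Use page-wise chunking in ADI instead of the inbuilt TextSplit skill")
parser.add_argument("--rebuild", type=parse_bool, default=False,
                    help="Delete and rebuild the index")
parser.add_argument("--suffix", default="None",
                    help="Optional suffix applied to the deployed index and indexer")
arguments = parser.parse_args()

# Assumed invocation (flag spelling is a guess):
#   python deploy.py --index_type image_processing --enable_page_chunking True --rebuild True --suffix test
```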

deploy_ai_search/src/deploy_ai_search/deploy.py

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 import argparse
-from rag_documents import RagDocumentsAISearch
+from image_processing import ImageProcessingAISearch
 from text_2_sql_schema_store import Text2SqlSchemaStoreAISearch
 from text_2_sql_query_cache import Text2SqlQueryCacheAISearch
 from text_2_sql_column_value_store import Text2SqlColumnValueStoreAISearch
@@ -17,8 +17,8 @@ def deploy_config(arguments: argparse.Namespace):
         arguments (argparse.Namespace): The arguments passed to the script"""

     suffix = None if args.suffix == "None" else args.suffix
-    if arguments.index_type == "rag":
-        index_config = RagDocumentsAISearch(
+    if arguments.index_type == "image_processing":
+        index_config = ImageProcessingAISearch(
             suffix=suffix,
             rebuild=arguments.rebuild,
             enable_page_by_chunking=arguments.enable_page_chunking,
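The hunk above shows only the renamed `image_processing` branch. Judging by the imports at the top of the file, `deploy_config` presumably continues with parallel branches for the Text2SQL indexes; a hedged sketch of that dispatch shape, where the other `index_type` strings and constructor arguments are assumptions not shown in this diff:

```python
# Sketch of the dispatch shape implied by the imports in deploy.py; the branch
# strings and constructor arguments for the Text2SQL classes are assumptions.
if arguments.index_type == "image_processing":
    index_config = ImageProcessingAISearch(
        suffix=suffix,
        rebuild=arguments.rebuild,
        enable_page_by_chunking=arguments.enable_page_chunking,
    )
elif arguments.index_type == "text_2_sql_schema_store":
    index_config = Text2SqlSchemaStoreAISearch(suffix=suffix, rebuild=arguments.rebuild)
elif arguments.index_type == "text_2_sql_query_cache":
    index_config = Text2SqlQueryCacheAISearch(suffix=suffix, rebuild=arguments.rebuild)
else:
    raise ValueError(f"Unknown index_type: {arguments.index_type}")
```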

deploy_ai_search/src/deploy_ai_search/environment.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 class IndexerType(Enum):
     """The type of the indexer"""

-    RAG_DOCUMENTS = "rag-documents"
+    IMAGE_PROCESSING = "image-processing"
     TEXT_2_SQL_SCHEMA_STORE = "text-2-sql-schema-store"
     TEXT_2_SQL_QUERY_CACHE = "text-2-sql-query-cache"
     TEXT_2_SQL_COLUMN_VALUE_STORE = "text-2-sql-column-value-store"
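The enum values read like resource-name stems, and the README notes that an optional suffix can be appended to the deployed index and indexer for test deployments. A minimal sketch of that naming idea; the actual name construction lives in the `AISearch` base class, which is not part of this diff, so the scheme below is an assumption:

```python
from enum import Enum


class IndexerType(Enum):
    """The type of the indexer (after this commit's rename)."""

    IMAGE_PROCESSING = "image-processing"
    TEXT_2_SQL_SCHEMA_STORE = "text-2-sql-schema-store"
    TEXT_2_SQL_QUERY_CACHE = "text-2-sql-query-cache"
    TEXT_2_SQL_COLUMN_VALUE_STORE = "text-2-sql-column-value-store"


def index_name(indexer_type: IndexerType, suffix: str | None = None) -> str:
    # Assumption: names are derived from the enum value, with the optional
    # suffix appended for test deployments as described in the README.
    base = f"{indexer_type.value}-index"
    return f"{base}-{suffix}" if suffix else base


# index_name(IndexerType.IMAGE_PROCESSING, "test") -> "image-processing-index-test"
```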

deploy_ai_search/src/deploy_ai_search/rag_documents.py renamed to deploy_ai_search/src/deploy_ai_search/image_processing.py

Lines changed: 5 additions & 5 deletions
@@ -29,7 +29,7 @@
 )


-class RagDocumentsAISearch(AISearch):
+class ImageProcessingAISearch(AISearch):
     """This class is used to deploy the rag document index."""

     def __init__(
@@ -38,13 +38,13 @@ def __init__(
         rebuild: bool | None = False,
         enable_page_by_chunking=False,
     ):
-        """Initialize the RagDocumentsAISearch class. This class implements the deployment of the rag document index.
+        """Initialize the ImageProcessingAISearch class. This class implements the deployment of the rag document index.

        Args:
            suffix (str, optional): The suffix for the indexer. Defaults to None. If a suffix is provided, it is assumed to be a test indexer.
            rebuild (bool, optional): Whether to rebuild the index. Defaults to False.
        """
-        self.indexer_type = IndexerType.RAG_DOCUMENTS
+        self.indexer_type = IndexerType.IMAGE_PROCESSING
         super().__init__(suffix, rebuild)

         if enable_page_by_chunking is not None:
@@ -140,7 +140,7 @@ def get_index_fields(self) -> list[SearchableField]:
         if self.enable_page_by_chunking:
             fields.extend(
                 [
-                    SearchableField(
+                    SimpleField(
                         name="PageNumber",
                         type=SearchFieldDataType.Int64,
                         sortable=True,
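The switch from `SearchableField` to `SimpleField` in the hunk above matters because, in the `azure-search-documents` SDK, `SearchableField` is a convenience helper for full-text searchable string fields (it always emits an `Edm.String` or collection-of-string field), whereas `SimpleField` can carry any primitive type and supports sorting and filtering without full-text search, which is what an `Int64` page number needs. A short sketch of the distinction; the `filterable` flag is an assumption, since the rest of the field definition is truncated in this hunk:

```python
from azure.search.documents.indexes.models import (
    SearchableField,
    SearchFieldDataType,
    SimpleField,
)

# SearchableField is intended for full-text searchable strings; the helper
# always emits an Edm.String (or Collection(Edm.String)) field.
title_field = SearchableField(name="Title")

# SimpleField is the right helper for non-searchable primitives such as an
# Int64 page number that only needs to be sortable/filterable.
page_number_field = SimpleField(
    name="PageNumber",
    type=SearchFieldDataType.Int64,
    sortable=True,
    filterable=True,  # assumption: the remaining flags are not shown in this hunk
)
```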
@@ -286,7 +286,7 @@ def get_indexer(self) -> SearchIndexer:
         indexer_parameters = IndexingParameters(
             batch_size=batch_size,
             configuration=IndexingParametersConfiguration(
-                data_to_extract=BlobIndexerDataToExtract.STORAGE_METADATA,
+                data_to_extract=BlobIndexerDataToExtract.ALL_METADATA,
                 query_timeout=None,
                 execution_environment=execution_environment,
                 fail_on_unprocessable_document=False,
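For context on this last hunk: in Azure AI Search blob indexing, `BlobIndexerDataToExtract.STORAGE_METADATA` indexes only the standard blob properties and user-defined metadata, while `ALL_METADATA` additionally extracts content-type specific metadata (for example, properties of Office documents or image files). A hedged, self-contained sketch of the resulting indexer parameters; the surrounding values such as `batch_size` and `execution_environment` come from code not shown in this diff, so placeholder values are used:

```python
from azure.search.documents.indexes.models import (
    BlobIndexerDataToExtract,
    IndexingParameters,
    IndexingParametersConfiguration,
)

# ALL_METADATA pulls content-type specific metadata in addition to the standard
# blob storage metadata that STORAGE_METADATA limits extraction to.
indexer_parameters = IndexingParameters(
    batch_size=1,  # placeholder: the real batch_size comes from surrounding code
    configuration=IndexingParametersConfiguration(
        data_to_extract=BlobIndexerDataToExtract.ALL_METADATA,
        query_timeout=None,
    ),
)
```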
