From 39b83956cdbbd7e79892d253dd19c6e7e9653cdb Mon Sep 17 00:00:00 2001
From: priyal1508 <54278892+priyal1508@users.noreply.github.com>
Date: Wed, 18 Sep 2024 11:59:53 +0530
Subject: [PATCH 1/5] contributing md change

---
 CONTRIBUTING.md | 67 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8637528..8a07f92 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,3 +1,70 @@
+## Contributing to this Repository
+Welcome! We greatly appreciate your interest in contributing to our project. Please follow the guidelines below to ensure a smooth and successful contribution process.
+
+## Fork the Repository
+To get started, fork the dstoolkit-text2sql-and-imageprocessing main repository to your own GitHub account by clicking on the "Fork" button at the top right corner of the repository page. This will create a copy of the repository under your account, which you can freely make changes to.
+
+## Clone the Repository
+Next, clone the forked repository to your local machine using the following command:
+
+```
+git clone https://github.com/[your-github-username]/[your-repository-name].git
+```
+
+Make sure to replace [your-github-username] with your actual GitHub username and [your-repository-name] with the name of your forked repository.
+
+## Set Up Access to Relevant Services
+Please ensure that you have the appropriate permissions and credentials to avoid any issues during the contribution process. This includes the Azure DevOps project, repository, pipelines, and your Azure subscription. If your contribution requires access to Azure Machine Learning compute, make sure you have the necessary permissions and access before proceeding with your changes.
+
+## Install Dependencies and Validate Environment
+Before making changes, ensure that you have installed all the dependencies required for the project. These include Conda, Python 3.8 (ideally), the azureml sdk v2, and supporting tools. Validate that your development environment is set up correctly and meets the project's requirements.
+
+## Create a Branch
+Create a new branch for your contribution. It's important to create a new branch for each contribution to keep the main branch clean and stable. You can create a new branch using the following command:
+```
+git checkout -b [branch-name]
+```
+
+Replace [branch-name] with a descriptive name for your branch that indicates the purpose of your contribution.
+
+## Make Changes
+Now it's time to make your changes! Follow the coding style and guidelines of the project, and thoroughly test your changes in your local environment. Ensure that your changes do not introduce any errors or break the existing functionality. Be sure to add appropriate comments and documentation as needed.
+
+## Validate code changes
+Before submitting your contribution, it's crucial to validate your changes by building and testing the project in your environment. This includes running code quality checks, linting, unit tests, and the MLOps CI/CD and AzureML pipelines (including training scripts), as well as any other validation processes. Make sure that your changes do not cause any build failures or test errors.
+
+## Commit and Push Changes
+Once you're confident with your changes, commit your changes and push them to your forked repository using the following commands:
+
+```
+git add .
+git commit -m "Your commit message here"
+git push origin [branch-name]
+```
+Replace [branch-name] with the name of your branch.
+
+## Create a Pull Request
+Go to the original [Your Repository Name] repository on GitHub and click on the "New Pull Request" button. Select your branch from the base and compare branches drop-down menus. Review your changes and provide a descriptive title and detailed description for your pull request. Include relevant information, such as the purpose of your contribution, the changes made, and any necessary context. Click on the "Create Pull Request" button to submit your contribution.
+
+## Validate Builds and Tests
+After the PR is created, build validation must pass before the code can be merged into the target develop branch. Any feedback from build validation must be addressed; otherwise, the PR will not be merged into the target develop branch.
+
+## Review and Address Feedback
+Your pull request will be reviewed by the repository maintainers, and they may provide feedback or request changes. Be sure to monitor your pull request and address any feedback in a timely manner. This may involve making additional changes, providing clarification, or addressing any issues raised during the review process.
+
+## Follow Code of Conduct
+As a contributor, it's important to adhere to the project's code of conduct. Make sure to follow the project's guidelines, respect the contributions of others, and avoid any inappropriate behavior. Additionally, ensure that your contribution does not violate any copyright or intellectual property rights.
+
+## Merge and Close
+Once your contribution has been approved and all feedback has been addressed, your changes will be merged into the develop branch. After the changes have been merged, your contribution will be credited and acknowledged in the project's documentation or contributors list. Your pull request will then be closed, and your contribution will become part of the project's codebase.
+
+Congratulations! You have successfully contributed to dstoolkit-text2sql-and-imageprocessing. Thank you for your valuable contribution and for following the contribution guidelines.
+
+If you have any questions or need further assistance, feel free to reach out to the repository maintainers or the project's team channel for support.
+
+Happy contributing!
+
 ## Contributing
 
 This project welcomes contributions and suggestions. Most contributions require you to agree to a

From 61794f8678b21d2aeb6e86c01a6abfcb4b23047a Mon Sep 17 00:00:00 2001
From: priyal1508 <54278892+priyal1508@users.noreply.github.com>
Date: Thu, 19 Sep 2024 15:44:10 +0530
Subject: [PATCH 2/5] changes to contributing md

---
 CONTRIBUTING.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8a07f92..31ef3d8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,10 +15,10 @@ git clone https://github.com/[your-github-username]/[your-repository-name].git
 Make sure to replace [your-github-username] with your actual GitHub username and [your-repository-name] with the name of your forked repository.
 
 ## Set Up Access to Relevant Services
-Please ensure that you have the appropriate permissions and credentials to avoid any issues during the contribution process. This includes the Azure DevOps project, repository, pipelines, and your Azure subscription. If your contribution requires access to Azure Machine Learning compute, make sure you have the necessary permissions and access before proceeding with your changes.
+Please ensure that you have the appropriate permissions and credentials to avoid any issues during the contribution process. This includes the Azure DevOps project, repository, pipelines, and your Azure subscription.
 
 ## Install Dependencies and Validate Environment
-Before making changes, ensure that you have installed all the dependencies required for the project. These include Conda, Python 3.8 (ideally), the azureml sdk v2, and supporting tools. Validate that your development environment is set up correctly and meets the project's requirements.
+Before making changes, ensure that you have installed all the dependencies required for the project. These include Conda, Python 3.8 (ideally), and supporting tools. Validate that your development environment is set up correctly and meets the project's requirements.
 
 ## Create a Branch
 Create a new branch for your contribution. It's important to create a new branch for each contribution to keep the main branch clean and stable. You can create a new branch using the following command:
 ```
 git checkout -b [branch-name]
 ```
@@ -32,7 +32,7 @@ Replace [branch-name] with a descriptive name for your branch that indicates the
 ## Make Changes
 Now it's time to make your changes! Follow the coding style and guidelines of the project, and thoroughly test your changes in your local environment. Ensure that your changes do not introduce any errors or break the existing functionality. Be sure to add appropriate comments and documentation as needed.
 
 ## Validate code changes
-Before submitting your contribution, it's crucial to validate your changes by building and testing the project in your environment. This includes running code quality checks, linting, unit tests, and the MLOps CI/CD and AzureML pipelines (including training scripts), as well as any other validation processes. Make sure that your changes do not cause any build failures or test errors.
+Before submitting your contribution, it's crucial to validate your changes by building and testing the project in your environment. This includes running code quality checks, linting, and unit tests (including training scripts), as well as any other validation processes. Make sure that your changes do not cause any build failures or test errors.
 
 ## Commit and Push Changes
 Once you're confident with your changes, commit your changes and push them to your forked repository using the following commands:
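The patch below corrects how the blob client is constructed when a connection string is supplied: a connection string of the form `AccountName=...;AccountKey=...` is not an account URL, so passing it as `account_url` yields a client that cannot authenticate. A minimal sketch of the two construction paths, assuming a hypothetical `StorageAccount__Endpoint` variable for the managed-identity case:

```python
import os

from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient


def get_blob_service_client() -> BlobServiceClient:
    # Managed-identity path: the endpoint is a genuine account URL,
    # e.g. https://<account>.blob.core.windows.net
    endpoint = os.environ.get("StorageAccount__Endpoint")  # hypothetical name
    if endpoint is not None:
        return BlobServiceClient(
            account_url=endpoint, credential=DefaultAzureCredential()
        )

    # Connection-string path: the value must be parsed by
    # from_connection_string rather than treated as a URL.
    connection_string = os.environ.get("StorageAccount__ConnectionString")
    return BlobServiceClient.from_connection_string(connection_string)
```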
From ade699781a0ba448601845f0f7b81fd46460c082 Mon Sep 17 00:00:00 2001
From: priyal1508 <54278892+priyal1508@users.noreply.github.com>
Date: Mon, 23 Sep 2024 10:46:19 +0530
Subject: [PATCH 3/5] changes to storage account

---
 adi_function_app/storage_account.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/adi_function_app/storage_account.py b/adi_function_app/storage_account.py
index 015a144..cdad5fb 100644
--- a/adi_function_app/storage_account.py
+++ b/adi_function_app/storage_account.py
@@ -27,7 +27,7 @@ async def get_client(self):
             return BlobServiceClient(account_url=endpoint, credential=credential)
         else:
             endpoint = os.environ.get("StorageAccount__ConnectionString")
-            return BlobServiceClient(account_url=endpoint)
+            return BlobServiceClient.from_connection_string(endpoint)
 
     async def add_metadata_to_blob(
         self, source: str, container: str, metadata: dict

From f8808269158e7581afc224ec6a737e1a834ca286 Mon Sep 17 00:00:00 2001
From: priyal1508 <54278892+priyal1508@users.noreply.github.com>
Date: Mon, 23 Sep 2024 19:30:16 +0530
Subject: [PATCH 4/5] changes for comparison with old rag

---
 adi_function_app/function_app.py      |  39 ++
 adi_function_app/ocr.py               |  82 ++++
 deploy_ai_search/ai_search.py         | 107 ++++-
 deploy_ai_search/deploy.py            |   3 +-
 deploy_ai_search/environment.py       |   9 +
 deploy_ai_search/rag_documents.py     |   1 +
 deploy_ai_search/rag_documents_old.py | 339 +++++++++++++
 local_test/adi_2_ai_search.py         | 656 ++++++++++++++++++++++++++
 8 files changed, 1234 insertions(+), 2 deletions(-)
 create mode 100644 adi_function_app/ocr.py
 create mode 100644 deploy_ai_search/rag_documents_old.py
 create mode 100644 local_test/adi_2_ai_search.py

diff --git a/adi_function_app/function_app.py b/adi_function_app/function_app.py
index cca6005..8a66f5a 100644
--- a/adi_function_app/function_app.py
+++ b/adi_function_app/function_app.py
@@ -8,6 +8,7 @@
 from adi_2_ai_search import process_adi_2_ai_search
 from pre_embedding_cleaner import process_pre_embedding_cleaner
 from key_phrase_extraction import process_key_phrase_extraction
+from ocr import process_ocr
 
 logging.basicConfig(level=logging.DEBUG)
 app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION)
@@ -124,3 +125,41 @@ async def key_phrase_extractor(req: func.HttpRequest) -> func.HttpResponse:
         status_code=200,
         mimetype="application/json",
     )
+
+@app.route(route="ocr", methods=[func.HttpMethod.POST])
+async def ocr(req: func.HttpRequest) -> func.HttpResponse:
+    """HTTP trigger for the OCR extraction function.
+
+    Args:
+        req (func.HttpRequest): The HTTP request object.
+
+    Returns:
+        func.HttpResponse: The HTTP response object."""
+    logging.info("Python HTTP trigger OCR function processed a request.")
+
+    try:
+        req_body = req.get_json()
+        values = req_body.get("values")
+        logging.info(req_body)
+    except ValueError:
+        return func.HttpResponse(
+            "Please pass a valid Custom Skill Payload in the request body", status_code=400
+        )
+    else:
+        logging.debug("Input Values: %s", values)
+
+        record_tasks = []
+
+        for value in values:
+            record_tasks.append(
+                asyncio.create_task(process_ocr(value))
+            )
+
+        results = await asyncio.gather(*record_tasks)
+        logging.debug("Results: %s", results)
+
+        return func.HttpResponse(
+            json.dumps({"values": results}),
+            status_code=200,
+            mimetype="application/json",
+        )
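For reference, the new `/ocr` route follows the same custom-skill contract as the other functions in this app: the indexer POSTs a `values` array of records, and each record comes back with the extracted text under `data`. A minimal sketch of the request and response shapes, with illustrative values only:

```python
# Request body sent by the AI Search indexer to the /ocr custom skill.
sample_request = {
    "values": [
        {
            "recordId": "0",
            "data": {"image": {"url": "https://example.com/sample-page.png"}},
        }
    ]
}

# Response body assembled from the process_ocr results below; on failure,
# "data" is empty and an "errors" list carries the message instead.
sample_response = {
    "values": [
        {
            "recordId": "0",
            "data": {"text": "text extracted from the image"},
        }
    ]
}
```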
diff --git a/adi_function_app/ocr.py b/adi_function_app/ocr.py
new file mode 100644
index 0000000..dae85df
--- /dev/null
+++ b/adi_function_app/ocr.py
@@ -0,0 +1,82 @@
+import logging
+import os
+from azure.ai.vision.imageanalysis.aio import ImageAnalysisClient
+from azure.ai.vision.imageanalysis.models import VisualFeatures
+from azure.core.credentials import AzureKeyCredential
+
+
+async def process_ocr(record: dict) -> dict:
+    """Extract text from an image with the Azure AI Vision read (OCR) feature.
+
+    Args:
+        record (dict): The custom skill record holding the image URL.
+
+    Returns:
+        dict: The custom skill record response carrying the extracted text."""
+    logging.info("Python HTTP trigger function processed a request.")
+
+    try:
+        url = record["data"]["image"]["url"]
+        logging.info(f"Request Body: {record}")
+    except KeyError:
+        return {
+            "recordId": record["recordId"],
+            "data": {},
+            "errors": [
+                {
+                    "message": "Failed to extract data with OCR. Pass a valid source in the request body.",
+                }
+            ],
+            "warnings": None,
+        }
+    else:
+        logging.info(f"image url: {url}")
+
+    if url is not None:
+        try:
+            # Note: the client must not be followed by a trailing comma,
+            # which would turn it into a single-element tuple.
+            client = ImageAnalysisClient(
+                endpoint=os.environ["AIService__Services__Endpoint"],
+                credential=AzureKeyCredential(os.environ["AIService__Services__Key"]),
+            )
+            async with client:
+                result = await client.analyze_from_url(
+                    image_url=url, visual_features=[VisualFeatures.READ]
+                )
+
+            # Extract text from OCR results; fall back to an empty string
+            # when the image contains no recognizable text.
+            if result.read is not None and result.read.blocks:
+                text = " ".join([line.text for line in result.read.blocks[0].lines])
+            else:
+                text = ""
+            logging.info(text)
+
+        except KeyError as e:
+            logging.error(f"Failed to authenticate with OCR: {e}")
+            return {
+                "recordId": record["recordId"],
+                "data": {},
+                "errors": [
+                    {
+                        "message": f"Failed to authenticate with OCR. Check that the service credentials exist. {e}",
+                    }
+                ],
+                "warnings": None,
+            }
+        except Exception as e:
+            logging.error(f"Failed to analyze the image with Azure AI Vision: {e}")
+            logging.error(getattr(e, "inner_exception", None))
+            return {
+                "recordId": record["recordId"],
+                "data": {},
+                "errors": [
+                    {
+                        "message": f"Failed to analyze the image with OCR. Check the source and try again. {e}",
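The `ai_search.py` additions below wire this OCR output back into the document with the merge skill, whose input names appear in the code: the document `text`, the OCR'd strings as `itemsToInsert`, and the image `offsets`. A rough sketch of that contract with illustrative values (the skill also wraps insertions in configurable pre/post tags, so exact spacing may differ):

```python
# Illustrative merge skill inputs: document text, the OCR'd strings to
# insert, and the character offset at which each image appeared.
merge_inputs = {
    "text": "See  for the full figures.",
    "itemsToInsert": ["chart: revenue up 12% quarter over quarter"],
    "offsets": [4],
}

# The skill splices each item into "text" at its offset and emits a single
# mergedText value, which the skillset maps to the merged_content field.
merge_output = {
    "mergedText": "See chart: revenue up 12% quarter over quarter for the full figures."
}
```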
+                    }
+                ],
+                "warnings": None,
+            }
+    else:
+        return {
+            "recordId": record["recordId"],
+            "data": {"text": ""},
+        }
+
+    return {
+        "recordId": record["recordId"],
+        "data": {"text": text},
+    }
diff --git a/deploy_ai_search/ai_search.py b/deploy_ai_search/ai_search.py
index 0ea69ff..45b456f 100644
--- a/deploy_ai_search/ai_search.py
+++ b/deploy_ai_search/ai_search.py
@@ -24,6 +24,9 @@
     InputFieldMappingEntry,
     SynonymMap,
     SplitSkill,
+    DocumentExtractionSkill,
+    OcrSkill,
+    MergeSkill,
     SearchIndexerIndexProjections,
     BlobIndexerParsingMode,
 )
@@ -147,7 +150,7 @@ def get_indexer(self) -> SearchIndexer:
         return None
 
     def get_index_projections(self) -> SearchIndexerIndexProjections:
-        """Get the index projections for the indexer."""
+        """Get the index projections for the indexer."""
 
         return None
 
@@ -420,6 +423,108 @@ def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill:
 
         return key_phrase_extraction_skill
 
+    def get_document_extraction_skill(self, context, source) -> DocumentExtractionSkill:
+        """Get the document extraction utility skill.
+
+        Args:
+        -----
+            context (str): The context of the skill
+            source (str): The source of the skill
+
+        Returns:
+        --------
+            DocumentExtractionSkill: The document extraction utility skill"""
+
+        doc_extraction_skill = DocumentExtractionSkill(
+            description="Extraction skill to extract content from office docs like excel, ppt, doc etc",
+            context=context,
+            inputs=[InputFieldMappingEntry(name="file_data", source=source)],
+            outputs=[
+                OutputFieldMappingEntry(
+                    name="content", target_name="extracted_content"
+                ),
+                OutputFieldMappingEntry(
+                    name="normalized_images", target_name="extracted_normalized_images"
+                ),
+            ],
+        )
+
+        return doc_extraction_skill
+
+    def get_ocr_skill(self, context, source) -> WebApiSkill:
+        """Get the OCR custom skill.
+
+        Args:
+        -----
+            context (str): The context of the skill
+            source (str): The source of the skill
+
+        Returns:
+        --------
+            WebApiSkill: The custom skill that calls the OCR function app"""
+
+        if self.test:
+            batch_size = 2
+            degree_of_parallelism = 2
+        else:
+            batch_size = 2
+            degree_of_parallelism = 2
+
+        ocr_skill_inputs = [
+            InputFieldMappingEntry(name="image", source=source),
+        ]
+        ocr_skill_outputs = [OutputFieldMappingEntry(name="text", target_name="text")]
+        ocr_skill = WebApiSkill(
+            name="ocr API",
+            description="Skill to extract text from images",
+            context=context,
+            uri=self.environment.get_custom_skill_function_url("ocr"),
+            timeout="PT230S",
+            batch_size=batch_size,
+            degree_of_parallelism=degree_of_parallelism,
+            http_method="POST",
+            inputs=ocr_skill_inputs,
+            outputs=ocr_skill_outputs,
+        )
+
+        if self.environment.identity_type != IdentityType.KEY:
+            ocr_skill.auth_identity = (
+                self.environment.function_app_app_registration_resource_id
+            )
+
+        if self.environment.identity_type == IdentityType.USER_ASSIGNED:
+            ocr_skill.auth_identity = (
+                self.environment.ai_search_user_assigned_identity
+            )
+
+        return ocr_skill
+
+    def get_merge_skill(self, context, source) -> MergeSkill:
+        """Get the merge skill.
+
+        Args:
+        -----
+            context (str): The context of the skill
+            source (list): The sources of the skill
+
+        Returns:
+        --------
+            MergeSkill: The merge skill"""
+
+        merge_skill = MergeSkill(
+            description="Merge skill for combining OCR'd and regular text",
+            context=context,
+            inputs=[
+                InputFieldMappingEntry(name="text", source=source[0]),
+                InputFieldMappingEntry(name="itemsToInsert", source=source[1]),
+                InputFieldMappingEntry(name="offsets", source=source[2]),
+            ],
+            outputs=[
OutputFieldMappingEntry(name="mergedText", target_name="merged_content") + ], + ) + + return merge_skill + + def get_vector_search(self) -> VectorSearch: """Get the vector search configuration for compass. diff --git a/deploy_ai_search/deploy.py b/deploy_ai_search/deploy.py index 3288ebf..abb4fce 100644 --- a/deploy_ai_search/deploy.py +++ b/deploy_ai_search/deploy.py @@ -1,7 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. import argparse -from rag_documents import RagDocumentsAISearch +# from rag_documents import RagDocumentsAISearch +from rag_documents_old import RagDocumentsAISearch from text_2_sql import Text2SqlAISearch from text_2_sql_query_cache import Text2SqlQueryCacheAISearch diff --git a/deploy_ai_search/environment.py b/deploy_ai_search/environment.py index 3e47955..a69b2c4 100644 --- a/deploy_ai_search/environment.py +++ b/deploy_ai_search/environment.py @@ -217,6 +217,13 @@ def function_app_key_phrase_extractor_route(self) -> str: This function returns function app keyphrase extractor name """ return os.environ.get("FunctionApp__KeyPhraseExtractor__FunctionName") + + @property + def function_app_key_ocr_route(self) -> str: + """ + This function returns function app keyphrase extractor name + """ + return os.environ.get("FunctionApp__Ocr__FunctionName") @property def open_ai_embedding_dimensions(self) -> str: @@ -249,6 +256,8 @@ def get_custom_skill_function_url(self, skill_type: str): route = self.function_app_adi_route elif skill_type == "key_phrase_extraction": route = self.function_app_key_phrase_extractor_route + elif skill_type == "ocr": + route = self.function_app_key_ocr_route else: raise ValueError(f"Invalid skill type: {skill_type}") diff --git a/deploy_ai_search/rag_documents.py b/deploy_ai_search/rag_documents.py index dba2645..73bbcc8 100644 --- a/deploy_ai_search/rag_documents.py +++ b/deploy_ai_search/rag_documents.py @@ -162,6 +162,7 @@ def get_skills(self) -> list: Returns: list: The skillsets used in the indexer""" + adi_skill = self.get_adi_skill(self.enable_page_by_chunking) diff --git a/deploy_ai_search/rag_documents_old.py b/deploy_ai_search/rag_documents_old.py new file mode 100644 index 0000000..f4b114d --- /dev/null +++ b/deploy_ai_search/rag_documents_old.py @@ -0,0 +1,339 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from azure.search.documents.indexes.models import ( + SearchFieldDataType, + SearchField, + SearchableField, + SemanticField, + SemanticPrioritizedFields, + SemanticConfiguration, + SemanticSearch, + InputFieldMappingEntry, + SearchIndexer, + FieldMapping, + IndexingParameters, + IndexingParametersConfiguration, + SearchIndexerIndexProjections, + SearchIndexerIndexProjectionSelector, + SearchIndexerIndexProjectionsParameters, + IndexProjectionMode, + SimpleField, + ComplexField, + BlobIndexerDataToExtract, + IndexerExecutionEnvironment, +) +from ai_search import AISearch +from environment import ( + IndexerType, +) + + +class RagDocumentsAISearch(AISearch): + """This class is used to deploy the rag document index.""" + + def __init__( + self, + suffix: str | None = None, + rebuild: bool | None = False, + enable_page_by_chunking=False, + ): + """Initialize the RagDocumentsAISearch class. This class implements the deployment of the rag document index. + + Args: + suffix (str, optional): The suffix for the indexer. Defaults to None. If an suffix is provided, it is assumed to be a test indexer. + rebuild (bool, optional): Whether to rebuild the index. Defaults to False. 
+ """ + self.indexer_type = IndexerType.RAG_DOCUMENTS + super().__init__(suffix, rebuild) + + if enable_page_by_chunking is not None: + self.enable_page_by_chunking = enable_page_by_chunking + else: + self.enable_page_by_chunking = False + + def get_index_fields(self) -> list[SearchableField]: + """This function returns the index fields for rag document. + + Returns: + list[SearchableField]: The index fields for rag document""" + + fields = [ + SimpleField(name="Id", type=SearchFieldDataType.String, filterable=True), + SearchableField( + name="Title", type=SearchFieldDataType.String, filterable=True + ), + SearchableField( + name="ChunkId", + type=SearchFieldDataType.String, + key=True, + analyzer_name="keyword", + ), + SearchableField( + name="Chunk", + type=SearchFieldDataType.String, + sortable=False, + filterable=False, + facetable=False, + ), + SearchableField( + name="Sections", + type=SearchFieldDataType.String, + collection=True, + ), + SearchField( + name="ChunkEmbedding", + type=SearchFieldDataType.Collection(SearchFieldDataType.Single), + vector_search_dimensions=self.environment.open_ai_embedding_dimensions, + vector_search_profile_name=self.vector_search_profile_name, + ), + SearchableField( + name="Keywords", type=SearchFieldDataType.String, collection=True + ), + SearchableField( + name="SourceUri", + type=SearchFieldDataType.String, + sortable=True, + filterable=True, + facetable=True, + ), + ComplexField( + name="Figures", + collection=True, + fields=[ + SimpleField( + name="FigureId", + type=SearchFieldDataType.String, + collection=True, + ), + SimpleField( + name="FigureUri", + type=SearchFieldDataType.String, + collection=True, + ), + ], + ), + SimpleField( + name="DateLastModified", + type=SearchFieldDataType.DateTimeOffset, + filterable=True, + ), + ] + + if self.enable_page_by_chunking: + fields.extend( + [ + SearchableField( + name="PageNumber", + type=SearchFieldDataType.Int64, + sortable=True, + filterable=True, + facetable=True, + ) + ] + ) + + return fields + + def get_semantic_search(self) -> SemanticSearch: + """This function returns the semantic search configuration for rag document + + Returns: + SemanticSearch: The semantic search configuration""" + + semantic_config = SemanticConfiguration( + name=self.semantic_config_name, + prioritized_fields=SemanticPrioritizedFields( + title_field=SemanticField(field_name="Title"), + content_fields=[SemanticField(field_name="Chunk")], + keywords_fields=[ + SemanticField(field_name="Keywords"), + SemanticField(field_name="Sections"), + ], + ), + ) + + semantic_search = SemanticSearch(configurations=[semantic_config]) + + return semantic_search + + def get_skills(self) -> list: + """Get the skillset for the indexer. 
+ + Returns: + list: The skillsets used in the indexer""" + + adi_skill = self.get_adi_skill(self.enable_page_by_chunking) + + + doc_extraction_skill = self.get_document_extraction_skill( + "/document", "/document/file_data" + ) + + ocr_skill = self.get_ocr_skill( + "/document/normalized_images/*", "/document/normalized_images/*" + ) + + merge_skill = self.get_merge_skill( + "/document", + [ + "/document/content", + "/document/normalized_images/*/text", + "/document/normalized_images/*/contentOffset", + ], + ) + + text_split_skill = self.get_text_split_skill( + "/document", "/document/merged_content/content" + ) + + pre_embedding_cleaner_skill = self.get_pre_embedding_cleaner_skill( + "/document/pages/*", "/document/pages/*" + ) + + key_phrase_extraction_skill = self.get_key_phrase_extraction_skill( + "/document/pages/*", "/document/pages/*/cleanedChunk" + ) + + embedding_skill = self.get_vector_skill( + "/document/pages/*", "/document/pages/*/cleanedChunk" + ) + + if self.enable_page_by_chunking: + skills = [ + adi_skill, + pre_embedding_cleaner_skill, + key_phrase_extraction_skill, + embedding_skill, + ] + else: + skills = [ + doc_extraction_skill, + ocr_skill, + merge_skill, + text_split_skill, + pre_embedding_cleaner_skill, + key_phrase_extraction_skill, + embedding_skill, + ] + + return skills + + def get_index_projections(self) -> SearchIndexerIndexProjections: + """This function returns the index projections for rag document.""" + mappings = [ + InputFieldMappingEntry(name="Chunk", source="/document/pages/*/chunk"), + InputFieldMappingEntry( + name="ChunkEmbedding", + source="/document/pages/*/vector", + ), + InputFieldMappingEntry(name="Title", source="/document/Title"), + InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"), + InputFieldMappingEntry( + name="Keywords", source="/document/pages/*/keywords" + ), + InputFieldMappingEntry( + name="Sections", source="/document/pages/*/cleanedSections" + ), + InputFieldMappingEntry( + name="Figures", + source_context="/document/pages/*/figures/*", + inputs=[ + InputFieldMappingEntry( + name="FigureId", source="/document/pages/*/figures/*/figureId" + ), + InputFieldMappingEntry( + name="FigureUri", source="/document/pages/*/figures/*/figureUri" + ), + ], + ), + InputFieldMappingEntry( + name="DateLastModified", source="/document/DateLastModified" + ), + ] + + if self.enable_page_by_chunking: + mappings.extend( + [ + InputFieldMappingEntry( + name="PageNumber", source="/document/pages/*/pageNumber" + ) + ] + ) + + index_projections = SearchIndexerIndexProjections( + selectors=[ + SearchIndexerIndexProjectionSelector( + target_index_name=self.index_name, + parent_key_field_name="Id", + source_context="/document/pages/*", + mappings=mappings, + ), + ], + parameters=SearchIndexerIndexProjectionsParameters( + projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS + ), + ) + + return index_projections + + def get_indexer(self) -> SearchIndexer: + """This function returns the indexer for rag document. 
+ + Returns: + SearchIndexer: The indexer for rag document""" + + # Only place on schedule if it is not a test deployment + if self.test: + schedule = None + batch_size = 4 + else: + schedule = {"interval": "PT15M"} + batch_size = 16 + + if self.environment.use_private_endpoint: + execution_environment = IndexerExecutionEnvironment.PRIVATE + else: + execution_environment = IndexerExecutionEnvironment.STANDARD + + indexer_parameters = IndexingParameters( + batch_size=batch_size, + configuration=IndexingParametersConfiguration( + data_to_extract=BlobIndexerDataToExtract.ALL_METADATA, + query_timeout=None, + allow_skillset_to_read_file_data=True, + execution_environment=execution_environment, + fail_on_unprocessable_document=False, + fail_on_unsupported_content_type=False, + index_storage_metadata_only_for_oversized_documents=True, + excluded_file_name_extensions=".png,.jpg,.jpeg,.xlsx", + indexed_file_name_extensions=".pdf,.pptx,.docx,.txt", + parsing_mode=self.parsing_mode, + ), + max_failed_items=5, + ) + + indexer = SearchIndexer( + name=self.indexer_name, + description="Indexer to index documents and generate embeddings", + skillset_name=self.skillset_name, + target_index_name=self.index_name, + data_source_name=self.data_source_name, + schedule=schedule, + field_mappings=[ + FieldMapping( + source_field_name="metadata_storage_name", target_field_name="Title" + ), + FieldMapping( + source_field_name="metadata_storage_path", + target_field_name="SourceUri", + ), + FieldMapping( + source_field_name="metadata_storage_last_modified", + target_field_name="DateLastModified", + ), + ], + parameters=indexer_parameters, + ) + + return indexer diff --git a/local_test/adi_2_ai_search.py b/local_test/adi_2_ai_search.py new file mode 100644 index 0000000..7e8ff27 --- /dev/null +++ b/local_test/adi_2_ai_search.py @@ -0,0 +1,656 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +from azure.identity import DefaultAzureCredential, get_bearer_token_provider +import base64 +from azure.core.credentials import AzureKeyCredential +from azure.ai.documentintelligence.aio import DocumentIntelligenceClient +from azure.ai.documentintelligence.models import ( + AnalyzeResult, + ContentFormat, + AnalyzeOutputOption, +) +import os +import re +import asyncio +import logging +from storage_account import StorageAccountHelper +import concurrent.futures +import json +from openai import AsyncAzureOpenAI +from typing import Union +import openai +from environment import IdentityType, get_identity_type + + +def build_and_clean_markdown_for_response( + markdown_text: str, + figures: dict, + page_no: int = None, + remove_irrelevant_figures=False, +): + """Clean Markdown text extracted by the Azure Document Intelligence service. + + Args: + ----- + markdown_text (str): The original Markdown text. + remove_irrelevant_figures (bool): Whether to remove all figures or just irrelevant ones. + + Returns: + -------- + str: The cleaned Markdown text. 
+ """ + + output_dict = {} + comment_patterns = r"|||" + cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL) + + # Remove irrelevant figures + if remove_irrelevant_figures: + irrelevant_figure_pattern = r"\s*" + cleaned_text = re.sub( + irrelevant_figure_pattern, "", cleaned_text, flags=re.DOTALL + ) + + logging.info(f"Cleaned Text: {cleaned_text}") + + markdown_without_figure_content = re.sub( + r"", "", cleaned_text, flags=re.DOTALL + ) + + combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n" + doc_metadata = re.findall( + combined_pattern, markdown_without_figure_content, re.DOTALL + ) + doc_metadata = [match for group in doc_metadata for match in group if match] + + output_dict["content"] = cleaned_text + output_dict["sections"] = doc_metadata + + output_dict["figures"] = figures + + # add page number when chunk by page is enabled + if page_no is not None: + output_dict["pageNumber"] = page_no + + return output_dict + + +def update_figure_description( + md_content: str, figure_id: str, img_description: str, offset: int, length: int +): + """ + Updates the figure description in the Markdown content. + + Args: + md_content (str): The original Markdown content. + img_description (str): The new description for the image. + offset (int): Position offset in the text. + length (int): Length of the original figure in the text. + + Returns: + str: The updated Markdown content with the new figure description. + """ + + # Define the new string to replace the old content + new_string = f'' + + # Calculate the end index of the content to be replaced + end_index = offset + length + + # Ensure that the end_index does not exceed the length of the Markdown content + if end_index > len(md_content): + end_index = len(md_content) + + # Replace the old string with the new string + new_md_content = md_content[:offset] + new_string + md_content[end_index:] + + return new_md_content, len(new_string) + + +async def understand_image_with_gptv(image_base64, caption, tries_left=3): + """ + Generates a description for an image using the GPT-4V model. + + Parameters: + - image_base64 (str): image file. + - caption (str): The caption for the image. + + Returns: + - img_description (str): The generated description for the image. + """ + + MAX_TOKENS = 2000 + api_version = os.environ["OpenAI__ApiVersion"] + model = os.environ["OpenAI__MultiModalDeployment"] + + if get_identity_type() == IdentityType.SYSTEM_ASSIGNED: + token_provider = get_bearer_token_provider( + DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default" + ) + api_key = None + elif get_identity_type() == IdentityType.USER_ASSIGNED: + token_provider = get_bearer_token_provider( + DefaultAzureCredential( + managed_identity_client_id=os.environ["FunctionApp__ClientId"] + ), + "https://cognitiveservices.azure.com/.default", + ) + api_key = None + else: + token_provider = None + api_key = os.environ["OpenAI__ApiKey"] + + system_prompt = """You are an expert in technical image analysis. Your task is to provided analysis of images. You should FOCUS on what info can be inferred from the image and the meaning of the data inside the image. Draw actionable insights and conclusions from the image. Do not describe the image in a general way or describe the image in a way that is not useful for decision-making. + + If the image is a chart for instance, you should describe the data trends, patterns, and insights that can be drawn from the chart. 
For example, you could describe the increase or decrease in sales over time, the peak sales period, or the sales performance of a particular product. + + If the image is a map, you should describe the geographical features, landmarks, and any other relevant information that can be inferred from the map. + + If the image is a diagram, you should describe the components, relationships, and any other relevant information that can be inferred from the diagram. + + Include any data points, labels, and other relevant information that can be inferred from the image. + + Provide a well-structured, detailed, and actionable analysis of the image. Focus on extracting data and information that can be inferred from the image. + + IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'.""" + + user_input = "Perform technical analysis on this image. Provide a well-structured, description." + + if caption is not None and len(caption) > 0: + user_input += f" (note: it has the following caption: {caption})" + + try: + async with AsyncAzureOpenAI( + api_key=api_key, + api_version=api_version, + azure_ad_token_provider=token_provider, + azure_endpoint=os.environ.get("OpenAI__Endpoint"), + ) as client: + # We send both image caption and the image body to GPTv for better understanding + response = await client.chat.completions.create( + model=model, + messages=[ + { + "role": "system", + "content": system_prompt, + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": user_input, + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{image_base64}" + }, + }, + ], + }, + ], + max_tokens=MAX_TOKENS, + ) + + logging.info(f"Response: {response}") + + img_description = response.choices[0].message.content + + logging.info(f"Image Description: {img_description}") + + return img_description + except openai.RateLimitError as e: + logging.error("OpenAI Rate Limit Error: %s", e) + + if tries_left > 0: + logging.info( + "Retrying understanding of image with %s tries left.", tries_left + ) + remaining_tries = tries_left - 1 + backoff = 20 ** (3 - remaining_tries) + await asyncio.sleep(backoff) + return await understand_image_with_gptv( + image_base64, caption, tries_left=remaining_tries + ) + else: + raise Exception("OpenAI Rate Limit Error: No retries left.") from e + except (openai.OpenAIError, openai.APIConnectionError) as e: + logging.error("OpenAI Error: %s", e) + + raise Exception("OpenAI Connection Error: No retries left.") from e + + +async def download_figure_image( + model_id: str, operation_id: str, figure_id: str +) -> bytearray: + """Download the image associated with a figure extracted by the Azure Document Intelligence service. + + Args: + ----- + model_id (str): The model ID used for the analysis. + operation_id (str): The operation ID of the analysis. + figure_id (str): The ID of the figure to download. 
+ + Returns: + -------- + bytes: The image associated with the figure.""" + document_intelligence_client = await get_document_intelligence_client() + async with document_intelligence_client: + response = await document_intelligence_client.get_analyze_result_figure( + model_id=model_id, result_id=operation_id, figure_id=figure_id + ) + + full_bytes = bytearray() + async for chunk in response: + full_bytes.extend(chunk) + + return full_bytes + + +async def process_figures_from_extracted_content( + result: AnalyzeResult, + operation_id: str, + container_and_blob: str, + markdown_content: str, + page_number: None | int = None, + page_offset: int = 0, +) -> Union[str, dict]: + """Process the figures extracted from the content using ADI and send them for analysis. + + Args: + ----- + result (AnalyzeResult): The result of the document analysis. + operation_id (str): The operation ID of the analysis. + container_and_blob (str): The container and blob of the document. + markdown_content (str): The extracted content in Markdown format. + page_number (int): The page number to process. If None, all pages are processed. + page_offset (int): The offset of the page. + + Returns: + -------- + str: The updated Markdown content with the figure descriptions. + dict: A mapping of the FigureId to the stored Uri in blob storage.""" + + figure_processing_datas = [] + download_image_tasks = [] + figure_understanding_tasks = [] + figure_upload_tasks = [] + + if result.figures: + for figure in result.figures: + if figure.id is None: + continue + + for region in figure.bounding_regions: + if page_number is not None and region.page_number != page_number: + continue + + logging.info(f"Figure ID: {figure.id}") + download_image_tasks.append( + download_figure_image( + model_id=result.model_id, + operation_id=operation_id, + figure_id=figure.id, + ) + ) + + container, blob = container_and_blob + image_blob = f"{blob}/{figure.id}.png" + + caption = figure.caption.content if figure.caption is not None else None + + logging.info(f"Figure Caption: {caption}") + + figure_processing_datas.append( + (figure.id, container, image_blob, caption, figure.spans[0]) + ) + + break + + logging.info("Running image download tasks") + image_responses = await asyncio.gather(*download_image_tasks) + logging.info("Finished image download tasks") + + storage_account_helper = await get_storage_account_helper() + + for figure_processing_data, response in zip( + figure_processing_datas, image_responses + ): + _, container, image_blob, caption, _ = figure_processing_data + base_64_image = base64.b64encode(response).decode("utf-8") + + logging.info(f"Image Blob: {image_blob}") + + figure_understanding_tasks.append( + understand_image_with_gptv(base_64_image, caption) + ) + + image_data = base64.b64decode(base_64_image) + + figure_upload_tasks.append( + storage_account_helper.upload_blob( + container, image_blob, image_data, "image/png" + ) + ) + + figure_ids = [ + figure_processing_data[0] for figure_processing_data in figure_processing_datas + ] + logging.info("Running image understanding tasks") + figure_descriptions = await asyncio.gather(*figure_understanding_tasks) + logging.info("Finished image understanding tasks") + logging.info(f"Image Descriptions: {figure_descriptions}") + + logging.info("Running image upload tasks") + figure_uris = await asyncio.gather(*figure_upload_tasks) + logging.info("Finished image upload tasks") + + figures = [ + {"figureId": figure_id, "figureUri": figure_uri} + for figure_id, figure_uri in zip(figure_ids, 
figure_uris) + ] + + running_offset = 0 + for figure_processing_data, figure_description in zip( + figure_processing_datas, figure_descriptions + ): + figure_id, _, _, _, figure_span = figure_processing_data + starting_offset = figure_span.offset + running_offset - page_offset + markdown_content, desc_offset = update_figure_description( + markdown_content, + figure_id, + figure_description, + starting_offset, + figure_span.length, + ) + running_offset += desc_offset + + return markdown_content, figures + + +def create_page_wise_content(result: AnalyzeResult) -> list: + """Create a list of page-wise content extracted by the Azure Document Intelligence service. + + Args: + ----- + result (AnalyzeResult): The result of the document analysis. + + Returns: + -------- + list: A list of page-wise content extracted by the Azure Document Intelligence service. + """ + + page_wise_content = [] + page_numbers = [] + page_offsets = [] + + for page in result.pages: + page_content = result.content[ + page.spans[0]["offset"] : page.spans[0]["offset"] + page.spans[0]["length"] + ] + page_wise_content.append(page_content) + page_numbers.append(page.page_number) + page_offsets.append(page.spans[0]["offset"]) + + return page_wise_content, page_numbers, page_offsets + + +async def get_document_intelligence_client() -> DocumentIntelligenceClient: + """Get the Azure Document Intelligence client. + + Returns: + -------- + DocumentIntelligenceClient: The Azure Document Intelligence client.""" + if get_identity_type() == IdentityType.SYSTEM_ASSIGNED: + credential = DefaultAzureCredential() + elif get_identity_type() == IdentityType.USER_ASSIGNED: + credential = DefaultAzureCredential( + managed_identity_client_id=os.environ["FunctionApp__ClientId"] + ) + else: + credential = AzureKeyCredential( + os.environ["AIService__DocumentIntelligence__Key"] + ) + + return DocumentIntelligenceClient( + endpoint=os.environ["AIService__DocumentIntelligence__Endpoint"], + credential=credential, + ) + + +async def get_storage_account_helper() -> StorageAccountHelper: + """Get the Storage Account Helper. + + Returns: + -------- + StorageAccountHelper: The Storage Account Helper.""" + + return StorageAccountHelper() + + +async def analyse_document(file_path: str) -> tuple[AnalyzeResult, str]: + """Analyse a document using the Azure Document Intelligence service. + + Args: + ----- + file_path (str): The path to the document to analyse. + + Returns: + -------- + AnalyzeResult: The result of the document analysis. + str: The operation ID of the analysis. + """ + with open(file_path, "rb") as f: + file_read = f.read() + + document_intelligence_client = await get_document_intelligence_client() + async with document_intelligence_client: + poller = await document_intelligence_client.begin_analyze_document( + model_id="prebuilt-layout", + analyze_request=file_read, + output_content_format=ContentFormat.MARKDOWN, + output=[AnalyzeOutputOption.FIGURES], + content_type="application/octet-stream", + ) + + result = await poller.result() + + operation_id = poller.details["operation_id"] + + if result is None or result.content is None or result.pages is None: + raise ValueError( + "Failed to analyze the document with Azure Document Intelligence." + ) + + return result, operation_id + + +async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> dict: + """Process the extracted content from the Azure Document Intelligence service and prepare it for Azure Search. 
+ + Args: + ----- + record (dict): The record containing the extracted content. + chunk_by_page (bool): Whether to chunk the content by page. + + Returns: + -------- + dict: The processed content ready for Azure Search.""" + logging.info("Python HTTP trigger function processed a request.") + + storage_account_helper = await get_storage_account_helper() + + try: + source = record["data"]["source"] + logging.info(f"Request Body: {record}") + except KeyError: + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": "Failed to extract data with ADI. Pass a valid source in the request body.", + } + ], + "warnings": None, + } + else: + logging.info(f"Source: {source}") + + try: + source_parts = source.split("/") + blob = "/".join(source_parts[4:]) + logging.info(f"Blob: {blob}") + + container = source_parts[3] + + container_and_blob = (container, blob) + + file_extension = blob.split(".")[-1] + target_file_name = f"{record['recordId']}.{file_extension}" + + temp_file_path, _ = await storage_account_helper.download_blob_to_temp_dir( + blob, container, target_file_name + ) + logging.info(temp_file_path) + except Exception as e: + logging.error(f"Failed to download the blob: {e}") + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": f"Failed to download the blob. Check the source and try again. {e}", + } + ], + "warnings": None, + } + + try: + result, operation_id = await analyse_document(temp_file_path) + except Exception as e: + logging.error(e) + logging.info("Sleeping for 10 seconds and retrying") + await asyncio.sleep(10) + try: + result, operation_id = await analyse_document(temp_file_path) + except ValueError as inner_e: + logging.error(inner_e) + logging.error( + f"Failed to analyze the document with Azure Document Intelligence: {e}" + ) + logging.error( + "Failed to analyse %s with Azure Document Intelligence.", blob + ) + await storage_account_helper.add_metadata_to_blob( + blob, container, {"AzureSearch_Skip": "true"} + ) + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": f"Failed to analyze the document with Azure Document Intelligence. This blob will now be skipped {inner_e}", + } + ], + "warnings": None, + } + except Exception as inner_e: + logging.error(inner_e) + logging.error( + "Failed to analyse %s with Azure Document Intelligence.", blob + ) + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": f"Failed to analyze the document with Azure Document Intelligence. Check the logs and try again. 
{inner_e}", + } + ], + "warnings": None, + } + + try: + if chunk_by_page: + cleaned_result = [] + markdown_content, page_numbers, page_offsets = create_page_wise_content( + result + ) + content_with_figures_tasks = [ + process_figures_from_extracted_content( + result, + operation_id, + container_and_blob, + page_content, + page_number=page_number, + page_offset=page_offset, + ) + for page_content, page_number, page_offset in zip( + markdown_content, page_numbers, page_offsets + ) + ] + content_with_figures = await asyncio.gather(*content_with_figures_tasks) + + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = { + executor.submit( + build_and_clean_markdown_for_response, + extracted_page_content[0], + extracted_page_content[1], + page_number, + True, + ): extracted_page_content + for extracted_page_content, page_number in zip( + content_with_figures, page_numbers + ) + } + for future in concurrent.futures.as_completed(futures): + cleaned_result.append(future.result()) + + else: + markdown_content = result.content + + ( + extracted_content, + figures, + ) = await process_figures_from_extracted_content( + result, + operation_id, + container_and_blob, + markdown_content, + page_offset=0, + page_number=None, + ) + + cleaned_result = build_and_clean_markdown_for_response( + extracted_content, figures, remove_irrelevant_figures=True + ) + except Exception as e: + logging.error(e) + logging.error(f"Failed to process the extracted content: {e}") + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": f"Failed to process the extracted content. Check the logs and try again. {e}", + } + ], + "warnings": None, + } + + logging.info("Document Extracted") + logging.info(f"Result: {cleaned_result}") + + src = { + "recordId": record["recordId"], + "data": {"extracted_content": cleaned_result}, + } + + json_str = json.dumps(src, indent=4) + + logging.info(f"final output: {json_str}") + + return src From 3f70652a8f2265a23e4213d2304bd48184f03aa9 Mon Sep 17 00:00:00 2001 From: priyal1508 <54278892+priyal1508@users.noreply.github.com> Date: Tue, 24 Sep 2024 14:24:05 +0530 Subject: [PATCH 5/5] changes for old rag --- deploy_ai_search/rag_documents_old.py | 98 +++++++++++++-------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/deploy_ai_search/rag_documents_old.py b/deploy_ai_search/rag_documents_old.py index f4b114d..3d2698e 100644 --- a/deploy_ai_search/rag_documents_old.py +++ b/deploy_ai_search/rag_documents_old.py @@ -60,9 +60,9 @@ def get_index_fields(self) -> list[SearchableField]: fields = [ SimpleField(name="Id", type=SearchFieldDataType.String, filterable=True), - SearchableField( - name="Title", type=SearchFieldDataType.String, filterable=True - ), + # SearchableField( + # name="Title", type=SearchFieldDataType.String, filterable=True + # ), SearchableField( name="ChunkId", type=SearchFieldDataType.String, @@ -76,11 +76,11 @@ def get_index_fields(self) -> list[SearchableField]: filterable=False, facetable=False, ), - SearchableField( - name="Sections", - type=SearchFieldDataType.String, - collection=True, - ), + # SearchableField( + # name="Sections", + # type=SearchFieldDataType.String, + # collection=True, + # ), SearchField( name="ChunkEmbedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), @@ -97,22 +97,22 @@ def get_index_fields(self) -> list[SearchableField]: filterable=True, facetable=True, ), - ComplexField( - name="Figures", - collection=True, - fields=[ - SimpleField( - name="FigureId", - 
type=SearchFieldDataType.String, - collection=True, - ), - SimpleField( - name="FigureUri", - type=SearchFieldDataType.String, - collection=True, - ), - ], - ), + # ComplexField( + # name="Figures", + # collection=True, + # fields=[ + # SimpleField( + # name="FigureId", + # type=SearchFieldDataType.String, + # collection=True, + # ), + # SimpleField( + # name="FigureUri", + # type=SearchFieldDataType.String, + # collection=True, + # ), + # ], + # ), SimpleField( name="DateLastModified", type=SearchFieldDataType.DateTimeOffset, @@ -144,11 +144,11 @@ def get_semantic_search(self) -> SemanticSearch: semantic_config = SemanticConfiguration( name=self.semantic_config_name, prioritized_fields=SemanticPrioritizedFields( - title_field=SemanticField(field_name="Title"), + # title_field=SemanticField(field_name="Title"), content_fields=[SemanticField(field_name="Chunk")], keywords_fields=[ SemanticField(field_name="Keywords"), - SemanticField(field_name="Sections"), + # SemanticField(field_name="Sections"), ], ), ) @@ -171,20 +171,20 @@ def get_skills(self) -> list: ) ocr_skill = self.get_ocr_skill( - "/document/normalized_images/*", "/document/normalized_images/*" + "/document/extracted_normalized_images/*", "/document/extracted_normalized_images/*" ) - merge_skill = self.get_merge_skill( + merge_skill = self.get_merge_skill( "/document", [ - "/document/content", - "/document/normalized_images/*/text", - "/document/normalized_images/*/contentOffset", + "/document/extracted_content", + "/document/extracted_normalized_images/*/text", + "/document/extracted_normalized_images/*/contentOffset", ], ) text_split_skill = self.get_text_split_skill( - "/document", "/document/merged_content/content" + "/document", "/document/merged_content" ) pre_embedding_cleaner_skill = self.get_pre_embedding_cleaner_skill( @@ -199,7 +199,7 @@ def get_skills(self) -> list: "/document/pages/*", "/document/pages/*/cleanedChunk" ) - if self.enable_page_by_chunking: + if self.enable_page_by_chunking: skills = [ adi_skill, pre_embedding_cleaner_skill, @@ -227,26 +227,26 @@ def get_index_projections(self) -> SearchIndexerIndexProjections: name="ChunkEmbedding", source="/document/pages/*/vector", ), - InputFieldMappingEntry(name="Title", source="/document/Title"), + # InputFieldMappingEntry(name="Title", source="/document/Title"), InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"), InputFieldMappingEntry( name="Keywords", source="/document/pages/*/keywords" ), - InputFieldMappingEntry( - name="Sections", source="/document/pages/*/cleanedSections" - ), - InputFieldMappingEntry( - name="Figures", - source_context="/document/pages/*/figures/*", - inputs=[ - InputFieldMappingEntry( - name="FigureId", source="/document/pages/*/figures/*/figureId" - ), - InputFieldMappingEntry( - name="FigureUri", source="/document/pages/*/figures/*/figureUri" - ), - ], - ), + # InputFieldMappingEntry( + # name="Sections", source="/document/pages/*/cleanedSections" + # ), + # InputFieldMappingEntry( + # name="Figures", + # source_context="/document/pages/*/figures/*", + # inputs=[ + # InputFieldMappingEntry( + # name="FigureId", source="/document/pages/*/figures/*/figureId" + # ), + # InputFieldMappingEntry( + # name="FigureUri", source="/document/pages/*/figures/*/figureUri" + # ), + # ], + # ), InputFieldMappingEntry( name="DateLastModified", source="/document/DateLastModified" ),