
Commit f880826

committed changes for comparison with old rag
1 parent ade6997

File tree: 8 files changed (+1234, −2 lines)

adi_function_app/function_app.py

Lines changed: 39 additions & 0 deletions

@@ -8,6 +8,7 @@
 from adi_2_ai_search import process_adi_2_ai_search
 from pre_embedding_cleaner import process_pre_embedding_cleaner
 from key_phrase_extraction import process_key_phrase_extraction
+from ocr import process_ocr

 logging.basicConfig(level=logging.DEBUG)
 app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION)
@@ -124,3 +125,41 @@ async def key_phrase_extractor(req: func.HttpRequest) -> func.HttpResponse:
         status_code=200,
         mimetype="application/json",
     )
+
+
+@app.route(route="ocr", methods=[func.HttpMethod.POST])
+async def ocr(req: func.HttpRequest) -> func.HttpResponse:
+    """HTTP trigger for the OCR function.
+
+    Args:
+        req (func.HttpRequest): The HTTP request object.
+
+    Returns:
+        func.HttpResponse: The HTTP response object."""
+    logging.info("Python HTTP trigger OCR function processed a request.")
+
+    try:
+        req_body = req.get_json()
+        values = req_body.get("values")
+        logging.info(req_body)
+    except ValueError:
+        return func.HttpResponse(
+            "Please pass a valid Custom Skill Payload in the request body",
+            status_code=400,
+        )
+    else:
+        logging.debug("Input Values: %s", values)
+
+        record_tasks = []
+
+        for value in values:
+            record_tasks.append(asyncio.create_task(process_ocr(value)))
+
+        results = await asyncio.gather(*record_tasks)
+        logging.debug("Results: %s", results)
+
+        return func.HttpResponse(
+            json.dumps({"values": results}),
+            status_code=200,
+            mimetype="application/json",
+        )
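
As an aside (not part of the commit), a minimal sketch of the custom skill payload the new /api/ocr route expects; the function host name, function key, and image URL below are placeholders.

import requests

# Custom skill payload: one record per image, matching what process_ocr reads.
payload = {
    "values": [
        {
            "recordId": "1",
            "data": {"image": {"url": "https://example.com/sample-page.png"}},
        }
    ]
}

response = requests.post(
    "https://<function-app>.azurewebsites.net/api/ocr",  # placeholder host
    params={"code": "<function-key>"},  # placeholder function key
    json=payload,
    timeout=230,
)
print(response.json())  # {"values": [{"recordId": "1", "data": {"text": ...}}]}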

adi_function_app/ocr.py

Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
+import logging
+import os
+
+from azure.ai.vision.imageanalysis.aio import ImageAnalysisClient
+from azure.ai.vision.imageanalysis.models import VisualFeatures
+from azure.core.credentials import AzureKeyCredential
+
+
+async def process_ocr(record: dict) -> dict:
+    logging.info("Python HTTP trigger function processed a request.")
+
+    try:
+        url = record["data"]["image"]["url"]
+        logging.info(f"Request Body: {record}")
+    except KeyError:
+        return {
+            "recordId": record["recordId"],
+            "data": {},
+            "errors": [
+                {
+                    "message": "Failed to extract data with OCR. Pass a valid source in the request body.",
+                }
+            ],
+            "warnings": None,
+        }
+    else:
+        logging.info(f"image url: {url}")
+
+        if url is not None:
+            try:
+                # keyvault_helper = KeyVaultHelper()
+                client = ImageAnalysisClient(
+                    endpoint=os.environ["AIService__Services__Endpoint"],
+                    credential=AzureKeyCredential(
+                        os.environ["AIService__Services__Key"]
+                    ),
+                )
+                result = await client.analyze_from_url(
+                    image_url=url, visual_features=[VisualFeatures.READ]
+                )
+                logging.info("logging output")
+
+                # Extract text from OCR results
+                text = " ".join([line.text for line in result.read.blocks[0].lines])
+                logging.info(text)
+
+            except KeyError as e:
+                logging.error(e)
+                logging.error(f"Failed to authenticate with OCR: {e}")
+                return {
+                    "recordId": record["recordId"],
+                    "data": {},
+                    "errors": [
+                        {
+                            "message": f"Failed to authenticate with OCR. Check the service credentials exist. {e}",
+                        }
+                    ],
+                    "warnings": None,
+                }
+            except Exception as e:
+                logging.error(e)
+                logging.error(
+                    f"Failed to analyze the image with Azure AI Vision: {e}"
+                )
+                return {
+                    "recordId": record["recordId"],
+                    "data": {},
+                    "errors": [
+                        {
+                            "message": f"Failed to analyze the document with OCR. Check the source and try again. {e}",
+                        }
+                    ],
+                    "warnings": None,
+                }
+        else:
+            return {
+                "recordId": record["recordId"],
+                "data": {"text": ""},
+            }
+
+        return {
+            "recordId": record["recordId"],
+            "data": {"text": text},
+        }
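
For reference (not in the commit), a quick local exercise of process_ocr, assuming AIService__Services__Endpoint and AIService__Services__Key are already set; the image URL is a placeholder.

import asyncio

from ocr import process_ocr

# Record shape mirrors one entry of the custom skill payload; URL is a placeholder.
record = {
    "recordId": "1",
    "data": {"image": {"url": "https://example.com/scanned-page.png"}},
}

result = asyncio.run(process_ocr(record))
print(result["data"].get("text", ""))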

deploy_ai_search/ai_search.py

Lines changed: 106 additions & 1 deletion

@@ -24,6 +24,9 @@
     InputFieldMappingEntry,
     SynonymMap,
     SplitSkill,
+    DocumentExtractionSkill,
+    OcrSkill,
+    MergeSkill,
     SearchIndexerIndexProjections,
     BlobIndexerParsingMode,
 )
@@ -147,7 +150,7 @@ def get_indexer(self) -> SearchIndexer:
         return None

     def get_index_projections(self) -> SearchIndexerIndexProjections:
-        """Get the index projections for the indexer."""
+        """Get the index projections for the indexer."""

         return None

@@ -420,6 +423,108 @@ def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill:

         return key_phrase_extraction_skill

+    def get_document_extraction_skill(self, context, source) -> DocumentExtractionSkill:
+        """Get the document extraction utility skill.
+
+        Args:
+        -----
+            context (str): The context of the skill
+            source (str): The source of the skill
+
+        Returns:
+        --------
+            DocumentExtractionSkill: The document extraction utility skill"""
+
+        doc_extraction_skill = DocumentExtractionSkill(
+            description="Extraction skill to extract content from office docs like Excel, PPT, DOC, etc.",
+            context=context,
+            inputs=[InputFieldMappingEntry(name="file_data", source=source)],
+            outputs=[
+                OutputFieldMappingEntry(
+                    name="content", target_name="extracted_content"
+                ),
+                OutputFieldMappingEntry(
+                    name="normalized_images", target_name="extracted_normalized_images"
+                ),
+            ],
+        )
+
+        return doc_extraction_skill
+
+    def get_ocr_skill(self, context, source) -> WebApiSkill:
+        """Get the OCR utility skill.
+
+        Args:
+        -----
+            context (str): The context of the skill
+            source (str): The source of the skill
+
+        Returns:
+        --------
+            WebApiSkill: The OCR custom skill"""
+
+        if self.test:
+            batch_size = 2
+            degree_of_parallelism = 2
+        else:
+            batch_size = 2
+            degree_of_parallelism = 2
+
+        ocr_skill_inputs = [
+            InputFieldMappingEntry(name="image", source=source),
+        ]
+        ocr_skill_outputs = [OutputFieldMappingEntry(name="text", target_name="text")]
+        ocr_skill = WebApiSkill(
+            name="ocr API",
+            description="Skill to extract text from images",
+            context=context,
+            uri=self.environment.get_custom_skill_function_url("ocr"),
+            timeout="PT230S",
+            batch_size=batch_size,
+            degree_of_parallelism=degree_of_parallelism,
+            http_method="POST",
+            inputs=ocr_skill_inputs,
+            outputs=ocr_skill_outputs,
+        )
+
+        if self.environment.identity_type != IdentityType.KEY:
+            ocr_skill.auth_identity = (
+                self.environment.function_app_app_registration_resource_id
+            )
+
+        if self.environment.identity_type == IdentityType.USER_ASSIGNED:
+            ocr_skill.auth_identity = (
+                self.environment.ai_search_user_assigned_identity
+            )
+
+        return ocr_skill
+
+    def get_merge_skill(self, context, source) -> MergeSkill:
+        """Get the merge skill.
+
+        Args:
+        -----
+            context (str): The context of the skill
+            source (list): The sources of the skill
+
+        Returns:
+        --------
+            MergeSkill: The merge skill"""
+
+        merge_skill = MergeSkill(
+            description="Merge skill for combining OCR'd and regular text",
+            context=context,
+            inputs=[
+                InputFieldMappingEntry(name="text", source=source[0]),
+                InputFieldMappingEntry(name="itemsToInsert", source=source[1]),
+                InputFieldMappingEntry(name="offsets", source=source[2]),
+            ],
+            outputs=[
+                OutputFieldMappingEntry(name="mergedText", target_name="merged_content")
+            ],
+        )
+
+        return merge_skill
+
+
     def get_vector_search(self) -> VectorSearch:
         """Get the vector search configuration for compass.
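
For context (assumed wiring, not shown in this commit): get_merge_skill expects source to carry the document text, the per-image OCR text, and the image content offsets, in that order. A standalone sketch with the conventional enrichment-tree paths (the paths are assumptions, adjust to the real skillset):

from azure.search.documents.indexes.models import (
    InputFieldMappingEntry,
    MergeSkill,
    OutputFieldMappingEntry,
)

# Assumed source paths for the three merge inputs.
merge_skill = MergeSkill(
    description="Merge skill for combining OCR'd and regular text",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/content"),
        InputFieldMappingEntry(
            name="itemsToInsert", source="/document/normalized_images/*/text"
        ),
        InputFieldMappingEntry(
            name="offsets", source="/document/normalized_images/*/contentOffset"
        ),
    ],
    outputs=[
        OutputFieldMappingEntry(name="mergedText", target_name="merged_content")
    ],
)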

deploy_ai_search/deploy.py

Lines changed: 2 additions & 1 deletion

@@ -1,7 +1,8 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 import argparse
-from rag_documents import RagDocumentsAISearch
+# from rag_documents import RagDocumentsAISearch
+from rag_documents_old import RagDocumentsAISearch
 from text_2_sql import Text2SqlAISearch
 from text_2_sql_query_cache import Text2SqlQueryCacheAISearch

deploy_ai_search/environment.py

Lines changed: 9 additions & 0 deletions

@@ -217,6 +217,13 @@ def function_app_key_phrase_extractor_route(self) -> str:
         This function returns function app keyphrase extractor name
         """
         return os.environ.get("FunctionApp__KeyPhraseExtractor__FunctionName")
+
+    @property
+    def function_app_key_ocr_route(self) -> str:
+        """
+        This function returns the function app OCR route name
+        """
+        return os.environ.get("FunctionApp__Ocr__FunctionName")

     @property
     def open_ai_embedding_dimensions(self) -> str:
@@ -249,6 +256,8 @@ def get_custom_skill_function_url(self, skill_type: str):
             route = self.function_app_adi_route
         elif skill_type == "key_phrase_extraction":
             route = self.function_app_key_phrase_extractor_route
+        elif skill_type == "ocr":
+            route = self.function_app_key_ocr_route
         else:
             raise ValueError(f"Invalid skill type: {skill_type}")
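
A small sanity check (not part of the commit) for the settings the new OCR path reads; the variable names come from this diff and adi_function_app/ocr.py.

import os

# App settings referenced by the OCR skill path in this change set.
required = [
    "FunctionApp__Ocr__FunctionName",  # route returned by function_app_key_ocr_route
    "AIService__Services__Endpoint",   # Image Analysis endpoint used by process_ocr
    "AIService__Services__Key",        # Image Analysis key used by process_ocr
]

missing = [name for name in required if not os.environ.get(name)]
if missing:
    raise RuntimeError(f"Missing app settings: {missing}")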

deploy_ai_search/rag_documents.py

Lines changed: 1 addition & 0 deletions

@@ -162,6 +162,7 @@ def get_skills(self) -> list:

        Returns:
            list: The skillsets used in the indexer"""
+

        adi_skill = self.get_adi_skill(self.enable_page_by_chunking)
