From f05ced709ee621d0d18d6f32d98c02343571995d Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Thu, 12 Sep 2024 12:23:35 +0100 Subject: [PATCH 01/10] Update adi code --- adi_function_app/adi_2_ai_search.py | 173 ++++++++++++++-------------- adi_function_app/requirements.txt | 2 +- adi_function_app/storage_account.py | 15 +++ 3 files changed, 105 insertions(+), 85 deletions(-) diff --git a/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py index 2f434f4..4558fc8 100644 --- a/adi_function_app/adi_2_ai_search.py +++ b/adi_function_app/adi_2_ai_search.py @@ -8,8 +8,6 @@ import os import re import asyncio -import fitz -from PIL import Image import io import logging from storage_account import StorageAccountHelper @@ -20,36 +18,6 @@ from environment import IdentityType, get_identity_type -def crop_image_from_pdf_page(pdf_path, page_number, bounding_box): - """ - Crops a region from a given page in a PDF and returns it as an image. - - :param pdf_path: Path to the PDF file. - :param page_number: The page number to crop from (0-indexed). - :param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box. - :return: A PIL Image of the cropped area. - """ - doc = fitz.open(pdf_path) - page = doc.load_page(page_number) - - logging.debug(f"Bounding Box: {bounding_box}") - logging.debug(f"Page Number: {page_number}") - - # Cropping the page. The rect requires the coordinates in the format (x0, y0, x1, y1). 
- bbx = [x * 72 for x in bounding_box] - rect = fitz.Rect(bbx) - pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), clip=rect) - - img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) - - if pix.width == 0 or pix.height == 0: - logging.error("Cropped image has 0 width or height.") - return None - - doc.close() - return img - - def clean_adi_markdown( markdown_text: str, page_no: int = None, remove_irrelevant_figures=False ): @@ -166,7 +134,7 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): user_input = "Describe this image with technical analysis. Provide a well-structured, description." - if caption != "": + if caption is not None and len(caption) > 0: user_input += f" (note: it has image caption: {caption})" try: @@ -252,9 +220,10 @@ async def mark_image_as_irrelevant(): async def process_figures_from_extracted_content( - file_path: str, + result, + operation_id: str, + container_and_blob: str, markdown_content: str, - figures: list, page_number: None | int = None, page_offset: int = 0, ) -> str: @@ -274,41 +243,47 @@ async def process_figures_from_extracted_content( figure_spans = [] image_understanding_tasks = [] - for idx, figure in enumerate(figures): - img_description = "" - logging.debug(f"Figure #{idx} has the following spans: {figure.spans}") - - caption_region = figure.caption.bounding_regions if figure.caption else [] - for region in figure.bounding_regions: - # Skip the region if it is not on the specified page - if page_number is not None and region.page_number != page_number: - continue - - if region not in caption_region: - # To learn more about bounding regions, see https://aka.ms/bounding-region - bounding_box = ( - region.polygon[0], # x0 (left) - region.polygon[1], # y0 (top) - region.polygon[4], # x1 (right) - region.polygon[5], # y1 (bottom) - ) - cropped_image = crop_image_from_pdf_page( - file_path, region.page_number - 1, bounding_box - ) # page_number is 1-indexed - 
figure_spans.append(figure.spans[0]) + document_intelligence_client = await get_document_intelligence_client() + storage_account_helper = await get_storage_account_helper() + + async with document_intelligence_client: + if result.figures: + for figure in result.figures: + if figure.id is None: + continue + + for region in figure.bounding_regions: + if page_number is not None and region.page_number != page_number: + continue - if cropped_image is None: - image_understanding_tasks.append(mark_image_as_irrelevant()) - else: - image_base64 = pil_image_to_base64(cropped_image) + figure_spans.append(figure.spans[0]) + + response = ( + await document_intelligence_client.get_analyze_result_figure( + model_id=result.model_id, + result_id=operation_id, + figure_id=figure.id, + ) + ) + + logging.info(f"Figure ID: {figure.id}") + logging.info(f"Figure Caption: {figure.caption.content}") + logging.info(f"Figure Response: {response}") + + container, blob = container_and_blob + image_blob = f"{blob}/{figure.id}.png" + await storage_account_helper.upload_blob( + container, image_blob, response + ) image_understanding_tasks.append( - understand_image_with_gptv(image_base64, figure.caption.content) + understand_image_with_gptv(response, figure.caption.content) ) - logging.info(f"\tDescription of figure {idx}: {img_description}") + break + logging.info("Running image understanding tasks") image_descriptions = await asyncio.gather(*image_understanding_tasks) logging.info(f"Image Descriptions: {image_descriptions}") @@ -351,19 +326,12 @@ def create_page_wise_content(result: AnalyzeResult) -> list: return page_wise_content, page_numbers, page_offsets -async def analyse_document(file_path: str) -> AnalyzeResult: - """Analyse a document using the Azure Document Intelligence service. - - Args: - ----- - file_path (str): The path to the document to analyse. +async def get_document_intelligence_client() -> DocumentIntelligenceClient: + """Get the Azure Document Intelligence client. 
Returns: -------- - AnalyzeResult: The result of the document analysis.""" - with open(file_path, "rb") as f: - file_read = f.read() - + DocumentIntelligenceClient: The Azure Document Intelligence client.""" if get_identity_type() == IdentityType.SYSTEM_ASSIGNED: credential = DefaultAzureCredential() elif get_identity_type() == IdentityType.USER_ASSIGNED: @@ -375,10 +343,39 @@ async def analyse_document(file_path: str) -> AnalyzeResult: os.environ["AIService__DocumentIntelligence__Key"] ) - async with DocumentIntelligenceClient( + return DocumentIntelligenceClient( endpoint=os.environ["AIService__DocumentIntelligence__Endpoint"], credential=credential, - ) as document_intelligence_client: + ) + + +async def get_storage_account_helper() -> StorageAccountHelper: + """Get the Storage Account Helper. + + Returns: + -------- + StorageAccountHelper: The Storage Account Helper.""" + + return StorageAccountHelper() + + +async def analyse_document(file_path: str) -> tuple[AnalyzeResult, str]: + """Analyse a document using the Azure Document Intelligence service. + + Args: + ----- + file_path (str): The path to the document to analyse. + + Returns: + -------- + AnalyzeResult: The result of the document analysis. + str: The operation ID of the analysis. + """ + with open(file_path, "rb") as f: + file_read = f.read() + + document_intelligence_client = await get_document_intelligence_client() + async with document_intelligence_client: poller = await document_intelligence_client.begin_analyze_document( model_id="prebuilt-layout", analyze_request=file_read, @@ -388,12 +385,14 @@ async def analyse_document(file_path: str) -> AnalyzeResult: result = await poller.result() + operation_id = poller.details["operation_id"] + if result is None or result.content is None or result.pages is None: raise ValueError( "Failed to analyze the document with Azure Document Intelligence." 
) - return result + return result, operation_id async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> dict: @@ -409,7 +408,7 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> dict: The processed content ready for Azure Search.""" logging.info("Python HTTP trigger function processed a request.") - storage_account_helper = StorageAccountHelper() + storage_account_helper = await get_storage_account_helper() try: source = record["data"]["source"] @@ -435,6 +434,8 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> container = source_parts[3] + container_and_blob = (container, blob) + file_extension = blob.split(".")[-1] target_file_name = f"{record['recordId']}.{file_extension}" @@ -456,13 +457,13 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> } try: - result = await analyse_document(temp_file_path) + result, operation_id = await analyse_document(temp_file_path) except Exception as e: logging.error(e) logging.info("Sleeping for 10 seconds and retrying") await asyncio.sleep(10) try: - result = await analyse_document(temp_file_path) + result, operation_id = await analyse_document(temp_file_path) except ValueError as inner_e: logging.error(inner_e) logging.error( @@ -508,9 +509,10 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> ) content_with_figures_tasks = [ process_figures_from_extracted_content( - temp_file_path, + result, + operation_id, + container_and_blob, page_content, - result.figures, page_number=page_number, page_offset=page_offset, ) @@ -534,13 +536,16 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> else: markdown_content = result.content + content_with_figures = await process_figures_from_extracted_content( - temp_file_path, + result, + operation_id, + container_and_blob, markdown_content, - result.figures, page_offset=0, - page_number=1, + page_number=None, ) + 
cleaned_result = clean_adi_markdown( content_with_figures, remove_irrelevant_figures=True ) diff --git a/adi_function_app/requirements.txt b/adi_function_app/requirements.txt index a923e7f..c9bf5be 100644 --- a/adi_function_app/requirements.txt +++ b/adi_function_app/requirements.txt @@ -13,7 +13,7 @@ nltk==3.8.1 bs4 azure-search azure-search-documents -azure-ai-documentintelligence +azure-ai-documentintelligence==1.0.0b4 azure-ai-textanalytics azure-ai-vision-imageanalysis PyMuPDF diff --git a/adi_function_app/storage_account.py b/adi_function_app/storage_account.py index 8c5aa98..f691477 100644 --- a/adi_function_app/storage_account.py +++ b/adi_function_app/storage_account.py @@ -50,6 +50,21 @@ async def add_metadata_to_blob( logging.info("Metadata Added") + async def upload_blob(self, container, blob, data) -> str: + """Upload the file to the Azure Blob Storage. + + Args: + container (str): The container of the blob. + blob (str): The blob name. + data (bytes): The data to upload.""" + + blob_service_client = await self.get_client() + async with blob_service_client: + async with blob_service_client.get_blob_client( + container=container, blob=blob + ) as blob_client: + await blob_client.upload_blob(data, overwrite=True) + async def download_blob_to_temp_dir( self, source: str, container: str, target_file_name ) -> tuple[str, dict]: From c969f7edfa51ad9445f6085db13ddb59d71bab5e Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Thu, 12 Sep 2024 12:53:30 +0100 Subject: [PATCH 02/10] Update adi code --- adi_function_app/adi_2_ai_search.py | 46 ++++++++++++++++++----------- adi_function_app/storage_account.py | 6 +++- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py index 4558fc8..212148f 100644 --- a/adi_function_app/adi_2_ai_search.py +++ b/adi_function_app/adi_2_ai_search.py @@ -100,8 +100,8 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): """ 
MAX_TOKENS = 2000 - api_version = os.environ.get("OpenAI__ApiVersion") - model = os.environ.get("OpenAI__MultiModalDeployment") + api_version = os.environ["OpenAI__ApiVersion"] + model = os.environ["OpenAI__MultiModalDeployment"] if get_identity_type() != IdentityType.SYSTEM_ASSIGNED: token_provider = get_bearer_token_provider( @@ -111,14 +111,14 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): elif get_identity_type() != IdentityType.USER_ASSIGNED: token_provider = get_bearer_token_provider( DefaultAzureCredential( - managed_identity_client_id=os.environ.get("FunctionApp__ClientId") + managed_identity_client_id=os.environ["FunctionApp__ClientId"] ), "https://cognitiveservices.azure.com/.default", ) api_key = None else: token_provider = None - api_key = os.environ.get("OpenAI__ApiKey") + api_key = os.environ["OpenAI__ApiKey"] system_prompt = """You are an expert in image analysis. Use your experience and skills to provided a detailed description of any provided images. You should FOCUS on what info can be inferred from the image and the meaning of the data inside the image. Draw actionable insights and conclusions from the image. 
@@ -241,8 +241,10 @@ async def process_figures_from_extracted_content( -------- str: The updated Markdown content with the figure descriptions.""" - figure_spans = [] + image_processing_data = [] + download_image_tasks = [] image_understanding_tasks = [] + image_upload_tasks = [] document_intelligence_client = await get_document_intelligence_client() storage_account_helper = await get_storage_account_helper() @@ -257,39 +259,47 @@ async def process_figures_from_extracted_content( if page_number is not None and region.page_number != page_number: continue - figure_spans.append(figure.spans[0]) - - response = ( - await document_intelligence_client.get_analyze_result_figure( + logging.info(f"Figure ID: {figure.id}") + download_image_tasks.append( + document_intelligence_client.get_analyze_result_figure( model_id=result.model_id, result_id=operation_id, figure_id=figure.id, ) ) - logging.info(f"Figure ID: {figure.id}") logging.info(f"Figure Caption: {figure.caption.content}") - logging.info(f"Figure Response: {response}") container, blob = container_and_blob image_blob = f"{blob}/{figure.id}.png" - await storage_account_helper.upload_blob( - container, image_blob, response - ) - image_understanding_tasks.append( - understand_image_with_gptv(response, figure.caption.content) + image_processing_data.append( + container, image_blob, figure.caption.content, figure.spans[0] ) break + image_responses = await asyncio.gather(*download_image_tasks) + for image_processing_data, response in zip(image_processing_data, image_responses): + container, image_blob, caption, _ = image_processing_data + image_upload_tasks.append( + storage_account_helper.upload_blob(container, image_blob, response) + ) + + image_understanding_tasks.append(understand_image_with_gptv(response, caption)) + logging.info("Running image understanding tasks") image_descriptions = await asyncio.gather(*image_understanding_tasks) - logging.info(f"Image Descriptions: {image_descriptions}") + logging.info("Running 
image upload tasks") + await asyncio.gather(*image_upload_tasks) + running_offset = 0 - for figure_span, image_description in zip(figure_spans, image_descriptions): + for image_processing_data, image_description in zip( + image_processing_data, image_descriptions + ): + _, _, _, figure_span = image_processing_data starting_offset = figure_span.offset + running_offset - page_offset markdown_content, desc_offset = update_figure_description( markdown_content, image_description, starting_offset, figure_span.length diff --git a/adi_function_app/storage_account.py b/adi_function_app/storage_account.py index f691477..27c006e 100644 --- a/adi_function_app/storage_account.py +++ b/adi_function_app/storage_account.py @@ -50,7 +50,7 @@ async def add_metadata_to_blob( logging.info("Metadata Added") - async def upload_blob(self, container, blob, data) -> str: + async def upload_blob(self, container: str, blob: str, data) -> str: """Upload the file to the Azure Blob Storage. Args: @@ -58,6 +58,10 @@ async def upload_blob(self, container, blob, data) -> str: blob (str): The blob name. 
data (bytes): The data to upload.""" + logging.info("Uploading Blob...") + logging.info(f"Container: {container}") + logging.info(f"Blob: {blob}") + blob_service_client = await self.get_client() async with blob_service_client: async with blob_service_client.get_blob_client( From f11d0c138d0f306fa9c5a63b91986b1214c39a5f Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Thu, 12 Sep 2024 13:48:45 +0100 Subject: [PATCH 03/10] Update adi code --- adi_function_app/adi_2_ai_search.py | 109 +++++++++++++++++----------- 1 file changed, 68 insertions(+), 41 deletions(-) diff --git a/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py index 212148f..76d3129 100644 --- a/adi_function_app/adi_2_ai_search.py +++ b/adi_function_app/adi_2_ai_search.py @@ -4,11 +4,14 @@ import base64 from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence.aio import DocumentIntelligenceClient -from azure.ai.documentintelligence.models import AnalyzeResult, ContentFormat +from azure.ai.documentintelligence.models import ( + AnalyzeResult, + ContentFormat, + AnalyzeOutputOption, +) import os import re import asyncio -import io import logging from storage_account import StorageAccountHelper import concurrent.futures @@ -171,6 +174,8 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): max_tokens=MAX_TOKENS, ) + logging.info(f"Response: {response}") + img_description = response.choices[0].message.content logging.info(f"Image Description: {img_description}") @@ -197,26 +202,35 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): raise Exception("OpenAI Rate Limit Error: No retries left.") from e -def pil_image_to_base64(image, image_format="JPEG"): - """ - Converts a PIL image to a base64-encoded string. 
+async def mark_image_as_irrelevant(): + return "Irrelevant Image" + + +async def download_figure_image( + model_id: str, operation_id: str, figure_id: str +) -> bytearray: + """Download the image associated with a figure extracted by the Azure Document Intelligence service. Args: - image (PIL.Image.Image): The image to be converted. - image_format (str): The format to save the image in. Defaults to "JPEG". + ----- + model_id (str): The model ID used for the analysis. + operation_id (str): The operation ID of the analysis. + figure_id (str): The ID of the figure to download. Returns: - str: The base64-encoded string representation of the image. - """ - if image.mode == "RGBA" and image_format == "JPEG": - image = image.convert("RGB") - buffered = io.BytesIO() - image.save(buffered, format=image_format) - return base64.b64encode(buffered.getvalue()).decode("utf-8") + -------- + bytes: The image associated with the figure.""" + document_intelligence_client = await get_document_intelligence_client() + async with document_intelligence_client: + response = await document_intelligence_client.get_analyze_result_figure( + model_id=model_id, result_id=operation_id, figure_id=figure_id + ) + full_bytes = bytearray() + async for chunk in response: + full_bytes.extend(chunk) -async def mark_image_as_irrelevant(): - return "Irrelevant Image" + return full_bytes async def process_figures_from_extracted_content( @@ -246,54 +260,66 @@ async def process_figures_from_extracted_content( image_understanding_tasks = [] image_upload_tasks = [] - document_intelligence_client = await get_document_intelligence_client() storage_account_helper = await get_storage_account_helper() - async with document_intelligence_client: - if result.figures: - for figure in result.figures: - if figure.id is None: - continue + if result.figures: + for figure in result.figures: + if figure.id is None: + continue - for region in figure.bounding_regions: - if page_number is not None and region.page_number != 
page_number: - continue + for region in figure.bounding_regions: + if page_number is not None and region.page_number != page_number: + continue - logging.info(f"Figure ID: {figure.id}") - download_image_tasks.append( - document_intelligence_client.get_analyze_result_figure( - model_id=result.model_id, - result_id=operation_id, - figure_id=figure.id, - ) + logging.info(f"Figure ID: {figure.id}") + download_image_tasks.append( + download_figure_image( + model_id=result.model_id, + operation_id=operation_id, + figure_id=figure.id, ) + ) - logging.info(f"Figure Caption: {figure.caption.content}") + container, blob = container_and_blob + image_blob = f"{blob}/{figure.id}.png" - container, blob = container_and_blob - image_blob = f"{blob}/{figure.id}.png" + caption = figure.caption.content if figure.caption is not None else None - image_processing_data.append( - container, image_blob, figure.caption.content, figure.spans[0] - ) + logging.info(f"Figure Caption: {caption}") + + image_processing_data.append( + (container, image_blob, caption, figure.spans[0]) + ) - break + break + logging.info("Running image download tasks") image_responses = await asyncio.gather(*download_image_tasks) + logging.info("Finished image download tasks") + for image_processing_data, response in zip(image_processing_data, image_responses): container, image_blob, caption, _ = image_processing_data + base_64_image = base64.b64encode(response).decode("utf-8") + + logging.info(f"Image Blob: {image_blob}") + logging.info(f"Response: {response}") + + image_understanding_tasks.append( + understand_image_with_gptv(base_64_image, caption) + ) + image_upload_tasks.append( storage_account_helper.upload_blob(container, image_blob, response) ) - image_understanding_tasks.append(understand_image_with_gptv(response, caption)) - logging.info("Running image understanding tasks") image_descriptions = await asyncio.gather(*image_understanding_tasks) + logging.info("Finished image understanding tasks") 
logging.info(f"Image Descriptions: {image_descriptions}") logging.info("Running image upload tasks") await asyncio.gather(*image_upload_tasks) + logging.info("Finished image upload tasks") running_offset = 0 for image_processing_data, image_description in zip( @@ -390,6 +416,7 @@ async def analyse_document(file_path: str) -> tuple[AnalyzeResult, str]: model_id="prebuilt-layout", analyze_request=file_read, output_content_format=ContentFormat.MARKDOWN, + output=[AnalyzeOutputOption.FIGURES], content_type="application/octet-stream", ) From 74c9731ee5466a5e88cf5525a532d68fdc367e03 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Thu, 12 Sep 2024 13:51:51 +0100 Subject: [PATCH 04/10] Update adi handling --- adi_function_app/adi_2_ai_search.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py index 76d3129..f65ad10 100644 --- a/adi_function_app/adi_2_ai_search.py +++ b/adi_function_app/adi_2_ai_search.py @@ -234,7 +234,7 @@ async def download_figure_image( async def process_figures_from_extracted_content( - result, + result: AnalyzeResult, operation_id: str, container_and_blob: str, markdown_content: str, @@ -245,9 +245,10 @@ async def process_figures_from_extracted_content( Args: ----- - file_path (str): The path to the PDF file. + result (AnalyzeResult): The result of the document analysis. + operation_id (str): The operation ID of the analysis. + container_and_blob (str): The container and blob of the document. markdown_content (str): The extracted content in Markdown format. - figures (list): The list of figures extracted by the Azure Document Intelligence service. page_number (int): The page number to process. If None, all pages are processed. page_offset (int): The offset of the page. 
From 3bd1bbd92af966c135be8bf1445b64d28bc316af Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Thu, 12 Sep 2024 13:57:53 +0100 Subject: [PATCH 05/10] Update code --- adi_function_app/adi_2_ai_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py index f65ad10..4654be4 100644 --- a/adi_function_app/adi_2_ai_search.py +++ b/adi_function_app/adi_2_ai_search.py @@ -106,12 +106,12 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): api_version = os.environ["OpenAI__ApiVersion"] model = os.environ["OpenAI__MultiModalDeployment"] - if get_identity_type() != IdentityType.SYSTEM_ASSIGNED: + if get_identity_type() == IdentityType.SYSTEM_ASSIGNED: token_provider = get_bearer_token_provider( DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default" ) api_key = None - elif get_identity_type() != IdentityType.USER_ASSIGNED: + elif get_identity_type() == IdentityType.USER_ASSIGNED: token_provider = get_bearer_token_provider( DefaultAzureCredential( managed_identity_client_id=os.environ["FunctionApp__ClientId"] From bfc1c9e3f2235c58f172c1820fd447f3bd6c6c03 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Thu, 12 Sep 2024 14:08:06 +0100 Subject: [PATCH 06/10] Fix open ai request --- adi_function_app/adi_2_ai_search.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py index 4654be4..4321657 100644 --- a/adi_function_app/adi_2_ai_search.py +++ b/adi_function_app/adi_2_ai_search.py @@ -159,11 +159,11 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): "role": "user", "content": [ { - "Type": "text", + "type": "text", "text": user_input, }, { - "Type": "image_url", + "type": "image_url", "image_url": { "url": f"data:image/png;base64,{image_base64}" }, @@ -199,7 +199,7 @@ async def understand_image_with_gptv(image_base64, 
caption, tries_left=3): except (openai.OpenAIError, openai.APIConnectionError) as e: logging.error("OpenAI Error: %s", e) - raise Exception("OpenAI Rate Limit Error: No retries left.") from e + raise Exception("OpenAI Connection Error: No retries left.") from e async def mark_image_as_irrelevant(): @@ -303,7 +303,6 @@ async def process_figures_from_extracted_content( base_64_image = base64.b64encode(response).decode("utf-8") logging.info(f"Image Blob: {image_blob}") - logging.info(f"Response: {response}") image_understanding_tasks.append( understand_image_with_gptv(base_64_image, caption) From 0d3ddc842fe58c69130128b62c6a985502ff9e11 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Thu, 12 Sep 2024 14:56:10 +0100 Subject: [PATCH 07/10] Further bug fixes in adi --- adi_function_app/adi_2_ai_search.py | 18 +++++++++++------- adi_function_app/storage_account.py | 11 +++++++++-- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py index 4321657..047b12e 100644 --- a/adi_function_app/adi_2_ai_search.py +++ b/adi_function_app/adi_2_ai_search.py @@ -256,13 +256,11 @@ async def process_figures_from_extracted_content( -------- str: The updated Markdown content with the figure descriptions.""" - image_processing_data = [] + image_processing_datas = [] download_image_tasks = [] image_understanding_tasks = [] image_upload_tasks = [] - storage_account_helper = await get_storage_account_helper() - if result.figures: for figure in result.figures: if figure.id is None: @@ -288,7 +286,7 @@ async def process_figures_from_extracted_content( logging.info(f"Figure Caption: {caption}") - image_processing_data.append( + image_processing_datas.append( (container, image_blob, caption, figure.spans[0]) ) @@ -298,7 +296,9 @@ async def process_figures_from_extracted_content( image_responses = await asyncio.gather(*download_image_tasks) logging.info("Finished image download tasks") - for 
image_processing_data, response in zip(image_processing_data, image_responses): + storage_account_helper = await get_storage_account_helper() + + for image_processing_data, response in zip(image_processing_datas, image_responses): container, image_blob, caption, _ = image_processing_data base_64_image = base64.b64encode(response).decode("utf-8") @@ -308,8 +308,12 @@ async def process_figures_from_extracted_content( understand_image_with_gptv(base_64_image, caption) ) + image_data = base64.b64decode(base_64_image) + image_upload_tasks.append( - storage_account_helper.upload_blob(container, image_blob, response) + storage_account_helper.upload_blob( + container, image_blob, image_data, "image/png" + ) ) logging.info("Running image understanding tasks") @@ -323,7 +327,7 @@ async def process_figures_from_extracted_content( running_offset = 0 for image_processing_data, image_description in zip( - image_processing_data, image_descriptions + image_processing_datas, image_descriptions ): _, _, _, figure_span = image_processing_data starting_offset = figure_span.offset + running_offset - page_offset diff --git a/adi_function_app/storage_account.py b/adi_function_app/storage_account.py index 27c006e..5289260 100644 --- a/adi_function_app/storage_account.py +++ b/adi_function_app/storage_account.py @@ -50,7 +50,9 @@ async def add_metadata_to_blob( logging.info("Metadata Added") - async def upload_blob(self, container: str, blob: str, data) -> str: + async def upload_blob( + self, container: str, blob: str, data, content_type: str + ) -> str: """Upload the file to the Azure Blob Storage. 
Args: @@ -67,7 +69,12 @@ async def upload_blob(self, container: str, blob: str, data) -> str: async with blob_service_client.get_blob_client( container=container, blob=blob ) as blob_client: - await blob_client.upload_blob(data, overwrite=True) + await blob_client.upload_blob( + data, + overwrite=True, + blob_type="BlockBlob", + content_type=content_type, + ) async def download_blob_to_temp_dir( self, source: str, container: str, target_file_name From dcf72c191901d4569a8c0c1819ca566c8a959a8d Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Thu, 12 Sep 2024 15:29:49 +0100 Subject: [PATCH 08/10] Final code fix --- adi_function_app/adi_2_ai_search.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py index 047b12e..998cf33 100644 --- a/adi_function_app/adi_2_ai_search.py +++ b/adi_function_app/adi_2_ai_search.py @@ -40,17 +40,25 @@ def clean_adi_markdown( comment_patterns = r"|||" cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL) - combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n" - doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL) - doc_metadata = [match for group in doc_metadata for match in group if match] - + # Remove irrelevant figures if remove_irrelevant_figures: - # Remove irrelevant figures irrelevant_figure_pattern = r"\s*" cleaned_text = re.sub( irrelevant_figure_pattern, "", cleaned_text, flags=re.DOTALL ) + logging.info(f"Cleaned Text: {cleaned_text}") + + markdown_without_figure_content = re.sub( + r"", "", cleaned_text, flags=re.DOTALL + ) + + combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n" + doc_metadata = re.findall( + combined_pattern, markdown_without_figure_content, re.DOTALL + ) + doc_metadata = [match for group in doc_metadata for match in group if match] + output_dict["content"] = cleaned_text output_dict["sections"] = doc_metadata From d558e8a5eec39a6bf333cc82dfc4ff87af797661 Mon Sep 17 
00:00:00 2001 From: Ben Constable Date: Thu, 12 Sep 2024 15:30:37 +0100 Subject: [PATCH 09/10] Update readmes --- adi_function_app/README.md | 14 ++++++ text_2_sql/README.md | 88 +++++++++++++++++++------------------- 2 files changed, 58 insertions(+), 44 deletions(-) diff --git a/adi_function_app/README.md b/adi_function_app/README.md index 72c60ab..679d534 100644 --- a/adi_function_app/README.md +++ b/adi_function_app/README.md @@ -36,6 +36,20 @@ The properties returned from the ADI Custom Skill are then used to perform the f - Keyphrase extraction - Vectorisation +## Sample Output + +Using the [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/pdf/2404.14219) as an example, the following output can be obtained for page 7: + +```json +{ + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Table 1: Comparison results on RepoQA benchmark.
ModelCtx SizePythonC++RustJavaTypeScriptAverage
gpt-4O-2024-05-13128k958085969790.6
gemini-1.5-flash-latest1000k937987949790
Phi-3.5-MoE128k897481889585
Phi-3.5-Mini128k866773778277
Llama-3.1-8B-Instruct128k806573766371
Mixtral-8x7B-Instruct-v0.132k666564717468
Mixtral-8x22B-Instruct-v0.164k606774835567.8
\n\n\nsuch as Arabic, Chinese, Russian, Ukrainian, and Vietnamese, with average MMLU-multilingual scores\nof 55.4 and 47.3, respectively. Due to its larger model capacity, phi-3.5-MoE achieves a significantly\nhigher average score of 69.9, outperforming phi-3.5-mini.\n\nMMLU(5-shot) MultiLingual\n\nPhi-3-mini\n\nPhi-3.5-mini\n\nPhi-3.5-MoE\n\n\n\n\n\nWe evaluate the phi-3.5-mini and phi-3.5-MoE models on two long-context understanding tasks:\nRULER [HSK+24] and RepoQA [LTD+24]. As shown in Tables 1 and 2, both phi-3.5-MoE and phi-\n3.5-mini outperform other open-source models with larger sizes, such as Llama-3.1-8B, Mixtral-8x7B,\nand Mixtral-8x22B, on the RepoQA task, and achieve comparable performance to Llama-3.1-8B on\nthe RULER task. However, we observe a significant performance drop when testing the 128K context\nwindow on the RULER task. We suspect this is due to the lack of high-quality long-context data in\nmid-training, an issue we plan to address in the next version of the model release.\n\nIn the table 3, we present a detailed evaluation of the phi-3.5-mini and phi-3.5-MoE models\ncompared with recent SoTA pretrained language models, such as GPT-4o-mini, Gemini-1.5 Flash, and\nopen-source models like Llama-3.1-8B and the Mistral models. The results show that phi-3.5-mini\nachieves performance comparable to much larger models like Mistral-Nemo-12B and Llama-3.1-8B, while\nphi-3.5-MoE significantly outperforms other open-source models, offers performance comparable to\nGemini-1.5 Flash, and achieves above 90% of the average performance of GPT-4o-mini across various\nlanguage benchmarks.\n\n\n\n\n", + "sections": [], + "page_number": 7 +} +``` + +The Figure 4 content has been interpreted and added into the extracted chunk to enhance the context for a RAG application. This is particularly powerful for applications where the documents are heavily imaged or chart based. 
+ ## Provided Notebooks \& Utilities - `./ai_search_with_adi_function_app` provides a pre-built Python function app that communicates with Azure Document Intelligence, Azure OpenAI etc. to perform the Markdown conversion, extraction of figures, figure understanding and corresponding cleaning of Markdown. diff --git a/text_2_sql/README.md b/text_2_sql/README.md index 916c2c9..523177c 100644 --- a/text_2_sql/README.md +++ b/text_2_sql/README.md @@ -44,6 +44,50 @@ Both approaches limit the number of tokens used and avoid filling the prompt wi Using Auto-Function calling capabilities, the LLM is able to retrieve from the plugin the full schema information for the views / tables that it considers useful for answering the question. Once retrieved, the full SQL query can then be generated. The schemas for multiple views / tables can be retrieved to allow the LLM to perform joins and other complex queries. +## Sample Output + +### What is the top performing product by quantity of units sold? + +#### SQL Query Generated + +*SELECT TOP 1 ProductID, SUM(OrderQty) AS TotalUnitsSold FROM SalesLT.SalesOrderDetail GROUP BY ProductID ORDER BY TotalUnitsSold DESC* + +#### JSON Result + +```json +{ + "answer": "The top-performing product by quantity of units sold is the **Classic Vest, S** from the **Classic Vest** product model, with a total of 87 units sold [1][2].", + "sources": [ + { + "title": "Sales Order Detail", + "chunk": "| ProductID | TotalUnitsSold |\n|-----------|----------------|\n| 864 | 87 |\n", + "reference": "SELECT TOP 1 ProductID, SUM(OrderQty) AS TotalUnitsSold FROM SalesLT.SalesOrderDetail GROUP BY ProductID ORDER BY TotalUnitsSold DESC;" + }, + { + "title": "Product and Description", + "chunk": "| Name | ProductModel |\n|----------------|---------------|\n| Classic Vest, S| Classic Vest |\n", + "reference": "SELECT Name, ProductModel FROM SalesLT.vProductAndDescription WHERE ProductID = 864;" + } + ] +} +``` + +The **answer** and **sources** properties can be
rendered to the user to visualize the results. Markdown support is useful for complex answer outputs and explaining the source of the information. + +#### Rendered Output + +The top-performing product by quantity of units sold is the **Classic Vest, S** from the **Classic Vest** product model, with a total of 87 units sold [1][2]. + +#### Rendered Sources + +| ProductID | TotalUnitsSold | +|-----------|----------------| +| 864 | 87 | + +| Name | ProductModel | +|----------------|---------------| +| Classic Vest, S| Classic Vest | + ## Provided Notebooks - `./rag_with_prompt_based_text_2_sql.ipynb` provides example of how to utilise the Prompt Based Text2SQL plugin to query the database. @@ -151,50 +195,6 @@ The search text passed is vectorised against the entity level **Description** co This method is called by the Semantic Kernel framework automatically, when instructed to do so by the LLM, to run a SQL query against the given database. It returns a JSON string containing a row wise dump of the results returned. These results are then interpreted to answer the question. -## Sample Usage - -### What is the top performing product by quantity of units sold? 
- -#### SQL Query Generated - -*SELECT TOP 1 ProductID, SUM(OrderQty) AS TotalUnitsSold FROM SalesLT.SalesOrderDetail GROUP BY ProductID ORDER BY TotalUnitsSold DESC* - -#### JSON Result - -```json -{ - "answer": "The top-performing product by quantity of units sold is the **Classic Vest, S** from the **Classic Vest** product model, with a total of 87 units sold [1][2].", - "sources": [ - { - "title": "Sales Order Detail", - "chunk": "| ProductID | TotalUnitsSold |\n|-----------|----------------|\n| 864 | 87 |\n", - "reference": "SELECT TOP 1 ProductID, SUM(OrderQty) AS TotalUnitsSold FROM SalesLT.SalesOrderDetail GROUP BY ProductID ORDER BY TotalUnitsSold DESC;" - }, - { - "title": "Product and Description", - "chunk": "| Name | ProductModel |\n|----------------|---------------|\n| Classic Vest, S| Classic Vest |\n", - "reference": "SELECT Name, ProductModel FROM SalesLT.vProductAndDescription WHERE ProductID = 864;" - } - ] -} -``` - -The **answer** and **sources** properties can be rendered to the user to visualize the results. Markdown support is useful for complex answer outputs and explaining the source of the information. - -#### Rendered Output - -The top-performing product by quantity of units sold is the **Classic Vest, S** from the **Classic Vest** product model, with a total of 87 units sold [1][2]. - -#### Rendered Sources - -| ProductID | TotalUnitsSold | -|-----------|----------------| -| 864 | 87 | - -| Name | ProductModel | -|----------------|---------------| -| Classic Vest, S| Classic Vest | - ## Tips for good Text2SQL performance. 
- Pre-assemble views to avoid the LLM having to make complex joins between multiple tables From 51618df03dc236b9ba6876dc6dc98e9ae2f0cc87 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Thu, 12 Sep 2024 15:32:14 +0100 Subject: [PATCH 10/10] Update code --- adi_function_app/adi_2_ai_search.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py index 998cf33..947f35f 100644 --- a/adi_function_app/adi_2_ai_search.py +++ b/adi_function_app/adi_2_ai_search.py @@ -210,10 +210,6 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): raise Exception("OpenAI Connection Error: No retries left.") from e -async def mark_image_as_irrelevant(): - return "Irrelevant Image" - - async def download_figure_image( model_id: str, operation_id: str, figure_id: str ) -> bytearray: