Skip to content

Commit 3aad38f

Browse files
committed
Update figure understanding code
1 parent fd3c4f0 commit 3aad38f

File tree

4 files changed

+89
-32
lines changed

4 files changed

+89
-32
lines changed

adi_function_app/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ This method takes the detected figures, and crops them out of the page to save t
9191

9292
`update_figure_description` is used to update the original Markdown content with the description and meaning of the figure.
9393

94-
##### clean_adi_markdown
94+
##### build_and_clean_markdown_for_response
9595

9696
This method performs the final cleaning of the Markdown content and builds the response returned to the indexer: the section headings are extracted, the list of figures is attached, and the page number is added when chunking by page is enabled.
9797

adi_function_app/adi_2_ai_search.py

Lines changed: 61 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,16 @@
1717
import concurrent.futures
1818
import json
1919
from openai import AsyncAzureOpenAI
20+
from typing import Union
2021
import openai
2122
from environment import IdentityType, get_identity_type
2223

2324

24-
def clean_adi_markdown(
25-
markdown_text: str, page_no: int = None, remove_irrelevant_figures=False
25+
def build_and_clean_markdown_for_response(
26+
markdown_text: str,
27+
figures: dict,
28+
page_no: int = None,
29+
remove_irrelevant_figures=False,
2630
):
2731
"""Clean Markdown text extracted by the Azure Document Intelligence service.
2832
@@ -62,28 +66,33 @@ def clean_adi_markdown(
6266
output_dict["content"] = cleaned_text
6367
output_dict["sections"] = doc_metadata
6468

69+
output_dict["figures"] = figures
70+
6571
# add page number when chunk by page is enabled
6672
if page_no is not None:
6773
output_dict["page_number"] = page_no
6874

6975
return output_dict
7076

7177

72-
def update_figure_description(md_content, img_description, offset, length):
78+
def update_figure_description(
79+
md_content: str, figure_id: str, img_description: str, offset: int, length: int
80+
):
7381
"""
7482
Updates the figure description in the Markdown content.
7583
7684
Args:
7785
md_content (str): The original Markdown content.
7886
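figure_id (str): The identifier of the figure, written into the FigureId attribute of the replacement tag.
img_description (str): The new description for the image.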
79-
idx (int): The index of the figure.
87+
offset (int): Position offset in the text.
88+
length (int): Length of the original figure in the text.
8089
8190
Returns:
8291
str: The updated Markdown content with the new figure description.
8392
"""
8493

8594
# Define the new string to replace the old content
86-
new_string = f'<!-- FigureContent="{img_description}" -->'
95+
new_string = f'<!-- FigureId="{figure_id}" FigureContent="{img_description}" -->'
8796

8897
# Calculate the end index of the content to be replaced
8998
end_index = offset + length
@@ -244,7 +253,7 @@ async def process_figures_from_extracted_content(
244253
markdown_content: str,
245254
page_number: None | int = None,
246255
page_offset: int = 0,
247-
) -> str:
256+
) -> Union[str, dict]:
248257
"""Process the figures extracted from the content using ADI and send them for analysis.
249258
250259
Args:
@@ -258,12 +267,13 @@ async def process_figures_from_extracted_content(
258267
259268
Returns:
260269
--------
261-
str: The updated Markdown content with the figure descriptions."""
270+
str: The updated Markdown content with the figure descriptions.
271+
list: One dict per figure, each holding the "FigureId" and the "FigureUri" of the image uploaded to blob storage."""
262272

263-
image_processing_datas = []
273+
figure_processing_datas = []
264274
download_image_tasks = []
265-
image_understanding_tasks = []
266-
image_upload_tasks = []
275+
figure_understanding_tasks = []
276+
figure_upload_tasks = []
267277

268278
if result.figures:
269279
for figure in result.figures:
@@ -290,8 +300,8 @@ async def process_figures_from_extracted_content(
290300

291301
logging.info(f"Figure Caption: {caption}")
292302

293-
image_processing_datas.append(
294-
(container, image_blob, caption, figure.spans[0])
303+
figure_processing_datas.append(
304+
(figure.id, container, image_blob, caption, figure.spans[0])
295305
)
296306

297307
break
@@ -302,45 +312,59 @@ async def process_figures_from_extracted_content(
302312

303313
storage_account_helper = await get_storage_account_helper()
304314

305-
for image_processing_data, response in zip(image_processing_datas, image_responses):
306-
container, image_blob, caption, _ = image_processing_data
315+
for figure_processing_data, response in zip(
316+
figure_processing_datas, image_responses
317+
):
318+
_, container, image_blob, caption, _ = figure_processing_data
307319
base_64_image = base64.b64encode(response).decode("utf-8")
308320

309321
logging.info(f"Image Blob: {image_blob}")
310322

311-
image_understanding_tasks.append(
323+
figure_understanding_tasks.append(
312324
understand_image_with_gptv(base_64_image, caption)
313325
)
314326

315327
image_data = base64.b64decode(base_64_image)
316328

317-
image_upload_tasks.append(
329+
figure_upload_tasks.append(
318330
storage_account_helper.upload_blob(
319331
container, image_blob, image_data, "image/png"
320332
)
321333
)
322334

335+
figure_ids = [
336+
figure_processing_data[0] for figure_processing_data in figure_processing_datas
337+
]
323338
logging.info("Running image understanding tasks")
324-
image_descriptions = await asyncio.gather(*image_understanding_tasks)
339+
figure_descriptions = await asyncio.gather(*figure_understanding_tasks)
325340
logging.info("Finished image understanding tasks")
326-
logging.info(f"Image Descriptions: {image_descriptions}")
341+
logging.info(f"Image Descriptions: {figure_descriptions}")
327342

328343
logging.info("Running image upload tasks")
329-
await asyncio.gather(*image_upload_tasks)
344+
figure_uris = await asyncio.gather(*figure_upload_tasks)
330345
logging.info("Finished image upload tasks")
331346

347+
figures = [
348+
{"FigureId": figure_id, "FigureUri": figure_uri}
349+
for figure_id, figure_uri in zip(figure_ids, figure_uris)
350+
]
351+
332352
running_offset = 0
333-
for image_processing_data, image_description in zip(
334-
image_processing_datas, image_descriptions
353+
for figure_processing_data, figure_description in zip(
354+
figure_processing_datas, figure_descriptions
335355
):
336-
_, _, _, figure_span = image_processing_data
356+
figure_id, _, _, _, figure_span = figure_processing_data
337357
starting_offset = figure_span.offset + running_offset - page_offset
338358
markdown_content, desc_offset = update_figure_description(
339-
markdown_content, image_description, starting_offset, figure_span.length
359+
markdown_content,
360+
figure_id,
361+
figure_description,
362+
starting_offset,
363+
figure_span.length,
340364
)
341365
running_offset += desc_offset
342366

343-
return markdown_content
367+
return markdown_content, figures
344368

345369

346370
def create_page_wise_content(result: AnalyzeResult) -> list:
@@ -570,9 +594,13 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
570594
with concurrent.futures.ProcessPoolExecutor() as executor:
571595
futures = {
572596
executor.submit(
573-
clean_adi_markdown, page_content, page_number, True
574-
): page_content
575-
for page_content, page_number in zip(
597+
build_and_clean_markdown_for_response,
598+
extracted_page_content[0],
599+
extracted_page_content[1],
600+
page_number,
601+
True,
602+
): extracted_page_content
603+
for extracted_page_content, page_number in zip(
576604
content_with_figures, page_numbers
577605
)
578606
}
@@ -582,7 +610,10 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
582610
else:
583611
markdown_content = result.content
584612

585-
content_with_figures = await process_figures_from_extracted_content(
613+
(
614+
extracted_content,
615+
figures,
616+
) = await process_figures_from_extracted_content(
586617
result,
587618
operation_id,
588619
container_and_blob,
@@ -591,8 +622,8 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
591622
page_number=None,
592623
)
593624

594-
cleaned_result = clean_adi_markdown(
595-
content_with_figures, remove_irrelevant_figures=True
625+
cleaned_result = build_and_clean_markdown_for_response(
626+
extracted_content, figures, remove_irrelevant_figures=True
596627
)
597628
except Exception as e:
598629
logging.error(e)

adi_function_app/storage_account.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,10 @@ async def upload_blob(
5858
Args:
5959
container (str): The container of the blob.
6060
blob (str): The blob name.
61-
data (bytes): The data to upload."""
61+
data (bytes): The data to upload.
62+
63+
Returns:
64+
str: The URL of the uploaded blob."""
6265

6366
logging.info("Uploading Blob...")
6467
logging.info(f"Container: {container}")
@@ -76,6 +79,8 @@ async def upload_blob(
7679
content_type=content_type,
7780
)
7881

82+
return blob_client.url
83+
7984
async def download_blob_to_temp_dir(
8085
self, source: str, container: str, target_file_name
8186
) -> tuple[str, dict]:

deploy_ai_search/rag_documents.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
SearchIndexerIndexProjectionsParameters,
2020
IndexProjectionMode,
2121
SimpleField,
22+
ComplexField,
2223
BlobIndexerDataToExtract,
2324
IndexerExecutionEnvironment,
2425
)
@@ -96,6 +97,22 @@ def get_index_fields(self) -> list[SearchableField]:
9697
filterable=True,
9798
facetable=True,
9899
),
100+
ComplexField(
101+
name="Figures",
102+
collection=True,
103+
fields=[
104+
SimpleField(
105+
name="FigureId",
106+
type=SearchFieldDataType.String,
107+
collection=True,
108+
),
109+
SimpleField(
110+
name="FigureUri",
111+
type=SearchFieldDataType.String,
112+
collection=True,
113+
),
114+
],
115+
),
99116
SimpleField(
100117
name="DateLastModified",
101118
type=SearchFieldDataType.DateTimeOffset,
@@ -198,9 +215,13 @@ def get_index_projections(self) -> SearchIndexerIndexProjections:
198215
InputFieldMappingEntry(
199216
name="Sections", source="/document/pages/*/cleaned_sections"
200217
),
218+
InputFieldMappingEntry(
219+
name="Figures", source="/document/pages/*/cleaned_sections"
220+
),
201221
InputFieldMappingEntry(
202222
name="DateLastModified", source="/document/DateLastModified"
203223
),
224+
InputFieldMappingEntry(name="Figures", source="/document/pages/*/figures"),
204225
]
205226

206227
if self.enable_page_by_chunking:

0 commit comments

Comments
 (0)