17
17
import concurrent .futures
18
18
import json
19
19
from openai import AsyncAzureOpenAI
20
+ from typing import Union
20
21
import openai
21
22
from environment import IdentityType , get_identity_type
22
23
23
24
24
- def clean_adi_markdown (
25
- markdown_text : str , page_no : int = None , remove_irrelevant_figures = False
25
+ def build_and_clean_markdown_for_response (
26
+ markdown_text : str ,
27
+ figures : dict ,
28
+ page_no : int = None ,
29
+ remove_irrelevant_figures = False ,
26
30
):
27
31
"""Clean Markdown text extracted by the Azure Document Intelligence service.
28
32
@@ -62,28 +66,33 @@ def clean_adi_markdown(
62
66
output_dict ["content" ] = cleaned_text
63
67
output_dict ["sections" ] = doc_metadata
64
68
69
+ output_dict ["figures" ] = figures
70
+
65
71
# add page number when chunk by page is enabled
66
72
if page_no is not None :
67
73
output_dict ["page_number" ] = page_no
68
74
69
75
return output_dict
70
76
71
77
72
- def update_figure_description (md_content , img_description , offset , length ):
78
+ def update_figure_description (
79
+ md_content : str , figure_id : str , img_description : str , offset : int , length : int
80
+ ):
73
81
"""
74
82
Updates the figure description in the Markdown content.
75
83
76
84
Args:
77
85
md_content (str): The original Markdown content.
78
86
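+ figure_id (str): The identifier of the figure; written into the FigureId attribute of the replacement comment.
img_description (str): The new description for the image.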
79
- idx (int): The index of the figure.
87
+ offset (int): Position offset in the text.
88
+ length (int): Length of the original figure in the text.
80
89
81
90
Returns:
82
91
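tuple: The updated Markdown content with the new figure description, followed by an offset adjustment that the caller adds to its running offset before processing the next figure.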
83
92
"""
84
93
85
94
# Define the new string to replace the old content
86
- new_string = f'<!-- FigureContent="{ img_description } " -->'
95
+ new_string = f'<!-- FigureId=" { figure_id } " FigureContent="{ img_description } " -->'
87
96
88
97
# Calculate the end index of the content to be replaced
89
98
end_index = offset + length
@@ -244,7 +253,7 @@ async def process_figures_from_extracted_content(
244
253
markdown_content : str ,
245
254
page_number : None | int = None ,
246
255
page_offset : int = 0 ,
247
- ) -> str :
256
+ ) -> Union [ str , dict ] :
248
257
"""Process the figures extracted from the content using ADI and send them for analysis.
249
258
250
259
Args:
@@ -258,12 +267,13 @@ async def process_figures_from_extracted_content(
258
267
259
268
Returns:
260
269
--------
261
- str: The updated Markdown content with the figure descriptions."""
270
+ str: The updated Markdown content with the figure descriptions.
271
+ list[dict]: One {"FigureId": ..., "FigureUri": ...} entry per processed figure, where FigureUri is the value returned by the blob upload for that figure."""
262
272
263
- image_processing_datas = []
273
+ figure_processing_datas = []
264
274
download_image_tasks = []
265
- image_understanding_tasks = []
266
- image_upload_tasks = []
275
+ figure_understanding_tasks = []
276
+ figure_upload_tasks = []
267
277
268
278
if result .figures :
269
279
for figure in result .figures :
@@ -290,8 +300,8 @@ async def process_figures_from_extracted_content(
290
300
291
301
logging .info (f"Figure Caption: { caption } " )
292
302
293
- image_processing_datas .append (
294
- (container , image_blob , caption , figure .spans [0 ])
303
+ figure_processing_datas .append (
304
+ (figure . id , container , image_blob , caption , figure .spans [0 ])
295
305
)
296
306
297
307
break
@@ -302,45 +312,59 @@ async def process_figures_from_extracted_content(
302
312
303
313
storage_account_helper = await get_storage_account_helper ()
304
314
305
- for image_processing_data , response in zip (image_processing_datas , image_responses ):
306
- container , image_blob , caption , _ = image_processing_data
315
+ for figure_processing_data , response in zip (
316
+ figure_processing_datas , image_responses
317
+ ):
318
+ _ , container , image_blob , caption , _ = figure_processing_data
307
319
base_64_image = base64 .b64encode (response ).decode ("utf-8" )
308
320
309
321
logging .info (f"Image Blob: { image_blob } " )
310
322
311
- image_understanding_tasks .append (
323
+ figure_understanding_tasks .append (
312
324
understand_image_with_gptv (base_64_image , caption )
313
325
)
314
326
315
327
image_data = base64 .b64decode (base_64_image )
316
328
317
- image_upload_tasks .append (
329
+ figure_upload_tasks .append (
318
330
storage_account_helper .upload_blob (
319
331
container , image_blob , image_data , "image/png"
320
332
)
321
333
)
322
334
335
+ figure_ids = [
336
+ figure_processing_data [0 ] for figure_processing_data in figure_processing_datas
337
+ ]
323
338
logging .info ("Running image understanding tasks" )
324
- image_descriptions = await asyncio .gather (* image_understanding_tasks )
339
+ figure_descriptions = await asyncio .gather (* figure_understanding_tasks )
325
340
logging .info ("Finished image understanding tasks" )
326
- logging .info (f"Image Descriptions: { image_descriptions } " )
341
+ logging .info (f"Image Descriptions: { figure_descriptions } " )
327
342
328
343
logging .info ("Running image upload tasks" )
329
- await asyncio .gather (* image_upload_tasks )
344
+ figure_uris = await asyncio .gather (* figure_upload_tasks )
330
345
logging .info ("Finished image upload tasks" )
331
346
347
+ figures = [
348
+ {"FigureId" : figure_id , "FigureUri" : figure_uri }
349
+ for figure_id , figure_uri in zip (figure_ids , figure_uris )
350
+ ]
351
+
332
352
running_offset = 0
333
- for image_processing_data , image_description in zip (
334
- image_processing_datas , image_descriptions
353
+ for figure_processing_data , figure_description in zip (
354
+ figure_processing_datas , figure_descriptions
335
355
):
336
- _ , _ , _ , figure_span = image_processing_data
356
+ figure_id , _ , _ , _ , figure_span = figure_processing_data
337
357
starting_offset = figure_span .offset + running_offset - page_offset
338
358
markdown_content , desc_offset = update_figure_description (
339
- markdown_content , image_description , starting_offset , figure_span .length
359
+ markdown_content ,
360
+ figure_id ,
361
+ figure_description ,
362
+ starting_offset ,
363
+ figure_span .length ,
340
364
)
341
365
running_offset += desc_offset
342
366
343
- return markdown_content
367
+ return markdown_content , figures
344
368
345
369
346
370
def create_page_wise_content (result : AnalyzeResult ) -> list :
@@ -570,9 +594,13 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
570
594
with concurrent .futures .ProcessPoolExecutor () as executor :
571
595
futures = {
572
596
executor .submit (
573
- clean_adi_markdown , page_content , page_number , True
574
- ): page_content
575
- for page_content , page_number in zip (
597
+ build_and_clean_markdown_for_response ,
598
+ extracted_page_content [0 ],
599
+ extracted_page_content [1 ],
600
+ page_number ,
601
+ True ,
602
+ ): extracted_page_content
603
+ for extracted_page_content , page_number in zip (
576
604
content_with_figures , page_numbers
577
605
)
578
606
}
@@ -582,7 +610,10 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
582
610
else :
583
611
markdown_content = result .content
584
612
585
- content_with_figures = await process_figures_from_extracted_content (
613
+ (
614
+ extracted_content ,
615
+ figures ,
616
+ ) = await process_figures_from_extracted_content (
586
617
result ,
587
618
operation_id ,
588
619
container_and_blob ,
@@ -591,8 +622,8 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
591
622
page_number = None ,
592
623
)
593
624
594
- cleaned_result = clean_adi_markdown (
595
- content_with_figures , remove_irrelevant_figures = True
625
+ cleaned_result = build_and_clean_markdown_for_response (
626
+ extracted_content , figures , remove_irrelevant_figures = True
596
627
)
597
628
except Exception as e :
598
629
logging .error (e )
0 commit comments