@@ -92,7 +92,7 @@ def clean_adi_markdown(
92
92
return output_dict
93
93
94
94
95
def update_figure_description(md_content, img_description, offset, length):
    """Replace a span of the Markdown content with a figure-description comment.

    The ``length`` characters starting at ``offset`` are replaced by an HTML
    comment of the form ``<!-- FigureContent="..." -->`` that carries the
    image description.

    Args:
    -----
        md_content (str): The Markdown content to update.
        img_description (str): The description to embed in the comment.
        offset (int): Index in ``md_content`` where the span to replace starts.
        length (int): Number of characters to replace.

    Returns:
    --------
        tuple[str, int]: The updated Markdown content and the length of the
        inserted comment. Note that this is the length of the inserted text,
        not the net change in content length (which is this value minus the
        number of characters actually replaced).
    """

    # Define the new string to replace the old content
    new_string = f'<!-- FigureContent="{img_description}" -->'

    # Intersect the requested span with the bounds of the content. Without
    # this, a negative start (or an end that crosses index 0) would be
    # interpreted as "count from the end" by slicing, making the prefix and
    # suffix overlap and duplicating or dropping text.
    content_length = len(md_content)
    start_index = min(max(offset, 0), content_length)
    end_index = min(max(offset + length, start_index), content_length)

    # Replace the old string with the new string
    new_md_content = md_content[:start_index] + new_string + md_content[end_index:]

    return new_md_content, len(new_string)
128
122
129
123
130
124
async def understand_image_with_gptv (image_base64 , caption , tries_left = 3 ):
@@ -260,7 +254,11 @@ async def mark_image_as_irrelevant():
260
254
261
255
262
256
async def process_figures_from_extracted_content (
263
- file_path : str , markdown_content : str , figures : list , page_number : None | int = None
257
+ file_path : str ,
258
+ markdown_content : str ,
259
+ figures : list ,
260
+ page_number : None | int = None ,
261
+ page_offset : int = 0 ,
264
262
) -> str :
265
263
"""Process the figures extracted from the content using ADI and send them for analysis.
266
264
@@ -270,6 +268,7 @@ async def process_figures_from_extracted_content(
270
268
markdown_content (str): The extracted content in Markdown format.
271
269
figures (list): The list of figures extracted by the Azure Document Intelligence service.
272
270
page_number (int): The page number to process. If None, all pages are processed.
271
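+ page_offset (int): The offset of this page's content within the full document content; it is subtracted from each figure's span offset to locate the figure inside markdown_content. Use 0 when markdown_content is the full document.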
273
272
274
273
Returns:
275
274
--------
@@ -313,10 +312,14 @@ async def process_figures_from_extracted_content(
313
312
314
313
logging .info (f"Image Descriptions: { image_descriptions } " )
315
314
316
- for idx , img_description in enumerate (image_descriptions ):
317
- markdown_content = update_figure_description (
318
- markdown_content , img_description , idx
315
+ running_offset = 0
316
+ for idx , figure in enumerate (figures ):
317
+ img_description = image_descriptions [idx ]
318
+ starting_offset = figure .spans [0 ].offset + running_offset - page_offset
319
+ markdown_content , desc_offset = update_figure_description (
320
+ markdown_content , img_description , starting_offset , figure .spans [0 ].length
319
321
)
322
+ running_offset += desc_offset
320
323
321
324
return markdown_content
322
325
def create_page_wise_content(result: "AnalyzeResult") -> tuple[list, list, list]:
    """Split the analysed document content into one entry per page.

    Args:
    -----
        result (AnalyzeResult): The analysis result. Its ``content`` is sliced
            using the first span of every entry in ``pages``.

    Returns:
    --------
        tuple[list, list, list]: Three parallel lists, one item per page:
        the page's content, the page's zero-based index in ``result.pages``,
        and the offset of the page's first span within ``result.content``.
    """

    page_wise_content = []
    page_numbers = []
    page_offsets = []

    for page_number, page in enumerate(result.pages):
        # Only the first span of each page is used to locate its content.
        first_span = page.spans[0]
        start = first_span["offset"]
        end = start + first_span["length"]

        page_wise_content.append(result.content[start:end])
        page_numbers.append(page_number)
        page_offsets.append(start)

    # The offsets must be returned too: callers unpack three values and use
    # the offset to translate document-level figure spans into page-level ones.
    return page_wise_content, page_numbers, page_offsets
347
352
@@ -496,15 +501,20 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
496
501
try :
497
502
if chunk_by_page :
498
503
cleaned_result = []
499
- markdown_content , page_numbers = create_page_wise_content (result )
504
+ markdown_content , page_numbers , page_offsets = create_page_wise_content (
505
+ result
506
+ )
500
507
content_with_figures_tasks = [
501
508
process_figures_from_extracted_content (
502
509
temp_file_path ,
503
510
page_content ,
504
511
result .figures ,
505
512
page_number = page_number ,
513
+ page_offset = page_offset ,
514
+ )
515
+ for page_content , page_number , page_offset in zip (
516
+ markdown_content , page_numbers , page_offsets
506
517
)
507
- for page_content , page_number in zip (markdown_content , page_numbers )
508
518
]
509
519
content_with_figures = await asyncio .gather (* content_with_figures_tasks )
510
520
@@ -523,7 +533,7 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
523
533
else :
524
534
markdown_content = result .content
525
535
content_with_figures = await process_figures_from_extracted_content (
526
- temp_file_path , markdown_content , result .figures
536
+ temp_file_path , markdown_content , result .figures , page_offset = 0
527
537
)
528
538
cleaned_result = clean_adi_markdown (
529
539
content_with_figures , remove_irrelevant_figures = False
0 commit comments