Skip to content

Commit 3928edb

Browse files
committed
Handle new way of finding figures
1 parent 4ecf2c8 commit 3928edb

File tree

1 file changed

+35
-25
lines changed

1 file changed

+35
-25
lines changed

ai_search_with_adi/adi_function_app/adi_2_ai_search.py

Lines changed: 35 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def clean_adi_markdown(
9292
return output_dict
9393

9494

95-
def update_figure_description(md_content, img_description, idx):
95+
def update_figure_description(md_content, img_description, offset, length):
9696
"""
9797
Updates the figure description in the Markdown content.
9898
@@ -105,26 +105,20 @@ def update_figure_description(md_content, img_description, idx):
105105
str: The updated Markdown content with the new figure description.
106106
"""
107107

108-
# The substring you're looking for
109-
start_substring = f"![](figures/{idx})"
110-
end_substring = "</figure>"
108+
# Define the new string to replace the old content
111109
new_string = f'<!-- FigureContent="{img_description}" -->'
112110

113-
new_md_content = md_content
114-
# Find the start and end indices of the part to replace
115-
start_index = md_content.find(start_substring)
116-
if start_index != -1: # if start_substring is found
117-
start_index += len(
118-
start_substring
119-
) # move the index to the end of start_substring
120-
end_index = md_content.find(end_substring, start_index)
121-
if end_index != -1: # if end_substring is found
122-
# Replace the old string with the new string
123-
new_md_content = (
124-
md_content[:start_index] + new_string + md_content[end_index:]
125-
)
111+
# Calculate the end index of the content to be replaced
112+
end_index = offset + length
113+
114+
# Ensure that the end_index does not exceed the length of the Markdown content
115+
if end_index > len(md_content):
116+
end_index = len(md_content)
117+
118+
# Replace the old string with the new string
119+
new_md_content = md_content[:offset] + new_string + md_content[end_index:]
126120

127-
return new_md_content
121+
return new_md_content, len(new_string)
128122

129123

130124
async def understand_image_with_gptv(image_base64, caption, tries_left=3):
@@ -260,7 +254,11 @@ async def mark_image_as_irrelevant():
260254

261255

262256
async def process_figures_from_extracted_content(
263-
file_path: str, markdown_content: str, figures: list, page_number: None | int = None
257+
file_path: str,
258+
markdown_content: str,
259+
figures: list,
260+
page_number: None | int = None,
261+
page_offset: int = 0,
264262
) -> str:
265263
"""Process the figures extracted from the content using ADI and send them for analysis.
266264
@@ -270,6 +268,7 @@ async def process_figures_from_extracted_content(
270268
markdown_content (str): The extracted content in Markdown format.
271269
figures (list): The list of figures extracted by the Azure Document Intelligence service.
272270
page_number (int): The page number to process. If None, all pages are processed.
271+
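page_offset (int): The offset at which this page's content starts within the full document content. It is subtracted from each figure's span offset so that the offset is relative to markdown_content. Defaults to 0.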
273272
274273
Returns:
275274
--------
@@ -313,10 +312,14 @@ async def process_figures_from_extracted_content(
313312

314313
logging.info(f"Image Descriptions: {image_descriptions}")
315314

316-
for idx, img_description in enumerate(image_descriptions):
317-
markdown_content = update_figure_description(
318-
markdown_content, img_description, idx
315+
running_offset = 0
316+
for idx, figure in enumerate(figures):
317+
img_description = image_descriptions[idx]
318+
starting_offset = figure.spans[0].offset + running_offset - page_offset
319+
markdown_content, desc_offset = update_figure_description(
320+
markdown_content, img_description, starting_offset, figure.spans[0].length
319321
)
322+
running_offset += desc_offset
320323

321324
return markdown_content
322325

@@ -335,13 +338,15 @@ def create_page_wise_content(result: AnalyzeResult) -> list:
335338

336339
page_wise_content = []
337340
page_numbers = []
341+
page_offsets = []
338342

339343
for page_number, page in enumerate(result.pages):
340344
page_content = result.content[
341345
page.spans[0]["offset"] : page.spans[0]["offset"] + page.spans[0]["length"]
342346
]
343347
page_wise_content.append(page_content)
344348
page_numbers.append(page_number)
349+
page_offsets.append(page.spans[0]["offset"])
345350

346351
return page_wise_content, page_numbers
347352

@@ -496,15 +501,20 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
496501
try:
497502
if chunk_by_page:
498503
cleaned_result = []
499-
markdown_content, page_numbers = create_page_wise_content(result)
504+
markdown_content, page_numbers, page_offsets = create_page_wise_content(
505+
result
506+
)
500507
content_with_figures_tasks = [
501508
process_figures_from_extracted_content(
502509
temp_file_path,
503510
page_content,
504511
result.figures,
505512
page_number=page_number,
513+
page_offset=page_offset,
514+
)
515+
for page_content, page_number, page_offset in zip(
516+
markdown_content, page_numbers, page_offsets
506517
)
507-
for page_content, page_number in zip(markdown_content, page_numbers)
508518
]
509519
content_with_figures = await asyncio.gather(*content_with_figures_tasks)
510520

@@ -523,7 +533,7 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
523533
else:
524534
markdown_content = result.content
525535
content_with_figures = await process_figures_from_extracted_content(
526-
temp_file_path, markdown_content, result.figures
536+
temp_file_path, markdown_content, result.figures, page_offset=0
527537
)
528538
cleaned_result = clean_adi_markdown(
529539
content_with_figures, remove_irrelevant_figures=False

0 commit comments

Comments
 (0)