
Commit 6fb785c

Return Source Uris for Figures (#22)

* Update adi
* Add last modified date
* Update figure understanding code
* Update the prompt
1 parent 1b6edab commit 6fb785c

File tree

9 files changed: +155 -58 lines changed

adi_function_app/README.md

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ This method takes the detected figures, and crops them out of the page to save t
 
 `update_figure_description` is used to update the original Markdown content with the description and meaning of the figure.
 
-##### clean_adi_markdown
+##### build_and_clean_markdown_for_response
 
 This method performs the final cleaning of the Markdown contents. In this method, the section headings and page numbers are extracted for the content to be returned to the indexer.

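For reference, a minimal sketch of the dict that build_and_clean_markdown_for_response now returns, with field names taken from the diff below (the sample values are invented):

    {
        "content": "...cleaned Markdown...",
        "sections": ["Section 1"],
        "figures": [{"figureId": "1.1", "figureUri": "https://<account>.blob.core.windows.net/..."}],
        "pageNumber": 1,  # only present when chunking by page is enabled
    }
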
adi_function_app/adi_2_ai_search.py

Lines changed: 70 additions & 37 deletions
@@ -17,12 +17,16 @@
 import concurrent.futures
 import json
 from openai import AsyncAzureOpenAI
+from typing import Union
 import openai
 from environment import IdentityType, get_identity_type
 
 
-def clean_adi_markdown(
-    markdown_text: str, page_no: int = None, remove_irrelevant_figures=False
+def build_and_clean_markdown_for_response(
+    markdown_text: str,
+    figures: dict,
+    page_no: int = None,
+    remove_irrelevant_figures=False,
 ):
     """Clean Markdown text extracted by the Azure Document Intelligence service.
@@ -62,28 +66,33 @@ def clean_adi_markdown(
     output_dict["content"] = cleaned_text
     output_dict["sections"] = doc_metadata
 
+    output_dict["figures"] = figures
+
     # add page number when chunk by page is enabled
     if page_no is not None:
-        output_dict["page_number"] = page_no
+        output_dict["pageNumber"] = page_no
 
     return output_dict
 
 
-def update_figure_description(md_content, img_description, offset, length):
+def update_figure_description(
+    md_content: str, figure_id: str, img_description: str, offset: int, length: int
+):
     """
     Updates the figure description in the Markdown content.
 
     Args:
         md_content (str): The original Markdown content.
         img_description (str): The new description for the image.
-        idx (int): The index of the figure.
+        offset (int): Position offset in the text.
+        length (int): Length of the original figure in the text.
 
     Returns:
         str: The updated Markdown content with the new figure description.
     """
 
     # Define the new string to replace the old content
-    new_string = f'<!-- FigureContent="{img_description}" -->'
+    new_string = f'<!-- FigureId="{figure_id}" FigureContent="{img_description}" -->'
 
     # Calculate the end index of the content to be replaced
     end_index = offset + length
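
The offset arithmetic is the heart of this function. A standalone sketch of the splice (my simplified re-implementation, not the repository's exact code; in particular, returning the length delta is an assumption based on how running_offset is accumulated further down):

    def splice_figure_description(
        md_content: str, figure_id: str, img_description: str, offset: int, length: int
    ) -> tuple[str, int]:
        # Build the replacement comment, now tagged with the figure's id
        new_string = f'<!-- FigureId="{figure_id}" FigureContent="{img_description}" -->'
        # Replace the original figure span [offset, offset + length) with the comment
        updated = md_content[:offset] + new_string + md_content[offset + length :]
        # Report how much the document grew or shrank so later spans can be shifted
        return updated, len(new_string) - length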
@@ -131,22 +140,24 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3):
     token_provider = None
     api_key = os.environ["OpenAI__ApiKey"]
 
-    system_prompt = """You are an expert in image analysis. Use your experience and skills to provide a detailed description of any provided images. You should FOCUS on what info can be inferred from the image and the meaning of the data inside the image. Draw actionable insights and conclusions from the image.
+    system_prompt = """You are an expert in technical image analysis. Your task is to provide analysis of images. You should FOCUS on what info can be inferred from the image and the meaning of the data inside the image. Draw actionable insights and conclusions from the image. Do not describe the image in a general way, or in a way that is not useful for decision-making.
 
-    If the image is a chart for instance, you should describe the data trends, patterns, and insights that can be drawn from the chart.
+    If the image is a chart for instance, you should describe the data trends, patterns, and insights that can be drawn from the chart. For example, you could describe the increase or decrease in sales over time, the peak sales period, or the sales performance of a particular product.
 
     If the image is a map, you should describe the geographical features, landmarks, and any other relevant information that can be inferred from the map.
 
     If the image is a diagram, you should describe the components, relationships, and any other relevant information that can be inferred from the diagram.
 
     Include any data points, labels, and other relevant information that can be inferred from the image.
 
+    Provide a well-structured, detailed, and actionable analysis of the image. Focus on extracting data and information that can be inferred from the image.
+
     IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'."""
 
-    user_input = "Describe this image with technical analysis. Provide a well-structured description."
+    user_input = "Perform technical analysis on this image. Provide a well-structured description."
 
     if caption is not None and len(caption) > 0:
-        user_input += f" (note: it has image caption: {caption})"
+        user_input += f" (note: it has the following caption: {caption})"
 
     try:
         async with AsyncAzureOpenAI(
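
For context, the prompt above is sent to a GPT-4 vision deployment roughly as follows (a self-contained sketch: the endpoint, key, and deployment name are placeholders, and the repository's retry and managed-identity handling is omitted):

    from openai import AsyncAzureOpenAI

    async def describe_figure(image_base64: str, system_prompt: str, user_input: str) -> str:
        client = AsyncAzureOpenAI(
            azure_endpoint="https://<your-resource>.openai.azure.com/",  # placeholder
            api_key="<your-key>",  # placeholder
            api_version="2024-02-01",
        )
        response = await client.chat.completions.create(
            model="<your-multimodal-deployment>",  # placeholder deployment name
            messages=[
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": user_input},
                        # The image travels inline as a base64 data URL
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                        },
                    ],
                },
            ],
        )
        return response.choices[0].message.content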
@@ -244,7 +255,7 @@ async def process_figures_from_extracted_content(
     markdown_content: str,
     page_number: None | int = None,
     page_offset: int = 0,
-) -> str:
+) -> Union[str, dict]:
     """Process the figures extracted from the content using ADI and send them for analysis.
 
     Args:
@@ -258,12 +269,13 @@ async def process_figures_from_extracted_content(
     Returns:
     --------
-        str: The updated Markdown content with the figure descriptions."""
+        str: The updated Markdown content with the figure descriptions.
+        dict: A mapping of the FigureId to the stored Uri in blob storage."""
 
-    image_processing_datas = []
+    figure_processing_datas = []
     download_image_tasks = []
-    image_understanding_tasks = []
-    image_upload_tasks = []
+    figure_understanding_tasks = []
+    figure_upload_tasks = []
 
     if result.figures:
         for figure in result.figures:
@@ -290,8 +302,8 @@ async def process_figures_from_extracted_content(
 
             logging.info(f"Figure Caption: {caption}")
 
-            image_processing_datas.append(
-                (container, image_blob, caption, figure.spans[0])
+            figure_processing_datas.append(
+                (figure.id, container, image_blob, caption, figure.spans[0])
             )
 
             break
@@ -302,45 +314,59 @@
 
     storage_account_helper = await get_storage_account_helper()
 
-    for image_processing_data, response in zip(image_processing_datas, image_responses):
-        container, image_blob, caption, _ = image_processing_data
+    for figure_processing_data, response in zip(
+        figure_processing_datas, image_responses
+    ):
+        _, container, image_blob, caption, _ = figure_processing_data
         base_64_image = base64.b64encode(response).decode("utf-8")
 
         logging.info(f"Image Blob: {image_blob}")
 
-        image_understanding_tasks.append(
+        figure_understanding_tasks.append(
             understand_image_with_gptv(base_64_image, caption)
         )
 
         image_data = base64.b64decode(base_64_image)
 
-        image_upload_tasks.append(
+        figure_upload_tasks.append(
             storage_account_helper.upload_blob(
                 container, image_blob, image_data, "image/png"
             )
         )
 
+    figure_ids = [
+        figure_processing_data[0] for figure_processing_data in figure_processing_datas
+    ]
     logging.info("Running image understanding tasks")
-    image_descriptions = await asyncio.gather(*image_understanding_tasks)
+    figure_descriptions = await asyncio.gather(*figure_understanding_tasks)
     logging.info("Finished image understanding tasks")
-    logging.info(f"Image Descriptions: {image_descriptions}")
+    logging.info(f"Image Descriptions: {figure_descriptions}")
 
     logging.info("Running image upload tasks")
-    await asyncio.gather(*image_upload_tasks)
+    figure_uris = await asyncio.gather(*figure_upload_tasks)
     logging.info("Finished image upload tasks")
 
+    figures = [
+        {"figureId": figure_id, "figureUri": figure_uri}
+        for figure_id, figure_uri in zip(figure_ids, figure_uris)
+    ]
+
     running_offset = 0
-    for image_processing_data, image_description in zip(
-        image_processing_datas, image_descriptions
+    for figure_processing_data, figure_description in zip(
+        figure_processing_datas, figure_descriptions
     ):
-        _, _, _, figure_span = image_processing_data
+        figure_id, _, _, _, figure_span = figure_processing_data
         starting_offset = figure_span.offset + running_offset - page_offset
         markdown_content, desc_offset = update_figure_description(
-            markdown_content, image_description, starting_offset, figure_span.length
+            markdown_content,
+            figure_id,
+            figure_description,
+            starting_offset,
+            figure_span.length,
         )
         running_offset += desc_offset
 
-    return markdown_content
+    return markdown_content, figures
 
 
 def create_page_wise_content(result: AnalyzeResult) -> list:
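
The figureId/figureUri pairing above relies on asyncio.gather preserving the order of its awaitables, so figure_ids[i] lines up with figure_uris[i]. A tiny self-contained illustration of the pattern (a stub coroutine stands in for storage_account_helper.upload_blob):

    import asyncio

    async def fake_upload(blob_name: str) -> str:
        # Stand-in for upload_blob; returns a URL like blob_client.url does
        return f"https://<account>.blob.core.windows.net/figures/{blob_name}"

    async def main() -> None:
        figure_ids = ["1.1", "2.3"]
        upload_tasks = [fake_upload(f"{figure_id}.png") for figure_id in figure_ids]
        figure_uris = await asyncio.gather(*upload_tasks)  # same order as upload_tasks
        figures = [
            {"figureId": figure_id, "figureUri": figure_uri}
            for figure_id, figure_uri in zip(figure_ids, figure_uris)
        ]
        print(figures)

    asyncio.run(main())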
@@ -359,12 +385,12 @@ def create_page_wise_content(result: AnalyzeResult) -> list:
     page_numbers = []
     page_offsets = []
 
-    for page_number, page in enumerate(result.pages):
+    for page in result.pages:
         page_content = result.content[
             page.spans[0]["offset"] : page.spans[0]["offset"] + page.spans[0]["length"]
         ]
         page_wise_content.append(page_content)
-        page_numbers.append(page_number + 1)
+        page_numbers.append(page.page_number)
         page_offsets.append(page.spans[0]["offset"])
 
     return page_wise_content, page_numbers, page_offsets
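
Switching from enumerate to page.page_number matters whenever page numbering does not start at 1 (for example, when analysis covers a page range). A toy illustration of the span-based slicing, with plain dicts standing in for the ADI page objects:

    content = "Page one text.Page two text."
    pages = [
        {"page_number": 4, "spans": [{"offset": 0, "length": 14}]},
        {"page_number": 5, "spans": [{"offset": 14, "length": 14}]},
    ]
    for page in pages:
        span = page["spans"][0]
        page_content = content[span["offset"] : span["offset"] + span["length"]]
        print(page["page_number"], repr(page_content))
    # 4 'Page one text.'
    # 5 'Page two text.'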
@@ -570,9 +596,13 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
         with concurrent.futures.ProcessPoolExecutor() as executor:
             futures = {
                 executor.submit(
-                    clean_adi_markdown, page_content, page_number, True
-                ): page_content
-                for page_content, page_number in zip(
+                    build_and_clean_markdown_for_response,
+                    extracted_page_content[0],
+                    extracted_page_content[1],
+                    page_number,
+                    True,
+                ): extracted_page_content
+                for extracted_page_content, page_number in zip(
                     content_with_figures, page_numbers
                 )
             }
@@ -582,7 +612,10 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
     else:
         markdown_content = result.content
 
-        content_with_figures = await process_figures_from_extracted_content(
+        (
+            extracted_content,
+            figures,
+        ) = await process_figures_from_extracted_content(
             result,
             operation_id,
             container_and_blob,
@@ -591,8 +624,8 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
             page_number=None,
         )
 
-        cleaned_result = clean_adi_markdown(
-            content_with_figures, remove_irrelevant_figures=True
+        cleaned_result = build_and_clean_markdown_for_response(
+            extracted_content, figures, remove_irrelevant_figures=True
        )
    except Exception as e:
        logging.error(e)

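Worth noting: despite the Union[str, dict] annotation, process_figures_from_extracted_content now returns a two-element tuple, which is why both call sites unpack it (tuple[str, dict] would be the more precise annotation). In outline, the non-chunked flow is:

    extracted_content, figures = await process_figures_from_extracted_content(
        result, operation_id, container_and_blob, markdown_content, page_number=None
    )
    cleaned_result = build_and_clean_markdown_for_response(
        extracted_content, figures, remove_irrelevant_figures=True
    )
    # cleaned_result["figures"] now carries [{"figureId": ..., "figureUri": ...}, ...]
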
adi_function_app/pre_embedding_cleaner.py

Lines changed: 7 additions & 8 deletions
@@ -8,6 +8,7 @@
 
 nltk.download("punkt")
 nltk.download("stopwords")
+nltk.download("punkt_tab")
 
 
 def get_section(cleaned_text: str) -> list:
@@ -29,9 +30,9 @@ def get_section(cleaned_text: str) -> list:
 
 def clean_sections(sections: list) -> list:
     """Cleans the sections by removing special characters and extra white spaces."""
-    cleaned_sections = [re.sub(r"[=#]", "", match).strip() for match in sections]
+    cleanedSections = [re.sub(r"[=#]", "", match).strip() for match in sections]
 
-    return cleaned_sections
+    return cleanedSections
 
 
 def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
@@ -123,19 +124,17 @@ async def process_pre_embedding_cleaner(record: dict) -> dict:
 
     # scenarios when page by chunking is enabled
     if isinstance(record["data"]["chunk"], dict):
-        cleaned_record["data"]["cleaned_chunk"] = clean_text(
+        cleaned_record["data"]["cleanedChunk"] = clean_text(
             record["data"]["chunk"]["content"]
         )
         cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"]
-        cleaned_record["data"]["cleaned_sections"] = clean_sections(
+        cleaned_record["data"]["cleanedSections"] = clean_sections(
             record["data"]["chunk"]["sections"]
         )
     else:
-        cleaned_record["data"]["cleaned_chunk"] = clean_text(
-            record["data"]["chunk"]
-        )
+        cleaned_record["data"]["cleanedChunk"] = clean_text(record["data"]["chunk"])
         cleaned_record["data"]["chunk"] = record["data"]["chunk"]
-        cleaned_record["data"]["cleaned_sections"] = get_section(
+        cleaned_record["data"]["cleanedSections"] = get_section(
             record["data"]["chunk"]
         )

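The camelCase rename here has to line up with the skillset output mappings in deploy_ai_search/ai_search.py below. A sketch of the record shape in the page-chunked case (the envelope and sample values are assumptions; the key names come from the diff):

    record = {
        "recordId": "0",  # assumed custom-skill envelope field
        "data": {
            "chunk": {
                "content": "# Section 1\nSome text...",
                "sections": ["## Section 1"],
            }
        },
    }
    # After process_pre_embedding_cleaner, record["data"] would roughly carry:
    # {
    #     "cleanedChunk": ...,               # output of clean_text on the content
    #     "chunk": "# Section 1\nSome text...",
    #     "cleanedSections": ["Section 1"],  # "#" and "=" stripped by clean_sections
    # }
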
adi_function_app/storage_account.py

Lines changed: 6 additions & 1 deletion
@@ -58,7 +58,10 @@ async def upload_blob(
         Args:
             container (str): The container of the blob.
             blob (str): The blob name.
-            data (bytes): The data to upload."""
+            data (bytes): The data to upload.
+
+        Returns:
+            str: The URL of the uploaded blob."""
 
         logging.info("Uploading Blob...")
         logging.info(f"Container: {container}")
@@ -76,6 +79,8 @@ async def upload_blob(
             content_type=content_type,
         )
 
+        return blob_client.url
+
     async def download_blob_to_temp_dir(
         self, source: str, container: str, target_file_name
     ) -> tuple[str, dict]:

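A short usage sketch of the new return value (container and blob names are invented; this must run in an async context):

    image_url = await storage_account_helper.upload_blob(
        "documents", "figures/1.1.png", image_data, "image/png"
    )
    # image_url is blob_client.url, e.g.
    # "https://<account>.blob.core.windows.net/documents/figures/1.1.png"
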
deploy_ai_search/ai_search.py

Lines changed: 4 additions & 7 deletions
@@ -194,16 +194,13 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection:
 
         return data_source_connection
 
-    def get_pre_embedding_cleaner_skill(
-        self, context, source, target_name="cleaned_chunk"
-    ) -> WebApiSkill:
+    def get_pre_embedding_cleaner_skill(self, context, source) -> WebApiSkill:
         """Get the custom skill for data cleanup.
 
         Args:
         -----
             context (str): The context of the skill
-            inputs (List[InputFieldMappingEntry]): The inputs of the skill
-            outputs (List[OutputFieldMappingEntry]): The outputs of the skill
+            source (str): The source of the skill
 
         Returns:
         --------
@@ -221,10 +218,10 @@ def get_pre_embedding_cleaner_skill(
         ]
 
         pre_embedding_cleaner_skill_outputs = [
-            OutputFieldMappingEntry(name="cleaned_chunk", target_name=target_name),
+            OutputFieldMappingEntry(name="cleanedChunk", target_name="cleanedChunk"),
             OutputFieldMappingEntry(name="chunk", target_name="chunk"),
             OutputFieldMappingEntry(
-                name="cleaned_sections", target_name="cleaned_sections"
+                name="cleanedSections", target_name="cleanedSections"
             ),
         ]