Skip to content

Commit a5a8722

Browse files
committed
Update code and flows
1 parent ecece23 commit a5a8722

File tree

7 files changed

+35
-21
lines changed

7 files changed

+35
-21
lines changed

deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -425,7 +425,7 @@ def get_figure_analysis_skill(self, chunk_by_page=False) -> WebApiSkill:
425425
]
426426

427427
if chunk_by_page:
428-
figure_context = "/document/page_wise_layout/*"
428+
figure_context = "/document/page_wise_layout/*/figures/*"
429429
inputs = [
430430
InputFieldMappingEntry(
431431
name="figure", source="/document/page_wise_layout/*/figures/*"

deploy_ai_search_indexes/src/deploy_ai_search_indexes/image_processing.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -250,11 +250,11 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
250250
name="DateLastModified", source="/document/DateLastModified"
251251
),
252252
InputFieldMappingEntry(
253-
name="PageNumber", source="/document/page_wise_layout/*/#"
253+
name="PageNumber", source="/document/page_wise_layout/*/page_number"
254254
),
255255
]
256256
else:
257-
source_context = "/document/chunks/*"
257+
source_context = "/document/chunk_mark_ups/*"
258258
mappings = [
259259
InputFieldMappingEntry(
260260
name="Chunk", source="/document/chunk_mark_ups/*/chunk_mark_up"
182 KB
Loading

image_processing/src/image_processing/figure_analysis.py

Lines changed: 15 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -197,10 +197,9 @@ async def analyse(self, record: dict) -> dict:
197197
- record (dict): The record containing the image, its caption, and the generated description.
198198
"""
199199

200-
logging.info(f"Record: {record}")
201-
figure = FigureHolder(**record["data"]["figure"])
202-
203200
try:
201+
logging.info(f"Record: {record}")
202+
figure = FigureHolder(**record["data"]["figure"])
204203
updated_data = await self.understand_image_with_gptv(figure)
205204
logging.info(f"Updated Figure Data: {updated_data}")
206205
except RetryError as e:
@@ -230,6 +229,19 @@ async def analyse(self, record: dict) -> dict:
230229
],
231230
"warnings": None,
232231
}
232+
except Exception as e:
233+
logging.error(f"Failed to analyse image. Error: {e}")
234+
logging.error(f"Failed input: {record}")
235+
return {
236+
"recordId": record["recordId"],
237+
"data": None,
238+
"errors": [
239+
{
240+
"message": "Failed to analyse image. Check the logs for more details.",
241+
}
242+
],
243+
"warnings": None,
244+
}
233245
else:
234246
return {
235247
"recordId": record["recordId"],

image_processing/src/image_processing/function_app.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ async def layout_analysis(req: func.HttpRequest) -> func.HttpResponse:
2222
values = req_body.get("values")
2323
adi_config = req.headers
2424

25-
page_wise = adi_config.get("page_wise", "False").lower() == "true"
25+
page_wise = adi_config.get("chunk_by_page", "False").lower() == "true"
2626
extract_figures = adi_config.get("extract_figures", "True").lower() == "true"
2727
logging.info(f"Chunk by Page: {page_wise}")
2828
except ValueError:

image_processing/src/image_processing/layout_holders.py

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,24 +1,26 @@
11
# Copyright (c) Microsoft Corporation. All rights reserved.
22
# Licensed under the MIT License.
33

4-
from pydantic import BaseModel, Field
4+
from pydantic import BaseModel, Field, ConfigDict
55
from typing import Optional
66

77

88
class FigureHolder(BaseModel):
99

1010
"""A class to hold the figure extracted from the document."""
1111

12-
figure_id: str
13-
container: str = Field(exclude=True)
14-
blob: str = Field(exclude=True)
15-
caption: Optional[str] = Field(default=None)
12+
figure_id: str = Field(..., alias="FigureId")
13+
container: Optional[str] = Field(exclude=True, default=None)
14+
blob: Optional[str] = Field(exclude=True, default=None)
15+
caption: Optional[str] = Field(default=None, alias="Caption")
1616
offset: int
1717
length: int
18-
page_number: Optional[int] = Field(default=None)
19-
uri: str
20-
description: Optional[str] = Field(default="")
21-
data: Optional[str] = Field(default=None)
18+
page_number: Optional[int] = Field(default=None, alias="PageNumber")
19+
uri: str = Field(..., alias="Uri")
20+
description: Optional[str] = Field(default="", alias="Description")
21+
data: Optional[str] = Field(default=None, alias="Data")
22+
23+
model_config = ConfigDict(populate_by_name=True)
2224

2325
@property
2426
def markdown(self) -> str:

image_processing/src/image_processing/mark_up_cleaner.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -94,18 +94,18 @@ def clean_text_and_extract_metadata(
9494
logging.error("Input text is empty")
9595
raise ValueError("Input text is empty")
9696

97-
return_record["marked_up_chunk"] = text
97+
return_record["chunk_mark_up"] = text
9898

9999
figure_ids = self.get_figure_ids(text)
100100

101-
return_record["sections"] = self.get_sections(text)
102-
return_record["figures"] = [
101+
return_record["chunk_sections"] = self.get_sections(text)
102+
return_record["chunk_figures"] = [
103103
figure.model_dump(by_alias=True)
104104
for figure in figures
105105
if figure.figure_id in figure_ids
106106
]
107107

108-
logging.info(f"Sections: {return_record['sections']}")
108+
logging.info(f"Sections: {return_record['chunk_sections']}")
109109

110110
# Define specific patterns for each tag
111111
tag_patterns = {
@@ -128,7 +128,7 @@ def clean_text_and_extract_metadata(
128128
logging.error("Cleaned text is empty")
129129
raise ValueError("Cleaned text is empty")
130130
else:
131-
return_record["cleaned_chunk"] = cleaned_text
131+
return_record["chunk_cleaned"] = cleaned_text
132132
except Exception as e:
133133
logging.error(f"An error occurred in clean_text_and_extract_metadata: {e}")
134134
return ""

0 commit comments

Comments
 (0)