Skip to content

Commit bde3d6d

Browse files
committed
Fix section bugs
1 parent 23ec029 commit bde3d6d

File tree

4 files changed

+37
-24
lines changed

4 files changed

+37
-24
lines changed

ai_search_with_adi/adi_function_app/adi_2_ai_search.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def clean_adi_markdown(
6969
comment_patterns = r"<!-- PageNumber=\"\d+\" -->|<!-- PageHeader=\".*?\" -->|<!-- PageFooter=\".*?\" -->|<!-- PageBreak -->"
7070
cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL)
7171

72-
combined_pattern = r"(.*?)\n===|\n# (.*?)\n|\n## ?(.*?)\n|\n### ?(.*?)\n|\n#### ?(.*?)\n|\n##### ?(.*?)\n|\n###### ?(.*?)\n"
72+
combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n"
7373
doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL)
7474
doc_metadata = [match for group in doc_metadata for match in group if match]
7575

@@ -170,6 +170,8 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3):
170170
171171
If the image is a diagram, you should describe the components, relationships, and any other relevant information that can be inferred from the diagram.
172172
173+
Include any data points, labels, and other relevant information that can be inferred from the image.
174+
173175
IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'."""
174176

175177
user_input = "Describe this image with technical analysis. Provide a well-structured, description."
@@ -255,6 +257,10 @@ def pil_image_to_base64(image, image_format="JPEG"):
255257
return base64.b64encode(buffered.getvalue()).decode("utf-8")
256258

257259

260+
async def mark_image_as_irrelevant():
261+
return "Irrelevant Image"
262+
263+
258264
async def process_figures_from_extracted_content(
259265
file_path: str, markdown_content: str, figures: list, page_number: None | int = None
260266
) -> str:
@@ -270,6 +276,8 @@ async def process_figures_from_extracted_content(
270276
Returns:
271277
--------
272278
str: The updated Markdown content with the figure descriptions."""
279+
280+
image_understanding_tasks = []
273281
for idx, figure in enumerate(figures):
274282
img_description = ""
275283
logging.debug(f"Figure #{idx} has the following spans: {figure.spans}")
@@ -293,16 +301,19 @@ async def process_figures_from_extracted_content(
293301
) # page_number is 1-indexed
294302

295303
if cropped_image is None:
296-
img_description += "Irrelevant Image"
304+
image_understanding_tasks.append(mark_image_as_irrelevant())
297305
else:
298306
image_base64 = pil_image_to_base64(cropped_image)
299307

300-
img_description = await understand_image_with_gptv(
301-
image_base64, figure.caption.content
308+
image_understanding_tasks.append(
309+
understand_image_with_gptv(image_base64, figure.caption.content)
302310
)
303311
logging.info(f"\tDescription of figure {idx}: {img_description}")
304312
break
305313

314+
image_descriptions = await asyncio.gather(*image_understanding_tasks)
315+
316+
for idx, img_description in enumerate(image_descriptions):
306317
markdown_content = update_figure_description(
307318
markdown_content, img_description, idx
308319
)

ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,17 @@ def get_section(cleaned_text: str) -> list:
2121
list: The sections related to text
2222
2323
"""
24-
combined_pattern = r"(.*?)\n===|\n## ?(.*?)\n|\n### ?(.*?)\n"
24+
combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n"
2525
doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL)
2626
doc_metadata = [match for group in doc_metadata for match in group if match]
27-
return doc_metadata
27+
return clean_sections(doc_metadata)
28+
29+
30+
def clean_sections(sections: list) -> list:
31+
"""Cleans the sections by removing special characters and extra white spaces."""
32+
cleaned_sections = [re.sub(r"[=#]", "", match).strip() for match in sections]
33+
34+
return cleaned_sections
2835

2936

3037
def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
@@ -120,16 +127,17 @@ async def process_pre_embedding_cleaner(record: dict) -> dict:
120127
record["data"]["chunk"]["content"]
121128
)
122129
cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"]
123-
cleaned_record["data"]["section"] = record["data"]["chunk"]["section"]
124-
cleaned_record["data"]["page_number"] = record["data"]["chunk"][
125-
"page_number"
126-
]
130+
cleaned_record["data"]["sections"] = clean_sections(
131+
record["data"]["chunk"]["sections"]
132+
)
127133
else:
128134
cleaned_record["data"]["cleaned_chunk"] = clean_text(
129135
record["data"]["chunk"]
130136
)
131137
cleaned_record["data"]["chunk"] = record["data"]["chunk"]
132-
cleaned_record["data"]["section"] = get_section(record["data"]["chunk"])
138+
cleaned_record["data"]["cleaned_sections"] = get_section(
139+
record["data"]["chunk"]
140+
)
133141

134142
except Exception as e:
135143
logging.error("string cleanup Error: %s", e)

ai_search_with_adi/ai_search/ai_search.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection:
175175
return data_source_connection
176176

177177
def get_pre_embedding_cleaner_skill(
178-
self, context, source, chunk_by_page=False, target_name="cleaned_chunk"
178+
self, context, source, target_name="cleaned_chunk"
179179
) -> WebApiSkill:
180180
"""Get the custom skill for data cleanup.
181181
@@ -203,18 +203,11 @@ def get_pre_embedding_cleaner_skill(
203203
pre_embedding_cleaner_skill_outputs = [
204204
OutputFieldMappingEntry(name="cleaned_chunk", target_name=target_name),
205205
OutputFieldMappingEntry(name="chunk", target_name="chunk"),
206-
OutputFieldMappingEntry(name="section", target_name="section"),
206+
OutputFieldMappingEntry(
207+
name="cleaned_sections", target_name="cleaned_sections"
208+
),
207209
]
208210

209-
if chunk_by_page:
210-
pre_embedding_cleaner_skill_outputs.extend(
211-
[
212-
OutputFieldMappingEntry(
213-
name="page_number", target_name="page_number"
214-
),
215-
]
216-
)
217-
218211
pre_embedding_cleaner_skill = WebApiSkill(
219212
name="Pre Embedding Cleaner Skill",
220213
description="Skill to clean the data before sending to embedding",
@@ -277,8 +270,9 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill:
277270
batch_size = 1
278271
degree_of_parallelism = 4
279272
else:
273+
# Depending on your GPT Token limit, you may need to adjust the batch size and degree of parallelism
280274
batch_size = 1
281-
degree_of_parallelism = 16
275+
degree_of_parallelism = 8
282276

283277
if chunk_by_page:
284278
output = [

ai_search_with_adi/ai_search/rag_documents.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjections:
191191
name="Keywords", source="/document/pages/*/keywords"
192192
),
193193
InputFieldMappingEntry(
194-
name="Sections", source="/document/pages/*/sections"
194+
name="Sections", source="/document/pages/*/cleaned_sections"
195195
),
196196
]
197197

0 commit comments

Comments
 (0)