Skip to content

Commit 9c3d28c

Browse files
committed
Update pipeline
1 parent 6834517 commit 9c3d28c

File tree

3 files changed

+22
-17
lines changed

3 files changed

+22
-17
lines changed

adi_function_app/pre_embedding_cleaner.py

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -29,9 +29,9 @@ def get_section(cleaned_text: str) -> list:
2929

3030
def clean_sections(sections: list) -> list:
3131
"""Cleans the sections by removing special characters and extra white spaces."""
32-
cleaned_sections = [re.sub(r"[=#]", "", match).strip() for match in sections]
32+
cleanedSections = [re.sub(r"[=#]", "", match).strip() for match in sections]
3333

34-
return cleaned_sections
34+
return cleanedSections
3535

3636

3737
def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
@@ -123,19 +123,17 @@ async def process_pre_embedding_cleaner(record: dict) -> dict:
123123

124124
# scenarios when page by chunking is enabled
125125
if isinstance(record["data"]["chunk"], dict):
126-
cleaned_record["data"]["cleaned_chunk"] = clean_text(
126+
cleaned_record["data"]["cleanedChunk"] = clean_text(
127127
record["data"]["chunk"]["content"]
128128
)
129129
cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"]
130-
cleaned_record["data"]["cleaned_sections"] = clean_sections(
130+
cleaned_record["data"]["cleanedSections"] = clean_sections(
131131
record["data"]["chunk"]["sections"]
132132
)
133133
else:
134-
cleaned_record["data"]["cleaned_chunk"] = clean_text(
135-
record["data"]["chunk"]
136-
)
134+
cleaned_record["data"]["cleanedChunk"] = clean_text(record["data"]["chunk"])
137135
cleaned_record["data"]["chunk"] = record["data"]["chunk"]
138-
cleaned_record["data"]["cleaned_sections"] = get_section(
136+
cleaned_record["data"]["cleanedSections"] = get_section(
139137
record["data"]["chunk"]
140138
)
141139

deploy_ai_search/ai_search.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -195,7 +195,7 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection:
195195
return data_source_connection
196196

197197
def get_pre_embedding_cleaner_skill(
198-
self, context, source, target_name="cleaned_chunk"
198+
self, context, source, target_name="cleanedChunk"
199199
) -> WebApiSkill:
200200
"""Get the custom skill for data cleanup.
201201
@@ -221,10 +221,10 @@ def get_pre_embedding_cleaner_skill(
221221
]
222222

223223
pre_embedding_cleaner_skill_outputs = [
224-
OutputFieldMappingEntry(name="cleaned_chunk", target_name=target_name),
224+
OutputFieldMappingEntry(name="cleanedChunk", target_name=target_name),
225225
OutputFieldMappingEntry(name="chunk", target_name="chunk"),
226226
OutputFieldMappingEntry(
227-
name="cleaned_sections", target_name="cleaned_sections"
227+
name="cleanedSections", target_name="cleanedSections"
228228
),
229229
]
230230

deploy_ai_search/rag_documents.py

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -174,11 +174,11 @@ def get_skills(self) -> list:
174174
)
175175

176176
key_phrase_extraction_skill = self.get_key_phrase_extraction_skill(
177-
"/document/pages/*", "/document/pages/*/cleaned_chunk"
177+
"/document/pages/*", "/document/pages/*/cleanedChunk"
178178
)
179179

180180
embedding_skill = self.get_vector_skill(
181-
"/document/pages/*", "/document/pages/*/cleaned_chunk"
181+
"/document/pages/*", "/document/pages/*/cleanedChunk"
182182
)
183183

184184
if self.enable_page_by_chunking:
@@ -213,22 +213,29 @@ def get_index_projections(self) -> SearchIndexerIndexProjections:
213213
name="Keywords", source="/document/pages/*/keywords"
214214
),
215215
InputFieldMappingEntry(
216-
name="Sections", source="/document/pages/*/cleaned_sections"
216+
name="Sections", source="/document/pages/*/cleanedSections"
217217
),
218218
InputFieldMappingEntry(
219-
name="Figures", source="/document/pages/*/cleaned_sections"
219+
name="Figures",
220+
inputs=[
221+
InputFieldMappingEntry(
222+
name="FigureID", source="/document/pages/*/figures/figureId"
223+
),
224+
InputFieldMappingEntry(
225+
name="FigureUri", source="/document/pages/*/figures/figureUri"
226+
),
227+
],
220228
),
221229
InputFieldMappingEntry(
222230
name="DateLastModified", source="/document/DateLastModified"
223231
),
224-
InputFieldMappingEntry(name="Figures", source="/document/pages/*/figures"),
225232
]
226233

227234
if self.enable_page_by_chunking:
228235
mappings.extend(
229236
[
230237
InputFieldMappingEntry(
231-
name="PageNumber", source="/document/pages/*/page_number"
238+
name="PageNumber", source="/document/pages/*/pageNumber"
232239
)
233240
]
234241
)

0 commit comments

Comments
 (0)