Skip to content

Commit 869f098

Browse files
committed
Update mapping
1 parent f23126f commit 869f098

File tree

2 files changed

+42
-29
lines changed

2 files changed

+42
-29
lines changed

deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,11 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
219219
mark_up_cleaner_context = "/document/page_wise_layout/*"
220220
inputs = [
221221
InputFieldMappingEntry(
222-
name="chunk", source="/document/page_wise_layout/*/merged_content"
222+
name="mark_up", source="/document/page_wise_layout/*/merged_content"
223+
),
224+
InputFieldMappingEntry(
225+
name="page_number",
226+
source="/document/page_wise_layout/*/page_number",
223227
),
224228
InputFieldMappingEntry(
225229
name="figures",
@@ -230,20 +234,22 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
230234
mark_up_cleaner_context = "/document/chunk_mark_ups/*"
231235
inputs = [
232236
InputFieldMappingEntry(
233-
name="chunk", source="/document/chunk_mark_ups/*"
237+
name="mark_up", source="/document/chunk_mark_ups/*/mark_up"
238+
),
239+
InputFieldMappingEntry(
240+
name="page_number", source="/document/chunk_mark_ups/*/page_number"
234241
),
235242
InputFieldMappingEntry(
236243
name="figures", source="/document/layout/figures/*/updated_figure"
237244
),
238245
]
239246

240247
mark_up_cleaner_skill_outputs = [
241-
OutputFieldMappingEntry(name="chunk_cleaned", target_name="chunk_cleaned"),
242-
OutputFieldMappingEntry(
243-
name="chunk_sections", target_name="chunk_sections"
244-
),
245-
OutputFieldMappingEntry(name="chunk_mark_up", target_name="chunk_mark_up"),
246-
OutputFieldMappingEntry(name="chunk_figures", target_name="chunk_figures"),
248+
OutputFieldMappingEntry(name="cleaned_text", target_name="cleaned_text"),
249+
OutputFieldMappingEntry(name="sections", target_name="sections"),
250+
OutputFieldMappingEntry(name="mark_up", target_name="mark_up"),
251+
OutputFieldMappingEntry(name="figures", target_name="chunk_figures"),
252+
OutputFieldMappingEntry(name="page_number", target_name="page_number"),
247253
]
248254

249255
mark_up_cleaner_skill = WebApiSkill(
@@ -302,7 +308,11 @@ def get_semantic_chunker_skill(
302308
semantic_text_chunker_skill_inputs = [
303309
InputFieldMappingEntry(
304310
name="content", source="/document/layout_merged_content"
305-
)
311+
),
312+
InputFieldMappingEntry(
313+
name="per_page_starting_sentences",
314+
source="/document/per_page_starting_sentences",
315+
),
306316
]
307317

308318
semantic_text_chunker_skill_outputs = [
@@ -368,7 +378,13 @@ def get_layout_analysis_skill(
368378
)
369379
]
370380
else:
371-
output = [OutputFieldMappingEntry(name="layout", target_name="layout")]
381+
output = [
382+
OutputFieldMappingEntry(name="layout", target_name="layout"),
383+
OutputFieldMappingEntry(
384+
name="per_page_starting_sentences",
385+
target_name="per_page_starting_sentences",
386+
),
387+
]
372388

373389
layout_analysis_skill = WebApiSkill(
374390
name="Layout Analysis Skill",

deploy_ai_search_indexes/src/deploy_ai_search_indexes/image_processing.py

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@ def get_index_fields(self) -> list[SearchableField]:
8181
type=SearchFieldDataType.String,
8282
collection=True,
8383
),
84+
SimpleField(
85+
name="PageNumber",
86+
type=SearchFieldDataType.Int64,
87+
sortable=True,
88+
filterable=True,
89+
facetable=True,
90+
),
8491
SearchField(
8592
name="ChunkEmbedding",
8693
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
@@ -137,19 +144,6 @@ def get_index_fields(self) -> list[SearchableField]:
137144
),
138145
]
139146

140-
if self.enable_page_by_chunking:
141-
fields.extend(
142-
[
143-
SimpleField(
144-
name="PageNumber",
145-
type=SearchFieldDataType.Int64,
146-
sortable=True,
147-
filterable=True,
148-
facetable=True,
149-
)
150-
]
151-
)
152-
153147
return fields
154148

155149
def get_semantic_search(self) -> SemanticSearch:
@@ -194,11 +188,11 @@ def get_skills(self) -> list:
194188
if self.enable_page_by_chunking:
195189
embedding_skill = self.get_vector_skill(
196190
"/document/page_wise_layout/*",
197-
"/document/page_wise_layout/*/chunk_cleaned",
191+
"/document/page_wise_layout/*/cleaned_text",
198192
)
199193
else:
200194
embedding_skill = self.get_vector_skill(
201-
"/document/chunk_mark_ups/*", "/document/chunk_mark_ups/*/chunk_cleaned"
195+
"/document/chunk_mark_ups/*", "/document/chunk_mark_ups/*/cleaned_text"
202196
)
203197

204198
if self.enable_page_by_chunking:
@@ -229,7 +223,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
229223
source_context = "/document/page_wise_layout/*"
230224
mappings = [
231225
InputFieldMappingEntry(
232-
name="Chunk", source="/document/page_wise_layout/*/chunk_mark_up"
226+
name="Chunk", source="/document/page_wise_layout/*/mark_up"
233227
),
234228
InputFieldMappingEntry(
235229
name="ChunkEmbedding",
@@ -239,7 +233,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
239233
InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"),
240234
InputFieldMappingEntry(
241235
name="Sections",
242-
source="/document/page_wise_layout/*/chunk_sections",
236+
source="/document/page_wise_layout/*/sections",
243237
),
244238
InputFieldMappingEntry(
245239
name="ChunkFigures",
@@ -256,7 +250,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
256250
source_context = "/document/chunk_mark_ups/*"
257251
mappings = [
258252
InputFieldMappingEntry(
259-
name="Chunk", source="/document/chunk_mark_ups/*/chunk_mark_up"
253+
name="Chunk", source="/document/chunk_mark_ups/*/mark_up"
260254
),
261255
InputFieldMappingEntry(
262256
name="ChunkEmbedding",
@@ -265,7 +259,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
265259
InputFieldMappingEntry(name="Title", source="/document/Title"),
266260
InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"),
267261
InputFieldMappingEntry(
268-
name="Sections", source="/document/chunk_mark_ups/*/chunk_sections"
262+
name="Sections", source="/document/chunk_mark_ups/*/sections"
269263
),
270264
InputFieldMappingEntry(
271265
name="ChunkFigures",
@@ -274,6 +268,9 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
274268
InputFieldMappingEntry(
275269
name="DateLastModified", source="/document/DateLastModified"
276270
),
271+
InputFieldMappingEntry(
272+
name="PageNumber", source="/document/chunk_mark_ups/*/page_number"
273+
),
277274
]
278275

279276
index_projections = SearchIndexerIndexProjection(

0 commit comments

Comments
 (0)