Update pipeline

BenConstable9 · BenConstable9 · commit 9c3d28c53cb2 · 2024-09-19T10:02:34.000+01:00
diff --git a/adi_function_app/pre_embedding_cleaner.py b/adi_function_app/pre_embedding_cleaner.py
@@ -29,9 +29,9 @@ def get_section(cleaned_text: str) -> list:
 
 def clean_sections(sections: list) -> list:
     """Cleans the sections by removing special characters and extra white spaces."""
-    cleaned_sections = [re.sub(r"[=#]", "", match).strip() for match in sections]
+    cleanedSections = [re.sub(r"[=#]", "", match).strip() for match in sections]
 
-    return cleaned_sections
+    return cleanedSections
 
 
 def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
@@ -123,19 +123,17 @@ async def process_pre_embedding_cleaner(record: dict) -> dict:
 
         # scenarios when page by chunking is enabled
         if isinstance(record["data"]["chunk"], dict):
-            cleaned_record["data"]["cleaned_chunk"] = clean_text(
+            cleaned_record["data"]["cleanedChunk"] = clean_text(
                 record["data"]["chunk"]["content"]
             )
             cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"]
-            cleaned_record["data"]["cleaned_sections"] = clean_sections(
+            cleaned_record["data"]["cleanedSections"] = clean_sections(
                 record["data"]["chunk"]["sections"]
             )
         else:
-            cleaned_record["data"]["cleaned_chunk"] = clean_text(
-                record["data"]["chunk"]
-            )
+            cleaned_record["data"]["cleanedChunk"] = clean_text(record["data"]["chunk"])
             cleaned_record["data"]["chunk"] = record["data"]["chunk"]
-            cleaned_record["data"]["cleaned_sections"] = get_section(
+            cleaned_record["data"]["cleanedSections"] = get_section(
                 record["data"]["chunk"]
             )
 
diff --git a/deploy_ai_search/ai_search.py b/deploy_ai_search/ai_search.py
@@ -195,7 +195,7 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection:
         return data_source_connection
 
     def get_pre_embedding_cleaner_skill(
-        self, context, source, target_name="cleaned_chunk"
+        self, context, source, target_name="cleanedChunk"
     ) -> WebApiSkill:
         """Get the custom skill for data cleanup.
 
@@ -221,10 +221,10 @@ def get_pre_embedding_cleaner_skill(
         ]
 
         pre_embedding_cleaner_skill_outputs = [
-            OutputFieldMappingEntry(name="cleaned_chunk", target_name=target_name),
+            OutputFieldMappingEntry(name="cleanedChunk", target_name=target_name),
             OutputFieldMappingEntry(name="chunk", target_name="chunk"),
             OutputFieldMappingEntry(
-                name="cleaned_sections", target_name="cleaned_sections"
+                name="cleanedSections", target_name="cleanedSections"
             ),
         ]
 
diff --git a/deploy_ai_search/rag_documents.py b/deploy_ai_search/rag_documents.py
@@ -174,11 +174,11 @@ def get_skills(self) -> list:
         )
 
         key_phrase_extraction_skill = self.get_key_phrase_extraction_skill(
-            "/document/pages/*", "/document/pages/*/cleaned_chunk"
+            "/document/pages/*", "/document/pages/*/cleanedChunk"
         )
 
         embedding_skill = self.get_vector_skill(
-            "/document/pages/*", "/document/pages/*/cleaned_chunk"
+            "/document/pages/*", "/document/pages/*/cleanedChunk"
         )
 
         if self.enable_page_by_chunking:
@@ -213,22 +213,29 @@ def get_index_projections(self) -> SearchIndexerIndexProjections:
                 name="Keywords", source="/document/pages/*/keywords"
             ),
             InputFieldMappingEntry(
-                name="Sections", source="/document/pages/*/cleaned_sections"
+                name="Sections", source="/document/pages/*/cleanedSections"
             ),
             InputFieldMappingEntry(
-                name="Figures", source="/document/pages/*/cleaned_sections"
+                name="Figures",
+                inputs=[
+                    InputFieldMappingEntry(
+                        name="FigureID", source="/document/pages/*/figures/figureId"
+                    ),
+                    InputFieldMappingEntry(
+                        name="FigureUri", source="/document/pages/*/figures/figureUri"
+                    ),
+                ],
             ),
             InputFieldMappingEntry(
                 name="DateLastModified", source="/document/DateLastModified"
             ),
-            InputFieldMappingEntry(name="Figures", source="/document/pages/*/figures"),
         ]
 
         if self.enable_page_by_chunking:
             mappings.extend(
                 [
                     InputFieldMappingEntry(
-                        name="PageNumber", source="/document/pages/*/page_number"
+                        name="PageNumber", source="/document/pages/*/pageNumber"
                     )
                 ]
             )

Original file line number	Diff line number	Diff line change
`@@ -174,11 +174,11 @@ def get_skills(self) -> list:`
`174`	`174`	`)`
`175`	`175`
`176`	`176`	`key_phrase_extraction_skill = self.get_key_phrase_extraction_skill(`
`177`		`- "/document/pages/*", "/document/pages/*/cleaned_chunk"`
	`177`	`+ "/document/pages/*", "/document/pages/*/cleanedChunk"`
`178`	`178`	`)`
`179`	`179`
`180`	`180`	`embedding_skill = self.get_vector_skill(`
`181`		`- "/document/pages/*", "/document/pages/*/cleaned_chunk"`
	`181`	`+ "/document/pages/*", "/document/pages/*/cleanedChunk"`
`182`	`182`	`)`
`183`	`183`
`184`	`184`	`if self.enable_page_by_chunking:`
`@@ -213,22 +213,29 @@ def get_index_projections(self) -> SearchIndexerIndexProjections:`
`213`	`213`	`name="Keywords", source="/document/pages/*/keywords"`
`214`	`214`	`),`
`215`	`215`	`InputFieldMappingEntry(`
`216`		`- name="Sections", source="/document/pages/*/cleaned_sections"`
	`216`	`+ name="Sections", source="/document/pages/*/cleanedSections"`
`217`	`217`	`),`
`218`	`218`	`InputFieldMappingEntry(`
`219`		`- name="Figures", source="/document/pages/*/cleaned_sections"`
	`219`	`+ name="Figures",`
	`220`	`+ inputs=[`
	`221`	`+ InputFieldMappingEntry(`
	`222`	`+ name="FigureID", source="/document/pages/*/figures/figureId"`
	`223`	`+ ),`
	`224`	`+ InputFieldMappingEntry(`
	`225`	`+ name="FigureUri", source="/document/pages/*/figures/figureUri"`
	`226`	`+ ),`
	`227`	`+ ],`
`220`	`228`	`),`
`221`	`229`	`InputFieldMappingEntry(`
`222`	`230`	`name="DateLastModified", source="/document/DateLastModified"`
`223`	`231`	`),`
`224`		`- InputFieldMappingEntry(name="Figures", source="/document/pages/*/figures"),`
`225`	`232`	`]`
`226`	`233`
`227`	`234`	`if self.enable_page_by_chunking:`
`228`	`235`	`mappings.extend(`
`229`	`236`	`[`
`230`	`237`	`InputFieldMappingEntry(`
`231`		`- name="PageNumber", source="/document/pages/*/page_number"`
	`238`	`+ name="PageNumber", source="/document/pages/*/pageNumber"`
`232`	`239`	`)`
`233`	`240`	`]`
`234`	`241`	`)`