Commit 9209d5e ("final updates")
1 parent 9858d37

File tree: 4 files changed, +24 −10 lines changed

deploy_ai_search_indexes/src/deploy_ai_search_indexes/image_processing.py
(2 additions & 3 deletions)

@@ -187,8 +187,6 @@ def get_skills(self) -> list:
             self.enable_page_by_chunking
         )
 
-        text_split_skill = self.get_semantic_chunker_skill(self.enable_page_by_chunking)
-
         mark_up_cleaner_skill = self.get_mark_up_cleaner_skill(
             self.enable_page_by_chunking
         )
@@ -212,11 +210,12 @@ def get_skills(self) -> list:
                 embedding_skill,
             ]
         else:
+            semantic_chunker_skill = self.get_semantic_chunker_skill()
             skills = [
                 layout_skill,
                 figure_skill,
                 merger_skill,
-                text_split_skill,
+                semantic_chunker_skill,
                 mark_up_cleaner_skill,
                 embedding_skill,
            ]
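The change above moves creation of the semantic chunker skill into the branch where it is actually used, since page-wise chunking is handled natively elsewhere. A minimal stand-alone sketch of that control flow (the class name and string placeholders are hypothetical; the real skills are Azure AI Search skill objects):

```python
# Sketch of the conditional skill-list construction this commit introduces.
# Strings stand in for the real skill objects so the example is runnable.
class IndexerSkillset:
    def __init__(self, enable_page_by_chunking: bool):
        self.enable_page_by_chunking = enable_page_by_chunking

    def get_semantic_chunker_skill(self):
        return "semantic_chunker_skill"

    def get_skills(self) -> list:
        layout_skill = "layout_skill"
        figure_skill = "figure_skill"
        merger_skill = "merger_skill"
        mark_up_cleaner_skill = "mark_up_cleaner_skill"
        embedding_skill = "embedding_skill"

        if self.enable_page_by_chunking:
            # Page-wise chunking is done by Document Intelligence itself,
            # so no chunker skill is added to the pipeline.
            skills = [layout_skill, figure_skill, merger_skill,
                      mark_up_cleaner_skill, embedding_skill]
        else:
            # Build the semantic chunker only when it will actually be used.
            semantic_chunker_skill = self.get_semantic_chunker_skill()
            skills = [layout_skill, figure_skill, merger_skill,
                      semantic_chunker_skill, mark_up_cleaner_skill,
                      embedding_skill]
        return skills
```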

image_processing/README.md
(14 additions & 1 deletion)

@@ -25,6 +25,7 @@ Instead of using OCR to extract the contents of the document, ADIv4 is used to a
 Once the Markdown is obtained, several steps are carried out:
 
 1. **Extraction of figures / charts**. The figures identified are extracted from the original document and passed to a multi-modal model (gpt-4o-mini in this case) for analysis. We obtain a description and summary of the chart / image to infer the meaning of the figure. This allows us to index and perform RAG analysis on the information that is visually obtainable from a chart, without it being explicitly mentioned in the surrounding text. The information is added back into the original chart.
+    - **The prompt aims to generate a description and summary of the chart so it can be retrieved later during search. It does not aim to summarise every part of the figure. At runtime, retrieve the figures for the given chunk from the index and pass them to the visual model for context.**
 
 2. **Chunking**. The obtained content is chunked depending on the chunking strategy. This function app supports two chunking methods, **page wise** and **semantic chunking**. Page wise chunking is performed natively by Azure Document Intelligence. For semantic chunking, we include a custom chunker that splits the text with the following strategy:
 
@@ -38,9 +39,21 @@ Once the Markdown is obtained, several steps are carried out:
 
 3. **Cleaning of Markdown**. The final markdown content is cleaned of any characters or unsupported Markdown elements that we do not want in the chunk e.g. non-relevant figures.
 
+### AI Search Enrichment Steps
+
 > [!NOTE]
 >
-> For scalability, the above steps are performed across 5 differnet function app endpoints that are orchestrated by AI search.
+> For scalability, the above steps are performed across 5 different function app endpoints that are orchestrated by AI search.
+
+### Page Wise Chunking
+
+![AI Search Enrichment Steps & Flow for Page Wise Chunking](./images/Page%20Wise%20Chunking.png "Page Wise Chunking Enrichment Steps")
+
+### Semantic Chunking
+
+![AI Search Enrichment Steps & Flow for Semantic Chunking](./images/Semantic%20Chunking.png "Semantic Chunking Enrichment Steps")
+
+Here, the output from the layout step is considered a single block of text, and the custom semantic chunker is applied before vectorisation and projections. The custom chunker aims to retain figures and tables within the same chunk, and splits when the similarity between sentences falls below the threshold.
 
 ## Sample Output
 
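The threshold-based splitting the README describes can be sketched as follows. This is a hedged illustration, not the repo's implementation: the real chunker compares sentence embeddings, while here a simple word-overlap (Jaccard) score stands in so the example stays self-contained.

```python
# Sketch of semantic chunking: start a new chunk whenever adjacent
# sentences are less similar than `similarity_threshold`.
# Jaccard word overlap is a stand-in for embedding cosine similarity.
def jaccard(a: str, b: str) -> float:
    sa, sb = set(a.lower().split()), set(b.lower().split())
    return len(sa & sb) / len(sa | sb) if sa | sb else 0.0

def semantic_chunk(sentences: list[str], similarity_threshold: float = 0.8) -> list[str]:
    if not sentences:
        return []
    chunks, current = [], [sentences[0]]
    for prev, sent in zip(sentences, sentences[1:]):
        if jaccard(prev, sent) < similarity_threshold:
            # Topic shift detected: close the current chunk.
            chunks.append(" ".join(current))
            current = [sent]
        else:
            current.append(sent)
    chunks.append(" ".join(current))
    return chunks
```

A lower threshold merges more aggressively; a higher one produces many small chunks, which is why the endpoint exposes it as a tunable header.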

(binary image file changed: 156 KB; diff not rendered)

image_processing/src/image_processing/function_app.py
(8 additions & 6 deletions)

@@ -171,14 +171,16 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse:
 
         semantic_text_chunker_config = req.headers
 
-        num_surrounding_sentences = semantic_text_chunker_config.get(
-            "num_surrounding_sentences", 1
+        num_surrounding_sentences = int(
+            semantic_text_chunker_config.get("num_surrounding_sentences", 1)
         )
-        similarity_threshold = semantic_text_chunker_config.get(
-            "similarity_threshold", 0.8
+        similarity_threshold = float(
+            semantic_text_chunker_config.get("similarity_threshold", 0.8)
         )
-        max_chunk_tokens = semantic_text_chunker_config.get("max_chunk_tokens", 500)
-        min_chunk_tokens = semantic_text_chunker_config.get("min_chunk_tokens", 50)
+        max_chunk_tokens = int(
+            semantic_text_chunker_config.get("max_chunk_tokens", 500)
+        )
+        min_chunk_tokens = int(semantic_text_chunker_config.get("min_chunk_tokens", 50))
 
     except ValueError:
         return func.HttpResponse(
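The casts added above matter because HTTP header values arrive as strings, so without `int()`/`float()` the chunker would receive `"0.8"` and `"500"` as text. A small stand-alone sketch of the same parse-and-coerce pattern (a plain dict stands in for `req.headers`, and returning `{}` stands in for the HTTP error response the real endpoint sends from its `except ValueError` branch):

```python
# Coerce string header values into typed chunker config, mirroring the
# try/except in semantic_text_chunker. Defaults match the diff above.
def parse_chunker_config(headers: dict) -> dict:
    try:
        return {
            "num_surrounding_sentences": int(headers.get("num_surrounding_sentences", 1)),
            "similarity_threshold": float(headers.get("similarity_threshold", 0.8)),
            "max_chunk_tokens": int(headers.get("max_chunk_tokens", 500)),
            "min_chunk_tokens": int(headers.get("min_chunk_tokens", 50)),
        }
    except ValueError:
        # Non-numeric header value; the real endpoint returns an
        # error func.HttpResponse here.
        return {}
```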

0 commit comments