From 86cde08db8860343aa41501a7746ac6d308d5e1e Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Mon, 18 Nov 2024 12:00:04 +0000 Subject: [PATCH 01/10] Add New Text Chunker --- adi_function_app/function_app.py | 57 ++++ adi_function_app/pre_embedding_cleaner.py | 44 +-- adi_function_app/requirements.txt | 3 + adi_function_app/semantic_text_chunker.py | 360 ++++++++++++++++++++++ deploy_ai_search/.env | 1 + deploy_ai_search/ai_search.py | 51 ++- deploy_ai_search/environment.py | 9 + deploy_ai_search/rag_documents.py | 25 +- 8 files changed, 484 insertions(+), 66 deletions(-) create mode 100644 adi_function_app/semantic_text_chunker.py diff --git a/adi_function_app/function_app.py b/adi_function_app/function_app.py index cca6005..a12d973 100644 --- a/adi_function_app/function_app.py +++ b/adi_function_app/function_app.py @@ -8,6 +8,7 @@ from adi_2_ai_search import process_adi_2_ai_search from pre_embedding_cleaner import process_pre_embedding_cleaner from key_phrase_extraction import process_key_phrase_extraction +from semantic_text_chunker import process_semantic_text_chunker, SemanticTextChunker logging.basicConfig(level=logging.DEBUG) app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION) @@ -87,6 +88,62 @@ async def pre_embedding_cleaner(req: func.HttpRequest) -> func.HttpResponse: ) +@app.route(route="semantic_text_chunker", methods=[func.HttpMethod.POST]) +async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse: + """HTTP trigger for text chunking function. + + Args: + req (func.HttpRequest): The HTTP request object. + + Returns: + func.HttpResponse: The HTTP response object.""" + logging.info("Python HTTP trigger text chunking function processed a request.") + + try: + req_body = req.get_json() + values = req_body.get("values") + + semantic_text_chunker_config = req.headers + + num_surrounding_sentences = semantic_text_chunker_config.get( + "num_surrounding_sentences", 1 + ) + similarity_threshold = semantic_text_chunker_config.get( + "similarity_threshold", 0.8 + ) + max_chunk_tokens = semantic_text_chunker_config.get("max_chunk_tokens", 500) + + except ValueError: + return func.HttpResponse( + "Please valid Custom Skill Payload in the request body", status_code=400 + ) + else: + logging.debug("Input Values: %s", values) + + record_tasks = [] + + semantic_text_chunker = SemanticTextChunker( + num_surrounding_sentences=num_surrounding_sentences, + similarity_threshold=similarity_threshold, + max_chunk_tokens=max_chunk_tokens, + ) + + for value in values: + record_tasks.append( + asyncio.create_task( + process_semantic_text_chunker(value, semantic_text_chunker) + ) + ) + + results = await asyncio.gather(*record_tasks) + logging.debug("Results: %s", results) + cleaned_tasks = {"values": results} + + return func.HttpResponse( + json.dump(cleaned_tasks), status_code=200, mimetype="application/json" + ) + + @app.route(route="key_phrase_extractor", methods=[func.HttpMethod.POST]) async def key_phrase_extractor(req: func.HttpRequest) -> func.HttpResponse: """HTTP trigger for data cleanup function. 
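For orientation, the new semantic_text_chunker route follows the standard Azure AI Search custom-skill contract: configuration arrives in the request headers and records arrive in the body. A minimal client-side sketch of that payload shape follows; the endpoint URL and function-key header are placeholder assumptions, while the configuration header names mirror the defaults read in the function above.

    # Illustrative sketch only: shows the shape of a request to the new route.
    # URL and key are placeholders; header values are sent as strings.
    import requests

    payload = {
        "values": [
            {
                "recordId": "0",
                "data": {"content": "# Heading\nFirst sentence. Second sentence."},
            }
        ]
    }

    headers = {
        "num_surrounding_sentences": "1",
        "similarity_threshold": "0.8",
        "max_chunk_tokens": "500",
        "x-functions-key": "<function-app-key>",  # placeholder
    }

    response = requests.post(
        "https://<function-app>.azurewebsites.net/api/semantic_text_chunker",
        json=payload,
        headers=headers,
    )
    # Expected shape: {"values": [{"recordId": "0", "data": {...}, "errors": ..., "warnings": ...}]}
    print(response.json())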
diff --git a/adi_function_app/pre_embedding_cleaner.py b/adi_function_app/pre_embedding_cleaner.py index 5c787e6..056103b 100644 --- a/adi_function_app/pre_embedding_cleaner.py +++ b/adi_function_app/pre_embedding_cleaner.py @@ -5,30 +5,6 @@ import re -def get_sections(cleaned_text: str) -> list: - """ - Returns the section details from the content - - Args: - cleaned_text: The input text - - Returns: - list: The sections related to text - - """ - combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n" - doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL) - doc_metadata = [match for group in doc_metadata for match in group if match] - return clean_sections(doc_metadata) - - -def clean_sections(sections: list) -> list: - """Cleans the sections by removing special characters and extra white spaces.""" - cleanedSections = [re.sub(r"[=#]", "", match).strip() for match in sections] - - return cleanedSections - - def remove_markdown_tags(text: str, tag_patterns: dict) -> str: """ Remove specified Markdown tags from the text, keeping the contents of the tags. @@ -52,7 +28,7 @@ def remove_markdown_tags(text: str, tag_patterns: dict) -> str: return text -def clean_text_with_section_extraction(src_text: str) -> tuple[str, str]: +def clean_text(src_text: str) -> str: """This function performs following cleanup activities on the text, remove all unicode characters remove line spacing,remove stop words, normalize characters @@ -77,8 +53,6 @@ def clean_text_with_section_extraction(src_text: str) -> tuple[str, str]: } cleaned_text = remove_markdown_tags(src_text, tag_patterns) - sections = get_sections(cleaned_text) - # Updated regex to keep Unicode letters, punctuation, whitespace, currency symbols, and percentage signs, # while also removing non-printable characters cleaned_text = re.sub(r"[^\p{L}\p{P}\s\p{Sc}%\x20-\x7E]", "", cleaned_text) @@ -90,7 +64,7 @@ def clean_text_with_section_extraction(src_text: str) -> tuple[str, str]: except Exception as e: logging.error(f"An error occurred in clean_text: {e}") return "" - return cleaned_text, sections + return cleaned_text async def process_pre_embedding_cleaner(record: dict) -> dict: @@ -114,19 +88,7 @@ async def process_pre_embedding_cleaner(record: dict) -> dict: "warnings": None, } - # scenarios when page by chunking is enabled - if isinstance(record["data"]["chunk"], dict): - ( - cleaned_record["data"]["cleanedChunk"], - cleaned_record["data"]["sections"], - ) = clean_text_with_section_extraction(record["data"]["chunk"]["content"]) - cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"] - else: - ( - cleaned_record["data"]["cleanedChunk"], - cleaned_record["data"]["sections"], - ) = clean_text_with_section_extraction(record["data"]["chunk"]) - cleaned_record["data"]["chunk"] = record["data"]["chunk"] + cleaned_record["data"]["cleaned_chunk"] = clean_text(record["data"]["content"]) except Exception as e: logging.error("string cleanup Error: %s", e) diff --git a/adi_function_app/requirements.txt b/adi_function_app/requirements.txt index b97a6d6..7ac403b 100644 --- a/adi_function_app/requirements.txt +++ b/adi_function_app/requirements.txt @@ -19,3 +19,6 @@ azure-ai-vision-imageanalysis PyMuPDF aiohttp Pillow +spacy +en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1.tar.gz +tiktoken diff --git a/adi_function_app/semantic_text_chunker.py b/adi_function_app/semantic_text_chunker.py new file mode 100644 index 0000000..78dd82f --- /dev/null +++ 
b/adi_function_app/semantic_text_chunker.py @@ -0,0 +1,360 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# This code originates from: https://github.com/microsoft/dstoolkit-text2sql-and-imageprocessing +import logging +import json +import re +import tiktoken +import spacy +from sklearn.metrics.pairwise import cosine_similarity + + +class SemanticTextChunker: + def __init__( + self, + num_surrounding_sentences=1, + similarity_threshold=0.8, + max_chunk_tokens=100, + ): + self.num_surrounding_sentences = num_surrounding_sentences + self.similarity_threshold = similarity_threshold + self.max_chunk_tokens = max_chunk_tokens + try: + self._nlp_model = spacy.load("en_core_web_md") + except IOError as e: + raise ValueError("Spacy model 'en_core_web_md' not found.") from e + + def sentence_contains_table(self, text: str) -> bool: + """Detects if a sentence contains table tags. + + Args: + text (str): The text to check. + + Returns: + bool: If it contains a table.""" + return "" in text or "
" in text + + def sentence_contains_figure(self, text: str) -> bool: + """Detects if a sentence contains figure tags. + + Args: + text (str): The text to check. + + Returns: + bool: If it contains a figure.""" + return "
" in text or "
" in text + + def num_tokens_from_string(self, string: str) -> int: + """Gets the number of tokens in a string using a specific encoding. + + Args: + string: The input string. + + Returns: + int: The number of tokens in the string.""" + encoding = tiktoken.get_encoding("cl100k_base") + return len(encoding.encode(string)) + + def get_sections(self, cleaned_text: str) -> list: + """ + Returns the section details from the content + + Args: + cleaned_text: The input text + + Returns: + list: The sections related to text + + """ + combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n" + doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL) + doc_metadata = [match for group in doc_metadata for match in group if match] + return self.clean_sections(doc_metadata) + + def clean_sections(self, sections: list) -> list: + """Cleans the sections by removing special characters and extra white spaces.""" + cleaned_sections = [re.sub(r"[=#]", "", match).strip() for match in sections] + + return cleaned_sections + + async def chunk(self, text: str) -> list[dict]: + """Attempts to chunk the text and then assigns the sections from the relevant chunk, to a separate field. + + Args: + text (str): The set of text to chunk + + Returns: + list(dict): The list of matching chunks and sections""" + final_chunks = self.chunk_into_sentences(text) + + # now extract section data + chunk_and_section_output = [] + for chunk in final_chunks: + sections = self.get_sections(chunk) + + chunk_and_section_output.append({"content": chunk, "sections": sections}) + + return chunk_and_section_output + + def chunk_into_sentences(self, text: str) -> list[str]: + """Attempts to chunk the text by: + Splitting into sentences + Grouping sentences that contain figures and tables + Merging semanticly similar chunks + + Args: + text (str): The set of text to chunk + + Returns: + list(str): The list of chunks""" + + sentences = self.split_into_sentences(text) + ( + grouped_sentences, + is_table_or_figure_map, + ) = self.group_figures_and_tables_into_sentences(sentences) + forward_pass_chunks, is_table_or_figure_map = self.merge_chunks( + grouped_sentences, is_table_or_figure_map + ) + + backwards_pass_chunks, _ = self.merge_chunks( + forward_pass_chunks, is_table_or_figure_map, forwards_direction=False + ) + return backwards_pass_chunks + + def split_into_sentences(self, text: str) -> list[str]: + """Splits a set of text into a list of sentences uses the Spacy NLP model. 
+ + Args: + text (str): The set of text to chunk + + Returns: + list(str): The extracted sentences + """ + doc = self._nlp_model(text) + sentences = [sent.text for sent in doc.sents] + return sentences + + def group_figures_and_tables_into_sentences(self, sentences): + grouped_sentences = [] + holding_sentences = [] + + is_table_or_figure_map = [] + + is_grouped_sentence = False + for current_sentence in sentences: + if is_grouped_sentence is False: + if self.sentence_contains_figure( + current_sentence + ) or self.sentence_contains_table(current_sentence): + is_grouped_sentence = True + holding_sentences.append(current_sentence) + else: + grouped_sentences.append(current_sentence) + is_table_or_figure_map = False + else: + # check for ending case + if self.sentence_contains_figure( + current_sentence + ) or self.sentence_contains_table(current_sentence): + holding_sentences.append(current_sentence) + + full_sentence = " ".join(holding_sentences) + grouped_sentences.append(full_sentence) + holding_sentences = [] + + is_grouped_sentence = False + is_table_or_figure_map = True + else: + holding_sentences.append(current_sentence) + + return grouped_sentences, is_table_or_figure_map + + def look_ahead_and_behind_sentences( + self, total_sentences, is_table_or_figure_map, current_sentence_index + ): + is_table_or_figure_ahead = False + is_table_or_figure_behind = False + + distance_to_next_figure = self.num_surrounding_sentences + + if current_sentence_index < self.num_surrounding_sentences: + is_table_or_figure_behind = is_table_or_figure_map[0] + else: + is_table_or_figure_behind = is_table_or_figure_map[ + current_sentence_index - self.num_surrounding_sentences + ] + + surround_sentences_gap_to_test = self.num_surrounding_sentences + if current_sentence_index + self.num_surrounding_sentences > total_sentences: + is_table_or_figure_ahead = is_table_or_figure_map[-1] + surround_sentences_gap_to_test = total_sentences - current_sentence_index + else: + is_table_or_figure_ahead = is_table_or_figure_map[ + current_sentence_index + self.num_surrounding_sentences + ] + + for ( + next_sentence_is_table_or_figure_index, + next_sentence_is_table_or_figure, + ) in enumerate( + is_table_or_figure_map[ + current_sentence_index : current_sentence_index + + surround_sentences_gap_to_test + ] + ): + if next_sentence_is_table_or_figure: + distance_to_next_figure = next_sentence_is_table_or_figure_index + + return ( + is_table_or_figure_ahead, + is_table_or_figure_behind, + min(surround_sentences_gap_to_test, distance_to_next_figure), + ) + + def merge_similar_chunks(self, current_sentence, current_chunk): + new_chunk = None + # Current chunk will be updated in place + # Only compare when we have 2 or more chunks + if len(current_chunk) >= 2: + cosine_sim = self.sentence_similarity( + " ".join(current_chunk[-2:]), current_sentence + ) + if ( + cosine_sim < self.similarity_threshold + or self.num_tokens_from_string(" ".join(current_chunk)) + > self.max_chunk_tokens + ): + if len(current_chunk) > 2: + new_chunk = " ".join(current_chunk[:-1]) + current_chunk = [current_chunk[-1]] + else: + new_chunk = current_chunk[0] + current_chunk = [current_chunk[1]] + + return new_chunk + + def merge_chunks(self, sentences, is_table_or_figure_map, forwards_direction=True): + chunks = [] + current_chunk = [] + + total_sentences = len(sentences) + index = 0 + + new_is_table_or_figure_map = [] + while index < total_sentences: + if forwards_direction is False: + current_sentence_index = total_sentences - index + else: + 
current_sentence_index = index + + current_sentence = sentences[current_sentence_index] + + # Detect if table or figure + if is_table_or_figure_map[current_sentence_index]: + if forwards_direction: + current_chunk.append(current_chunk) + else: + # On the backwards pass we don't want to add to the table chunk + chunks.append(" ".join(current_chunk)) + chunks.append(current_chunk) + continue + elif forwards_direction: + # Look ahead to see if figure of table is coming up + # We only do this on the forward pass + ( + is_table_or_figure_ahead, + is_table_or_figure_behind, + min_of_distance_to_next_figure_or_num_surrounding_sentences, + ) = self.look_ahead_and_behind_sentences( + total_sentences, is_table_or_figure_map, current_sentence_index + ) + + if is_table_or_figure_behind: + # Finish off + current_chunk.append(current_sentence) + chunks.append(" ".join(current_chunk)) + new_is_table_or_figure_map.append(True) + current_chunk = [] + continue + elif is_table_or_figure_ahead: + # Add to the ahead chunk + chunks.append(" ".join(current_chunk)) + new_is_table_or_figure_map.append(True) + current_chunk = sentences[ + current_sentence_index:min_of_distance_to_next_figure_or_num_surrounding_sentences + ] + + index += min_of_distance_to_next_figure_or_num_surrounding_sentences + continue + + # now group semanticly + num_tokens = self.num_tokens_from_string(current_sentence) + + if num_tokens >= self.max_chunk_tokens: + chunks.append(current_sentence) + new_is_table_or_figure_map.append(False) + continue + else: + current_chunk.append(current_sentence) + + new_chunk = self.merge_similar_chunks(current_sentence, current_chunk) + + if new_chunk is not None: + chunks.append(new_chunk) + new_is_table_or_figure_map.append(False) + + index += 1 + + if len(current_chunk) > 0: + chunks.append(" ".join(current_chunk)) + new_is_table_or_figure_map.append(False) + + return chunks, new_is_table_or_figure_map + + def sentence_similarity(self, text1, text2): + vec1 = self._nlp_model(text1).vector + vec2 = self._nlp_model(text2).vector + return cosine_similarity([vec1], [vec2])[0, 0] + + +async def process_semantic_text_chunker(record: dict, text_chunker) -> dict: + """Chunk the data. + + Args: + record (dict): The record to cleanup. + + Returns: + dict: The clean record.""" + + try: + json_str = json.dumps(record, indent=4) + + logging.info(f"Chunking Input: {json_str}") + + cleaned_record = { + "recordId": record["recordId"], + "data": {}, + "errors": None, + "warnings": None, + } + + # scenarios when page by chunking is enabled + cleaned_record["data"] = await text_chunker.chunk(record["data"]["content"]) + + except Exception as e: + logging.error("Chunking Error: %s", e) + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": "Failed to chunk data. Check function app logs for more details of exact failure." 
+ } + ], + "warnings": None, + } + json_str = json.dumps(cleaned_record, indent=4) + + logging.info(f"Chunking output: {json_str}") + return cleaned_record diff --git a/deploy_ai_search/.env b/deploy_ai_search/.env index 2b858b2..9efe39f 100644 --- a/deploy_ai_search/.env +++ b/deploy_ai_search/.env @@ -3,6 +3,7 @@ FunctionApp__Key= FunctionApp__PreEmbeddingCleaner__FunctionName=pre_embedding_cleaner FunctionApp__ADI__FunctionName=adi_2_ai_search FunctionApp__KeyPhraseExtractor__FunctionName=key_phrase_extractor +FunctionApp__SemanticTextChunker__FunctionName=semantic_text_chunker FunctionApp__AppRegistrationResourceId= IdentityType= # system_assigned or user_assigned or key AIService__AzureSearchOptions__Endpoint= diff --git a/deploy_ai_search/ai_search.py b/deploy_ai_search/ai_search.py index 6d63bac..a3205bb 100644 --- a/deploy_ai_search/ai_search.py +++ b/deploy_ai_search/ai_search.py @@ -220,9 +220,7 @@ def get_pre_embedding_cleaner_skill(self, context, source) -> WebApiSkill: ] pre_embedding_cleaner_skill_outputs = [ - OutputFieldMappingEntry(name="cleanedChunk", target_name="cleanedChunk"), - OutputFieldMappingEntry(name="chunk", target_name="chunk"), - OutputFieldMappingEntry(name="sections", target_name="sections"), + OutputFieldMappingEntry(name="cleaned_chunk", target_name="cleaned_chunk") ] pre_embedding_cleaner_skill = WebApiSkill( @@ -263,18 +261,45 @@ def get_text_split_skill(self, context, source) -> SplitSkill: -------- splitSKill: The skill for text split""" - text_split_skill = SplitSkill( - name="Text Split Skill", - description="Skill to split the text before sending to embedding", + if self.test: + batch_size = 2 + degree_of_parallelism = 2 + else: + batch_size = 16 + degree_of_parallelism = 16 + + semantic_text_chunker_skill_inputs = [ + InputFieldMappingEntry(name="content", source=source) + ] + + semantic_text_chunker_skill_outputs = [ + OutputFieldMappingEntry(name="chunks", target_name="chunks"), + ] + + semantic_text_chunker_skill = WebApiSkill( + name="Pre Embedding Cleaner Skill", + description="Skill to clean the data before sending to embedding", context=context, - text_split_mode="pages", - maximum_page_length=2000, - page_overlap_length=500, - inputs=[InputFieldMappingEntry(name="text", source=source)], - outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")], + uri=self.environment.get_custom_skill_function_url("semantic_text_chunker"), + timeout="PT230S", + batch_size=batch_size, + degree_of_parallelism=degree_of_parallelism, + http_method="POST", + inputs=semantic_text_chunker_skill_inputs, + outputs=semantic_text_chunker_skill_outputs, ) - return text_split_skill + if self.environment.identity_type != IdentityType.KEY: + semantic_text_chunker_skill.auth_identity = ( + self.environment.function_app_app_registration_resource_id + ) + + if self.environment.identity_type == IdentityType.USER_ASSIGNED: + semantic_text_chunker_skill.auth_identity = ( + self.environment.ai_search_user_assigned_identity + ) + + return semantic_text_chunker_skill def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill: """Get the custom skill for adi. 
@@ -297,7 +322,7 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill: if chunk_by_page: output = [ - OutputFieldMappingEntry(name="extracted_content", target_name="pages") + OutputFieldMappingEntry(name="extracted_content", target_name="chunks") ] else: output = [ diff --git a/deploy_ai_search/environment.py b/deploy_ai_search/environment.py index e431304..c42a18c 100644 --- a/deploy_ai_search/environment.py +++ b/deploy_ai_search/environment.py @@ -204,6 +204,13 @@ def function_app_pre_embedding_cleaner_route(self) -> str: """ return os.environ.get("FunctionApp__PreEmbeddingCleaner__FunctionName") + @property + def function_app_semantic_text_chunker_route(self) -> str: + """ + This function returns function app semantic text chunker name + """ + return os.environ.get("FunctionApp__SemanticTextChunker__FunctionName") + @property def function_app_adi_route(self) -> str: """ @@ -249,6 +256,8 @@ def get_custom_skill_function_url(self, skill_type: str): route = self.function_app_adi_route elif skill_type == "key_phrase_extraction": route = self.function_app_key_phrase_extractor_route + elif skill_type == "semantic_text_chunker": + route = self.function_app_semantic_text_chunker_route else: raise ValueError(f"Invalid skill type: {skill_type}") diff --git a/deploy_ai_search/rag_documents.py b/deploy_ai_search/rag_documents.py index c9ebffd..1d51874 100644 --- a/deploy_ai_search/rag_documents.py +++ b/deploy_ai_search/rag_documents.py @@ -172,15 +172,15 @@ def get_skills(self) -> list: ) pre_embedding_cleaner_skill = self.get_pre_embedding_cleaner_skill( - "/document/pages/*", "/document/pages/*" + "/document/chunks/*", "/document/chunks/*/content" ) key_phrase_extraction_skill = self.get_key_phrase_extraction_skill( - "/document/pages/*", "/document/pages/*/cleanedChunk" + "/document/chunks/*", "/document/chunks/*/cleaned_chunk" ) embedding_skill = self.get_vector_skill( - "/document/pages/*", "/document/pages/*/cleanedChunk" + "/document/chunks/*", "/document/chunks/*/cleaned_chunk" ) if self.enable_page_by_chunking: @@ -204,28 +204,29 @@ def get_skills(self) -> list: def get_index_projections(self) -> SearchIndexerIndexProjection: """This function returns the index projections for rag document.""" mappings = [ - InputFieldMappingEntry(name="Chunk", source="/document/pages/*/chunk"), + InputFieldMappingEntry(name="Chunk", source="/document/chunks/*/content"), InputFieldMappingEntry( name="ChunkEmbedding", - source="/document/pages/*/vector", + source="/document/chunks/*/vector", ), InputFieldMappingEntry(name="Title", source="/document/Title"), InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"), InputFieldMappingEntry( - name="Keywords", source="/document/pages/*/keywords" + name="Keywords", source="/document/chunks/*/keywords" ), InputFieldMappingEntry( - name="Sections", source="/document/pages/*/sections" + name="Sections", source="/document/chunks/*/sections" ), InputFieldMappingEntry( name="Figures", - source_context="/document/pages/*/figures/*", + source_context="/document/chunks/*/figures/*", inputs=[ InputFieldMappingEntry( - name="FigureId", source="/document/pages/*/figures/*/figureId" + name="FigureId", source="/document/chunks/*/figures/*/figureId" ), InputFieldMappingEntry( - name="FigureUri", source="/document/pages/*/figures/*/figureUri" + name="FigureUri", + source="/document/chunks/*/figures/*/figureUri", ), ], ), @@ -238,7 +239,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjection: mappings.extend( [ InputFieldMappingEntry( - 
name="PageNumber", source="/document/pages/*/pageNumber" + name="PageNumber", source="/document/chunks/*/pageNumber" ) ] ) @@ -248,7 +249,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjection: SearchIndexerIndexProjectionSelector( target_index_name=self.index_name, parent_key_field_name="Id", - source_context="/document/pages/*", + source_context="/document/chunks/*", mappings=mappings, ), ], From 2bea14dc9a5622e15c33c844b9222bb0d31c2724 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Mon, 18 Nov 2024 19:25:23 +0000 Subject: [PATCH 02/10] Update chunking mechanism --- adi_function_app/function_app.py | 10 +- ...mbedding_cleaner.py => mark_up_cleaner.py} | 45 +++- adi_function_app/requirements.txt | 1 + adi_function_app/semantic_text_chunker.py | 227 +++++++++--------- deploy_ai_search/ai_search.py | 20 +- deploy_ai_search/environment.py | 6 +- deploy_ai_search/rag_documents.py | 6 +- 7 files changed, 177 insertions(+), 138 deletions(-) rename adi_function_app/{pre_embedding_cleaner.py => mark_up_cleaner.py} (71%) diff --git a/adi_function_app/function_app.py b/adi_function_app/function_app.py index a12d973..3709758 100644 --- a/adi_function_app/function_app.py +++ b/adi_function_app/function_app.py @@ -6,7 +6,7 @@ import asyncio from adi_2_ai_search import process_adi_2_ai_search -from pre_embedding_cleaner import process_pre_embedding_cleaner +from adi_function_app.mark_up_cleaner import process_mark_up_cleaner from key_phrase_extraction import process_key_phrase_extraction from semantic_text_chunker import process_semantic_text_chunker, SemanticTextChunker @@ -51,8 +51,8 @@ async def adi_2_ai_search(req: func.HttpRequest) -> func.HttpResponse: ) -@app.route(route="pre_embedding_cleaner", methods=[func.HttpMethod.POST]) -async def pre_embedding_cleaner(req: func.HttpRequest) -> func.HttpResponse: +@app.route(route="mark_up_cleaner", methods=[func.HttpMethod.POST]) +async def mark_up_cleaner(req: func.HttpRequest) -> func.HttpResponse: """HTTP trigger for data cleanup function. Args: @@ -75,9 +75,7 @@ async def pre_embedding_cleaner(req: func.HttpRequest) -> func.HttpResponse: record_tasks = [] for value in values: - record_tasks.append( - asyncio.create_task(process_pre_embedding_cleaner(value)) - ) + record_tasks.append(asyncio.create_task(process_mark_up_cleaner(value))) results = await asyncio.gather(*record_tasks) logging.debug("Results: %s", results) diff --git a/adi_function_app/pre_embedding_cleaner.py b/adi_function_app/mark_up_cleaner.py similarity index 71% rename from adi_function_app/pre_embedding_cleaner.py rename to adi_function_app/mark_up_cleaner.py index 056103b..ac27c1a 100644 --- a/adi_function_app/pre_embedding_cleaner.py +++ b/adi_function_app/mark_up_cleaner.py @@ -5,6 +5,30 @@ import re +def get_sections(text: str) -> list: + """ + Returns the section details from the content. + + Args: + text: The input text + + Returns: + list: The sections related to text + """ + # Updated regex pattern to capture markdown headers like ### Header + combined_pattern = r"(?<=\n|^)[#]+\s*(.*?)(?=\n)" + doc_metadata = re.findall(combined_pattern, text, re.DOTALL) + return clean_sections(doc_metadata) + + +def clean_sections(sections: list) -> list: + """ + Cleans the sections by removing special characters and extra white spaces. 
+ """ + cleaned_sections = [re.sub(r"[=#]", "", match).strip() for match in sections] + return cleaned_sections + + def remove_markdown_tags(text: str, tag_patterns: dict) -> str: """ Remove specified Markdown tags from the text, keeping the contents of the tags. @@ -28,7 +52,7 @@ def remove_markdown_tags(text: str, tag_patterns: dict) -> str: return text -def clean_text(src_text: str) -> str: +def clean_text_and_extract_metadata(src_text: str) -> tuple[str, str]: """This function performs following cleanup activities on the text, remove all unicode characters remove line spacing,remove stop words, normalize characters @@ -38,16 +62,21 @@ def clean_text(src_text: str) -> str: Returns: str: The clean text.""" + return_record = {} + try: logging.info(f"Input text: {src_text}") if len(src_text) == 0: logging.error("Input text is empty") raise ValueError("Input text is empty") + return_record["marked_up_chunk"] = src_text + return_record["sections"] = get_sections(src_text) + # Define specific patterns for each tag tag_patterns = { "figurecontent": r"", - "figure": r"
<figure>(.*?)</figure>
", + "figure": r"(.*?)", "figures": r"\(figures/\d+\)(.*?)\(figures/\d+\)", "figcaption": r"
<figcaption>(.*?)</figcaption>
", } @@ -61,13 +90,15 @@ def clean_text(src_text: str) -> str: if len(cleaned_text) == 0: logging.error("Cleaned text is empty") raise ValueError("Cleaned text is empty") + else: + return_record["cleaned_chunk"] = cleaned_text except Exception as e: - logging.error(f"An error occurred in clean_text: {e}") + logging.error(f"An error occurred in clean_text_and_extract_metadata: {e}") return "" - return cleaned_text + return return_record -async def process_pre_embedding_cleaner(record: dict) -> dict: +async def process_mark_up_cleaner(record: dict) -> dict: """Cleanup the data using standard python libraries. Args: @@ -88,7 +119,9 @@ async def process_pre_embedding_cleaner(record: dict) -> dict: "warnings": None, } - cleaned_record["data"]["cleaned_chunk"] = clean_text(record["data"]["content"]) + cleaned_record["data"] = clean_text_and_extract_metadata( + record["data"]["content"] + ) except Exception as e: logging.error("string cleanup Error: %s", e) diff --git a/adi_function_app/requirements.txt b/adi_function_app/requirements.txt index 7ac403b..ca11d4d 100644 --- a/adi_function_app/requirements.txt +++ b/adi_function_app/requirements.txt @@ -19,6 +19,7 @@ azure-ai-vision-imageanalysis PyMuPDF aiohttp Pillow +numpy spacy en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1.tar.gz tiktoken diff --git a/adi_function_app/semantic_text_chunker.py b/adi_function_app/semantic_text_chunker.py index 78dd82f..c39180d 100644 --- a/adi_function_app/semantic_text_chunker.py +++ b/adi_function_app/semantic_text_chunker.py @@ -1,20 +1,19 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -# This code originates from: https://github.com/microsoft/dstoolkit-text2sql-and-imageprocessing import logging import json import re import tiktoken import spacy -from sklearn.metrics.pairwise import cosine_similarity +import numpy as np class SemanticTextChunker: def __init__( self, - num_surrounding_sentences=1, - similarity_threshold=0.8, - max_chunk_tokens=100, + num_surrounding_sentences: int = 1, + similarity_threshold: float = 0.8, + max_chunk_tokens: int = 200, ): self.num_surrounding_sentences = num_surrounding_sentences self.similarity_threshold = similarity_threshold @@ -24,25 +23,15 @@ def __init__( except IOError as e: raise ValueError("Spacy model 'en_core_web_md' not found.") from e - def sentence_contains_table(self, text: str) -> bool: - """Detects if a sentence contains table tags. - - Args: - text (str): The text to check. - - Returns: - bool: If it contains a table.""" - return "" in text or "
" in text - - def sentence_contains_figure(self, text: str) -> bool: - """Detects if a sentence contains figure tags. - - Args: - text (str): The text to check. + def sentence_contains_figure_or_table(self, text: str): + return ("" in text) or ( + "" in text or "
" in text + ) - Returns: - bool: If it contains a figure.""" - return "
" in text or "
" in text + def sentence_is_complete_figure_or_table(self, text: str): + return ("" in text) or ( + "" in text and "
" in text + ) def num_tokens_from_string(self, string: str) -> int: """Gets the number of tokens in a string using a specific encoding. @@ -52,51 +41,12 @@ def num_tokens_from_string(self, string: str) -> int: Returns: int: The number of tokens in the string.""" - encoding = tiktoken.get_encoding("cl100k_base") - return len(encoding.encode(string)) - def get_sections(self, cleaned_text: str) -> list: - """ - Returns the section details from the content - - Args: - cleaned_text: The input text - - Returns: - list: The sections related to text - - """ - combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n" - doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL) - doc_metadata = [match for group in doc_metadata for match in group if match] - return self.clean_sections(doc_metadata) - - def clean_sections(self, sections: list) -> list: - """Cleans the sections by removing special characters and extra white spaces.""" - cleaned_sections = [re.sub(r"[=#]", "", match).strip() for match in sections] + encoding = tiktoken.get_encoding("cl100k_base") - return cleaned_sections + return len(encoding.encode(string)) async def chunk(self, text: str) -> list[dict]: - """Attempts to chunk the text and then assigns the sections from the relevant chunk, to a separate field. - - Args: - text (str): The set of text to chunk - - Returns: - list(dict): The list of matching chunks and sections""" - final_chunks = self.chunk_into_sentences(text) - - # now extract section data - chunk_and_section_output = [] - for chunk in final_chunks: - sections = self.get_sections(chunk) - - chunk_and_section_output.append({"content": chunk, "sections": sections}) - - return chunk_and_section_output - - def chunk_into_sentences(self, text: str) -> list[str]: """Attempts to chunk the text by: Splitting into sentences Grouping sentences that contain figures and tables @@ -113,14 +63,20 @@ def chunk_into_sentences(self, text: str) -> list[str]: grouped_sentences, is_table_or_figure_map, ) = self.group_figures_and_tables_into_sentences(sentences) - forward_pass_chunks, is_table_or_figure_map = self.merge_chunks( + + forward_pass_chunks, new_is_table_or_figure_map = self.merge_chunks( grouped_sentences, is_table_or_figure_map ) backwards_pass_chunks, _ = self.merge_chunks( - forward_pass_chunks, is_table_or_figure_map, forwards_direction=False + forward_pass_chunks, new_is_table_or_figure_map, forwards_direction=False + ) + + backwards_pass_chunks = list( + map(lambda x: x.strip(), reversed(backwards_pass_chunks)) ) - return backwards_pass_chunks + + return list(reversed(backwards_pass_chunks)) def split_into_sentences(self, text: str) -> list[str]: """Splits a set of text into a list of sentences uses the Spacy NLP model. 
@@ -131,11 +87,25 @@ def split_into_sentences(self, text: str) -> list[str]: Returns: list(str): The extracted sentences """ - doc = self._nlp_model(text) + + def replace_newlines_outside_html(text): + def replacement(match): + # Only replace if \n is outside HTML tags + if "<" not in match.group(0) and ">" not in match.group(0): + return match.group(0).replace("\n", " ") + return match.group(0) + + # Match sequences of non-whitespace characters with \n outside tags + return re.sub(r"[^<>\s]+\n[^<>\s]+", replacement, text) + + doc = self._nlp_model(replace_newlines_outside_html(text)) sentences = [sent.text for sent in doc.sents] + + print(len(sentences)) + return sentences - def group_figures_and_tables_into_sentences(self, sentences): + def group_figures_and_tables_into_sentences(self, sentences: list[str]): grouped_sentences = [] holding_sentences = [] @@ -144,19 +114,18 @@ def group_figures_and_tables_into_sentences(self, sentences): is_grouped_sentence = False for current_sentence in sentences: if is_grouped_sentence is False: - if self.sentence_contains_figure( - current_sentence - ) or self.sentence_contains_table(current_sentence): + if self.sentence_is_complete_figure_or_table(current_sentence): + grouped_sentences.append(current_sentence) + is_table_or_figure_map.append(True) + elif self.sentence_contains_figure_or_table(current_sentence): is_grouped_sentence = True holding_sentences.append(current_sentence) else: grouped_sentences.append(current_sentence) - is_table_or_figure_map = False + is_table_or_figure_map.append(False) else: # check for ending case - if self.sentence_contains_figure( - current_sentence - ) or self.sentence_contains_table(current_sentence): + if self.sentence_contains_figure_or_table(current_sentence): holding_sentences.append(current_sentence) full_sentence = " ".join(holding_sentences) @@ -164,7 +133,7 @@ def group_figures_and_tables_into_sentences(self, sentences): holding_sentences = [] is_grouped_sentence = False - is_table_or_figure_map = True + is_table_or_figure_map.append(True) else: holding_sentences.append(current_sentence) @@ -186,7 +155,7 @@ def look_ahead_and_behind_sentences( ] surround_sentences_gap_to_test = self.num_surrounding_sentences - if current_sentence_index + self.num_surrounding_sentences > total_sentences: + if current_sentence_index + self.num_surrounding_sentences >= total_sentences: is_table_or_figure_ahead = is_table_or_figure_map[-1] surround_sentences_gap_to_test = total_sentences - current_sentence_index else: @@ -212,27 +181,32 @@ def look_ahead_and_behind_sentences( min(surround_sentences_gap_to_test, distance_to_next_figure), ) - def merge_similar_chunks(self, current_sentence, current_chunk): + def merge_similar_chunks(self, current_sentence, current_chunk, forwards_direction): new_chunk = None - # Current chunk will be updated in place # Only compare when we have 2 or more chunks + + if forwards_direction is False: + directional_current_chunk = list(reversed(current_chunk)) + else: + directional_current_chunk = current_chunk + if len(current_chunk) >= 2: cosine_sim = self.sentence_similarity( - " ".join(current_chunk[-2:]), current_sentence + " ".join(directional_current_chunk[-2:]), current_sentence ) if ( cosine_sim < self.similarity_threshold - or self.num_tokens_from_string(" ".join(current_chunk)) - > self.max_chunk_tokens + or self.num_tokens_from_string(" ".join(directional_current_chunk)) + >= self.max_chunk_tokens ): if len(current_chunk) > 2: - new_chunk = " ".join(current_chunk[:-1]) - current_chunk = 
[current_chunk[-1]] + new_chunk = " ".join(directional_current_chunk[:1]) + current_chunk = [directional_current_chunk[-1]] else: new_chunk = current_chunk[0] current_chunk = [current_chunk[1]] - return new_chunk + return new_chunk, current_chunk def merge_chunks(self, sentences, is_table_or_figure_map, forwards_direction=True): chunks = [] @@ -241,10 +215,16 @@ def merge_chunks(self, sentences, is_table_or_figure_map, forwards_direction=Tru total_sentences = len(sentences) index = 0 + def retrieve_current_chunk(): + if forwards_direction: + return " ".join(current_chunk) + else: + return " ".join(reversed(current_chunk)) + new_is_table_or_figure_map = [] while index < total_sentences: if forwards_direction is False: - current_sentence_index = total_sentences - index + current_sentence_index = total_sentences - index - 1 else: current_sentence_index = index @@ -252,12 +232,20 @@ def merge_chunks(self, sentences, is_table_or_figure_map, forwards_direction=Tru # Detect if table or figure if is_table_or_figure_map[current_sentence_index]: + new_is_table_or_figure_map.append(True) if forwards_direction: - current_chunk.append(current_chunk) + if len(current_chunk) > 0: + current_chunk.append(current_sentence) + chunks.append(retrieve_current_chunk()) + current_chunk = [] + else: + current_chunk.append(current_sentence) else: # On the backwards pass we don't want to add to the table chunk - chunks.append(" ".join(current_chunk)) - chunks.append(current_chunk) + chunks.append(retrieve_current_chunk()) + current_chunk.append(current_sentence) + + index += 1 continue elif forwards_direction: # Look ahead to see if figure of table is coming up @@ -273,18 +261,24 @@ def merge_chunks(self, sentences, is_table_or_figure_map, forwards_direction=Tru if is_table_or_figure_behind: # Finish off current_chunk.append(current_sentence) - chunks.append(" ".join(current_chunk)) - new_is_table_or_figure_map.append(True) + chunks.append(retrieve_current_chunk()) current_chunk = [] + + index += 1 continue elif is_table_or_figure_ahead: # Add to the ahead chunk - chunks.append(" ".join(current_chunk)) - new_is_table_or_figure_map.append(True) - current_chunk = sentences[ - current_sentence_index:min_of_distance_to_next_figure_or_num_surrounding_sentences - ] - + chunks.append(retrieve_current_chunk()) + if forwards_direction: + current_chunk = sentences[ + current_sentence_index : current_sentence + + min_of_distance_to_next_figure_or_num_surrounding_sentences + ] + else: + current_chunk = sentences[ + current_sentence_index : current_sentence_index + - min_of_distance_to_next_figure_or_num_surrounding_sentences : -1 + ] index += min_of_distance_to_next_figure_or_num_surrounding_sentences continue @@ -294,28 +288,38 @@ def merge_chunks(self, sentences, is_table_or_figure_map, forwards_direction=Tru if num_tokens >= self.max_chunk_tokens: chunks.append(current_sentence) new_is_table_or_figure_map.append(False) - continue else: current_chunk.append(current_sentence) - new_chunk = self.merge_similar_chunks(current_sentence, current_chunk) + new_chunk, current_chunk = self.merge_similar_chunks( + current_sentence, + current_chunk, + forwards_direction=forwards_direction, + ) - if new_chunk is not None: - chunks.append(new_chunk) - new_is_table_or_figure_map.append(False) + if new_chunk is not None: + chunks.append(new_chunk) + new_is_table_or_figure_map.append(False) index += 1 if len(current_chunk) > 0: - chunks.append(" ".join(current_chunk)) - new_is_table_or_figure_map.append(False) + final_chunk = " 
".join(current_chunk) + chunks.append(final_chunk) + + new_is_table_or_figure_map.append( + self.sentence_contains_figure_or_table(final_chunk) + ) return chunks, new_is_table_or_figure_map - def sentence_similarity(self, text1, text2): - vec1 = self._nlp_model(text1).vector - vec2 = self._nlp_model(text2).vector - return cosine_similarity([vec1], [vec2])[0, 0] + def sentence_similarity(self, text_1, text_2): + vec1 = self._nlp_model(text_1).vector + vec2 = self._nlp_model(text_2).vector + + dot_product = np.dot(vec1, vec2) + magnitude = np.linalg.norm(vec1) * np.linalg.norm(vec2) + return dot_product / magnitude if magnitude != 0 else 0.0 async def process_semantic_text_chunker(record: dict, text_chunker) -> dict: @@ -340,7 +344,9 @@ async def process_semantic_text_chunker(record: dict, text_chunker) -> dict: } # scenarios when page by chunking is enabled - cleaned_record["data"] = await text_chunker.chunk(record["data"]["content"]) + cleaned_record["data"]["chunks"] = await text_chunker.chunk( + record["data"]["content"] + ) except Exception as e: logging.error("Chunking Error: %s", e) @@ -357,4 +363,5 @@ async def process_semantic_text_chunker(record: dict, text_chunker) -> dict: json_str = json.dumps(cleaned_record, indent=4) logging.info(f"Chunking output: {json_str}") + return cleaned_record diff --git a/deploy_ai_search/ai_search.py b/deploy_ai_search/ai_search.py index a3205bb..9e3689b 100644 --- a/deploy_ai_search/ai_search.py +++ b/deploy_ai_search/ai_search.py @@ -196,7 +196,7 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection: return data_source_connection - def get_pre_embedding_cleaner_skill(self, context, source) -> WebApiSkill: + def get_mark_up_cleaner_skill(self, context, source) -> WebApiSkill: """Get the custom skill for data cleanup. Args: @@ -215,38 +215,38 @@ def get_pre_embedding_cleaner_skill(self, context, source) -> WebApiSkill: batch_size = 16 degree_of_parallelism = 16 - pre_embedding_cleaner_skill_inputs = [ + mark_up_cleaner_skill_inputs = [ InputFieldMappingEntry(name="chunk", source=source) ] - pre_embedding_cleaner_skill_outputs = [ + mark_up_cleaner_skill_outputs = [ OutputFieldMappingEntry(name="cleaned_chunk", target_name="cleaned_chunk") ] - pre_embedding_cleaner_skill = WebApiSkill( + mark_up_cleaner_skill = WebApiSkill( name="Pre Embedding Cleaner Skill", description="Skill to clean the data before sending to embedding", context=context, - uri=self.environment.get_custom_skill_function_url("pre_embedding_cleaner"), + uri=self.environment.get_custom_skill_function_url("mark_up_cleaner"), timeout="PT230S", batch_size=batch_size, degree_of_parallelism=degree_of_parallelism, http_method="POST", - inputs=pre_embedding_cleaner_skill_inputs, - outputs=pre_embedding_cleaner_skill_outputs, + inputs=mark_up_cleaner_skill_inputs, + outputs=mark_up_cleaner_skill_outputs, ) if self.environment.identity_type != IdentityType.KEY: - pre_embedding_cleaner_skill.auth_identity = ( + mark_up_cleaner_skill.auth_identity = ( self.environment.function_app_app_registration_resource_id ) if self.environment.identity_type == IdentityType.USER_ASSIGNED: - pre_embedding_cleaner_skill.auth_identity = ( + mark_up_cleaner_skill.auth_identity = ( self.environment.ai_search_user_assigned_identity ) - return pre_embedding_cleaner_skill + return mark_up_cleaner_skill def get_text_split_skill(self, context, source) -> SplitSkill: """Get the skill for text split. 
diff --git a/deploy_ai_search/environment.py b/deploy_ai_search/environment.py index c42a18c..8c35b6b 100644 --- a/deploy_ai_search/environment.py +++ b/deploy_ai_search/environment.py @@ -198,7 +198,7 @@ def function_app_app_registration_resource_id(self) -> str: return os.environ.get("FunctionApp__AppRegistrationResourceId") @property - def function_app_pre_embedding_cleaner_route(self) -> str: + def function_app_mark_up_cleaner_route(self) -> str: """ This function returns function app data cleanup function name """ @@ -250,8 +250,8 @@ def get_custom_skill_function_url(self, skill_type: str): """ Get the function app url that is hosting the custom skill """ - if skill_type == "pre_embedding_cleaner": - route = self.function_app_pre_embedding_cleaner_route + if skill_type == "mark_up_cleaner": + route = self.function_app_mark_up_cleaner_route elif skill_type == "adi": route = self.function_app_adi_route elif skill_type == "key_phrase_extraction": diff --git a/deploy_ai_search/rag_documents.py b/deploy_ai_search/rag_documents.py index 1d51874..3f1cb41 100644 --- a/deploy_ai_search/rag_documents.py +++ b/deploy_ai_search/rag_documents.py @@ -171,7 +171,7 @@ def get_skills(self) -> list: "/document", "/document/extracted_content/content" ) - pre_embedding_cleaner_skill = self.get_pre_embedding_cleaner_skill( + mark_up_cleaner_skill = self.get_mark_up_cleaner_skill( "/document/chunks/*", "/document/chunks/*/content" ) @@ -186,7 +186,7 @@ def get_skills(self) -> list: if self.enable_page_by_chunking: skills = [ adi_skill, - pre_embedding_cleaner_skill, + mark_up_cleaner_skill, key_phrase_extraction_skill, embedding_skill, ] @@ -194,7 +194,7 @@ def get_skills(self) -> list: skills = [ adi_skill, text_split_skill, - pre_embedding_cleaner_skill, + mark_up_cleaner_skill, key_phrase_extraction_skill, embedding_skill, ] From 963eb2d811de814bba7114e106a9b122082b0999 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Mon, 18 Nov 2024 19:39:52 +0000 Subject: [PATCH 03/10] Update function app code --- adi_function_app/adi_2_ai_search.py | 35 +++++++++-------------------- adi_function_app/mark_up_cleaner.py | 22 ++++++++++++++++-- deploy_ai_search/ai_search.py | 33 +++++++++++++++++++++------ deploy_ai_search/rag_documents.py | 6 ++--- 4 files changed, 59 insertions(+), 37 deletions(-) diff --git a/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py index 3b7c509..e1c3e31 100644 --- a/adi_function_app/adi_2_ai_search.py +++ b/adi_function_app/adi_2_ai_search.py @@ -23,7 +23,6 @@ async def build_and_clean_markdown_for_response( markdown_text: str, - figures: dict, page_no: int = None, remove_irrelevant_figures=False, ): @@ -39,7 +38,6 @@ async def build_and_clean_markdown_for_response( str: The cleaned Markdown text. 
""" - output_dict = {} comment_patterns = r"||||" cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL) @@ -52,15 +50,14 @@ async def build_and_clean_markdown_for_response( logging.info(f"Cleaned Text: {cleaned_text}") - output_dict["content"] = cleaned_text - - output_dict["figures"] = figures - # add page number when chunk by page is enabled if page_no is not None: + output_dict = {} + output_dict["content"] = cleaned_text output_dict["pageNumber"] = page_no - - return output_dict + return output_dict + else: + return cleaned_text def update_figure_description( @@ -323,23 +320,15 @@ async def process_figures_from_extracted_content( ) ) - figure_ids = [ - figure_processing_data[0] for figure_processing_data in figure_processing_datas - ] logging.info("Running image understanding tasks") figure_descriptions = await asyncio.gather(*figure_understanding_tasks) logging.info("Finished image understanding tasks") logging.info(f"Image Descriptions: {figure_descriptions}") logging.info("Running image upload tasks") - figure_uris = await asyncio.gather(*figure_upload_tasks) + await asyncio.gather(*figure_upload_tasks) logging.info("Finished image upload tasks") - figures = [ - {"figureId": figure_id, "figureUri": figure_uri} - for figure_id, figure_uri in zip(figure_ids, figure_uris) - ] - running_offset = 0 for figure_processing_data, figure_description in zip( figure_processing_datas, figure_descriptions @@ -355,7 +344,7 @@ async def process_figures_from_extracted_content( ) running_offset += desc_offset - return markdown_content, figures + return markdown_content def create_page_wise_content(result: AnalyzeResult) -> list: @@ -586,8 +575,7 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> ): build_and_clean_markdown_for_response_tasks.append( build_and_clean_markdown_for_response( - extracted_page_content[0], - extracted_page_content[1], + extracted_page_content, page_number, True, ) @@ -609,10 +597,7 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> else: markdown_content = result.content - ( - extracted_content, - figures, - ) = await process_figures_from_extracted_content( + (extracted_content) = await process_figures_from_extracted_content( result, operation_id, container_and_blob, @@ -622,7 +607,7 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> ) cleaned_result = await build_and_clean_markdown_for_response( - extracted_content, figures, remove_irrelevant_figures=True + extracted_content, remove_irrelevant_figures=True ) except Exception as e: logging.error(e) diff --git a/adi_function_app/mark_up_cleaner.py b/adi_function_app/mark_up_cleaner.py index ac27c1a..979ffc6 100644 --- a/adi_function_app/mark_up_cleaner.py +++ b/adi_function_app/mark_up_cleaner.py @@ -29,6 +29,13 @@ def clean_sections(sections: list) -> list: return cleaned_sections +def extract_figure_ids(text: str) -> list: + # Regex pattern to capture FigureId values + figure_id_pattern = r' str: """ Remove specified Markdown tags from the text, keeping the contents of the tags. 
@@ -52,7 +59,9 @@ def remove_markdown_tags(text: str, tag_patterns: dict) -> str: return text -def clean_text_and_extract_metadata(src_text: str) -> tuple[str, str]: +def clean_text_and_extract_metadata( + src_text: str, figure_storage_prefix: str +) -> tuple[str, str]: """This function performs following cleanup activities on the text, remove all unicode characters remove line spacing,remove stop words, normalize characters @@ -73,6 +82,15 @@ def clean_text_and_extract_metadata(src_text: str) -> tuple[str, str]: return_record["marked_up_chunk"] = src_text return_record["sections"] = get_sections(src_text) + figure_ids = extract_figure_ids(src_text) + + figures = [] + for figure_id in figure_ids: + figure_uri = f"{figure_storage_prefix}/{figure_id}" + figures.append({"figure_id": figure_id, "figure_uri": figure_uri}) + + return_record["figures"] = figures + # Define specific patterns for each tag tag_patterns = { "figurecontent": r"", @@ -120,7 +138,7 @@ async def process_mark_up_cleaner(record: dict) -> dict: } cleaned_record["data"] = clean_text_and_extract_metadata( - record["data"]["content"] + record["data"]["chunk"], record["data"]["figure_storage_prefix"] ) except Exception as e: diff --git a/deploy_ai_search/ai_search.py b/deploy_ai_search/ai_search.py index 9e3689b..6258490 100644 --- a/deploy_ai_search/ai_search.py +++ b/deploy_ai_search/ai_search.py @@ -216,15 +216,20 @@ def get_mark_up_cleaner_skill(self, context, source) -> WebApiSkill: degree_of_parallelism = 16 mark_up_cleaner_skill_inputs = [ - InputFieldMappingEntry(name="chunk", source=source) + InputFieldMappingEntry(name="chunk", source=source), + InputFieldMappingEntry( + name="figure_storage_prefix", source="/document/metadata_storage_path" + ), ] mark_up_cleaner_skill_outputs = [ - OutputFieldMappingEntry(name="cleaned_chunk", target_name="cleaned_chunk") + OutputFieldMappingEntry(name="cleaned_chunk", target_name="cleaned_chunk"), + OutputFieldMappingEntry(name="chunk", target_name="chunk"), + OutputFieldMappingEntry(name="sections", target_name="sections"), ] mark_up_cleaner_skill = WebApiSkill( - name="Pre Embedding Cleaner Skill", + name="Mark Up Cleaner Skill", description="Skill to clean the data before sending to embedding", context=context, uri=self.environment.get_custom_skill_function_url("mark_up_cleaner"), @@ -248,14 +253,23 @@ def get_mark_up_cleaner_skill(self, context, source) -> WebApiSkill: return mark_up_cleaner_skill - def get_text_split_skill(self, context, source) -> SplitSkill: + def get_text_split_skill( + self, + context, + source, + num_surrounding_sentences: int = 1, + similarity_threshold: float = 0.8, + max_chunk_tokens: int = 200, + ) -> SplitSkill: """Get the skill for text split. Args: ----- context (str): The context of the skill - inputs (List[InputFieldMappingEntry]): The inputs of the skill - outputs (List[OutputFieldMappingEntry]): The outputs of the skill + source (str): The source of the skill + num_surrounding_sentences (int, optional): The number of surrounding sentences. Defaults to 1. + similarity_threshold (float, optional): The similarity threshold. Defaults to 0.8. + max_chunk_tokens (int, optional): The maximum number of tokens. Defaults to 200. 
Returns: -------- @@ -277,7 +291,7 @@ def get_text_split_skill(self, context, source) -> SplitSkill: ] semantic_text_chunker_skill = WebApiSkill( - name="Pre Embedding Cleaner Skill", + name="Mark Up Cleaner Skill", description="Skill to clean the data before sending to embedding", context=context, uri=self.environment.get_custom_skill_function_url("semantic_text_chunker"), @@ -285,6 +299,11 @@ def get_text_split_skill(self, context, source) -> SplitSkill: batch_size=batch_size, degree_of_parallelism=degree_of_parallelism, http_method="POST", + http_headers={ + "num_surrounding_sentences": num_surrounding_sentences, + "similarity_threshold": similarity_threshold, + "max_chunk_tokens": max_chunk_tokens, + }, inputs=semantic_text_chunker_skill_inputs, outputs=semantic_text_chunker_skill_outputs, ) diff --git a/deploy_ai_search/rag_documents.py b/deploy_ai_search/rag_documents.py index 3f1cb41..7d63856 100644 --- a/deploy_ai_search/rag_documents.py +++ b/deploy_ai_search/rag_documents.py @@ -204,7 +204,7 @@ def get_skills(self) -> list: def get_index_projections(self) -> SearchIndexerIndexProjection: """This function returns the index projections for rag document.""" mappings = [ - InputFieldMappingEntry(name="Chunk", source="/document/chunks/*/content"), + InputFieldMappingEntry(name="Chunk", source="/document/chunks/*/chunk"), InputFieldMappingEntry( name="ChunkEmbedding", source="/document/chunks/*/vector", @@ -222,11 +222,11 @@ def get_index_projections(self) -> SearchIndexerIndexProjection: source_context="/document/chunks/*/figures/*", inputs=[ InputFieldMappingEntry( - name="FigureId", source="/document/chunks/*/figures/*/figureId" + name="FigureId", source="/document/chunks/*/figures/*/figure_id" ), InputFieldMappingEntry( name="FigureUri", - source="/document/chunks/*/figures/*/figureUri", + source="/document/chunks/*/figures/*/figure_uri", ), ], ), From f56390a0e04850d14107fadf4e37bea7e7225171 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Mon, 18 Nov 2024 19:41:10 +0000 Subject: [PATCH 04/10] Store the figure ids --- adi_function_app/mark_up_cleaner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adi_function_app/mark_up_cleaner.py b/adi_function_app/mark_up_cleaner.py index 979ffc6..c59b196 100644 --- a/adi_function_app/mark_up_cleaner.py +++ b/adi_function_app/mark_up_cleaner.py @@ -86,7 +86,7 @@ def clean_text_and_extract_metadata( figures = [] for figure_id in figure_ids: - figure_uri = f"{figure_storage_prefix}/{figure_id}" + figure_uri = f"{figure_storage_prefix}/{figure_id}.png" figures.append({"figure_id": figure_id, "figure_uri": figure_uri}) return_record["figures"] = figures From 010fe0312fb57c96ec04f4769ee52ea03ef5bdef Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Mon, 25 Nov 2024 12:19:44 +0000 Subject: [PATCH 05/10] Update text chunker --- adi_function_app/semantic_text_chunker.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/adi_function_app/semantic_text_chunker.py b/adi_function_app/semantic_text_chunker.py index c39180d..f775afd 100644 --- a/adi_function_app/semantic_text_chunker.py +++ b/adi_function_app/semantic_text_chunker.py @@ -232,17 +232,18 @@ def retrieve_current_chunk(): # Detect if table or figure if is_table_or_figure_map[current_sentence_index]: - new_is_table_or_figure_map.append(True) if forwards_direction: if len(current_chunk) > 0: current_chunk.append(current_sentence) chunks.append(retrieve_current_chunk()) + new_is_table_or_figure_map.append(True) current_chunk = [] else: 
current_chunk.append(current_sentence) else: # On the backwards pass we don't want to add to the table chunk chunks.append(retrieve_current_chunk()) + new_is_table_or_figure_map.append(True) current_chunk.append(current_sentence) index += 1 @@ -262,6 +263,7 @@ def retrieve_current_chunk(): # Finish off current_chunk.append(current_sentence) chunks.append(retrieve_current_chunk()) + new_is_table_or_figure_map.append(False) current_chunk = [] index += 1 @@ -269,9 +271,10 @@ def retrieve_current_chunk(): elif is_table_or_figure_ahead: # Add to the ahead chunk chunks.append(retrieve_current_chunk()) + new_is_table_or_figure_map.append(False) if forwards_direction: current_chunk = sentences[ - current_sentence_index : current_sentence + current_sentence_index : current_sentence_index + min_of_distance_to_next_figure_or_num_surrounding_sentences ] else: From 09b671b69f403659ae1cd18449fa4faa895ad6de Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Mon, 25 Nov 2024 15:11:59 +0000 Subject: [PATCH 06/10] Update the chunker --- adi_function_app/adi_2_ai_search.py | 13 +- adi_function_app/semantic_text_chunker.py | 189 +++++++++++++++++----- 2 files changed, 160 insertions(+), 42 deletions(-) diff --git a/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py index e1c3e31..154b63b 100644 --- a/adi_function_app/adi_2_ai_search.py +++ b/adi_function_app/adi_2_ai_search.py @@ -38,12 +38,19 @@ async def build_and_clean_markdown_for_response( str: The cleaned Markdown text. """ - comment_patterns = r"||||" - cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL) + # Pattern to match the comment start `` + # Matches opening `|\<)" + + # Using re.sub to remove comments + cleaned_text = re.sub( + f"{comment_start_pattern}.*?{comment_end_pattern}", "", markdown_text + ) # Remove irrelevant figures if remove_irrelevant_figures: - irrelevant_figure_pattern = r"\s*" + irrelevant_figure_pattern = r"]*>.*?Irrelevant Image.*?" cleaned_text = re.sub( irrelevant_figure_pattern, "", cleaned_text, flags=re.DOTALL ) diff --git a/adi_function_app/semantic_text_chunker.py b/adi_function_app/semantic_text_chunker.py index f775afd..b7e7cef 100644 --- a/adi_function_app/semantic_text_chunker.py +++ b/adi_function_app/semantic_text_chunker.py @@ -7,6 +7,8 @@ import spacy import numpy as np +logging.basicConfig(level=logging.INFO) + class SemanticTextChunker: def __init__( @@ -14,18 +16,26 @@ def __init__( num_surrounding_sentences: int = 1, similarity_threshold: float = 0.8, max_chunk_tokens: int = 200, + min_chunk_tokens: int = 50, ): self.num_surrounding_sentences = num_surrounding_sentences self.similarity_threshold = similarity_threshold self.max_chunk_tokens = max_chunk_tokens + self.min_chunk_tokens = min_chunk_tokens try: self._nlp_model = spacy.load("en_core_web_md") except IOError as e: raise ValueError("Spacy model 'en_core_web_md' not found.") from e + def sentence_contains_figure_or_table_ending(self, text: str): + return "" in text or "" in text + def sentence_contains_figure_or_table(self, text: str): - return ("" in text) or ( - "" in text or "
" in text + return ( + ("" in text) + or ("" in text or "
" in text) + or ("" in text) + or ("" in text) ) def sentence_is_complete_figure_or_table(self, text: str): @@ -59,6 +69,7 @@ async def chunk(self, text: str) -> list[dict]: list(str): The list of chunks""" sentences = self.split_into_sentences(text) + ( grouped_sentences, is_table_or_figure_map, @@ -68,15 +79,54 @@ async def chunk(self, text: str) -> list[dict]: grouped_sentences, is_table_or_figure_map ) + logging.info( + f"""Number of Forward pass chunks: { + len(forward_pass_chunks)}""" + ) + logging.info(f"Forward pass chunks: {forward_pass_chunks}") + backwards_pass_chunks, _ = self.merge_chunks( forward_pass_chunks, new_is_table_or_figure_map, forwards_direction=False ) - backwards_pass_chunks = list( - map(lambda x: x.strip(), reversed(backwards_pass_chunks)) + reversed_backwards_pass_chunks = list(reversed(backwards_pass_chunks)) + + logging.info( + f"""Number of Backaward pass chunks: { + len(reversed_backwards_pass_chunks)}""" ) + logging.info(f"Backward pass chunks: {reversed_backwards_pass_chunks}") + + cleaned_final_chunks = [] + for chunk in reversed_backwards_pass_chunks: + stripped_chunk = chunk.strip() + if len(stripped_chunk) > 0: + cleaned_final_chunks.append(stripped_chunk) + + logging.info(f"Number of final chunks: {len(cleaned_final_chunks)}") + logging.info(f"Chunks: {cleaned_final_chunks}") + + return cleaned_final_chunks + + def filter_empty_figures(self, text): + # Regular expression to match
<figure>...</figure> with only newlines or spaces in between
+        pattern = r"<figure>\s*</figure>"
 
-        return list(reversed(backwards_pass_chunks))
+        # Replace any matches of the pattern with an empty string
+        filtered_text = re.sub(pattern, "", text)
+
+        return filtered_text
+
+    def clean_new_lines(self, text):
+        # Remove single newlines surrounded by < and >
+        cleaned_text = re.sub(r"(?<=>)(\n)(?=<)", "", text)
+
+        # Replace all other single newlines with space
+        cleaned_text = re.sub(r"(?<!\n)\n(?!\n)", " ", cleaned_text)
+
+        return cleaned_text
+
     def split_into_sentences(self, text: str) -> list[str]:
         """Splits a set of text into a list of sentences uses the Spacy NLP model.
@@ -88,22 +138,45 @@ def split_into_sentences(self, text: str) -> list[str]:
             list(str): The extracted sentences
         """
 
-        def replace_newlines_outside_html(text):
-            def replacement(match):
-                # Only replace if \n is outside HTML tags
-                if "<" not in match.group(0) and ">" not in match.group(0):
-                    return match.group(0).replace("\n", " ")
-                return match.group(0)
+        cleaned_text = self.clean_new_lines(text)
+
+        # Filter out empty <figure>...</figure>
tags + cleaned_text = self.filter_empty_figures(cleaned_text) + + doc = self._nlp_model(cleaned_text) + + tag_split_sentences = [] + # Pattern to match the closing and opening tag junctions with whitespace in between + split_pattern = r"(\s*]*>|\s*]*>)" + for sent in doc.sents: + split_result = re.split(split_pattern, sent.text) + for part in split_result: + # Match the junction and split it into two parts + if re.match(split_pattern, part): + # Split at the first whitespace + tag_split = part.split(" ", 1) + # Add the closing tag (e.g., ) + tag_split_sentences.append(tag_split[0]) + if len(tag_split) > 1: + # Add the rest of the string with leading space + tag_split_sentences.append(" " + tag_split[1]) + else: + tag_split_sentences.append(part) - # Match sequences of non-whitespace characters with \n outside tags - return re.sub(r"[^<>\s]+\n[^<>\s]+", replacement, text) + # Now apply a split pattern against markdown headings + heading_split_sentences = [] - doc = self._nlp_model(replace_newlines_outside_html(text)) - sentences = [sent.text for sent in doc.sents] + # Iterate through each sentence in tag_split_sentences + for sent in tag_split_sentences: + # Use re.split to split on \n\n and headings, but keep \n\n in the result + split_result = re.split(r"(\n\n|#+ .*)", sent) - print(len(sentences)) + # Extend the result with the correctly split parts, retaining \n\n before the heading + for part in split_result: + if part.strip(): # Only add non-empty parts + heading_split_sentences.append(part) - return sentences + return heading_split_sentences def group_figures_and_tables_into_sentences(self, sentences: list[str]): grouped_sentences = [] @@ -125,7 +198,7 @@ def group_figures_and_tables_into_sentences(self, sentences: list[str]): is_table_or_figure_map.append(False) else: # check for ending case - if self.sentence_contains_figure_or_table(current_sentence): + if self.sentence_contains_figure_or_table_ending(current_sentence): holding_sentences.append(current_sentence) full_sentence = " ".join(holding_sentences) @@ -137,6 +210,8 @@ def group_figures_and_tables_into_sentences(self, sentences: list[str]): else: holding_sentences.append(current_sentence) + assert len(holding_sentences) == 0, "Holding sentences should be empty" + return grouped_sentences, is_table_or_figure_map def look_ahead_and_behind_sentences( @@ -183,31 +258,50 @@ def look_ahead_and_behind_sentences( def merge_similar_chunks(self, current_sentence, current_chunk, forwards_direction): new_chunk = None - # Only compare when we have 2 or more chunks - if forwards_direction is False: - directional_current_chunk = list(reversed(current_chunk)) - else: - directional_current_chunk = current_chunk + def retrieve_current_chunk_up_to_n(n): + if forwards_direction: + return " ".join(current_chunk[:-n]) + else: + return " ".join(reversed(current_chunk[:-n])) - if len(current_chunk) >= 2: + def retrieve_current_chunks_from_n(n): + if forwards_direction: + return " ".join(current_chunk[n:]) + else: + return " ".join(reversed(current_chunk[:-n])) + + def retrive_current_chunk_at_n(n): + if forwards_direction: + return current_chunk[n] + else: + return current_chunk[n] + + current_chunk_tokens = self.num_tokens_from_string(" ".join(current_chunk)) + + if len(current_chunk) >= 2 and current_chunk_tokens >= self.min_chunk_tokens: + logging.debug("Comparing chunks") cosine_sim = self.sentence_similarity( - " ".join(directional_current_chunk[-2:]), current_sentence + retrieve_current_chunks_from_n(-2), current_sentence ) if ( cosine_sim < 
self.similarity_threshold - or self.num_tokens_from_string(" ".join(directional_current_chunk)) - >= self.max_chunk_tokens + or current_chunk_tokens >= self.max_chunk_tokens ): if len(current_chunk) > 2: - new_chunk = " ".join(directional_current_chunk[:1]) - current_chunk = [directional_current_chunk[-1]] + new_chunk = retrieve_current_chunk_up_to_n(1) + current_chunk = [retrive_current_chunk_at_n(-1)] else: - new_chunk = current_chunk[0] - current_chunk = [current_chunk[1]] + new_chunk = retrive_current_chunk_at_n(0) + current_chunk = [retrive_current_chunk_at_n(1)] + else: + logging.debug("Chunk too small to compare") return new_chunk, current_chunk + def is_markdown_heading(self, text): + return text.strip().startswith("#") + def merge_chunks(self, sentences, is_table_or_figure_map, forwards_direction=True): chunks = [] current_chunk = [] @@ -230,6 +324,10 @@ def retrieve_current_chunk(): current_sentence = sentences[current_sentence_index] + if len(current_sentence.strip()) == 0: + index += 1 + continue + # Detect if table or figure if is_table_or_figure_map[current_sentence_index]: if forwards_direction: @@ -244,7 +342,7 @@ def retrieve_current_chunk(): # On the backwards pass we don't want to add to the table chunk chunks.append(retrieve_current_chunk()) new_is_table_or_figure_map.append(True) - current_chunk.append(current_sentence) + current_chunk = [current_sentence] index += 1 continue @@ -260,11 +358,18 @@ def retrieve_current_chunk(): ) if is_table_or_figure_behind: - # Finish off - current_chunk.append(current_sentence) - chunks.append(retrieve_current_chunk()) - new_is_table_or_figure_map.append(False) - current_chunk = [] + # Check if Makrdown heading + if self.is_markdown_heading(current_sentence): + # Start new chunk + chunks.append(retrieve_current_chunk()) + new_is_table_or_figure_map.append(False) + current_chunk = [current_sentence] + else: + # Finish off + current_chunk.append(current_sentence) + chunks.append(retrieve_current_chunk()) + new_is_table_or_figure_map.append(False) + current_chunk = [] index += 1 continue @@ -307,7 +412,7 @@ def retrieve_current_chunk(): index += 1 if len(current_chunk) > 0: - final_chunk = " ".join(current_chunk) + final_chunk = retrieve_current_chunk() chunks.append(final_chunk) new_is_table_or_figure_map.append( @@ -322,7 +427,13 @@ def sentence_similarity(self, text_1, text_2): dot_product = np.dot(vec1, vec2) magnitude = np.linalg.norm(vec1) * np.linalg.norm(vec2) - return dot_product / magnitude if magnitude != 0 else 0.0 + similarity = dot_product / magnitude if magnitude != 0 else 0.0 + + logging.debug( + f"""Similarity between '{text_1}' and '{ + text_2}': {similarity}""" + ) + return similarity async def process_semantic_text_chunker(record: dict, text_chunker) -> dict: From c4b75ef7a546780dc82c65e9ccac46c61766af06 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Mon, 25 Nov 2024 15:14:17 +0000 Subject: [PATCH 07/10] Update chunker --- adi_function_app/semantic_text_chunker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/adi_function_app/semantic_text_chunker.py b/adi_function_app/semantic_text_chunker.py index b7e7cef..6cbc889 100644 --- a/adi_function_app/semantic_text_chunker.py +++ b/adi_function_app/semantic_text_chunker.py @@ -81,7 +81,7 @@ async def chunk(self, text: str) -> list[dict]: logging.info( f"""Number of Forward pass chunks: { - len(forward_pass_chunks)}""" + len(forward_pass_chunks)}""" ) logging.info(f"Forward pass chunks: {forward_pass_chunks}") @@ -93,7 +93,7 @@ async def 
chunk(self, text: str) -> list[dict]: logging.info( f"""Number of Backaward pass chunks: { - len(reversed_backwards_pass_chunks)}""" + len(reversed_backwards_pass_chunks)}""" ) logging.info(f"Backward pass chunks: {reversed_backwards_pass_chunks}") @@ -431,7 +431,7 @@ def sentence_similarity(self, text_1, text_2): logging.debug( f"""Similarity between '{text_1}' and '{ - text_2}': {similarity}""" + text_2}': {similarity}""" ) return similarity From fbe462cf0494268bac45a37d97e23b481219452e Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Mon, 25 Nov 2024 15:16:03 +0000 Subject: [PATCH 08/10] Update params --- adi_function_app/function_app.py | 2 ++ deploy_ai_search/ai_search.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/adi_function_app/function_app.py b/adi_function_app/function_app.py index 3709758..5188023 100644 --- a/adi_function_app/function_app.py +++ b/adi_function_app/function_app.py @@ -110,6 +110,7 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse: "similarity_threshold", 0.8 ) max_chunk_tokens = semantic_text_chunker_config.get("max_chunk_tokens", 500) + min_chunk_tokens = semantic_text_chunker_config.get("min_chunk_tokens", 50) except ValueError: return func.HttpResponse( @@ -124,6 +125,7 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse: num_surrounding_sentences=num_surrounding_sentences, similarity_threshold=similarity_threshold, max_chunk_tokens=max_chunk_tokens, + min_chunk_tokens=min_chunk_tokens, ) for value in values: diff --git a/deploy_ai_search/ai_search.py b/deploy_ai_search/ai_search.py index 6258490..3da8f43 100644 --- a/deploy_ai_search/ai_search.py +++ b/deploy_ai_search/ai_search.py @@ -260,6 +260,7 @@ def get_text_split_skill( num_surrounding_sentences: int = 1, similarity_threshold: float = 0.8, max_chunk_tokens: int = 200, + min_chunk_tokens: int = 50, ) -> SplitSkill: """Get the skill for text split. @@ -303,6 +304,7 @@ def get_text_split_skill( "num_surrounding_sentences": num_surrounding_sentences, "similarity_threshold": similarity_threshold, "max_chunk_tokens": max_chunk_tokens, + "min_chunk_tokens": min_chunk_tokens, }, inputs=semantic_text_chunker_skill_inputs, outputs=semantic_text_chunker_skill_outputs, From 05fdc2cc81f0f658e86b081f51f0b8a1eacf30eb Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Mon, 25 Nov 2024 15:26:26 +0000 Subject: [PATCH 09/10] Update ai search --- adi_function_app/README.md | 42 ++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/adi_function_app/README.md b/adi_function_app/README.md index b34c5a7..673d8a6 100644 --- a/adi_function_app/README.md +++ b/adi_function_app/README.md @@ -24,13 +24,21 @@ Once the Markdown is obtained, several steps are carried out: 1. **Extraction of images / charts**. The figures identified are extracted from the original document and passed to a multi-modal model (gpt4o in this case) for analysis. We obtain a description and summary of the chart / image to infer the meaning of the figure. This allows us to index and perform RAG analysis the information that is visually obtainable from a chart, without it being explicitly mentioned in the text surrounding. The information is added back into the original chart. -2. **Cleaning of Markdown**. The final markdown content is cleaned of any characters or unsupported Markdown elements that we do not want in the chunk e.g. non-relevant images. +2. **Chunking**. 
The extracted content is then chunked according to the configured strategy. This function app supports two chunking methods: **page wise** and **semantic chunking**. Page wise chunking is performed natively by Azure Document Intelligence. For semantic chunking, we include a custom chunker that splits the text with the following strategy:

   - Splits the text into sentences.
   - Groups sentences if they are table or figure related, so that tables and figures are not split apart.
   - Semantically groups sentences if the similarity is above the threshold, working forwards from the start of the text.
   - Semantically groups sentences if the similarity is above the threshold, working backwards from the end of the text.
   - Removes any chunks that end up empty.

   This chunking method aims to improve on page wise chunking whilst still keeping related sentences together. When tested, it showed clear performance improvements over straight page wise chunking, without splitting up relevant context.

3. **Cleaning of Markdown**. The final Markdown content is cleaned of any characters or unsupported Markdown elements that we do not want in the chunk, e.g. non-relevant images.
 
-Page wise analysis in ADI is used to avoid splitting tables / figures across multiple chunks, when the chunking is performed.
+The properties returned from the ADI Custom Skill and the chunking step are then used to perform the following skills:
 
-The properties returned from the ADI Custom Skill are then used to perform the following skills:
+- Markup cleaning. This stage is important as we extract the section information from the document headers in this step. Additionally, we remove any Markdown tags or characters that would cause an embedding error.
 
-- Pre-vectorisation cleaning. This stage is important as we extract the section information in this step from the headers in the document. Additionally, we remove any Markdown tags or characters that would cause an embedding error.
 - Keyphrase extraction
 - Vectorisation
 
@@ -49,18 +57,24 @@ The Figure 4 content has been interpreted and added into the extracted chunk to
 
 ## Provided Notebooks \& Utilities
 
-- `./ai_search_with_adi_function_app` provides a pre-built Python function app that communicates with Azure Document Intelligence, Azure OpenAI etc to perform the Markdown conversion, extraction of figures, figure understanding and corresponding cleaning of Markdown.
+- `./function_app` provides a pre-built Python function app that communicates with Azure Document Intelligence, Azure OpenAI etc. to perform the Markdown conversion, extraction of figures, figure understanding and corresponding cleaning of Markdown.
 - `./rag_with_ai_search.ipynb` provides example of how to utilise the AI Search plugin to query the index.
 
 ## Deploying AI Search Setup
 
 To deploy the pre-built index and associated indexer / skillset setup, see instructions in `./deploy_ai_search/README.md`.
 
-## ADI Custom Skill
+## Custom Skills
+
+Deploy the associated function app and the required resources. To use with an index, either use the utility to configure an indexer in the provided form, or integrate the skill with your skillset pipeline.
+
+### ADI Custom Skill
 
-Deploy the associated function app and required resources. You can then experiment with the custom skill by sending an HTTP request in the AI Search JSON format to the `/adi_2_ai_search` HTTP endpoint.
+You can then experiment with the custom skill by sending an HTTP request in the AI Search JSON format to the `/adi_2_ai_search` HTTP endpoint.
The `chunk_by_page` header controls the chunking technique *(page wise or not)*.
+
+### Semantic Chunker Skill
+
+You can then test the chunking by sending an AI Search JSON payload to the `/semantic_text_chunker` HTTP endpoint. The headers control the different chunking parameters *(num_surrounding_sentences, similarity_threshold, max_chunk_tokens, min_chunk_tokens)*.
 
 ### Deployment Steps
 
@@ -72,11 +86,15 @@ To use with an index, either use the utility to configure a indexer in the provi
 
 #### function_app.py
 
-`./indexer/ai_search_with_adi_function_app.py` contains the HTTP entrypoints for the ADI skill and the other provided utility skills.
+`./indexer/function_app.py` contains the HTTP entrypoints for the ADI skill and the other provided utility skills.
+
+#### semantic_text_chunker.py
 
-#### adi_2_aisearch
+`./semantic_text_chunker.py` contains the code to chunk the text semantically, whilst grouping similar sentences.
 
-`./indexer/adi_2_aisearch.py` contains the methods for content extraction with ADI. The key methods are:
+#### adi_2_ai_search.py
+
+`./indexer/adi_2_ai_search.py` contains the methods for content extraction with ADI. The key methods are:
 
 ##### analyse_document
 
@@ -183,8 +201,6 @@ If `chunk_by_page` header is `False`:
 }
 ```
 
-**Page wise analysis in ADI is recommended to avoid splitting tables / figures across multiple chunks, when the chunking is performed.**
-
 ## Other Provided Custom Skills
 
 Due to a AI Search product limitation that AI Search cannot connect to AI Services behind Private Endpoints, we provide a Custom Key Phrase Extraction Skill that will work within a Private Endpoint environment.
 
From 70e9e3638f97584f7a5bfd6fcfecfe24ca8ca98b Mon Sep 17 00:00:00 2001
From: Ben Constable
Date: Mon, 25 Nov 2024 15:27:41 +0000
Subject: [PATCH 10/10] Update readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 61127e2..d4236b8 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ It is intended that the plugins and skills provided in this repository, are adap
 ## Components
 
 - `./text_2_sql` contains an three Multi-Shot implementations for Text2SQL generation and querying which can be used to answer questions backed by a database as a knowledge base. A **prompt based** and **vector based** approach are shown, both of which exhibit great performance in answering sql queries. Additionally, a further iteration on the vector based approach is shown which uses a **query cache** to further speed up generation. With these plugins, your RAG application can now access and pull data from any SQL table exposed to it to answer questions.
-- `./adi_function_app` contains code for linking **Azure Document Intelligence** with AI Search to process complex documents with charts and images, and uses **multi-modal models (gpt4o)** to interpret and understand these. With this custom skill, the RAG application can **draw insights from complex charts** and images during the vector search.
+- `./adi_function_app` contains code for linking **Azure Document Intelligence** with AI Search to process complex documents with charts and images, and uses **multi-modal models (gpt4o)** to interpret and understand these. With this custom skill, the RAG application can **draw insights from complex charts** and images during the vector search.
This function app also contains a **Semantic Text Chunking** method that intelligently groups similar sentences and keeps figures and tables intact, whilst separating unrelated content into distinct chunks.
 - `./deploy_ai_search` provides an easy Python based utility for deploying an index, indexer and corresponding skillset for AI Search and for Text2SQL.
 
 The above components have been successfully used on production RAG projects to increase the quality of responses.
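For reviewers who want to exercise the chunker from this patch series outside of an AI Search indexer, the sketch below shows one possible way to drive `SemanticTextChunker` directly. It is a minimal sketch rather than part of the patch: it assumes the `adi_function_app` requirements (`spacy`, the `en_core_web_md` model and `tiktoken`) are installed, that `semantic_text_chunker.py` is importable from the working directory, and that the sample Markdown is illustrative only.

```python
# Minimal local sketch (not part of the patch) for driving the semantic chunker directly.
# Assumes spacy, en_core_web_md and tiktoken are installed and that
# adi_function_app/semantic_text_chunker.py is on the import path.
import asyncio

from semantic_text_chunker import SemanticTextChunker


async def main() -> None:
    # The same tunables the function app reads from the custom skill HTTP headers.
    chunker = SemanticTextChunker(
        num_surrounding_sentences=1,
        similarity_threshold=0.8,
        max_chunk_tokens=500,
        min_chunk_tokens=50,
    )

    # Illustrative Markdown only: a heading, two related sentences and a small table.
    markdown = (
        "# Overview\n\n"
        "The first sentence introduces a topic. The second sentence expands on the same topic.\n\n"
        "<table><tr><td>1</td><td>2</td></tr></table> A sentence that follows the table."
    )

    chunks = await chunker.chunk(markdown)
    for position, chunk in enumerate(chunks):
        print(position, chunk)


if __name__ == "__main__":
    asyncio.run(main())
```

When the skill is called from an AI Search skillset instead, the same four parameters are supplied per request as HTTP headers on the `/semantic_text_chunker` endpoint, as configured by `get_text_split_skill` in `deploy_ai_search/ai_search.py`.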