
Commit f941161

Update

1 parent 9102553 commit f941161

3 files changed (+154 additions, -116 deletions)

image_processing/src/image_processing/requirements.txt

Lines changed: 6 additions & 6 deletions
@@ -1,7 +1,7 @@
 # This file was autogenerated by uv via the following command:
 #    uv export --frozen --no-hashes --no-editable --no-sources --no-group dev --directory image_processing -o src/image_processing/requirements.txt
 aiohappyeyeballs==2.4.4
-aiohttp==3.11.11
+aiohttp==3.11.12
 aiosignal==1.3.2
 annotated-types==0.7.0
 anyio==4.8.0
@@ -16,7 +16,7 @@ azure-identity==1.19.0
 azure-search==1.0.0b2
 azure-search-documents==11.6.0b8
 azure-storage-blob==12.24.1
-beautifulsoup4==4.12.3
+beautifulsoup4==4.13.3
 blis==0.7.11
 bs4==0.0.2
 catalogue==2.0.10
@@ -34,7 +34,7 @@ en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_
 et-xmlfile==2.0.0
 filelock==3.17.0
 frozenlist==1.5.0
-fsspec==2024.12.0
+fsspec==2025.2.0
 h11==0.14.0
 httpcore==1.0.7
 httpx==0.28.1
@@ -50,15 +50,15 @@ marisa-trie==1.2.1
 markdown-it-py==3.0.0
 markupsafe==3.0.2
 mdurl==0.1.2
-model2vec==0.3.8
+model2vec==0.3.9
 msal==1.31.1
 msal-extensions==1.2.0
 msrest==0.7.1
 multidict==6.1.0
 murmurhash==1.0.12
 numpy==1.26.4
 oauthlib==3.2.2
-openai==1.60.2
+openai==1.61.1
 openpyxl==3.1.5
 packaging==24.2
 pandas==2.2.3
@@ -71,7 +71,7 @@ pydantic==2.10.6
 pydantic-core==2.27.2
 pygments==2.19.1
 pyjwt==2.10.1
-pymupdf==1.25.2
+pymupdf==1.25.3
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
 pytz==2025.1
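These six changes are routine dependency bumps (aiohttp, beautifulsoup4, fsspec, model2vec, openai, and pymupdf). As the header comment notes, this file is autogenerated, so it is regenerated with the uv export command shown above rather than edited by hand.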

image_processing/src/image_processing/semantic_text_chunker.py

Lines changed: 48 additions & 15 deletions
@@ -59,6 +59,22 @@ def num_tokens_from_string(self, string: str) -> int:
 
         return len(encoding.encode(string))
 
+    def clean_chunks_and_map(self, chunks, is_table_or_figure_map):
+        cleaned_chunks = []
+        cleaned_is_table_or_figure_map = []
+
+        for current_chunk, is_table_or_figure in zip(chunks, is_table_or_figure_map):
+            cleaned_chunk = current_chunk.strip()
+            if len(cleaned_chunk) > 0:
+                # Pad markdown headings with blank lines (the chunk was a title)
+                if self.is_markdown_heading(current_chunk):
+                    cleaned_chunk = "\n\n" + cleaned_chunk + "\n\n"
+
+                cleaned_chunks.append(cleaned_chunk)
+                cleaned_is_table_or_figure_map.append(is_table_or_figure)
+
+        return cleaned_chunks, cleaned_is_table_or_figure_map
+
     async def chunk(self, text: str) -> list[dict]:
         """Attempts to chunk the text by:
             Splitting into sentences
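The new helper drops empty chunks and pads markdown headings with blank lines so they stay separated after later joins, while keeping the table/figure map aligned with the surviving chunks. A minimal standalone sketch of the same behavior (a hypothetical free function; is_markdown_heading is approximated here with a leading-# regex, and it matches on the stripped chunk rather than the original):

import re

def clean_chunks_and_map(chunks, is_table_or_figure_map):
    cleaned_chunks = []
    cleaned_map = []
    for chunk, is_table_or_figure in zip(chunks, is_table_or_figure_map):
        cleaned = chunk.strip()
        if cleaned:
            # Approximation of is_markdown_heading: one or more '#' then a space
            if re.match(r"#+\s", cleaned):
                cleaned = "\n\n" + cleaned + "\n\n"
            cleaned_chunks.append(cleaned)
            cleaned_map.append(is_table_or_figure)
    return cleaned_chunks, cleaned_map

chunks, flags = clean_chunks_and_map(["  ## Title  ", "   ", "Body text."],
                                     [False, False, False])
# chunks == ["\n\n## Title\n\n", "Body text."], flags == [False, False]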
@@ -86,6 +102,10 @@ async def chunk(self, text: str) -> list[dict]:
             grouped_sentences, is_table_or_figure_map
         )
 
+        forward_pass_chunks, new_is_table_or_figure_map = self.clean_chunks_and_map(
+            forward_pass_chunks, new_is_table_or_figure_map
+        )
+
         logging.info(
             f"""Number of Forward pass chunks: {
                 len(forward_pass_chunks)}"""
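With this call in place, the forward-pass output is normalized before anything downstream sees it, so the chunk count logged immediately afterwards reflects only non-empty, heading-padded chunks.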
@@ -129,7 +149,7 @@ def filter_empty_figures(self, text):
 
     def clean_new_lines(self, text):
         # Remove single newlines surrounded by < and >
-        cleaned_text = re.sub(r"(?<=>)(\n)(?=<)", "", text)
+        cleaned_text = re.sub(r"(?<=>)(\n)(?=<)", "", text.strip())
 
         # Replace all other single newlines with space
         cleaned_text = re.sub(r"(?<!\n)\n(?!\n)", " ", cleaned_text)
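For reference, the first substitution deletes newlines that sit directly between a closing '>' and an opening '<', and the added strip() keeps leading and trailing whitespace from reaching the later substitutions. A quick illustration of the two patterns together:

import re

text = "\n<table>\n<tr>\nvalue\n</tr>\n</table>\n"
cleaned = re.sub(r"(?<=>)(\n)(?=<)", "", text.strip())
# "<table><tr>\nvalue\n</tr></table>" -- newlines between tags removed
cleaned = re.sub(r"(?<!\n)\n(?!\n)", " ", cleaned)
# "<table><tr> value </tr></table>" -- remaining single newlines become spaces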
@@ -190,7 +210,7 @@ def split_into_sentences(self, text: str) -> list[str]:
                 self.is_markdown_heading(part)
                 and part.endswith("\n\n") is False
             ):
-                part = part + "\n\n"
+                part = "\n\n" + part + "\n\n"
 
             heading_split_sentences.append(part)
 
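Padding with a leading blank line as well as a trailing one matches the treatment in clean_chunks_and_map above: when the parts are rejoined, a heading stays detached from the sentence before it, not just the one after it.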

@@ -300,23 +320,36 @@ def retrive_current_chunk_at_n(n):
             else:
                 return current_chunk[n]
 
-        current_chunk_tokens = self.num_tokens_from_string(" ".join(current_chunk))
+        def get_current_chunk_tokens(chunk_segments):
+            return self.num_tokens_from_string(" ".join(chunk_segments))
+
+        current_chunk_tokens = get_current_chunk_tokens(current_chunk)
 
         if len(current_chunk) >= 2 and current_chunk_tokens >= self.min_chunk_tokens:
-            logging.info("Comparing chunks")
-            cosine_sim = self.sentence_similarity(
-                retrieve_current_chunks_from_n(-2), current_sentence
-            )
+            # Calculate the tokens if we were to split
+            if len(current_chunk) > 2:
+                would_be_new_chunk = retrieve_current_chunk_up_to_n(1)
+                would_be_current_chunk = [retrive_current_chunk_at_n(-1)]
+            else:
+                would_be_new_chunk = retrive_current_chunk_at_n(0)
+                would_be_current_chunk = [retrive_current_chunk_at_n(1)]
+
             if (
-                cosine_sim < self.similarity_threshold
-                or current_chunk_tokens >= self.max_chunk_tokens
+                get_current_chunk_tokens(would_be_new_chunk) >= self.min_chunk_tokens
+                and get_current_chunk_tokens(would_be_current_chunk)
+                >= self.min_chunk_tokens
             ):
-                if len(current_chunk) > 2:
-                    new_chunk = retrieve_current_chunk_up_to_n(1)
-                    current_chunk = [retrive_current_chunk_at_n(-1)]
-                else:
-                    new_chunk = retrive_current_chunk_at_n(0)
-                    current_chunk = [retrive_current_chunk_at_n(1)]
+                logging.info("Comparing chunks")
+                if (
+                    current_chunk_tokens >= self.max_chunk_tokens
+                    or self.sentence_similarity(
+                        retrieve_current_chunks_from_n(-2), current_sentence
+                    )
+                    < self.similarity_threshold
+                ):
+                    return would_be_new_chunk, would_be_current_chunk
+            else:
+                logging.info("Chunk too small to compare")
         else:
             logging.info("Chunk too small to compare")
 
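The net effect is a reordering of the split decision: the would-be chunks are computed first, the cheap token-size guard runs before the similarity comparison, and because the similarity call now sits inside the short-circuiting "or", it is skipped entirely when the chunk is already over max_chunk_tokens. A condensed sketch of the new control flow, with hypothetical names and the token counts and similarity passed in directly rather than computed lazily:

def should_split(new_chunk_tokens: int, remainder_tokens: int,
                 current_chunk_tokens: int, cosine_sim: float,
                 min_tokens: int, max_tokens: int, threshold: float) -> bool:
    # Size guard first: only split if both would-be chunks are big enough
    if new_chunk_tokens >= min_tokens and remainder_tokens >= min_tokens:
        # Then split if the chunk is already too large, or the incoming
        # sentence is semantically dissimilar from the recent context
        return current_chunk_tokens >= max_tokens or cosine_sim < threshold
    return False

In the old code the sentence_similarity call ran unconditionally before any size check; the new ordering avoids that work for chunks that would produce an undersized split anyway.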
