Commit f23126f

Update tests

1 parent 1d7e2d2 commit f23126f

8 files changed: +690 additions, -78 deletions

image_processing/src/image_processing/layout_analysis.py

Lines changed: 33 additions & 1 deletion
@@ -22,6 +22,7 @@
     LayoutHolder,
     PageWiseContentHolder,
    NonPageWiseContentHolder,
+    PerPageStartingSentenceHolder,
 )

@@ -340,6 +341,32 @@ def create_page_wise_content(self) -> list[LayoutHolder]:

         return page_wise_contents

+    def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolder]:
+        """Create a list of the starting sentence of each page so we can assign the starting sentence to the page number.
+
+        Returns:
+        --------
+            list: A list of the starting sentence of each page."""
+
+        per_page_starting_sentences = []
+
+        for page in self.result.pages:
+            page_content = self.result.content[
+                page.spans[0]["offset"] : page.spans[0]["offset"]
+                + page.spans[0]["length"]
+            ]
+
+            starting_sentence = page_content.split(".")[0]
+
+            per_page_starting_sentences.append(
+                PerPageStartingSentenceHolder(
+                    page_number=page.page_number,
+                    starting_sentence=starting_sentence,
+                )
+            )
+
+        return per_page_starting_sentences
+
     async def get_document_intelligence_client(self) -> DocumentIntelligenceClient:
         """Get the Azure Document Intelligence client.

@@ -487,7 +514,12 @@ async def analyse(self):
             if self.extract_figures:
                 await self.process_figures_from_extracted_content(text_content)

-            output_record = NonPageWiseContentHolder(layout=text_content)
+            per_page_starting_sentences = self.create_per_page_starting_sentence()
+
+            output_record = NonPageWiseContentHolder(
+                layout=text_content,
+                per_page_starting_sentences=per_page_starting_sentences,
+            )

         except Exception as e:
             logging.error(e)
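
To illustrate the new helper (a sketch, not part of the commit; StubPage mirrors the DummyPage shape used in the tests below, and the constructor arguments follow those tests):

    # Hypothetical sketch: exercising create_per_page_starting_sentence
    # on a stubbed analysis result.
    from layout_analysis import LayoutAnalysis

    class StubPage:
        def __init__(self, offset, length, page_number):
            self.spans = [{"offset": offset, "length": length}]
            self.page_number = page_number

    class StubResult:
        pass

    la = LayoutAnalysis(record_id=1, source="dummy")
    result = StubResult()
    result.content = "HelloWorld. More text on page one."
    result.pages = [StubPage(0, 10, 1)]  # span covers "HelloWorld"
    la.result = result

    sentences = la.create_per_page_starting_sentence()
    assert sentences[0].page_number == 1
    assert sentences[0].starting_sentence == "HelloWorld"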

image_processing/src/image_processing/layout_holders.py

Lines changed: 21 additions & 0 deletions
@@ -47,7 +47,28 @@ class PageWiseContentHolder(BaseModel):
     page_wise_layout: list[LayoutHolder]


+class PerPageStartingSentenceHolder(BaseModel):
+    """A class to hold the starting sentence of each page."""
+
+    page_number: int
+    starting_sentence: str
+
+
 class NonPageWiseContentHolder(BaseModel):
     """A class to hold the non-page-wise content extracted from the document."""

     layout: LayoutHolder
+    per_page_starting_sentences: list[PerPageStartingSentenceHolder] = Field(
+        default_factory=list
+    )
+
+
+class ChunkHolder(BaseModel):
+    """A class to hold the text extracted from the document after it has been chunked."""
+
+    mark_up: str
+    sections: Optional[list[str]] = Field(default_factory=list)
+    figures: Optional[list[FigureHolder]] = Field(default_factory=list)
+    starting_sentence: Optional[str] = None
+    cleaned_text: Optional[str] = None
+    page_number: Optional[int] = Field(default=None)
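
A minimal sketch of the new models (assuming only the fields shown above; the optional ChunkHolder fields default to empty lists or None):

    # Hypothetical usage of the new Pydantic models.
    from layout_holders import (
        ChunkHolder,
        LayoutHolder,
        NonPageWiseContentHolder,
        PerPageStartingSentenceHolder,
    )

    chunk = ChunkHolder(mark_up="# Heading\nBody text.")
    assert chunk.sections == [] and chunk.page_number is None  # defaults

    holder = NonPageWiseContentHolder(
        layout=LayoutHolder(content="Full document"),
        per_page_starting_sentences=[
            PerPageStartingSentenceHolder(page_number=1, starting_sentence="Intro"),
        ],
    )
    assert holder.per_page_starting_sentences[0].page_number == 1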

image_processing/src/image_processing/mark_up_cleaner.py

Lines changed: 29 additions & 21 deletions
@@ -3,7 +3,7 @@
 import logging
 import json
 import regex as re
-from layout_holders import FigureHolder
+from layout_holders import FigureHolder, ChunkHolder


 class MarkUpCleaner:

@@ -76,7 +76,7 @@ def remove_markdown_tags(self, text: str, tag_patterns: dict) -> str:
         return text

     def clean_text_and_extract_metadata(
-        self, text: str, figures: list[FigureHolder]
+        self, chunk: ChunkHolder, figures: list[FigureHolder]
     ) -> tuple[str, str]:
         """This function performs following cleanup activities on the text, remove all unicode characters
         remove line spacing,remove stop words, normalize characters

@@ -88,26 +88,29 @@ def clean_text_and_extract_metadata(
         Returns:
             str: The clean text."""

-        return_record = {}
-
         try:
-            logging.info(f"Input text: {text}")
-            if len(text) == 0:
+            logging.info(f"Input text: {chunk.mark_up}")
+            if len(chunk.mark_up) == 0:
                 logging.error("Input text is empty")
                 raise ValueError("Input text is empty")

-            return_record["chunk_mark_up"] = text
-
-            figure_ids = self.get_figure_ids(text)
+            figure_ids = self.get_figure_ids(chunk.mark_up)

-            return_record["chunk_sections"] = self.get_sections(text)
-            return_record["chunk_figures"] = [
-                figure.model_dump(by_alias=True)
-                for figure in figures
-                if figure.figure_id in figure_ids
+            chunk.sections = self.get_sections(chunk.mark_up)
+            chunk.figures = [
+                figure for figure in figures if figure.figure_id in figure_ids
             ]

-            logging.info(f"Sections: {return_record['chunk_sections']}")
+            logging.info(f"Sections: {chunk.sections}")
+
+            # Check if the chunk contains only figure tags (plus whitespace).
+            figure_tag_pattern = (
+                r"<figure(?:\s+FigureId=(\"[^\"]*\"|'[^']*'))?>(.*?)</figure>"
+            )
+            text_without_figures = re.sub(figure_tag_pattern, "", chunk.mark_up).strip()
+            if not text_without_figures and chunk.figures:
+                # When no text outside of figure tags is present, set page_number from the first figure.
+                chunk.page_number = chunk.figures[0].page_number

             # Define specific patterns for each tag
             tag_patterns = {

@@ -117,7 +120,7 @@ def clean_text_and_extract_metadata(
                 "figcaption": r"<figcaption>(.*?)</figcaption>",
                 "header": r"^\s*(#{1,6})\s*(.*?)\s*$",
             }
-            cleaned_text = self.remove_markdown_tags(text, tag_patterns)
+            cleaned_text = self.remove_markdown_tags(chunk.mark_up, tag_patterns)

             logging.info(f"Removed markdown tags: {cleaned_text}")

@@ -130,11 +133,11 @@ def clean_text_and_extract_metadata(
                 logging.error("Cleaned text is empty")
                 raise ValueError("Cleaned text is empty")
             else:
-                return_record["chunk_cleaned"] = cleaned_text
+                chunk.cleaned_text = cleaned_text
         except Exception as e:
             logging.error(f"An error occurred in clean_text_and_extract_metadata: {e}")
-            return ""
-        return return_record
+            raise e
+        return chunk.model_dump(by_alias=True)

     async def clean(self, record: dict) -> dict:
         """Cleanup the data using standard python libraries.

@@ -159,12 +162,17 @@ async def clean(self, record: dict) -> dict:

             figures = [FigureHolder(**figure) for figure in record["data"]["figures"]]

+            chunk_holder = ChunkHolder(mark_up=record["data"]["mark_up"])
+
+            if "page_number" in record["data"]:
+                chunk_holder.page_number = record["data"]["page_number"]
+
             cleaned_record["data"] = self.clean_text_and_extract_metadata(
-                record["data"]["chunk"], figures
+                chunk_holder, figures
             )

         except Exception as e:
-            logging.error("string cleanup Error: %s", e)
+            logging.error("Cleanup Error: %s", e)
             return {
                 "recordId": record["recordId"],
                 "data": None,

image_processing/src/image_processing/semantic_text_chunker.py

Lines changed: 41 additions & 5 deletions
@@ -7,6 +7,7 @@
 import spacy
 import numpy as np
 from model2vec import StaticModel
+from layout_holders import PerPageStartingSentenceHolder, ChunkHolder


 class SemanticTextChunker:

@@ -75,7 +76,7 @@ def clean_chunks_and_map(self, chunks, is_table_or_figure_map):

         return cleaned_chunks, cleaned_is_table_or_figure_map

-    async def chunk(self, text: str) -> list[dict]:
+    async def chunk(self, text: str) -> list[ChunkHolder]:
         """Attempts to chunk the text by:
         Splitting into sentences
         Grouping sentences that contain figures and tables

@@ -128,7 +129,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]:
         for chunk in reversed_backwards_pass_chunks:
             stripped_chunk = chunk.strip()
             if len(stripped_chunk) > 0:
-                cleaned_final_chunks.append(stripped_chunk)
+                cleaned_final_chunks.append(ChunkHolder(mark_up=stripped_chunk))

         logging.info(f"Number of final chunks: {len(cleaned_final_chunks)}")
         logging.info(f"Chunks: {cleaned_final_chunks}")

@@ -491,6 +492,29 @@ def sentence_similarity(self, text_1, text_2):
             )
         return similarity

+    def assign_page_number_to_chunks(
+        self,
+        chunks: list[ChunkHolder],
+        per_page_starting_sentences: list[PerPageStartingSentenceHolder],
+    ) -> list[ChunkHolder]:
+        """Assigns page numbers to the chunks based on the starting sentences of each page.
+
+        Args:
+            chunks (list[ChunkHolder]): The list of chunks.
+            per_page_starting_sentences (list[PerPageStartingSentenceHolder]): The list of starting sentences of each page.
+
+        Returns:
+            list[ChunkHolder]: The list of chunks with page numbers assigned."""
+        page_number = 1
+        for chunk in chunks:
+            if per_page_starting_sentences:
+                for per_page_starting_sentence in per_page_starting_sentences:
+                    # Compare against the chunk's mark_up text.
+                    if per_page_starting_sentence.starting_sentence in chunk.mark_up:
+                        page_number = per_page_starting_sentence.page_number
+                        break
+            chunk.page_number = page_number
+        return chunks
+

 async def process_semantic_text_chunker(record: dict, text_chunker) -> dict:
     """Chunk the data.

@@ -514,9 +538,21 @@ async def process_semantic_text_chunker(record: dict, text_chunker) -> dict:
         }

         # scenarios when page by chunking is enabled
-        cleaned_record["data"]["chunks"] = await text_chunker.chunk(
-            record["data"]["content"]
-        )
+        chunks = await text_chunker.chunk(record["data"]["content"])
+
+        if "per_page_starting_sentences" in record["data"]:
+            per_page_starting_sentences = [
+                PerPageStartingSentenceHolder(**sentence)
+                for sentence in record["data"]["per_page_starting_sentences"]
+            ]
+
+            chunks = text_chunker.assign_page_number_to_chunks(
+                chunks, per_page_starting_sentences
+            )
+
+        cleaned_record["data"]["chunks"] = [
+            chunk.model_dump(by_alias=True) for chunk in chunks
+        ]

     except Exception as e:
         logging.error("Chunking Error: %s", e)
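
A sketch of the intended page-assignment behavior (assuming the models above; SemanticTextChunker constructor arguments are not shown in this diff, so defaults are assumed). A chunk that contains no page's starting sentence keeps the page number carried over from the previous chunk:

    # Hypothetical sketch of assign_page_number_to_chunks.
    from layout_holders import ChunkHolder, PerPageStartingSentenceHolder
    from semantic_text_chunker import SemanticTextChunker

    starts = [
        PerPageStartingSentenceHolder(page_number=1, starting_sentence="Page one"),
        PerPageStartingSentenceHolder(page_number=2, starting_sentence="Page two"),
    ]
    chunks = [
        ChunkHolder(mark_up="Page one. Opening paragraph."),
        ChunkHolder(mark_up="A middle chunk with no page boundary."),
        ChunkHolder(mark_up="Page two starts here."),
    ]

    chunker = SemanticTextChunker()  # constructor defaults assumed
    chunks = chunker.assign_page_number_to_chunks(chunks, starts)
    assert [c.page_number for c in chunks] == [1, 1, 2]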

image_processing/tests/image_processing/test_layout_analysis.py

Lines changed: 58 additions & 2 deletions
@@ -6,7 +6,6 @@
 import base64
 from unittest.mock import AsyncMock

-# Import the functions/classes under test.
 from layout_analysis import (
     process_layout_analysis,
     LayoutAnalysis,

@@ -147,7 +146,7 @@ async def test_analyse_non_page_wise_no_figures(monkeypatch, dummy_storage_helper):
     )
     # Patch analyse_document to simulate a successful ADI analysis.
     dummy_result = DummyResult(
-        content="Full document content", pages=[DummyPage(0, 21, None)], figures=[]
+        content="Full document content", pages=[DummyPage(0, 21, 1)], figures=[]
     )

     async def dummy_analyse_document(file_path):

@@ -435,3 +434,60 @@ class DummyResultContent:
     assert layout.content == "Hello"
     assert layout.page_number == 1
     assert layout.page_offsets == 0
+
+
+def test_create_per_page_starting_sentence():
+    # Create a LayoutAnalysis instance.
+    la = LayoutAnalysis(record_id=200, source="dummy")
+
+    # Create a dummy result with content and pages. The first page's span
+    # covers "HelloWorld" (offset 0, length 10), so the starting sentence
+    # extracted should be "HelloWorld".
+    class DummyResultContent:
+        pass
+
+    dummy_result = DummyResultContent()
+    dummy_result.content = "HelloWorld. This is a test sentence."
+    # DummyPage creates a page with spans as a list of dictionaries.
+    dummy_result.pages = [DummyPage(0, 10, 1)]
+    la.result = dummy_result
+
+    sentences = la.create_per_page_starting_sentence()
+    assert len(sentences) == 1
+    sentence = sentences[0]
+    assert sentence.page_number == 1
+    assert sentence.starting_sentence == "HelloWorld"
+
+
+def test_create_per_page_starting_sentence_multiple_pages():
+    # Create a LayoutAnalysis instance.
+    la = LayoutAnalysis(record_id=300, source="dummy")
+
+    # Create a dummy result with content spanning two pages.
+    # Use DummyPage to simulate pages; DummyPage expects "spans" as a list of dicts.
+    class DummyResultContent:
+        pass
+
+    dummy_result = DummyResultContent()
+    # Define content as two parts:
+    # Page 1: offset 0, length 9 gives "Page one." (starting sentence "Page one")
+    # Page 2: offset 9, length 78 gives the rest of the content
+    #         (starting sentence "Page two text and more content")
+    dummy_result.content = "Page one.Page two text and more content. This is more random content that is on page 2."
+    dummy_result.pages = [
+        DummyPage(0, 9, 1),  # "Page one." (9 characters: indices 0-8)
+        DummyPage(9, 78, 2),  # the remainder of the content (indices 9-86)
+    ]
+    la.result = dummy_result
+
+    # Call create_per_page_starting_sentence and check results.
+    sentences = la.create_per_page_starting_sentence()
+    assert len(sentences) == 2
+
+    # For page 1, the substring is "Page one." -> split on "." gives "Page one"
+    assert sentences[0].page_number == 1
+    assert sentences[0].starting_sentence == "Page one"
+
+    # For page 2, the substring starts at "Page two" -> split on "." gives
+    # "Page two text and more content"
+    assert sentences[1].page_number == 2
+    # We strip potential leading/trailing spaces for validation.
+    assert sentences[1].starting_sentence.strip() == "Page two text and more content"

image_processing/tests/image_processing/test_layout_holders.py

Lines changed: 40 additions & 0 deletions
@@ -7,6 +7,8 @@
     LayoutHolder,
     PageWiseContentHolder,
     NonPageWiseContentHolder,
+    ChunkHolder,
+    PerPageStartingSentenceHolder,
 )


@@ -65,3 +67,41 @@ def test_non_page_wise_content_holder():
     layout = LayoutHolder(content="Full document")
     non_page_holder = NonPageWiseContentHolder(layout=layout)
     assert non_page_holder.layout.content == "Full document"
+
+
+def test_chunk_holder_creation():
+    chunk = ChunkHolder(
+        mark_up="Sample markup",
+        sections=["Section1", "Section2"],
+        figures=[],
+        starting_sentence="First sentence",
+        cleaned_text="Cleaned text content",
+        page_number=1,
+    )
+    assert chunk.mark_up == "Sample markup"
+    assert chunk.sections == ["Section1", "Section2"]
+    assert chunk.starting_sentence == "First sentence"
+    assert chunk.cleaned_text == "Cleaned text content"
+    assert chunk.page_number == 1
+
+
+def test_per_page_starting_sentence_holder_creation():
+    sentence = PerPageStartingSentenceHolder(
+        page_number=1, starting_sentence="This is the starting sentence."
+    )
+    assert sentence.page_number == 1
+    assert sentence.starting_sentence == "This is the starting sentence."
+
+
+def test_non_page_wise_content_holder_with_sentences():
+    layout = LayoutHolder(content="Full document")
+    sentences = [
+        PerPageStartingSentenceHolder(page_number=1, starting_sentence="Start 1"),
+        PerPageStartingSentenceHolder(page_number=2, starting_sentence="Start 2"),
+    ]
+    non_page_holder = NonPageWiseContentHolder(
+        layout=layout, per_page_starting_sentences=sentences
+    )
+    assert non_page_holder.layout.content == "Full document"
+    assert len(non_page_holder.per_page_starting_sentences) == 2
+    assert non_page_holder.per_page_starting_sentences[0].starting_sentence == "Start 1"