Commit f23126f

Update tests

1 parent 1d7e2d2 commit f23126f

8 files changed: +690 additions, -78 deletions

image_processing/src/image_processing/layout_analysis.py

Lines changed: 33 additions & 1 deletion
@@ -22,6 +22,7 @@
     LayoutHolder,
     PageWiseContentHolder,
    NonPageWiseContentHolder,
+    PerPageStartingSentenceHolder,
 )

@@ -340,6 +341,32 @@ def create_page_wise_content(self) -> list[LayoutHolder]:

         return page_wise_contents

+    def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolder]:
+        """Create a list of the starting sentence of each page so we can assign the starting sentence to the page number.
+
+        Returns:
+        --------
+            list: A list of the starting sentence of each page."""
+
+        per_page_starting_sentences = []
+
+        for page in self.result.pages:
+            page_content = self.result.content[
+                page.spans[0]["offset"] : page.spans[0]["offset"]
+                + page.spans[0]["length"]
+            ]
+
+            starting_sentence = page_content.split(".")[0]
+
+            per_page_starting_sentences.append(
+                PerPageStartingSentenceHolder(
+                    page_number=page.page_number,
+                    starting_sentence=starting_sentence,
+                )
+            )
+
+        return per_page_starting_sentences
+
     async def get_document_intelligence_client(self) -> DocumentIntelligenceClient:
         """Get the Azure Document Intelligence client.

@@ -487,7 +514,12 @@ async def analyse(self):
             if self.extract_figures:
                 await self.process_figures_from_extracted_content(text_content)

-            output_record = NonPageWiseContentHolder(layout=text_content)
+            per_page_starting_sentences = self.create_per_page_starting_sentence()
+
+            output_record = NonPageWiseContentHolder(
+                layout=text_content,
+                per_page_starting_sentences=per_page_starting_sentences,
+            )

         except Exception as e:
             logging.error(e)
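
To illustrate the new helper (a sketch, not part of the commit; StubPage mirrors the DummyPage shape used in the tests below, and the constructor arguments follow those tests):

    # Hypothetical sketch: exercising create_per_page_starting_sentence
    # on a stubbed analysis result.
    from layout_analysis import LayoutAnalysis

    class StubPage:
        def __init__(self, offset, length, page_number):
            self.spans = [{"offset": offset, "length": length}]
            self.page_number = page_number

    class StubResult:
        pass

    la = LayoutAnalysis(record_id=1, source="dummy")
    result = StubResult()
    result.content = "HelloWorld. More text on page one."
    result.pages = [StubPage(0, 10, 1)]  # span covers "HelloWorld"
    la.result = result

    sentences = la.create_per_page_starting_sentence()
    assert sentences[0].page_number == 1
    assert sentences[0].starting_sentence == "HelloWorld"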

image_processing/src/image_processing/layout_holders.py

Lines changed: 21 additions & 0 deletions
@@ -47,7 +47,28 @@ class PageWiseContentHolder(BaseModel):
     page_wise_layout: list[LayoutHolder]


+class PerPageStartingSentenceHolder(BaseModel):
+    """A class to hold the starting sentence of each page."""
+
+    page_number: int
+    starting_sentence: str
+
+
 class NonPageWiseContentHolder(BaseModel):
     """A class to hold the non-page-wise content extracted from the document."""

     layout: LayoutHolder
+    per_page_starting_sentences: list[PerPageStartingSentenceHolder] = Field(
+        default_factory=list
+    )
+
+
+class ChunkHolder(BaseModel):
+    """A class to hold the text extracted from the document after it has been chunked."""
+
+    mark_up: str
+    sections: Optional[list[str]] = Field(default_factory=list)
+    figures: Optional[list[FigureHolder]] = Field(default_factory=list)
+    starting_sentence: Optional[str] = None
+    cleaned_text: Optional[str] = None
+    page_number: Optional[int] = Field(default=None)
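
A minimal sketch of the new models (assuming only the fields shown above; the optional ChunkHolder fields default to empty lists or None):

    # Hypothetical usage of the new Pydantic models.
    from layout_holders import (
        ChunkHolder,
        LayoutHolder,
        NonPageWiseContentHolder,
        PerPageStartingSentenceHolder,
    )

    chunk = ChunkHolder(mark_up="# Heading\nBody text.")
    assert chunk.sections == [] and chunk.page_number is None  # defaults

    holder = NonPageWiseContentHolder(
        layout=LayoutHolder(content="Full document"),
        per_page_starting_sentences=[
            PerPageStartingSentenceHolder(page_number=1, starting_sentence="Intro"),
        ],
    )
    assert holder.per_page_starting_sentences[0].page_number == 1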

image_processing/src/image_processing/mark_up_cleaner.py

Lines changed: 29 additions & 21 deletions
@@ -3,7 +3,7 @@
 import logging
 import json
 import regex as re
-from layout_holders import FigureHolder
+from layout_holders import FigureHolder, ChunkHolder


 class MarkUpCleaner:

@@ -76,7 +76,7 @@ def remove_markdown_tags(self, text: str, tag_patterns: dict) -> str:
         return text

     def clean_text_and_extract_metadata(
-        self, text: str, figures: list[FigureHolder]
+        self, chunk: ChunkHolder, figures: list[FigureHolder]
     ) -> tuple[str, str]:
         """This function performs following cleanup activities on the text, remove all unicode characters
         remove line spacing,remove stop words, normalize characters

@@ -88,26 +88,29 @@ def clean_text_and_extract_metadata(
         Returns:
             str: The clean text."""

-        return_record = {}
-
         try:
-            logging.info(f"Input text: {text}")
-            if len(text) == 0:
+            logging.info(f"Input text: {chunk.mark_up}")
+            if len(chunk.mark_up) == 0:
                 logging.error("Input text is empty")
                 raise ValueError("Input text is empty")

-            return_record["chunk_mark_up"] = text
-
-            figure_ids = self.get_figure_ids(text)
+            figure_ids = self.get_figure_ids(chunk.mark_up)

-            return_record["chunk_sections"] = self.get_sections(text)
-            return_record["chunk_figures"] = [
-                figure.model_dump(by_alias=True)
-                for figure in figures
-                if figure.figure_id in figure_ids
+            chunk.sections = self.get_sections(chunk.mark_up)
+            chunk.figures = [
+                figure for figure in figures if figure.figure_id in figure_ids
             ]

-            logging.info(f"Sections: {return_record['chunk_sections']}")
+            logging.info(f"Sections: {chunk.sections}")
+
+            # Check if the chunk contains only figure tags (plus whitespace).
+            figure_tag_pattern = (
+                r"<figure(?:\s+FigureId=(\"[^\"]*\"|'[^']*'))?>(.*?)</figure>"
+            )
+            text_without_figures = re.sub(figure_tag_pattern, "", chunk.mark_up).strip()
+            if not text_without_figures and chunk.figures:
+                # When no text outside of figure tags is present, set page_number from the first figure.
+                chunk.page_number = chunk.figures[0].page_number

             # Define specific patterns for each tag
             tag_patterns = {

@@ -117,7 +120,7 @@ def clean_text_and_extract_metadata(
                 "figcaption": r"<figcaption>(.*?)</figcaption>",
                 "header": r"^\s*(#{1,6})\s*(.*?)\s*$",
             }
-            cleaned_text = self.remove_markdown_tags(text, tag_patterns)
+            cleaned_text = self.remove_markdown_tags(chunk.mark_up, tag_patterns)

             logging.info(f"Removed markdown tags: {cleaned_text}")

@@ -130,11 +133,11 @@ def clean_text_and_extract_metadata(
                 logging.error("Cleaned text is empty")
                 raise ValueError("Cleaned text is empty")
             else:
-                return_record["chunk_cleaned"] = cleaned_text
+                chunk.cleaned_text = cleaned_text
         except Exception as e:
             logging.error(f"An error occurred in clean_text_and_extract_metadata: {e}")
-            return ""
-        return return_record
+            raise e
+        return chunk.model_dump(by_alias=True)

     async def clean(self, record: dict) -> dict:
         """Cleanup the data using standard python libraries.

@@ -159,12 +162,17 @@ async def clean(self, record: dict) -> dict:

             figures = [FigureHolder(**figure) for figure in record["data"]["figures"]]

+            chunk_holder = ChunkHolder(mark_up=record["data"]["mark_up"])
+
+            if "page_number" in record["data"]:
+                chunk_holder.page_number = record["data"]["page_number"]
+
             cleaned_record["data"] = self.clean_text_and_extract_metadata(
-                record["data"]["chunk"], figures
+                chunk_holder, figures
             )

         except Exception as e:
-            logging.error("string cleanup Error: %s", e)
+            logging.error("Cleanup Error: %s", e)
             return {
                 "recordId": record["recordId"],
                 "data": None,

image_processing/src/image_processing/semantic_text_chunker.py

Lines changed: 41 additions & 5 deletions
@@ -7,6 +7,7 @@
 import spacy
 import numpy as np
 from model2vec import StaticModel
+from layout_holders import PerPageStartingSentenceHolder, ChunkHolder


 class SemanticTextChunker:

@@ -75,7 +76,7 @@ def clean_chunks_and_map(self, chunks, is_table_or_figure_map):

         return cleaned_chunks, cleaned_is_table_or_figure_map

-    async def chunk(self, text: str) -> list[dict]:
+    async def chunk(self, text: str) -> list[ChunkHolder]:
         """Attempts to chunk the text by:
         Splitting into sentences
         Grouping sentences that contain figures and tables

@@ -128,7 +129,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]:
         for chunk in reversed_backwards_pass_chunks:
             stripped_chunk = chunk.strip()
             if len(stripped_chunk) > 0:
-                cleaned_final_chunks.append(stripped_chunk)
+                cleaned_final_chunks.append(ChunkHolder(mark_up=stripped_chunk))

         logging.info(f"Number of final chunks: {len(cleaned_final_chunks)}")
         logging.info(f"Chunks: {cleaned_final_chunks}")

@@ -491,6 +492,29 @@ def sentence_similarity(self, text_1, text_2):
             )
         return similarity

+    def assign_page_number_to_chunks(
+        self,
+        chunks: list[ChunkHolder],
+        per_page_starting_sentences: list[PerPageStartingSentenceHolder],
+    ) -> list[ChunkHolder]:
+        """Assigns page numbers to the chunks based on the starting sentences of each page.
+
+        Args:
+            chunks (list[ChunkHolder]): The list of chunks.
+            per_page_starting_sentences (list[PerPageStartingSentenceHolder]): The list of starting sentences of each page.
+
+        Returns:
+            list[ChunkHolder]: The list of chunks with page numbers assigned."""
+        page_number = 1
+        for chunk in chunks:
+            if per_page_starting_sentences:
+                for per_page_starting_sentence in per_page_starting_sentences:
+                    # Compare against the chunk's mark_up text.
+                    if per_page_starting_sentence.starting_sentence in chunk.mark_up:
+                        page_number = per_page_starting_sentence.page_number
+                        break
+            chunk.page_number = page_number
+        return chunks
+

 async def process_semantic_text_chunker(record: dict, text_chunker) -> dict:
     """Chunk the data.

@@ -514,9 +538,21 @@ async def process_semantic_text_chunker(record: dict, text_chunker) -> dict:
         }

         # scenarios when page by chunking is enabled
-        cleaned_record["data"]["chunks"] = await text_chunker.chunk(
-            record["data"]["content"]
-        )
+        chunks = await text_chunker.chunk(record["data"]["content"])
+
+        if "per_page_starting_sentences" in record["data"]:
+            per_page_starting_sentences = [
+                PerPageStartingSentenceHolder(**sentence)
+                for sentence in record["data"]["per_page_starting_sentences"]
+            ]
+
+            chunks = text_chunker.assign_page_number_to_chunks(
+                chunks, per_page_starting_sentences
+            )
+
+        cleaned_record["data"]["chunks"] = [
+            chunk.model_dump(by_alias=True) for chunk in chunks
+        ]

     except Exception as e:
         logging.error("Chunking Error: %s", e)
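
A sketch of the intended page-assignment behavior (assuming the models above; SemanticTextChunker constructor arguments are not shown in this diff, so defaults are assumed). A chunk that contains no page's starting sentence keeps the page number carried over from the previous chunk:

    # Hypothetical sketch of assign_page_number_to_chunks.
    from layout_holders import ChunkHolder, PerPageStartingSentenceHolder
    from semantic_text_chunker import SemanticTextChunker

    starts = [
        PerPageStartingSentenceHolder(page_number=1, starting_sentence="Page one"),
        PerPageStartingSentenceHolder(page_number=2, starting_sentence="Page two"),
    ]
    chunks = [
        ChunkHolder(mark_up="Page one. Opening paragraph."),
        ChunkHolder(mark_up="A middle chunk with no page boundary."),
        ChunkHolder(mark_up="Page two starts here."),
    ]

    chunker = SemanticTextChunker()  # constructor defaults assumed
    chunks = chunker.assign_page_number_to_chunks(chunks, starts)
    assert [c.page_number for c in chunks] == [1, 1, 2]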

image_processing/tests/image_processing/test_layout_analysis.py

Lines changed: 58 additions & 2 deletions
@@ -6,7 +6,6 @@
 import base64
 from unittest.mock import AsyncMock

-# Import the functions/classes under test.
 from layout_analysis import (
     process_layout_analysis,
     LayoutAnalysis,

@@ -147,7 +146,7 @@ async def test_analyse_non_page_wise_no_figures(monkeypatch, dummy_storage_helper):
     )
     # Patch analyse_document to simulate a successful ADI analysis.
     dummy_result = DummyResult(
-        content="Full document content", pages=[DummyPage(0, 21, None)], figures=[]
+        content="Full document content", pages=[DummyPage(0, 21, 1)], figures=[]
     )

     async def dummy_analyse_document(file_path):

@@ -435,3 +434,60 @@ class DummyResultContent:
     assert layout.content == "Hello"
     assert layout.page_number == 1
     assert layout.page_offsets == 0
+
+
+def test_create_per_page_starting_sentence():
+    # Create a LayoutAnalysis instance.
+    la = LayoutAnalysis(record_id=200, source="dummy")
+
+    # Create a dummy result with content and pages. The first page's span
+    # covers "HelloWorld" (offset 0, length 10), so the starting sentence
+    # extracted should be "HelloWorld".
+    class DummyResultContent:
+        pass
+
+    dummy_result = DummyResultContent()
+    dummy_result.content = "HelloWorld. This is a test sentence."
+    # DummyPage creates a page with spans as a list of dictionaries.
+    dummy_result.pages = [DummyPage(0, 10, 1)]
+    la.result = dummy_result
+
+    sentences = la.create_per_page_starting_sentence()
+    assert len(sentences) == 1
+    sentence = sentences[0]
+    assert sentence.page_number == 1
+    assert sentence.starting_sentence == "HelloWorld"
+
+
+def test_create_per_page_starting_sentence_multiple_pages():
+    # Create a LayoutAnalysis instance.
+    la = LayoutAnalysis(record_id=300, source="dummy")
+
+    # Create a dummy result with content spanning two pages.
+    # Use DummyPage to simulate pages; DummyPage expects "spans" as a list of dicts.
+    class DummyResultContent:
+        pass
+
+    dummy_result = DummyResultContent()
+    # Define content as two parts:
+    # Page 1: offset 0, length 9 gives "Page one." (starting sentence "Page one")
+    # Page 2: offset 9, length 78 gives the rest of the content
+    #         (starting sentence "Page two text and more content")
+    dummy_result.content = "Page one.Page two text and more content. This is more random content that is on page 2."
+    dummy_result.pages = [
+        DummyPage(0, 9, 1),  # "Page one." (9 characters: indices 0-8)
+        DummyPage(9, 78, 2),  # the remainder of the content (indices 9-86)
+    ]
+    la.result = dummy_result
+
+    # Call create_per_page_starting_sentence and check results.
+    sentences = la.create_per_page_starting_sentence()
+    assert len(sentences) == 2
+
+    # For page 1, the substring is "Page one." -> split on "." gives "Page one"
+    assert sentences[0].page_number == 1
+    assert sentences[0].starting_sentence == "Page one"
+
+    # For page 2, the substring starts at "Page two" -> split on "." gives
+    # "Page two text and more content"
+    assert sentences[1].page_number == 2
+    # We strip potential leading/trailing spaces for validation.
+    assert sentences[1].starting_sentence.strip() == "Page two text and more content"

image_processing/tests/image_processing/test_layout_holders.py

Lines changed: 40 additions & 0 deletions
@@ -7,6 +7,8 @@
     LayoutHolder,
     PageWiseContentHolder,
     NonPageWiseContentHolder,
+    ChunkHolder,
+    PerPageStartingSentenceHolder,
 )


@@ -65,3 +67,41 @@ def test_non_page_wise_content_holder():
     layout = LayoutHolder(content="Full document")
     non_page_holder = NonPageWiseContentHolder(layout=layout)
     assert non_page_holder.layout.content == "Full document"
+
+
+def test_chunk_holder_creation():
+    chunk = ChunkHolder(
+        mark_up="Sample markup",
+        sections=["Section1", "Section2"],
+        figures=[],
+        starting_sentence="First sentence",
+        cleaned_text="Cleaned text content",
+        page_number=1,
+    )
+    assert chunk.mark_up == "Sample markup"
+    assert chunk.sections == ["Section1", "Section2"]
+    assert chunk.starting_sentence == "First sentence"
+    assert chunk.cleaned_text == "Cleaned text content"
+    assert chunk.page_number == 1
+
+
+def test_per_page_starting_sentence_holder_creation():
+    sentence = PerPageStartingSentenceHolder(
+        page_number=1, starting_sentence="This is the starting sentence."
+    )
+    assert sentence.page_number == 1
+    assert sentence.starting_sentence == "This is the starting sentence."
+
+
+def test_non_page_wise_content_holder_with_sentences():
+    layout = LayoutHolder(content="Full document")
+    sentences = [
+        PerPageStartingSentenceHolder(page_number=1, starting_sentence="Start 1"),
+        PerPageStartingSentenceHolder(page_number=2, starting_sentence="Start 2"),
+    ]
+    non_page_holder = NonPageWiseContentHolder(
+        layout=layout, per_page_starting_sentences=sentences
+    )
+    assert non_page_holder.layout.content == "Full document"
+    assert len(non_page_holder.per_page_starting_sentences) == 2
+    assert non_page_holder.per_page_starting_sentences[0].starting_sentence == "Start 1"