Skip to content

Add some initial tests #163

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Feb 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/workflows/ci-checks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,29 @@ jobs:

- name: Run pre-commit
run: uv run pre-commit run --all-files

# Unit-test job for the image_processing sub-project: installs the project
# with uv and runs pytest with coverage (configured via .coveragerc).
job-image-processing-unit-tests:
  name: Image Processing Unit Tests
  runs-on: ubuntu-latest

  steps:
    - name: Checkout code
      # v4 is the current Node 20-based release; v3 (Node 16) is deprecated
      # on GitHub-hosted runners.
      uses: actions/checkout@v4

    - name: Set up Python
      # v5 is the current release of setup-python; v3 is deprecated.
      uses: actions/setup-python@v5
      with:
        # MIN_PYTHON_VERSION is expected to be defined at workflow level
        # (env:) — confirm it is set in this workflow file.
        python-version: ${{ env.MIN_PYTHON_VERSION }}

    - name: Install uv
      uses: astral-sh/setup-uv@v4
      with:
        # Cache uv's downloaded wheels between workflow runs.
        enable-cache: true

    - name: Install the project
      run: uv sync
      working-directory: image_processing

    - name: Run PyTest
      # Coverage measured over the whole package, settings in .coveragerc.
      run: uv run pytest --cov=. --cov-config=.coveragerc
      working-directory: image_processing
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ repos:

# Python checks
- id: name-tests-test
args: [--pytest-test-first]

# JSON files
- id: pretty-format-json
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,11 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
mark_up_cleaner_context = "/document/page_wise_layout/*"
inputs = [
InputFieldMappingEntry(
name="chunk", source="/document/page_wise_layout/*/merged_content"
name="mark_up", source="/document/page_wise_layout/*/merged_content"
),
InputFieldMappingEntry(
name="page_number",
source="/document/page_wise_layout/*/page_number",
),
InputFieldMappingEntry(
name="figures",
Expand All @@ -230,20 +234,26 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
mark_up_cleaner_context = "/document/chunk_mark_ups/*"
inputs = [
InputFieldMappingEntry(
name="chunk", source="/document/chunk_mark_ups/*"
name="mark_up", source="/document/chunk_mark_ups/*/mark_up"
),
InputFieldMappingEntry(
name="page_number", source="/document/chunk_mark_ups/*/page_number"
),
InputFieldMappingEntry(
name="figures", source="/document/layout/figures/*/updated_figure"
),
]

mark_up_cleaner_skill_outputs = [
OutputFieldMappingEntry(name="chunk_cleaned", target_name="chunk_cleaned"),
OutputFieldMappingEntry(
name="chunk_sections", target_name="chunk_sections"
name="cleaned_text", target_name="final_cleaned_text"
),
OutputFieldMappingEntry(name="sections", target_name="final_sections"),
OutputFieldMappingEntry(name="mark_up", target_name="final_mark_up"),
OutputFieldMappingEntry(name="figures", target_name="final_chunk_figures"),
OutputFieldMappingEntry(
name="page_number", target_name="final_page_number"
),
OutputFieldMappingEntry(name="chunk_mark_up", target_name="chunk_mark_up"),
OutputFieldMappingEntry(name="chunk_figures", target_name="chunk_figures"),
]

mark_up_cleaner_skill = WebApiSkill(
Expand Down Expand Up @@ -302,7 +312,11 @@ def get_semantic_chunker_skill(
semantic_text_chunker_skill_inputs = [
InputFieldMappingEntry(
name="content", source="/document/layout_merged_content"
)
),
InputFieldMappingEntry(
name="per_page_starting_sentences",
source="/document/per_page_starting_sentences",
),
]

semantic_text_chunker_skill_outputs = [
Expand Down Expand Up @@ -368,7 +382,13 @@ def get_layout_analysis_skill(
)
]
else:
output = [OutputFieldMappingEntry(name="layout", target_name="layout")]
output = [
OutputFieldMappingEntry(name="layout", target_name="layout"),
OutputFieldMappingEntry(
name="per_page_starting_sentences",
target_name="per_page_starting_sentences",
),
]

layout_analysis_skill = WebApiSkill(
name="Layout Analysis Skill",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@ def get_index_fields(self) -> list[SearchableField]:
type=SearchFieldDataType.String,
collection=True,
),
SimpleField(
name="PageNumber",
type=SearchFieldDataType.Int64,
sortable=True,
filterable=True,
facetable=True,
),
SearchField(
name="ChunkEmbedding",
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
Expand Down Expand Up @@ -137,19 +144,6 @@ def get_index_fields(self) -> list[SearchableField]:
),
]

if self.enable_page_by_chunking:
fields.extend(
[
SimpleField(
name="PageNumber",
type=SearchFieldDataType.Int64,
sortable=True,
filterable=True,
facetable=True,
)
]
)

return fields

def get_semantic_search(self) -> SemanticSearch:
Expand Down Expand Up @@ -194,11 +188,12 @@ def get_skills(self) -> list:
if self.enable_page_by_chunking:
embedding_skill = self.get_vector_skill(
"/document/page_wise_layout/*",
"/document/page_wise_layout/*/chunk_cleaned",
"/document/page_wise_layout/*/final_cleaned_text",
)
else:
embedding_skill = self.get_vector_skill(
"/document/chunk_mark_ups/*", "/document/chunk_mark_ups/*/chunk_cleaned"
"/document/chunk_mark_ups/*",
"/document/chunk_mark_ups/*/final_cleaned_text",
)

if self.enable_page_by_chunking:
Expand Down Expand Up @@ -229,7 +224,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
source_context = "/document/page_wise_layout/*"
mappings = [
InputFieldMappingEntry(
name="Chunk", source="/document/page_wise_layout/*/chunk_mark_up"
name="Chunk", source="/document/page_wise_layout/*/final_mark_up"
),
InputFieldMappingEntry(
name="ChunkEmbedding",
Expand All @@ -239,24 +234,25 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"),
InputFieldMappingEntry(
name="Sections",
source="/document/page_wise_layout/*/chunk_sections",
source="/document/page_wise_layout/*/final_sections",
),
InputFieldMappingEntry(
name="ChunkFigures",
source="/document/page_wise_layout/*/chunk_figures/*",
source="/document/page_wise_layout/*/final_chunk_figures/*",
),
InputFieldMappingEntry(
name="DateLastModified", source="/document/DateLastModified"
),
InputFieldMappingEntry(
name="PageNumber", source="/document/page_wise_layout/*/page_number"
name="PageNumber",
source="/document/page_wise_layout/*/final_page_number",
),
]
else:
source_context = "/document/chunk_mark_ups/*"
mappings = [
InputFieldMappingEntry(
name="Chunk", source="/document/chunk_mark_ups/*/chunk_mark_up"
name="Chunk", source="/document/chunk_mark_ups/*/final_mark_up"
),
InputFieldMappingEntry(
name="ChunkEmbedding",
Expand All @@ -265,15 +261,19 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
InputFieldMappingEntry(name="Title", source="/document/Title"),
InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"),
InputFieldMappingEntry(
name="Sections", source="/document/chunk_mark_ups/*/chunk_sections"
name="Sections", source="/document/chunk_mark_ups/*/final_sections"
),
InputFieldMappingEntry(
name="ChunkFigures",
source="/document/chunk_mark_ups/*/chunk_figures/*",
source="/document/chunk_mark_ups/*/final_chunk_figures/*",
),
InputFieldMappingEntry(
name="DateLastModified", source="/document/DateLastModified"
),
InputFieldMappingEntry(
name="PageNumber",
source="/document/chunk_mark_ups/*/final_page_number",
),
]

index_projections = SearchIndexerIndexProjection(
Expand Down
11 changes: 11 additions & 0 deletions image_processing/.coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Coverage.py configuration for the image_processing package.
[run]
# Exclude test code and package __init__ files from measurement.
omit =
    tests/*
    */__init__.py

[report]
# Mirror the measurement exclusions in the report.
omit =
    tests/*
    */__init__.py
# Lines matching these patterns are not counted as missing coverage.
exclude_lines =
    if __name__ == "__main__":
5 changes: 5 additions & 0 deletions image_processing/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,9 @@ dev = [
"pygments>=2.18.0",
"ruff>=0.8.1",
"python-dotenv>=1.0.1",
"coverage>=7.6.12",
"pytest>=8.3.4",
"pytest-asyncio>=0.25.3",
"pytest-cov>=6.0.0",
"pytest-mock>=3.14.0",
]
2 changes: 2 additions & 0 deletions image_processing/pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[pytest]
# Put the package sources on sys.path so tests can import modules directly.
pythonpath = src/image_processing
Empty file.
42 changes: 41 additions & 1 deletion image_processing/src/image_processing/layout_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
LayoutHolder,
PageWiseContentHolder,
NonPageWiseContentHolder,
PerPageStartingSentenceHolder,
)


Expand Down Expand Up @@ -340,6 +341,40 @@ def create_page_wise_content(self) -> list[LayoutHolder]:

return page_wise_contents

def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolder]:
    """Create a list of the starting sentence of each page so we can assign the starting sentence to the page number.

    Returns:
    --------
        list: A list of the starting sentence of each page."""

    per_page_starting_sentences = []

    for page in self.result.pages:
        # Slice the page's text out of the full document content using the
        # page's first span (offset/length into self.result.content).
        page_content = self.result.content[
            page.spans[0]["offset"] : page.spans[0]["offset"]
            + page.spans[0]["length"]
        ]

        # Remove any leading whitespace/newlines.
        cleaned_content = cleaned_content = page_content.lstrip()
        # Cut at whichever delimiter appears first: a newline appearing
        # before a period means split on newline; otherwise split on the
        # period. (The earlier version split on newline whenever one was
        # present anywhere, even when a period came first.)
        newline_pos = cleaned_content.find("\n")
        period_pos = cleaned_content.find(".")
        delimiter_positions = [p for p in (newline_pos, period_pos) if p != -1]
        if delimiter_positions:
            first_line = cleaned_content[: min(delimiter_positions)]
        else:
            first_line = cleaned_content

        per_page_starting_sentences.append(
            PerPageStartingSentenceHolder(
                page_number=page.page_number,
                starting_sentence=first_line.strip(),
            )
        )

    return per_page_starting_sentences

async def get_document_intelligence_client(self) -> DocumentIntelligenceClient:
"""Get the Azure Document Intelligence client.

Expand Down Expand Up @@ -487,7 +522,12 @@ async def analyse(self):
if self.extract_figures:
await self.process_figures_from_extracted_content(text_content)

output_record = NonPageWiseContentHolder(layout=text_content)
per_page_starting_sentences = self.create_per_page_starting_sentence()

output_record = NonPageWiseContentHolder(
layout=text_content,
per_page_starting_sentences=per_page_starting_sentences,
)

except Exception as e:
logging.error(e)
Expand Down
22 changes: 21 additions & 1 deletion image_processing/src/image_processing/layout_holders.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@


class FigureHolder(BaseModel):

"""A class to hold the figure extracted from the document."""

figure_id: str = Field(..., alias="FigureId")
Expand Down Expand Up @@ -48,7 +47,28 @@ class PageWiseContentHolder(BaseModel):
page_wise_layout: list[LayoutHolder]


class PerPageStartingSentenceHolder(BaseModel):
    """A class to hold the starting sentence of each page."""

    # Page number the sentence was extracted from (as reported by the
    # layout analysis result).
    page_number: int
    # First sentence/line of the page's content — used so chunks can later
    # be matched back to the page they start on.
    starting_sentence: str


class NonPageWiseContentHolder(BaseModel):
    """A class to hold the non-page-wise content extracted from the document."""

    # The full document layout as a single merged unit (not split per page).
    layout: LayoutHolder
    # Starting sentence of each page; defaults to an empty list for callers
    # that do not supply per-page information.
    per_page_starting_sentences: list[PerPageStartingSentenceHolder] = Field(
        default_factory=list
    )


class ChunkHolder(BaseModel):
    """A class to hold the text extracted from the document after it has been chunked."""

    # Raw mark-up text of this chunk.
    mark_up: str
    # Section headings detected within the chunk, if any.
    sections: Optional[list[str]] = Field(default_factory=list)
    # Figures that fall within this chunk, if any.
    figures: Optional[list[FigureHolder]] = Field(default_factory=list)
    # First sentence of the chunk — presumably compared against
    # PerPageStartingSentenceHolder.starting_sentence to assign a page;
    # TODO confirm against the chunker.
    starting_sentence: Optional[str] = None
    # Cleaned version of the mark-up; populated by a later cleaning step.
    cleaned_text: Optional[str] = None
    # Page the chunk starts on, when page tracking is enabled.
    page_number: Optional[int] = Field(default=None)
Loading
Loading