Skip to content

Commit 7f6bd62

Browse files
committed
Update layout analysis
1 parent a497052 commit 7f6bd62

File tree

2 files changed

+22
-7
lines changed

2 files changed

+22
-7
lines changed

image_processing/src/image_processing/layout_analysis.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -356,12 +356,20 @@ def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolde
356356
+ page.spans[0]["length"]
357357
]
358358

359-
starting_sentence = page_content.split(".")[0]
359+
# Remove any leading whitespace/newlines.
360+
cleaned_content = page_content.lstrip()
361+
# If the content contains a newline, keep only the first line; otherwise, keep the text before the first period.
362+
if "\n" in cleaned_content:
363+
first_line = cleaned_content.split("\n", 1)[0]
364+
elif "." in cleaned_content:
365+
first_line = cleaned_content.split(".", 1)[0]
366+
else:
367+
first_line = cleaned_content
360368

361369
per_page_starting_sentences.append(
362370
PerPageStartingSentenceHolder(
363371
page_number=page.page_number,
364-
starting_sentence=starting_sentence,
372+
starting_sentence=first_line.strip(),
365373
)
366374
)
367375

image_processing/src/image_processing/semantic_text_chunker.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -507,11 +507,16 @@ def assign_page_number_to_chunks(
507507
list[ChunkHolder]: The list of chunks with page numbers assigned."""
508508
page_number = 1
509509
for chunk in chunks:
510-
if per_page_starting_sentences:
511-
for per_page_starting_sentence in per_page_starting_sentences:
512-
if per_page_starting_sentence.starting_sentence in chunk:
513-
page_number = per_page_starting_sentence.page_number
514-
break
510+
for per_page_starting_sentence in per_page_starting_sentences[
511+
page_number - 1 :
512+
]:
513+
if per_page_starting_sentence.starting_sentence in chunk:
514+
logging.info(
515+
"Assigning page number %i to chunk",
516+
per_page_starting_sentence.page_number,
517+
)
518+
page_number = per_page_starting_sentence.page_number
519+
break
515520
chunk.page_number = page_number
516521
return chunks
517522

@@ -546,6 +551,8 @@ async def process_semantic_text_chunker(record: dict, text_chunker) -> dict:
546551
for sentence in record["data"]["per_page_starting_sentences"]
547552
]
548553

554+
logging.info(f"Per page starting sentences: {per_page_starting_sentences}")
555+
549556
chunks = text_chunker.assign_page_number_to_chunks(
550557
chunks, per_page_starting_sentences
551558
)

0 commit comments

Comments
 (0)