Skip to content

Commit ce340e8

Browse files
committed
Add header/footer removal for pypdf
1 parent 267a925 commit ce340e8

17 files changed

+390
-585
lines changed

pdf_benchmark/library_code.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ def pypdf_get_text(data: bytes) -> str:
2828
reader = pypdf.PdfReader(BytesIO(data))
2929
for page in reader.pages:
3030
texts.append(page.extract_text())
31-
text = postprocess(texts)
31+
text = postprocess(texts, reader.page_labels)
3232
return text
3333

3434

pdf_benchmark/text_extraction_post_processing.py

Lines changed: 16 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,8 @@
1-
def postprocess(extracted_texts: list[str]) -> str:
1+
def postprocess(extracted_texts: list[str], page_labels: list[str]) -> str:
22
"""Pass a list of all extracted texts from all pages."""
33
extracted_texts = [replace_ligatures(t) for t in extracted_texts]
44
extracted_texts = [remove_hyphens(t) for t in extracted_texts]
5-
# footer_remover = FooterRemover()
6-
# footer_remover.fit(extracted_texts)
7-
# extracted_texts = [footer_remover.extract(t) for t in extracted_texts]
8-
5+
extracted_texts = remove_footer(extracted_texts, page_labels)
96
return "\n".join(extracted_texts)
107

118

@@ -52,32 +49,20 @@ def dehyphenate(lines: list[str], line_no: int) -> list[str]:
5249
return lines
5350

5451

55-
class FooterRemover:
56-
def __init__(self):
57-
self.footer = None
52+
def remove_footer(extracted_texts: list[str], page_labels: list[str]):
53+
def remove_page_labels(extracted_texts, page_labels):
54+
processed = []
55+
for text, label in zip(extracted_texts, page_labels):
56+
text_left = text.lstrip()
57+
if text_left.startswith(label):
58+
text = text_left[len(label) :]
5859

59-
def fit(self, extracted_texts: list[str]) -> None:
60-
"""
61-
Find the common footer by comparing all extracted texts
62-
and finding the common suffix.
63-
We assume that the footer appears at the end of each text.
64-
"""
65-
common_suffix = None
66-
for text in extracted_texts:
67-
if common_suffix is None:
68-
common_suffix = text
69-
else:
70-
i = 1
71-
while i <= min(len(common_suffix), len(text)):
72-
if common_suffix[-i:] != text[-i:]:
73-
break
74-
i += 1
75-
common_suffix = common_suffix[-(i - 1) :]
60+
text_right = text.rstrip()
61+
if text_right.endswith(label):
62+
text = text_right[: -len(label)]
7663

77-
self.footer = common_suffix
64+
processed.append(text)
65+
return processed
7866

79-
def extract(self, extracted_text: str) -> str:
80-
"""Remove the detected footer from the extracted text."""
81-
if self.footer is not None and extracted_text.endswith(self.footer):
82-
return extracted_text[: -len(self.footer)]
83-
return extracted_text
67+
extracted_texts = remove_page_labels(extracted_texts, page_labels)
68+
return extracted_texts

0 commit comments

Comments
 (0)