Skip to content

Commit 24c51dd

Browse files
authored
ENH: Improve PDFium text extraction (#11)
Several additional changes: * ENH: Add PDFium image extraction * ROB: Make opening/parsing the cache file more robust * MAINT: Update deprecated pydantic API * MAINT: Add pdfrw to main.in
1 parent 4f14b3c commit 24c51dd

File tree

5 files changed

+63
-13
lines changed

5 files changed

+63
-13
lines changed

benchmark.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import time
88
from io import BytesIO
99
from itertools import product
10+
from json import JSONDecodeError
1011
from pathlib import Path
1112
from typing import Literal
1213

@@ -34,7 +35,7 @@
3435
pymupdf_watermarking,
3536
pypdf_get_text,
3637
pypdf_image_extraction,
37-
pypdf_watermarking,
38+
pypdf_watermarking, tika_get_text, pdfium_image_extraction,
3839
)
3940
from pdf_benchmark.output import write_benchmark_report
4041
from pdf_benchmark.score import get_text_extraction_score
@@ -48,8 +49,11 @@ def main(
4849
) -> None:
4950
cache_path = Path("cache.json")
5051
if cache_path.exists():
51-
with open(cache_path) as f:
52-
cache = Cache.parse_obj(json.load(f))
52+
try:
53+
with open(cache_path) as f:
54+
cache = Cache.model_validate(json.load(f))
55+
except JSONDecodeError:
56+
cache = Cache()
5357
else:
5458
cache = Cache()
5559
names = sorted(list(libraries.keys()))
@@ -154,9 +158,7 @@ def write_single_result(
154158
"Tika",
155159
"tika",
156160
"https://pypi.org/project/tika/",
157-
text_extraction_function=lambda n: parser.from_buffer(BytesIO(n))[
158-
"content"
159-
],
161+
text_extraction_function=tika_get_text,
160162
version=tika.__version__,
161163
dependencies="Apache Tika",
162164
license="Apache v2",
@@ -233,6 +235,7 @@ def write_single_result(
233235
text_extraction_function=pdfium_get_text,
234236
version=pypdfium2.V_PYPDFIUM2,
235237
watermarking_function=None,
238+
image_extraction_function=pdfium_image_extraction,
236239
license="Apache-2.0 or BSD-3-Clause",
237240
last_release_date="2023-07-04",
238241
dependencies="PDFium (Foxit/Google)",

pdf_benchmark/data_structures.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,4 @@ def has_doc(self, library: Library, document: Document) -> bool:
8888

8989
def write(self, path: Path):
9090
with open(path, "w") as f:
91-
f.write(self.json(indent=4, sort_keys=True))
91+
f.write(self.model_dump_json(indent=4))

pdf_benchmark/library_code.py

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@
1111
from borb.pdf.pdf import PDF
1212
from borb.toolkit.text.simple_text_extraction import SimpleTextExtraction
1313
from pdfminer.high_level import extract_pages
14+
from requests import ReadTimeout
1415

15-
from .text_extraction_post_processing import postprocess
16+
from .text_extraction_post_processing import postprocess, PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE
1617

1718

1819
def pymupdf_get_text(data: bytes) -> str:
@@ -32,16 +33,44 @@ def pypdf_get_text(data: bytes) -> str:
3233
return text
3334

3435

36+
def pdfium_new_line_after_hyphens(text):
    """Return *text* with a line feed inserted after each PDFium marker.

    Every occurrence of ``PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE`` is kept and is
    immediately followed by a newline character, so the marker always ends
    up as the last character of a line.
    """
    marker = PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE
    # Splitting on the marker and re-joining with "marker + newline" yields
    # the same string as replacing each marker by itself plus a newline.
    return f"{marker}\n".join(text.split(marker))
38+
39+
3540
def pdfium_get_text(data: bytes) -> str:
36-
text = ""
41+
texts = []
42+
page_labels = []
3743
pdf = pdfium.PdfDocument(data)
44+
3845
for i in range(len(pdf)):
46+
if not (label := pdf.get_page_label(i)):
47+
label = str(i + 1)
48+
page_labels.append(label)
3949
page = pdf.get_page(i)
4050
textpage = page.get_textpage()
41-
text += textpage.get_text_range() + "\n"
51+
texts.append(pdfium_new_line_after_hyphens(textpage.get_text_range()))
52+
text = postprocess(texts, page_labels)
4253
return text
4354

4455

56+
def pdfium_image_extraction(data: bytes) -> list[tuple[str, bytes]]:
    """Extract the image objects of a PDF with pypdfium2.

    Args:
        data: Raw bytes of the PDF document.

    Returns:
        A list of ``(filename, image_bytes)`` tuples. Filenames follow the
        pattern ``page-<page>-image-<index>.jpg`` where ``<page>`` is the
        1-based page number and ``<index>`` is a 1-based counter that
        restarts on every page and only counts image objects.

    Any exception raised while opening the document or extracting an image
    is printed and ends the extraction; images collected up to that point
    are still returned.
    """
    extracted = []
    try:
        document = pdfium.PdfDocument(data)
        for page_no in range(len(document)):
            page = document.get_page(page_no)
            # Lazily keep only the image objects so that the per-page
            # counter skips every other kind of page object.
            image_objects = (
                candidate
                for candidate in page.get_objects()
                if isinstance(candidate, pdfium.PdfImage)
            )
            for image_no, image_obj in enumerate(image_objects, start=1):
                buffer = BytesIO()
                image_obj.extract(buffer)
                # NOTE(review): the ".jpg" suffix is hard-coded; confirm it
                # matches the format that extract() actually writes.
                filename = f"page-{page_no + 1}-image-{image_no}.jpg"
                extracted.append((filename, buffer.getvalue()))
    except Exception as exc:
        # Deliberate best-effort: report the failure and return what we have.
        print(f"pdfium Image extraction failure: {exc}")
    return extracted
72+
73+
4574
def pypdf_watermarking(watermark_data: bytes, data: bytes) -> bytes:
4675
watermark_pdf = pypdf.PdfReader(BytesIO(watermark_data))
4776
watermark_page = watermark_pdf.pages[0]
@@ -87,7 +116,7 @@ def pymupdf_image_extraction(data: bytes) -> list[tuple[str, bytes]]:
87116
image_bytes = base_image["image"]
88117
image_ext = base_image["ext"]
89118
images.append(
90-
(f"image{page_index+1}_{image_index}.{image_ext}", image_bytes)
119+
(f"image{page_index + 1}_{image_index}.{image_ext}", image_bytes)
91120
)
92121
return images
93122

@@ -170,7 +199,10 @@ def pdftotext_get_text(data: bytes) -> str:
170199
new_file, filename = tempfile.mkstemp()
171200
with open(filename, "wb") as fp:
172201
fp.write(data)
173-
args = ["/usr/bin/pdftotext", "-enc", "UTF-8", filename, "-"]
202+
pdf_to_text_path = "/usr/bin/pdftotext"
203+
if not os.path.exists(pdf_to_text_path):
204+
pdf_to_text_path = 'pdftotext'
205+
args = [pdf_to_text_path, "-enc", "UTF-8", filename, "-"]
174206
res = subprocess.run(args, capture_output=True)
175207
output = res.stdout.decode("utf-8")
176208
os.close(new_file)
@@ -191,3 +223,15 @@ def pdfrw_watermarking(watermark_data: bytes, data: bytes) -> bytes:
191223

192224
out_buffer.seek(0)
193225
return out_buffer.read()
226+
227+
228+
def tika_get_text(data: bytes) -> str:
    """Extract the text of a PDF through Tika.

    Args:
        data: Raw bytes of the PDF document.

    Returns:
        The ``"content"`` entry of the result returned by
        ``tika.parser.from_buffer``, or the placeholder string
        ``"[[[Tika text extraction failed!]]]"`` when the request raises
        ``ReadTimeout``.
    """
    # Imported lazily, as in the original, so tika is only loaded on use.
    from tika import parser

    # Presumably (connect, read) timeouts in seconds, following the
    # requests convention -- TODO confirm against the tika client.
    request_options = {"timeout": (1, 100)}
    try:
        parsed = parser.from_buffer(BytesIO(data), requestOptions=request_options)
        # NOTE(review): the value is returned as-is; confirm it can never be
        # None, given the ``-> str`` annotation.
        return parsed["content"]
    except ReadTimeout as ex:
        print("Tika timeout:", ex)
        return "[[[Tika text extraction failed!]]]"

pdf_benchmark/text_extraction_post_processing.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE = "\ufffe"
2+
13
def postprocess(extracted_texts: list[str], page_labels: list[str]) -> str:
24
"""Pass a list of all extracted texts from all pages."""
35
extracted_texts = [replace_ligatures(t) for t in extracted_texts]
@@ -30,7 +32,7 @@ def remove_hyphens(text: str) -> str:
3032
# Find dashes
3133
line_numbers = []
3234
for line_no, line in enumerate(lines[:-1]):
33-
if line.endswith("-"):
35+
if line.endswith("-") or line.endswith(PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE):
3436
line_numbers.append(line_no)
3537

3638
# Replace

requirements/main.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ pdftotext
1111
pydantic
1212
pymupdf
1313
pypdfium2
14+
pdfrw
1415
lxml

0 commit comments

Comments
 (0)