ENH: Improve PDFium text extraction (#11)

mqq-marek · web-flow · commit 24c51dd4194b · 2023-10-31T22:44:53.000+01:00
Several additional changes:

* ENH: Add PDFium image extraction
* ROB: Make opening/parsing the cache file more robust
* MAINT: Update deprecated pydantic API
* MAINT: Add pdfrw to main.in
diff --git a/benchmark.py b/benchmark.py
@@ -7,6 +7,7 @@
 import time
 from io import BytesIO
 from itertools import product
+from json import JSONDecodeError
 from pathlib import Path
 from typing import Literal
 
@@ -34,7 +35,7 @@
     pymupdf_watermarking,
     pypdf_get_text,
     pypdf_image_extraction,
-    pypdf_watermarking,
+    pypdf_watermarking, tika_get_text, pdfium_image_extraction,
 )
 from pdf_benchmark.output import write_benchmark_report
 from pdf_benchmark.score import get_text_extraction_score
@@ -48,8 +49,11 @@ def main(
 ) -> None:
     cache_path = Path("cache.json")
     if cache_path.exists():
-        with open(cache_path) as f:
-            cache = Cache.parse_obj(json.load(f))
+        try:
+            with open(cache_path) as f:
+                cache = Cache.model_validate(json.load(f))
+        except JSONDecodeError:
+            cache = Cache()
     else:
         cache = Cache()
     names = sorted(list(libraries.keys()))
@@ -154,9 +158,7 @@ def write_single_result(
             "Tika",
             "tika",
             "https://pypi.org/project/tika/",
-            text_extraction_function=lambda n: parser.from_buffer(BytesIO(n))[
-                "content"
-            ],
+            text_extraction_function=tika_get_text,
             version=tika.__version__,
             dependencies="Apache Tika",
             license="Apache v2",
@@ -233,6 +235,7 @@ def write_single_result(
             text_extraction_function=pdfium_get_text,
             version=pypdfium2.V_PYPDFIUM2,
             watermarking_function=None,
+            image_extraction_function=pdfium_image_extraction,
             license="Apache-2.0 or BSD-3-Clause",
             last_release_date="2023-07-04",
             dependencies="PDFium (Foxit/Google)",
diff --git a/pdf_benchmark/data_structures.py b/pdf_benchmark/data_structures.py
@@ -88,4 +88,4 @@ def has_doc(self, library: Library, document: Document) -> bool:
 
     def write(self, path: Path):
         with open(path, "w") as f:
-            f.write(self.json(indent=4, sort_keys=True))
+            f.write(self.model_dump_json(indent=4))
diff --git a/pdf_benchmark/library_code.py b/pdf_benchmark/library_code.py
@@ -11,8 +11,9 @@
 from borb.pdf.pdf import PDF
 from borb.toolkit.text.simple_text_extraction import SimpleTextExtraction
 from pdfminer.high_level import extract_pages
+from requests import ReadTimeout
 
-from .text_extraction_post_processing import postprocess
+from .text_extraction_post_processing import postprocess, PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE
 
 
 def pymupdf_get_text(data: bytes) -> str:
@@ -32,16 +33,44 @@ def pypdf_get_text(data: bytes) -> str:
     return text
 
 
+def pdfium_new_line_after_hyphens(text):
+    return text.replace(PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE, PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE + '\n')
+
+
 def pdfium_get_text(data: bytes) -> str:
-    text = ""
+    texts = []
+    page_labels = []
     pdf = pdfium.PdfDocument(data)
+
     for i in range(len(pdf)):
+        if not (label := pdf.get_page_label(i)):
+            label = str(i + 1)
+        page_labels.append(label)
         page = pdf.get_page(i)
         textpage = page.get_textpage()
-        text += textpage.get_text_range() + "\n"
+        texts.append(pdfium_new_line_after_hyphens(textpage.get_text_range()))
+    text = postprocess(texts, page_labels)
     return text
 
 
+def pdfium_image_extraction(data: bytes) -> list[tuple[str, bytes]]:
+    images = []
+    try:
+        pdf = pdfium.PdfDocument(data)
+        for i in range(len(pdf)):
+            page = pdf.get_page(i)
+            index = 1
+            for obj in page.get_objects():
+                if isinstance(obj, pdfium.PdfImage):
+                    img = BytesIO()
+                    obj.extract(img)
+                    images.append((f"page-{i+1}-image-{index}.jpg", img.getvalue()))
+                    index += 1
+    except Exception as exc:
+        print(f"pdfium Image extraction failure: {exc}")
+    return images
+
+
 def pypdf_watermarking(watermark_data: bytes, data: bytes) -> bytes:
     watermark_pdf = pypdf.PdfReader(BytesIO(watermark_data))
     watermark_page = watermark_pdf.pages[0]
@@ -87,7 +116,7 @@ def pymupdf_image_extraction(data: bytes) -> list[tuple[str, bytes]]:
                 image_bytes = base_image["image"]
                 image_ext = base_image["ext"]
                 images.append(
-                    (f"image{page_index+1}_{image_index}.{image_ext}", image_bytes)
+                    (f"image{page_index + 1}_{image_index}.{image_ext}", image_bytes)
                 )
     return images
 
@@ -170,7 +199,10 @@ def pdftotext_get_text(data: bytes) -> str:
     new_file, filename = tempfile.mkstemp()
     with open(filename, "wb") as fp:
         fp.write(data)
-    args = ["/usr/bin/pdftotext", "-enc", "UTF-8", filename, "-"]
+    pdf_to_text_path = "/usr/bin/pdftotext"
+    if not os.path.exists(pdf_to_text_path):
+        pdf_to_text_path = 'pdftotext'
+    args = [pdf_to_text_path, "-enc", "UTF-8", filename, "-"]
     res = subprocess.run(args, capture_output=True)
     output = res.stdout.decode("utf-8")
     os.close(new_file)
@@ -191,3 +223,15 @@ def pdfrw_watermarking(watermark_data: bytes, data: bytes) -> bytes:
 
     out_buffer.seek(0)
     return out_buffer.read()
+
+
+def tika_get_text(data: bytes) -> str:
+    from tika import parser
+
+    try:
+        return parser.from_buffer(BytesIO(data), requestOptions={"timeout": (1, 100)})[
+            "content"
+        ]
+    except ReadTimeout as ex:
+        print("Tika timeout:", ex)
+        return "[[[Tika text extraction failed!]]]"
diff --git a/pdf_benchmark/text_extraction_post_processing.py b/pdf_benchmark/text_extraction_post_processing.py
@@ -1,3 +1,5 @@
+PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE = "\ufffe"
+
 def postprocess(extracted_texts: list[str], page_labels: list[str]) -> str:
     """Pass a list of all extracted texts from all pages."""
     extracted_texts = [replace_ligatures(t) for t in extracted_texts]
@@ -30,7 +32,7 @@ def remove_hyphens(text: str) -> str:
     # Find dashes
     line_numbers = []
     for line_no, line in enumerate(lines[:-1]):
-        if line.endswith("-"):
+        if line.endswith("-") or line.endswith(PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE):
             line_numbers.append(line_no)
 
     # Replace
diff --git a/requirements/main.in b/requirements/main.in
@@ -11,4 +11,5 @@ pdftotext
 pydantic
 pymupdf
 pypdfium2
+pdfrw
 lxml