From 5219ebb0429a177efecbbb7ce80c90e3e94c3184 Mon Sep 17 00:00:00 2001
From: Jamie Lemon
Date: Tue, 6 May 2025 17:29:59 +0100
Subject: [PATCH] Multi-page document to single page document.

Adds a concept for converting the supplied document into a long single page
PDF during the `to_markdown()` processing. The behavior is opt-in via the new
`parse_single_long_page` parameter (default: False).
---
Review notes (this section is ignored by `git am`):
- The progress condition used `show_progress & parse_single_long_page == False`,
  which Python evaluates as `(show_progress & parse_single_long_page) == False`
  and therefore enabled the progress bar when `show_progress` was False.
- The target rectangle did not offset its right edge by the column position,
  which broke layouts with `pages_per_row > 1`.
- Added width limit check, argument validation and conversion of non-PDF
  sources; added docstrings.
- Blob hashes in the `index` lines are unchanged from the original submission.

 .../pymupdf4llm/helpers/pymupdf_rag.py        | 10 +++-
 .../pymupdf4llm/helpers/single_long_page.py   | 73 +++++++++++++++++++
 2 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 pymupdf4llm/pymupdf4llm/helpers/single_long_page.py

diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
index c80a2fbc..1cacf4ae 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -43,6 +43,7 @@
 from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white
 from pymupdf4llm.helpers.multi_column import column_boxes
 from pymupdf4llm.helpers.progress import ProgressBar
+from pymupdf4llm.helpers.single_long_page import SingleLongPageDocument
 from dataclasses import dataclass
 from collections import defaultdict
 
@@ -321,6 +322,7 @@ def to_markdown(
     extract_words=False,
     show_progress=False,
     use_glyphs=False,
+    parse_single_long_page=False,
 ) -> str:
     """Process the document and return the text of the selected pages.
 
@@ -1137,9 +1139,14 @@ def get_page_output(
     if use_glyphs:
         textflags |= mupdf.FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE
 
-    if show_progress:
+    if show_progress and not parse_single_long_page:
         print(f"Processing {FILENAME}...")
         pages = ProgressBar(pages)
+    if parse_single_long_page:
+        doc = SingleLongPageDocument(doc)
+        pages = range(doc.page_count)
+        if show_progress:
+            print(f"Processing {FILENAME} page 1 of 1 ...")
     for pno in pages:
         parms = get_page_output(
             doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS
@@ -1191,7 +1198,6 @@ def extract_images_on_page_simple(page, parms, image_size_limit):
     return img_info
 
 
-
 def filter_small_images(page, parms, image_size_limit):
     img_info = []
     for item in page.get_image_info():
diff --git a/pymupdf4llm/pymupdf4llm/helpers/single_long_page.py b/pymupdf4llm/pymupdf4llm/helpers/single_long_page.py
new file mode 100644
index 00000000..61620198
--- /dev/null
+++ b/pymupdf4llm/pymupdf4llm/helpers/single_long_page.py
@@ -0,0 +1,73 @@
+"""Merge all pages of a document onto one long PDF page."""
+
+import pymupdf
+
+# MuPDF limits page sizes to about 32,767 x 32,767 points.
+MAX_PAGE_EXTENT = 32767
+
+
+class _SingleLongPage:
+    """Build a one-page PDF showing every page of a source document.
+
+    Pages are placed on a grid, left to right and top to bottom, with
+    `pages_per_row` pages per row. Every grid cell has the size of the
+    first source page, so equally sized pages are assumed.
+    """
+
+    _doc: pymupdf.Document
+
+    def __init__(self, doc: pymupdf.Document, pages_per_row=1):
+        """Create the single-page document from `doc`.
+
+        Args:
+            doc: source document.
+            pages_per_row: number of source pages placed side by side.
+
+        Raises:
+            ValueError: if `pages_per_row` is less than 1, `doc` has no
+                pages, or the combined page exceeds the MuPDF size limit.
+        """
+        if pages_per_row < 1:
+            raise ValueError("pages_per_row must be at least 1")
+        page_count = doc.page_count
+        if page_count < 1:
+            raise ValueError("Document has no pages")
+
+        # show_pdf_page() only accepts PDF sources, so convert other types.
+        if not doc.is_pdf:
+            doc = pymupdf.open("pdf", doc.convert_to_pdf())
+
+        # Use the dimensions of the first page as the grid cell size.
+        # This assumes that all the pages are the same width and height!
+        first_rect = doc[0].rect
+        page_width = first_rect.width
+        page_height = first_rect.height
+
+        # Calculate rows needed (ceiling division)
+        rows = (page_count + pages_per_row - 1) // pages_per_row
+        total_width = page_width * min(pages_per_row, page_count)
+        total_height = page_height * rows
+
+        if total_width >= MAX_PAGE_EXTENT:
+            raise ValueError("Page width exceeds maximum of 32,767 points")
+        if total_height >= MAX_PAGE_EXTENT:
+            raise ValueError("Page height exceeds maximum of 32,767 points")
+
+        # Create a new PDF with a single page large enough to hold all pages
+        self._doc = pymupdf.open()
+        single_page = self._doc.new_page(width=total_width, height=total_height)
+
+        # Copy each source page to its cell on the destination page
+        for pno in range(page_count):
+            row, col = divmod(pno, pages_per_row)
+            x = col * page_width
+            y = row * page_height
+            src_rect = doc[pno].rect
+            # Offset both corners by the cell origin (x1 is x + width).
+            target = pymupdf.Rect(x, y, x + src_rect.width, y + src_rect.height)
+            single_page.show_pdf_page(target, doc, pno)
+
+
+def SingleLongPageDocument(doc: pymupdf.Document, pages_per_row=1) -> pymupdf.Document:
+    """Return a new one-page PDF containing all pages of `doc`."""
+    return _SingleLongPage(doc, pages_per_row)._doc