From 5219ebb0429a177efecbbb7ce80c90e3e94c3184 Mon Sep 17 00:00:00 2001
From: Jamie Lemon <jamie.lemon@artifex.com>
Date: Tue, 6 May 2025 17:29:59 +0100
Subject: [PATCH] Multi-page document to single page document.

Adds an opt-in `parse_single_long_page` flag that converts
the supplied document into one long single-page PDF during
the `to_markdown()` processing.
---
 .../pymupdf4llm/helpers/pymupdf_rag.py        | 10 +++-
 .../pymupdf4llm/helpers/single_long_page.py   | 53 +++++++++++++++++++
 2 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100644 pymupdf4llm/pymupdf4llm/helpers/single_long_page.py

diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
index c80a2fbc..1cacf4ae 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -43,6 +43,7 @@
 from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white
 from pymupdf4llm.helpers.multi_column import column_boxes
 from pymupdf4llm.helpers.progress import ProgressBar
+from pymupdf4llm.helpers.single_long_page import SingleLongPageDocument
 from dataclasses import dataclass
 from collections import defaultdict
 
@@ -321,6 +322,7 @@ def to_markdown(
     extract_words=False,
     show_progress=False,
     use_glyphs=False,
+    parse_single_long_page=False,
 ) -> str:
     """Process the document and return the text of the selected pages.
 
@@ -1137,9 +1139,14 @@ def get_page_output(
     if use_glyphs:
         textflags |= mupdf.FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE
 
-    if show_progress:
+    if show_progress and not parse_single_long_page:  # single-page mode prints its own status below
         print(f"Processing {FILENAME}...")
         pages = ProgressBar(pages)
+    if parse_single_long_page:
+        doc = SingleLongPageDocument(doc)
+        pages = range(doc.page_count)  # replaces any previously selected pages
+        if show_progress:
+            print(f"Processing {FILENAME} page 1 of 1 ...")
     for pno in pages:
         parms = get_page_output(
             doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS
@@ -1191,7 +1198,6 @@ def extract_images_on_page_simple(page, parms, image_size_limit):
 
     return img_info
 
-
 def filter_small_images(page, parms, image_size_limit):
     img_info = []
     for item in page.get_image_info():
diff --git a/pymupdf4llm/pymupdf4llm/helpers/single_long_page.py b/pymupdf4llm/pymupdf4llm/helpers/single_long_page.py
new file mode 100644
index 00000000..61620198
--- /dev/null
+++ b/pymupdf4llm/pymupdf4llm/helpers/single_long_page.py
@@ -0,0 +1,53 @@
+import pymupdf
+
+class _SingleLongPage:
+    """Tile every page of a document onto one large page of a new PDF."""
+
+    _doc: pymupdf.Document  # the generated one-page document
+
+    def __init__(self, doc: pymupdf.Document, pages_per_row=1):
+        """Build the single-page copy of ``doc``.
+
+        Pages are placed left to right, ``pages_per_row`` per row, on a grid
+        whose cell size comes from the first page; every page is assumed to
+        have the same width and height as the first one.
+
+        Raises:
+            ValueError: if the combined page height reaches 32,767 points.
+        """
+        page_count = doc.page_count
+
+        # The first page defines the grid cell size.
+        first_page = doc[0]
+        page_width = first_page.rect.width
+        page_height = first_page.rect.height
+
+        # Number of grid rows needed to hold all pages.
+        rows = (page_count + pages_per_row - 1) // pages_per_row  # ceiling division
+        total_width = page_width * min(pages_per_row, page_count)
+        total_height = page_height * rows
+
+        # note: MuPDF specifically has a limit of about 32,767 × 32,767 points for page size
+        if total_height >= 32767:
+            raise ValueError("Page height exceeds maximum of 32,767 points")
+
+        # New empty PDF holding one page large enough for the whole grid.
+        self._doc = pymupdf.open()
+        single_page = self._doc.new_page(width=total_width, height=total_height)
+
+        # Copy each source page into its grid cell on the destination page.
+        for i in range(page_count):
+            row, col = divmod(i, pages_per_row)
+            x = col * page_width
+            y = row * page_height
+            src_page = doc[i]
+
+            # Rect takes corner coordinates (x0, y0, x1, y1), so the right edge
+            # must be offset by x; passing the bare page width is only correct
+            # for the first column.
+            r = pymupdf.Rect(x, y, x + src_page.rect.width, y + src_page.rect.height)
+            single_page.show_pdf_page(r, doc, i)
+
+def SingleLongPageDocument(doc:pymupdf.Document, pages_per_row=1) -> pymupdf.Document:
+    # Build the merged layout and hand back only the generated one-page document.
+    return _SingleLongPage(doc, pages_per_row)._doc