11
11
from borb .pdf .pdf import PDF
12
12
from borb .toolkit .text .simple_text_extraction import SimpleTextExtraction
13
13
from pdfminer .high_level import extract_pages
14
+ from requests import ReadTimeout
14
15
15
- from .text_extraction_post_processing import postprocess
16
+ from .text_extraction_post_processing import postprocess , PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE
16
17
17
18
18
19
def pymupdf_get_text (data : bytes ) -> str :
@@ -32,16 +33,44 @@ def pypdf_get_text(data: bytes) -> str:
32
33
return text
33
34
34
35
36
def pdfium_new_line_after_hyphens(text):
    """Insert a line break after every pdfium no-break-space marker in *text*.

    Each occurrence of ``PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE`` is replaced by the
    marker itself followed by a newline sequence; all other characters are
    left untouched.

    Args:
        text: Text to process (a string as returned by pdfium text extraction).

    Returns:
        The text with the break sequence appended after each marker.
    """
    marker = PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE
    # NOTE(review): presumably pdfium emits this marker where a word was
    # hyphenated at a line end -- confirm against the post-processing module.
    # NOTE(review): the suffix literal is kept exactly as it appears in the
    # reviewed source, including the space after the newline -- confirm that
    # the space is intended.
    marker_with_break = marker + '\n '
    return text.replace(marker, marker_with_break)
38
+
39
+
35
40
def pdfium_get_text(data: bytes) -> str:
    """Extract the text of a PDF with pdfium and post-process it.

    For every page, the page label and the page text are collected. The text
    is passed through ``pdfium_new_line_after_hyphens`` first, and the two
    parallel lists are then handed to ``postprocess``.

    Args:
        data: Raw bytes of the PDF document.

    Returns:
        Whatever ``postprocess`` produces for the collected page texts and
        page labels.
    """
    page_texts = []
    labels = []
    document = pdfium.PdfDocument(data)

    for page_number in range(len(document)):
        # Pages without a usable label fall back to their 1-based number.
        labels.append(document.get_page_label(page_number) or str(page_number + 1))

        # Keep the page object in a local while its text page is in use.
        page = document.get_page(page_number)
        textpage = page.get_textpage()
        raw_text = textpage.get_text_range()
        page_texts.append(pdfium_new_line_after_hyphens(raw_text))

    return postprocess(page_texts, labels)
43
54
44
55
56
def pdfium_image_extraction(data: bytes) -> list[tuple[str, bytes]]:
    """Extract the embedded images of a PDF with pdfium.

    Extraction is best effort: failures are reported on stdout and never
    raised. An image that cannot be extracted is skipped without discarding
    the remaining images; a failure to open the document or to walk a page
    ends the extraction and returns whatever was collected so far.

    Args:
        data: Raw bytes of the PDF document.

    Returns:
        A list of ``(filename, image_bytes)`` tuples. Filenames have the form
        ``page-<page number>-image-<image number>.jpg``, both numbers 1-based,
        where the image number is the position among the page's images.
    """
    images: list[tuple[str, bytes]] = []
    try:
        pdf = pdfium.PdfDocument(data)
        for page_index in range(len(pdf)):
            page = pdf.get_page(page_index)
            page_images = (
                obj for obj in page.get_objects() if isinstance(obj, pdfium.PdfImage)
            )
            for image_index, image in enumerate(page_images, start=1):
                buffer = BytesIO()
                try:
                    image.extract(buffer)
                except Exception as exc:
                    # One unreadable image must not cost us all later images.
                    print(f"pdfium Image extraction failure: {exc}")
                    continue
                # NOTE(review): the extension is always ".jpg"; confirm that
                # the extracted bytes are JPEG for every image type.
                images.append(
                    (f"page-{page_index + 1}-image-{image_index}.jpg", buffer.getvalue())
                )
    except Exception as exc:
        # Document-level or page-level failure: keep what was extracted so far.
        print(f"pdfium Image extraction failure: {exc}")
    return images
72
+
73
+
45
74
def pypdf_watermarking (watermark_data : bytes , data : bytes ) -> bytes :
46
75
watermark_pdf = pypdf .PdfReader (BytesIO (watermark_data ))
47
76
watermark_page = watermark_pdf .pages [0 ]
@@ -87,7 +116,7 @@ def pymupdf_image_extraction(data: bytes) -> list[tuple[str, bytes]]:
87
116
image_bytes = base_image ["image" ]
88
117
image_ext = base_image ["ext" ]
89
118
images .append (
90
- (f"image{ page_index + 1 } _{ image_index } .{ image_ext } " , image_bytes )
119
+ (f"image{ page_index + 1 } _{ image_index } .{ image_ext } " , image_bytes )
91
120
)
92
121
return images
93
122
@@ -170,7 +199,10 @@ def pdftotext_get_text(data: bytes) -> str:
170
199
new_file , filename = tempfile .mkstemp ()
171
200
with open (filename , "wb" ) as fp :
172
201
fp .write (data )
173
- args = ["/usr/bin/pdftotext" , "-enc" , "UTF-8" , filename , "-" ]
202
+ pdf_to_text_path = "/usr/bin/pdftotext"
203
+ if not os .path .exists (pdf_to_text_path ):
204
+ pdf_to_text_path = 'pdftotext'
205
+ args = [pdf_to_text_path , "-enc" , "UTF-8" , filename , "-" ]
174
206
res = subprocess .run (args , capture_output = True )
175
207
output = res .stdout .decode ("utf-8" )
176
208
os .close (new_file )
@@ -191,3 +223,15 @@ def pdfrw_watermarking(watermark_data: bytes, data: bytes) -> bytes:
191
223
192
224
out_buffer .seek (0 )
193
225
return out_buffer .read ()
226
+
227
+
228
def tika_get_text(data: bytes) -> str:
    """Extract the text of a document through Apache Tika.

    Args:
        data: Raw bytes of the document to parse.

    Returns:
        The extracted text, an empty string when Tika reports no content, or
        the placeholder ``"[[[Tika text extraction failed!]]]"`` when the
        request to Tika hits a read timeout.
    """
    # Imported at call time, as in the original code.
    # NOTE(review): presumably so this module stays importable without tika
    # installed -- confirm.
    from tika import parser

    try:
        # The timeout tuple is forwarded to the HTTP request as
        # (connect timeout, read timeout) in seconds.
        parsed = parser.from_buffer(
            BytesIO(data), requestOptions={"timeout": (1, 100)}
        )
    except ReadTimeout as ex:
        print("Tika timeout:", ex)
        return "[[[Tika text extraction failed!]]]"

    # Tika stores None under "content" when no text was found; normalise it
    # so the declared ``str`` return type always holds.
    return parsed["content"] or ""
0 commit comments