1
+ import gc
1
2
import io
2
3
import json
3
4
import os
17
18
from zipfile import BadZipFile
18
19
19
20
import chardet
21
+ import openpyxl
20
22
from markitdown import FileConversionException
21
23
from markitdown import MarkItDown
24
+ from markitdown import StreamInfo
22
25
from markitdown import UnsupportedFormatException
23
26
from PIL import Image
24
27
from pypdf import PdfReader
30
33
from onyx .file_processing .html_utils import parse_html_page_basic
31
34
from onyx .file_processing .unstructured import get_unstructured_api_key
32
35
from onyx .file_processing .unstructured import unstructured_to_text
36
+ from onyx .utils .file_types import PRESENTATION_MIME_TYPE
37
+ from onyx .utils .file_types import WORD_PROCESSING_MIME_TYPE
33
38
from onyx .utils .logger import setup_logger
34
39
35
40
logger = setup_logger ()
80
85
"image/webp" ,
81
86
]
82
87
88
# Process-wide MarkItDown instance; stays None until the first call to
# get_markitdown_converter() creates it.
_MARKITDOWN_CONVERTER: MarkItDown | None = None

# Substrings of exception messages raised while loading a workbook that are
# attributed to openpyxl bugs. xlsx_to_text() logs these and returns an empty
# string instead of propagating the exception.
KNOWN_OPENPYXL_BUGS = [
    "Value must be either numerical or a string containing a wildcard",
    "File contains no valid workbook part",
]
94
+
95
+
96
def get_markitdown_converter() -> MarkItDown:
    """Return the shared MarkItDown converter, creating it on first use.

    The instance is stored in the module-level ``_MARKITDOWN_CONVERTER`` so
    that later calls reuse it instead of constructing a new converter. It is
    created with plugins disabled.
    """
    global _MARKITDOWN_CONVERTER

    converter = _MARKITDOWN_CONVERTER
    if converter is not None:
        return converter

    # First call: build the converter and remember it for subsequent callers.
    converter = MarkItDown(enable_plugins=False)
    _MARKITDOWN_CONVERTER = converter
    return converter
101
+
83
102
84
103
class OnyxExtensionType (IntFlag ):
85
104
Plain = auto ()
@@ -338,9 +357,11 @@ def docx_to_text_and_images(
338
357
of avoiding materializing the list of images in memory.
339
358
The images list returned is empty in this case.
340
359
"""
341
- md = MarkItDown ( enable_plugins = False )
360
+ md = get_markitdown_converter ( )
342
361
try :
343
- doc = md .convert (to_bytesio (file ))
362
+ doc = md .convert (
363
+ to_bytesio (file ), stream_info = StreamInfo (mimetype = WORD_PROCESSING_MIME_TYPE )
364
+ )
344
365
except (
345
366
BadZipFile ,
346
367
ValueError ,
@@ -372,9 +393,12 @@ def docx_to_text_and_images(
372
393
373
394
374
395
def pptx_to_text (file : IO [Any ], file_name : str = "" ) -> str :
375
- md = MarkItDown (enable_plugins = False )
396
+ md = get_markitdown_converter ()
397
+ stream_info = StreamInfo (
398
+ mimetype = PRESENTATION_MIME_TYPE , filename = file_name or None , extension = ".pptx"
399
+ )
376
400
try :
377
- presentation = md .convert (to_bytesio (file ))
401
+ presentation = md .convert (to_bytesio (file ), stream_info = stream_info )
378
402
except (
379
403
BadZipFile ,
380
404
ValueError ,
@@ -388,23 +412,69 @@ def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
388
412
389
413
390
414
def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
    """Extract the cell values of an .xlsx workbook as comma-separated text.

    Every worksheet is rendered as one line per non-empty row, with the row's
    cell values joined by commas. The rendered sheets are then joined with
    ``TEXT_SECTION_SEPARATOR``.

    Args:
        file: Binary file-like object holding the workbook.
        file_name: Name of the file; used only in log messages and to
            recognise "~"-prefixed files, whose load failures are logged at
            debug level.

    Returns:
        The extracted text, or "" when the file is not a valid zip archive or
        loading it fails with one of the ``KNOWN_OPENPYXL_BUGS`` messages.

    Raises:
        Exception: Any other error raised while loading the workbook is
            re-raised unchanged.
    """
    # TODO: switch back to MarkItDown in a few months when markitdown fixes
    # their handling of excel files, i.e. call
    #   get_markitdown_converter().convert(to_bytesio(file), stream_info=...)
    # with StreamInfo(mimetype=SPREADSHEET_MIME_TYPE,
    #                 filename=file_name or None, extension=".xlsx"),
    # catch (BadZipFile, ValueError, FileConversionException,
    # UnsupportedFormatException) with the same logging as the BadZipFile
    # branch below, and return the result's ``.markdown``.
    try:
        workbook = openpyxl.load_workbook(file, read_only=True)
    except BadZipFile as e:
        error_str = f"Failed to extract text from {file_name or 'xlsx file'}: {e}"
        if file_name.startswith("~"):
            logger.debug(error_str + " (this is expected for files with ~)")
        else:
            logger.warning(error_str)
        return ""
    except Exception as e:
        if any(s in str(e) for s in KNOWN_OPENPYXL_BUGS):
            logger.error(
                f"Failed to extract text from {file_name or 'xlsx file'}. This happens due to a bug in openpyxl. {e}"
            )
            return ""
        # Bare raise keeps the original traceback intact.
        raise

    text_content: list[str] = []
    try:
        for sheet in workbook.worksheets:
            rows: list[str] = []
            num_empty_consecutive_rows = 0
            for row in sheet.iter_rows(min_row=1, values_only=True):
                # Only None means "no value"; 0 and False are real cell
                # contents and must not be blanked out.
                cells = ["" if cell is None else str(cell) for cell in row]

                # Only add the row if there are any values in the cells.
                # A row with no cells at all counts as empty too.
                if any(cells):
                    rows.append(",".join(cells))
                    num_empty_consecutive_rows = 0
                    continue

                num_empty_consecutive_rows += 1
                if num_empty_consecutive_rows > 100:
                    # handle massive excel sheets with mostly empty cells;
                    # this stops reading the current sheet only
                    logger.warning(
                        f"Found {num_empty_consecutive_rows} empty rows in {file_name},"
                        " skipping rest of file"
                    )
                    break
            text_content.append("\n".join(rows))
    finally:
        # Read-only workbooks load lazily and must be closed explicitly to
        # release the underlying archive.
        workbook.close()

    return TEXT_SECTION_SEPARATOR.join(text_content)
408
478
409
479
410
480
def eml_to_text (file : IO [Any ]) -> str :
@@ -531,6 +601,23 @@ def extract_text_and_images(
531
601
Primary new function for the updated connector.
532
602
Returns structured extraction result with text content, embedded images, and metadata.
533
603
"""
604
+ res = _extract_text_and_images (
605
+ file , file_name , pdf_pass , content_type , image_callback
606
+ )
607
+ # Clean up any temporary objects and force garbage collection
608
+ unreachable = gc .collect ()
609
+ logger .info (f"Unreachable objects: { unreachable } " )
610
+
611
+ return res
612
+
613
+
614
+ def _extract_text_and_images (
615
+ file : IO [Any ],
616
+ file_name : str ,
617
+ pdf_pass : str | None = None ,
618
+ content_type : str | None = None ,
619
+ image_callback : Callable [[bytes , str ], None ] | None = None ,
620
+ ) -> ExtractionResult :
534
621
file .seek (0 )
535
622
536
623
if get_unstructured_api_key ():
@@ -556,7 +643,6 @@ def extract_text_and_images(
556
643
# Default processing
557
644
try :
558
645
extension = get_file_ext (file_name )
559
-
560
646
# docx example for embedded images
561
647
if extension == ".docx" :
562
648
text_content , images = docx_to_text_and_images (
0 commit comments