15
15
from typing import Any
16
16
from typing import IO
17
17
from typing import NamedTuple
18
+ from zipfile import BadZipFile
18
19
19
20
import chardet
20
21
import docx # type: ignore
@@ -332,8 +333,13 @@ def docx_to_text_and_images(
332
333
return text_content , embedded_images
333
334
334
335
335
- def pptx_to_text (file : IO [Any ]) -> str :
336
- presentation = pptx .Presentation (file )
336
+ def pptx_to_text (file : IO [Any ], file_name : str = "" ) -> str :
337
+ try :
338
+ presentation = pptx .Presentation (file )
339
+ except BadZipFile as e :
340
+ error_str = f"Failed to extract text from { file_name or 'pptx file' } : { e } "
341
+ logger .warning (error_str )
342
+ return ""
337
343
text_content = []
338
344
for slide_number , slide in enumerate (presentation .slides , start = 1 ):
339
345
slide_text = f"\n Slide { slide_number } :\n "
@@ -344,8 +350,17 @@ def pptx_to_text(file: IO[Any]) -> str:
344
350
return TEXT_SECTION_SEPARATOR .join (text_content )
345
351
346
352
347
- def xlsx_to_text (file : IO [Any ]) -> str :
348
- workbook = openpyxl .load_workbook (file , read_only = True )
353
+ def xlsx_to_text (file : IO [Any ], file_name : str = "" ) -> str :
354
+ try :
355
+ workbook = openpyxl .load_workbook (file , read_only = True )
356
+ except BadZipFile as e :
357
+ error_str = f"Failed to extract text from { file_name or 'xlsx file' } : { e } "
358
+ if file_name .startswith ("~" ):
359
+ logger .debug (error_str + " (this is expected for files with ~)" )
360
+ else :
361
+ logger .warning (error_str )
362
+ return ""
363
+
349
364
text_content = []
350
365
for sheet in workbook .worksheets :
351
366
rows = []
@@ -504,13 +519,17 @@ def extract_text_and_images(
504
519
if extension == ".pptx" :
505
520
file .seek (0 )
506
521
return ExtractionResult (
507
- text_content = pptx_to_text (file ), embedded_images = [], metadata = {}
522
+ text_content = pptx_to_text (file , file_name = file_name ),
523
+ embedded_images = [],
524
+ metadata = {},
508
525
)
509
526
510
527
if extension == ".xlsx" :
511
528
file .seek (0 )
512
529
return ExtractionResult (
513
- text_content = xlsx_to_text (file ), embedded_images = [], metadata = {}
530
+ text_content = xlsx_to_text (file , file_name = file_name ),
531
+ embedded_images = [],
532
+ metadata = {},
514
533
)
515
534
516
535
if extension == ".eml" :
0 commit comments