don't fail on fake files (onyx-dot-app#4735)

evan-onyx · web-flow · commit eecc2dac2f76 · 2025-05-19T23:09:34.000Z
* don't fail on fake files

* solve at the source

* oops

* oops2
diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py
@@ -164,15 +164,15 @@ def _download_and_extract_sections_basic(
     elif (
         mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ):
-        text = xlsx_to_text(io.BytesIO(response_call()))
-        return [TextSection(link=link, text=text)]
+        text = xlsx_to_text(io.BytesIO(response_call()), file_name=file_name)
+        return [TextSection(link=link, text=text)] if text else []
 
     elif (
         mime_type
         == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
     ):
-        text = pptx_to_text(io.BytesIO(response_call()))
-        return [TextSection(link=link, text=text)]
+        text = pptx_to_text(io.BytesIO(response_call()), file_name=file_name)
+        return [TextSection(link=link, text=text)] if text else []
 
     elif is_gdrive_image_mime_type(mime_type):
         # For images, store them for later processing
diff --git a/backend/onyx/file_processing/extract_file_text.py b/backend/onyx/file_processing/extract_file_text.py
@@ -15,6 +15,7 @@
 from typing import Any
 from typing import IO
 from typing import NamedTuple
+from zipfile import BadZipFile
 
 import chardet
 import docx  # type: ignore
@@ -332,8 +333,13 @@ def docx_to_text_and_images(
     return text_content, embedded_images
 
 
-def pptx_to_text(file: IO[Any]) -> str:
-    presentation = pptx.Presentation(file)
+def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
+    try:
+        presentation = pptx.Presentation(file)
+    except BadZipFile as e:
+        error_str = f"Failed to extract text from {file_name or 'pptx file'}: {e}"
+        logger.warning(error_str)
+        return ""
     text_content = []
     for slide_number, slide in enumerate(presentation.slides, start=1):
         slide_text = f"\nSlide {slide_number}:\n"
@@ -344,8 +350,17 @@ def pptx_to_text(file: IO[Any]) -> str:
     return TEXT_SECTION_SEPARATOR.join(text_content)
 
 
-def xlsx_to_text(file: IO[Any]) -> str:
-    workbook = openpyxl.load_workbook(file, read_only=True)
+def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
+    try:
+        workbook = openpyxl.load_workbook(file, read_only=True)
+    except BadZipFile as e:
+        error_str = f"Failed to extract text from {file_name or 'xlsx file'}: {e}"
+        if file_name.startswith("~"):
+            logger.debug(error_str + " (this is expected for files with ~)")
+        else:
+            logger.warning(error_str)
+        return ""
+
     text_content = []
     for sheet in workbook.worksheets:
         rows = []
@@ -504,13 +519,17 @@ def extract_text_and_images(
         if extension == ".pptx":
             file.seek(0)
             return ExtractionResult(
-                text_content=pptx_to_text(file), embedded_images=[], metadata={}
+                text_content=pptx_to_text(file, file_name=file_name),
+                embedded_images=[],
+                metadata={},
             )
 
         if extension == ".xlsx":
             file.seek(0)
             return ExtractionResult(
-                text_content=xlsx_to_text(file), embedded_images=[], metadata={}
+                text_content=xlsx_to_text(file, file_name=file_name),
+                embedded_images=[],
+                metadata={},
             )
 
         if extension == ".eml":