Skip to content

Commit dbe51e2

Browse files
evan-onyx authored and Your Name committed
don't fail on fake files (onyx-dot-app#4735)
* don't fail on fake files
* solve at the source
* oops
* oops2
1 parent f315cff commit dbe51e2

File tree

2 files changed

+29
-10
lines changed

2 files changed

+29
-10
lines changed

backend/onyx/connectors/google_drive/doc_conversion.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -164,15 +164,15 @@ def _download_and_extract_sections_basic(
164164
elif (
165165
mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
166166
):
167-
text = xlsx_to_text(io.BytesIO(response_call()))
168-
return [TextSection(link=link, text=text)]
167+
text = xlsx_to_text(io.BytesIO(response_call()), file_name=file_name)
168+
return [TextSection(link=link, text=text)] if text else []
169169

170170
elif (
171171
mime_type
172172
== "application/vnd.openxmlformats-officedocument.presentationml.presentation"
173173
):
174-
text = pptx_to_text(io.BytesIO(response_call()))
175-
return [TextSection(link=link, text=text)]
174+
text = pptx_to_text(io.BytesIO(response_call()), file_name=file_name)
175+
return [TextSection(link=link, text=text)] if text else []
176176

177177
elif is_gdrive_image_mime_type(mime_type):
178178
# For images, store them for later processing

backend/onyx/file_processing/extract_file_text.py

Lines changed: 25 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
from typing import Any
1616
from typing import IO
1717
from typing import NamedTuple
18+
from zipfile import BadZipFile
1819

1920
import chardet
2021
import docx # type: ignore
@@ -332,8 +333,13 @@ def docx_to_text_and_images(
332333
return text_content, embedded_images
333334

334335

335-
def pptx_to_text(file: IO[Any]) -> str:
336-
presentation = pptx.Presentation(file)
336+
def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
337+
try:
338+
presentation = pptx.Presentation(file)
339+
except BadZipFile as e:
340+
error_str = f"Failed to extract text from {file_name or 'pptx file'}: {e}"
341+
logger.warning(error_str)
342+
return ""
337343
text_content = []
338344
for slide_number, slide in enumerate(presentation.slides, start=1):
339345
slide_text = f"\nSlide {slide_number}:\n"
@@ -344,8 +350,17 @@ def pptx_to_text(file: IO[Any]) -> str:
344350
return TEXT_SECTION_SEPARATOR.join(text_content)
345351

346352

347-
def xlsx_to_text(file: IO[Any]) -> str:
348-
workbook = openpyxl.load_workbook(file, read_only=True)
353+
def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
354+
try:
355+
workbook = openpyxl.load_workbook(file, read_only=True)
356+
except BadZipFile as e:
357+
error_str = f"Failed to extract text from {file_name or 'xlsx file'}: {e}"
358+
if file_name.startswith("~"):
359+
logger.debug(error_str + " (this is expected for files with ~)")
360+
else:
361+
logger.warning(error_str)
362+
return ""
363+
349364
text_content = []
350365
for sheet in workbook.worksheets:
351366
rows = []
@@ -504,13 +519,17 @@ def extract_text_and_images(
504519
if extension == ".pptx":
505520
file.seek(0)
506521
return ExtractionResult(
507-
text_content=pptx_to_text(file), embedded_images=[], metadata={}
522+
text_content=pptx_to_text(file, file_name=file_name),
523+
embedded_images=[],
524+
metadata={},
508525
)
509526

510527
if extension == ".xlsx":
511528
file.seek(0)
512529
return ExtractionResult(
513-
text_content=xlsx_to_text(file), embedded_images=[], metadata={}
530+
text_content=xlsx_to_text(file, file_name=file_name),
531+
embedded_images=[],
532+
metadata={},
514533
)
515534

516535
if extension == ".eml":

0 commit comments

Comments
 (0)