Skip to content

Commit d3c5a4f

Browse files
Wenxi OnyxWeves
authored and committed
add docx fallback
1 parent f50006e commit d3c5a4f

File tree

1 file changed

+11
-2
lines changed

1 file changed

+11
-2
lines changed

backend/onyx/file_processing/extract_file_text.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,8 +313,17 @@ def docx_to_text_and_images(
313 313
try:
314 314
doc = docx.Document(file)
315 315
except BadZipFile as e:
316-
logger.warning(f"Failed to extract text from {file_name or 'docx file'}: {e}")
317-
return "", []
316+
logger.warning(
317+
f"Failed to extract docx {file_name or 'docx file'}: {e}. Attempting to read as text file."
318+
)
319+
320+
# May be an invalid docx, but still a valid text file
321+
file.seek(0)
322+
encoding = detect_encoding(file)
323+
text_content_raw, _ = read_text_file(
324+
file, encoding=encoding, ignore_onyx_metadata=False
325+
)
326+
return text_content_raw or "", []
318 327

319 328
# Grab text from paragraphs
320 329
for paragraph in doc.paragraphs:

0 commit comments

Comments
 (0)