Skip to content

Commit 46f3af4

Browse files
authored
enhance file processing with content type handling (#5196)
1 parent 2af64eb commit 46f3af4

File tree

3 files changed

+28
-9
lines changed

3 files changed

+28
-9
lines changed

backend/onyx/connectors/file/connector.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def _process_file(
7272
file: IO[Any],
7373
metadata: dict[str, Any] | None,
7474
pdf_pass: str | None,
75+
file_type: str | None,
7576
) -> list[Document]:
7677
"""
7778
Process a file and return a list of Documents.
@@ -148,6 +149,7 @@ def _process_file(
148149
file=file,
149150
file_name=file_name,
150151
pdf_pass=pdf_pass,
152+
content_type=file_type,
151153
)
152154

153155
# Each file may have file-specific ONYX_METADATA https://docs.onyx.app/connectors/file
@@ -278,6 +280,7 @@ def load_from_state(self) -> GenerateDocumentsOutput:
278280
file=file_io,
279281
metadata=metadata,
280282
pdf_pass=self.pdf_pass,
283+
file_type=file_record.file_type,
281284
)
282285
documents.extend(new_docs)
283286

backend/onyx/file_processing/extract_file_text.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from onyx.configs.constants import FileOrigin
3030
from onyx.configs.constants import ONYX_METADATA_FILENAME
3131
from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
32+
from onyx.file_processing.file_validation import TEXT_MIME_TYPE
3233
from onyx.file_processing.html_utils import parse_html_page_basic
3334
from onyx.file_processing.unstructured import get_unstructured_api_key
3435
from onyx.file_processing.unstructured import unstructured_to_text
@@ -492,10 +493,23 @@ class ExtractionResult(NamedTuple):
492493
metadata: dict[str, Any]
493494

494495

496+
def extract_result_from_text_file(file: IO[Any]) -> ExtractionResult:
    """
    Read a file as text and wrap the outcome in an ExtractionResult.

    The encoding is detected from the file itself before reading. The text
    and metadata returned by read_text_file (called with
    ignore_onyx_metadata=False) are passed through unchanged, and the result
    always carries an empty list of embedded images.
    """
    raw_text, text_file_metadata = read_text_file(
        file,
        # Evaluated before read_text_file runs, so detection still happens first.
        encoding=detect_encoding(file),
        ignore_onyx_metadata=False,
    )
    return ExtractionResult(
        text_content=raw_text,
        embedded_images=[],
        metadata=text_file_metadata,
    )
506+
507+
495508
def extract_text_and_images(
496509
file: IO[Any],
497510
file_name: str,
498511
pdf_pass: str | None = None,
512+
content_type: str | None = None,
499513
) -> ExtractionResult:
500514
"""
501515
Primary new function for the updated connector.
@@ -516,6 +530,13 @@ def extract_text_and_images(
516530
)
517531
file.seek(0) # Reset file pointer just in case
518532

533+
# When we upload a document via a connector or MyDocuments, we extract and store the content of files
534+
# with content types in UploadMimeTypes.DOCUMENT_MIME_TYPES as plain text files.
535+
# As a result, the file name extension may differ from the original content type.
536+
# We process files with a plain text content type first to handle this scenario.
537+
if content_type == TEXT_MIME_TYPE:
538+
return extract_result_from_text_file(file)
539+
519540
# Default processing
520541
try:
521542
extension = get_file_ext(file_name)
@@ -574,15 +595,7 @@ def extract_text_and_images(
574595

575596
# If we reach here and it's a recognized text extension
576597
if is_text_file_extension(file_name):
577-
encoding = detect_encoding(file)
578-
text_content_raw, file_metadata = read_text_file(
579-
file, encoding=encoding, ignore_onyx_metadata=False
580-
)
581-
return ExtractionResult(
582-
text_content=text_content_raw,
583-
embedded_images=[],
584-
metadata=file_metadata,
585-
)
598+
return extract_result_from_text_file(file)
586599

587600
# If it's an image file or something else, we do not parse embedded images from them
588601
# just return empty text

backend/onyx/file_processing/file_validation.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
"image/avif",
2222
]
2323

24+
# MIME type for plain-text content
25+
TEXT_MIME_TYPE = "text/plain"
26+
2427

2528
def is_valid_image_type(mime_type: str) -> bool:
2629
"""

0 commit comments

Comments
 (0)