enhance file processing with content type handling (#5196)

Subash-Mohan · web-flow · commit 46f3af4f6825 · 2025-08-14T08:59:53.000Z
diff --git a/backend/onyx/connectors/file/connector.py b/backend/onyx/connectors/file/connector.py
@@ -72,6 +72,7 @@ def _process_file(
     file: IO[Any],
     metadata: dict[str, Any] | None,
     pdf_pass: str | None,
+    file_type: str | None,
 ) -> list[Document]:
     """
     Process a file and return a list of Documents.
@@ -148,6 +149,7 @@ def _process_file(
         file=file,
         file_name=file_name,
         pdf_pass=pdf_pass,
+        content_type=file_type,
     )
 
     # Each file may have file-specific ONYX_METADATA https://docs.onyx.app/connectors/file
@@ -278,6 +280,7 @@ def load_from_state(self) -> GenerateDocumentsOutput:
                 file=file_io,
                 metadata=metadata,
                 pdf_pass=self.pdf_pass,
+                file_type=file_record.file_type,
             )
             documents.extend(new_docs)
 
diff --git a/backend/onyx/file_processing/extract_file_text.py b/backend/onyx/file_processing/extract_file_text.py
@@ -29,6 +29,7 @@
 from onyx.configs.constants import FileOrigin
 from onyx.configs.constants import ONYX_METADATA_FILENAME
 from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
+from onyx.file_processing.file_validation import TEXT_MIME_TYPE
 from onyx.file_processing.html_utils import parse_html_page_basic
 from onyx.file_processing.unstructured import get_unstructured_api_key
 from onyx.file_processing.unstructured import unstructured_to_text
@@ -492,10 +493,23 @@ class ExtractionResult(NamedTuple):
     metadata: dict[str, Any]
 
 
+def extract_result_from_text_file(file: IO[Any]) -> ExtractionResult:  # shared plain-text extraction path
+    encoding = detect_encoding(file)  # detected from the stream, then passed to read_text_file
+    text_content_raw, file_metadata = read_text_file(
+        file, encoding=encoding, ignore_onyx_metadata=False
+    )
+    return ExtractionResult(
+        text_content=text_content_raw,
+        embedded_images=[],  # this path never yields embedded images
+        metadata=file_metadata,  # metadata as returned by read_text_file
+    )
+
+
 def extract_text_and_images(
     file: IO[Any],
     file_name: str,
     pdf_pass: str | None = None,
+    content_type: str | None = None,
 ) -> ExtractionResult:
     """
     Primary new function for the updated connector.
@@ -516,6 +530,13 @@ def extract_text_and_images(
             )
             file.seek(0)  # Reset file pointer just in case
 
+    # When a document is uploaded via a connector or MyDocuments, files whose content type is in
+    # UploadMimeTypes.DOCUMENT_MIME_TYPES have their text extracted and are stored as plain text.
+    # As a result, the file name extension may no longer match the stored content type.
+    # Files with a plain text content type are therefore handled first, before extension-based processing.
+    if content_type == TEXT_MIME_TYPE:
+        return extract_result_from_text_file(file)
+
     # Default processing
     try:
         extension = get_file_ext(file_name)
@@ -574,15 +595,7 @@ def extract_text_and_images(
 
         # If we reach here and it's a recognized text extension
         if is_text_file_extension(file_name):
-            encoding = detect_encoding(file)
-            text_content_raw, file_metadata = read_text_file(
-                file, encoding=encoding, ignore_onyx_metadata=False
-            )
-            return ExtractionResult(
-                text_content=text_content_raw,
-                embedded_images=[],
-                metadata=file_metadata,
-            )
+            return extract_result_from_text_file(file)
 
         # If it's an image file or something else, we do not parse embedded images from them
         # just return empty text
diff --git a/backend/onyx/file_processing/file_validation.py b/backend/onyx/file_processing/file_validation.py
@@ -21,6 +21,9 @@
     "image/avif",
 ]
 
+# Text MIME types
+TEXT_MIME_TYPE = "text/plain"
+
 
 def is_valid_image_type(mime_type: str) -> bool:
     """

Original file line number	Diff line number	Diff line change
`@@ -21,6 +21,9 @@`
`21`	`21`	`"image/avif",`
`22`	`22`	`]`
`23`	`23`
	`24`	`+# Text MIME types`
	`25`	`+TEXT_MIME_TYPE = "text/plain"`
	`26`	`+`
`24`	`27`
`25`	`28`	`def is_valid_image_type(mime_type: str) -> bool:`
`26`	`29`	`"""`