29
29
from onyx .configs .constants import FileOrigin
30
30
from onyx .configs .constants import ONYX_METADATA_FILENAME
31
31
from onyx .configs .llm_configs import get_image_extraction_and_analysis_enabled
32
+ from onyx .file_processing .file_validation import TEXT_MIME_TYPE
32
33
from onyx .file_processing .html_utils import parse_html_page_basic
33
34
from onyx .file_processing .unstructured import get_unstructured_api_key
34
35
from onyx .file_processing .unstructured import unstructured_to_text
@@ -492,10 +493,23 @@ class ExtractionResult(NamedTuple):
492
493
metadata : dict [str , Any ]
493
494
494
495
496
def extract_result_from_text_file(file: IO[Any]) -> ExtractionResult:
    """Build an ExtractionResult from a file that is read as plain text.

    The encoding reported by detect_encoding is handed to read_text_file,
    which is called with ignore_onyx_metadata=False; the text and metadata
    it returns are passed through unchanged.

    Args:
        file: File-like object to read.

    Returns:
        An ExtractionResult holding the text read from the file, an empty
        list of embedded images, and the metadata returned by
        read_text_file.
    """
    detected_encoding = detect_encoding(file)
    raw_text, parsed_metadata = read_text_file(
        file,
        encoding=detected_encoding,
        ignore_onyx_metadata=False,
    )
    # This path never produces embedded images.
    return ExtractionResult(
        text_content=raw_text,
        embedded_images=[],
        metadata=parsed_metadata,
    )
506
+
507
+
495
508
def extract_text_and_images (
496
509
file : IO [Any ],
497
510
file_name : str ,
498
511
pdf_pass : str | None = None ,
512
+ content_type : str | None = None ,
499
513
) -> ExtractionResult :
500
514
"""
501
515
Primary new function for the updated connector.
@@ -516,6 +530,13 @@ def extract_text_and_images(
516
530
)
517
531
file .seek (0 ) # Reset file pointer just in case
518
532
533
+ # When we upload a document via a connector or MyDocuments, we extract and store the content of files
534
+ # with content types in UploadMimeTypes.DOCUMENT_MIME_TYPES as plain text files.
535
+ # As a result, the file name extension may differ from the original content type.
536
+ # We process files with a plain text content type first to handle this scenario.
537
+ if content_type == TEXT_MIME_TYPE :
538
+ return extract_result_from_text_file (file )
539
+
519
540
# Default processing
520
541
try :
521
542
extension = get_file_ext (file_name )
@@ -574,15 +595,7 @@ def extract_text_and_images(
574
595
575
596
# If we reach here and it's a recognized text extension
576
597
if is_text_file_extension (file_name ):
577
- encoding = detect_encoding (file )
578
- text_content_raw , file_metadata = read_text_file (
579
- file , encoding = encoding , ignore_onyx_metadata = False
580
- )
581
- return ExtractionResult (
582
- text_content = text_content_raw ,
583
- embedded_images = [],
584
- metadata = file_metadata ,
585
- )
598
+ return extract_result_from_text_file (file )
586
599
587
600
# If it's an image file or something else, we do not parse embedded images from them
588
601
# just return empty text
0 commit comments