|
| 1 | +import io |
1 | 2 | import json
|
2 | 3 | import mimetypes
|
3 | 4 | import os
|
|
101 | 102 | from onyx.db.models import IndexingStatus
|
102 | 103 | from onyx.db.models import User
|
103 | 104 | from onyx.db.models import UserGroup__ConnectorCredentialPair
|
104 |
| -from onyx.file_processing.extract_file_text import convert_docx_to_txt |
| 105 | +from onyx.file_processing.extract_file_text import extract_file_text |
105 | 106 | from onyx.file_store.file_store import get_default_file_store
|
| 107 | +from onyx.file_store.models import ChatFileType |
106 | 108 | from onyx.key_value_store.interface import KvKeyNotFoundError
|
107 | 109 | from onyx.server.documents.models import AuthStatus
|
108 | 110 | from onyx.server.documents.models import AuthUrl
|
|
124 | 126 | from onyx.server.documents.models import ObjectCreationIdResponse
|
125 | 127 | from onyx.server.documents.models import RunConnectorRequest
|
126 | 128 | from onyx.server.models import StatusResponse
|
| 129 | +from onyx.server.query_and_chat.chat_utils import mime_type_to_chat_file_type |
127 | 130 | from onyx.utils.logger import setup_logger
|
128 | 131 | from onyx.utils.telemetry import create_milestone_and_report
|
129 | 132 | from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
|
@@ -434,6 +437,7 @@ def should_process_file(file_path: str) -> bool:
|
434 | 437 | file_store = get_default_file_store()
|
435 | 438 | seen_zip = False
|
436 | 439 | for file in files:
|
| 440 | + file_type = mime_type_to_chat_file_type(file.content_type) |
437 | 441 | if file.content_type and file.content_type.startswith("application/zip"):
|
438 | 442 | if seen_zip:
|
439 | 443 | raise HTTPException(status_code=400, detail=SEEN_ZIP_DETAIL)
|
@@ -462,12 +466,16 @@ def should_process_file(file_path: str) -> bool:
|
462 | 466 | deduped_file_paths.append(file_id)
|
463 | 467 | continue
|
464 | 468 |
|
465 |
| - # Special handling for docx files - only store the plaintext version |
466 |
| - if file.content_type and file.content_type.startswith( |
467 |
| - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
468 |
| - ): |
469 |
| - docx_file_id = convert_docx_to_txt(file, file_store) |
470 |
| - deduped_file_paths.append(docx_file_id) |
| 469 | + # Special handling for doc files - only store the plaintext version |
| 470 | + if file_type == ChatFileType.DOC: |
| 471 | + extracted_text = extract_file_text(file.file, file.filename or "") |
| 472 | + text_file_id = file_store.save_file( |
| 473 | + content=io.BytesIO(extracted_text.encode()), |
| 474 | + display_name=file.filename, |
| 475 | + file_origin=FileOrigin.CHAT_UPLOAD, |
| 476 | + file_type="text/plain", |
| 477 | + ) |
| 478 | + deduped_file_paths.append(text_file_id) |
471 | 479 | continue
|
472 | 480 |
|
473 | 481 | # Default handling for all other file types
|
|
0 commit comments