|
| 1 | +import io |
1 | 2 | import json
|
2 | 3 | import mimetypes
|
3 | 4 | import os
|
|
101 | 102 | from onyx.db.models import IndexingStatus
|
102 | 103 | from onyx.db.models import User
|
103 | 104 | from onyx.db.models import UserGroup__ConnectorCredentialPair
|
104 |
| -from onyx.file_processing.extract_file_text import convert_docx_to_txt |
| 105 | +from onyx.file_processing.extract_file_text import extract_file_text |
105 | 106 | from onyx.file_store.file_store import get_default_file_store
|
| 107 | +from onyx.file_store.models import ChatFileType |
106 | 108 | from onyx.key_value_store.interface import KvKeyNotFoundError
|
107 | 109 | from onyx.server.documents.models import AuthStatus
|
108 | 110 | from onyx.server.documents.models import AuthUrl
|
|
124 | 126 | from onyx.server.documents.models import ObjectCreationIdResponse
|
125 | 127 | from onyx.server.documents.models import RunConnectorRequest
|
126 | 128 | from onyx.server.models import StatusResponse
|
| 129 | +from onyx.server.query_and_chat.chat_utils import mime_type_to_chat_file_type |
127 | 130 | from onyx.utils.logger import setup_logger
|
128 | 131 | from onyx.utils.telemetry import create_milestone_and_report
|
129 | 132 | from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
|
@@ -484,15 +487,17 @@ def should_process_file(file_path: str) -> bool:
|
484 | 487 | deduped_file_names.append(os.path.basename(file_info))
|
485 | 488 | continue
|
486 | 489 |
|
487 |
| - # For mypy, actual check happens at start of function |
488 |
| - assert file.filename is not None |
489 |
| - |
490 |
| - # Special handling for docx files - only store the plaintext version |
491 |
| - if file.content_type and file.content_type.startswith( |
492 |
| - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
493 |
| - ): |
494 |
| - docx_file_id = convert_docx_to_txt(file, file_store) |
495 |
| - deduped_file_paths.append(docx_file_id) |
| 490 | + # Special handling for doc files - only store the plaintext version |
| 491 | + file_type = mime_type_to_chat_file_type(file.content_type) |
| 492 | + if file_type == ChatFileType.DOC: |
| 493 | + extracted_text = extract_file_text(file.file, file.filename or "") |
| 494 | + text_file_id = file_store.save_file( |
| 495 | + content=io.BytesIO(extracted_text.encode()), |
| 496 | + display_name=file.filename, |
| 497 | + file_origin=FileOrigin.CHAT_UPLOAD, |
| 498 | + file_type="text/plain", |
| 499 | + ) |
| 500 | + deduped_file_paths.append(text_file_id) |
496 | 501 | deduped_file_names.append(file.filename)
|
497 | 502 | continue
|
498 | 503 |
|
|
0 commit comments