|
15 | 15 | from typing import Any
|
16 | 16 | from typing import IO
|
17 | 17 | from typing import NamedTuple
|
18 |
| -from typing import Optional |
19 |
| -from typing import TYPE_CHECKING |
20 | 18 | from zipfile import BadZipFile
|
21 | 19 |
|
22 | 20 | import chardet
|
23 | 21 | import openpyxl
|
| 22 | +from markitdown import FileConversionException |
| 23 | +from markitdown import MarkItDown |
| 24 | +from markitdown import StreamInfo |
| 25 | +from markitdown import UnsupportedFormatException |
24 | 26 | from PIL import Image
|
25 | 27 | from pypdf import PdfReader
|
26 | 28 | from pypdf.errors import PdfStreamError
|
|
35 | 37 | from onyx.utils.file_types import WORD_PROCESSING_MIME_TYPE
|
36 | 38 | from onyx.utils.logger import setup_logger
|
37 | 39 |
|
38 |
| - |
39 |
| -if TYPE_CHECKING: |
40 |
| - from markitdown import MarkItDown |
41 |
| - |
42 |
| - |
43 | 40 | logger = setup_logger()
|
44 | 41 |
|
45 | 42 | # NOTE(rkuo): Unify this with upload_files_for_chat and file_validation.py
|
|
88 | 85 | "image/webp",
|
89 | 86 | ]
|
90 | 87 |
|
91 |
| -_MARKITDOWN_CONVERTER: Optional["MarkItDown"] = None |
| 88 | +_MARKITDOWN_CONVERTER: MarkItDown | None = None |
92 | 89 |
|
93 | 90 | KNOWN_OPENPYXL_BUGS = [
|
94 | 91 | "Value must be either numerical or a string containing a wildcard",
|
95 | 92 | "File contains no valid workbook part",
|
96 | 93 | ]
|
97 | 94 |
|
98 | 95 |
|
99 |
| -def get_markitdown_converter() -> "MarkItDown": |
| 96 | +def get_markitdown_converter() -> MarkItDown: |
100 | 97 | global _MARKITDOWN_CONVERTER
|
101 | 98 | if _MARKITDOWN_CONVERTER is None:
|
102 |
| - from markitdown import MarkItDown |
103 |
| - |
104 | 99 | _MARKITDOWN_CONVERTER = MarkItDown(enable_plugins=False)
|
105 | 100 | return _MARKITDOWN_CONVERTER
|
106 | 101 |
|
@@ -362,12 +357,6 @@ def docx_to_text_and_images(
|
362 | 357 | of avoiding materializing the list of images in memory.
|
363 | 358 | The images list returned is empty in this case.
|
364 | 359 | """
|
365 |
| - from markitdown import ( |
366 |
| - FileConversionException, |
367 |
| - StreamInfo, |
368 |
| - UnsupportedFormatException, |
369 |
| - ) |
370 |
| - |
371 | 360 | md = get_markitdown_converter()
|
372 | 361 | try:
|
373 | 362 | doc = md.convert(
|
@@ -404,12 +393,6 @@ def docx_to_text_and_images(
|
404 | 393 |
|
405 | 394 |
|
406 | 395 | def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
|
407 |
| - from markitdown import ( |
408 |
| - FileConversionException, |
409 |
| - StreamInfo, |
410 |
| - UnsupportedFormatException, |
411 |
| - ) |
412 |
| - |
413 | 396 | md = get_markitdown_converter()
|
414 | 397 | stream_info = StreamInfo(
|
415 | 398 | mimetype=PRESENTATION_MIME_TYPE, filename=file_name or None, extension=".pptx"
|
|
0 commit comments