Skip to content

Commit 2d4273e

Browse files
committed
.
1 parent 29f7054 commit 2d4273e

File tree

4 files changed

+10
-29
lines changed

4 files changed

+10
-29
lines changed

backend/onyx/context/search/utils.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from collections.abc import Sequence
33
from typing import TypeVar
44

5+
from nltk.corpus import stopwords # type:ignore
6+
from nltk.tokenize import word_tokenize # type:ignore
57
from sqlalchemy.orm import Session
68

79
from onyx.chat.models import SectionRelevancePiece
@@ -151,9 +153,6 @@ def chunks_or_sections_to_search_docs(
151153

152154

153155
def remove_stop_words_and_punctuation(keywords: list[str]) -> list[str]:
154-
from nltk.corpus import stopwords # type: ignore[import-untyped]
155-
from nltk.tokenize import word_tokenize # type: ignore[import-untyped]
156-
157156
try:
158157
# Re-tokenize using the NLTK tokenizer for better matching
159158
query = " ".join(keywords)

backend/onyx/file_processing/extract_file_text.py

Lines changed: 6 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@
1515
from typing import Any
1616
from typing import IO
1717
from typing import NamedTuple
18-
from typing import Optional
19-
from typing import TYPE_CHECKING
2018
from zipfile import BadZipFile
2119

2220
import chardet
2321
import openpyxl
22+
from markitdown import FileConversionException
23+
from markitdown import MarkItDown
24+
from markitdown import StreamInfo
25+
from markitdown import UnsupportedFormatException
2426
from PIL import Image
2527
from pypdf import PdfReader
2628
from pypdf.errors import PdfStreamError
@@ -35,11 +37,6 @@
3537
from onyx.utils.file_types import WORD_PROCESSING_MIME_TYPE
3638
from onyx.utils.logger import setup_logger
3739

38-
39-
if TYPE_CHECKING:
40-
from markitdown import MarkItDown
41-
42-
4340
logger = setup_logger()
4441

4542
# NOTE(rkuo): Unify this with upload_files_for_chat and file_validation.py
@@ -88,19 +85,17 @@
8885
"image/webp",
8986
]
9087

91-
_MARKITDOWN_CONVERTER: Optional["MarkItDown"] = None
88+
_MARKITDOWN_CONVERTER: MarkItDown | None = None
9289

9390
KNOWN_OPENPYXL_BUGS = [
9491
"Value must be either numerical or a string containing a wildcard",
9592
"File contains no valid workbook part",
9693
]
9794

9895

99-
def get_markitdown_converter() -> "MarkItDown":
96+
def get_markitdown_converter() -> MarkItDown:
10097
global _MARKITDOWN_CONVERTER
10198
if _MARKITDOWN_CONVERTER is None:
102-
from markitdown import MarkItDown
103-
10499
_MARKITDOWN_CONVERTER = MarkItDown(enable_plugins=False)
105100
return _MARKITDOWN_CONVERTER
106101

@@ -362,12 +357,6 @@ def docx_to_text_and_images(
362357
of avoiding materializing the list of images in memory.
363358
The images list returned is empty in this case.
364359
"""
365-
from markitdown import (
366-
FileConversionException,
367-
StreamInfo,
368-
UnsupportedFormatException,
369-
)
370-
371360
md = get_markitdown_converter()
372361
try:
373362
doc = md.convert(
@@ -404,12 +393,6 @@ def docx_to_text_and_images(
404393

405394

406395
def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
407-
from markitdown import (
408-
FileConversionException,
409-
StreamInfo,
410-
UnsupportedFormatException,
411-
)
412-
413396
md = get_markitdown_converter()
414397
stream_info = StreamInfo(
415398
mimetype=PRESENTATION_MIME_TYPE, filename=file_name or None, extension=".pptx"

backend/onyx/kg/clustering/normalizations.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import cast
44

55
import numpy as np
6+
from nltk import ngrams # type: ignore
67
from rapidfuzz.distance.DamerauLevenshtein import normalized_similarity
78
from sqlalchemy import desc
89
from sqlalchemy import Float
@@ -58,8 +59,6 @@ def _normalize_one_entity(
5859
attributes: dict[str, str],
5960
allowed_docs_temp_view_name: str | None = None,
6061
) -> str | None:
61-
from nltk import ngrams
62-
6362
"""
6463
Matches a single entity to the best matching entity of the same type.
6564
"""

backend/scripts/check_lazy_imports.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
logger = logging.getLogger(__name__)
1717

18-
_MODULES_TO_LAZY_IMPORT = {"vertexai", "markitdown"}
18+
_MODULES_TO_LAZY_IMPORT = {"vertexai"}
1919

2020

2121
@dataclass

0 commit comments

Comments
 (0)