.

edwin-onyx · edwin-onyx · commit 2d4273ea8ee9 · 2025-09-22T20:13:10.000-07:00
diff --git a/backend/onyx/context/search/utils.py b/backend/onyx/context/search/utils.py
@@ -2,6 +2,8 @@
 from collections.abc import Sequence
 from typing import TypeVar
 
+from nltk.corpus import stopwords  # type:ignore
+from nltk.tokenize import word_tokenize  # type:ignore
 from sqlalchemy.orm import Session
 
 from onyx.chat.models import SectionRelevancePiece
@@ -151,9 +153,6 @@ def chunks_or_sections_to_search_docs(
 
 
 def remove_stop_words_and_punctuation(keywords: list[str]) -> list[str]:
-    from nltk.corpus import stopwords  # type: ignore[import-untyped]
-    from nltk.tokenize import word_tokenize  # type: ignore[import-untyped]
-
     try:
         # Re-tokenize using the NLTK tokenizer for better matching
         query = " ".join(keywords)
diff --git a/backend/onyx/file_processing/extract_file_text.py b/backend/onyx/file_processing/extract_file_text.py
@@ -15,12 +15,14 @@
 from typing import Any
 from typing import IO
 from typing import NamedTuple
-from typing import Optional
-from typing import TYPE_CHECKING
 from zipfile import BadZipFile
 
 import chardet
 import openpyxl
+from markitdown import FileConversionException
+from markitdown import MarkItDown
+from markitdown import StreamInfo
+from markitdown import UnsupportedFormatException
 from PIL import Image
 from pypdf import PdfReader
 from pypdf.errors import PdfStreamError
@@ -35,11 +37,6 @@
 from onyx.utils.file_types import WORD_PROCESSING_MIME_TYPE
 from onyx.utils.logger import setup_logger
 
-
-if TYPE_CHECKING:
-    from markitdown import MarkItDown
-
-
 logger = setup_logger()
 
 # NOTE(rkuo): Unify this with upload_files_for_chat and file_valiation.py
@@ -88,19 +85,17 @@
     "image/webp",
 ]
 
-_MARKITDOWN_CONVERTER: Optional["MarkItDown"] = None
+_MARKITDOWN_CONVERTER: MarkItDown | None = None
 
 KNOWN_OPENPYXL_BUGS = [
     "Value must be either numerical or a string containing a wildcard",
     "File contains no valid workbook part",
 ]
 
 
-def get_markitdown_converter() -> "MarkItDown":
+def get_markitdown_converter() -> MarkItDown:
     global _MARKITDOWN_CONVERTER
     if _MARKITDOWN_CONVERTER is None:
-        from markitdown import MarkItDown
-
         _MARKITDOWN_CONVERTER = MarkItDown(enable_plugins=False)
     return _MARKITDOWN_CONVERTER
 
@@ -362,12 +357,6 @@ def docx_to_text_and_images(
     of avoiding materializing the list of images in memory.
     The images list returned is empty in this case.
     """
-    from markitdown import (
-        FileConversionException,
-        StreamInfo,
-        UnsupportedFormatException,
-    )
-
     md = get_markitdown_converter()
     try:
         doc = md.convert(
@@ -404,12 +393,6 @@ def docx_to_text_and_images(
 
 
 def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
-    from markitdown import (
-        FileConversionException,
-        StreamInfo,
-        UnsupportedFormatException,
-    )
-
     md = get_markitdown_converter()
     stream_info = StreamInfo(
         mimetype=PRESENTATION_MIME_TYPE, filename=file_name or None, extension=".pptx"
diff --git a/backend/onyx/kg/clustering/normalizations.py b/backend/onyx/kg/clustering/normalizations.py
@@ -3,6 +3,7 @@
 from typing import cast
 
 import numpy as np
+from nltk import ngrams  # type: ignore
 from rapidfuzz.distance.DamerauLevenshtein import normalized_similarity
 from sqlalchemy import desc
 from sqlalchemy import Float
@@ -58,8 +59,6 @@ def _normalize_one_entity(
     attributes: dict[str, str],
     allowed_docs_temp_view_name: str | None = None,
 ) -> str | None:
-    from nltk import ngrams
-
     """
     Matches a single entity to the best matching entity of the same type.
     """
diff --git a/backend/scripts/check_lazy_imports.py b/backend/scripts/check_lazy_imports.py
@@ -15,7 +15,7 @@
 
 logger = logging.getLogger(__name__)
 
-_MODULES_TO_LAZY_IMPORT = {"vertexai", "markitdown"}
+_MODULES_TO_LAZY_IMPORT = {"vertexai"}
 
 
 @dataclass