fix: sharepoint memory via excel parsing (#5444)

evan-onyx · web-flow · commit f4d135d710fd · 2025-09-19T17:10:27.000-07:00
diff --git a/backend/onyx/connectors/sharepoint/connector.py b/backend/onyx/connectors/sharepoint/connector.py
@@ -1029,7 +1029,7 @@ def _fetch_site_pages(
 
         # Filter pages based on time window if specified
         if start is not None or end is not None:
-            filtered_pages = []
+            filtered_pages: list[dict[str, Any]] = []
             for page in all_pages:
                 page_modified = page.get("lastModifiedDateTime")
                 if page_modified:
diff --git a/backend/onyx/file_processing/extract_file_text.py b/backend/onyx/file_processing/extract_file_text.py
@@ -1,3 +1,4 @@
+import gc
 import io
 import json
 import os
@@ -17,8 +18,10 @@
 from zipfile import BadZipFile
 
 import chardet
+import openpyxl
 from markitdown import FileConversionException
 from markitdown import MarkItDown
+from markitdown import StreamInfo
 from markitdown import UnsupportedFormatException
 from PIL import Image
 from pypdf import PdfReader
@@ -30,6 +33,8 @@
 from onyx.file_processing.html_utils import parse_html_page_basic
 from onyx.file_processing.unstructured import get_unstructured_api_key
 from onyx.file_processing.unstructured import unstructured_to_text
+from onyx.utils.file_types import PRESENTATION_MIME_TYPE
+from onyx.utils.file_types import WORD_PROCESSING_MIME_TYPE
 from onyx.utils.logger import setup_logger
 
 logger = setup_logger()
@@ -80,6 +85,20 @@
     "image/webp",
 ]
 
+_MARKITDOWN_CONVERTER: MarkItDown | None = None
+
+KNOWN_OPENPYXL_BUGS = [
+    "Value must be either numerical or a string containing a wildcard",
+    "File contains no valid workbook part",
+]
+
+
+def get_markitdown_converter() -> MarkItDown:
+    global _MARKITDOWN_CONVERTER
+    if _MARKITDOWN_CONVERTER is None:
+        _MARKITDOWN_CONVERTER = MarkItDown(enable_plugins=False)
+    return _MARKITDOWN_CONVERTER
+
 
 class OnyxExtensionType(IntFlag):
     Plain = auto()
@@ -338,9 +357,11 @@ def docx_to_text_and_images(
     of avoiding materializing the list of images in memory.
     The images list returned is empty in this case.
     """
-    md = MarkItDown(enable_plugins=False)
+    md = get_markitdown_converter()
     try:
-        doc = md.convert(to_bytesio(file))
+        doc = md.convert(
+            to_bytesio(file), stream_info=StreamInfo(mimetype=WORD_PROCESSING_MIME_TYPE)
+        )
     except (
         BadZipFile,
         ValueError,
@@ -372,9 +393,12 @@ def docx_to_text_and_images(
 
 
 def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
-    md = MarkItDown(enable_plugins=False)
+    md = get_markitdown_converter()
+    stream_info = StreamInfo(
+        mimetype=PRESENTATION_MIME_TYPE, filename=file_name or None, extension=".pptx"
+    )
     try:
-        presentation = md.convert(to_bytesio(file))
+        presentation = md.convert(to_bytesio(file), stream_info=stream_info)
     except (
         BadZipFile,
         ValueError,
@@ -388,23 +412,69 @@ def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
 
 
 def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
-    md = MarkItDown(enable_plugins=False)
+    # TODO: switch back to the markitdown-based approach (commented out below)
+    # once markitdown fixes its handling of Excel files
+
+    # md = get_markitdown_converter()
+    # stream_info = StreamInfo(
+    #     mimetype=SPREADSHEET_MIME_TYPE, filename=file_name or None, extension=".xlsx"
+    # )
+    # try:
+    #     workbook = md.convert(to_bytesio(file), stream_info=stream_info)
+    # except (
+    #     BadZipFile,
+    #     ValueError,
+    #     FileConversionException,
+    #     UnsupportedFormatException,
+    # ) as e:
+    #     error_str = f"Failed to extract text from {file_name or 'xlsx file'}: {e}"
+    #     if file_name.startswith("~"):
+    #         logger.debug(error_str + " (this is expected for files with ~)")
+    #     else:
+    #         logger.warning(error_str)
+    #     return ""
+    # return workbook.markdown
     try:
-        workbook = md.convert(to_bytesio(file))
-    except (
-        BadZipFile,
-        ValueError,
-        FileConversionException,
-        UnsupportedFormatException,
-    ) as e:
+        workbook = openpyxl.load_workbook(file, read_only=True)
+    except BadZipFile as e:
         error_str = f"Failed to extract text from {file_name or 'xlsx file'}: {e}"
         if file_name.startswith("~"):
             logger.debug(error_str + " (this is expected for files with ~)")
         else:
             logger.warning(error_str)
         return ""
+    except Exception as e:
+        if any(s in str(e) for s in KNOWN_OPENPYXL_BUGS):
+            logger.error(
+                f"Failed to extract text from {file_name or 'xlsx file'}. This happens due to a bug in openpyxl. {e}"
+            )
+            return ""
+        raise e
 
-    return workbook.markdown
+    text_content = []
+    for sheet in workbook.worksheets:
+        rows = []
+        num_empty_consecutive_rows = 0
+        for row in sheet.iter_rows(min_row=1, values_only=True):
+            row_str = ",".join(str(cell or "") for cell in row)
+
+            # Only add the row if any cell has a value (an all-empty row joins to just len(row) - 1 commas)
+            if len(row_str) >= len(row):
+                rows.append(row_str)
+                num_empty_consecutive_rows = 0
+            else:
+                num_empty_consecutive_rows += 1
+
+            if num_empty_consecutive_rows > 100:
+                # handle massive, mostly empty excel sheets: stop reading this sheet (later sheets still run)
+                logger.warning(
+                    f"Found {num_empty_consecutive_rows} empty rows in {file_name},"
+                    " skipping rest of file"
+                )
+                break
+        sheet_str = "\n".join(rows)
+        text_content.append(sheet_str)
+    return TEXT_SECTION_SEPARATOR.join(text_content)
 
 
 def eml_to_text(file: IO[Any]) -> str:
@@ -531,6 +601,23 @@ def extract_text_and_images(
     Primary new function for the updated connector.
     Returns structured extraction result with text content, embedded images, and metadata.
     """
+    res = _extract_text_and_images(
+        file, file_name, pdf_pass, content_type, image_callback
+    )
+    # Clean up any temporary objects and force garbage collection
+    unreachable = gc.collect()
+    logger.info(f"Unreachable objects: {unreachable}")
+
+    return res
+
+
+def _extract_text_and_images(
+    file: IO[Any],
+    file_name: str,
+    pdf_pass: str | None = None,
+    content_type: str | None = None,
+    image_callback: Callable[[bytes, str], None] | None = None,
+) -> ExtractionResult:
     file.seek(0)
 
     if get_unstructured_api_key():
@@ -556,7 +643,6 @@ def extract_text_and_images(
     # Default processing
     try:
         extension = get_file_ext(file_name)
-
         # docx example for embedded images
         if extension == ".docx":
             text_content, images = docx_to_text_and_images(
diff --git a/backend/onyx/utils/file_types.py b/backend/onyx/utils/file_types.py
@@ -1,3 +1,16 @@
+PRESENTATION_MIME_TYPE = (
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+)
+
+SPREADSHEET_MIME_TYPE = (
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+)
+WORD_PROCESSING_MIME_TYPE = (
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+)
+PDF_MIME_TYPE = "application/pdf"
+
+
 class UploadMimeTypes:
     IMAGE_MIME_TYPES = {"image/jpeg", "image/png", "image/webp"}
     CSV_MIME_TYPES = {"text/csv"}
@@ -13,10 +26,10 @@ class UploadMimeTypes:
         "application/x-yaml",
     }
     DOCUMENT_MIME_TYPES = {
-        "application/pdf",
-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        PDF_MIME_TYPE,
+        WORD_PROCESSING_MIME_TYPE,
+        PRESENTATION_MIME_TYPE,
+        SPREADSHEET_MIME_TYPE,
         "message/rfc822",
         "application/epub+zip",
     }
diff --git a/backend/onyx/utils/memory_logger.py b/backend/onyx/utils/memory_logger.py
@@ -0,0 +1,53 @@
+# # Intentionally left commented out; kept here for future memory-debugging efforts
+# import os
+# from typing import Any
+
+# import psutil
+# from pympler import asizeof
+
+# from onyx.utils.logger import setup_logger
+
+# logger = setup_logger()
+
+#
+# def log_memory_usage(
+#     label: str,
+#     specific_object: Any = None,
+#     object_label: str = "",
+# ) -> None:
+#     """Log current process memory usage and optionally the size of a specific object.
+
+#     Args:
+#         label: A descriptive label for the current location/operation in code
+#         specific_object: Optional object to measure the size of
+#         object_label: Optional label describing the specific object
+#     """
+#     try:
+#         # Get current process memory info
+#         process = psutil.Process(os.getpid())
+#         memory_info = process.memory_info()
+
+#         # Convert to MB for readability
+#         rss_mb = memory_info.rss / (1024 * 1024)
+#         vms_mb = memory_info.vms / (1024 * 1024)
+
+#         log_parts = [f"MEMORY[{label}]", f"RSS: {rss_mb:.2f}MB", f"VMS: {vms_mb:.2f}MB"]
+
+#         # Add object size if provided
+#         if specific_object is not None:
+#             try:
+#                 # recursively calculate the size of the object
+#                 obj_size = asizeof.asizeof(specific_object)
+#                 obj_size_mb = obj_size / (1024 * 1024)
+#                 obj_desc = f"[{object_label}]" if object_label else "[object]"
+#                 log_parts.append(f"OBJ{obj_desc}: {obj_size_mb:.2f}MB")
+#             except Exception as e:
+#                 log_parts.append(f"OBJ_SIZE_ERROR: {str(e)}")
+
+#         logger.info(" | ".join(log_parts))
+
+#     except Exception as e:
+#         logger.warning(f"Failed to log memory usage for {label}: {str(e)}")
+
+# For example, use this like:
+# log_memory_usage("my_operation", my_large_object, "my_large_object")
diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt
@@ -51,6 +51,7 @@ nltk==3.9.1
 Office365-REST-Python-Client==2.5.9
 oauthlib==3.2.2
 openai==1.99.5
+openpyxl==3.1.5
 passlib==1.7.4
 playwright==1.41.2
 psutil==5.9.5
@@ -60,6 +61,7 @@ pyairtable==3.0.1
 pycryptodome==3.19.1
 pydantic==2.11.7
 PyGithub==2.5.0
+pympler==1.1
 python-dateutil==2.8.2
 python-gitlab==5.6.0
 python-pptx==0.6.23
@@ -83,6 +85,7 @@ supervisor==4.2.5
 RapidFuzz==3.13.0
 tiktoken==0.7.0
 timeago==1.0.16
+types-openpyxl==3.1.5.20250919
 unstructured==0.15.1
 unstructured-client==0.25.4
 uvicorn==0.35.0