Skip to content

Commit c266196

Browse files
authored
Merge branch 'main' into edwin/dan-2558
2 parents 26cc8f1 + f4d135d commit c266196

File tree

6 files changed

+184
-27
lines changed

6 files changed

+184
-27
lines changed

backend/onyx/connectors/sharepoint/connector.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1029,7 +1029,7 @@ def _fetch_site_pages(
10291029

10301030
# Filter pages based on time window if specified
10311031
if start is not None or end is not None:
1032-
filtered_pages = []
1032+
filtered_pages: list[dict[str, Any]] = []
10331033
for page in all_pages:
10341034
page_modified = page.get("lastModifiedDateTime")
10351035
if page_modified:

backend/onyx/evals/tracing.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,20 @@
77
from onyx.configs.app_configs import BRAINTRUST_API_KEY
88
from onyx.configs.app_configs import BRAINTRUST_PROJECT
99

10+
# Upper bound, in characters, on the stringified size of traced data.
MASKING_LENGTH = 20_000
1011

11-
def _truncate_str(s: str, head: int = 800, tail: int = 200) -> str:
12-
if len(s) <= head + tail:
13-
return s
14-
return f"{s[:head]}{s[-tail:]}[TRUNCATED {len(s)} chars to 10,000]"
12+
13+
def _truncate_str(s: str) -> str:
14+
tail = MASKING_LENGTH // 5
15+
head = MASKING_LENGTH - tail
16+
return f"{s[:head]}{s[-tail:]}[TRUNCATED {len(s)} chars to {MASKING_LENGTH}]"
1517

1618

1719
def _mask(data: Any) -> Any:
    """Return ``data`` unchanged unless its string form exceeds ``MASKING_LENGTH``.

    Oversized values are replaced by the shortened string produced by
    ``_truncate_str``, so the result is a ``str`` in that case rather than the
    original object.
    """
    # Stringify once; converting a large payload twice is wasted work.
    data_str = str(data)
    if len(data_str) <= MASKING_LENGTH:
        return data
    return _truncate_str(data_str)
2224

2325

2426
def setup_braintrust() -> None:

backend/onyx/file_processing/extract_file_text.py

Lines changed: 100 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import gc
12
import io
23
import json
34
import os
@@ -17,8 +18,10 @@
1718
from zipfile import BadZipFile
1819

1920
import chardet
21+
import openpyxl
2022
from markitdown import FileConversionException
2123
from markitdown import MarkItDown
24+
from markitdown import StreamInfo
2225
from markitdown import UnsupportedFormatException
2326
from PIL import Image
2427
from pypdf import PdfReader
@@ -30,6 +33,8 @@
3033
from onyx.file_processing.html_utils import parse_html_page_basic
3134
from onyx.file_processing.unstructured import get_unstructured_api_key
3235
from onyx.file_processing.unstructured import unstructured_to_text
36+
from onyx.utils.file_types import PRESENTATION_MIME_TYPE
37+
from onyx.utils.file_types import WORD_PROCESSING_MIME_TYPE
3338
from onyx.utils.logger import setup_logger
3439

3540
logger = setup_logger()
@@ -80,6 +85,20 @@
8085
"image/webp",
8186
]
8287

88+
_MARKITDOWN_CONVERTER: MarkItDown | None = None
89+
90+
# Message fragments of openpyxl load failures that xlsx_to_text treats as
# non-fatal: when the exception text contains one of these, the error is
# logged and an empty string is returned instead of re-raising.
KNOWN_OPENPYXL_BUGS = [
    "Value must be either numerical or a string containing a wildcard",
    "File contains no valid workbook part",
]
94+
95+
96+
def get_markitdown_converter() -> MarkItDown:
    """Return the module-wide MarkItDown instance, building it on first use.

    The converter is created with plugins disabled and stored in
    ``_MARKITDOWN_CONVERTER`` so later calls reuse the same object.
    """
    global _MARKITDOWN_CONVERTER
    converter = _MARKITDOWN_CONVERTER
    if converter is not None:
        return converter

    converter = MarkItDown(enable_plugins=False)
    _MARKITDOWN_CONVERTER = converter
    return converter
101+
83102

84103
class OnyxExtensionType(IntFlag):
85104
Plain = auto()
@@ -338,9 +357,11 @@ def docx_to_text_and_images(
338357
of avoiding materializing the list of images in memory.
339358
The images list returned is empty in this case.
340359
"""
341-
md = MarkItDown(enable_plugins=False)
360+
md = get_markitdown_converter()
342361
try:
343-
doc = md.convert(to_bytesio(file))
362+
doc = md.convert(
363+
to_bytesio(file), stream_info=StreamInfo(mimetype=WORD_PROCESSING_MIME_TYPE)
364+
)
344365
except (
345366
BadZipFile,
346367
ValueError,
@@ -372,9 +393,12 @@ def docx_to_text_and_images(
372393

373394

374395
def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
375-
md = MarkItDown(enable_plugins=False)
396+
md = get_markitdown_converter()
397+
stream_info = StreamInfo(
398+
mimetype=PRESENTATION_MIME_TYPE, filename=file_name or None, extension=".pptx"
399+
)
376400
try:
377-
presentation = md.convert(to_bytesio(file))
401+
presentation = md.convert(to_bytesio(file), stream_info=stream_info)
378402
except (
379403
BadZipFile,
380404
ValueError,
@@ -388,23 +412,69 @@ def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
388412

389413

390414
def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
    """Extract the cell values of an .xlsx workbook as comma-separated text.

    Every worksheet is rendered as one line per non-empty row with the cell
    values joined by commas; the per-sheet texts are then joined with
    ``TEXT_SECTION_SEPARATOR``.

    Args:
        file: File-like object containing the workbook.
        file_name: Name used in log messages. When loading fails with
            ``BadZipFile`` and the name starts with ``~``, the failure is
            logged at debug level instead of warning level.

    Returns:
        The extracted text, or an empty string when the file is not a valid
        zip archive or openpyxl fails with one of ``KNOWN_OPENPYXL_BUGS``.

    Raises:
        Exception: Any other error raised while loading the workbook is
            re-raised unchanged.
    """
    # TODO: switch back to this approach in a few months when markitdown
    # fixes their handling of excel files

    # md = get_markitdown_converter()
    # stream_info = StreamInfo(
    #     mimetype=SPREADSHEET_MIME_TYPE, filename=file_name or None, extension=".xlsx"
    # )
    # try:
    #     workbook = md.convert(to_bytesio(file), stream_info=stream_info)
    # except (
    #     BadZipFile,
    #     ValueError,
    #     FileConversionException,
    #     UnsupportedFormatException,
    # ) as e:
    #     error_str = f"Failed to extract text from {file_name or 'xlsx file'}: {e}"
    #     if file_name.startswith("~"):
    #         logger.debug(error_str + " (this is expected for files with ~)")
    #     else:
    #         logger.warning(error_str)
    #     return ""
    # return workbook.markdown
    try:
        workbook = openpyxl.load_workbook(file, read_only=True)
    except BadZipFile as e:
        error_str = f"Failed to extract text from {file_name or 'xlsx file'}: {e}"
        if file_name.startswith("~"):
            logger.debug(error_str + " (this is expected for files with ~)")
        else:
            logger.warning(error_str)
        return ""
    except Exception as e:
        if any(s in str(e) for s in KNOWN_OPENPYXL_BUGS):
            logger.error(
                f"Failed to extract text from {file_name or 'xlsx file'}. This happens due to a bug in openpyxl. {e}"
            )
            return ""
        # Bare raise keeps the original traceback intact.
        raise

    text_content: list[str] = []
    try:
        for sheet in workbook.worksheets:
            rows: list[str] = []
            num_empty_consecutive_rows = 0
            for row in sheet.iter_rows(min_row=1, values_only=True):
                # Only None marks an empty cell; 0 and False are real values
                # and must not be blanked out.
                cells = ["" if cell is None else str(cell) for cell in row]

                # Only add the row if there are any values in the cells
                if any(cells):
                    rows.append(",".join(cells))
                    num_empty_consecutive_rows = 0
                    continue

                num_empty_consecutive_rows += 1
                if num_empty_consecutive_rows > 100:
                    # handle massive excel sheets with mostly empty cells
                    # NOTE: the break only abandons the current sheet; any
                    # following sheets are still processed.
                    logger.warning(
                        f"Found {num_empty_consecutive_rows} empty rows in {file_name},"
                        " skipping rest of file"
                    )
                    break
            text_content.append("\n".join(rows))
    finally:
        # Read-only workbooks load lazily and must be closed explicitly.
        workbook.close()
    return TEXT_SECTION_SEPARATOR.join(text_content)
408478

409479

410480
def eml_to_text(file: IO[Any]) -> str:
@@ -531,6 +601,23 @@ def extract_text_and_images(
531601
Primary new function for the updated connector.
532602
Returns structured extraction result with text content, embedded images, and metadata.
533603
"""
604+
res = _extract_text_and_images(
605+
file, file_name, pdf_pass, content_type, image_callback
606+
)
607+
# Clean up any temporary objects and force garbage collection
608+
unreachable = gc.collect()
609+
logger.info(f"Unreachable objects: {unreachable}")
610+
611+
return res
612+
613+
614+
def _extract_text_and_images(
615+
file: IO[Any],
616+
file_name: str,
617+
pdf_pass: str | None = None,
618+
content_type: str | None = None,
619+
image_callback: Callable[[bytes, str], None] | None = None,
620+
) -> ExtractionResult:
534621
file.seek(0)
535622

536623
if get_unstructured_api_key():
@@ -556,7 +643,6 @@ def extract_text_and_images(
556643
# Default processing
557644
try:
558645
extension = get_file_ext(file_name)
559-
560646
# docx example for embedded images
561647
if extension == ".docx":
562648
text_content, images = docx_to_text_and_images(

backend/onyx/utils/file_types.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,16 @@
1+
# MIME types of the document formats referenced across the code base. The
# three Office Open XML types share a common vendor prefix.
_OOXML_PREFIX = "application/vnd.openxmlformats-officedocument"

PRESENTATION_MIME_TYPE = f"{_OOXML_PREFIX}.presentationml.presentation"
SPREADSHEET_MIME_TYPE = f"{_OOXML_PREFIX}.spreadsheetml.sheet"
WORD_PROCESSING_MIME_TYPE = f"{_OOXML_PREFIX}.wordprocessingml.document"
PDF_MIME_TYPE = "application/pdf"
12+
13+
114
class UploadMimeTypes:
215
IMAGE_MIME_TYPES = {"image/jpeg", "image/png", "image/webp"}
316
CSV_MIME_TYPES = {"text/csv"}
@@ -13,10 +26,10 @@ class UploadMimeTypes:
1326
"application/x-yaml",
1427
}
1528
DOCUMENT_MIME_TYPES = {
16-
"application/pdf",
17-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
18-
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
19-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
29+
PDF_MIME_TYPE,
30+
WORD_PROCESSING_MIME_TYPE,
31+
PRESENTATION_MIME_TYPE,
32+
SPREADSHEET_MIME_TYPE,
2033
"message/rfc822",
2134
"application/epub+zip",
2235
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# # leaving this here for future mem debugging efforts
2+
# import os
3+
# from typing import Any
4+
5+
# import psutil
6+
# from pympler import asizeof
7+
8+
# from onyx.utils.logger import setup_logger
9+
10+
# logger = setup_logger()
11+
12+
#
13+
# def log_memory_usage(
14+
# label: str,
15+
# specific_object: Any = None,
16+
# object_label: str = "",
17+
# ) -> None:
18+
# """Log current process memory usage and optionally the size of a specific object.
19+
20+
# Args:
21+
# label: A descriptive label for the current location/operation in code
22+
# specific_object: Optional object to measure the size of
23+
# object_label: Optional label describing the specific object
24+
# """
25+
# try:
26+
# # Get current process memory info
27+
# process = psutil.Process(os.getpid())
28+
# memory_info = process.memory_info()
29+
30+
# # Convert to MB for readability
31+
# rss_mb = memory_info.rss / (1024 * 1024)
32+
# vms_mb = memory_info.vms / (1024 * 1024)
33+
34+
# log_parts = [f"MEMORY[{label}]", f"RSS: {rss_mb:.2f}MB", f"VMS: {vms_mb:.2f}MB"]
35+
36+
# # Add object size if provided
37+
# if specific_object is not None:
38+
# try:
39+
# # recursively calculate the size of the object
40+
# obj_size = asizeof.asizeof(specific_object)
41+
# obj_size_mb = obj_size / (1024 * 1024)
42+
# obj_desc = f"[{object_label}]" if object_label else "[object]"
43+
# log_parts.append(f"OBJ{obj_desc}: {obj_size_mb:.2f}MB")
44+
# except Exception as e:
45+
# log_parts.append(f"OBJ_SIZE_ERROR: {str(e)}")
46+
47+
# logger.info(" | ".join(log_parts))
48+
49+
# except Exception as e:
50+
# logger.warning(f"Failed to log memory usage for {label}: {str(e)}")
51+
52+
# For example, use this like:
53+
# log_memory_usage("my_operation", my_large_object, "my_large_object")

backend/requirements/default.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ nltk==3.9.1
5151
Office365-REST-Python-Client==2.5.9
5252
oauthlib==3.2.2
5353
openai==1.99.5
54+
openpyxl==3.1.5
5455
passlib==1.7.4
5556
playwright==1.41.2
5657
psutil==5.9.5
@@ -60,6 +61,7 @@ pyairtable==3.0.1
6061
pycryptodome==3.19.1
6162
pydantic==2.11.7
6263
PyGithub==2.5.0
64+
pympler==1.1
6365
python-dateutil==2.8.2
6466
python-gitlab==5.6.0
6567
python-pptx==0.6.23
@@ -83,6 +85,7 @@ supervisor==4.2.5
8385
RapidFuzz==3.13.0
8486
tiktoken==0.7.0
8587
timeago==1.0.16
88+
types-openpyxl==3.1.5.20250919
8689
unstructured==0.15.1
8790
unstructured-client==0.25.4
8891
uvicorn==0.35.0

0 commit comments

Comments
 (0)