Skip to content

Commit a17e5c3

Browse files
evan-onyx, wenxi-onyx
authored and committed
fix: downloads are never larger than 20mb (#5247)
* fix: downloads are never larger than 20mb
* JT comments
* import to fix integration tests
1 parent 7b6189e commit a17e5c3

File tree

2 files changed

+23
-13
lines changed

2 files changed

+23
-13
lines changed

backend/onyx/connectors/google_drive/doc_conversion.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
WEB_VIEW_LINK_KEY = "webViewLink"
5353

5454
MAX_RETRIEVER_EMAILS = 20
55+
CHUNK_SIZE_BUFFER = 64 # extra bytes past the limit to read
5556

5657
# Mapping of Google Drive mime types to export formats
5758
GOOGLE_MIME_TYPES_TO_EXPORT = {
@@ -97,18 +98,31 @@ def is_gdrive_image_mime_type(mime_type: str) -> bool:
9798
return is_valid_image_type(mime_type)
9899

99100

100-
def download_request(service: GoogleDriveService, file_id: str) -> bytes:
101+
def download_request(
102+
service: GoogleDriveService, file_id: str, size_threshold: int
103+
) -> bytes:
101104
"""
102105
Download the file from Google Drive.
103106
"""
104107
# For other file types, download the file
105108
# Use the correct API call for downloading files
106109
request = service.files().get_media(fileId=file_id)
110+
return _download_request(request, file_id, size_threshold)
111+
112+
113+
def _download_request(request: Any, file_id: str, size_threshold: int) -> bytes:
107114
response_bytes = io.BytesIO()
108-
downloader = MediaIoBaseDownload(response_bytes, request)
115+
downloader = MediaIoBaseDownload(
116+
response_bytes, request, chunksize=size_threshold + CHUNK_SIZE_BUFFER
117+
)
109118
done = False
110119
while not done:
111-
_, done = downloader.next_chunk()
120+
download_progress, done = downloader.next_chunk()
121+
if download_progress.resumable_progress > size_threshold:
122+
logger.warning(
123+
f"File {file_id} exceeds size threshold of {size_threshold}. Skipping2."
124+
)
125+
return bytes()
112126

113127
response = response_bytes.getvalue()
114128
if not response:
@@ -121,6 +135,7 @@ def _download_and_extract_sections_basic(
121135
file: dict[str, str],
122136
service: GoogleDriveService,
123137
allow_images: bool,
138+
size_threshold: int,
124139
) -> list[TextSection | ImageSection]:
125140
"""Extract text and images from a Google Drive file."""
126141
file_id = file["id"]
@@ -132,7 +147,7 @@ def _download_and_extract_sections_basic(
132147
# Use the correct API call for downloading files
133148
# lazy evaluation to only download the file if necessary
134149
def response_call() -> bytes:
135-
return download_request(service, file_id)
150+
return download_request(service, file_id, size_threshold)
136151

137152
if is_gdrive_image_mime_type(mime_type):
138153
# Skip images if not explicitly enabled
@@ -162,13 +177,7 @@ def response_call() -> bytes:
162177
request = service.files().export_media(
163178
fileId=file_id, mimeType=export_mime_type
164179
)
165-
response_bytes = io.BytesIO()
166-
downloader = MediaIoBaseDownload(response_bytes, request)
167-
done = False
168-
while not done:
169-
_, done = downloader.next_chunk()
170-
171-
response = response_bytes.getvalue()
180+
response = _download_request(request, file_id, size_threshold)
172181
if not response:
173182
logger.warning(f"Failed to export {file_name} as {export_mime_type}")
174183
return []
@@ -467,7 +476,7 @@ def _get_docs_service() -> GoogleDocsService:
467476
" aligning with basic sections"
468477
)
469478
basic_sections = _download_and_extract_sections_basic(
470-
file, _get_drive_service(), allow_images
479+
file, _get_drive_service(), allow_images, size_threshold
471480
)
472481
sections = align_basic_advanced(basic_sections, doc_sections)
473482

@@ -478,7 +487,7 @@ def _get_docs_service() -> GoogleDocsService:
478487
# Not Google Doc, attempt basic extraction
479488
else:
480489
sections = _download_and_extract_sections_basic(
481-
file, _get_drive_service(), allow_images
490+
file, _get_drive_service(), allow_images, size_threshold
482491
)
483492

484493
# If we still don't have any sections, skip this file

backend/requirements/default.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ langchainhub==0.1.21
4040
langgraph==0.2.72
4141
langgraph-checkpoint==2.0.13
4242
langgraph-sdk==0.1.44
43+
lazy_imports==1.0.1
4344
litellm==1.72.2
4445
lxml==5.3.0
4546
lxml_html_clean==0.2.2

0 commit comments

Comments
 (0)