From 381269a828af3bb0cbda5d6b9960d7f23de20e55 Mon Sep 17 00:00:00 2001 From: Evan Lohn Date: Sat, 23 Aug 2025 20:48:38 -0700 Subject: [PATCH 1/3] fix: downloads are never larger than 20mb --- .../connectors/google_drive/doc_conversion.py | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py index 8ee421dd31f..3243277f9c1 100644 --- a/backend/onyx/connectors/google_drive/doc_conversion.py +++ b/backend/onyx/connectors/google_drive/doc_conversion.py @@ -97,18 +97,31 @@ def is_gdrive_image_mime_type(mime_type: str) -> bool: return is_valid_image_type(mime_type) -def download_request(service: GoogleDriveService, file_id: str) -> bytes: +def download_request( + service: GoogleDriveService, file_id: str, size_threshold: int +) -> bytes: """ Download the file from Google Drive. """ # For other file types, download the file # Use the correct API call for downloading files request = service.files().get_media(fileId=file_id) + return _donwload_request(request, file_id, size_threshold) + + +def _donwload_request(request: Any, file_id: str, size_threshold: int) -> bytes: response_bytes = io.BytesIO() - downloader = MediaIoBaseDownload(response_bytes, request) + downloader = MediaIoBaseDownload( + response_bytes, request, chunksize=size_threshold * 2 + ) done = False while not done: - _, done = downloader.next_chunk() + download_progress, done = downloader.next_chunk() + if download_progress.resumable_progress > size_threshold: + logger.warning( + f"File {file_id} exceeds size threshold of {size_threshold}. Skipping2." + ) + return bytes() response = response_bytes.getvalue() if not response: @@ -121,6 +134,7 @@ def _download_and_extract_sections_basic( file: dict[str, str], service: GoogleDriveService, allow_images: bool, + size_threshold: int, ) -> list[TextSection | ImageSection]: """Extract text and images from a Google Drive file.""" file_id = file["id"] @@ -132,7 +146,7 @@ def _download_and_extract_sections_basic( # Use the correct API call for downloading files # lazy evaluation to only download the file if necessary def response_call() -> bytes: - return download_request(service, file_id) + return download_request(service, file_id, size_threshold) if is_gdrive_image_mime_type(mime_type): # Skip images if not explicitly enabled @@ -162,13 +176,7 @@ def response_call() -> bytes: request = service.files().export_media( fileId=file_id, mimeType=export_mime_type ) - response_bytes = io.BytesIO() - downloader = MediaIoBaseDownload(response_bytes, request) - done = False - while not done: - _, done = downloader.next_chunk() - - response = response_bytes.getvalue() + response = _donwload_request(request, file_id, size_threshold) if not response: logger.warning(f"Failed to export {file_name} as {export_mime_type}") return [] @@ -467,7 +475,7 @@ def _get_docs_service() -> GoogleDocsService: " aligning with basic sections" ) basic_sections = _download_and_extract_sections_basic( - file, _get_drive_service(), allow_images + file, _get_drive_service(), allow_images, size_threshold ) sections = align_basic_advanced(basic_sections, doc_sections) @@ -478,7 +486,7 @@ def _get_docs_service() -> GoogleDocsService: # Not Google Doc, attempt basic extraction else: sections = _download_and_extract_sections_basic( - file, _get_drive_service(), allow_images + file, _get_drive_service(), allow_images, size_threshold ) # If we still don't have any sections, skip this file From 8793211f683c6b737f166636d3f9ee491ecb9427 Mon Sep 17 00:00:00 2001 From: Evan Lohn Date: Sun, 24 Aug 2025 13:47:44 -0700 Subject: [PATCH 2/3] JT comments --- backend/onyx/connectors/google_drive/doc_conversion.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py index 3243277f9c1..057a70bb1a2 100644 --- a/backend/onyx/connectors/google_drive/doc_conversion.py +++ b/backend/onyx/connectors/google_drive/doc_conversion.py @@ -52,6 +52,7 @@ WEB_VIEW_LINK_KEY = "webViewLink" MAX_RETRIEVER_EMAILS = 20 +CHUNK_SIZE_BUFFER = 64 # extra bytes past the limit to read # Mapping of Google Drive mime types to export formats GOOGLE_MIME_TYPES_TO_EXPORT = { @@ -106,13 +107,13 @@ def download_request( # For other file types, download the file # Use the correct API call for downloading files request = service.files().get_media(fileId=file_id) - return _donwload_request(request, file_id, size_threshold) + return _download_request(request, file_id, size_threshold) -def _donwload_request(request: Any, file_id: str, size_threshold: int) -> bytes: +def _download_request(request: Any, file_id: str, size_threshold: int) -> bytes: response_bytes = io.BytesIO() downloader = MediaIoBaseDownload( - response_bytes, request, chunksize=size_threshold * 2 + response_bytes, request, chunksize=size_threshold + CHUNK_SIZE_BUFFER ) done = False while not done: @@ -176,7 +177,7 @@ def response_call() -> bytes: request = service.files().export_media( fileId=file_id, mimeType=export_mime_type ) - response = _donwload_request(request, file_id, size_threshold) + response = _download_request(request, file_id, size_threshold) if not response: logger.warning(f"Failed to export {file_name} as {export_mime_type}") return [] From 189820b7e30964a3c5cf04b22b93fbe56baafeef Mon Sep 17 00:00:00 2001 From: Evan Lohn Date: Sun, 24 Aug 2025 18:01:37 -0700 Subject: [PATCH 3/3] import to fix integration tests --- backend/requirements/default.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt index 132485c146a..c9c9164ea2e 100644 --- a/backend/requirements/default.txt +++ b/backend/requirements/default.txt @@ -40,6 +40,7 @@ langchainhub==0.1.21 langgraph==0.2.72 langgraph-checkpoint==2.0.13 langgraph-sdk==0.1.44 +lazy_imports==1.0.1 litellm==1.72.2 lxml==5.3.0 lxml_html_clean==0.2.2