Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 22 additions & 13 deletions backend/onyx/connectors/google_drive/doc_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
WEB_VIEW_LINK_KEY = "webViewLink"

MAX_RETRIEVER_EMAILS = 20
# Extra bytes added on top of the caller's size threshold when choosing the
# download chunk size (chunksize = size_threshold + CHUNK_SIZE_BUFFER), so the
# reported download progress can exceed the threshold for an oversized file.
CHUNK_SIZE_BUFFER = 64

# Mapping of Google Drive mime types to export formats
GOOGLE_MIME_TYPES_TO_EXPORT = {
Expand Down Expand Up @@ -97,18 +98,31 @@ def is_gdrive_image_mime_type(mime_type: str) -> bool:
return is_valid_image_type(mime_type)


def download_request(service: GoogleDriveService, file_id: str) -> bytes:
def download_request(
    service: GoogleDriveService, file_id: str, size_threshold: int
) -> bytes:
    """
    Fetch the raw contents of a Google Drive file.

    Builds a `files().get_media` request for `file_id` and hands it to
    `_download_request`, which performs the chunked transfer and applies
    `size_threshold`. The bytes returned by `_download_request` are
    returned unchanged.
    """
    media_request = service.files().get_media(fileId=file_id)
    return _download_request(media_request, file_id, size_threshold)


def _download_request(request: Any, file_id: str, size_threshold: int) -> bytes:
response_bytes = io.BytesIO()
downloader = MediaIoBaseDownload(response_bytes, request)
downloader = MediaIoBaseDownload(
response_bytes, request, chunksize=size_threshold + CHUNK_SIZE_BUFFER
)
done = False
while not done:
_, done = downloader.next_chunk()
download_progress, done = downloader.next_chunk()
if download_progress.resumable_progress > size_threshold:
logger.warning(
f"File {file_id} exceeds size threshold of {size_threshold}. Skipping2."
)
return bytes()

response = response_bytes.getvalue()
if not response:
Expand All @@ -121,6 +135,7 @@ def _download_and_extract_sections_basic(
file: dict[str, str],
service: GoogleDriveService,
allow_images: bool,
size_threshold: int,
) -> list[TextSection | ImageSection]:
"""Extract text and images from a Google Drive file."""
file_id = file["id"]
Expand All @@ -132,7 +147,7 @@ def _download_and_extract_sections_basic(
# Use the correct API call for downloading files
# lazy evaluation to only download the file if necessary
def response_call() -> bytes:
return download_request(service, file_id)
return download_request(service, file_id, size_threshold)

if is_gdrive_image_mime_type(mime_type):
# Skip images if not explicitly enabled
Expand Down Expand Up @@ -162,13 +177,7 @@ def response_call() -> bytes:
request = service.files().export_media(
fileId=file_id, mimeType=export_mime_type
)
response_bytes = io.BytesIO()
downloader = MediaIoBaseDownload(response_bytes, request)
done = False
while not done:
_, done = downloader.next_chunk()

response = response_bytes.getvalue()
response = _download_request(request, file_id, size_threshold)
if not response:
logger.warning(f"Failed to export {file_name} as {export_mime_type}")
return []
Expand Down Expand Up @@ -467,7 +476,7 @@ def _get_docs_service() -> GoogleDocsService:
" aligning with basic sections"
)
basic_sections = _download_and_extract_sections_basic(
file, _get_drive_service(), allow_images
file, _get_drive_service(), allow_images, size_threshold
)
sections = align_basic_advanced(basic_sections, doc_sections)

Expand All @@ -478,7 +487,7 @@ def _get_docs_service() -> GoogleDocsService:
# Not Google Doc, attempt basic extraction
else:
sections = _download_and_extract_sections_basic(
file, _get_drive_service(), allow_images
file, _get_drive_service(), allow_images, size_threshold
)

# If we still don't have any sections, skip this file
Expand Down
1 change: 1 addition & 0 deletions backend/requirements/default.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ langchainhub==0.1.21
langgraph==0.2.72
langgraph-checkpoint==2.0.13
langgraph-sdk==0.1.44
lazy_imports==1.0.1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Integration tests were failing because we were missing this import.

litellm==1.72.2
lxml==5.3.0
lxml_html_clean==0.2.2
Expand Down