52
52
WEB_VIEW_LINK_KEY = "webViewLink"
53
53
54
54
MAX_RETRIEVER_EMAILS = 20
55
+ CHUNK_SIZE_BUFFER = 64 # extra bytes requested beyond size_threshold so a download larger than the threshold can be detected
55
56
56
57
# Mapping of Google Drive mime types to export formats
57
58
GOOGLE_MIME_TYPES_TO_EXPORT = {
@@ -97,18 +98,31 @@ def is_gdrive_image_mime_type(mime_type: str) -> bool:
97
98
return is_valid_image_type (mime_type )
98
99
99
100
100
- def download_request (service : GoogleDriveService , file_id : str ) -> bytes :
101
def download_request(
    service: GoogleDriveService, file_id: str, size_threshold: int
) -> bytes:
    """
    Fetch the content of a Google Drive file as raw bytes.

    A `files().get_media` request is built for `file_id` and handed to
    `_download_request`, which performs the transfer. `size_threshold` is
    forwarded unchanged; the size check itself happens in `_download_request`.
    """
    # get_media is the Drive API call for downloading a file's content directly.
    media_request = service.files().get_media(fileId=file_id)
    return _download_request(media_request, file_id, size_threshold)
111
+
112
+
113
+ def _download_request (request : Any , file_id : str , size_threshold : int ) -> bytes :
107
114
response_bytes = io .BytesIO ()
108
- downloader = MediaIoBaseDownload (response_bytes , request )
115
+ downloader = MediaIoBaseDownload (
116
+ response_bytes , request , chunksize = size_threshold + CHUNK_SIZE_BUFFER
117
+ )
109
118
done = False
110
119
while not done :
111
- _ , done = downloader .next_chunk ()
120
+ download_progress , done = downloader .next_chunk ()
121
+ if download_progress .resumable_progress > size_threshold :
122
+ logger .warning (
123
+ f"File { file_id } exceeds size threshold of { size_threshold } . Skipping2."
124
+ )
125
+ return bytes ()
112
126
113
127
response = response_bytes .getvalue ()
114
128
if not response :
@@ -121,6 +135,7 @@ def _download_and_extract_sections_basic(
121
135
file : dict [str , str ],
122
136
service : GoogleDriveService ,
123
137
allow_images : bool ,
138
+ size_threshold : int ,
124
139
) -> list [TextSection | ImageSection ]:
125
140
"""Extract text and images from a Google Drive file."""
126
141
file_id = file ["id" ]
@@ -132,7 +147,7 @@ def _download_and_extract_sections_basic(
132
147
# Use the correct API call for downloading files
133
148
# lazy evaluation to only download the file if necessary
134
149
def response_call () -> bytes :
135
- return download_request (service , file_id )
150
+ return download_request (service , file_id , size_threshold )
136
151
137
152
if is_gdrive_image_mime_type (mime_type ):
138
153
# Skip images if not explicitly enabled
@@ -162,13 +177,7 @@ def response_call() -> bytes:
162
177
request = service .files ().export_media (
163
178
fileId = file_id , mimeType = export_mime_type
164
179
)
165
- response_bytes = io .BytesIO ()
166
- downloader = MediaIoBaseDownload (response_bytes , request )
167
- done = False
168
- while not done :
169
- _ , done = downloader .next_chunk ()
170
-
171
- response = response_bytes .getvalue ()
180
+ response = _download_request (request , file_id , size_threshold )
172
181
if not response :
173
182
logger .warning (f"Failed to export { file_name } as { export_mime_type } " )
174
183
return []
@@ -467,7 +476,7 @@ def _get_docs_service() -> GoogleDocsService:
467
476
" aligning with basic sections"
468
477
)
469
478
basic_sections = _download_and_extract_sections_basic (
470
- file , _get_drive_service (), allow_images
479
+ file , _get_drive_service (), allow_images , size_threshold
471
480
)
472
481
sections = align_basic_advanced (basic_sections , doc_sections )
473
482
@@ -478,7 +487,7 @@ def _get_docs_service() -> GoogleDocsService:
478
487
# Not Google Doc, attempt basic extraction
479
488
else :
480
489
sections = _download_and_extract_sections_basic (
481
- file , _get_drive_service (), allow_images
490
+ file , _get_drive_service (), allow_images , size_threshold
482
491
)
483
492
484
493
# If we still don't have any sections, skip this file
0 commit comments