@@ -192,79 +192,82 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None
192
192
def _update_traversed_parent_ids (self , folder_id : str ) -> None :
193
193
self ._retrieved_ids .add (folder_id )
194
194
195
def _get_all_user_emails(self) -> list[str]:
    """Return the user emails to retrieve files as, primary admin first.

    With OAuth credentials only ``self.primary_admin_email`` is returned and
    no directory lookup is made. Otherwise the users of ``self.google_domain``
    are listed through the admin service in two passes (admins, then
    non-admins) and appended after the primary admin. Each address appears
    once, at the position of its first occurrence.

    Returns:
        Ordered, duplicate-free list of email addresses.
    """
    # Start with primary admin email
    user_emails = [self.primary_admin_email]

    # Only fetch additional users if using service account
    if isinstance(self.creds, OAuthCredentials):
        return user_emails

    admin_service = get_admin_service(
        creds=self.creds,
        user_email=self.primary_admin_email,
    )

    # Companion set for O(1) duplicate checks; the list alone carries the
    # ordering. A list membership test here would be quadratic in the number
    # of directory users.
    seen_emails = set(user_emails)

    # Get admins first since they're more likely to have access to most files
    for query in ("isAdmin=true", "isAdmin=false"):
        for user in execute_paginated_retrieval(
            retrieval_function=admin_service.users().list,
            list_key="users",
            fields=USER_FIELDS,
            domain=self.google_domain,
            query=query,
        ):
            email = user.get("primaryEmail")
            # Skip entries without a primary email as well as repeats.
            if email and email not in seen_emails:
                seen_emails.add(email)
                user_emails.append(email)
    return user_emails
212
222
213
223
def _get_all_drive_ids(self) -> set[str]:
    """Return the IDs of the shared drives listed for the primary admin.

    Drives are listed with ``useDomainAdminAccess=True`` through a drive
    service built for ``self.primary_admin_email``. With OAuth credentials
    the listing is asked to continue on 404/403 responses instead of failing.

    Returns:
        The listed drive IDs, or a copy of
        ``self._requested_shared_drive_ids`` when the listing came back empty.
    """
    drive_service = get_drive_service(
        creds=self.creds,
        user_email=self.primary_admin_email,
    )

    # We don't want to fail if we're using OAuth because you can
    # access your my drive as a non admin user in an org still
    using_oauth = isinstance(self.creds, OAuthCredentials)

    found_drive_ids = {
        drive["id"]
        for drive in execute_paginated_retrieval(
            retrieval_function=drive_service.drives().list,
            list_key="drives",
            continue_on_404_or_403=using_oauth,
            useDomainAdminAccess=True,
            fields="drives(id)",
        )
    }
    if found_drive_ids:
        return found_drive_ids

    # Nothing was listed: fall back to the IDs named in the connector config.
    logger.warning(
        "No drives found. This is likely because oauth user "
        "is not an admin and cannot view all drive IDs. "
        "Continuing with only the shared drive IDs specified in the config."
    )
    return set(self._requested_shared_drive_ids)
256
250
257
251
def _impersonate_user_for_retrieval (
258
252
self ,
259
253
user_email : str ,
260
254
is_slim : bool ,
255
+ filtered_drive_ids : set [str ],
256
+ filtered_folder_ids : set [str ],
261
257
start : SecondsSinceUnixEpoch | None = None ,
262
258
end : SecondsSinceUnixEpoch | None = None ,
263
259
) -> Iterator [GoogleDriveFileType ]:
264
260
drive_service = get_drive_service (self .creds , user_email )
261
+
262
+ # if we are including my drives, try to get the current user's my
263
+ # drive if any of the following are true:
264
+ # - no specific emails were requested
265
+ # - the current user's email is in the requested emails
266
+ # - we are using OAuth (in which case we assume that is the only email we will try)
265
267
if self .include_my_drives and (
266
268
not self ._requested_my_drive_emails
267
269
or user_email in self ._requested_my_drive_emails
270
+ or isinstance (self .creds , OAuthCredentials )
268
271
):
269
272
yield from get_all_files_in_my_drive (
270
273
service = drive_service ,
@@ -274,7 +277,7 @@ def _impersonate_user_for_retrieval(
274
277
end = end ,
275
278
)
276
279
277
- remaining_drive_ids = self . _requested_shared_drive_ids - self ._retrieved_ids
280
+ remaining_drive_ids = filtered_drive_ids - self ._retrieved_ids
278
281
for drive_id in remaining_drive_ids :
279
282
yield from get_files_in_shared_drive (
280
283
service = drive_service ,
@@ -285,7 +288,7 @@ def _impersonate_user_for_retrieval(
285
288
end = end ,
286
289
)
287
290
288
- remaining_folders = self . _requested_folder_ids - self ._retrieved_ids
291
+ remaining_folders = filtered_folder_ids - self ._retrieved_ids
289
292
for folder_id in remaining_folders :
290
293
yield from crawl_folders_for_files (
291
294
service = drive_service ,
@@ -302,22 +305,56 @@ def _fetch_drive_items(
302
305
start : SecondsSinceUnixEpoch | None = None ,
303
306
end : SecondsSinceUnixEpoch | None = None ,
304
307
) -> Iterator [GoogleDriveFileType ]:
305
- self ._initialize_all_class_variables ()
308
+ all_org_emails : list [str ] = self ._get_all_user_emails ()
309
+
310
+ all_drive_ids : set [str ] = self ._get_all_drive_ids ()
311
+
312
+ # remove drive ids from the folder ids because they are queried differently
313
+ filtered_folder_ids = self ._requested_folder_ids - all_drive_ids
314
+
315
+ # Remove drive_ids that are not in the all_drive_ids and check them as folders instead
316
+ invalid_drive_ids = self ._requested_shared_drive_ids - all_drive_ids
317
+ if invalid_drive_ids :
318
+ logger .warning (
319
+ f"Some shared drive IDs were not found. IDs: { invalid_drive_ids } "
320
+ )
321
+ logger .warning ("Checking for folder access instead..." )
322
+ filtered_folder_ids .update (invalid_drive_ids )
323
+
324
+ # If including shared drives, use the requested IDs if provided,
325
+ # otherwise use all drive IDs
326
+ filtered_drive_ids = set ()
327
+ if self .include_shared_drives :
328
+ if self ._requested_shared_drive_ids :
329
+ # Remove invalid drive IDs from requested IDs
330
+ filtered_drive_ids = (
331
+ self ._requested_shared_drive_ids - invalid_drive_ids
332
+ )
333
+ else :
334
+ filtered_drive_ids = all_drive_ids
306
335
307
336
# Process users in parallel using ThreadPoolExecutor
308
337
with ThreadPoolExecutor (max_workers = 10 ) as executor :
309
338
future_to_email = {
310
339
executor .submit (
311
- self ._impersonate_user_for_retrieval , email , is_slim , start , end
340
+ self ._impersonate_user_for_retrieval ,
341
+ email ,
342
+ is_slim ,
343
+ filtered_drive_ids ,
344
+ filtered_folder_ids ,
345
+ start ,
346
+ end ,
312
347
): email
313
- for email in self . _all_org_emails
348
+ for email in all_org_emails
314
349
}
315
350
316
351
# Yield results as they complete
317
352
for future in as_completed (future_to_email ):
318
353
yield from future .result ()
319
354
320
- remaining_folders = self ._requested_folder_ids - self ._retrieved_ids
355
+ remaining_folders = (
356
+ filtered_drive_ids | filtered_folder_ids
357
+ ) - self ._retrieved_ids
321
358
if remaining_folders :
322
359
logger .warning (
323
360
f"Some folders/drives were not retrieved. IDs: { remaining_folders } "
0 commit comments