Skip to content

Commit 8fad5f7

Browse files
hagen-danswer authored and Richard Kuo [bot] committed
Updated google copy and added non admin oauth support (#3120)
* Updated google copy and added non admin oauth support
* backend update
* accounted for oauth
* further removed class variables
* updated sets
1 parent 310732d commit 8fad5f7

File tree

4 files changed: +99 -56 lines changed

backend/danswer/connectors/google_drive/connector.py

Lines changed: 82 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -192,79 +192,82 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None
192192
def _update_traversed_parent_ids(self, folder_id: str) -> None:
193193
self._retrieved_ids.add(folder_id)
194194

195-
def _get_all_user_emails(self, admins_only: bool) -> list[str]:
195+
def _get_all_user_emails(self) -> list[str]:
196+
# Start with primary admin email
197+
user_emails = [self.primary_admin_email]
198+
199+
# Only fetch additional users if using service account
200+
if isinstance(self.creds, OAuthCredentials):
201+
return user_emails
202+
196203
admin_service = get_admin_service(
197204
creds=self.creds,
198205
user_email=self.primary_admin_email,
199206
)
200-
query = "isAdmin=true" if admins_only else "isAdmin=false"
201-
emails = []
202-
for user in execute_paginated_retrieval(
203-
retrieval_function=admin_service.users().list,
204-
list_key="users",
205-
fields=USER_FIELDS,
206-
domain=self.google_domain,
207-
query=query,
208-
):
209-
if email := user.get("primaryEmail"):
210-
emails.append(email)
211-
return emails
207+
208+
# Get admins first since they're more likely to have access to most files
209+
for is_admin in [True, False]:
210+
query = "isAdmin=true" if is_admin else "isAdmin=false"
211+
for user in execute_paginated_retrieval(
212+
retrieval_function=admin_service.users().list,
213+
list_key="users",
214+
fields=USER_FIELDS,
215+
domain=self.google_domain,
216+
query=query,
217+
):
218+
if email := user.get("primaryEmail"):
219+
if email not in user_emails:
220+
user_emails.append(email)
221+
return user_emails
212222

213223
def _get_all_drive_ids(self) -> set[str]:
214224
primary_drive_service = get_drive_service(
215225
creds=self.creds,
216226
user_email=self.primary_admin_email,
217227
)
218228
all_drive_ids = set()
229+
# We don't want to fail if we're using OAuth because you can
230+
# access your my drive as a non admin user in an org still
231+
ignore_fetch_failure = isinstance(self.creds, OAuthCredentials)
219232
for drive in execute_paginated_retrieval(
220233
retrieval_function=primary_drive_service.drives().list,
221234
list_key="drives",
235+
continue_on_404_or_403=ignore_fetch_failure,
222236
useDomainAdminAccess=True,
223237
fields="drives(id)",
224238
):
225239
all_drive_ids.add(drive["id"])
226-
return all_drive_ids
227-
228-
def _initialize_all_class_variables(self) -> None:
229-
# Get all user emails
230-
# Get admins first becuase they are more likely to have access to the most files
231-
user_emails = [self.primary_admin_email]
232-
for admins_only in [True, False]:
233-
for email in self._get_all_user_emails(admins_only=admins_only):
234-
if email not in user_emails:
235-
user_emails.append(email)
236-
self._all_org_emails = user_emails
237-
238-
self._all_drive_ids: set[str] = self._get_all_drive_ids()
239-
240-
# remove drive ids from the folder ids because they are queried differently
241-
self._requested_folder_ids -= self._all_drive_ids
242240

243-
# Remove drive_ids that are not in the all_drive_ids and check them as folders instead
244-
invalid_drive_ids = self._requested_shared_drive_ids - self._all_drive_ids
245-
if invalid_drive_ids:
241+
if not all_drive_ids:
246242
logger.warning(
247-
f"Some shared drive IDs were not found. IDs: {invalid_drive_ids}"
243+
"No drives found. This is likely because oauth user "
244+
"is not an admin and cannot view all drive IDs. "
245+
"Continuing with only the shared drive IDs specified in the config."
248246
)
249-
logger.warning("Checking for folder access instead...")
250-
self._requested_folder_ids.update(invalid_drive_ids)
247+
all_drive_ids = set(self._requested_shared_drive_ids)
251248

252-
if not self.include_shared_drives:
253-
self._requested_shared_drive_ids = set()
254-
elif not self._requested_shared_drive_ids:
255-
self._requested_shared_drive_ids = self._all_drive_ids
249+
return all_drive_ids
256250

257251
def _impersonate_user_for_retrieval(
258252
self,
259253
user_email: str,
260254
is_slim: bool,
255+
filtered_drive_ids: set[str],
256+
filtered_folder_ids: set[str],
261257
start: SecondsSinceUnixEpoch | None = None,
262258
end: SecondsSinceUnixEpoch | None = None,
263259
) -> Iterator[GoogleDriveFileType]:
264260
drive_service = get_drive_service(self.creds, user_email)
261+
262+
# if we are including my drives, try to get the current user's my
263+
# drive if any of the following are true:
264+
# - no specific emails were requested
265+
# - the current user's email is in the requested emails
266+
# - we are using OAuth (in which case we assume that is the only email we will try)
265267
if self.include_my_drives and (
266268
not self._requested_my_drive_emails
267269
or user_email in self._requested_my_drive_emails
270+
or isinstance(self.creds, OAuthCredentials)
268271
):
269272
yield from get_all_files_in_my_drive(
270273
service=drive_service,
@@ -274,7 +277,7 @@ def _impersonate_user_for_retrieval(
274277
end=end,
275278
)
276279

277-
remaining_drive_ids = self._requested_shared_drive_ids - self._retrieved_ids
280+
remaining_drive_ids = filtered_drive_ids - self._retrieved_ids
278281
for drive_id in remaining_drive_ids:
279282
yield from get_files_in_shared_drive(
280283
service=drive_service,
@@ -285,7 +288,7 @@ def _impersonate_user_for_retrieval(
285288
end=end,
286289
)
287290

288-
remaining_folders = self._requested_folder_ids - self._retrieved_ids
291+
remaining_folders = filtered_folder_ids - self._retrieved_ids
289292
for folder_id in remaining_folders:
290293
yield from crawl_folders_for_files(
291294
service=drive_service,
@@ -302,22 +305,56 @@ def _fetch_drive_items(
302305
start: SecondsSinceUnixEpoch | None = None,
303306
end: SecondsSinceUnixEpoch | None = None,
304307
) -> Iterator[GoogleDriveFileType]:
305-
self._initialize_all_class_variables()
308+
all_org_emails: list[str] = self._get_all_user_emails()
309+
310+
all_drive_ids: set[str] = self._get_all_drive_ids()
311+
312+
# remove drive ids from the folder ids because they are queried differently
313+
filtered_folder_ids = self._requested_folder_ids - all_drive_ids
314+
315+
# Remove drive_ids that are not in the all_drive_ids and check them as folders instead
316+
invalid_drive_ids = self._requested_shared_drive_ids - all_drive_ids
317+
if invalid_drive_ids:
318+
logger.warning(
319+
f"Some shared drive IDs were not found. IDs: {invalid_drive_ids}"
320+
)
321+
logger.warning("Checking for folder access instead...")
322+
filtered_folder_ids.update(invalid_drive_ids)
323+
324+
# If including shared drives, use the requested IDs if provided,
325+
# otherwise use all drive IDs
326+
filtered_drive_ids = set()
327+
if self.include_shared_drives:
328+
if self._requested_shared_drive_ids:
329+
# Remove invalid drive IDs from requested IDs
330+
filtered_drive_ids = (
331+
self._requested_shared_drive_ids - invalid_drive_ids
332+
)
333+
else:
334+
filtered_drive_ids = all_drive_ids
306335

307336
# Process users in parallel using ThreadPoolExecutor
308337
with ThreadPoolExecutor(max_workers=10) as executor:
309338
future_to_email = {
310339
executor.submit(
311-
self._impersonate_user_for_retrieval, email, is_slim, start, end
340+
self._impersonate_user_for_retrieval,
341+
email,
342+
is_slim,
343+
filtered_drive_ids,
344+
filtered_folder_ids,
345+
start,
346+
end,
312347
): email
313-
for email in self._all_org_emails
348+
for email in all_org_emails
314349
}
315350

316351
# Yield results as they complete
317352
for future in as_completed(future_to_email):
318353
yield from future.result()
319354

320-
remaining_folders = self._requested_folder_ids - self._retrieved_ids
355+
remaining_folders = (
356+
filtered_drive_ids | filtered_folder_ids
357+
) - self._retrieved_ids
321358
if remaining_folders:
322359
logger.warning(
323360
f"Some folders/drives were not retrieved. IDs: {remaining_folders}"

backend/danswer/connectors/google_utils/google_utils.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -105,7 +105,7 @@ def execute_paginated_retrieval(
105105
)()
106106
elif e.resp.status == 404 or e.resp.status == 403:
107107
if continue_on_404_or_403:
108-
logger.warning(f"Error executing request: {e}")
108+
logger.debug(f"Error executing request: {e}")
109109
results = {}
110110
else:
111111
raise e

web/src/app/admin/connectors/[connector]/pages/DynamicConnectorCreationForm.tsx

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -104,6 +104,7 @@ const RenderField: FC<RenderFieldProps> = ({
104104
type={field.type}
105105
label={label}
106106
name={field.name}
107+
isTextArea={true}
107108
/>
108109
)}
109110
</>

web/src/lib/connectors/connectors.tsx

Lines changed: 15 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -221,23 +221,28 @@ export const connectorConfigs: Record<
221221
},
222222
{
223223
type: "text",
224-
description:
225-
"Enter a comma separated list of the URLs of the shared drives to index. Leave blank to index all shared drives.",
224+
description: (currentCredential) => {
225+
return currentCredential?.credential_json?.google_tokens
226+
? "If you are a non-admin user authenticated using Google Oauth, you will need to specify the URLs for the shared drives you would like to index. Leaving this blank will NOT index any shared drives."
227+
: "Enter a comma separated list of the URLs for the shared drive you would like to index. Leave this blank to index all shared drives.";
228+
},
226229
label: "Shared Drive URLs",
227230
name: "shared_drive_urls",
228231
visibleCondition: (values) => values.include_shared_drives,
229232
optional: true,
230233
},
231234
{
232235
type: "checkbox",
233-
label: (currentCredential) =>
234-
currentCredential?.credential_json?.google_drive_tokens
236+
label: (currentCredential) => {
237+
return currentCredential?.credential_json?.google_tokens
235238
? "Include My Drive?"
236-
: "Include Everyone's My Drive?",
237-
description: (currentCredential) =>
238-
currentCredential?.credential_json?.google_drive_tokens
239+
: "Include Everyone's My Drive?";
240+
},
241+
description: (currentCredential) => {
242+
return currentCredential?.credential_json?.google_tokens
239243
? "This will allow Danswer to index everything in your My Drive."
240-
: "This will allow Danswer to index everything in everyone's My Drives.",
244+
: "This will allow Danswer to index everything in everyone's My Drives.";
245+
},
241246
name: "include_my_drives",
242247
optional: true,
243248
default: true,
@@ -250,15 +255,15 @@ export const connectorConfigs: Record<
250255
name: "my_drive_emails",
251256
visibleCondition: (values, currentCredential) =>
252257
values.include_my_drives &&
253-
!currentCredential?.credential_json?.google_drive_tokens,
258+
!currentCredential?.credential_json?.google_tokens,
254259
optional: true,
255260
},
256261
],
257262
advanced_values: [
258263
{
259264
type: "text",
260265
description:
261-
"Enter a comma separated list of the URLs of the folders located in Shared Drives to index. The files located in these folders (and all subfolders) will be indexed. Note: This will be in addition to the 'Include Shared Drives' and 'Shared Drive URLs' settings, so leave those blank if you only want to index the folders specified here.",
266+
"Enter a comma separated list of the URLs of any folders you would like to index. The files located in these folders (and all subfolders) will be indexed. Note: This will be in addition to whatever settings you have selected above, so leave those blank if you only want to index the folders specified here.",
262267
label: "Folder URLs",
263268
name: "shared_folder_urls",
264269
optional: true,

0 commit comments

Comments
 (0)