From 5452c0fd1041afd912ac52efabf10daa824a4576 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Wed, 30 Oct 2024 13:55:00 -0700 Subject: [PATCH 1/6] k --- .../miscellaneous_utils.py | 11 ++++- backend/danswer/connectors/gmail/connector.py | 49 +++++++++++++++++-- 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/backend/danswer/connectors/cross_connector_utils/miscellaneous_utils.py b/backend/danswer/connectors/cross_connector_utils/miscellaneous_utils.py index 1b79c4a4fa..8e8ea8d7d6 100644 --- a/backend/danswer/connectors/cross_connector_utils/miscellaneous_utils.py +++ b/backend/danswer/connectors/cross_connector_utils/miscellaneous_utils.py @@ -23,7 +23,16 @@ def datetime_to_utc(dt: datetime) -> datetime: def time_str_to_utc(datetime_str: str) -> datetime: - dt = parse(datetime_str) + try: + dt = parse(datetime_str) + except ValueError: + # Handle malformed timezone by attempting to fix common format issues + if "0000" in datetime_str: + # Convert "0000" to "+0000" for proper timezone parsing + fixed_dt_str = datetime_str.replace(" 0000", " +0000") + dt = parse(fixed_dt_str) + else: + raise return datetime_to_utc(dt) diff --git a/backend/danswer/connectors/gmail/connector.py b/backend/danswer/connectors/gmail/connector.py index 42d2f305f7..9c6e89c5fb 100644 --- a/backend/danswer/connectors/gmail/connector.py +++ b/backend/danswer/connectors/gmail/connector.py @@ -1,4 +1,8 @@ +import re +import time from base64 import urlsafe_b64decode +from datetime import datetime +from datetime import timezone from typing import Any from typing import cast from typing import Dict @@ -6,6 +10,7 @@ from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore from google.oauth2.service_account import Credentials as ServiceAccountCredentials # type: ignore from googleapiclient import discovery # type: ignore +from googleapiclient.errors import HttpError # type: ignore from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource @@ -34,6 +39,43 @@ logger = setup_logger() +def _execute_with_retry(request: Any) -> Any: + try: + return request.execute() + except HttpError as error: + if error.resp.status == 429: + # Attempt to get 'Retry-After' from headers + retry_after = error.resp.get("Retry-After") + if retry_after: + sleep_time = int(retry_after) + else: + # Extract 'Retry after' timestamp from error message + match = re.search( + r"Retry after (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z)", + str(error), + ) + if match: + retry_after_timestamp = match.group(1) + retry_after_dt = datetime.strptime( + retry_after_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ" + ) + sleep_time = max( + int( + ( + retry_after_dt - datetime.now(timezone.utc) + ).total_seconds() + ), + 0, + ) + else: + sleep_time = 60 + + logger.info(f"Rate limit exceeded. Sleeping for {sleep_time} seconds.") + time.sleep(sleep_time) + else: + raise + + class GmailConnector(LoadConnector, PollConnector): def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None: self.batch_size = batch_size @@ -156,7 +198,7 @@ def _fetch_mails_from_gmail( query = GmailConnector._build_time_range_query(time_range_start, time_range_end) service = discovery.build("gmail", "v1", credentials=self.creds) while page_token is not None: - result = ( + result = _execute_with_retry( service.users() .messages() .list( @@ -165,18 +207,17 @@ def _fetch_mails_from_gmail( q=query, maxResults=self.batch_size, ) - .execute() ) + page_token = result.get("nextPageToken") messages = result.get("messages", []) doc_batch = [] for message in messages: message_id = message["id"] - msg = ( + msg = _execute_with_retry( service.users() .messages() .get(userId="me", id=message_id, format="full") - .execute() ) doc = self._email_to_document(msg) doc_batch.append(doc) From 57b9fbee40826302ea7bd82d488291b0457a00c6 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Wed, 30 Oct 2024 16:42:21 -0700 Subject: [PATCH 2/6] k --- backend/danswer/connectors/gmail/connector.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/backend/danswer/connectors/gmail/connector.py b/backend/danswer/connectors/gmail/connector.py index 9c6e89c5fb..e0516e7ed5 100644 --- a/backend/danswer/connectors/gmail/connector.py +++ b/backend/danswer/connectors/gmail/connector.py @@ -58,13 +58,10 @@ def _execute_with_retry(request: Any) -> Any: retry_after_timestamp = match.group(1) retry_after_dt = datetime.strptime( retry_after_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ" - ) + ).replace(tzinfo=timezone.utc) + current_time = datetime.now(timezone.utc) sleep_time = max( - int( - ( - retry_after_dt - datetime.now(timezone.utc) - ).total_seconds() - ), + int((retry_after_dt - current_time).total_seconds()), 0, ) else: From 5366fb76bdf13248ecbbb360873f6442625f0dda Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Wed, 30 Oct 2024 17:22:21 -0700 Subject: [PATCH 3/6] k --- backend/danswer/connectors/gmail/connector.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/backend/danswer/connectors/gmail/connector.py b/backend/danswer/connectors/gmail/connector.py index e0516e7ed5..7848a4e065 100644 --- a/backend/danswer/connectors/gmail/connector.py +++ b/backend/danswer/connectors/gmail/connector.py @@ -65,12 +65,18 @@ def _execute_with_retry(request: Any) -> Any: 0, ) else: - sleep_time = 60 + logger.error( + f"No Retry-After header or timestamp found in error message: {error}" + ) + sleep_time = 600 # 10 minutes logger.info(f"Rate limit exceeded. Sleeping for {sleep_time} seconds.") time.sleep(sleep_time) - else: - raise + + # If it still fails, just raise to not be stuck in a loop + return request.execute() + + raise class GmailConnector(LoadConnector, PollConnector): From fc98593027214bc26f4806844a7108148d76db39 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Wed, 30 Oct 2024 18:07:39 -0700 Subject: [PATCH 4/6] k --- backend/danswer/connectors/gmail/connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/danswer/connectors/gmail/connector.py b/backend/danswer/connectors/gmail/connector.py index 7848a4e065..1eed1e5b4b 100644 --- a/backend/danswer/connectors/gmail/connector.py +++ b/backend/danswer/connectors/gmail/connector.py @@ -68,7 +68,9 @@ def _execute_with_retry(request: Any) -> Any: logger.error( f"No Retry-After header or timestamp found in error message: {error}" ) - sleep_time = 600 # 10 minutes + sleep_time = 900 # This is how much the sleep tends to be according to a few tests + + sleep_time += 5 # Add a buffer to be safe logger.info(f"Rate limit exceeded. Sleeping for {sleep_time} seconds.") time.sleep(sleep_time) From 8714fab16326d7c42fb56138f122b2014b3783bf Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Wed, 30 Oct 2024 20:16:09 -0700 Subject: [PATCH 5/6] k --- backend/danswer/connectors/gmail/connector.py | 75 +++++++++++-------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/backend/danswer/connectors/gmail/connector.py b/backend/danswer/connectors/gmail/connector.py index 1eed1e5b4b..d18ac3c54c 100644 --- a/backend/danswer/connectors/gmail/connector.py +++ b/backend/danswer/connectors/gmail/connector.py @@ -40,45 +40,54 @@ def _execute_with_retry(request: Any) -> Any: - try: - return request.execute() - except HttpError as error: - if error.resp.status == 429: - # Attempt to get 'Retry-After' from headers - retry_after = error.resp.get("Retry-After") - if retry_after: - sleep_time = int(retry_after) - else: - # Extract 'Retry after' timestamp from error message - match = re.search( - r"Retry after (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z)", - str(error), - ) - if match: - retry_after_timestamp = match.group(1) - retry_after_dt = datetime.strptime( - retry_after_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ" - ).replace(tzinfo=timezone.utc) - current_time = datetime.now(timezone.utc) - sleep_time = max( - int((retry_after_dt - current_time).total_seconds()), - 0, - ) + max_attempts = 10 + attempt = 0 + + while attempt < max_attempts: + try: + return request.execute() + except HttpError as error: + attempt += 1 + + if error.resp.status == 429: + # Attempt to get 'Retry-After' from headers + retry_after = error.resp.get("Retry-After") + if retry_after: + sleep_time = int(retry_after) else: - logger.error( - f"No Retry-After header or timestamp found in error message: {error}" + # Extract 'Retry after' timestamp from error message + match = re.search( + r"Retry after (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z)", + str(error), ) - sleep_time = 900 # This is how much the sleep tends to be according to a few tests + if match: + retry_after_timestamp = match.group(1) + retry_after_dt = datetime.strptime( + retry_after_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ" + ).replace(tzinfo=timezone.utc) + current_time = datetime.now(timezone.utc) + sleep_time = max( + int((retry_after_dt - current_time).total_seconds()), + 0, + ) + else: + logger.error( + f"No Retry-After header or timestamp found in error message: {error}" + ) + sleep_time = 60 - sleep_time += 5 # Add a buffer to be safe + sleep_time += 3 # Add a buffer to be safe - logger.info(f"Rate limit exceeded. Sleeping for {sleep_time} seconds.") - time.sleep(sleep_time) + logger.info( + f"Rate limit exceeded. Attempt {attempt}/{max_attempts}. Sleeping for {sleep_time} seconds." + ) + time.sleep(sleep_time) - # If it still fails, just raise to not be stuck in a loop - return request.execute() + else: + raise - raise + # If we've exhausted all attempts + raise Exception(f"Failed to execute request after {max_attempts} attempts") class GmailConnector(LoadConnector, PollConnector): From f733ce3791b47cb8aaf74fab07f9c96f83580fbe Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Wed, 30 Oct 2024 20:20:50 -0700 Subject: [PATCH 6/6] k --- backend/danswer/connectors/gmail/connector.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backend/danswer/connectors/gmail/connector.py b/backend/danswer/connectors/gmail/connector.py index d18ac3c54c..376f03e71d 100644 --- a/backend/danswer/connectors/gmail/connector.py +++ b/backend/danswer/connectors/gmail/connector.py @@ -44,6 +44,13 @@ def _execute_with_retry(request: Any) -> Any: attempt = 0 while attempt < max_attempts: + # Note for reasons unknown, the Google API will sometimes return a 429 + # and even after waiting the retry period, it will return another 429. + # It could be due to a few possibilities: + # 1. Other things are also requesting from the Gmail API with the same key + # 2. It's a rolling rate limit so the moment we get some amount of requests cleared, we hit it again very quickly + # 3. The retry-after has a maximum and we've already hit the limit for the day + # or it's something else... try: return request.execute() except HttpError as error: