From 2cac481915ab396a4ffb29c542db62f84e3a3f5f Mon Sep 17 00:00:00 2001 From: saksham23467 <142910439+saksham23467@users.noreply.github.com> Date: Thu, 31 Jul 2025 17:57:26 +0530 Subject: [PATCH 1/2] Add Clones metric API (#2604) Signed-off-by: saksham23467 <142910439+saksham23467@users.noreply.github.com> --- augur/api/metrics/repo_meta.py | 56 ++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/augur/api/metrics/repo_meta.py b/augur/api/metrics/repo_meta.py index ffc8fc84ef..c39922e17b 100644 --- a/augur/api/metrics/repo_meta.py +++ b/augur/api/metrics/repo_meta.py @@ -1240,3 +1240,59 @@ def aggregate_summary(repo_group_id, repo_id=None, begin_date=None, end_date=Non results = pd.read_sql(summarySQL, conn, params={'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) return results + +@register_metric() +def clones(repo_group_id, repo_id=None, begin_date=None, end_date=None): + """ + Returns the number of repository clones (total and unique) for a given repo or repo group. + :param repo_group_id: The repository's repo_group_id + :param repo_id: The repository's repo_id, defaults to None + :param begin_date: Start date for filtering clone data (optional) + :param end_date: End date for filtering clone data (optional) + :return: DataFrame of clone counts (total and unique) per day + """ + if not begin_date: + begin_date = '1970-1-1 00:00:00' + if not end_date: + end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + if repo_id: + clones_sql = s.sql.text(""" + SELECT + repo_id, + clone_data_timestamp AS date, + count_clones AS total_clones, + unique_clones + FROM augur_data.repo_clones_data + WHERE repo_id = :repo_id + AND clone_data_timestamp BETWEEN :begin_date AND :end_date + ORDER BY clone_data_timestamp + """) + with current_app.engine.connect() as conn: + results = pd.read_sql(clones_sql, conn, params={ + 'repo_id': repo_id, + 'begin_date': begin_date, + 'end_date': end_date + }) + return results + else: + clones_sql = s.sql.text(""" + SELECT + repo_id, + clone_data_timestamp AS date, + count_clones AS total_clones, + unique_clones + FROM augur_data.repo_clones_data + WHERE repo_id IN ( + SELECT repo_id FROM augur_data.repo WHERE repo_group_id = :repo_group_id + ) + AND clone_data_timestamp BETWEEN :begin_date AND :end_date + ORDER BY repo_id, clone_data_timestamp + """) + with current_app.engine.connect() as conn: + results = pd.read_sql(clones_sql, conn, params={ + 'repo_group_id': repo_group_id, + 'begin_date': begin_date, + 'end_date': end_date + }) + return results From bb6d2a092971c292b2c3f363d114baa0457c782e Mon Sep 17 00:00:00 2001 From: saksham23467 <142910439+saksham23467@users.noreply.github.com> Date: Fri, 1 Aug 2025 15:59:20 +0530 Subject: [PATCH 2/2] fix: improve error handling for deleted repositories in GitHub move detection - Fix 404 error handling to gracefully handle deleted repositories instead of raising exceptions - Add proper task-level error handling with try-catch blocks - Enhance API call retry logic with better error handling - Add validation for repository URL parsing and API responses - Improve logging throughout the process for better monitoring - Mark deleted repositories as IGNORE in database instead of failing tasks Signed-off-by: saksham23467 <142910439+saksham23467@users.noreply.github.com> --- augur/tasks/github/detect_move/core.py | 37 ++++++++++++----- augur/tasks/github/detect_move/tasks.py | 54 ++++++++++++++++--------- 2 files changed, 62 insertions(+), 29 deletions(-) diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index b302a70a06..cf153f1acc 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -46,17 +46,32 @@ def extract_owner_and_repo_from_endpoint(key_auth, url, logger): def ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='core'): - owner, name = get_owner_repo(repo.repo_git) - url = f"https://api.github.com/repos/{owner}/{name}" + try: + owner, name = get_owner_repo(repo.repo_git) + url = f"https://api.github.com/repos/{owner}/{name}" + except Exception as e: + logger.error(f"Failed to parse repo URL {repo.repo_git}: {str(e)}") + raise Exception(f"Invalid repository URL format: {repo.repo_git}") attempts = 0 while attempts < 10: - response_from_gh = hit_api(key_auth, url, logger) + try: + response_from_gh = hit_api(key_auth, url, logger) + + if response_from_gh and response_from_gh.status_code != 404: + break - if response_from_gh and response_from_gh.status_code != 404: - break + attempts += 1 + except Exception as e: + logger.warning(f"API call attempt {attempts + 1} failed for {url}: {str(e)}") + attempts += 1 + if attempts >= 10: + raise Exception(f"Failed to get API response after {attempts} attempts: {str(e)}") - attempts += 1 + # Validate response + if not response_from_gh: + logger.error(f"No response received from GitHub API for {url}") + raise Exception(f"No response from GitHub API for {url}") #Update Url and retry if 301 #301 moved permanently @@ -79,9 +94,10 @@ def ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='c update_repo_with_dict(repo, repo_update_dict, logger) + logger.info(f"Repository {repo.repo_git} has moved to {repo_update_dict['repo_git']}. Updated repository URL and resetting collection.") raise Exception("ERROR: Repo has moved! Resetting Collection!") - #Mark as ignore if 404 + #Mark as ignore if 404 (repository deleted) if response_from_gh.status_code == 404: repo_update_dict = { 'repo_git': repo.repo_git, @@ -113,14 +129,15 @@ def ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='c collectionRecord.ml_task_id = None collectionRecord.ml_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') - session.commit() - raise Exception("ERROR: Repo has moved, and there is no redirection! 404 returned, not 301. Resetting Collection!") + + logger.warning(f"Repository {repo.repo_git} returned 404 (deleted). Marked as IGNORE and collection stopped.") + return # Return gracefully instead of raising exception if attempts >= 10: logger.error(f"Could not check if repo moved because the api timed out 10 times. Url: {url}") - raise Exception(f"ERROR: Could not get api response for repo: {url}") + raise Exception(f"ERROR: Could not get api response for repo: {url} after {attempts} attempts") #skip if not 404 logger.info(f"Repo found at url: {url}") diff --git a/augur/tasks/github/detect_move/tasks.py b/augur/tasks/github/detect_move/tasks.py index f542d89289..6491cb126f 100644 --- a/augur/tasks/github/detect_move/tasks.py +++ b/augur/tasks/github/detect_move/tasks.py @@ -14,17 +14,25 @@ def detect_github_repo_move_core(repo_git : str) -> None: logger.info(f"Starting repo_move operation with {repo_git}") - repo = get_repo_by_repo_git(repo_git) + try: + repo = get_repo_by_repo_git(repo_git) + + if not repo: + logger.error(f"Repository not found in database: {repo_git}") + return - logger.info(f"Pinging repo: {repo_git}") + logger.info(f"Pinging repo: {repo_git}") - key_auth = GithubRandomKeyAuth(logger) + key_auth = GithubRandomKeyAuth(logger) - with get_session() as session: - - #Ping each repo with the given repo_git to make sure - #that they are still in place. - ping_github_for_repo_move(session, key_auth, repo, logger) + with get_session() as session: + #Ping each repo with the given repo_git to make sure + #that they are still in place. + ping_github_for_repo_move(session, key_auth, repo, logger) + + except Exception as e: + logger.error(f"Error during repo move detection for {repo_git}: {str(e)}") + raise @celery.task(base=AugurSecondaryRepoCollectionTask) @@ -34,14 +42,22 @@ def detect_github_repo_move_secondary(repo_git : str) -> None: logger.info(f"Starting repo_move operation with {repo_git}") - repo = get_repo_by_repo_git(repo_git) - - logger.info(f"Pinging repo: {repo_git}") - - key_auth = GithubRandomKeyAuth(logger) - - with get_session() as session: - - #Ping each repo with the given repo_git to make sure - #that they are still in place. - ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='secondary') \ No newline at end of file + try: + repo = get_repo_by_repo_git(repo_git) + + if not repo: + logger.error(f"Repository not found in database: {repo_git}") + return + + logger.info(f"Pinging repo: {repo_git}") + + key_auth = GithubRandomKeyAuth(logger) + + with get_session() as session: + #Ping each repo with the given repo_git to make sure + #that they are still in place. + ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='secondary') + + except Exception as e: + logger.error(f"Error during repo move detection for {repo_git}: {str(e)}") + raise \ No newline at end of file