From 2430868e2a583a177fbd27b56f4b10dd0e66e820 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 18:17:04 +0100 Subject: [PATCH 01/33] Refactor: Simplify Path handling --- src/nplinker/genomics/antismash/antismash_downloader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index a02728f4..a95483ed 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -42,8 +42,7 @@ def download_and_extract_antismash_data( >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted") """ download_root = Path(download_root) - extract_root = Path(extract_root) - extract_path = extract_root / "antismash" / antismash_id + extract_path = Path(extract_root) / "antismash" / antismash_id try: if extract_path.exists(): From 1f072467054a02defa9dfb5c767f861fba712aac Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 18:17:42 +0100 Subject: [PATCH 02/33] Refactor: Improve extract path handling to ensure that non-empty dirs are not deleted --- .../genomics/antismash/antismash_downloader.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index a95483ed..8f4fe5b7 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -44,12 +44,12 @@ def download_and_extract_antismash_data( download_root = Path(download_root) extract_path = Path(extract_root) / "antismash" / antismash_id - try: - if extract_path.exists(): - _check_extract_path(extract_path) - else: - extract_path.mkdir(parents=True, exist_ok=True) + if extract_path.exists(): + _check_extract_path(extract_path) + else: + 
extract_path.mkdir(parents=True, exist_ok=True) + try: for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: url = base_url.format(antismash_id, antismash_id + ".zip") download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip") From 8d7238c9cbd4007ea96d7f376fb7f283570c5d97 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 18:18:16 +0100 Subject: [PATCH 03/33] Refactor: Extract file cleanup logic into a separate function for better readability --- .../antismash/antismash_downloader.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 8f4fe5b7..93f22fc3 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -55,15 +55,7 @@ def download_and_extract_antismash_data( download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip") break - # delete subdirs - for subdir_path in list_dirs(extract_path): - shutil.rmtree(subdir_path) - - # delete unnecessary files - files_to_keep = list_files(extract_path, suffix=(".json", ".gbk")) - for file in list_files(extract_path): - if file not in files_to_keep: - os.remove(file) + _cleanup_extracted_files(extract_path) logger.info("antiSMASH BGC data of %s is downloaded and extracted.", antismash_id) @@ -77,3 +69,15 @@ def _check_extract_path(extract_path: Path): # check if extract_path is empty if any(extract_path.iterdir()): raise ValueError(f'Nonempty directory: "{extract_path}"') + + +def _cleanup_extracted_files(extract_path: str | PathLike) -> None: + # delete subdirs + for subdir_path in list_dirs(extract_path): + shutil.rmtree(subdir_path) + + # delete unnecessary files + files_to_keep = list_files(extract_path, suffix=(".json", ".gbk")) + for file in list_files(extract_path): + if file not in files_to_keep: + 
os.remove(file) From d5e64eb57bb6948ce07315438fd838f38eb84454 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 18:19:35 +0100 Subject: [PATCH 04/33] Refactor: Move extract_path preparation logic into a separate function for better readability --- .../genomics/antismash/antismash_downloader.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 93f22fc3..f2397ea8 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -44,10 +44,7 @@ def download_and_extract_antismash_data( download_root = Path(download_root) extract_path = Path(extract_root) / "antismash" / antismash_id - if extract_path.exists(): - _check_extract_path(extract_path) - else: - extract_path.mkdir(parents=True, exist_ok=True) + _prepare_extract_path(extract_path) try: for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: @@ -81,3 +78,10 @@ def _cleanup_extracted_files(extract_path: str | PathLike) -> None: for file in list_files(extract_path): if file not in files_to_keep: os.remove(file) + + +def _prepare_extract_path(extract_path: str | PathLike) -> None: + if extract_path.exists(): + _check_extract_path(extract_path) + else: + extract_path.mkdir(parents=True, exist_ok=True) From 66f13d6d57639e87036b684a59be6e6d183b7ad3 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 19:17:41 +0100 Subject: [PATCH 05/33] Refactor: Separate genome assembly resolution and antiSMASH data retrieval into distinct functions for improved clarity --- .../antismash/podp_antismash_downloader.py | 92 ++++++++++++++----- 1 file changed, 69 insertions(+), 23 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 84e3cee4..5b9ded97 100644 ---
a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -179,33 +179,18 @@ def podp_download_and_extract_antismash_data( logger.info(f"Genome ID {raw_genome_id} skipped due to previous failed attempt") continue - # if not downloaded or lookup attempted, then try to resolve the ID - # and download - logger.info(f"Start lookup process for genome ID {raw_genome_id}") - gs_obj.resolved_refseq_id = _resolve_refseq_id(genome_id_data) - gs_obj.resolve_attempted = True - - if gs_obj.resolved_refseq_id == "": - # give up on this one - logger.warning(f"Failed lookup for genome ID {raw_genome_id}") + # resolve genome ID + try: + get_genome_assembly_accession(gs, genome_record["genome_ID"]) + except Exception as e: + logger.warning(f"Failed to resolve genome ID {gs.original_id}. Error: {e}") continue - # if resolved id is valid, try to download and extract antismash data + # retrieve antismash BGC data from antiSMASH-DB try: - download_and_extract_antismash_data( - gs_obj.resolved_refseq_id, project_download_root, project_extract_root - ) - - gs_obj.bgc_path = str( - Path(project_download_root, gs_obj.resolved_refseq_id + ".zip").absolute() - ) - - output_path = Path(project_extract_root, "antismash", gs_obj.resolved_refseq_id) - if output_path.exists(): - Path.touch(output_path / "completed", exist_ok=True) - + retrieve_antismash_db_data(gs, project_download_root, project_extract_root) except Exception: - gs_obj.bgc_path = "" + continue # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] @@ -247,6 +232,67 @@ def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | Non return best_id +def get_genome_assembly_accession( + genome_status: GenomeStatus, genome_id_data: Mapping[str, str] +) -> None: + """Resolve and update the genome assembly accession for a given genome status. 
+ + This function attempts to resolve the RefSeq ID for the provided genome record + and updates the `genome_status` object with the resolved ID. It also sets the + `resolve_attempted` flag to `True` to indicate that an attempt to resolve the + RefSeq ID has been made. If the resolution fails, raises a RuntimeError and leaves + the `resolved_refseq_id` empty. + + Args: + genome_status (GenomeStatus): An object representing the status of the genome, + which will be updated with the resolved RefSeq ID. + genome_id_data (Mapping[str, str]): A dictionary containing genome + information, where keys like "RefSeq_accession", "GenBank_accession", + or "JGI_Genome_ID" are used to resolve the RefSeq ID. + + Warnings: + Logs a warning if the RefSeq ID cannot be resolved. + """ + genome_status.resolved_refseq_id = _resolve_refseq_id(genome_id_data) + genome_status.resolve_attempted = True + + if genome_status.resolved_refseq_id == "": + raise RuntimeError("Failed to get genome assembly accession") + + +def retrieve_antismash_db_data( + genome_status: GenomeStatus, download_root: str | PathLike, extract_root: str | PathLike +) -> None: + """Retrieve antiSMASH database data for a given genome and update its status. + + This function downloads and extracts antiSMASH data for a genome identified + by its resolved RefSeq ID. It updates the `genome_status` object with the + path to the downloaded data or sets it to an empty string if an error occurs. + + Args: + genome_status (GenomeStatus): An object representing the genome's status, + including its resolved RefSeq ID and BGC path. + download_root (str | PathLike): The root directory where the antiSMASH + data will be downloaded. + extract_root (str | PathLike): The root directory where the antiSMASH + data will be extracted. + + Raises: + Exception: If an error occurs during the download or extraction process. 
+ """ + antismash_id = genome_status.resolved_refseq_id + extract_path = Path(extract_root, "antismash", antismash_id) + download_path = Path(download_root, f"{antismash_id}.zip").absolute() + + try: + download_and_extract_antismash_data(antismash_id, download_root, extract_root) + Path.touch(extract_path / "completed", exist_ok=True) + genome_status.bgc_path = str(download_path) + except Exception as e: + genome_status.bgc_path = "" + raise e + + def _resolve_genbank_accession(genbank_id: str) -> str: """Try to get RefSeq assembly id through given GenBank assembly id. From 706f8419b32c0929f721b8b0c59feec8fa3bba64 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 19:18:06 +0100 Subject: [PATCH 06/33] Refactor: Improve genome ID handling and logging --- .../antismash/podp_antismash_downloader.py | 36 +++++++++---------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 5b9ded97..744025a2 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -153,30 +153,26 @@ def podp_download_and_extract_antismash_data( gs_dict = GenomeStatus.read_json(gs_file) for i, genome_record in enumerate(genome_records): - # get the best available ID from the dict - genome_id_data = genome_record["genome_ID"] - raw_genome_id = get_best_available_genome_id(genome_id_data) - if raw_genome_id is None or len(raw_genome_id) == 0: - logger.warning(f'Invalid input genome record "{genome_record}"') - continue + logger.info( + f"Getting antismash BGC data for genome record {i + 1} of {len(genome_records)}." 
+ ) - # check if genome ID exist in the genome status file - if raw_genome_id not in gs_dict: - gs_dict[raw_genome_id] = GenomeStatus(raw_genome_id) + # get the best available genome ID from the dict + original_genome_id = get_best_available_genome_id(genome_record["genome_ID"]) + if not original_genome_id: + logger.warning(f"Skipping invalid genome record: {genome_record}") + continue - gs_obj = gs_dict[raw_genome_id] + # Retrieve or initialize the GenomeStatus object for the genome ID + gs = gs_dict.setdefault(original_genome_id, GenomeStatus(original_genome_id)) - logger.info( - f"Checking for antismash data {i + 1}/{len(genome_records)}, " - f"current genome ID={raw_genome_id}" - ) - # first, check if BGC data is downloaded - if gs_obj.bgc_path and Path(gs_obj.bgc_path).exists(): - logger.info(f"Genome ID {raw_genome_id} already downloaded to {gs_obj.bgc_path}") + # Skip genome if BGC data is downloaded + if gs.bgc_path and Path(gs.bgc_path).exists(): + logger.info(f"Genome ID {original_genome_id} already downloaded to {gs.bgc_path}") continue - # second, check if lookup attempted previously - if gs_obj.resolve_attempted: - logger.info(f"Genome ID {raw_genome_id} skipped due to previous failed attempt") + # Skip genome if lookup attempted previously + if gs.resolve_attempted: + logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt") continue # resolve genome ID From b25506e7d9501df7245a152b6094411364ef4213 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 19:20:18 +0100 Subject: [PATCH 07/33] Refactor: Move logging for antiSMASH data retrieval errors and success messages --- src/nplinker/genomics/antismash/antismash_downloader.py | 3 --- .../genomics/antismash/podp_antismash_downloader.py | 7 +++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index f2397ea8..07662107 100644 --- 
a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -54,11 +54,8 @@ def download_and_extract_antismash_data( _cleanup_extracted_files(extract_path) - logger.info("antiSMASH BGC data of %s is downloaded and extracted.", antismash_id) - except Exception as e: shutil.rmtree(extract_path) - logger.warning(e) raise e diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 744025a2..04189f87 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -185,8 +185,11 @@ def podp_download_and_extract_antismash_data( # retrieve antismash BGC data from antiSMASH-DB try: retrieve_antismash_db_data(gs, project_download_root, project_extract_root) - except Exception: - continue + logger.info(f"antiSMASH BGC data for {gs.original_id} is downloaded and extracted") + except Exception as e: + logger.warning( + f"Failed to retrieve BGC data from antiSMASH-DB for {gs.original_id}. 
Error: {e}" + ) # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] From 2ed8ac6cf158cc2721c0caa7fb01e70914abe3ad Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 19:29:05 +0100 Subject: [PATCH 08/33] simplify comment --- src/nplinker/genomics/antismash/podp_antismash_downloader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 04189f87..0cdea2fe 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -166,11 +166,10 @@ def podp_download_and_extract_antismash_data( # Retrieve or initialize the GenomeStatus object for the genome ID gs = gs_dict.setdefault(original_genome_id, GenomeStatus(original_genome_id)) - # Skip genome if BGC data is downloaded + # Skip genomes if gs.bgc_path and Path(gs.bgc_path).exists(): logger.info(f"Genome ID {original_genome_id} already downloaded to {gs.bgc_path}") continue - # Skip genome if lookup attempted previously if gs.resolve_attempted: logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt") continue From a1e49620a2da8ab7cd00c3864b8e77e3c54e7337 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 19:30:48 +0100 Subject: [PATCH 09/33] Enhance logging messages in antiSMASH data retrieval --- .../genomics/antismash/podp_antismash_downloader.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 0cdea2fe..aadf1eb4 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -162,13 +162,15 @@ def 
podp_download_and_extract_antismash_data( if not original_genome_id: logger.warning(f"Skipping invalid genome record: {genome_record}") continue - # Retrieve or initialize the GenomeStatus object for the genome ID gs = gs_dict.setdefault(original_genome_id, GenomeStatus(original_genome_id)) # Skip genomes if gs.bgc_path and Path(gs.bgc_path).exists(): - logger.info(f"Genome ID {original_genome_id} already downloaded to {gs.bgc_path}") + logger.info( + f"antiSMASH BGC data for genome ID {original_genome_id} already downloaded to " + f"{gs.bgc_path}" + ) continue if gs.resolve_attempted: logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt") @@ -184,7 +186,9 @@ def podp_download_and_extract_antismash_data( # retrieve antismash BGC data from antiSMASH-DB try: retrieve_antismash_db_data(gs, project_download_root, project_extract_root) - logger.info(f"antiSMASH BGC data for {gs.original_id} is downloaded and extracted") + logger.info( + f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" + ) except Exception as e: logger.warning( f"Failed to retrieve BGC data from antiSMASH-DB for {gs.original_id}. 
Error: {e}" From e8dd391e56ea97b138648e755247823925b3f61e Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 19:38:55 +0100 Subject: [PATCH 10/33] test: adapt test to changed logging info message --- tests/unit/genomics/test_podp_antismash_downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/genomics/test_podp_antismash_downloader.py b/tests/unit/genomics/test_podp_antismash_downloader.py index 7fb9ec0c..302d354d 100644 --- a/tests/unit/genomics/test_podp_antismash_downloader.py +++ b/tests/unit/genomics/test_podp_antismash_downloader.py @@ -218,7 +218,7 @@ def test_caching(download_root, extract_root, genome_status_file, caplog): assert genome_obj.resolve_attempted podp_download_and_extract_antismash_data(genome_records, download_root, extract_root) assert ( - f"Genome ID {genome_obj.original_id} already downloaded to {genome_obj.bgc_path}" + f"antiSMASH BGC data for genome ID {genome_obj.original_id} already downloaded to {genome_obj.bgc_path}" in caplog.text ) assert ( From ea159ca824a5d8bf701c2fb9ab03c3ededff0a59 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 12:06:02 +0100 Subject: [PATCH 11/33] Feat: Add antiSMASH API functionality --- src/nplinker/genomics/antismash/__init__.py | 12 +- .../antismash/antismash_api_client.py | 97 ++++++++++++++++ .../antismash/antismash_downloader.py | 97 ++++++++++++---- .../genomics/antismash/ncbi_downloader.py | 107 ++++++++++++++++++ .../antismash/podp_antismash_downloader.py | 72 ++++++++++-- .../genomics/test_antismash_downloader.py | 10 +- 6 files changed, 362 insertions(+), 33 deletions(-) create mode 100644 src/nplinker/genomics/antismash/antismash_api_client.py create mode 100644 src/nplinker/genomics/antismash/ncbi_downloader.py diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index e126f548..4817e806 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ 
b/src/nplinker/genomics/antismash/__init__.py @@ -1,16 +1,24 @@ -from .antismash_downloader import download_and_extract_antismash_data +from .antismash_api_client import antismash_job_is_done +from .antismash_api_client import submit_antismash_job +from .antismash_downloader import download_and_extract_from_antismash_api +from .antismash_downloader import download_and_extract_from_antismash_db from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank +from .ncbi_downloader import download_and_extract_ncbi_genome from .podp_antismash_downloader import GenomeStatus from .podp_antismash_downloader import get_best_available_genome_id from .podp_antismash_downloader import podp_download_and_extract_antismash_data __all__ = [ - "download_and_extract_antismash_data", + "download_and_extract_from_antismash_api", + "download_and_extract_from_antismash_db", "AntismashBGCLoader", "parse_bgc_genbank", "GenomeStatus", "get_best_available_genome_id", "podp_download_and_extract_antismash_data", + "download_and_extract_ncbi_genome", + "submit_antismash_job", + "antismash_job_is_done", ] diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py new file mode 100644 index 00000000..32b932c0 --- /dev/null +++ b/src/nplinker/genomics/antismash/antismash_api_client.py @@ -0,0 +1,97 @@ +from __future__ import annotations +import logging +from os import PathLike +from pathlib import Path +from typing import Optional +import requests + + +logger = logging.getLogger(__name__) + + +def submit_antismash_job(genbank_filepath: str | PathLike) -> Optional[str]: + """Submits an antiSMASH job using the provided GenBank file. + + This function sends a GenBank file to the antiSMASH API + and retrieves the job ID if the submission is successful. + + Args: + genbank_filepath (str | PathLike): The path to the GenBank file to be submitted. 
+ + Returns: + Optional[str]: The job ID if the submission is successful, or None if it fails. + + Raises: + requests.exceptions.RequestException: If there is an issue with the HTTP request. + RuntimeError: If the API response does not contain a job ID. + """ + url = "https://antismash.secondarymetabolites.org/api/v1.0/submit" + genbank_filepath = Path(genbank_filepath) + + with open(genbank_filepath, "rb") as file: + files = {"seq": file} + response = requests.post(url, files=files) + response.raise_for_status() # Raise an exception for HTTP errors + + data = response.json() + if "id" not in data: + raise RuntimeError("No antiSMASH job ID returned") + return data["id"] + + +def query_antismash_job(job_id: str) -> Optional[dict]: + """Gets the status of an antiSMASH job. + + Args: + job_id (str): The job ID to query. + + Returns: + dict: The response JSON if successful, otherwise None. + """ + url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}" + + try: + response = requests.get(url, timeout=10) + response.raise_for_status() # Raise an exception for HTTP errors + return response.json() + + except requests.exceptions.RequestException as req_err: + logger.error(f"Request failed for job_id {job_id}: {req_err}") + except ValueError as json_err: # Handles JSON decoding errors + logger.error(f"Invalid JSON response for job_id {job_id}: {json_err}") + except Exception as err: + logger.error(f"Unexpected error while getting job state for job_id {job_id}: {err}") + + +def antismash_job_is_done(job_id: str) -> bool: + """Checks if the antiSMASH job is complete by polling the job status. + + Args: + job_id (str): The job ID to query. + + Returns: + bool: True if the job is done, False if the job is still running. + + Raises: + RuntimeError: If the job status could not be retrieved or if the job failed. + ValueError: If the job state is missing or unexpected in the response. 
+ """ + response = query_antismash_job(job_id) + + if response is None: + raise RuntimeError(f"Failed to retrieve job status for job_id {job_id}") + if "state" not in response: + raise ValueError(f"Job state missing in response for job_id: {job_id}") + + job_state = response["state"] + if job_state in ("running", "queued"): + return False + if job_state == "done": + return True + if job_state == "failed": + job_status = response.get("status", "No error message provided") + raise RuntimeError(f"AntiSMASH job {job_id} failed with an error: {job_status}") + else: + raise ValueError( + f"Unexpected job state for antismash job ID {job_id}. Job state: {job_state}" + ) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 07662107..928001c9 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -4,6 +4,7 @@ import shutil from os import PathLike from pathlib import Path +import requests from nplinker.utils import download_and_extract_archive from nplinker.utils import list_dirs from nplinker.utils import list_files @@ -15,50 +16,108 @@ ANTISMASH_DB_DOWNLOAD_URL = "https://antismash-db.secondarymetabolites.org/output/{}/{}" # The antiSMASH DBV2 is for the availability of the old version, better to keep it. ANTISMASH_DBV2_DOWNLOAD_URL = "https://antismash-dbv2.secondarymetabolites.org/output/{}/{}" +# antismash api to download results from submitted jobs +ANTISMASH_API_DOWNLOAD_URL = "https://antismash.secondarymetabolites.org/upload/{}/{}" def download_and_extract_antismash_data( - antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike + url: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike ) -> None: """Download and extract antiSMASH BGC archive for a specified genome. 
- The antiSMASH database (https://antismash-db.secondarymetabolites.org/) - is used to download the BGC archive. And antiSMASH use RefSeq assembly id - of a genome as the id of the archive. + This function downloads a BGC archive from the specified URL, extracts its contents, + and organizes the extracted files into a structured directory under the given `extract_root`. Args: - antismash_id: The id used to download BGC archive from antiSMASH database. - If the id is versioned (e.g., "GCF_004339725.1") please be sure to - specify the version as well. - download_root: Path to the directory to place downloaded archive in. - extract_root: Path to the directory data files will be extracted to. + url (str): The URL to download the BGC archive from. + antismash_id (str): The identifier for the antiSMASH genome, used to name the extraction directory. + download_root: Path to the directory where the downloaded archive will be stored. + extract_root: Path to the directory where the data files will be extracted. Note that an `antismash` directory will be created in the specified `extract_root` if it doesn't exist. The files will be extracted to `/antismash/` directory. Raises: - ValueError: if `/antismash/` dir is not empty. + ValueError: if `/antismash/` dir is not empty. + Exception: If any error occurs during the download or extraction process, the partially extracted + directory will be cleaned up, and the exception will be re-raised. 
Examples: - >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted") + >>> download_and_extract_antismash_data( + "https://antismash-db.secondarymetabolites.org/output/GCF_001.1/GCF_001.1.zip", + "GCF_001.1", + "/data/download", + "/data/extracted" + ) """ - download_root = Path(download_root) extract_path = Path(extract_root) / "antismash" / antismash_id _prepare_extract_path(extract_path) - try: - for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: - url = base_url.format(antismash_id, antismash_id + ".zip") - download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip") - break - + download_and_extract_archive(url, download_root, extract_path, f"{antismash_id}.zip") _cleanup_extracted_files(extract_path) - except Exception as e: shutil.rmtree(extract_path) raise e +def download_and_extract_from_antismash_api( + job_id: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike +) -> None: + """Downloads and extracts results from an antiSMASH API job. + + This function constructs the download URL using the provided job ID then + downloads the results as a ZIP file and extracts its contents to the specified directories. + + Args: + antismash_id (str): The unique identifier for the antiSMASH dataset. + job_id (str): The job ID for the antiSMASH API job. + download_root (str or PathLike): The root directory where the ZIP file will be downloaded. + extract_root (str or PathLike): The root directory where the contents of the ZIP file will be extracted. + + Raises: + requests.exceptions.RequestException: If there is an issue with the HTTP request. + zipfile.BadZipFile: If the downloaded file is not a valid ZIP file. + OSError: If there is an issue with file operations such as writing or extracting. 
+ """ + url = ANTISMASH_API_DOWNLOAD_URL.format(job_id, antismash_id + ".zip") + download_and_extract_antismash_data(url, antismash_id, download_root, extract_root) + + +def download_and_extract_from_antismash_db( + refseq_acc: str, download_root: str | PathLike, extract_root: str | PathLike +) -> None: + """Download and extract antiSMASH BGC archive for a specified genome. + + The antiSMASH database (https://antismash-db.secondarymetabolites.org/) + is used to download the BGC archive. And antiSMASH use RefSeq assembly id + of a genome as the id of the archive. + + Args: + refseq_acc: The id used to download BGC archive from antiSMASH database. + If the id is versioned (e.g., "GCF_004339725.1") please be sure to + specify the version as well. + download_root: Path to the directory to place downloaded archive in. + extract_root: Path to the directory data files will be extracted to. + Note that an `antismash` directory will be created in the specified `extract_root` if + it doesn't exist. The files will be extracted to `/antismash/` directory. + + Raises: + ValueError: if `/antismash/` dir is not empty. 
+ + Examples: + >>> download_and_extract_from_antismash_db("GCF_004339725.1", "/data/download", "/data/extracted") + """ + for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: + url = base_url.format(refseq_acc, f"{refseq_acc}.zip") + if requests.head(url).status_code == 404: # not found + continue + download_and_extract_antismash_data(url, refseq_acc, download_root, extract_root) + return # Exit the loop once a valid URL is processed + + # if both urls give 404 not found + raise RuntimeError(f"No results in antiSMASH DB for {refseq_acc}") + + def _check_extract_path(extract_path: Path): # check if extract_path is empty if any(extract_path.iterdir()): diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py new file mode 100644 index 00000000..b0b333d5 --- /dev/null +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -0,0 +1,107 @@ +import logging +import os +import shutil +import time +from os import PathLike +from pathlib import Path +from typing import Optional +from nplinker.utils import check_md5 +from nplinker.utils import download_url +from nplinker.utils import extract_archive + + +logger = logging.getLogger(__name__) + + +def download_and_extract_ncbi_genome( + refseq_id: str, + download_root: str | PathLike, + extract_root: str | PathLike, + max_attempts: int = 10, +) -> Optional[Path]: + """Downloads and extracts an NCBI dataset for a given genome refseq ID. + + This function attempts to download a dataset from the NCBI database using + the provided refseq ID. It retries the download process up to a maximum + number of times if any errors occur. The function verifies the integrity + of the downloaded files using MD5 checksums and moves and renames the + GenBank files upon successful verification. + + Args: + refseq_id (str): The refseq ID for the dataset to be downloaded. + download_root (str or Path): The root directory where the dataset will be downloaded. 
+ extract_root (str or Path): The root directory where the dataset will be extracted. + max_attempts (int): The maximum number of times to attempt downloading. + + Returns: + Path: The path to the extracted dataset if successful, otherwise None. + + Raises: + Exception: If the maximum number of retries is reached and the dataset could + not be successfully downloaded and extracted. + """ + url = ( + "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/" + f"{refseq_id}/download?include_annotation_type=GENOME_GB" + ) + + download_root = Path(download_root) + extract_path = Path(extract_root) / "ncbi_genomes" + filename = f"ncbi_{refseq_id}.zip" + + extract_path.mkdir(parents=True, exist_ok=True) + + for attempt in range(1, max_attempts + 1): + try: + download_url(url, download_root, filename) + archive = download_root / filename + break + except Exception as e: + logger.warning(f"Attempt {attempt}/{max_attempts} failed to download {url}. Error: {e}") + if attempt < max_attempts: + time.sleep(2) + else: + raise RuntimeError( + f"Maximum download retries ({max_attempts}) reached for {url}. Download failed." + ) + + extract_archive(archive, extract_path) + verify_ncbi_dataset_md5_sums(extract_path) + + # Move and rename GenBank file + genbank_path = extract_path / "ncbi_dataset" / "data" / refseq_id / "genomic.gbff" + new_genbank_path = extract_path / f"{refseq_id}.gbff" + genbank_path.rename(new_genbank_path) + + # Delete unnecessary files + shutil.rmtree(extract_path / "ncbi_dataset") + os.remove(extract_path / "md5sum.txt") + os.remove(extract_path / "README.md") + + return new_genbank_path + + +def verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: + """Verify the integrity of files in a specified directory using MD5 checksums. + + This function reads an "md5sum.txt" file located in the given extraction path, + which contains MD5 checksums and corresponding file names. 
It then computes + the MD5 checksum for each file and compares it with the expected value. If any + file's checksum does not match, a `ValueError` is raised. + + Args: + extract_path (PathLike): Path to the directory containing the files and + the "md5sum.txt" file. + + Returns: + bool: True if all files pass the MD5 checksum verification. + + Raises: + ValueError: If the MD5 checksum of any file does not match the expected value. + """ + with open(extract_path / "md5sum.txt", "r") as f: + for line in f: + md5sum, file_name = line.strip().split() + file_path = extract_path / file_name + if not check_md5(file_path, md5sum): + raise ValueError(f"MD5 checksum mismatch for {file_path}") diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index aadf1eb4..2d5f727c 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -2,6 +2,7 @@ import json import logging import re +import time import warnings from collections.abc import Mapping from collections.abc import Sequence @@ -11,7 +12,11 @@ from bs4 import BeautifulSoup from jsonschema import validate from nplinker.defaults import GENOME_STATUS_FILENAME -from nplinker.genomics.antismash import download_and_extract_antismash_data +from nplinker.genomics.antismash import antismash_job_is_done +from nplinker.genomics.antismash import download_and_extract_from_antismash_api +from nplinker.genomics.antismash import download_and_extract_from_antismash_db +from nplinker.genomics.antismash import download_and_extract_ncbi_genome +from nplinker.genomics.antismash import submit_antismash_job from nplinker.schemas import GENOME_STATUS_SCHEMA @@ -189,11 +194,33 @@ def podp_download_and_extract_antismash_data( logger.info( f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" ) + continue except Exception as e: logger.warning( f"Failed to 
retrieve BGC data from antiSMASH-DB for {gs.original_id}. Error: {e}" ) + # retrieve antismash BGC by submitting antismash job via API + try: + logger.info( + f"Downloading genome and submitting antiSMASH job for genome ID {gs.original_id}." + ) + genome_path = download_and_extract_ncbi_genome( + gs.resolved_refseq_id, project_download_root, project_extract_root + ) + job_id = submit_antismash_job(genome_path) + while antismash_job_is_done(job_id) is False: + time.sleep(15) + retrieve_antismash_job_data(job_id, gs, project_download_root, project_extract_root) + logger.info( + f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" + ) + continue + except Exception as e: + logger.warning( + f"Failed to retrieve BGC data by submitting a antiSMASH job for genome ID {gs.original_id}. Error: {e}" + ) + # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] if failed_ids: @@ -286,13 +313,42 @@ def retrieve_antismash_db_data( extract_path = Path(extract_root, "antismash", antismash_id) download_path = Path(download_root, f"{antismash_id}.zip").absolute() - try: - download_and_extract_antismash_data(antismash_id, download_root, extract_root) - Path.touch(extract_path / "completed", exist_ok=True) - genome_status.bgc_path = str(download_path) - except Exception as e: - genome_status.bgc_path = "" - raise e + download_and_extract_from_antismash_db(antismash_id, download_root, extract_root) + Path.touch(extract_path / "completed", exist_ok=True) + genome_status.bgc_path = str(download_path) + + +def retrieve_antismash_job_data( + job_id: str, + genome_status: GenomeStatus, + download_root: str | PathLike, + extract_root: str | PathLike, +) -> None: + """Retrieve antiSMASH API data for a given genome and update its status. + + This function downloads and extracts antiSMASH data for a genome identified + by its resolved RefSeq ID. 
It updates the `genome_status` object with the + path to the downloaded data or sets it to an empty string if an error occurs. + + Args: + job_id (str): The job ID for the antiSMASH API job. + genome_status (GenomeStatus): An object representing the genome's status, + including its resolved RefSeq ID and BGC path. + download_root (str | PathLike): The root directory where the antiSMASH + data will be downloaded. + extract_root (str | PathLike): The root directory where the antiSMASH + data will be extracted. + + Raises: + Exception: If an error occurs during the download or extraction process. + """ + antismash_id = genome_status.resolved_refseq_id + extract_path = Path(extract_root, "antismash", antismash_id) + download_path = Path(download_root, f"{antismash_id}.zip").absolute() + + download_and_extract_from_antismash_api(job_id, antismash_id, download_root, extract_root) + Path.touch(extract_path / "completed", exist_ok=True) + genome_status.bgc_path = str(download_path) def _resolve_genbank_accession(genbank_id: str) -> str: diff --git a/tests/unit/genomics/test_antismash_downloader.py b/tests/unit/genomics/test_antismash_downloader.py index 1dfeb4cf..651629c2 100644 --- a/tests/unit/genomics/test_antismash_downloader.py +++ b/tests/unit/genomics/test_antismash_downloader.py @@ -1,5 +1,5 @@ import pytest -from nplinker.genomics.antismash import download_and_extract_antismash_data +from nplinker.genomics.antismash import download_and_extract_from_antismash_db from nplinker.utils import extract_archive from nplinker.utils import list_files @@ -14,7 +14,7 @@ def test_default(self, tmp_path): extract_root.mkdir() original_extract_root = tmp_path / "original" original_extract_root.mkdir() - download_and_extract_antismash_data(self.antismash_id, download_root, extract_root) + download_and_extract_from_antismash_db(self.antismash_id, download_root, extract_root) archive = download_root / "GCF_004339725.1.zip" extracted_folder = extract_root / "antismash" / 
"GCF_004339725.1" extracted_files = list_files(extracted_folder, keep_parent=False) @@ -32,7 +32,9 @@ def test_error_nonempty_path(self, tmp_path): nonempty_path = tmp_path / "extracted" / "antismash" / f"{self.antismash_id}" / "subdir" nonempty_path.mkdir(parents=True) with pytest.raises(ValueError, match="Nonempty directory"): - download_and_extract_antismash_data(self.antismash_id, tmp_path, tmp_path / "extracted") + download_and_extract_from_antismash_db( + self.antismash_id, tmp_path, tmp_path / "extracted" + ) # test a non-existent ID, which can be either a fake ID, non-existent in NCBI # or a valid NCBI genome ID but it does not have BGC data in antismash database @@ -44,6 +46,6 @@ def test_nonexisting_id(self, tmp_path): extract_root.mkdir() for test_id in nonexisting_ids: with pytest.raises(RuntimeError): - download_and_extract_antismash_data(test_id, download_root, extract_root) + download_and_extract_from_antismash_db(test_id, download_root, extract_root) extracted_folder = extract_root / "antismash" / test_id assert not extracted_folder.exists() From a972a7001f07fc1ea7eb8924b4fbe71e71f523f6 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 12:06:02 +0100 Subject: [PATCH 12/33] Feat: Add antiSMASH API functionality --- src/nplinker/genomics/antismash/__init__.py | 12 +- .../antismash/antismash_api_client.py | 97 ++++++++++++++++ .../antismash/antismash_downloader.py | 97 ++++++++++++---- .../genomics/antismash/ncbi_downloader.py | 107 ++++++++++++++++++ .../antismash/podp_antismash_downloader.py | 72 ++++++++++-- .../genomics/test_antismash_downloader.py | 10 +- 6 files changed, 362 insertions(+), 33 deletions(-) create mode 100644 src/nplinker/genomics/antismash/antismash_api_client.py create mode 100644 src/nplinker/genomics/antismash/ncbi_downloader.py diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index e126f548..4817e806 100644 --- a/src/nplinker/genomics/antismash/__init__.py 
+++ b/src/nplinker/genomics/antismash/__init__.py @@ -1,16 +1,24 @@ -from .antismash_downloader import download_and_extract_antismash_data +from .antismash_api_client import antismash_job_is_done +from .antismash_api_client import submit_antismash_job +from .antismash_downloader import download_and_extract_from_antismash_api +from .antismash_downloader import download_and_extract_from_antismash_db from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank +from .ncbi_downloader import download_and_extract_ncbi_genome from .podp_antismash_downloader import GenomeStatus from .podp_antismash_downloader import get_best_available_genome_id from .podp_antismash_downloader import podp_download_and_extract_antismash_data __all__ = [ - "download_and_extract_antismash_data", + "download_and_extract_from_antismash_api", + "download_and_extract_from_antismash_db", "AntismashBGCLoader", "parse_bgc_genbank", "GenomeStatus", "get_best_available_genome_id", "podp_download_and_extract_antismash_data", + "download_and_extract_ncbi_genome", + "submit_antismash_job", + "antismash_job_is_done", ] diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py new file mode 100644 index 00000000..32b932c0 --- /dev/null +++ b/src/nplinker/genomics/antismash/antismash_api_client.py @@ -0,0 +1,97 @@ +from __future__ import annotations +import logging +from os import PathLike +from pathlib import Path +from typing import Optional +import requests + + +logger = logging.getLogger(__name__) + + +def submit_antismash_job(genbank_filepath: str | PathLike) -> Optional[str]: + """Submits an antiSMASH job using the provided GenBank file. + + This function sends a GenBank file to the antiSMASH API + and retrieves the job ID if the submission is successful. + + Args: + genbank_filepath (str | PathLike): The path to the GenBank file to be submitted. 
+ + Returns: + Optional[str]: The job ID if the submission is successful, or None if it fails. + + Raises: + requests.exceptions.RequestException: If there is an issue with the HTTP request. + RuntimeError: If the API response does not contain a job ID. + """ + url = "https://antismash.secondarymetabolites.org/api/v1.0/submit" + genbank_filepath = Path(genbank_filepath) + + with open(genbank_filepath, "rb") as file: + files = {"seq": file} + response = requests.post(url, files=files) + response.raise_for_status() # Raise an exception for HTTP errors + + data = response.json() + if "id" not in data: + raise RuntimeError("No antiSMASH job ID returned") + return data["id"] + + +def query_antismash_job(job_id: str) -> Optional[dict]: + """Gets the status of an antiSMASH job. + + Args: + job_id (str): The job ID to query. + + Returns: + dict: The response JSON if successful, otherwise None. + """ + url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}" + + try: + response = requests.get(url, timeout=10) + response.raise_for_status() # Raise an exception for HTTP errors + return response.json() + + except requests.exceptions.RequestException as req_err: + logger.error(f"Request failed for job_id {job_id}: {req_err}") + except ValueError as json_err: # Handles JSON decoding errors + logger.error(f"Invalid JSON response for job_id {job_id}: {json_err}") + except Exception as err: + logger.error(f"Unexpected error while getting job state for job_id {job_id}: {err}") + + +def antismash_job_is_done(job_id: str) -> bool: + """Checks if the antiSMASH job is complete by polling the job status. + + Args: + job_id (str): The job ID to query. + + Returns: + bool: True if the job is done, False if the job is still running. + + Raises: + RuntimeError: If the job status could not be retrieved or if the job failed. + ValueError: If the job state is missing or unexpected in the response. 
+ """ + response = query_antismash_job(job_id) + + if response is None: + raise RuntimeError(f"Failed to retrieve job status for job_id {job_id}") + if "state" not in response: + raise ValueError(f"Job state missing in response for job_id: {job_id}") + + job_state = response["state"] + if job_state in ("running", "queued"): + return False + if job_state == "done": + return True + if job_state == "failed": + job_status = response.get("status", "No error message provided") + raise RuntimeError(f"AntiSMASH job {job_id} failed with an error: {job_status}") + else: + raise ValueError( + f"Unexpected job state for antismash job ID {job_id}. Job state: {job_state}" + ) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 07662107..928001c9 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -4,6 +4,7 @@ import shutil from os import PathLike from pathlib import Path +import requests from nplinker.utils import download_and_extract_archive from nplinker.utils import list_dirs from nplinker.utils import list_files @@ -15,50 +16,108 @@ ANTISMASH_DB_DOWNLOAD_URL = "https://antismash-db.secondarymetabolites.org/output/{}/{}" # The antiSMASH DBV2 is for the availability of the old version, better to keep it. ANTISMASH_DBV2_DOWNLOAD_URL = "https://antismash-dbv2.secondarymetabolites.org/output/{}/{}" +# antismash api to download results from submitted jobs +ANTISMASH_API_DOWNLOAD_URL = "https://antismash.secondarymetabolites.org/upload/{}/{}" def download_and_extract_antismash_data( - antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike + url: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike ) -> None: """Download and extract antiSMASH BGC archive for a specified genome. 
- The antiSMASH database (https://antismash-db.secondarymetabolites.org/) - is used to download the BGC archive. And antiSMASH use RefSeq assembly id - of a genome as the id of the archive. + This function downloads a BGC archive from the specified URL, extracts its contents, + and organizes the extracted files into a structured directory under the given `extract_root`. Args: - antismash_id: The id used to download BGC archive from antiSMASH database. - If the id is versioned (e.g., "GCF_004339725.1") please be sure to - specify the version as well. - download_root: Path to the directory to place downloaded archive in. - extract_root: Path to the directory data files will be extracted to. + url (str): The URL to download the BGC archive from. + antismash_id (str): The identifier for the antiSMASH genome, used to name the extraction directory. + download_root: Path to the directory where the downloaded archive will be stored. + extract_root: Path to the directory where the data files will be extracted. Note that an `antismash` directory will be created in the specified `extract_root` if it doesn't exist. The files will be extracted to `/antismash/` directory. Raises: - ValueError: if `/antismash/` dir is not empty. + ValueError: if `/antismash/` dir is not empty. + Exception: If any error occurs during the download or extraction process, the partially extracted + directory will be cleaned up, and the exception will be re-raised. 
Examples: - >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted") + >>> download_and_extract_antismash_data( + "https://antismash-db.secondarymetabolites.org/output/GCF_001.1/GCF_001.1.zip", + "GCF_001.1", + "/data/download", + "/data/extracted" + ) """ - download_root = Path(download_root) extract_path = Path(extract_root) / "antismash" / antismash_id _prepare_extract_path(extract_path) - try: - for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: - url = base_url.format(antismash_id, antismash_id + ".zip") - download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip") - break - + download_and_extract_archive(url, download_root, extract_path, f"{antismash_id}.zip") _cleanup_extracted_files(extract_path) - except Exception as e: shutil.rmtree(extract_path) raise e +def download_and_extract_from_antismash_api( + job_id: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike +) -> None: + """Downloads and extracts results from an antiSMASH API job. + + This function constructs the download URL using the provided job ID then + downloads the results as a ZIP file and extracts its contents to the specified directories. + + Args: + antismash_id (str): The unique identifier for the antiSMASH dataset. + job_id (str): The job ID for the antiSMASH API job. + download_root (str or PathLike): The root directory where the ZIP file will be downloaded. + extract_root (str or PathLike): The root directory where the contents of the ZIP file will be extracted. + + Raises: + requests.exceptions.RequestException: If there is an issue with the HTTP request. + zipfile.BadZipFile: If the downloaded file is not a valid ZIP file. + OSError: If there is an issue with file operations such as writing or extracting. 
+ """ + url = ANTISMASH_API_DOWNLOAD_URL.format(job_id, antismash_id + ".zip") + download_and_extract_antismash_data(url, antismash_id, download_root, extract_root) + + +def download_and_extract_from_antismash_db( + refseq_acc: str, download_root: str | PathLike, extract_root: str | PathLike +) -> None: + """Download and extract antiSMASH BGC archive for a specified genome. + + The antiSMASH database (https://antismash-db.secondarymetabolites.org/) + is used to download the BGC archive. And antiSMASH use RefSeq assembly id + of a genome as the id of the archive. + + Args: + refseq_acc: The id used to download BGC archive from antiSMASH database. + If the id is versioned (e.g., "GCF_004339725.1") please be sure to + specify the version as well. + download_root: Path to the directory to place downloaded archive in. + extract_root: Path to the directory data files will be extracted to. + Note that an `antismash` directory will be created in the specified `extract_root` if + it doesn't exist. The files will be extracted to `/antismash/` directory. + + Raises: + ValueError: if `/antismash/` dir is not empty. 
+ + Examples: + >>> download_and_extract_from_antismash_db("GCF_004339725.1", "/data/download", "/data/extracted") + """ + for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: + url = base_url.format(refseq_acc, f"{refseq_acc}.zip") + if requests.head(url).status_code == 404: # not found + continue + download_and_extract_antismash_data(url, refseq_acc, download_root, extract_root) + return # Exit the loop once a valid URL is processed + + # if both urls give 404 not found + raise RuntimeError(f"No results in antiSMASH DB for {refseq_acc}") + + def _check_extract_path(extract_path: Path): # check if extract_path is empty if any(extract_path.iterdir()): diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py new file mode 100644 index 00000000..b0b333d5 --- /dev/null +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -0,0 +1,107 @@ +import logging +import os +import shutil +import time +from os import PathLike +from pathlib import Path +from typing import Optional +from nplinker.utils import check_md5 +from nplinker.utils import download_url +from nplinker.utils import extract_archive + + +logger = logging.getLogger(__name__) + + +def download_and_extract_ncbi_genome( + refseq_id: str, + download_root: str | PathLike, + extract_root: str | PathLike, + max_attempts: int = 10, +) -> Optional[Path]: + """Downloads and extracts an NCBI dataset for a given genome refseq ID. + + This function attempts to download a dataset from the NCBI database using + the provided refseq ID. It retries the download process up to a maximum + number of times if any errors occur. The function verifies the integrity + of the downloaded files using MD5 checksums and moves and renames the + GenBank files upon successful verification. + + Args: + refseq_id (str): The refseq ID for the dataset to be downloaded. + download_root (str or Path): The root directory where the dataset will be downloaded. 
+ extract_root (str or Path): The root directory where the dataset will be extracted. + max_attempts (int): The maximum number of times to attempt downloading. + + Returns: + Path: The path to the extracted dataset if successful, otherwise None. + + Raises: + Exception: If the maximum number of retries is reached and the dataset could + not be successfully downloaded and extracted. + """ + url = ( + "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/" + f"{refseq_id}/download?include_annotation_type=GENOME_GB" + ) + + download_root = Path(download_root) + extract_path = Path(extract_root) / "ncbi_genomes" + filename = f"ncbi_{refseq_id}.zip" + + extract_path.mkdir(parents=True, exist_ok=True) + + for attempt in range(1, max_attempts + 1): + try: + download_url(url, download_root, filename) + archive = download_root / filename + break + except Exception as e: + logger.warning(f"Attempt {attempt}/{max_attempts} failed to download {url}. Error: {e}") + if attempt < max_attempts: + time.sleep(2) + else: + raise RuntimeError( + f"Maximum download retries ({max_attempts}) reached for {url}. Download failed." + ) + + extract_archive(archive, extract_path) + verify_ncbi_dataset_md5_sums(extract_path) + + # Move and rename GenBank file + genbank_path = extract_path / "ncbi_dataset" / "data" / refseq_id / "genomic.gbff" + new_genbank_path = extract_path / f"{refseq_id}.gbff" + genbank_path.rename(new_genbank_path) + + # Delete unnecessary files + shutil.rmtree(extract_path / "ncbi_dataset") + os.remove(extract_path / "md5sum.txt") + os.remove(extract_path / "README.md") + + return new_genbank_path + + +def verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: + """Verify the integrity of files in a specified directory using MD5 checksums. + + This function reads an "md5sum.txt" file located in the given extraction path, + which contains MD5 checksums and corresponding file names. 
It then computes + the MD5 checksum for each file and compares it with the expected value. If any + file's checksum does not match, a `ValueError` is raised. + + Args: + extract_path (PathLike): Path to the directory containing the files and + the "md5sum.txt" file. + + Returns: + bool: True if all files pass the MD5 checksum verification. + + Raises: + ValueError: If the MD5 checksum of any file does not match the expected value. + """ + with open(extract_path / "md5sum.txt", "r") as f: + for line in f: + md5sum, file_name = line.strip().split() + file_path = extract_path / file_name + if not check_md5(file_path, md5sum): + raise ValueError(f"MD5 checksum mismatch for {file_path}") diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index aadf1eb4..2d5f727c 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -2,6 +2,7 @@ import json import logging import re +import time import warnings from collections.abc import Mapping from collections.abc import Sequence @@ -11,7 +12,11 @@ from bs4 import BeautifulSoup from jsonschema import validate from nplinker.defaults import GENOME_STATUS_FILENAME -from nplinker.genomics.antismash import download_and_extract_antismash_data +from nplinker.genomics.antismash import antismash_job_is_done +from nplinker.genomics.antismash import download_and_extract_from_antismash_api +from nplinker.genomics.antismash import download_and_extract_from_antismash_db +from nplinker.genomics.antismash import download_and_extract_ncbi_genome +from nplinker.genomics.antismash import submit_antismash_job from nplinker.schemas import GENOME_STATUS_SCHEMA @@ -189,11 +194,33 @@ def podp_download_and_extract_antismash_data( logger.info( f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" ) + continue except Exception as e: logger.warning( f"Failed to 
retrieve BGC data from antiSMASH-DB for {gs.original_id}. Error: {e}" ) + # retrieve antismash BGC by submitting antismash job via API + try: + logger.info( + f"Downloading genome and submitting antiSMASH job for genome ID {gs.original_id}." + ) + genome_path = download_and_extract_ncbi_genome( + gs.resolved_refseq_id, project_download_root, project_extract_root + ) + job_id = submit_antismash_job(genome_path) + while antismash_job_is_done(job_id) is False: + time.sleep(15) + retrieve_antismash_job_data(job_id, gs, project_download_root, project_extract_root) + logger.info( + f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" + ) + continue + except Exception as e: + logger.warning( + f"Failed to retrieve BGC data by submitting a antiSMASH job for genome ID {gs.original_id}. Error: {e}" + ) + # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] if failed_ids: @@ -286,13 +313,42 @@ def retrieve_antismash_db_data( extract_path = Path(extract_root, "antismash", antismash_id) download_path = Path(download_root, f"{antismash_id}.zip").absolute() - try: - download_and_extract_antismash_data(antismash_id, download_root, extract_root) - Path.touch(extract_path / "completed", exist_ok=True) - genome_status.bgc_path = str(download_path) - except Exception as e: - genome_status.bgc_path = "" - raise e + download_and_extract_from_antismash_db(antismash_id, download_root, extract_root) + Path.touch(extract_path / "completed", exist_ok=True) + genome_status.bgc_path = str(download_path) + + +def retrieve_antismash_job_data( + job_id: str, + genome_status: GenomeStatus, + download_root: str | PathLike, + extract_root: str | PathLike, +) -> None: + """Retrieve antiSMASH API data for a given genome and update its status. + + This function downloads and extracts antiSMASH data for a genome identified + by its resolved RefSeq ID. 
It updates the `genome_status` object with the + path to the downloaded data or sets it to an empty string if an error occurs. + + Args: + job_id (str): The job ID for the antiSMASH API job. + genome_status (GenomeStatus): An object representing the genome's status, + including its resolved RefSeq ID and BGC path. + download_root (str | PathLike): The root directory where the antiSMASH + data will be downloaded. + extract_root (str | PathLike): The root directory where the antiSMASH + data will be extracted. + + Raises: + Exception: If an error occurs during the download or extraction process. + """ + antismash_id = genome_status.resolved_refseq_id + extract_path = Path(extract_root, "antismash", antismash_id) + download_path = Path(download_root, f"{antismash_id}.zip").absolute() + + download_and_extract_from_antismash_api(job_id, antismash_id, download_root, extract_root) + Path.touch(extract_path / "completed", exist_ok=True) + genome_status.bgc_path = str(download_path) def _resolve_genbank_accession(genbank_id: str) -> str: diff --git a/tests/unit/genomics/test_antismash_downloader.py b/tests/unit/genomics/test_antismash_downloader.py index 1dfeb4cf..651629c2 100644 --- a/tests/unit/genomics/test_antismash_downloader.py +++ b/tests/unit/genomics/test_antismash_downloader.py @@ -1,5 +1,5 @@ import pytest -from nplinker.genomics.antismash import download_and_extract_antismash_data +from nplinker.genomics.antismash import download_and_extract_from_antismash_db from nplinker.utils import extract_archive from nplinker.utils import list_files @@ -14,7 +14,7 @@ def test_default(self, tmp_path): extract_root.mkdir() original_extract_root = tmp_path / "original" original_extract_root.mkdir() - download_and_extract_antismash_data(self.antismash_id, download_root, extract_root) + download_and_extract_from_antismash_db(self.antismash_id, download_root, extract_root) archive = download_root / "GCF_004339725.1.zip" extracted_folder = extract_root / "antismash" / 
"GCF_004339725.1" extracted_files = list_files(extracted_folder, keep_parent=False) @@ -32,7 +32,9 @@ def test_error_nonempty_path(self, tmp_path): nonempty_path = tmp_path / "extracted" / "antismash" / f"{self.antismash_id}" / "subdir" nonempty_path.mkdir(parents=True) with pytest.raises(ValueError, match="Nonempty directory"): - download_and_extract_antismash_data(self.antismash_id, tmp_path, tmp_path / "extracted") + download_and_extract_from_antismash_db( + self.antismash_id, tmp_path, tmp_path / "extracted" + ) # test a non-existent ID, which can be either a fake ID, non-existent in NCBI # or a valid NCBI genome ID but it does not have BGC data in antismash database @@ -44,6 +46,6 @@ def test_nonexisting_id(self, tmp_path): extract_root.mkdir() for test_id in nonexisting_ids: with pytest.raises(RuntimeError): - download_and_extract_antismash_data(test_id, download_root, extract_root) + download_and_extract_from_antismash_db(test_id, download_root, extract_root) extracted_folder = extract_root / "antismash" / test_id assert not extracted_folder.exists() From f3a159f3bcfbc1568058ecd3a6fda08838953439 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 14:27:18 +0100 Subject: [PATCH 13/33] fix: improve logging message for start of antiSMASH API process --- src/nplinker/genomics/antismash/podp_antismash_downloader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 2d5f727c..35f269cc 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -203,7 +203,8 @@ def podp_download_and_extract_antismash_data( # retrieve antismash BGC by submitting antismash job via API try: logger.info( - f"Downloading genome and submitting antiSMASH job for genome ID {gs.original_id}." 
+ "Downloading genome assembly from NCBI and submitting antiSMASH job for " + f"genome ID {gs.original_id}." ) genome_path = download_and_extract_ncbi_genome( gs.resolved_refseq_id, project_download_root, project_extract_root From da4da3c85d472c7e535068072da9cf5943b87410 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 14:30:58 +0100 Subject: [PATCH 14/33] docs: improve docstring for download_and_extract_ncbi_genome function --- .../genomics/antismash/ncbi_downloader.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index b0b333d5..a65a5b92 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -19,26 +19,27 @@ def download_and_extract_ncbi_genome( extract_root: str | PathLike, max_attempts: int = 10, ) -> Optional[Path]: - """Downloads and extracts an NCBI dataset for a given genome refseq ID. + """Downloads and extracts an NCBI dataset for a given genome RefSeq ID. - This function attempts to download a dataset from the NCBI database using - the provided refseq ID. It retries the download process up to a maximum - number of times if any errors occur. The function verifies the integrity - of the downloaded files using MD5 checksums and moves and renames the - GenBank files upon successful verification. + This function retrieves a dataset from the NCBI database using the provided + RefSeq ID. It retries the download process up to a specified maximum number + of attempts in case of errors. The function verifies the integrity of the + downloaded files using MD5 checksums, extracts the dataset, and renames the + GenBank file for easier access. Unnecessary files are removed after successful + processing. Args: - refseq_id (str): The refseq ID for the dataset to be downloaded. 
- download_root (str or Path): The root directory where the dataset will be downloaded. - extract_root (str or Path): The root directory where the dataset will be extracted. - max_attempts (int): The maximum number of times to attempt downloading. + refseq_id (str): The RefSeq ID for the dataset to be downloaded. + download_root (str | PathLike): The directory where the dataset will be downloaded. + extract_root (str | PathLike): The directory where the dataset will be extracted. + max_attempts (int): The maximum number of download attempts. Defaults to 10. Returns: - Path: The path to the extracted dataset if successful, otherwise None. + Optional[Path]: The path to the extracted GenBank file if successful, otherwise None. Raises: - Exception: If the maximum number of retries is reached and the dataset could - not be successfully downloaded and extracted. + RuntimeError: If the maximum number of retries is reached and the dataset + could not be successfully downloaded and extracted. """ url = ( "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/" From af094e4176c912fbc164ecbf92cd477faa20bbd3 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 14:39:45 +0100 Subject: [PATCH 15/33] fix: update logging messages for antiSMASH data retrieval failures --- .../genomics/antismash/podp_antismash_downloader.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 35f269cc..f96538e1 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -196,8 +196,9 @@ def podp_download_and_extract_antismash_data( ) continue except Exception as e: - logger.warning( - f"Failed to retrieve BGC data from antiSMASH-DB for {gs.original_id}. 
Error: {e}" + logger.info( + f"Unable to retrieve BGC data from antiSMASH-DB for genome ID {gs.original_id}. " + f"Error: {e}" ) # retrieve antismash BGC by submitting antismash job via API @@ -218,10 +219,14 @@ def podp_download_and_extract_antismash_data( ) continue except Exception as e: - logger.warning( - f"Failed to retrieve BGC data by submitting a antiSMASH job for genome ID {gs.original_id}. Error: {e}" + logger.info( + f"Unable to retrieve BGC data via antiSMASH API for genome ID {gs.original_id}. " + f"Error: {e}" ) + if gs.bgc_path == "": + logger.warning(f"Failed to retrieve BGC data for genome ID {gs.original_id}.") + # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] if failed_ids: From 99d99a25714ca7ad76301698c048018b6ac285d5 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 14:43:35 +0100 Subject: [PATCH 16/33] add logging after antiSMASH job submission --- src/nplinker/genomics/antismash/podp_antismash_downloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index f96538e1..efd7acc5 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -211,6 +211,7 @@ def podp_download_and_extract_antismash_data( gs.resolved_refseq_id, project_download_root, project_extract_root ) job_id = submit_antismash_job(genome_path) + logger.info(f"Waiting for antiSMASH job {job_id} to complete.") while antismash_job_is_done(job_id) is False: time.sleep(15) retrieve_antismash_job_data(job_id, gs, project_download_root, project_extract_root) From 1c8393479a9d27924be27086347fed048f152194 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 16:48:57 +0100 Subject: [PATCH 17/33] refactor: rename refseq_id to genome_assembly_acc --- 
src/nplinker/genomics/antismash/ncbi_downloader.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index a65a5b92..b6b5da0c 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -14,7 +14,7 @@ def download_and_extract_ncbi_genome( - refseq_id: str, + genome_assembly_acc: str, download_root: str | PathLike, extract_root: str | PathLike, max_attempts: int = 10, @@ -29,7 +29,7 @@ def download_and_extract_ncbi_genome( processing. Args: - refseq_id (str): The RefSeq ID for the dataset to be downloaded. + genome_assembly_acc (str): The NCBI accession of the genome assembly to be downloaded. download_root (str | PathLike): The directory where the dataset will be downloaded. extract_root (str | PathLike): The directory where the dataset will be extracted. max_attempts (int): The maximum number of download attempts. Defaults to 10. 
@@ -43,12 +43,12 @@ def download_and_extract_ncbi_genome( """ url = ( "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/" - f"{refseq_id}/download?include_annotation_type=GENOME_GB" + f"{genome_assembly_acc}/download?include_annotation_type=GENOME_GB" ) download_root = Path(download_root) extract_path = Path(extract_root) / "ncbi_genomes" - filename = f"ncbi_{refseq_id}.zip" + filename = f"ncbi_{genome_assembly_acc}.zip" extract_path.mkdir(parents=True, exist_ok=True) @@ -70,8 +70,8 @@ def download_and_extract_ncbi_genome( verify_ncbi_dataset_md5_sums(extract_path) # Move and rename GenBank file - genbank_path = extract_path / "ncbi_dataset" / "data" / refseq_id / "genomic.gbff" - new_genbank_path = extract_path / f"{refseq_id}.gbff" + genbank_path = extract_path / "ncbi_dataset" / "data" / genome_assembly_acc / "genomic.gbff" + new_genbank_path = extract_path / f"{genome_assembly_acc}.gbff" genbank_path.rename(new_genbank_path) # Delete unnecessary files From 496db38b06416c6accb39bf197fee388502eaa4b Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 16:49:50 +0100 Subject: [PATCH 18/33] improve genome download process with validation and retry logic --- .../genomics/antismash/ncbi_downloader.py | 71 +++++++++++++------ 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index b6b5da0c..d2095e81 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -5,6 +5,8 @@ from os import PathLike from pathlib import Path from typing import Optional +import httpx +import requests from nplinker.utils import check_md5 from nplinker.utils import download_url from nplinker.utils import extract_archive @@ -41,31 +43,11 @@ def download_and_extract_ncbi_genome( RuntimeError: If the maximum number of retries is reached and the dataset could not be successfully downloaded and 
extracted. """ - url = ( - "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/" - f"{genome_assembly_acc}/download?include_annotation_type=GENOME_GB" - ) - - download_root = Path(download_root) extract_path = Path(extract_root) / "ncbi_genomes" - filename = f"ncbi_{genome_assembly_acc}.zip" - extract_path.mkdir(parents=True, exist_ok=True) - for attempt in range(1, max_attempts + 1): - try: - download_url(url, download_root, filename) - archive = download_root / filename - break - except Exception as e: - logger.warning(f"Attempt {attempt}/{max_attempts} failed to download {url}. Error: {e}") - if attempt < max_attempts: - time.sleep(2) - else: - raise RuntimeError( - f"Maximum download retries ({max_attempts}) reached for {url}. Download failed." - ) - + _check_genome_accession_validity(genome_assembly_acc) + archive = _download_genome(genome_assembly_acc, download_root, max_attempts) extract_archive(archive, extract_path) verify_ncbi_dataset_md5_sums(extract_path) @@ -106,3 +88,48 @@ def verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: file_path = extract_path / file_name if not check_md5(file_path, md5sum): raise ValueError(f"MD5 checksum mismatch for {file_path}") + + +def _check_genome_accession_validity(genome_assembly_acc, max_attempts=10): + """Check the validity of genome accessio.""" + url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{genome_assembly_acc}/check" + + # Retry multiple times because NCBI has currently issues (500 Internal Server Error) + for attempt in range(1, max_attempts + 1): + try: + response = requests.get(url) + response.raise_for_status() + break + except Exception: + if attempt < max_attempts: + time.sleep(1) + + # Raise if no attempt was successful + response.raise_for_status() + # Raise if genome assembly is not successful + if "valid_assemblies" not in response.json(): + raise ValueError(f"Not a valid genome assembly accession: {genome_assembly_acc}") + + +def 
_download_genome(genome_assembly_acc, download_root, max_attempts): + url = ( + "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/" + f"{genome_assembly_acc}/download?include_annotation_type=GENOME_GB" + ) + download_root = Path(download_root) + filename = f"ncbi_{genome_assembly_acc}.zip" + + # Retry multiple times because NCBI has issues currently + for attempt in range(1, max_attempts + 1): + try: + download_url(url, download_root, filename) + return download_root / filename + except httpx.ReadTimeout as e: + logger.warning(f"Attempt {attempt}/{max_attempts} failed to download {url}. Error: {e}") + if attempt < max_attempts: + time.sleep(1) + else: + raise httpx.ReadTimeout( + f"Failed to download the genome {genome_assembly_acc} from NCBI. " + f"Maximum download retries ({max_attempts}) reached for {url}." + ) From cc6573c206869fbb6152a02d0bad494f072b8669 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 16:50:28 +0100 Subject: [PATCH 19/33] test: add unit tests for download_and_extract_ncbi_genome function --- tests/unit/genomics/test_ncbi_downloader.py | 43 +++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 tests/unit/genomics/test_ncbi_downloader.py diff --git a/tests/unit/genomics/test_ncbi_downloader.py b/tests/unit/genomics/test_ncbi_downloader.py new file mode 100644 index 00000000..dadbe9ea --- /dev/null +++ b/tests/unit/genomics/test_ncbi_downloader.py @@ -0,0 +1,43 @@ +from unittest.mock import patch +import httpx +import pytest +from nplinker.genomics.antismash.ncbi_downloader import download_and_extract_ncbi_genome + + +@pytest.fixture +def download_root(tmp_path): + return tmp_path / "download" + + +@pytest.fixture +def extract_root(tmp_path): + return tmp_path / "extracted" + + +def test_download_and_extract_ncbi_genome_success(download_root, extract_root): + refseq_id = "GCF_000514775.1" + + genome_path = download_and_extract_ncbi_genome(refseq_id, download_root, extract_root) + + assert 
genome_path == extract_root / "ncbi_genomes" / f"{refseq_id}.gbff" + assert not (extract_root / "ncbi_genomes" / "md5sum.txt").exists() + assert not (extract_root / "ncbi_genomes" / "README.md").exists() + assert not (extract_root / "ncbi_genomes" / "ncbi_dataset").exists() + + +def test_download_and_extract_ncbi_genome_max_retries(download_root, extract_root): + refseq_id = "GCF_000514775.1" + + with patch( + "nplinker.genomics.antismash.ncbi_downloader.download_url", + side_effect=httpx.ReadTimeout("Download failed"), + ): + with pytest.raises(httpx.ReadTimeout, match="Maximum download retries"): + download_and_extract_ncbi_genome(refseq_id, download_root, extract_root, max_attempts=1) + + +def test_download_and_extract_ncbi_genome_invalid_refseq_id(download_root, extract_root): + refseq_id = "invalid_ref_seq_id" + + with pytest.raises(ValueError, match="Not a valid genome assembly accession"): + download_and_extract_ncbi_genome(refseq_id, download_root, extract_root) From 51e4817b669a18be12a9c814213905e667a85081 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 16:50:52 +0100 Subject: [PATCH 20/33] refactor: rename verify_ncbi_dataset_md5_sums to _verify_ncbi_dataset_md5_sums for consistency --- src/nplinker/genomics/antismash/ncbi_downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index d2095e81..f3260ef0 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -49,7 +49,7 @@ def download_and_extract_ncbi_genome( _check_genome_accession_validity(genome_assembly_acc) archive = _download_genome(genome_assembly_acc, download_root, max_attempts) extract_archive(archive, extract_path) - verify_ncbi_dataset_md5_sums(extract_path) + _verify_ncbi_dataset_md5_sums(extract_path) # Move and rename GenBank file genbank_path = extract_path / "ncbi_dataset" 
/ "data" / genome_assembly_acc / "genomic.gbff" @@ -64,7 +64,7 @@ def download_and_extract_ncbi_genome( return new_genbank_path -def verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: +def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: """Verify the integrity of files in a specified directory using MD5 checksums. This function reads an "md5sum.txt" file located in the given extraction path, From 6b564ea6d418763a312ac3e59144a4d267b42c9e Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 16:52:24 +0100 Subject: [PATCH 21/33] refactor: move _verify_ncbi_dataset_md5_sums function to a new location for better organization --- .../genomics/antismash/ncbi_downloader.py | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index f3260ef0..3a91b367 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -64,32 +64,6 @@ def download_and_extract_ncbi_genome( return new_genbank_path -def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: - """Verify the integrity of files in a specified directory using MD5 checksums. - - This function reads an "md5sum.txt" file located in the given extraction path, - which contains MD5 checksums and corresponding file names. It then computes - the MD5 checksum for each file and compares it with the expected value. If any - file's checksum does not match, a `ValueError` is raised. - - Args: - extract_path (PathLike): Path to the directory containing the files and - the "md5sum.txt" file. - - Returns: - bool: True if all files pass the MD5 checksum verification. - - Raises: - ValueError: If the MD5 checksum of any file does not match the expected value. 
- """ - with open(extract_path / "md5sum.txt", "r") as f: - for line in f: - md5sum, file_name = line.strip().split() - file_path = extract_path / file_name - if not check_md5(file_path, md5sum): - raise ValueError(f"MD5 checksum mismatch for {file_path}") - - def _check_genome_accession_validity(genome_assembly_acc, max_attempts=10): """Check the validity of genome accessio.""" url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{genome_assembly_acc}/check" @@ -133,3 +107,29 @@ def _download_genome(genome_assembly_acc, download_root, max_attempts): f"Failed to download the genome {genome_assembly_acc} from NCBI. " f"Maximum download retries ({max_attempts}) reached for {url}." ) + + +def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: + """Verify the integrity of files in a specified directory using MD5 checksums. + + This function reads an "md5sum.txt" file located in the given extraction path, + which contains MD5 checksums and corresponding file names. It then computes + the MD5 checksum for each file and compares it with the expected value. If any + file's checksum does not match, a `ValueError` is raised. + + Args: + extract_path (PathLike): Path to the directory containing the files and + the "md5sum.txt" file. + + Returns: + bool: True if all files pass the MD5 checksum verification. + + Raises: + ValueError: If the MD5 checksum of any file does not match the expected value. 
+ """ + with open(extract_path / "md5sum.txt", "r") as f: + for line in f: + md5sum, file_name = line.strip().split() + file_path = extract_path / file_name + if not check_md5(file_path, md5sum): + raise ValueError(f"MD5 checksum mismatch for {file_path}") From 7570852bb7437419228b02978d820deae009bead Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:06:18 +0100 Subject: [PATCH 22/33] feat: handle already downloaded antiSMASH results --- src/nplinker/genomics/antismash/__init__.py | 2 + .../antismash/antismash_downloader.py | 35 ++++++++++++++ .../antismash/podp_antismash_downloader.py | 46 ++++++++++++++++++- 3 files changed, 81 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index 4817e806..7c9179a0 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ b/src/nplinker/genomics/antismash/__init__.py @@ -2,6 +2,7 @@ from .antismash_api_client import submit_antismash_job from .antismash_downloader import download_and_extract_from_antismash_api from .antismash_downloader import download_and_extract_from_antismash_db +from .antismash_downloader import extract_antismash_data from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank from .ncbi_downloader import download_and_extract_ncbi_genome @@ -11,6 +12,7 @@ __all__ = [ + "extract_antismash_data", "download_and_extract_from_antismash_api", "download_and_extract_from_antismash_db", "AntismashBGCLoader", diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 928001c9..ef351ff8 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -6,6 +6,7 @@ from pathlib import Path import requests from nplinker.utils import download_and_extract_archive +from nplinker.utils import extract_archive from nplinker.utils import list_dirs
from nplinker.utils import list_files @@ -118,6 +119,40 @@ def download_and_extract_from_antismash_db( raise RuntimeError(f"No results in antiSMASH DB for {refseq_acc}") +def extract_antismash_data( + archive: str | PathLike, extract_root: str | PathLike, antimash_id: str +) -> None: + """Extracts antiSMASH results from a given archive into a specified directory. + + This function handles the extraction of antiSMASH results by preparing the + extraction path, extracting the archive, and performing cleanup of + unnecessary files. If an error occurs during the process, the partially + extracted files are removed, and the exception is re-raised. + + Args: + archive (str | PathLike): The path to the archive file containing antiSMASH results. + extract_root (str | PathLike): The root directory where the data should + be extracted. + antimash_id (str): A unique identifier for the antiSMASH data, used to + create a subdirectory for the extracted files. + + Raises: + Exception: If any error occurs during the extraction process, the + exception is re-raised after cleaning up the extraction directory. 
+ """ + extract_path = Path(extract_root) / "antismash" / antimash_id + + _prepare_extract_path(extract_path) + + try: + extract_archive(archive, extract_path, remove_finished=False) + _cleanup_extracted_files(extract_path) + + except Exception as e: + shutil.rmtree(extract_path) + raise e + + def _check_extract_path(extract_path: Path): # check if extract_path is empty if any(extract_path.iterdir()): diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index efd7acc5..311c5160 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -16,6 +16,7 @@ from nplinker.genomics.antismash import download_and_extract_from_antismash_api from nplinker.genomics.antismash import download_and_extract_from_antismash_db from nplinker.genomics.antismash import download_and_extract_ncbi_genome +from nplinker.genomics.antismash import extract_antismash_data from nplinker.genomics.antismash import submit_antismash_job from nplinker.schemas import GENOME_STATUS_SCHEMA @@ -170,13 +171,23 @@ def podp_download_and_extract_antismash_data( # Retrieve or initialize the GenomeStatus object for the genome ID gs = gs_dict.setdefault(original_genome_id, GenomeStatus(original_genome_id)) - # Skip genomes + # Check if genomes already have antiSMASH BGC data if gs.bgc_path and Path(gs.bgc_path).exists(): logger.info( f"antiSMASH BGC data for genome ID {original_genome_id} already downloaded to " f"{gs.bgc_path}" ) - continue + try: + process_existing_antismash_data(gs, project_extract_root) + continue + except Exception as e: + logger.warning( + "Failed to process existing antiSMASH BGC data for genome ID " + f"{original_genome_id}. 
Error: {e}" + ) + gs.bgc_path = "" # Reset bgc path + + # Check if a previous attempt to get bgc data has failed if gs.resolve_attempted: logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt") continue @@ -296,6 +307,37 @@ def get_genome_assembly_accession( raise RuntimeError("Failed to get genome assembly accession") +def process_existing_antismash_data(gs_obj: GenomeStatus, extract_root: str | PathLike) -> None: + """Processes already downloaded antiSMASH BGC data archive. + + This function ensures that the antiSMASH data archive associated with a given genomic sequence + object is properly extracted into a specified directory. If the data has already been extracted, + the function skips the extraction process. + + Args: + gs_obj: An object representing a genomic sequence, which contains the path + to the antiSMASH BGC data (accessible via `gs_obj.bgc_path`) and + an original identifier (`gs_obj.original_id`). + extract_root: The root directory where the antiSMASH data should be extracted. + + Raises: + Any exceptions raised by the `extract_antismash_data` function if the extraction fails. + """ + antismash_id = Path(gs_obj.bgc_path).stem + extract_path = Path(extract_root, "antismash", antismash_id) + completed_marker = extract_path / "completed" + + # Check if archive is already successfully extracted + if completed_marker.exists(): + logger.info( + f"antiSMASH BGC data for {gs_obj.original_id} already extracted at {extract_path}." 
+ ) + return + + extract_antismash_data(gs_obj.bgc_path, extract_root, antismash_id) + completed_marker.touch(exist_ok=True) + + def retrieve_antismash_db_data( genome_status: GenomeStatus, download_root: str | PathLike, extract_root: str | PathLike ) -> None: From 98812546caa77d506714d5b3013f1b06986aab8f Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:07:04 +0100 Subject: [PATCH 23/33] fix mistake in docstring --- src/nplinker/genomics/antismash/podp_antismash_downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 311c5160..5af4169a 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -297,8 +297,8 @@ def get_genome_assembly_accession( information, where keys like "RefSeq_accession", "GenBank_accession", or "JGI_Genome_ID" are used to resolve the RefSeq ID. - Warnings: - Logs a warning if the RefSeq ID cannot be resolved. + Raises: + RuntimeError: If the RefSeq ID cannot be resolved. 
""" genome_status.resolved_refseq_id = _resolve_refseq_id(genome_id_data) genome_status.resolve_attempted = True From 0e1ec548fa420ff0b9f51f74545b10b2ca9a8bc0 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:14:32 +0100 Subject: [PATCH 24/33] fix: update return type of submit_antismash_job to str --- src/nplinker/genomics/antismash/antismash_api_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py index 32b932c0..2b6fd7a8 100644 --- a/src/nplinker/genomics/antismash/antismash_api_client.py +++ b/src/nplinker/genomics/antismash/antismash_api_client.py @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) -def submit_antismash_job(genbank_filepath: str | PathLike) -> Optional[str]: +def submit_antismash_job(genbank_filepath: str | PathLike) -> str: """Submits an antiSMASH job using the provided GenBank file. This function sends a GenBank file to the antiSMASH API @@ -19,11 +19,11 @@ def submit_antismash_job(genbank_filepath: str | PathLike) -> Optional[str]: genbank_filepath (str | PathLike): The path to the GenBank file to be submitted. Returns: - Optional[str]: The job ID if the submission is successful, or None if it fails. + str: The job ID if the submission. Raises: requests.exceptions.RequestException: If there is an issue with the HTTP request. - RuntimeError: If the API response does not contain a job ID. + RuntimeError: If the API response does not contain a job ID. 
""" url = "https://antismash.secondarymetabolites.org/api/v1.0/submit" genbank_filepath = Path(genbank_filepath) From f53f1a7b062f73600a17bb6d9ecdc159d426306e Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:16:36 +0100 Subject: [PATCH 25/33] fix: update return type of download_and_extract_ncbi_genome to Path --- src/nplinker/genomics/antismash/ncbi_downloader.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index 3a91b367..a47c53f5 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -4,7 +4,6 @@ import time from os import PathLike from pathlib import Path -from typing import Optional import httpx import requests from nplinker.utils import check_md5 @@ -20,7 +19,7 @@ def download_and_extract_ncbi_genome( download_root: str | PathLike, extract_root: str | PathLike, max_attempts: int = 10, -) -> Optional[Path]: +) -> Path: """Downloads and extracts an NCBI dataset for a given genome RefSeq ID. This function retrieves a dataset from the NCBI database using the provided @@ -37,7 +36,7 @@ def download_and_extract_ncbi_genome( max_attempts (int): The maximum number of download attempts. Defaults to 10. Returns: - Optional[Path]: The path to the extracted GenBank file if successful, otherwise None. + Path: The path to the extracted GenBank file. 
Raises: RuntimeError: If the maximum number of retries is reached and the dataset From 5e5b2bd241f7f0287b318bb0c63aea03daa20555 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:29:04 +0100 Subject: [PATCH 26/33] update submit_antismash_job to return job ID as string and improve error handling in antismash_job_is_done --- .../antismash/antismash_api_client.py | 58 +++++++------------ 1 file changed, 21 insertions(+), 37 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py index 2b6fd7a8..39cba5b4 100644 --- a/src/nplinker/genomics/antismash/antismash_api_client.py +++ b/src/nplinker/genomics/antismash/antismash_api_client.py @@ -2,7 +2,6 @@ import logging from os import PathLike from pathlib import Path -from typing import Optional import requests @@ -36,60 +35,45 @@ def submit_antismash_job(genbank_filepath: str | PathLike) -> str: data = response.json() if "id" not in data: raise RuntimeError("No antiSMASH job ID returned") - return data["id"] - - -def query_antismash_job(job_id: str) -> Optional[dict]: - """Gets the status of an antiSMASH job. - - Args: - job_id (str): The job ID to query. - - Returns: - dict: The response JSON if successful, otherwise None. 
- """ - url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}" - - try: - response = requests.get(url, timeout=10) - response.raise_for_status() # Raise an exception for HTTP errors - return response.json() - - except requests.exceptions.RequestException as req_err: - logger.error(f"Request failed for job_id {job_id}: {req_err}") - except ValueError as json_err: # Handles JSON decoding errors - logger.error(f"Invalid JSON response for job_id {job_id}: {json_err}") - except Exception as err: - logger.error(f"Unexpected error while getting job state for job_id {job_id}: {err}") + return str(data["id"]) def antismash_job_is_done(job_id: str) -> bool: - """Checks if the antiSMASH job is complete by polling the job status. + """Determines if the antiSMASH job has completed by checking its status. + + This function queries the antiSMASH API to retrieve the current state + of the job and determines whether it has finished successfully, is still + in progress, or has encountered an error. Args: - job_id (str): The job ID to query. + job_id (str): The unique identifier of the antiSMASH job. Returns: - bool: True if the job is done, False if the job is still running. + bool: True if the job is completed successfully, False if it is still + running or queued. Raises: - RuntimeError: If the job status could not be retrieved or if the job failed. - ValueError: If the job state is missing or unexpected in the response. + RuntimeError: If the job has failed or if the API response indicates an error. + ValueError: If the job state is missing or an unexpected state is encountered + in the API response. + requests.exceptions.HTTPError: If an HTTP error occurs during the API request. 
""" - response = query_antismash_job(job_id) + url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}" + + response = requests.get(url, timeout=10) + response.raise_for_status() # Raise exception for HTTP errors + respose_data = response.json() - if response is None: - raise RuntimeError(f"Failed to retrieve job status for job_id {job_id}") - if "state" not in response: + if "state" not in respose_data: raise ValueError(f"Job state missing in response for job_id: {job_id}") - job_state = response["state"] + job_state = respose_data["state"] if job_state in ("running", "queued"): return False if job_state == "done": return True if job_state == "failed": - job_status = response.get("status", "No error message provided") + job_status = respose_data.get("status", "No error message provided") raise RuntimeError(f"AntiSMASH job {job_id} failed with an error: {job_status}") else: raise ValueError( From 909ca639c586e8066acf43e2c49c55bcae73adda Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:29:18 +0100 Subject: [PATCH 27/33] chore: add types-requests to development dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 879895f0..8fd371b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ dev = [ "mypy", "typing_extensions", # stub packages. Update the `format-typing-check.yml` too if you add more. 
+ "types-requests", "types-beautifulsoup4", "types-jsonschema", "types-networkx", From 252a177c840eb28a32c17398a44d351927cfb7cc Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:30:27 +0100 Subject: [PATCH 28/33] fix: update return type of _verify_ncbi_dataset_md5_sums to None and remove return value documentation --- src/nplinker/genomics/antismash/ncbi_downloader.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index a47c53f5..9bf8c96b 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -108,7 +108,7 @@ def _download_genome(genome_assembly_acc, download_root, max_attempts): ) -def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: +def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> None: """Verify the integrity of files in a specified directory using MD5 checksums. This function reads an "md5sum.txt" file located in the given extraction path, @@ -120,9 +120,6 @@ def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: extract_path (PathLike): Path to the directory containing the files and the "md5sum.txt" file. - Returns: - bool: True if all files pass the MD5 checksum verification. - Raises: ValueError: If the MD5 checksum of any file does not match the expected value. 
""" From 81a01024c1b01da9007be45444a7e09a689f69d4 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:31:49 +0100 Subject: [PATCH 29/33] fix: update _verify_ncbi_dataset_md5_sums to accept str or PathLike for extract_path --- src/nplinker/genomics/antismash/ncbi_downloader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index 9bf8c96b..a39338fa 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -108,7 +108,7 @@ def _download_genome(genome_assembly_acc, download_root, max_attempts): ) -def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> None: +def _verify_ncbi_dataset_md5_sums(extract_path: str | PathLike) -> None: """Verify the integrity of files in a specified directory using MD5 checksums. This function reads an "md5sum.txt" file located in the given extraction path, @@ -123,6 +123,7 @@ def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> None: Raises: ValueError: If the MD5 checksum of any file does not match the expected value. 
""" + extract_path = Path(extract_path) with open(extract_path / "md5sum.txt", "r") as f: for line in f: md5sum, file_name = line.strip().split() From 50890a73d3e38466118ed5d030b25d0788b4cb66 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:33:26 +0100 Subject: [PATCH 30/33] fix: convert extract_path to Path in _prepare_extract_path for consistency --- src/nplinker/genomics/antismash/antismash_downloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index ef351ff8..2c27938e 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -172,6 +172,7 @@ def _cleanup_extracted_files(extract_path: str | PathLike) -> None: def _prepare_extract_path(extract_path: str | PathLike) -> None: + extract_path = Path(extract_path) if extract_path.exists(): _check_extract_path(extract_path) else: From b784476bbec8223ca0535c6a644eda11ee4ae8ef Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:53:52 +0100 Subject: [PATCH 31/33] chore: update typing dependencies in format-typing-check workflow --- .github/workflows/format-typing-check.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/format-typing-check.yml b/.github/workflows/format-typing-check.yml index a4128a15..c5a97d4b 100644 --- a/.github/workflows/format-typing-check.yml +++ b/.github/workflows/format-typing-check.yml @@ -37,8 +37,8 @@ jobs: - name: Install ruff and mypy run: | pip install ruff mypy typing_extensions \ - types-Deprecated types-beautifulsoup4 types-jsonschema \ - types-networkx types-tabulate types-PyYAML pandas-stubs + types-Deprecated types-beautifulsoup4 types-jsonschema types-requests \ + types-networkx types-tabulate types-PyYAML pandas-stubs - name: Get all changed python files id: changed-python-files uses: 
tj-actions/changed-files@v44 From bdebd9679a2f3358f6ce2fc733ca6958d0342190 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 19:05:54 +0100 Subject: [PATCH 32/33] fix: clarify return value documentation for submit_antismash_job function --- src/nplinker/genomics/antismash/antismash_api_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py index 39cba5b4..124aedcb 100644 --- a/src/nplinker/genomics/antismash/antismash_api_client.py +++ b/src/nplinker/genomics/antismash/antismash_api_client.py @@ -18,7 +18,7 @@ def submit_antismash_job(genbank_filepath: str | PathLike) -> str: genbank_filepath (str | PathLike): The path to the GenBank file to be submitted. Returns: - str: The job ID if the submission. + str: The job ID of the submitted antiSMASH job. Raises: requests.exceptions.RequestException: If there is an issue with the HTTP request. From 7334c902813a3e5dae31b3087a0d626bd73686b9 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 19:06:02 +0100 Subject: [PATCH 33/33] fix: enable postponed evaluation of type annotations in ncbi_downloader.py --- src/nplinker/genomics/antismash/ncbi_downloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index a39338fa..143102c8 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -1,3 +1,4 @@ +from __future__ import annotations import logging import os import shutil