From 2430868e2a583a177fbd27b56f4b10dd0e66e820 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Tue, 18 Mar 2025 18:17:04 +0100
Subject: [PATCH 01/33] Refactor: Simplify Path handling

---
 src/nplinker/genomics/antismash/antismash_downloader.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py
index a02728f4..a95483ed 100644
--- a/src/nplinker/genomics/antismash/antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/antismash_downloader.py
@@ -42,8 +42,7 @@ def download_and_extract_antismash_data(
         >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted")
     """
     download_root = Path(download_root)
-    extract_root = Path(extract_root)
-    extract_path = extract_root / "antismash" / antismash_id
+    extract_path = Path(extract_root) / "antismash" / antismash_id
 
     try:
         if extract_path.exists():

From 1f072467054a02defa9dfb5c767f861fba712aac Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Tue, 18 Mar 2025 18:17:42 +0100
Subject: [PATCH 02/33] Refactor: Improve extract path handling to ensure that
 non-empty dirs are not deleted

---
 .../genomics/antismash/antismash_downloader.py         | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py
index a95483ed..8f4fe5b7 100644
--- a/src/nplinker/genomics/antismash/antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/antismash_downloader.py
@@ -44,12 +44,12 @@ def download_and_extract_antismash_data(
     download_root = Path(download_root)
     extract_path = Path(extract_root) / "antismash" / antismash_id
 
-    try:
-        if extract_path.exists():
-            _check_extract_path(extract_path)
-        else:
-            extract_path.mkdir(parents=True, exist_ok=True)
+    if extract_path.exists():
+        _check_extract_path(extract_path)
+    else:
+        extract_path.mkdir(parents=True, exist_ok=True)
 
+    try:
         for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]:
             url = base_url.format(antismash_id, antismash_id + ".zip")
             download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip")

From 8d7238c9cbd4007ea96d7f376fb7f283570c5d97 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Tue, 18 Mar 2025 18:18:16 +0100
Subject: [PATCH 03/33] Refactor: Extract file cleanup logic into a separate
 function for better readability

---
 .../antismash/antismash_downloader.py         | 22 +++++++++++--------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py
index 8f4fe5b7..93f22fc3 100644
--- a/src/nplinker/genomics/antismash/antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/antismash_downloader.py
@@ -55,15 +55,7 @@ def download_and_extract_antismash_data(
             download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip")
             break
 
-        # delete subdirs
-        for subdir_path in list_dirs(extract_path):
-            shutil.rmtree(subdir_path)
-
-        # delete unnecessary files
-        files_to_keep = list_files(extract_path, suffix=(".json", ".gbk"))
-        for file in list_files(extract_path):
-            if file not in files_to_keep:
-                os.remove(file)
+        _cleanup_extracted_files(extract_path)
 
         logger.info("antiSMASH BGC data of %s is downloaded and extracted.", antismash_id)
 
@@ -77,3 +69,15 @@ def _check_extract_path(extract_path: Path):
     # check if extract_path is empty
     if any(extract_path.iterdir()):
         raise ValueError(f'Nonempty directory: "{extract_path}"')
+
+
+def _cleanup_extracted_files(extract_path: str | PathLike) -> None:
+    # delete subdirs
+    for subdir_path in list_dirs(extract_path):
+        shutil.rmtree(subdir_path)
+
+    # delete unnecessary files
+    files_to_keep = list_files(extract_path, suffix=(".json", ".gbk"))
+    for file in list_files(extract_path):
+        if file not in files_to_keep:
+            os.remove(file)

From d5e64eb57bb6948ce07315438fd838f38eb84454 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Tue, 18 Mar 2025 18:19:35 +0100
Subject: [PATCH 04/33] Refactor: Move extract_path preparation logic into a
 seperate function for better readabiliy

---
 .../genomics/antismash/antismash_downloader.py       | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py
index 93f22fc3..f2397ea8 100644
--- a/src/nplinker/genomics/antismash/antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/antismash_downloader.py
@@ -44,10 +44,7 @@ def download_and_extract_antismash_data(
     download_root = Path(download_root)
     extract_path = Path(extract_root) / "antismash" / antismash_id
 
-    if extract_path.exists():
-        _check_extract_path(extract_path)
-    else:
-        extract_path.mkdir(parents=True, exist_ok=True)
+    _prepare_extract_path(extract_path)
 
     try:
         for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]:
@@ -81,3 +78,10 @@ def _cleanup_extracted_files(extract_path: str | PathLike) -> None:
     for file in list_files(extract_path):
         if file not in files_to_keep:
             os.remove(file)
+
+
+def _prepare_extract_path(extract_path: str | PathLike) -> None:
+    if extract_path.exists():
+        _check_extract_path(extract_path)
+    else:
+        extract_path.mkdir(parents=True, exist_ok=True)

From 66f13d6d57639e87036b684a59be6e6d183b7ad3 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Tue, 18 Mar 2025 19:17:41 +0100
Subject: [PATCH 05/33] Refactor: Separate genome assembly resolution and
 antiSMASH data retrieval into distinct functions for improved clarity

---
 .../antismash/podp_antismash_downloader.py    | 92 ++++++++++++++-----
 1 file changed, 69 insertions(+), 23 deletions(-)

diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
index 84e3cee4..5b9ded97 100644
--- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
@@ -179,33 +179,18 @@ def podp_download_and_extract_antismash_data(
             logger.info(f"Genome ID {raw_genome_id} skipped due to previous failed attempt")
             continue
 
-        # if not downloaded or lookup attempted, then try to resolve the ID
-        # and download
-        logger.info(f"Start lookup process for genome ID {raw_genome_id}")
-        gs_obj.resolved_refseq_id = _resolve_refseq_id(genome_id_data)
-        gs_obj.resolve_attempted = True
-
-        if gs_obj.resolved_refseq_id == "":
-            # give up on this one
-            logger.warning(f"Failed lookup for genome ID {raw_genome_id}")
+        # resolve genome ID
+        try:
+            get_genome_assembly_accession(gs, genome_record["genome_ID"])
+        except Exception as e:
+            logger.warning(f"Failed to resolve genome ID {gs.original_id}. Error: {e}")
             continue
 
-        # if resolved id is valid, try to download and extract antismash data
+        # retrieve antismash BGC data from antiSMASH-DB
         try:
-            download_and_extract_antismash_data(
-                gs_obj.resolved_refseq_id, project_download_root, project_extract_root
-            )
-
-            gs_obj.bgc_path = str(
-                Path(project_download_root, gs_obj.resolved_refseq_id + ".zip").absolute()
-            )
-
-            output_path = Path(project_extract_root, "antismash", gs_obj.resolved_refseq_id)
-            if output_path.exists():
-                Path.touch(output_path / "completed", exist_ok=True)
-
+            retrieve_antismash_db_data(gs, project_download_root, project_extract_root)
         except Exception:
-            gs_obj.bgc_path = ""
+            continue
 
     # raise and log warning for failed downloads
     failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path]
@@ -247,6 +232,67 @@ def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | Non
     return best_id
 
 
+def get_genome_assembly_accession(
+    genome_status: GenomeStatus, genome_id_data: Mapping[str, str]
+) -> None:
+    """Resolve and update the genome assembly accession for a given genome status.
+
+    This function attempts to resolve the RefSeq ID for the provided genome record
+    and updates the `genome_status` object with the resolved ID. It also sets the
+    `resolve_attempted` flag to `True` to indicate that an attempt to resolve the
+    RefSeq ID has been made. If the resolution fails, raises a RuntimeError and leaves
+    the `resolved_refseq_id` empty.
+
+    Args:
+        genome_status (GenomeStatus): An object representing the status of the genome,
+            which will be updated with the resolved RefSeq ID.
+        genome_id_data (Mapping[str, str]): A dictionary containing genome
+            information, where keys like "RefSeq_accession", "GenBank_accession",
+            or "JGI_Genome_ID" are used to resolve the RefSeq ID.
+
+    Warnings:
+        Logs a warning if the RefSeq ID cannot be resolved.
+    """
+    genome_status.resolved_refseq_id = _resolve_refseq_id(genome_id_data)
+    genome_status.resolve_attempted = True
+
+    if genome_status.resolved_refseq_id == "":
+        raise RuntimeError("Failed to get genome assembly accession")
+
+
+def retrieve_antismash_db_data(
+    genome_status: GenomeStatus, download_root: str | PathLike, extract_root: str | PathLike
+) -> None:
+    """Retrieve antiSMASH database data for a given genome and update its status.
+
+    This function downloads and extracts antiSMASH data for a genome identified
+    by its resolved RefSeq ID. It updates the `genome_status` object with the
+    path to the downloaded data or sets it to an empty string if an error occurs.
+
+    Args:
+        genome_status (GenomeStatus): An object representing the genome's status,
+            including its resolved RefSeq ID and BGC path.
+        download_root (str | PathLike): The root directory where the antiSMASH
+            data will be downloaded.
+        extract_root (str | PathLike): The root directory where the antiSMASH
+            data will be extracted.
+
+    Raises:
+        Exception: If an error occurs during the download or extraction process.
+    """
+    antismash_id = genome_status.resolved_refseq_id
+    extract_path = Path(extract_root, "antismash", antismash_id)
+    download_path = Path(download_root, f"{antismash_id}.zip").absolute()
+
+    try:
+        download_and_extract_antismash_data(antismash_id, download_root, extract_root)
+        Path.touch(extract_path / "completed", exist_ok=True)
+        genome_status.bgc_path = str(download_path)
+    except Exception as e:
+        genome_status.bgc_path = ""
+        raise e
+
+
 def _resolve_genbank_accession(genbank_id: str) -> str:
     """Try to get RefSeq assembly id through given GenBank assembly id.
 

From 706f8419b32c0929f721b8b0c59feec8fa3bba64 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Tue, 18 Mar 2025 19:18:06 +0100
Subject: [PATCH 06/33] Refactor: Improve genome ID handling and logging

---
 .../antismash/podp_antismash_downloader.py    | 36 +++++++++----------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
index 5b9ded97..744025a2 100644
--- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
@@ -153,30 +153,26 @@ def podp_download_and_extract_antismash_data(
     gs_dict = GenomeStatus.read_json(gs_file)
 
     for i, genome_record in enumerate(genome_records):
-        # get the best available ID from the dict
-        genome_id_data = genome_record["genome_ID"]
-        raw_genome_id = get_best_available_genome_id(genome_id_data)
-        if raw_genome_id is None or len(raw_genome_id) == 0:
-            logger.warning(f'Invalid input genome record "{genome_record}"')
-            continue
+        logger.info(
+            f"Getting antismash BGC data for genome record {i + 1} of {len(genome_records)}."
+        )
 
-        # check if genome ID exist in the genome status file
-        if raw_genome_id not in gs_dict:
-            gs_dict[raw_genome_id] = GenomeStatus(raw_genome_id)
+        # get the best available genome ID from the dict
+        original_genome_id = get_best_available_genome_id(genome_record["genome_ID"])
+        if not original_genome_id:
+            logger.warning(f"Skipping invalid genome record: {genome_record}")
+            continue
 
-        gs_obj = gs_dict[raw_genome_id]
+        # Retrieve or initialize the GenomeStatus object for the genome ID
+        gs = gs_dict.setdefault(original_genome_id, GenomeStatus(original_genome_id))
 
-        logger.info(
-            f"Checking for antismash data {i + 1}/{len(genome_records)}, "
-            f"current genome ID={raw_genome_id}"
-        )
-        # first, check if BGC data is downloaded
-        if gs_obj.bgc_path and Path(gs_obj.bgc_path).exists():
-            logger.info(f"Genome ID {raw_genome_id} already downloaded to {gs_obj.bgc_path}")
+        # Skip genome if BGC data is downloaded
+        if gs.bgc_path and Path(gs.bgc_path).exists():
+            logger.info(f"Genome ID {original_genome_id} already downloaded to {gs.bgc_path}")
             continue
-        # second, check if lookup attempted previously
-        if gs_obj.resolve_attempted:
-            logger.info(f"Genome ID {raw_genome_id} skipped due to previous failed attempt")
+        # Skip genome if lookup attempted previously
+        if gs.resolve_attempted:
+            logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt")
             continue
 
         # resolve genome ID

From b25506e7d9501df7245a152b6094411364ef4213 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Tue, 18 Mar 2025 19:20:18 +0100
Subject: [PATCH 07/33] Refactor: Move logging for antiSMASH data retrieval
 errors and success messages

---
 src/nplinker/genomics/antismash/antismash_downloader.py    | 3 ---
 .../genomics/antismash/podp_antismash_downloader.py        | 7 +++++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py
index f2397ea8..07662107 100644
--- a/src/nplinker/genomics/antismash/antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/antismash_downloader.py
@@ -54,11 +54,8 @@ def download_and_extract_antismash_data(
 
         _cleanup_extracted_files(extract_path)
 
-        logger.info("antiSMASH BGC data of %s is downloaded and extracted.", antismash_id)
-
     except Exception as e:
         shutil.rmtree(extract_path)
-        logger.warning(e)
         raise e
 
 
diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
index 744025a2..04189f87 100644
--- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
@@ -185,8 +185,11 @@ def podp_download_and_extract_antismash_data(
         # retrieve antismash BGC data from antiSMASH-DB
         try:
             retrieve_antismash_db_data(gs, project_download_root, project_extract_root)
-        except Exception:
-            continue
+            logger.info(f"antiSMASH BGC data for {gs.original_id} is downloaded and extracted")
+        except Exception as e:
+            logger.warning(
+                f"Failed to retrieve BGC data from antiSMASH-DB for {gs.original_id}. Error: {e}"
+            )
 
     # raise and log warning for failed downloads
     failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path]

From 2ed8ac6cf158cc2721c0caa7fb01e70914abe3ad Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Tue, 18 Mar 2025 19:29:05 +0100
Subject: [PATCH 08/33] simplify comment

---
 src/nplinker/genomics/antismash/podp_antismash_downloader.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
index 04189f87..0cdea2fe 100644
--- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
@@ -166,11 +166,10 @@ def podp_download_and_extract_antismash_data(
         # Retrieve or initialize the GenomeStatus object for the genome ID
         gs = gs_dict.setdefault(original_genome_id, GenomeStatus(original_genome_id))
 
-        # Skip genome if BGC data is downloaded
+        # Skip genomes
         if gs.bgc_path and Path(gs.bgc_path).exists():
             logger.info(f"Genome ID {original_genome_id} already downloaded to {gs.bgc_path}")
             continue
-        # Skip genome if lookup attempted previously
         if gs.resolve_attempted:
             logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt")
             continue

From a1e49620a2da8ab7cd00c3864b8e77e3c54e7337 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Tue, 18 Mar 2025 19:30:48 +0100
Subject: [PATCH 09/33] Enhance logging messages in antiSMASH data retrieval

---
 .../genomics/antismash/podp_antismash_downloader.py    | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
index 0cdea2fe..aadf1eb4 100644
--- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
@@ -162,13 +162,15 @@ def podp_download_and_extract_antismash_data(
         if not original_genome_id:
             logger.warning(f"Skipping invalid genome record: {genome_record}")
             continue
-
         # Retrieve or initialize the GenomeStatus object for the genome ID
         gs = gs_dict.setdefault(original_genome_id, GenomeStatus(original_genome_id))
 
         # Skip genomes
         if gs.bgc_path and Path(gs.bgc_path).exists():
-            logger.info(f"Genome ID {original_genome_id} already downloaded to {gs.bgc_path}")
+            logger.info(
+                f"antiSMASH BGC data for genome ID {original_genome_id} already downloaded to "
+                f"{gs.bgc_path}"
+            )
             continue
         if gs.resolve_attempted:
             logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt")
@@ -184,7 +186,9 @@ def podp_download_and_extract_antismash_data(
         # retrieve antismash BGC data from antiSMASH-DB
         try:
             retrieve_antismash_db_data(gs, project_download_root, project_extract_root)
-            logger.info(f"antiSMASH BGC data for {gs.original_id} is downloaded and extracted")
+            logger.info(
+                f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted"
+            )
         except Exception as e:
             logger.warning(
                 f"Failed to retrieve BGC data from antiSMASH-DB for {gs.original_id}. Error: {e}"

From e8dd391e56ea97b138648e755247823925b3f61e Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Tue, 18 Mar 2025 19:38:55 +0100
Subject: [PATCH 10/33] test: adapt test to changed logging info message

---
 tests/unit/genomics/test_podp_antismash_downloader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/genomics/test_podp_antismash_downloader.py b/tests/unit/genomics/test_podp_antismash_downloader.py
index 7fb9ec0c..302d354d 100644
--- a/tests/unit/genomics/test_podp_antismash_downloader.py
+++ b/tests/unit/genomics/test_podp_antismash_downloader.py
@@ -218,7 +218,7 @@ def test_caching(download_root, extract_root, genome_status_file, caplog):
     assert genome_obj.resolve_attempted
     podp_download_and_extract_antismash_data(genome_records, download_root, extract_root)
     assert (
-        f"Genome ID {genome_obj.original_id} already downloaded to {genome_obj.bgc_path}"
+        f"antiSMASH BGC data for genome ID {genome_obj.original_id} already downloaded to {genome_obj.bgc_path}"
         in caplog.text
     )
     assert (

From ea159ca824a5d8bf701c2fb9ab03c3ededff0a59 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 12:06:02 +0100
Subject: [PATCH 11/33] Feat: Add antiSMASH API functionality

---
 src/nplinker/genomics/antismash/__init__.py   |  12 +-
 .../antismash/antismash_api_client.py         |  97 ++++++++++++++++
 .../antismash/antismash_downloader.py         |  97 ++++++++++++----
 .../genomics/antismash/ncbi_downloader.py     | 107 ++++++++++++++++++
 .../antismash/podp_antismash_downloader.py    |  72 ++++++++++--
 .../genomics/test_antismash_downloader.py     |  10 +-
 6 files changed, 362 insertions(+), 33 deletions(-)
 create mode 100644 src/nplinker/genomics/antismash/antismash_api_client.py
 create mode 100644 src/nplinker/genomics/antismash/ncbi_downloader.py

diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py
index e126f548..4817e806 100644
--- a/src/nplinker/genomics/antismash/__init__.py
+++ b/src/nplinker/genomics/antismash/__init__.py
@@ -1,16 +1,24 @@
-from .antismash_downloader import download_and_extract_antismash_data
+from .antismash_api_client import antismash_job_is_done
+from .antismash_api_client import submit_antismash_job
+from .antismash_downloader import download_and_extract_from_antismash_api
+from .antismash_downloader import download_and_extract_from_antismash_db
 from .antismash_loader import AntismashBGCLoader
 from .antismash_loader import parse_bgc_genbank
+from .ncbi_downloader import download_and_extract_ncbi_genome
 from .podp_antismash_downloader import GenomeStatus
 from .podp_antismash_downloader import get_best_available_genome_id
 from .podp_antismash_downloader import podp_download_and_extract_antismash_data
 
 
 __all__ = [
-    "download_and_extract_antismash_data",
+    "download_and_extract_from_antismash_api",
+    "download_and_extract_from_antismash_db",
     "AntismashBGCLoader",
     "parse_bgc_genbank",
     "GenomeStatus",
     "get_best_available_genome_id",
     "podp_download_and_extract_antismash_data",
+    "download_and_extract_ncbi_genome",
+    "submit_antismash_job",
+    "antismash_job_is_done",
 ]
diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py
new file mode 100644
index 00000000..32b932c0
--- /dev/null
+++ b/src/nplinker/genomics/antismash/antismash_api_client.py
@@ -0,0 +1,97 @@
+from __future__ import annotations
+import logging
+from os import PathLike
+from pathlib import Path
+from typing import Optional
+import requests
+
+
+logger = logging.getLogger(__name__)
+
+
+def submit_antismash_job(genbank_filepath: str | PathLike) -> Optional[str]:
+    """Submits an antiSMASH job using the provided GenBank file.
+
+    This function sends a GenBank file to the antiSMASH API
+    and retrieves the job ID if the submission is successful.
+
+    Args:
+        genbank_filepath (str | PathLike): The path to the GenBank file to be submitted.
+
+    Returns:
+        Optional[str]: The job ID if the submission is successful, or None if it fails.
+
+    Raises:
+        requests.exceptions.RequestException: If there is an issue with the HTTP request.
+         RuntimeError: If the API response does not contain a job ID.
+    """
+    url = "https://antismash.secondarymetabolites.org/api/v1.0/submit"
+    genbank_filepath = Path(genbank_filepath)
+
+    with open(genbank_filepath, "rb") as file:
+        files = {"seq": file}
+        response = requests.post(url, files=files)
+        response.raise_for_status()  # Raise an exception for HTTP errors
+
+    data = response.json()
+    if "id" not in data:
+        raise RuntimeError("No antiSMASH job ID returned")
+    return data["id"]
+
+
+def query_antismash_job(job_id: str) -> Optional[dict]:
+    """Gets the status of an antiSMASH job.
+
+    Args:
+        job_id (str): The job ID to query.
+
+    Returns:
+        dict: The response JSON if successful, otherwise None.
+    """
+    url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}"
+
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()  # Raise an exception for HTTP errors
+        return response.json()
+
+    except requests.exceptions.RequestException as req_err:
+        logger.error(f"Request failed for job_id {job_id}: {req_err}")
+    except ValueError as json_err:  # Handles JSON decoding errors
+        logger.error(f"Invalid JSON response for job_id {job_id}: {json_err}")
+    except Exception as err:
+        logger.error(f"Unexpected error while getting job state for job_id {job_id}: {err}")
+
+
+def antismash_job_is_done(job_id: str) -> bool:
+    """Checks if the antiSMASH job is complete by polling the job status.
+
+    Args:
+        job_id (str): The job ID to query.
+
+    Returns:
+        bool: True if the job is done, False if the job is still running.
+
+    Raises:
+        RuntimeError: If the job status could not be retrieved or if the job failed.
+        ValueError: If the job state is missing or unexpected in the response.
+    """
+    response = query_antismash_job(job_id)
+
+    if response is None:
+        raise RuntimeError(f"Failed to retrieve job status for job_id {job_id}")
+    if "state" not in response:
+        raise ValueError(f"Job state missing in response for job_id: {job_id}")
+
+    job_state = response["state"]
+    if job_state in ("running", "queued"):
+        return False
+    if job_state == "done":
+        return True
+    if job_state == "failed":
+        job_status = response.get("status", "No error message provided")
+        raise RuntimeError(f"AntiSMASH job {job_id} failed with an error: {job_status}")
+    else:
+        raise ValueError(
+            f"Unexpected job state for antismash job ID {job_id}. Job state: {job_state}"
+        )
diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py
index 07662107..928001c9 100644
--- a/src/nplinker/genomics/antismash/antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/antismash_downloader.py
@@ -4,6 +4,7 @@
 import shutil
 from os import PathLike
 from pathlib import Path
+import requests
 from nplinker.utils import download_and_extract_archive
 from nplinker.utils import list_dirs
 from nplinker.utils import list_files
@@ -15,50 +16,108 @@
 ANTISMASH_DB_DOWNLOAD_URL = "https://antismash-db.secondarymetabolites.org/output/{}/{}"
 # The antiSMASH DBV2 is for the availability of the old version, better to keep it.
 ANTISMASH_DBV2_DOWNLOAD_URL = "https://antismash-dbv2.secondarymetabolites.org/output/{}/{}"
+# antismash api to download results from submitted jobs
+ANTISMASH_API_DOWNLOAD_URL = "https://antismash.secondarymetabolites.org/upload/{}/{}"
 
 
 def download_and_extract_antismash_data(
-    antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike
+    url: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike
 ) -> None:
     """Download and extract antiSMASH BGC archive for a specified genome.
 
-    The antiSMASH database (https://antismash-db.secondarymetabolites.org/)
-    is used to download the BGC archive. And antiSMASH use RefSeq assembly id
-    of a genome as the id of the archive.
+    This function downloads a BGC archive from the specified URL, extracts its contents,
+    and organizes the extracted files into a structured directory under the given `extract_root`.
 
     Args:
-        antismash_id: The id used to download BGC archive from antiSMASH database.
-            If the id is versioned (e.g., "GCF_004339725.1") please be sure to
-            specify the version as well.
-        download_root: Path to the directory to place downloaded archive in.
-        extract_root: Path to the directory data files will be extracted to.
+        url (str): The URL to download the BGC archive from.
+        antismash_id (str): The identifier for the antiSMASH genome, used to name the extraction directory.
+        download_root: Path to the directory where the downloaded archive will be stored.
+        extract_root: Path to the directory where the data files will be extracted.
             Note that an `antismash` directory will be created in the specified `extract_root` if
             it doesn't exist. The files will be extracted to `<extract_root>/antismash/<antismash_id>` directory.
 
     Raises:
-        ValueError: if `<extract_root>/antismash/<refseq_assembly_id>` dir is not empty.
+        ValueError: if `<extract_root>/antismash/<antismash_id>` dir is not empty.
+        Exception: If any error occurs during the download or extraction process, the partially extracted
+            directory will be cleaned up, and the exception will be re-raised.
 
     Examples:
-        >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted")
+         >>> download_and_extract_antismash_data(
+                 "https://antismash-db.secondarymetabolites.org/output/GCF_001.1/GCF_001.1.zip",
+                 "GCF_001.1",
+                 "/data/download",
+                 "/data/extracted"
+             )
     """
-    download_root = Path(download_root)
     extract_path = Path(extract_root) / "antismash" / antismash_id
 
     _prepare_extract_path(extract_path)
-
     try:
-        for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]:
-            url = base_url.format(antismash_id, antismash_id + ".zip")
-            download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip")
-            break
-
+        download_and_extract_archive(url, download_root, extract_path, f"{antismash_id}.zip")
         _cleanup_extracted_files(extract_path)
-
     except Exception as e:
         shutil.rmtree(extract_path)
         raise e
 
 
+def download_and_extract_from_antismash_api(
+    job_id: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike
+) -> None:
+    """Downloads and extracts results from an antiSMASH API job.
+
+    This function constructs the download URL using the provided job ID then
+    downloads the results as a ZIP file and extracts its contents to the specified directories.
+
+    Args:
+        antismash_id (str): The unique identifier for the antiSMASH dataset.
+        job_id (str): The job ID for the antiSMASH API job.
+        download_root (str or PathLike): The root directory where the ZIP file will be downloaded.
+        extract_root (str or PathLike): The root directory where the contents of the ZIP file will be extracted.
+
+    Raises:
+        requests.exceptions.RequestException: If there is an issue with the HTTP request.
+        zipfile.BadZipFile: If the downloaded file is not a valid ZIP file.
+        OSError: If there is an issue with file operations such as writing or extracting.
+    """
+    url = ANTISMASH_API_DOWNLOAD_URL.format(job_id, antismash_id + ".zip")
+    download_and_extract_antismash_data(url, antismash_id, download_root, extract_root)
+
+
+def download_and_extract_from_antismash_db(
+    refseq_acc: str, download_root: str | PathLike, extract_root: str | PathLike
+) -> None:
+    """Download and extract antiSMASH BGC archive for a specified genome.
+
+    The antiSMASH database (https://antismash-db.secondarymetabolites.org/)
+    is used to download the BGC archive. And antiSMASH use RefSeq assembly id
+    of a genome as the id of the archive.
+
+    Args:
+        refseq_acc: The id used to download BGC archive from antiSMASH database.
+            If the id is versioned (e.g., "GCF_004339725.1") please be sure to
+            specify the version as well.
+        download_root: Path to the directory to place downloaded archive in.
+        extract_root: Path to the directory data files will be extracted to.
+            Note that an `antismash` directory will be created in the specified `extract_root` if
+            it doesn't exist. The files will be extracted to `<extract_root>/antismash/<antismash_id>` directory.
+
+    Raises:
+        ValueError: if `<extract_root>/antismash/<refseq_acc>` dir is not empty.
+
+    Examples:
+        >>> download_and_extract_from_antismash_db("GCF_004339725.1", "/data/download", "/data/extracted")
+    """
+    for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]:
+        url = base_url.format(refseq_acc, f"{refseq_acc}.zip")
+        if requests.head(url).status_code == 404:  # not found
+            continue
+        download_and_extract_antismash_data(url, refseq_acc, download_root, extract_root)
+        return  # Exit the loop once a valid URL is processed
+
+    # if both urls give 404 not found
+    raise RuntimeError(f"No results in antiSMASH DB for {refseq_acc}")
+
+
 def _check_extract_path(extract_path: Path):
     # check if extract_path is empty
     if any(extract_path.iterdir()):
diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py
new file mode 100644
index 00000000..b0b333d5
--- /dev/null
+++ b/src/nplinker/genomics/antismash/ncbi_downloader.py
@@ -0,0 +1,107 @@
+import logging
+import os
+import shutil
+import time
+from os import PathLike
+from pathlib import Path
+from typing import Optional
+from nplinker.utils import check_md5
+from nplinker.utils import download_url
+from nplinker.utils import extract_archive
+
+
+logger = logging.getLogger(__name__)
+
+
+def download_and_extract_ncbi_genome(
+    refseq_id: str,
+    download_root: str | PathLike,
+    extract_root: str | PathLike,
+    max_attempts: int = 10,
+) -> Optional[Path]:
+    """Downloads and extracts an NCBI dataset for a given genome refseq ID.
+
+    This function attempts to download a dataset from the NCBI database using
+    the provided refseq ID. It retries the download process up to a maximum
+    number of times if any errors occur. The function verifies the integrity
+    of the downloaded files using MD5 checksums and moves and renames the
+    GenBank files upon successful verification.
+
+    Args:
+        refseq_id (str): The refseq ID for the dataset to be downloaded.
+        download_root (str or Path): The root directory where the dataset will be downloaded.
+        extract_root (str or Path): The root directory where the dataset will be extracted.
+        max_attempts (int): The maximum number of times to attempt downloading.
+
+    Returns:
+        Path: The path to the extracted dataset if successful, otherwise None.
+
+    Raises:
+        Exception: If the maximum number of retries is reached and the dataset could
+        not be successfully downloaded and extracted.
+    """
+    url = (
+        "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/"
+        f"{refseq_id}/download?include_annotation_type=GENOME_GB"
+    )
+
+    download_root = Path(download_root)
+    extract_path = Path(extract_root) / "ncbi_genomes"
+    filename = f"ncbi_{refseq_id}.zip"
+
+    extract_path.mkdir(parents=True, exist_ok=True)
+
+    for attempt in range(1, max_attempts + 1):
+        try:
+            download_url(url, download_root, filename)
+            archive = download_root / filename
+            break
+        except Exception as e:
+            logger.warning(f"Attempt {attempt}/{max_attempts} failed to download {url}. Error: {e}")
+            if attempt < max_attempts:
+                time.sleep(2)
+    else:
+        raise RuntimeError(
+            f"Maximum download retries ({max_attempts}) reached for {url}. Download failed."
+        )
+
+    extract_archive(archive, extract_path)
+    verify_ncbi_dataset_md5_sums(extract_path)
+
+    # Move and rename GenBank file
+    genbank_path = extract_path / "ncbi_dataset" / "data" / refseq_id / "genomic.gbff"
+    new_genbank_path = extract_path / f"{refseq_id}.gbff"
+    genbank_path.rename(new_genbank_path)
+
+    # Delete unnecessary files
+    shutil.rmtree(extract_path / "ncbi_dataset")
+    os.remove(extract_path / "md5sum.txt")
+    os.remove(extract_path / "README.md")
+
+    return new_genbank_path
+
+
+def verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool:
+    """Verify the integrity of files in a specified directory using MD5 checksums.
+
+    This function reads an "md5sum.txt" file located in the given extraction path,
+    which contains MD5 checksums and corresponding file names. It then computes
+    the MD5 checksum for each file and compares it with the expected value. If any
+    file's checksum does not match, a `ValueError` is raised.
+
+    Args:
+        extract_path (PathLike): Path to the directory containing the files and
+            the "md5sum.txt" file.
+
+    Returns:
+        bool: True if all files pass the MD5 checksum verification.
+
+    Raises:
+        ValueError: If the MD5 checksum of any file does not match the expected value.
+    """
+    with open(extract_path / "md5sum.txt", "r") as f:
+        for line in f:
+            md5sum, file_name = line.strip().split()
+            file_path = extract_path / file_name
+            if not check_md5(file_path, md5sum):
+                raise ValueError(f"MD5 checksum mismatch for {file_path}")
diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
index aadf1eb4..2d5f727c 100644
--- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import re
+import time
 import warnings
 from collections.abc import Mapping
 from collections.abc import Sequence
@@ -11,7 +12,11 @@
 from bs4 import BeautifulSoup
 from jsonschema import validate
 from nplinker.defaults import GENOME_STATUS_FILENAME
-from nplinker.genomics.antismash import download_and_extract_antismash_data
+from nplinker.genomics.antismash import antismash_job_is_done
+from nplinker.genomics.antismash import download_and_extract_from_antismash_api
+from nplinker.genomics.antismash import download_and_extract_from_antismash_db
+from nplinker.genomics.antismash import download_and_extract_ncbi_genome
+from nplinker.genomics.antismash import submit_antismash_job
 from nplinker.schemas import GENOME_STATUS_SCHEMA
 
 
@@ -189,11 +194,33 @@ def podp_download_and_extract_antismash_data(
             logger.info(
                 f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted"
             )
+            continue
         except Exception as e:
             logger.warning(
                 f"Failed to retrieve BGC data from antiSMASH-DB for {gs.original_id}. Error: {e}"
             )
 
+        # retrieve antismash BGC by submitting antismash job via API
+        try:
+            logger.info(
+                f"Downloading genome and submitting antiSMASH job for genome ID {gs.original_id}."
+            )
+            genome_path = download_and_extract_ncbi_genome(
+                gs.resolved_refseq_id, project_download_root, project_extract_root
+            )
+            job_id = submit_antismash_job(genome_path)
+            while antismash_job_is_done(job_id) is False:
+                time.sleep(15)
+            retrieve_antismash_job_data(job_id, gs, project_download_root, project_extract_root)
+            logger.info(
+                f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted"
+            )
+            continue
+        except Exception as e:
+            logger.warning(
+                f"Failed to retrieve BGC data by submitting a antiSMASH job for genome ID {gs.original_id}. Error: {e}"
+            )
+
     # raise and log warning for failed downloads
     failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path]
     if failed_ids:
@@ -286,13 +313,42 @@ def retrieve_antismash_db_data(
     extract_path = Path(extract_root, "antismash", antismash_id)
     download_path = Path(download_root, f"{antismash_id}.zip").absolute()
 
-    try:
-        download_and_extract_antismash_data(antismash_id, download_root, extract_root)
-        Path.touch(extract_path / "completed", exist_ok=True)
-        genome_status.bgc_path = str(download_path)
-    except Exception as e:
-        genome_status.bgc_path = ""
-        raise e
+    download_and_extract_from_antismash_db(antismash_id, download_root, extract_root)
+    Path.touch(extract_path / "completed", exist_ok=True)
+    genome_status.bgc_path = str(download_path)
+
+
+def retrieve_antismash_job_data(
+    job_id: str,
+    genome_status: GenomeStatus,
+    download_root: str | PathLike,
+    extract_root: str | PathLike,
+) -> None:
+    """Retrieve antiSMASH API data for a given genome and update its status.
+
+    This function downloads and extracts antiSMASH data for a genome identified
+    by its resolved RefSeq ID. It updates the `genome_status` object with the
+    path to the downloaded data or sets it to an empty string if an error occurs.
+
+    Args:
+        job_id (str): The job ID for the antiSMASH API job.
+        genome_status (GenomeStatus): An object representing the genome's status,
+            including its resolved RefSeq ID and BGC path.
+        download_root (str | PathLike): The root directory where the antiSMASH
+            data will be downloaded.
+        extract_root (str | PathLike): The root directory where the antiSMASH
+            data will be extracted.
+
+    Raises:
+        Exception: If an error occurs during the download or extraction process.
+    """
+    antismash_id = genome_status.resolved_refseq_id
+    extract_path = Path(extract_root, "antismash", antismash_id)
+    download_path = Path(download_root, f"{antismash_id}.zip").absolute()
+
+    download_and_extract_from_antismash_api(job_id, antismash_id, download_root, extract_root)
+    Path.touch(extract_path / "completed", exist_ok=True)
+    genome_status.bgc_path = str(download_path)
 
 
 def _resolve_genbank_accession(genbank_id: str) -> str:
diff --git a/tests/unit/genomics/test_antismash_downloader.py b/tests/unit/genomics/test_antismash_downloader.py
index 1dfeb4cf..651629c2 100644
--- a/tests/unit/genomics/test_antismash_downloader.py
+++ b/tests/unit/genomics/test_antismash_downloader.py
@@ -1,5 +1,5 @@
 import pytest
-from nplinker.genomics.antismash import download_and_extract_antismash_data
+from nplinker.genomics.antismash import download_and_extract_from_antismash_db
 from nplinker.utils import extract_archive
 from nplinker.utils import list_files
 
@@ -14,7 +14,7 @@ def test_default(self, tmp_path):
         extract_root.mkdir()
         original_extract_root = tmp_path / "original"
         original_extract_root.mkdir()
-        download_and_extract_antismash_data(self.antismash_id, download_root, extract_root)
+        download_and_extract_from_antismash_db(self.antismash_id, download_root, extract_root)
         archive = download_root / "GCF_004339725.1.zip"
         extracted_folder = extract_root / "antismash" / "GCF_004339725.1"
         extracted_files = list_files(extracted_folder, keep_parent=False)
@@ -32,7 +32,9 @@ def test_error_nonempty_path(self, tmp_path):
         nonempty_path = tmp_path / "extracted" / "antismash" / f"{self.antismash_id}" / "subdir"
         nonempty_path.mkdir(parents=True)
         with pytest.raises(ValueError, match="Nonempty directory"):
-            download_and_extract_antismash_data(self.antismash_id, tmp_path, tmp_path / "extracted")
+            download_and_extract_from_antismash_db(
+                self.antismash_id, tmp_path, tmp_path / "extracted"
+            )
 
     # test a non-existent ID, which can be either a fake ID, non-existent in NCBI
     # or a valid NCBI genome ID but it does not have BGC data in antismash database
@@ -44,6 +46,6 @@ def test_nonexisting_id(self, tmp_path):
         extract_root.mkdir()
         for test_id in nonexisting_ids:
             with pytest.raises(RuntimeError):
-                download_and_extract_antismash_data(test_id, download_root, extract_root)
+                download_and_extract_from_antismash_db(test_id, download_root, extract_root)
             extracted_folder = extract_root / "antismash" / test_id
             assert not extracted_folder.exists()

From a972a7001f07fc1ea7eb8924b4fbe71e71f523f6 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 12:06:02 +0100
Subject: [PATCH 12/33] Feat: Add antiSMASH API functionality

---
 src/nplinker/genomics/antismash/__init__.py   |  12 +-
 .../antismash/antismash_api_client.py         |  97 ++++++++++++++++
 .../antismash/antismash_downloader.py         |  97 ++++++++++++----
 .../genomics/antismash/ncbi_downloader.py     | 107 ++++++++++++++++++
 .../antismash/podp_antismash_downloader.py    |  72 ++++++++++--
 .../genomics/test_antismash_downloader.py     |  10 +-
 6 files changed, 362 insertions(+), 33 deletions(-)
 create mode 100644 src/nplinker/genomics/antismash/antismash_api_client.py
 create mode 100644 src/nplinker/genomics/antismash/ncbi_downloader.py

diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py
index e126f548..4817e806 100644
--- a/src/nplinker/genomics/antismash/__init__.py
+++ b/src/nplinker/genomics/antismash/__init__.py
@@ -1,16 +1,24 @@
-from .antismash_downloader import download_and_extract_antismash_data
+from .antismash_api_client import antismash_job_is_done
+from .antismash_api_client import submit_antismash_job
+from .antismash_downloader import download_and_extract_from_antismash_api
+from .antismash_downloader import download_and_extract_from_antismash_db
 from .antismash_loader import AntismashBGCLoader
 from .antismash_loader import parse_bgc_genbank
+from .ncbi_downloader import download_and_extract_ncbi_genome
 from .podp_antismash_downloader import GenomeStatus
 from .podp_antismash_downloader import get_best_available_genome_id
 from .podp_antismash_downloader import podp_download_and_extract_antismash_data
 
 
 __all__ = [
-    "download_and_extract_antismash_data",
+    "download_and_extract_from_antismash_api",
+    "download_and_extract_from_antismash_db",
     "AntismashBGCLoader",
     "parse_bgc_genbank",
     "GenomeStatus",
     "get_best_available_genome_id",
     "podp_download_and_extract_antismash_data",
+    "download_and_extract_ncbi_genome",
+    "submit_antismash_job",
+    "antismash_job_is_done",
 ]
diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py
new file mode 100644
index 00000000..32b932c0
--- /dev/null
+++ b/src/nplinker/genomics/antismash/antismash_api_client.py
@@ -0,0 +1,97 @@
+from __future__ import annotations
+import logging
+from os import PathLike
+from pathlib import Path
+from typing import Optional
+import requests
+
+
+logger = logging.getLogger(__name__)
+
+
+def submit_antismash_job(genbank_filepath: str | PathLike) -> Optional[str]:
+    """Submits an antiSMASH job using the provided GenBank file.
+
+    This function sends a GenBank file to the antiSMASH API
+    and retrieves the job ID if the submission is successful.
+
+    Args:
+        genbank_filepath (str | PathLike): The path to the GenBank file to be submitted.
+
+    Returns:
+        Optional[str]: The job ID if the submission is successful, or None if it fails.
+
+    Raises:
+        requests.exceptions.RequestException: If there is an issue with the HTTP request.
+         RuntimeError: If the API response does not contain a job ID.
+    """
+    url = "https://antismash.secondarymetabolites.org/api/v1.0/submit"
+    genbank_filepath = Path(genbank_filepath)
+
+    with open(genbank_filepath, "rb") as file:
+        files = {"seq": file}
+        response = requests.post(url, files=files)
+        response.raise_for_status()  # Raise an exception for HTTP errors
+
+    data = response.json()
+    if "id" not in data:
+        raise RuntimeError("No antiSMASH job ID returned")
+    return data["id"]
+
+
+def query_antismash_job(job_id: str) -> Optional[dict]:
+    """Gets the status of an antiSMASH job.
+
+    Args:
+        job_id (str): The job ID to query.
+
+    Returns:
+        dict: The response JSON if successful, otherwise None.
+    """
+    url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}"
+
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()  # Raise an exception for HTTP errors
+        return response.json()
+
+    except requests.exceptions.RequestException as req_err:
+        logger.error(f"Request failed for job_id {job_id}: {req_err}")
+    except ValueError as json_err:  # Handles JSON decoding errors
+        logger.error(f"Invalid JSON response for job_id {job_id}: {json_err}")
+    except Exception as err:
+        logger.error(f"Unexpected error while getting job state for job_id {job_id}: {err}")
+
+
+def antismash_job_is_done(job_id: str) -> bool:
+    """Checks if the antiSMASH job is complete by polling the job status.
+
+    Args:
+        job_id (str): The job ID to query.
+
+    Returns:
+        bool: True if the job is done, False if the job is still running.
+
+    Raises:
+        RuntimeError: If the job status could not be retrieved or if the job failed.
+        ValueError: If the job state is missing or unexpected in the response.
+    """
+    response = query_antismash_job(job_id)
+
+    if response is None:
+        raise RuntimeError(f"Failed to retrieve job status for job_id {job_id}")
+    if "state" not in response:
+        raise ValueError(f"Job state missing in response for job_id: {job_id}")
+
+    job_state = response["state"]
+    if job_state in ("running", "queued"):
+        return False
+    if job_state == "done":
+        return True
+    if job_state == "failed":
+        job_status = response.get("status", "No error message provided")
+        raise RuntimeError(f"AntiSMASH job {job_id} failed with an error: {job_status}")
+    else:
+        raise ValueError(
+            f"Unexpected job state for antismash job ID {job_id}. Job state: {job_state}"
+        )
diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py
index 07662107..928001c9 100644
--- a/src/nplinker/genomics/antismash/antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/antismash_downloader.py
@@ -4,6 +4,7 @@
 import shutil
 from os import PathLike
 from pathlib import Path
+import requests
 from nplinker.utils import download_and_extract_archive
 from nplinker.utils import list_dirs
 from nplinker.utils import list_files
@@ -15,50 +16,108 @@
 ANTISMASH_DB_DOWNLOAD_URL = "https://antismash-db.secondarymetabolites.org/output/{}/{}"
 # The antiSMASH DBV2 is for the availability of the old version, better to keep it.
 ANTISMASH_DBV2_DOWNLOAD_URL = "https://antismash-dbv2.secondarymetabolites.org/output/{}/{}"
+# antismash api to download results from submitted jobs
+ANTISMASH_API_DOWNLOAD_URL = "https://antismash.secondarymetabolites.org/upload/{}/{}"
 
 
 def download_and_extract_antismash_data(
-    antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike
+    url: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike
 ) -> None:
     """Download and extract antiSMASH BGC archive for a specified genome.
 
-    The antiSMASH database (https://antismash-db.secondarymetabolites.org/)
-    is used to download the BGC archive. And antiSMASH use RefSeq assembly id
-    of a genome as the id of the archive.
+    This function downloads a BGC archive from the specified URL, extracts its contents,
+    and organizes the extracted files into a structured directory under the given `extract_root`.
 
     Args:
-        antismash_id: The id used to download BGC archive from antiSMASH database.
-            If the id is versioned (e.g., "GCF_004339725.1") please be sure to
-            specify the version as well.
-        download_root: Path to the directory to place downloaded archive in.
-        extract_root: Path to the directory data files will be extracted to.
+        url (str): The URL to download the BGC archive from.
+        antismash_id (str): The identifier for the antiSMASH genome, used to name the extraction directory.
+        download_root: Path to the directory where the downloaded archive will be stored.
+        extract_root: Path to the directory where the data files will be extracted.
             Note that an `antismash` directory will be created in the specified `extract_root` if
             it doesn't exist. The files will be extracted to `<extract_root>/antismash/<antismash_id>` directory.
 
     Raises:
-        ValueError: if `<extract_root>/antismash/<refseq_assembly_id>` dir is not empty.
+        ValueError: if `<extract_root>/antismash/<antismash_id>` dir is not empty.
+        Exception: If any error occurs during the download or extraction process, the partially extracted
+            directory will be cleaned up, and the exception will be re-raised.
 
     Examples:
-        >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted")
+         >>> download_and_extract_antismash_data(
+                 "https://antismash-db.secondarymetabolites.org/output/GCF_001.1/GCF_001.1.zip",
+                 "GCF_001.1",
+                 "/data/download",
+                 "/data/extracted"
+             )
     """
-    download_root = Path(download_root)
     extract_path = Path(extract_root) / "antismash" / antismash_id
 
     _prepare_extract_path(extract_path)
-
     try:
-        for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]:
-            url = base_url.format(antismash_id, antismash_id + ".zip")
-            download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip")
-            break
-
+        download_and_extract_archive(url, download_root, extract_path, f"{antismash_id}.zip")
         _cleanup_extracted_files(extract_path)
-
     except Exception as e:
         shutil.rmtree(extract_path)
         raise e
 
 
+def download_and_extract_from_antismash_api(
+    job_id: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike
+) -> None:
+    """Downloads and extracts results from an antiSMASH API job.
+
+    This function constructs the download URL using the provided job ID then
+    downloads the results as a ZIP file and extracts its contents to the specified directories.
+
+    Args:
+        antismash_id (str): The unique identifier for the antiSMASH dataset.
+        job_id (str): The job ID for the antiSMASH API job.
+        download_root (str or PathLike): The root directory where the ZIP file will be downloaded.
+        extract_root (str or PathLike): The root directory where the contents of the ZIP file will be extracted.
+
+    Raises:
+        requests.exceptions.RequestException: If there is an issue with the HTTP request.
+        zipfile.BadZipFile: If the downloaded file is not a valid ZIP file.
+        OSError: If there is an issue with file operations such as writing or extracting.
+    """
+    url = ANTISMASH_API_DOWNLOAD_URL.format(job_id, antismash_id + ".zip")
+    download_and_extract_antismash_data(url, antismash_id, download_root, extract_root)
+
+
+def download_and_extract_from_antismash_db(
+    refseq_acc: str, download_root: str | PathLike, extract_root: str | PathLike
+) -> None:
+    """Download and extract antiSMASH BGC archive for a specified genome.
+
+    The antiSMASH database (https://antismash-db.secondarymetabolites.org/)
+    is used to download the BGC archive. And antiSMASH use RefSeq assembly id
+    of a genome as the id of the archive.
+
+    Args:
+        refseq_acc: The id used to download BGC archive from antiSMASH database.
+            If the id is versioned (e.g., "GCF_004339725.1") please be sure to
+            specify the version as well.
+        download_root: Path to the directory to place downloaded archive in.
+        extract_root: Path to the directory data files will be extracted to.
+            Note that an `antismash` directory will be created in the specified `extract_root` if
+            it doesn't exist. The files will be extracted to `<extract_root>/antismash/<antismash_id>` directory.
+
+    Raises:
+        ValueError: if `<extract_root>/antismash/<refseq_acc>` dir is not empty.
+
+    Examples:
+        >>> download_and_extract_from_antismash_db("GCF_004339725.1", "/data/download", "/data/extracted")
+    """
+    for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]:
+        url = base_url.format(refseq_acc, f"{refseq_acc}.zip")
+        if requests.head(url).status_code == 404:  # not found
+            continue
+        download_and_extract_antismash_data(url, refseq_acc, download_root, extract_root)
+        return  # Exit the loop once a valid URL is processed
+
+    # if both urls give 404 not found
+    raise RuntimeError(f"No results in antiSMASH DB for {refseq_acc}")
+
+
 def _check_extract_path(extract_path: Path):
     # check if extract_path is empty
     if any(extract_path.iterdir()):
diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py
new file mode 100644
index 00000000..b0b333d5
--- /dev/null
+++ b/src/nplinker/genomics/antismash/ncbi_downloader.py
@@ -0,0 +1,107 @@
+import logging
+import os
+import shutil
+import time
+from os import PathLike
+from pathlib import Path
+from typing import Optional
+from nplinker.utils import check_md5
+from nplinker.utils import download_url
+from nplinker.utils import extract_archive
+
+
+logger = logging.getLogger(__name__)
+
+
+def download_and_extract_ncbi_genome(
+    refseq_id: str,
+    download_root: str | PathLike,
+    extract_root: str | PathLike,
+    max_attempts: int = 10,
+) -> Optional[Path]:
+    """Downloads and extracts an NCBI dataset for a given genome refseq ID.
+
+    This function attempts to download a dataset from the NCBI database using
+    the provided refseq ID. It retries the download process up to a maximum
+    number of times if any errors occur. The function verifies the integrity
+    of the downloaded files using MD5 checksums and moves and renames the
+    GenBank files upon successful verification.
+
+    Args:
+        refseq_id (str): The refseq ID for the dataset to be downloaded.
+        download_root (str or Path): The root directory where the dataset will be downloaded.
+        extract_root (str or Path): The root directory where the dataset will be extracted.
+        max_attempts (int): The maximum number of times to attempt downloading.
+
+    Returns:
+        Path: The path to the extracted dataset if successful, otherwise None.
+
+    Raises:
+        Exception: If the maximum number of retries is reached and the dataset could
+        not be successfully downloaded and extracted.
+    """
+    url = (
+        "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/"
+        f"{refseq_id}/download?include_annotation_type=GENOME_GB"
+    )
+
+    download_root = Path(download_root)
+    extract_path = Path(extract_root) / "ncbi_genomes"
+    filename = f"ncbi_{refseq_id}.zip"
+
+    extract_path.mkdir(parents=True, exist_ok=True)
+
+    for attempt in range(1, max_attempts + 1):
+        try:
+            download_url(url, download_root, filename)
+            archive = download_root / filename
+            break
+        except Exception as e:
+            logger.warning(f"Attempt {attempt}/{max_attempts} failed to download {url}. Error: {e}")
+            if attempt < max_attempts:
+                time.sleep(2)
+    else:
+        raise RuntimeError(
+            f"Maximum download retries ({max_attempts}) reached for {url}. Download failed."
+        )
+
+    extract_archive(archive, extract_path)
+    verify_ncbi_dataset_md5_sums(extract_path)
+
+    # Move and rename GenBank file
+    genbank_path = extract_path / "ncbi_dataset" / "data" / refseq_id / "genomic.gbff"
+    new_genbank_path = extract_path / f"{refseq_id}.gbff"
+    genbank_path.rename(new_genbank_path)
+
+    # Delete unnecessary files
+    shutil.rmtree(extract_path / "ncbi_dataset")
+    os.remove(extract_path / "md5sum.txt")
+    os.remove(extract_path / "README.md")
+
+    return new_genbank_path
+
+
+def verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool:
+    """Verify the integrity of files in a specified directory using MD5 checksums.
+
+    This function reads an "md5sum.txt" file located in the given extraction path,
+    which contains MD5 checksums and corresponding file names. It then computes
+    the MD5 checksum for each file and compares it with the expected value. If any
+    file's checksum does not match, a `ValueError` is raised.
+
+    Args:
+        extract_path (PathLike): Path to the directory containing the files and
+            the "md5sum.txt" file.
+
+    Returns:
+        bool: True if all files pass the MD5 checksum verification.
+
+    Raises:
+        ValueError: If the MD5 checksum of any file does not match the expected value.
+    """
+    with open(extract_path / "md5sum.txt", "r") as f:
+        for line in f:
+            md5sum, file_name = line.strip().split()
+            file_path = extract_path / file_name
+            if not check_md5(file_path, md5sum):
+                raise ValueError(f"MD5 checksum mismatch for {file_path}")
diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
index aadf1eb4..2d5f727c 100644
--- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import re
+import time
 import warnings
 from collections.abc import Mapping
 from collections.abc import Sequence
@@ -11,7 +12,11 @@
 from bs4 import BeautifulSoup
 from jsonschema import validate
 from nplinker.defaults import GENOME_STATUS_FILENAME
-from nplinker.genomics.antismash import download_and_extract_antismash_data
+from nplinker.genomics.antismash import antismash_job_is_done
+from nplinker.genomics.antismash import download_and_extract_from_antismash_api
+from nplinker.genomics.antismash import download_and_extract_from_antismash_db
+from nplinker.genomics.antismash import download_and_extract_ncbi_genome
+from nplinker.genomics.antismash import submit_antismash_job
 from nplinker.schemas import GENOME_STATUS_SCHEMA
 
 
@@ -189,11 +194,33 @@ def podp_download_and_extract_antismash_data(
             logger.info(
                 f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted"
             )
+            continue
         except Exception as e:
             logger.warning(
                 f"Failed to retrieve BGC data from antiSMASH-DB for {gs.original_id}. Error: {e}"
             )
 
+        # retrieve antismash BGC by submitting antismash job via API
+        try:
+            logger.info(
+                f"Downloading genome and submitting antiSMASH job for genome ID {gs.original_id}."
+            )
+            genome_path = download_and_extract_ncbi_genome(
+                gs.resolved_refseq_id, project_download_root, project_extract_root
+            )
+            job_id = submit_antismash_job(genome_path)
+            while antismash_job_is_done(job_id) is False:
+                time.sleep(15)
+            retrieve_antismash_job_data(job_id, gs, project_download_root, project_extract_root)
+            logger.info(
+                f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted"
+            )
+            continue
+        except Exception as e:
+            logger.warning(
+                f"Failed to retrieve BGC data by submitting a antiSMASH job for genome ID {gs.original_id}. Error: {e}"
+            )
+
     # raise and log warning for failed downloads
     failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path]
     if failed_ids:
@@ -286,13 +313,42 @@ def retrieve_antismash_db_data(
     extract_path = Path(extract_root, "antismash", antismash_id)
     download_path = Path(download_root, f"{antismash_id}.zip").absolute()
 
-    try:
-        download_and_extract_antismash_data(antismash_id, download_root, extract_root)
-        Path.touch(extract_path / "completed", exist_ok=True)
-        genome_status.bgc_path = str(download_path)
-    except Exception as e:
-        genome_status.bgc_path = ""
-        raise e
+    download_and_extract_from_antismash_db(antismash_id, download_root, extract_root)
+    Path.touch(extract_path / "completed", exist_ok=True)
+    genome_status.bgc_path = str(download_path)
+
+
+def retrieve_antismash_job_data(
+    job_id: str,
+    genome_status: GenomeStatus,
+    download_root: str | PathLike,
+    extract_root: str | PathLike,
+) -> None:
+    """Retrieve antiSMASH API data for a given genome and update its status.
+
+    This function downloads and extracts antiSMASH data for a genome identified
+    by its resolved RefSeq ID. It updates the `genome_status` object with the
+    path to the downloaded data or sets it to an empty string if an error occurs.
+
+    Args:
+        job_id (str): The job ID for the antiSMASH API job.
+        genome_status (GenomeStatus): An object representing the genome's status,
+            including its resolved RefSeq ID and BGC path.
+        download_root (str | PathLike): The root directory where the antiSMASH
+            data will be downloaded.
+        extract_root (str | PathLike): The root directory where the antiSMASH
+            data will be extracted.
+
+    Raises:
+        Exception: If an error occurs during the download or extraction process.
+    """
+    antismash_id = genome_status.resolved_refseq_id
+    extract_path = Path(extract_root, "antismash", antismash_id)
+    download_path = Path(download_root, f"{antismash_id}.zip").absolute()
+
+    download_and_extract_from_antismash_api(job_id, antismash_id, download_root, extract_root)
+    Path.touch(extract_path / "completed", exist_ok=True)
+    genome_status.bgc_path = str(download_path)
 
 
 def _resolve_genbank_accession(genbank_id: str) -> str:
diff --git a/tests/unit/genomics/test_antismash_downloader.py b/tests/unit/genomics/test_antismash_downloader.py
index 1dfeb4cf..651629c2 100644
--- a/tests/unit/genomics/test_antismash_downloader.py
+++ b/tests/unit/genomics/test_antismash_downloader.py
@@ -1,5 +1,5 @@
 import pytest
-from nplinker.genomics.antismash import download_and_extract_antismash_data
+from nplinker.genomics.antismash import download_and_extract_from_antismash_db
 from nplinker.utils import extract_archive
 from nplinker.utils import list_files
 
@@ -14,7 +14,7 @@ def test_default(self, tmp_path):
         extract_root.mkdir()
         original_extract_root = tmp_path / "original"
         original_extract_root.mkdir()
-        download_and_extract_antismash_data(self.antismash_id, download_root, extract_root)
+        download_and_extract_from_antismash_db(self.antismash_id, download_root, extract_root)
         archive = download_root / "GCF_004339725.1.zip"
         extracted_folder = extract_root / "antismash" / "GCF_004339725.1"
         extracted_files = list_files(extracted_folder, keep_parent=False)
@@ -32,7 +32,9 @@ def test_error_nonempty_path(self, tmp_path):
         nonempty_path = tmp_path / "extracted" / "antismash" / f"{self.antismash_id}" / "subdir"
         nonempty_path.mkdir(parents=True)
         with pytest.raises(ValueError, match="Nonempty directory"):
-            download_and_extract_antismash_data(self.antismash_id, tmp_path, tmp_path / "extracted")
+            download_and_extract_from_antismash_db(
+                self.antismash_id, tmp_path, tmp_path / "extracted"
+            )
 
     # test a non-existent ID, which can be either a fake ID, non-existent in NCBI
     # or a valid NCBI genome ID but it does not have BGC data in antismash database
@@ -44,6 +46,6 @@ def test_nonexisting_id(self, tmp_path):
         extract_root.mkdir()
         for test_id in nonexisting_ids:
             with pytest.raises(RuntimeError):
-                download_and_extract_antismash_data(test_id, download_root, extract_root)
+                download_and_extract_from_antismash_db(test_id, download_root, extract_root)
             extracted_folder = extract_root / "antismash" / test_id
             assert not extracted_folder.exists()

From f3a159f3bcfbc1568058ecd3a6fda08838953439 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 14:27:18 +0100
Subject: [PATCH 13/33] fix: improve logging message for start of antiSMASH API
 process

---
 src/nplinker/genomics/antismash/podp_antismash_downloader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
index 2d5f727c..35f269cc 100644
--- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
@@ -203,7 +203,8 @@ def podp_download_and_extract_antismash_data(
         # retrieve antismash BGC by submitting antismash job via API
         try:
             logger.info(
-                f"Downloading genome and submitting antiSMASH job for genome ID {gs.original_id}."
+                "Downloading genome assembly from NCBI and submitting antiSMASH job for "
+                f"genome ID {gs.original_id}."
             )
             genome_path = download_and_extract_ncbi_genome(
                 gs.resolved_refseq_id, project_download_root, project_extract_root

From da4da3c85d472c7e535068072da9cf5943b87410 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 14:30:58 +0100
Subject: [PATCH 14/33] docs: improve docstring for
 download_and_extract_ncbi_genome function

---
 .../genomics/antismash/ncbi_downloader.py     | 27 ++++++++++---------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py
index b0b333d5..a65a5b92 100644
--- a/src/nplinker/genomics/antismash/ncbi_downloader.py
+++ b/src/nplinker/genomics/antismash/ncbi_downloader.py
@@ -19,26 +19,27 @@ def download_and_extract_ncbi_genome(
     extract_root: str | PathLike,
     max_attempts: int = 10,
 ) -> Optional[Path]:
-    """Downloads and extracts an NCBI dataset for a given genome refseq ID.
+    """Downloads and extracts an NCBI dataset for a given genome RefSeq ID.
 
-    This function attempts to download a dataset from the NCBI database using
-    the provided refseq ID. It retries the download process up to a maximum
-    number of times if any errors occur. The function verifies the integrity
-    of the downloaded files using MD5 checksums and moves and renames the
-    GenBank files upon successful verification.
+    This function retrieves a dataset from the NCBI database using the provided
+    RefSeq ID. It retries the download process up to a specified maximum number
+    of attempts in case of errors. The function verifies the integrity of the
+    downloaded files using MD5 checksums, extracts the dataset, and renames the
+    GenBank file for easier access. Unnecessary files are removed after successful
+    processing.
 
     Args:
-        refseq_id (str): The refseq ID for the dataset to be downloaded.
-        download_root (str or Path): The root directory where the dataset will be downloaded.
-        extract_root (str or Path): The root directory where the dataset will be extracted.
-        max_attempts (int): The maximum number of times to attempt downloading.
+        refseq_id (str): The RefSeq ID for the dataset to be downloaded.
+        download_root (str | PathLike): The directory where the dataset will be downloaded.
+        extract_root (str | PathLike): The directory where the dataset will be extracted.
+        max_attempts (int): The maximum number of download attempts. Defaults to 10.
 
     Returns:
-        Path: The path to the extracted dataset if successful, otherwise None.
+        Optional[Path]: The path to the extracted GenBank file if successful, otherwise None.
 
     Raises:
-        Exception: If the maximum number of retries is reached and the dataset could
-        not be successfully downloaded and extracted.
+        RuntimeError: If the maximum number of retries is reached and the dataset
+            could not be successfully downloaded and extracted.
     """
     url = (
         "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/"

From af094e4176c912fbc164ecbf92cd477faa20bbd3 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 14:39:45 +0100
Subject: [PATCH 15/33] fix: update logging messages for antiSMASH data
 retrieval failures

---
 .../genomics/antismash/podp_antismash_downloader.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
index 35f269cc..f96538e1 100644
--- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
@@ -196,8 +196,9 @@ def podp_download_and_extract_antismash_data(
             )
             continue
         except Exception as e:
-            logger.warning(
-                f"Failed to retrieve BGC data from antiSMASH-DB for {gs.original_id}. Error: {e}"
+            logger.info(
+                f"Unable to retrieve BGC data from antiSMASH-DB for genome ID {gs.original_id}. "
+                f"Error: {e}"
             )
 
         # retrieve antismash BGC by submitting antismash job via API
@@ -218,10 +219,14 @@ def podp_download_and_extract_antismash_data(
             )
             continue
         except Exception as e:
-            logger.warning(
-                f"Failed to retrieve BGC data by submitting a antiSMASH job for genome ID {gs.original_id}. Error: {e}"
+            logger.info(
+                f"Unable to retrieve BGC data via antiSMASH API for genome ID {gs.original_id}. "
+                f"Error: {e}"
             )
 
+        if gs.bgc_path == "":
+            logger.warning(f"Failed to retrieve BGC data for genome ID {gs.original_id}.")
+
     # raise and log warning for failed downloads
     failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path]
     if failed_ids:

From 99d99a25714ca7ad76301698c048018b6ac285d5 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 14:43:35 +0100
Subject: [PATCH 16/33] add logging after antiSMASH job submission

---
 src/nplinker/genomics/antismash/podp_antismash_downloader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
index f96538e1..efd7acc5 100644
--- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
@@ -211,6 +211,7 @@ def podp_download_and_extract_antismash_data(
                 gs.resolved_refseq_id, project_download_root, project_extract_root
             )
             job_id = submit_antismash_job(genome_path)
+            logger.info(f"Waiting for antiSMASH job {job_id} to complete.")
             while antismash_job_is_done(job_id) is False:
                 time.sleep(15)
             retrieve_antismash_job_data(job_id, gs, project_download_root, project_extract_root)

From 1c8393479a9d27924be27086347fed048f152194 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 16:48:57 +0100
Subject: [PATCH 17/33] refactor: rename refseq_id to genome_assembly_acc

---
 src/nplinker/genomics/antismash/ncbi_downloader.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py
index a65a5b92..b6b5da0c 100644
--- a/src/nplinker/genomics/antismash/ncbi_downloader.py
+++ b/src/nplinker/genomics/antismash/ncbi_downloader.py
@@ -14,7 +14,7 @@
 
 
 def download_and_extract_ncbi_genome(
-    refseq_id: str,
+    genome_assembly_acc: str,
     download_root: str | PathLike,
     extract_root: str | PathLike,
     max_attempts: int = 10,
@@ -29,7 +29,7 @@ def download_and_extract_ncbi_genome(
     processing.
 
     Args:
-        refseq_id (str): The RefSeq ID for the dataset to be downloaded.
+        genome_assembly_acc (str): The NCBI accession of the genome assembly to be downloaded.
         download_root (str | PathLike): The directory where the dataset will be downloaded.
         extract_root (str | PathLike): The directory where the dataset will be extracted.
         max_attempts (int): The maximum number of download attempts. Defaults to 10.
@@ -43,12 +43,12 @@ def download_and_extract_ncbi_genome(
     """
     url = (
         "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/"
-        f"{refseq_id}/download?include_annotation_type=GENOME_GB"
+        f"{genome_assembly_acc}/download?include_annotation_type=GENOME_GB"
     )
 
     download_root = Path(download_root)
     extract_path = Path(extract_root) / "ncbi_genomes"
-    filename = f"ncbi_{refseq_id}.zip"
+    filename = f"ncbi_{genome_assembly_acc}.zip"
 
     extract_path.mkdir(parents=True, exist_ok=True)
 
@@ -70,8 +70,8 @@ def download_and_extract_ncbi_genome(
     verify_ncbi_dataset_md5_sums(extract_path)
 
     # Move and rename GenBank file
-    genbank_path = extract_path / "ncbi_dataset" / "data" / refseq_id / "genomic.gbff"
-    new_genbank_path = extract_path / f"{refseq_id}.gbff"
+    genbank_path = extract_path / "ncbi_dataset" / "data" / genome_assembly_acc / "genomic.gbff"
+    new_genbank_path = extract_path / f"{genome_assembly_acc}.gbff"
     genbank_path.rename(new_genbank_path)
 
     # Delete unnecessary files

From 496db38b06416c6accb39bf197fee388502eaa4b Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 16:49:50 +0100
Subject: [PATCH 18/33] improve genome download process with validation and
 retry logic

---
 .../genomics/antismash/ncbi_downloader.py     | 71 +++++++++++++------
 1 file changed, 49 insertions(+), 22 deletions(-)

diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py
index b6b5da0c..d2095e81 100644
--- a/src/nplinker/genomics/antismash/ncbi_downloader.py
+++ b/src/nplinker/genomics/antismash/ncbi_downloader.py
@@ -5,6 +5,8 @@
 from os import PathLike
 from pathlib import Path
 from typing import Optional
+import httpx
+import requests
 from nplinker.utils import check_md5
 from nplinker.utils import download_url
 from nplinker.utils import extract_archive
@@ -41,31 +43,11 @@ def download_and_extract_ncbi_genome(
         RuntimeError: If the maximum number of retries is reached and the dataset
             could not be successfully downloaded and extracted.
     """
-    url = (
-        "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/"
-        f"{genome_assembly_acc}/download?include_annotation_type=GENOME_GB"
-    )
-
-    download_root = Path(download_root)
     extract_path = Path(extract_root) / "ncbi_genomes"
-    filename = f"ncbi_{genome_assembly_acc}.zip"
-
     extract_path.mkdir(parents=True, exist_ok=True)
 
-    for attempt in range(1, max_attempts + 1):
-        try:
-            download_url(url, download_root, filename)
-            archive = download_root / filename
-            break
-        except Exception as e:
-            logger.warning(f"Attempt {attempt}/{max_attempts} failed to download {url}. Error: {e}")
-            if attempt < max_attempts:
-                time.sleep(2)
-    else:
-        raise RuntimeError(
-            f"Maximum download retries ({max_attempts}) reached for {url}. Download failed."
-        )
-
+    _check_genome_accession_validity(genome_assembly_acc)
+    archive = _download_genome(genome_assembly_acc, download_root, max_attempts)
     extract_archive(archive, extract_path)
     verify_ncbi_dataset_md5_sums(extract_path)
 
@@ -106,3 +88,48 @@ def verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool:
             file_path = extract_path / file_name
             if not check_md5(file_path, md5sum):
                 raise ValueError(f"MD5 checksum mismatch for {file_path}")
+
+
+def _check_genome_accession_validity(genome_assembly_acc, max_attempts=10):
+    """Check the validity of genome accessio."""
+    url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{genome_assembly_acc}/check"
+
+    # Retry multiple times because NCBI has currently issues (500 Internal Server Error)
+    for attempt in range(1, max_attempts + 1):
+        try:
+            response = requests.get(url)
+            response.raise_for_status()
+            break
+        except Exception:
+            if attempt < max_attempts:
+                time.sleep(1)
+
+    # Raise if no attempt was successful
+    response.raise_for_status()
+    # Raise if genome assembly is not successful
+    if "valid_assemblies" not in response.json():
+        raise ValueError(f"Not a valid genome assembly accession: {genome_assembly_acc}")
+
+
+def _download_genome(genome_assembly_acc, download_root, max_attempts):
+    url = (
+        "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/"
+        f"{genome_assembly_acc}/download?include_annotation_type=GENOME_GB"
+    )
+    download_root = Path(download_root)
+    filename = f"ncbi_{genome_assembly_acc}.zip"
+
+    # Retry multiple times because NCBI has issues currently
+    for attempt in range(1, max_attempts + 1):
+        try:
+            download_url(url, download_root, filename)
+            return download_root / filename
+        except httpx.ReadTimeout as e:
+            logger.warning(f"Attempt {attempt}/{max_attempts} failed to download {url}. Error: {e}")
+            if attempt < max_attempts:
+                time.sleep(1)
+    else:
+        raise httpx.ReadTimeout(
+            f"Failed to download the genome {genome_assembly_acc} from NCBI. "
+            f"Maximum download retries ({max_attempts}) reached for {url}."
+        )

From cc6573c206869fbb6152a02d0bad494f072b8669 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 16:50:28 +0100
Subject: [PATCH 19/33] test: add unit tests for
 download_and_extract_ncbi_genome function

---
 tests/unit/genomics/test_ncbi_downloader.py | 43 +++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 tests/unit/genomics/test_ncbi_downloader.py

diff --git a/tests/unit/genomics/test_ncbi_downloader.py b/tests/unit/genomics/test_ncbi_downloader.py
new file mode 100644
index 00000000..dadbe9ea
--- /dev/null
+++ b/tests/unit/genomics/test_ncbi_downloader.py
@@ -0,0 +1,43 @@
+from unittest.mock import patch
+import httpx
+import pytest
+from nplinker.genomics.antismash.ncbi_downloader import download_and_extract_ncbi_genome
+
+
+@pytest.fixture
+def download_root(tmp_path):
+    return tmp_path / "download"
+
+
+@pytest.fixture
+def extract_root(tmp_path):
+    return tmp_path / "extracted"
+
+
+def test_download_and_extract_ncbi_genome_success(download_root, extract_root):
+    refseq_id = "GCF_000514775.1"
+
+    genome_path = download_and_extract_ncbi_genome(refseq_id, download_root, extract_root)
+
+    assert genome_path == extract_root / "ncbi_genomes" / f"{refseq_id}.gbff"
+    assert not (extract_root / "ncbi_genomes" / "md5sum.txt").exists()
+    assert not (extract_root / "ncbi_genomes" / "README.md").exists()
+    assert not (extract_root / "ncbi_genomes" / "ncbi_dataset").exists()
+
+
+def test_download_and_extract_ncbi_genome_max_retries(download_root, extract_root):
+    refseq_id = "GCF_000514775.1"
+
+    with patch(
+        "nplinker.genomics.antismash.ncbi_downloader.download_url",
+        side_effect=httpx.ReadTimeout("Download failed"),
+    ):
+        with pytest.raises(httpx.ReadTimeout, match="Maximum download retries"):
+            download_and_extract_ncbi_genome(refseq_id, download_root, extract_root, max_attempts=1)
+
+
+def test_download_and_extract_ncbi_genome_invalid_refseq_id(download_root, extract_root):
+    refseq_id = "invalid_ref_seq_id"
+
+    with pytest.raises(ValueError, match="Not a valid genome assembly accession"):
+        download_and_extract_ncbi_genome(refseq_id, download_root, extract_root)

From 51e4817b669a18be12a9c814213905e667a85081 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 16:50:52 +0100
Subject: [PATCH 20/33] refactor: rename verify_ncbi_dataset_md5_sums to
 _verify_ncbi_dataset_md5_sums for consistency

---
 src/nplinker/genomics/antismash/ncbi_downloader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py
index d2095e81..f3260ef0 100644
--- a/src/nplinker/genomics/antismash/ncbi_downloader.py
+++ b/src/nplinker/genomics/antismash/ncbi_downloader.py
@@ -49,7 +49,7 @@ def download_and_extract_ncbi_genome(
     _check_genome_accession_validity(genome_assembly_acc)
     archive = _download_genome(genome_assembly_acc, download_root, max_attempts)
     extract_archive(archive, extract_path)
-    verify_ncbi_dataset_md5_sums(extract_path)
+    _verify_ncbi_dataset_md5_sums(extract_path)
 
     # Move and rename GenBank file
     genbank_path = extract_path / "ncbi_dataset" / "data" / genome_assembly_acc / "genomic.gbff"
@@ -64,7 +64,7 @@ def download_and_extract_ncbi_genome(
     return new_genbank_path
 
 
-def verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool:
+def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool:
     """Verify the integrity of files in a specified directory using MD5 checksums.
 
     This function reads an "md5sum.txt" file located in the given extraction path,

From 6b564ea6d418763a312ac3e59144a4d267b42c9e Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 16:52:24 +0100
Subject: [PATCH 21/33] refactor: move _verify_ncbi_dataset_md5_sums function
 to a new location for better organization

---
 .../genomics/antismash/ncbi_downloader.py     | 52 +++++++++----------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py
index f3260ef0..3a91b367 100644
--- a/src/nplinker/genomics/antismash/ncbi_downloader.py
+++ b/src/nplinker/genomics/antismash/ncbi_downloader.py
@@ -64,32 +64,6 @@ def download_and_extract_ncbi_genome(
     return new_genbank_path
 
 
-def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool:
-    """Verify the integrity of files in a specified directory using MD5 checksums.
-
-    This function reads an "md5sum.txt" file located in the given extraction path,
-    which contains MD5 checksums and corresponding file names. It then computes
-    the MD5 checksum for each file and compares it with the expected value. If any
-    file's checksum does not match, a `ValueError` is raised.
-
-    Args:
-        extract_path (PathLike): Path to the directory containing the files and
-            the "md5sum.txt" file.
-
-    Returns:
-        bool: True if all files pass the MD5 checksum verification.
-
-    Raises:
-        ValueError: If the MD5 checksum of any file does not match the expected value.
-    """
-    with open(extract_path / "md5sum.txt", "r") as f:
-        for line in f:
-            md5sum, file_name = line.strip().split()
-            file_path = extract_path / file_name
-            if not check_md5(file_path, md5sum):
-                raise ValueError(f"MD5 checksum mismatch for {file_path}")
-
-
 def _check_genome_accession_validity(genome_assembly_acc, max_attempts=10):
     """Check the validity of genome accessio."""
     url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{genome_assembly_acc}/check"
@@ -133,3 +107,29 @@ def _download_genome(genome_assembly_acc, download_root, max_attempts):
             f"Failed to download the genome {genome_assembly_acc} from NCBI. "
             f"Maximum download retries ({max_attempts}) reached for {url}."
         )
+
+
+def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool:
+    """Verify the integrity of files in a specified directory using MD5 checksums.
+
+    This function reads an "md5sum.txt" file located in the given extraction path,
+    which contains MD5 checksums and corresponding file names. It then computes
+    the MD5 checksum for each file and compares it with the expected value. If any
+    file's checksum does not match, a `ValueError` is raised.
+
+    Args:
+        extract_path (PathLike): Path to the directory containing the files and
+            the "md5sum.txt" file.
+
+    Returns:
+        bool: True if all files pass the MD5 checksum verification.
+
+    Raises:
+        ValueError: If the MD5 checksum of any file does not match the expected value.
+    """
+    with open(extract_path / "md5sum.txt", "r") as f:
+        for line in f:
+            md5sum, file_name = line.strip().split()
+            file_path = extract_path / file_name
+            if not check_md5(file_path, md5sum):
+                raise ValueError(f"MD5 checksum mismatch for {file_path}")

From 7570852bb7437419228b02978d820deae009bead Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 18:06:18 +0100
Subject: [PATCH 22/33] feat: handle already download antiSMASH results

---
 src/nplinker/genomics/antismash/__init__.py   |  2 +
 .../antismash/antismash_downloader.py         | 35 ++++++++++++++
 .../antismash/podp_antismash_downloader.py    | 46 ++++++++++++++++++-
 3 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py
index 4817e806..7c9179a0 100644
--- a/src/nplinker/genomics/antismash/__init__.py
+++ b/src/nplinker/genomics/antismash/__init__.py
@@ -2,6 +2,7 @@
 from .antismash_api_client import submit_antismash_job
 from .antismash_downloader import download_and_extract_from_antismash_api
 from .antismash_downloader import download_and_extract_from_antismash_db
+from .antismash_downloader import extract_antismash_data
 from .antismash_loader import AntismashBGCLoader
 from .antismash_loader import parse_bgc_genbank
 from .ncbi_downloader import download_and_extract_ncbi_genome
@@ -11,6 +12,7 @@
 
 
 __all__ = [
+    "extract_antismash_data",
     "download_and_extract_from_antismash_api",
     "download_and_extract_from_antismash_db",
     "AntismashBGCLoader",
diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py
index 928001c9..ef351ff8 100644
--- a/src/nplinker/genomics/antismash/antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/antismash_downloader.py
@@ -6,6 +6,7 @@
 from pathlib import Path
 import requests
 from nplinker.utils import download_and_extract_archive
+from nplinker.utils import extract_archive
 from nplinker.utils import list_dirs
 from nplinker.utils import list_files
 
@@ -118,6 +119,40 @@ def download_and_extract_from_antismash_db(
     raise RuntimeError(f"No results in antiSMASH DB for {refseq_acc}")
 
 
+def extract_antismash_data(
+    archive: str | PathLike, extract_root: str | PathLike, antimash_id: str
+) -> None:
+    """Extracts antiSMASH results from a given archive into a specified directory.
+
+    This function handles the extraction of antiSMASH results by preparing the
+    extraction path, extracting the archive, and performing cleanup of
+    unnecessary files. If an error occurs during the process, the partially
+    extracted files are removed, and the exception is re-raised.
+
+    Args:
+        archive (str | PathLike): The path to the archive file containing antiSMASH results.
+        extract_root (str | PathLike): The root directory where the data should
+            be extracted.
+        antimash_id (str): A unique identifier for the antiSMASH data, used to
+            create a subdirectory for the extracted files.
+
+    Raises:
+        Exception: If any error occurs during the extraction process, the
+            exception is re-raised after cleaning up the extraction directory.
+    """
+    extract_path = Path(extract_root) / "antismash" / antimash_id
+
+    _prepare_extract_path(extract_path)
+
+    try:
+        extract_archive(archive, extract_path, remove_finished=False)
+        _cleanup_extracted_files(extract_path)
+
+    except Exception as e:
+        shutil.rmtree(extract_path)
+        raise e
+
+
 def _check_extract_path(extract_path: Path):
     # check if extract_path is empty
     if any(extract_path.iterdir()):
diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
index efd7acc5..311c5160 100644
--- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
@@ -16,6 +16,7 @@
 from nplinker.genomics.antismash import download_and_extract_from_antismash_api
 from nplinker.genomics.antismash import download_and_extract_from_antismash_db
 from nplinker.genomics.antismash import download_and_extract_ncbi_genome
+from nplinker.genomics.antismash import extract_antismash_data
 from nplinker.genomics.antismash import submit_antismash_job
 from nplinker.schemas import GENOME_STATUS_SCHEMA
 
@@ -170,13 +171,23 @@ def podp_download_and_extract_antismash_data(
         # Retrieve or initialize the GenomeStatus object for the genome ID
         gs = gs_dict.setdefault(original_genome_id, GenomeStatus(original_genome_id))
 
-        # Skip genomes
+        # Check if genomes already have antiSMASH BGC data
         if gs.bgc_path and Path(gs.bgc_path).exists():
             logger.info(
                 f"antiSMASH BGC data for genome ID {original_genome_id} already downloaded to "
                 f"{gs.bgc_path}"
             )
-            continue
+            try:
+                process_existing_antismash_data(gs, project_extract_root)
+                continue
+            except Exception as e:
+                logger.warning(
+                    "Failed to process existing antiSMASH BGC data for genome ID "
+                    f"{original_genome_id}. Error: {e}"
+                )
+        gs.bgc_path = ""  # Reset bgc path
+
+        # Check if a previous attempt to get bgc data has failed
         if gs.resolve_attempted:
             logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt")
             continue
@@ -296,6 +307,37 @@ def get_genome_assembly_accession(
         raise RuntimeError("Failed to get genome assembly accession")
 
 
+def process_existing_antismash_data(gs_obj: GenomeStatus, extract_root: str | PathLike) -> None:
+    """Processes already downloaded antiSMASH BGC data archive.
+
+    This function ensures that the antiSMASH data archive associated with a given genomic sequence
+    object is properly extracted into a specified directory. If the data has already been extracted,
+    the function skips the extraction process.
+
+    Args:
+        gs_obj: An object representing a genomic sequence, which contains the path
+                to the antiSMASH BGC data (accessible via `gs_obj.bgc_path`) and
+                an original identifier (`gs_obj.original_id`).
+        extract_root: The root directory where the antiSMASH data should be extracted.
+
+    Raises:
+        Any exceptions raised by the `extract_antismash_data` function if the extraction fails.
+    """
+    antismash_id = Path(gs_obj.bgc_path).stem
+    extract_path = Path(extract_root, "antismash", antismash_id)
+    completed_marker = extract_path / "completed"
+
+    # Check if archive is already successfully extracted
+    if completed_marker.exists():
+        logger.info(
+            f"antiSMASH BGC data for {gs_obj.original_id} already extracted at {extract_path}."
+        )
+        return
+
+    extract_antismash_data(gs_obj.bgc_path, extract_root, antismash_id)
+    completed_marker.touch(exist_ok=True)
+
+
 def retrieve_antismash_db_data(
     genome_status: GenomeStatus, download_root: str | PathLike, extract_root: str | PathLike
 ) -> None:

From 98812546caa77d506714d5b3013f1b06986aab8f Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 18:07:04 +0100
Subject: [PATCH 23/33] fix mistake in docstring

---
 src/nplinker/genomics/antismash/podp_antismash_downloader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
index 311c5160..5af4169a 100644
--- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
@@ -297,8 +297,8 @@ def get_genome_assembly_accession(
             information, where keys like "RefSeq_accession", "GenBank_accession",
             or "JGI_Genome_ID" are used to resolve the RefSeq ID.
 
-    Warnings:
-        Logs a warning if the RefSeq ID cannot be resolved.
+    Raises:
+        RuntimeError: If the RefSeq ID cannot be resolved.
     """
     genome_status.resolved_refseq_id = _resolve_refseq_id(genome_id_data)
     genome_status.resolve_attempted = True

From 0e1ec548fa420ff0b9f51f74545b10b2ca9a8bc0 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 18:14:32 +0100
Subject: [PATCH 24/33] fix: update return type of submit_antismash_job to str

---
 src/nplinker/genomics/antismash/antismash_api_client.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py
index 32b932c0..2b6fd7a8 100644
--- a/src/nplinker/genomics/antismash/antismash_api_client.py
+++ b/src/nplinker/genomics/antismash/antismash_api_client.py
@@ -9,7 +9,7 @@
 logger = logging.getLogger(__name__)
 
 
-def submit_antismash_job(genbank_filepath: str | PathLike) -> Optional[str]:
+def submit_antismash_job(genbank_filepath: str | PathLike) -> str:
     """Submits an antiSMASH job using the provided GenBank file.
 
     This function sends a GenBank file to the antiSMASH API
@@ -19,11 +19,11 @@ def submit_antismash_job(genbank_filepath: str | PathLike) -> Optional[str]:
         genbank_filepath (str | PathLike): The path to the GenBank file to be submitted.
 
     Returns:
-        Optional[str]: The job ID if the submission is successful, or None if it fails.
+        str: The job ID if the submission.
 
     Raises:
         requests.exceptions.RequestException: If there is an issue with the HTTP request.
-         RuntimeError: If the API response does not contain a job ID.
+        RuntimeError: If the API response does not contain a job ID.
     """
     url = "https://antismash.secondarymetabolites.org/api/v1.0/submit"
     genbank_filepath = Path(genbank_filepath)

From f53f1a7b062f73600a17bb6d9ecdc159d426306e Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 18:16:36 +0100
Subject: [PATCH 25/33] fix: update return type of
 download_and_extract_ncbi_genome to Path

---
 src/nplinker/genomics/antismash/ncbi_downloader.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py
index 3a91b367..a47c53f5 100644
--- a/src/nplinker/genomics/antismash/ncbi_downloader.py
+++ b/src/nplinker/genomics/antismash/ncbi_downloader.py
@@ -4,7 +4,6 @@
 import time
 from os import PathLike
 from pathlib import Path
-from typing import Optional
 import httpx
 import requests
 from nplinker.utils import check_md5
@@ -20,7 +19,7 @@ def download_and_extract_ncbi_genome(
     download_root: str | PathLike,
     extract_root: str | PathLike,
     max_attempts: int = 10,
-) -> Optional[Path]:
+) -> Path:
     """Downloads and extracts an NCBI dataset for a given genome RefSeq ID.
 
     This function retrieves a dataset from the NCBI database using the provided
@@ -37,7 +36,7 @@ def download_and_extract_ncbi_genome(
         max_attempts (int): The maximum number of download attempts. Defaults to 10.
 
     Returns:
-        Optional[Path]: The path to the extracted GenBank file if successful, otherwise None.
+        Path: The path to the extracted GenBank file.
 
     Raises:
         RuntimeError: If the maximum number of retries is reached and the dataset

From 5e5b2bd241f7f0287b318bb0c63aea03daa20555 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 18:29:04 +0100
Subject: [PATCH 26/33] update submit_antismash_job to return job ID as string
 and improve error handling in antismash_job_is_done

---
 .../antismash/antismash_api_client.py         | 58 +++++++------------
 1 file changed, 21 insertions(+), 37 deletions(-)

diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py
index 2b6fd7a8..39cba5b4 100644
--- a/src/nplinker/genomics/antismash/antismash_api_client.py
+++ b/src/nplinker/genomics/antismash/antismash_api_client.py
@@ -2,7 +2,6 @@
 import logging
 from os import PathLike
 from pathlib import Path
-from typing import Optional
 import requests
 
 
@@ -36,60 +35,45 @@ def submit_antismash_job(genbank_filepath: str | PathLike) -> str:
     data = response.json()
     if "id" not in data:
         raise RuntimeError("No antiSMASH job ID returned")
-    return data["id"]
-
-
-def query_antismash_job(job_id: str) -> Optional[dict]:
-    """Gets the status of an antiSMASH job.
-
-    Args:
-        job_id (str): The job ID to query.
-
-    Returns:
-        dict: The response JSON if successful, otherwise None.
-    """
-    url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}"
-
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()  # Raise an exception for HTTP errors
-        return response.json()
-
-    except requests.exceptions.RequestException as req_err:
-        logger.error(f"Request failed for job_id {job_id}: {req_err}")
-    except ValueError as json_err:  # Handles JSON decoding errors
-        logger.error(f"Invalid JSON response for job_id {job_id}: {json_err}")
-    except Exception as err:
-        logger.error(f"Unexpected error while getting job state for job_id {job_id}: {err}")
+    return str(data["id"])
 
 
 def antismash_job_is_done(job_id: str) -> bool:
-    """Checks if the antiSMASH job is complete by polling the job status.
+    """Determines if the antiSMASH job has completed by checking its status.
+
+    This function queries the antiSMASH API to retrieve the current state
+    of the job and determines whether it has finished successfully, is still
+    in progress, or has encountered an error.
 
     Args:
-        job_id (str): The job ID to query.
+        job_id (str): The unique identifier of the antiSMASH job.
 
     Returns:
-        bool: True if the job is done, False if the job is still running.
+        bool: True if the job is completed successfully, False if it is still
+            running or queued.
 
     Raises:
-        RuntimeError: If the job status could not be retrieved or if the job failed.
-        ValueError: If the job state is missing or unexpected in the response.
+        RuntimeError: If the job has failed or if the API response indicates an error.
+        ValueError: If the job state is missing or an unexpected state is encountered
+            in the API response.
+        requests.exceptions.HTTPError: If an HTTP error occurs during the API request.
     """
-    response = query_antismash_job(job_id)
+    url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}"
+
+    response = requests.get(url, timeout=10)
+    response.raise_for_status()  # Raise exception for HTTP errors
+    respose_data = response.json()
 
-    if response is None:
-        raise RuntimeError(f"Failed to retrieve job status for job_id {job_id}")
-    if "state" not in response:
+    if "state" not in respose_data:
         raise ValueError(f"Job state missing in response for job_id: {job_id}")
 
-    job_state = response["state"]
+    job_state = respose_data["state"]
     if job_state in ("running", "queued"):
         return False
     if job_state == "done":
         return True
     if job_state == "failed":
-        job_status = response.get("status", "No error message provided")
+        job_status = respose_data.get("status", "No error message provided")
         raise RuntimeError(f"AntiSMASH job {job_id} failed with an error: {job_status}")
     else:
         raise ValueError(

From 909ca639c586e8066acf43e2c49c55bcae73adda Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 18:29:18 +0100
Subject: [PATCH 27/33] chore: add types-requests to development dependencies

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 879895f0..8fd371b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,6 +58,7 @@ dev = [
     "mypy",
     "typing_extensions",
     # stub packages. Update the `format-typing-check.yml` too if you add more.
+    "types-requests",
     "types-beautifulsoup4",
     "types-jsonschema",
     "types-networkx",

From 252a177c840eb28a32c17398a44d351927cfb7cc Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 18:30:27 +0100
Subject: [PATCH 28/33] fix: update return type of
 _verify_ncbi_dataset_md5_sums to None and remove return value documentation

---
 src/nplinker/genomics/antismash/ncbi_downloader.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py
index a47c53f5..9bf8c96b 100644
--- a/src/nplinker/genomics/antismash/ncbi_downloader.py
+++ b/src/nplinker/genomics/antismash/ncbi_downloader.py
@@ -108,7 +108,7 @@ def _download_genome(genome_assembly_acc, download_root, max_attempts):
         )
 
 
-def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool:
+def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> None:
     """Verify the integrity of files in a specified directory using MD5 checksums.
 
     This function reads an "md5sum.txt" file located in the given extraction path,
@@ -120,9 +120,6 @@ def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool:
         extract_path (PathLike): Path to the directory containing the files and
             the "md5sum.txt" file.
 
-    Returns:
-        bool: True if all files pass the MD5 checksum verification.
-
     Raises:
         ValueError: If the MD5 checksum of any file does not match the expected value.
     """

From 81a01024c1b01da9007be45444a7e09a689f69d4 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 18:31:49 +0100
Subject: [PATCH 29/33] fix: update _verify_ncbi_dataset_md5_sums to accept str
 or PathLike for extract_path

---
 src/nplinker/genomics/antismash/ncbi_downloader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py
index 9bf8c96b..a39338fa 100644
--- a/src/nplinker/genomics/antismash/ncbi_downloader.py
+++ b/src/nplinker/genomics/antismash/ncbi_downloader.py
@@ -108,7 +108,7 @@ def _download_genome(genome_assembly_acc, download_root, max_attempts):
         )
 
 
-def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> None:
+def _verify_ncbi_dataset_md5_sums(extract_path: str | PathLike) -> None:
     """Verify the integrity of files in a specified directory using MD5 checksums.
 
     This function reads an "md5sum.txt" file located in the given extraction path,
@@ -123,6 +123,7 @@ def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> None:
     Raises:
         ValueError: If the MD5 checksum of any file does not match the expected value.
     """
+    extract_path = Path(extract_path)
     with open(extract_path / "md5sum.txt", "r") as f:
         for line in f:
             md5sum, file_name = line.strip().split()

From 50890a73d3e38466118ed5d030b25d0788b4cb66 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 18:33:26 +0100
Subject: [PATCH 30/33] fix: convert extract_path to Path in
 _prepare_extract_path for consistency

---
 src/nplinker/genomics/antismash/antismash_downloader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py
index ef351ff8..2c27938e 100644
--- a/src/nplinker/genomics/antismash/antismash_downloader.py
+++ b/src/nplinker/genomics/antismash/antismash_downloader.py
@@ -172,6 +172,7 @@ def _cleanup_extracted_files(extract_path: str | PathLike) -> None:
 
 
 def _prepare_extract_path(extract_path: str | PathLike) -> None:
+    extract_path = Path(extract_path)
     if extract_path.exists():
         _check_extract_path(extract_path)
     else:

From b784476bbec8223ca0535c6a644eda11ee4ae8ef Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 18:53:52 +0100
Subject: [PATCH 31/33] chore: update typing dependencies in
 format-typing-check workflow

---
 .github/workflows/format-typing-check.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/format-typing-check.yml b/.github/workflows/format-typing-check.yml
index a4128a15..c5a97d4b 100644
--- a/.github/workflows/format-typing-check.yml
+++ b/.github/workflows/format-typing-check.yml
@@ -37,8 +37,8 @@ jobs:
       - name: Install ruff and mypy
         run: |
           pip install ruff mypy typing_extensions \
-            types-Deprecated types-beautifulsoup4 types-jsonschema \
-            types-networkx types-tabulate types-PyYAML pandas-stubs
+            types-Deprecated types-beautifulsoup4 types-jsonschema types-requests \
+            types-networkx types-tabulate types-PyYAML pandas-stubs 
       - name: Get all changed python files
         id: changed-python-files
         uses: tj-actions/changed-files@v44

From bdebd9679a2f3358f6ce2fc733ca6958d0342190 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 19:05:54 +0100
Subject: [PATCH 32/33] fix: clarify return value documentation for
 submit_antismash_job function

---
 src/nplinker/genomics/antismash/antismash_api_client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py
index 39cba5b4..124aedcb 100644
--- a/src/nplinker/genomics/antismash/antismash_api_client.py
+++ b/src/nplinker/genomics/antismash/antismash_api_client.py
@@ -18,7 +18,7 @@ def submit_antismash_job(genbank_filepath: str | PathLike) -> str:
         genbank_filepath (str | PathLike): The path to the GenBank file to be submitted.
 
     Returns:
-        str: The job ID if the submission.
+        str: The job ID of the submitted antiSMASH job.
 
     Raises:
         requests.exceptions.RequestException: If there is an issue with the HTTP request.

From 7334c902813a3e5dae31b3087a0d626bd73686b9 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 19 Mar 2025 19:06:02 +0100
Subject: [PATCH 33/33] fix: enable postponed evaluation of type annotations in
 ncbi_downloader.py

---
 src/nplinker/genomics/antismash/ncbi_downloader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py
index a39338fa..143102c8 100644
--- a/src/nplinker/genomics/antismash/ncbi_downloader.py
+++ b/src/nplinker/genomics/antismash/ncbi_downloader.py
@@ -1,3 +1,4 @@
+from __future__ import annotations
 import logging
 import os
 import shutil