From 31877441c85025d5609a3279436271cfc7330a8c Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Sat, 14 Dec 2024 22:01:34 -0800
Subject: [PATCH 01/15] feat: parse and clean archive badges and markdown links
 to URL

---
 pyproject.toml              |  1 +
 src/pyosmeta/models/base.py | 20 +++++++++++++++++++-
 src/pyosmeta/utils_clean.py | 29 +++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 138cded..9ee53d5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ classifiers = [
 ]
 dependencies = [
     "pydantic>=2.0",
+    "python-doi",
     "python-dotenv",
     "requests",
     "ruamel-yaml>=0.17.21",
diff --git a/src/pyosmeta/models/base.py b/src/pyosmeta/models/base.py
index 63a4a18..2f743af 100644
--- a/src/pyosmeta/models/base.py
+++ b/src/pyosmeta/models/base.py
@@ -19,7 +19,7 @@
 )
 
 from pyosmeta.models.github import Labels
-from pyosmeta.utils_clean import clean_date, clean_markdown
+from pyosmeta.utils_clean import clean_archive, clean_date, clean_markdown
 
 
 class Partnerships(str, Enum):
@@ -403,3 +403,21 @@ def extract_label(cls, labels: list[str | Labels]) -> list[str]:
             label.name if isinstance(label, Labels) else label
             for label in labels
         ]
+
+    @field_validator(
+        "archive",
+        mode="before",
+    )
+    @classmethod
+    def clean_archive(cls, archive: str) -> str:
+        """Clean the archive value to ensure it's a valid archive URL."""
+        return clean_archive(archive)
+
+    @field_validator(
+        "joss",
+        mode="before",
+    )
+    @classmethod
+    def clean_joss(cls, joss: str) -> str:
+        """Clean the joss value to ensure it's a valid URL."""
+        return clean_archive(joss)
diff --git a/src/pyosmeta/utils_clean.py b/src/pyosmeta/utils_clean.py
index c87ea92..ed0572a 100644
--- a/src/pyosmeta/utils_clean.py
+++ b/src/pyosmeta/utils_clean.py
@@ -7,6 +7,8 @@
 from datetime import datetime
 from typing import Any
 
+import doi
+
 
 def get_clean_user(username: str) -> str:
     """Cleans a GitHub username provided in a review issue by removing any
@@ -125,3 +127,30 @@ def clean_date_accepted_key(review_dict: dict[str, Any]) -> dict[str, str]:
             review_dict["date_accepted"] = value
             break
     return review_dict
+
+
+def clean_archive(archive):
+    """Clean an archive link to ensure it is a valid URL."""
+
+    def is_doi(archive):
+        try:
+            return doi.validate_doi(archive)
+        except ValueError:
+            return False
+
+    if archive.startswith("[") and archive.endswith(")"):
+        # Extract the outermost link
+        link = archive[archive.rfind("](") + 2 : -1]
+        if not link.startswith("http"):
+            return clean_archive(link)
+        return link
+    elif archive.startswith("http"):
+        return archive
+    elif link := is_doi(archive):
+        return link
+    elif archive.lower() == "n/a":
+        return None
+    elif archive.lower() == "tbd":
+        return None
+    else:
+        raise ValueError(f"Invalid archive URL: {archive}")

From aa53640c35bdf1520946c701ccd4566b6b7adb69 Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Wed, 18 Dec 2024 23:51:14 -0800
Subject: [PATCH 02/15] Improve docs, reuse URL checker, improve test data

---
 src/pyosmeta/models/base.py                | 29 +++------
 src/pyosmeta/utils_clean.py                | 68 +++++++++++++++++++---
 tests/data/reviews/bolded_keys.txt         |  4 +-
 tests/data/reviews/partnership_astropy.txt |  4 +-
 tests/data/reviews/reviewer_keyed.txt      |  2 +-
 tests/data/reviews/reviewer_list.txt       |  2 +-
 6 files changed, 73 insertions(+), 36 deletions(-)

diff --git a/src/pyosmeta/models/base.py b/src/pyosmeta/models/base.py
index 2f743af..1742bd2 100644
--- a/src/pyosmeta/models/base.py
+++ b/src/pyosmeta/models/base.py
@@ -8,7 +8,6 @@
 from enum import Enum
 from typing import Any, Optional, Set, Union
 
-import requests
 from pydantic import (
     AliasChoices,
     BaseModel,
@@ -19,7 +18,12 @@
 )
 
 from pyosmeta.models.github import Labels
-from pyosmeta.utils_clean import clean_archive, clean_date, clean_markdown
+from pyosmeta.utils_clean import (
+    check_url,
+    clean_archive,
+    clean_date,
+    clean_markdown,
+)
 
 
 class Partnerships(str, Enum):
@@ -59,29 +63,12 @@ def format_url(cls, url: str) -> str:
             elif not url.startswith("http"):
                 print("Oops, missing http")
                 url = "https://" + url
-        if cls._check_url(url=url):
+        if check_url(url=url):
             return url
         else:
+            print(f"Oops, url `{url}` is not valid, removing it")
             return None
 
-    @staticmethod
-    def _check_url(url: str) -> bool:
-        """Test url. Return true if there's a valid response, False if not
-
-        Parameters
-        ----------
-        url : str
-            String for a url to a website to test.
-
-        """
-
-        try:
-            response = requests.get(url, timeout=6)
-            return response.status_code == 200
-        except Exception:
-            print("Oops, url", url, "is not valid, removing it")
-            return False
-
 
 class PersonModel(BaseModel, UrlValidatorMixin):
     model_config = ConfigDict(
diff --git a/src/pyosmeta/utils_clean.py b/src/pyosmeta/utils_clean.py
index ed0572a..69c9264 100644
--- a/src/pyosmeta/utils_clean.py
+++ b/src/pyosmeta/utils_clean.py
@@ -8,6 +8,7 @@
 from typing import Any
 
 import doi
+import requests
 
 
 def get_clean_user(username: str) -> str:
@@ -129,22 +130,71 @@ def clean_date_accepted_key(review_dict: dict[str, Any]) -> dict[str, str]:
     return review_dict
 
 
+def check_url(url: str) -> bool:
+    """Test url. Return true if there's a valid response, False if not
+
+    Parameters
+    ----------
+    url : str
+        String for a url to a website to test.
+
+    """
+
+    try:
+        response = requests.get(url, timeout=6)
+        return response.status_code == 200
+    except Exception:
+        return False
+
+
+def is_doi(archive) -> str | None:
+    """Check if the DOI is valid and return the DOI link.
+
+    Parameters
+    ----------
+    archive : str
+        The DOI string to validate, e.g., `10.1234/zenodo.12345678`
+
+    Returns
+    -------
+    str | None
+        The DOI link in the form `https://doi.org/10.1234/zenodo.12345678` or `None`
+        if the DOI is invalid.
+    """
+    try:
+        return doi.validate_doi(archive)
+    except ValueError:
+        pass
+
+
 def clean_archive(archive):
-    """Clean an archive link to ensure it is a valid URL."""
+    """Clean an archive link to ensure it is a valid URL.
 
-    def is_doi(archive):
-        try:
-            return doi.validate_doi(archive)
-        except ValueError:
-            return False
+    This utility will attempt to parse the DOI link from the various formats
+    that are commonly present in review metadata. This utility will handle:
+
+    * Markdown links in the format `[label](URL)`, e.g., `[my archive](https://doi.org/10.1234/zenodo.12345678)`
+    * Raw text in the format `DOI` e.g., `10.1234/zenodo.12345678`
+    * URLs in the format `http(s)://...` e.g., `https://doi.org/10.1234/zenodo.12345678`
+    * The special cases `n/a` and `tbd` which will be returned as `None` in anticipation of future data
+
+    If the archive link is a URL, it will be returned as is with a check that
+    it resolves but is not required to be a valid DOI. If the archive link is
+    a DOI, it will be validated and returned as a URL in the form
+    `https://doi.org/10.1234/zenodo.12345678` using the `python-doi` package.
 
+    """
     if archive.startswith("[") and archive.endswith(")"):
         # Extract the outermost link
         link = archive[archive.rfind("](") + 2 : -1]
-        if not link.startswith("http"):
-            return clean_archive(link)
-        return link
+        # recursively clean the archive link
+        return clean_archive(link)
     elif archive.startswith("http"):
+        if archive.startswith("http://"):
+            archive = archive.replace("http://", "https://")
+        # Validate that the URL resolves
+        if not check_url(archive):
+            raise ValueError(f"Invalid archive URL: {archive}")
         return archive
     elif link := is_doi(archive):
         return link
diff --git a/tests/data/reviews/bolded_keys.txt b/tests/data/reviews/bolded_keys.txt
index 7275920..a3ddd3a 100644
--- a/tests/data/reviews/bolded_keys.txt
+++ b/tests/data/reviews/bolded_keys.txt
@@ -9,8 +9,8 @@
 **Reviewer 1:** @fakereviewer1
 **Reviewer 2:** @fakereviewer2
 **Reviews Expected By:** fake date
-**Archive:** [![DOI](https://example.com/fakearchive)](https://example.com/fakearchive)
-**Version accepted:** 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
+**Archive:** [![DOI](https://example.com/fakearchive)](https://zenodo.org/records/8415866)
+**Version accepted:** 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://zenodo.org/records/8415866))
 **Date accepted (month/day/year):** 06/29/2024
 
 ---
diff --git a/tests/data/reviews/partnership_astropy.txt b/tests/data/reviews/partnership_astropy.txt
index 3c9a475..7c86395 100644
--- a/tests/data/reviews/partnership_astropy.txt
+++ b/tests/data/reviews/partnership_astropy.txt
@@ -7,8 +7,8 @@ Version submitted:  v.0.8.5
 Editor: @editoruser
 Reviewer 1: @reviewer1
 Reviewer 2: @reviewer2
-Archive: [![DOI](https://zenodo.org/badge/DOI/fakedoi/doi.svg)](https://doi.org/fakedoi/doi.svg)
-JOSS DOI: [![DOI](https://joss.theoj.org/papers/fakedoi.svg)](https://joss.theoj.org/papers/fakedoi)
+Archive: [![DOI](https://zenodo.org/badge/DOI/fakedoi/doi.svg)](https://zenodo.org/records/8415866)
+JOSS DOI: [![DOI](https://joss.theoj.org/papers/fakedoi.svg)](https://doi.org/10.21105/joss.01450)
 Version accepted: v.0.9.2
 Date accepted (month/day/year): 04/21/2024
 
diff --git a/tests/data/reviews/reviewer_keyed.txt b/tests/data/reviews/reviewer_keyed.txt
index 3a3560a..b5437f8 100644
--- a/tests/data/reviews/reviewer_keyed.txt
+++ b/tests/data/reviews/reviewer_keyed.txt
@@ -9,7 +9,7 @@ Editor: @fakeeditor
 Reviewer 1: @fakereviewer1
 Reviewer 2: @fakereviewer2
 Reviews Expected By: fake date
-Archive: [![DOI](https://example.com/fakearchive)](https://example.com/fakearchive)
+Archive: [![DOI](https://example.com/fakearchive)](https://zenodo.org/records/8415866)
 Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
 Date accepted (month/day/year): 06/29/2024
 
diff --git a/tests/data/reviews/reviewer_list.txt b/tests/data/reviews/reviewer_list.txt
index 3e9f9e4..3014b14 100644
--- a/tests/data/reviews/reviewer_list.txt
+++ b/tests/data/reviews/reviewer_list.txt
@@ -8,7 +8,7 @@ EiC: @fakeeic
 Editor: @fakeeditor
 Reviewers: @fakereviewer1 , @fakereviewer2, @fakereviewer3
 Reviews Expected By: fake date
-Archive: [![DOI](https://example.com/fakearchive)](https://example.com/fakearchive)
+Archive: [![DOI](https://example.com/fakearchive)](https://zenodo.org/records/8415866)
 Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
 Date accepted (month/day/year): 06/29/2024
 

From dc3d6274a974062c206f3e47edffb58d331b1931 Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Thu, 19 Dec 2024 00:00:40 -0800
Subject: [PATCH 03/15] coverage

---
 src/pyosmeta/models/base.py | 2 +-
 src/pyosmeta/utils_clean.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pyosmeta/models/base.py b/src/pyosmeta/models/base.py
index 1742bd2..4933611 100644
--- a/src/pyosmeta/models/base.py
+++ b/src/pyosmeta/models/base.py
@@ -65,7 +65,7 @@ def format_url(cls, url: str) -> str:
                 url = "https://" + url
         if check_url(url=url):
             return url
-        else:
+        else:  # pragma: no cover
             print(f"Oops, url `{url}` is not valid, removing it")
             return None
 
diff --git a/src/pyosmeta/utils_clean.py b/src/pyosmeta/utils_clean.py
index 69c9264..5d6371c 100644
--- a/src/pyosmeta/utils_clean.py
+++ b/src/pyosmeta/utils_clean.py
@@ -143,7 +143,7 @@ def check_url(url: str) -> bool:
     try:
         response = requests.get(url, timeout=6)
         return response.status_code == 200
-    except Exception:
+    except Exception:  # pragma: no cover
         return False
 
 

From 09c276b4278620a65251cc80b9211b678d581727 Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Thu, 19 Dec 2024 00:00:48 -0800
Subject: [PATCH 04/15] Test with JOSS DOI

---
 tests/data/reviews/reviewer_list.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/data/reviews/reviewer_list.txt b/tests/data/reviews/reviewer_list.txt
index 3014b14..17df318 100644
--- a/tests/data/reviews/reviewer_list.txt
+++ b/tests/data/reviews/reviewer_list.txt
@@ -9,6 +9,7 @@ Editor: @fakeeditor
 Reviewers: @fakereviewer1 , @fakereviewer2, @fakereviewer3
 Reviews Expected By: fake date
 Archive: [![DOI](https://example.com/fakearchive)](https://zenodo.org/records/8415866)
+JOSS DOI: [![DOI](https://example.com/fakearchive)](https://doi.org/10.21105/joss.01450)
 Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
 Date accepted (month/day/year): 06/29/2024
 

From 053f12da002e9089fc9162bd02ce9304370f40f3 Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Thu, 19 Dec 2024 00:02:13 -0800
Subject: [PATCH 05/15] cleanup

---
 src/pyosmeta/utils_clean.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pyosmeta/utils_clean.py b/src/pyosmeta/utils_clean.py
index 5d6371c..da18331 100644
--- a/src/pyosmeta/utils_clean.py
+++ b/src/pyosmeta/utils_clean.py
@@ -189,6 +189,9 @@ def clean_archive(archive):
         link = archive[archive.rfind("](") + 2 : -1]
         # recursively clean the archive link
         return clean_archive(link)
+    elif link := is_doi(archive):
+        # is_doi returns the DOI link if it is valid
+        return link
     elif archive.startswith("http"):
         if archive.startswith("http://"):
             archive = archive.replace("http://", "https://")
@@ -196,8 +199,6 @@ def clean_archive(archive):
         if not check_url(archive):
             raise ValueError(f"Invalid archive URL: {archive}")
         return archive
-    elif link := is_doi(archive):
-        return link
     elif archive.lower() == "n/a":
         return None
     elif archive.lower() == "tbd":

From 7dce691f8bc2d69003aadcb0d91a8124d0b0027b Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Thu, 19 Dec 2024 00:03:57 -0800
Subject: [PATCH 06/15] Add changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cd26941..1df2c1c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ Notes: it looks like i may have mistakenly bumped to 1.3.7 in august. rather tha
 * Fix: Eix field not processing correctly  (@lwasser, #234)
 * Fix: Updated documentation throughout with a focus on how a user's name is accessed and updated (@lwasser)
 * Fix: ReviewUser object name can be optional. There are times when we don't have the actual person's name only the GH username (@lwasser)
+* Fix: Parse archive and JOSS links to handle markdown links and validate DOI links are valid
 
 ## [v1.3.7] - 2024-08-27
 

From 9e051ccd51306ffe85e0832f9ab6d9f44636d9ca Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Thu, 19 Dec 2024 00:04:21 -0800
Subject: [PATCH 07/15] attribution in changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1df2c1c..8dc20f6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,7 +17,7 @@ Notes: it looks like i may have mistakenly bumped to 1.3.7 in august. rather tha
 * Fix: Eix field not processing correctly  (@lwasser, #234)
 * Fix: Updated documentation throughout with a focus on how a user's name is accessed and updated (@lwasser)
 * Fix: ReviewUser object name can be optional. There are times when we don't have the actual person's name only the GH username (@lwasser)
-* Fix: Parse archive and JOSS links to handle markdown links and validate DOI links are valid
+* Fix: Parse archive and JOSS links to handle markdown links and validate DOI links are valid (@banesullivan)
 
 ## [v1.3.7] - 2024-08-27
 

From cea651cfdece2cea52002e0c5ab5e6544163f291 Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Thu, 19 Dec 2024 00:24:52 -0800
Subject: [PATCH 08/15] Better testing and normalize the JOSS field

---
 src/pyosmeta/parse_issues.py            |  3 +++
 tests/data/reviews/archives_doi.txt     | 28 +++++++++++++++++++++++++
 tests/data/reviews/unknown_archives.txt | 28 +++++++++++++++++++++++++
 tests/integration/test_parse_issues.py  | 17 +++++++++++++++
 4 files changed, 76 insertions(+)
 create mode 100644 tests/data/reviews/archives_doi.txt
 create mode 100644 tests/data/reviews/unknown_archives.txt

diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py
index 5e1696e..411fe6b 100644
--- a/src/pyosmeta/parse_issues.py
+++ b/src/pyosmeta/parse_issues.py
@@ -218,6 +218,9 @@ def _postprocess_meta(self, meta: dict, body: List[str]) -> dict:
         meta["partners"] = self.get_categories(
             body, "## Community Partnerships", 3, keyed=True
         )
+        if "joss_doi" in meta:
+            # Normalize the JOSS archive field. Some issues use `JOSS DOI` others `JOSS`
+            meta["joss"] = meta.pop("joss_doi")
 
         return meta
 
diff --git a/tests/data/reviews/archives_doi.txt b/tests/data/reviews/archives_doi.txt
new file mode 100644
index 0000000..3356a90
--- /dev/null
+++ b/tests/data/reviews/archives_doi.txt
@@ -0,0 +1,28 @@
+Submitting Author: Fakename (@fakeauthor)
+All current maintainers: (@fakeauthor1, @fakeauthor2)
+Package Name: fake_package
+One-Line Description of Package: A fake python package
+Repository Link: https://example.com/fakeauthor1/fake_package
+Version submitted: v1.0.0
+EiC: @fakeeic
+Editor: @fakeeditor
+Reviewer 1: @fakereviewer1
+Reviewer 2: @fakereviewer2
+Reviews Expected By: fake date
+Archive: 10.5281/zenodo.8415866
+JOSS DOI: 10.21105/joss.01450
+Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
+Date accepted (month/day/year): 06/29/2024
+
+---
+
+## Scope
+
+- [x] I agree to abide by [pyOpenSci's Code of Conduct][PyOpenSciCodeOfConduct] during the review process and in maintaining my package after should it be accepted.
+- [x] I have read and will commit to package maintenance after the review as per the [pyOpenSci Policies Guidelines][Commitment].
+(etc)
+
+## Community Partnerships
+
+- [ ] etc
+- [ ] aaaaaa
diff --git a/tests/data/reviews/unknown_archives.txt b/tests/data/reviews/unknown_archives.txt
new file mode 100644
index 0000000..739a558
--- /dev/null
+++ b/tests/data/reviews/unknown_archives.txt
@@ -0,0 +1,28 @@
+Submitting Author: Fakename (@fakeauthor)
+All current maintainers: (@fakeauthor1, @fakeauthor2)
+Package Name: fake_package
+One-Line Description of Package: A fake python package
+Repository Link: https://example.com/fakeauthor1/fake_package
+Version submitted: v1.0.0
+EiC: @fakeeic
+Editor: @fakeeditor
+Reviewer 1: @fakereviewer1
+Reviewer 2: @fakereviewer2
+Reviews Expected By: fake date
+Archive: TBD
+JOSS DOI: N/A
+Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
+Date accepted (month/day/year): 06/29/2024
+
+---
+
+## Scope
+
+- [x] I agree to abide by [pyOpenSci's Code of Conduct][PyOpenSciCodeOfConduct] during the review process and in maintaining my package after should it be accepted.
+- [x] I have read and will commit to package maintenance after the review as per the [pyOpenSci Policies Guidelines][Commitment].
+(etc)
+
+## Community Partnerships
+
+- [ ] etc
+- [ ] aaaaaa
diff --git a/tests/integration/test_parse_issues.py b/tests/integration/test_parse_issues.py
index 237ebfe..1373c34 100644
--- a/tests/integration/test_parse_issues.py
+++ b/tests/integration/test_parse_issues.py
@@ -51,6 +51,23 @@ def test_parse_bolded_keys(process_issues, data_file):
     assert review.package_name == "fake_package"
 
 
+def test_parse_doi_archives(process_issues, data_file):
+    """
+    Test handling of DOI archives in various formats.
+
+    This is a smoke test to ensure graceful handling of these cases.
+    """
+    review = data_file("reviews/archives_doi.txt", True)
+    review = process_issues.parse_issue(review)
+    assert review.archive == "https://zenodo.org/record/8415866"
+    assert review.joss == "http://joss.theoj.org/papers/10.21105/joss.01450"
+
+    review = data_file("reviews/unknown_archives.txt", True)
+    review = process_issues.parse_issue(review)
+    assert review.archive is None
+    assert review.joss is None
+
+
 def test_parse_labels(issue_list, process_issues):
     """
     `Label` models should be coerced to a string when parsing an issue

From 2b9db98d53f186adb91538ef1886ddd88b47c987 Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Thu, 19 Dec 2024 00:28:12 -0800
Subject: [PATCH 09/15] Correct changelog

---
 CHANGELOG.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8dc20f6..80f986d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 ## [Unreleased]
 
+* Fix: Parse archive and JOSS links to handle markdown links and validate DOI links are valid (@banesullivan)
+
 [v1.4] - 2024-11-22
 
 Notes: it looks like i may have mistakenly bumped to 1.3.7 in august. rather than try to fix on pypi we will just go with it to ensure our release cycles are smooth given no one else uses this package except pyopensci.
@@ -17,7 +19,6 @@ Notes: it looks like i may have mistakenly bumped to 1.3.7 in august. rather tha
 * Fix: Eix field not processing correctly  (@lwasser, #234)
 * Fix: Updated documentation throughout with a focus on how a user's name is accessed and updated (@lwasser)
 * Fix: ReviewUser object name can be optional. There are times when we don't have the actual person's name only the GH username (@lwasser)
-* Fix: Parse archive and JOSS links to handle markdown links and validate DOI links are valid (@banesullivan)
 
 ## [v1.3.7] - 2024-08-27
 

From 03376a99c4c77974ffa1fa02c3c24e24c14bdee4 Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Sun, 12 Jan 2025 21:40:44 -0800
Subject: [PATCH 10/15] Apply suggestions from code review

Co-authored-by: Leah Wasser <leah@pyopensci.org>
---
 CHANGELOG.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 80f986d..4e7c1db 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,9 +4,9 @@
 
 ## [Unreleased]
 
-* Fix: Parse archive and JOSS links to handle markdown links and validate DOI links are valid (@banesullivan)
+* Fix: Parse archive and JOSS links to handle markdown links and validate DOI links are valid. Added python-doi as a dependency (@banesullivan)
 
-[v1.4] - 2024-11-22
+## [v1.4] - 2024-11-22
 
 Notes: it looks like i may have mistakenly bumped to 1.3.7 in august. rather than try to fix on pypi we will just go with it to ensure our release cycles are smooth given no one else uses this package except pyopensci.
 

From b6b9f275830c848e2ad3b886b0eb643385cde5ad Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Sun, 12 Jan 2025 21:59:26 -0800
Subject: [PATCH 11/15] Handle missing values

---
 src/pyosmeta/utils_clean.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/pyosmeta/utils_clean.py b/src/pyosmeta/utils_clean.py
index da18331..a0b789e 100644
--- a/src/pyosmeta/utils_clean.py
+++ b/src/pyosmeta/utils_clean.py
@@ -168,7 +168,7 @@ def is_doi(archive) -> str | None:
 
 
 def clean_archive(archive):
-    """Clean an archive link to ensure it is a valid URL.
+    """Clean an archive link to ensure it is a valid DOI URL.
 
     This utility will attempt to parse the DOI link from the various formats
     that are commonly present in review metadata. This utility will handle:
@@ -184,6 +184,10 @@ def clean_archive(archive):
     `https://doi.org/10.1234/zenodo.12345678` using the `python-doi` package.
 
     """
+    archive = archive.strip()  # Remove leading/trailing whitespace
+    if not archive:
+        # If field is empty, return None
+        return None
     if archive.startswith("[") and archive.endswith(")"):
         # Extract the outermost link
         link = archive[archive.rfind("](") + 2 : -1]
@@ -197,7 +201,7 @@ def clean_archive(archive):
             archive = archive.replace("http://", "https://")
         # Validate that the URL resolves
         if not check_url(archive):
-            raise ValueError(f"Invalid archive URL: {archive}")
+            raise ValueError(f"Invalid archive URL (not resolving): {archive}")
         return archive
     elif archive.lower() == "n/a":
         return None

From b25ebbcf177161e0ef22bf025a3184a5358458c7 Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Sun, 12 Jan 2025 22:06:03 -0800
Subject: [PATCH 12/15] Add print statement to indicate progress

---
 src/pyosmeta/parse_issues.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py
index 411fe6b..220bce6 100644
--- a/src/pyosmeta/parse_issues.py
+++ b/src/pyosmeta/parse_issues.py
@@ -306,6 +306,7 @@ def parse_issues(
         reviews = {}
         errors = {}
         for issue in issues:
+            print(f"Processing review {issue.title}")
             try:
                 review = self.parse_issue(issue)
                 reviews[review.package_name] = review

From e1246e078590ba140dd5711cafe5f3f0ece06b76 Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Sun, 12 Jan 2025 22:07:48 -0800
Subject: [PATCH 13/15] Update changelog with note on python-doi

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4e7c1db..9e308af 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,7 +4,7 @@
 
 ## [Unreleased]
 
-* Fix: Parse archive and JOSS links to handle markdown links and validate DOI links are valid. Added python-doi as a dependency (@banesullivan)
+* Fix: Parse archive and JOSS links to handle markdown links and validate DOI links are valid. Added python-doi as a dependency to ensure archive/DOI URLs fully resolve (@banesullivan)
 
 ## [v1.4] - 2024-11-22
 

From a4e9477563659767563a6be951b7bb7e97bc5a6d Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Sun, 12 Jan 2025 22:22:15 -0800
Subject: [PATCH 14/15] Improve testing

---
 tests/data/reviews/archives_invalid.txt       | 28 +++++++++++++++++++
 tests/data/reviews/archives_missing.txt       | 28 +++++++++++++++++++
 ...nown_archives.txt => archives_unknown.txt} |  0
 tests/integration/test_parse_issues.py        | 12 +++++++-
 4 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 tests/data/reviews/archives_invalid.txt
 create mode 100644 tests/data/reviews/archives_missing.txt
 rename tests/data/reviews/{unknown_archives.txt => archives_unknown.txt} (100%)

diff --git a/tests/data/reviews/archives_invalid.txt b/tests/data/reviews/archives_invalid.txt
new file mode 100644
index 0000000..7617b34
--- /dev/null
+++ b/tests/data/reviews/archives_invalid.txt
@@ -0,0 +1,28 @@
+Submitting Author: Fakename (@fakeauthor)
+All current maintainers: (@fakeauthor1, @fakeauthor2)
+Package Name: fake_package
+One-Line Description of Package: A fake python package
+Repository Link: https://example.com/fakeauthor1/fake_package
+Version submitted: v1.0.0
+EiC: @fakeeic
+Editor: @fakeeditor
+Reviewer 1: @fakereviewer1
+Reviewer 2: @fakereviewer2
+Reviews Expected By: fake date
+Archive: 10.1234/zenodo.12345678
+JOSS DOI: 10.21105/joss.00000
+Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
+Date accepted (month/day/year): 06/29/2024
+
+---
+
+## Scope
+
+- [x] I agree to abide by [pyOpenSci's Code of Conduct][PyOpenSciCodeOfConduct] during the review process and in maintaining my package after should it be accepted.
+- [x] I have read and will commit to package maintenance after the review as per the [pyOpenSci Policies Guidelines][Commitment].
+(etc)
+
+## Community Partnerships
+
+- [ ] etc
+- [ ] aaaaaa
diff --git a/tests/data/reviews/archives_missing.txt b/tests/data/reviews/archives_missing.txt
new file mode 100644
index 0000000..4f30063
--- /dev/null
+++ b/tests/data/reviews/archives_missing.txt
@@ -0,0 +1,28 @@
+Submitting Author: Fakename (@fakeauthor)
+All current maintainers: (@fakeauthor1, @fakeauthor2)
+Package Name: fake_package
+One-Line Description of Package: A fake python package
+Repository Link: https://example.com/fakeauthor1/fake_package
+Version submitted: v1.0.0
+EiC: @fakeeic
+Editor: @fakeeditor
+Reviewer 1: @fakereviewer1
+Reviewer 2: @fakereviewer2
+Reviews Expected By: fake date
+Archive:
+JOSS DOI:
+Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
+Date accepted (month/day/year): 06/29/2024
+
+---
+
+## Scope
+
+- [x] I agree to abide by [pyOpenSci's Code of Conduct][PyOpenSciCodeOfConduct] during the review process and in maintaining my package after should it be accepted.
+- [x] I have read and will commit to package maintenance after the review as per the [pyOpenSci Policies Guidelines][Commitment].
+(etc)
+
+## Community Partnerships
+
+- [ ] etc
+- [ ] aaaaaa
diff --git a/tests/data/reviews/unknown_archives.txt b/tests/data/reviews/archives_unknown.txt
similarity index 100%
rename from tests/data/reviews/unknown_archives.txt
rename to tests/data/reviews/archives_unknown.txt
diff --git a/tests/integration/test_parse_issues.py b/tests/integration/test_parse_issues.py
index 1373c34..41d4301 100644
--- a/tests/integration/test_parse_issues.py
+++ b/tests/integration/test_parse_issues.py
@@ -62,11 +62,21 @@ def test_parse_doi_archives(process_issues, data_file):
     assert review.archive == "https://zenodo.org/record/8415866"
     assert review.joss == "http://joss.theoj.org/papers/10.21105/joss.01450"
 
-    review = data_file("reviews/unknown_archives.txt", True)
+    review = data_file("reviews/archives_unknown.txt", True)
     review = process_issues.parse_issue(review)
     assert review.archive is None
     assert review.joss is None
 
+    review = data_file("reviews/archives_missing.txt", True)
+    review = process_issues.parse_issue(review)
+    assert review.archive is None
+    assert review.joss is None
+
+    review = data_file("reviews/archives_invalid.txt", True)
+
+    with pytest.raises(ValueError):
+        review = process_issues.parse_issue(review)
+
 
 def test_parse_labels(issue_list, process_issues):
     """

From 4a56e10a0648ac0738f89966653227a54b607fa9 Mon Sep 17 00:00:00 2001
From: Bane Sullivan <banesullivan@gmail.com>
Date: Sun, 12 Jan 2025 22:24:12 -0800
Subject: [PATCH 15/15] Use `match=` in validation testing

---
 tests/integration/test_parse_issues.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/test_parse_issues.py b/tests/integration/test_parse_issues.py
index 41d4301..8fa2586 100644
--- a/tests/integration/test_parse_issues.py
+++ b/tests/integration/test_parse_issues.py
@@ -74,7 +74,7 @@ def test_parse_doi_archives(process_issues, data_file):
 
     review = data_file("reviews/archives_invalid.txt", True)
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="Invalid archive"):
         review = process_issues.parse_issue(review)