diff --git a/CHANGELOG.md b/CHANGELOG.md index cd26941..9e308af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,9 @@ ## [Unreleased] -[v1.4] - 2024-11-22 +* Fix: Parse archive and JOSS links to handle markdown links and validate DOI links are valid. Added python-doi as a dependency to ensure archive/DOI URLs fully resolve (@banesullivan) + +## [v1.4] - 2024-11-22 Notes: it looks like i may have mistakenly bumped to 1.3.7 in august. rather than try to fix on pypi we will just go with it to ensure our release cycles are smooth given no one else uses this package except pyopensci. diff --git a/pyproject.toml b/pyproject.toml index 138cded..9ee53d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ classifiers = [ ] dependencies = [ "pydantic>=2.0", + "python-doi", "python-dotenv", "requests", "ruamel-yaml>=0.17.21", diff --git a/src/pyosmeta/models/base.py b/src/pyosmeta/models/base.py index 63a4a18..4933611 100644 --- a/src/pyosmeta/models/base.py +++ b/src/pyosmeta/models/base.py @@ -8,7 +8,6 @@ from enum import Enum from typing import Any, Optional, Set, Union -import requests from pydantic import ( AliasChoices, BaseModel, @@ -19,7 +18,12 @@ ) from pyosmeta.models.github import Labels -from pyosmeta.utils_clean import clean_date, clean_markdown +from pyosmeta.utils_clean import ( + check_url, + clean_archive, + clean_date, + clean_markdown, +) class Partnerships(str, Enum): @@ -59,29 +63,12 @@ def format_url(cls, url: str) -> str: elif not url.startswith("http"): print("Oops, missing http") url = "https://" + url - if cls._check_url(url=url): + if check_url(url=url): return url - else: + else: # pragma: no cover + print(f"Oops, url `{url}` is not valid, removing it") return None - @staticmethod - def _check_url(url: str) -> bool: - """Test url. Return true if there's a valid response, False if not - - Parameters - ---------- - url : str - String for a url to a website to test. - - """ - - try: - response = requests.get(url, timeout=6) - return response.status_code == 200 - except Exception: - print("Oops, url", url, "is not valid, removing it") - return False - class PersonModel(BaseModel, UrlValidatorMixin): model_config = ConfigDict( @@ -403,3 +390,21 @@ def extract_label(cls, labels: list[str | Labels]) -> list[str]: label.name if isinstance(label, Labels) else label for label in labels ] + + @field_validator( + "archive", + mode="before", + ) + @classmethod + def clean_archive(cls, archive: str) -> str: + """Clean the archive value to ensure it's a valid archive URL.""" + return clean_archive(archive) + + @field_validator( + "joss", + mode="before", + ) + @classmethod + def clean_joss(cls, joss: str) -> str: + """Clean the joss value to ensure it's a valid URL.""" + return clean_archive(joss) diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py index 5e1696e..220bce6 100644 --- a/src/pyosmeta/parse_issues.py +++ b/src/pyosmeta/parse_issues.py @@ -218,6 +218,9 @@ def _postprocess_meta(self, meta: dict, body: List[str]) -> dict: meta["partners"] = self.get_categories( body, "## Community Partnerships", 3, keyed=True ) + if "joss_doi" in meta: + # Normalize the JOSS archive field. Some issues use `JOSS DOI` others `JOSS` + meta["joss"] = meta.pop("joss_doi") return meta @@ -303,6 +306,7 @@ def parse_issues( reviews = {} errors = {} for issue in issues: + print(f"Processing review {issue.title}") try: review = self.parse_issue(issue) reviews[review.package_name] = review diff --git a/src/pyosmeta/utils_clean.py b/src/pyosmeta/utils_clean.py index c87ea92..a0b789e 100644 --- a/src/pyosmeta/utils_clean.py +++ b/src/pyosmeta/utils_clean.py @@ -7,6 +7,9 @@ from datetime import datetime from typing import Any +import doi +import requests + def get_clean_user(username: str) -> str: """Cleans a GitHub username provided in a review issue by removing any @@ -125,3 +128,84 @@ def clean_date_accepted_key(review_dict: dict[str, Any]) -> dict[str, str]: review_dict["date_accepted"] = value break return review_dict + + +def check_url(url: str) -> bool: + """Test url. Return true if there's a valid response, False if not + + Parameters + ---------- + url : str + String for a url to a website to test. + + """ + + try: + response = requests.get(url, timeout=6) + return response.status_code == 200 + except Exception: # pragma: no cover + return False + + +def is_doi(archive) -> str | None: + """Check if the DOI is valid and return the DOI link. + + Parameters + ---------- + archive : str + The DOI string to validate, e.g., `10.1234/zenodo.12345678` + + Returns + ------- + str | None + The DOI link in the form `https://doi.org/10.1234/zenodo.12345678` or `None` + if the DOI is invalid. + """ + try: + return doi.validate_doi(archive) + except ValueError: + pass + + +def clean_archive(archive): + """Clean an archive link to ensure it is a valid DOI URL. + + This utility will attempt to parse the DOI link from the various formats + that are commonly present in review metadata. This utility will handle: + + * Markdown links in the format `[label](URL)`, e.g., `[my archive](https://doi.org/10.1234/zenodo.12345678)` + * Raw text in the format `DOI` e.g., `10.1234/zenodo.12345678` + * URLs in the format `http(s)://...` e.g., `https://doi.org/10.1234/zenodo.12345678` + * The special cases `n/a` and `tbd` which will be returned as `None` in anticipation of future data + + If the archive link is a URL, it will be returned as is with a check that + it resolves but is not required to be a valid DOI. If the archive link is + a DOI, it will be validated and returned as a URL in the form + `https://doi.org/10.1234/zenodo.12345678` using the `python-doi` package. + + """ + archive = archive.strip() # Remove leading/trailing whitespace + if not archive: + # If field is empty, return None + return None + if archive.startswith("[") and archive.endswith(")"): + # Extract the outermost link + link = archive[archive.rfind("](") + 2 : -1] + # recursively clean the archive link + return clean_archive(link) + elif link := is_doi(archive): + # is_doi returns the DOI link if it is valid + return link + elif archive.startswith("http"): + if archive.startswith("http://"): + archive = archive.replace("http://", "https://") + # Validate that the URL resolves + if not check_url(archive): + raise ValueError(f"Invalid archive URL (not resolving): {archive}") + return archive + elif archive.lower() == "n/a": + return None + elif archive.lower() == "tbd": + return None + else: + raise ValueError(f"Invalid archive URL: {archive}") diff --git a/tests/data/reviews/archives_doi.txt b/tests/data/reviews/archives_doi.txt new file mode 100644 index 0000000..3356a90 --- /dev/null +++ b/tests/data/reviews/archives_doi.txt @@ -0,0 +1,28 @@ +Submitting Author: Fakename (@fakeauthor) +All current maintainers: (@fakeauthor1, @fakeauthor2) +Package Name: fake_package +One-Line Description of Package: A fake python package +Repository Link: https://example.com/fakeauthor1/fake_package +Version submitted: v1.0.0 +EiC: @fakeeic +Editor: @fakeeditor +Reviewer 1: @fakereviewer1 +Reviewer 2: @fakereviewer2 +Reviews Expected By: fake date +Archive: 10.5281/zenodo.8415866 +JOSS DOI: 10.21105/joss.01450 +Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive)) +Date accepted (month/day/year): 06/29/2024 + +--- + +## Scope + +- [x] I agree to abide by [pyOpenSci's Code of Conduct][PyOpenSciCodeOfConduct] during the review process and in maintaining my package after should it be accepted. +- [x] I have read and will commit to package maintenance after the review as per the [pyOpenSci Policies Guidelines][Commitment]. +(etc) + +## Community Partnerships + +- [ ] etc +- [ ] aaaaaa diff --git a/tests/data/reviews/archives_invalid.txt b/tests/data/reviews/archives_invalid.txt new file mode 100644 index 0000000..7617b34 --- /dev/null +++ b/tests/data/reviews/archives_invalid.txt @@ -0,0 +1,28 @@ +Submitting Author: Fakename (@fakeauthor) +All current maintainers: (@fakeauthor1, @fakeauthor2) +Package Name: fake_package +One-Line Description of Package: A fake python package +Repository Link: https://example.com/fakeauthor1/fake_package +Version submitted: v1.0.0 +EiC: @fakeeic +Editor: @fakeeditor +Reviewer 1: @fakereviewer1 +Reviewer 2: @fakereviewer2 +Reviews Expected By: fake date +Archive: 10.1234/zenodo.12345678 +JOSS DOI: 10.21105/joss.00000 +Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive)) +Date accepted (month/day/year): 06/29/2024 + +--- + +## Scope + +- [x] I agree to abide by [pyOpenSci's Code of Conduct][PyOpenSciCodeOfConduct] during the review process and in maintaining my package after should it be accepted. +- [x] I have read and will commit to package maintenance after the review as per the [pyOpenSci Policies Guidelines][Commitment]. +(etc) + +## Community Partnerships + +- [ ] etc +- [ ] aaaaaa diff --git a/tests/data/reviews/archives_missing.txt b/tests/data/reviews/archives_missing.txt new file mode 100644 index 0000000..4f30063 --- /dev/null +++ b/tests/data/reviews/archives_missing.txt @@ -0,0 +1,28 @@ +Submitting Author: Fakename (@fakeauthor) +All current maintainers: (@fakeauthor1, @fakeauthor2) +Package Name: fake_package +One-Line Description of Package: A fake python package +Repository Link: https://example.com/fakeauthor1/fake_package +Version submitted: v1.0.0 +EiC: @fakeeic +Editor: @fakeeditor +Reviewer 1: @fakereviewer1 +Reviewer 2: @fakereviewer2 +Reviews Expected By: fake date +Archive: +JOSS DOI: +Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive)) +Date accepted (month/day/year): 06/29/2024 + +--- + +## Scope + +- [x] I agree to abide by [pyOpenSci's Code of Conduct][PyOpenSciCodeOfConduct] during the review process and in maintaining my package after should it be accepted. +- [x] I have read and will commit to package maintenance after the review as per the [pyOpenSci Policies Guidelines][Commitment]. +(etc) + +## Community Partnerships + +- [ ] etc +- [ ] aaaaaa diff --git a/tests/data/reviews/archives_unknown.txt b/tests/data/reviews/archives_unknown.txt new file mode 100644 index 0000000..739a558 --- /dev/null +++ b/tests/data/reviews/archives_unknown.txt @@ -0,0 +1,28 @@ +Submitting Author: Fakename (@fakeauthor) +All current maintainers: (@fakeauthor1, @fakeauthor2) +Package Name: fake_package +One-Line Description of Package: A fake python package +Repository Link: https://example.com/fakeauthor1/fake_package +Version submitted: v1.0.0 +EiC: @fakeeic +Editor: @fakeeditor +Reviewer 1: @fakereviewer1 +Reviewer 2: @fakereviewer2 +Reviews Expected By: fake date +Archive: TBD +JOSS DOI: N/A +Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive)) +Date accepted (month/day/year): 06/29/2024 + +--- + +## Scope + +- [x] I agree to abide by [pyOpenSci's Code of Conduct][PyOpenSciCodeOfConduct] during the review process and in maintaining my package after should it be accepted. +- [x] I have read and will commit to package maintenance after the review as per the [pyOpenSci Policies Guidelines][Commitment]. +(etc) + +## Community Partnerships + +- [ ] etc +- [ ] aaaaaa diff --git a/tests/data/reviews/bolded_keys.txt b/tests/data/reviews/bolded_keys.txt index 7275920..a3ddd3a 100644 --- a/tests/data/reviews/bolded_keys.txt +++ b/tests/data/reviews/bolded_keys.txt @@ -9,8 +9,8 @@ **Reviewer 1:** @fakereviewer1 **Reviewer 2:** @fakereviewer2 **Reviews Expected By:** fake date -**Archive:** [![DOI](https://example.com/fakearchive)](https://example.com/fakearchive) -**Version accepted:** 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive)) +**Archive:** [![DOI](https://example.com/fakearchive)](https://zenodo.org/records/8415866) +**Version accepted:** 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://zenodo.org/records/8415866)) **Date accepted (month/day/year):** 06/29/2024 --- diff --git a/tests/data/reviews/partnership_astropy.txt b/tests/data/reviews/partnership_astropy.txt index 3c9a475..7c86395 100644 --- a/tests/data/reviews/partnership_astropy.txt +++ b/tests/data/reviews/partnership_astropy.txt @@ -7,8 +7,8 @@ Version submitted: v.0.8.5 Editor: @editoruser Reviewer 1: @reviewer1 Reviewer 2: @reviewer2 -Archive: [![DOI](https://zenodo.org/badge/DOI/fakedoi/doi.svg)](https://doi.org/fakedoi/doi.svg) -JOSS DOI: [![DOI](https://joss.theoj.org/papers/fakedoi.svg)](https://joss.theoj.org/papers/fakedoi) +Archive: [![DOI](https://zenodo.org/badge/DOI/fakedoi/doi.svg)](https://zenodo.org/records/8415866) +JOSS DOI: [![DOI](https://joss.theoj.org/papers/fakedoi.svg)](https://doi.org/10.21105/joss.01450) Version accepted: v.0.9.2 Date accepted (month/day/year): 04/21/2024 diff --git a/tests/data/reviews/reviewer_keyed.txt b/tests/data/reviews/reviewer_keyed.txt index 3a3560a..b5437f8 100644 --- a/tests/data/reviews/reviewer_keyed.txt +++ b/tests/data/reviews/reviewer_keyed.txt @@ -9,7 +9,7 @@ Editor: @fakeeditor Reviewer 1: @fakereviewer1 Reviewer 2: @fakereviewer2 Reviews Expected By: fake date -Archive: [![DOI](https://example.com/fakearchive)](https://example.com/fakearchive) +Archive: [![DOI](https://example.com/fakearchive)](https://zenodo.org/records/8415866) Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive)) Date accepted (month/day/year): 06/29/2024 diff --git a/tests/data/reviews/reviewer_list.txt b/tests/data/reviews/reviewer_list.txt index 3e9f9e4..17df318 100644 --- a/tests/data/reviews/reviewer_list.txt +++ b/tests/data/reviews/reviewer_list.txt @@ -8,7 +8,8 @@ EiC: @fakeeic Editor: @fakeeditor Reviewers: @fakereviewer1 , @fakereviewer2, @fakereviewer3 Reviews Expected By: fake date -Archive: [![DOI](https://example.com/fakearchive)](https://example.com/fakearchive) +Archive: [![DOI](https://example.com/fakearchive)](https://zenodo.org/records/8415866) +JOSS DOI: [![DOI](https://example.com/fakearchive)](https://doi.org/10.21105/joss.01450) Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive)) Date accepted (month/day/year): 06/29/2024 diff --git a/tests/integration/test_parse_issues.py b/tests/integration/test_parse_issues.py index 237ebfe..8fa2586 100644 --- a/tests/integration/test_parse_issues.py +++ b/tests/integration/test_parse_issues.py @@ -51,6 +51,33 @@ def test_parse_bolded_keys(process_issues, data_file): assert review.package_name == "fake_package" +def test_parse_doi_archives(process_issues, data_file): + """ + Test handling of DOI archives in various formats. + + This is a smoke test to ensure graceful handling of these cases. + """ + review = data_file("reviews/archives_doi.txt", True) + review = process_issues.parse_issue(review) + assert review.archive == "https://zenodo.org/record/8415866" + assert review.joss == "http://joss.theoj.org/papers/10.21105/joss.01450" + + review = data_file("reviews/archives_unknown.txt", True) + review = process_issues.parse_issue(review) + assert review.archive is None + assert review.joss is None + + review = data_file("reviews/archives_missing.txt", True) + review = process_issues.parse_issue(review) + assert review.archive is None + assert review.joss is None + + review = data_file("reviews/archives_invalid.txt", True) + + with pytest.raises(ValueError, match="Invalid archive"): + review = process_issues.parse_issue(review) + + def test_parse_labels(issue_list, process_issues): """ `Label` models should be coerced to a string when parsing an issue