Skip to content

Commit

Permalink
fix: tests for cff, add test for doi, move doi and orcid matchers to …
Browse files Browse the repository at this point in the history
…utils
  • Loading branch information
rmfranken committed Nov 22, 2024
1 parent aef27dd commit ee9238e
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 6 deletions.
9 changes: 3 additions & 6 deletions gimie/parsers/cff.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from gimie import logger
from gimie.graph.namespaces import SDO, MD4I
from gimie.parsers.abstract import Parser
from gimie.utils.uri import is_valid_orcid, valid_doi_match_extractor


class CffParser(Parser):
Expand Down Expand Up @@ -52,9 +53,7 @@ def parse(self, data: bytes) -> Graph:
return extracted_cff_triples
for author in authors:
orcid = URIRef(author["orcid"])
if re.match(
r"https:\/\/orcid.org\/\d{4}-\d{4}-\d{4}-\d{4}", str(orcid)
):
if is_valid_orcid(orcid):
extracted_cff_triples.add(
(self.subject, SDO.author, URIRef(orcid))
)
Expand Down Expand Up @@ -114,9 +113,7 @@ def doi_to_url(doi: str) -> str:

# regex from:
# https://www.crossref.org/blog/dois-and-matching-regular-expressions
doi_match = re.search(
r"10.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE
)
doi_match = valid_doi_match_extractor(doi)

if doi_match is None:
raise ValueError(f"Not a valid DOI: {doi}")
Expand Down
60 changes: 60 additions & 0 deletions gimie/utils/uri.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from typing import List, Literal
from urllib.parse import urlparse
import re

from gimie.graph.namespaces import GIMIE

Expand Down Expand Up @@ -64,3 +65,62 @@ def generate_uri(ref: str):
'https://sdsc-ordes.github.io/gimie/abc'
"""
return str(GIMIE[ref])


def is_valid_orcid(orcid) -> bool:
    """Check if the input is a valid ORCID according to definition from orcid.org [1]_.

    The whole input must be an ORCID iD URI: ``orcid.org/`` followed by four
    dash-separated groups of four characters, optionally preceded by
    ``https://``. The first fifteen characters of the identifier are digits;
    the last one is a checksum that is either a digit or an uppercase ``X``.
    Only the structure is checked, the checksum itself is not verified.

    .. [1] https://support.orcid.org/hc/en-us/articles/360006897674-Structure-of-the-ORCID-Identifier

    Parameters
    ----------
    orcid:
        The ORCID to validate. It is converted with ``str`` before matching,
        so string-like objects are accepted.

    Returns
    -------
    bool:
        True if the ORCID is valid, False otherwise.

    Examples
    --------
    >>> is_valid_orcid("https://orcid.org/0000-0001-2345-6789")
    True
    >>> is_valid_orcid("https://orcid.org/0000-0002-1694-233X")
    True
    >>> is_valid_orcid("0000-0001-2345-6789")
    False
    >>> is_valid_orcid("http://orcid.org/0000-0001-2345-6789")
    False
    >>> is_valid_orcid("https://orcid.org/0000-0001-2345-67890")
    False
    """
    # fullmatch rejects trailing characters, the dot is escaped so that only
    # the literal host name is accepted, and the final position allows the
    # "X" checksum character used by real ORCID iDs.
    return bool(
        re.fullmatch(
            r"(https://)?orcid\.org/\d{4}-\d{4}-\d{4}-\d{3}[\dX]",
            str(orcid),
        )
    )


def valid_doi_match_extractor(doi: str):
    """Extract the DOI found at the end of the input, using the pattern from crossref.org [1]_.

    The input is searched (not fully matched), so a DOI preceded by other
    text such as a ``https://doi.org/`` prefix is still found, as long as the
    DOI runs up to the end of the string. Matching is case-insensitive.

    .. [1] https://www.crossref.org/blog/dois-and-matching-regular-expressions

    Parameters
    ----------
    doi:
        The string to search for a DOI.

    Returns
    -------
    re.Match or None:
        A match object whose ``group()`` is the DOI (starting at ``10.``),
        or None if the input does not end with a valid DOI.

    Examples
    --------
    >>> valid_doi_match_extractor("10.0000/example.abcd").group()
    '10.0000/example.abcd'
    >>> valid_doi_match_extractor("https://doi.org/10.0000/example.abcd").group()
    '10.0000/example.abcd'
    >>> valid_doi_match_extractor("not-a-doi") is None
    True
    """
    # The dot after the "10" directory indicator is escaped so that only a
    # literal "." is accepted there rather than any character.
    return re.search(
        r"10\.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE
    )
75 changes: 75 additions & 0 deletions tests/test_cff.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from gimie.io import LocalResource
from gimie.parsers import CffParser
from gimie.parsers.cff import get_cff_authors
from rdflib import URIRef


def test_parse_cff():
Expand All @@ -8,3 +10,76 @@ def test_parse_cff():
cff_content = f.read()
authors = get_cff_authors(cff_content)
assert authors is not None


def test_broken_cff():
    """Check that incomplete or malformed CFF inputs yield an empty graph."""
    # CFF that carries neither an authors list nor a DOI.
    cff_file_emptyish = b"""
cff-version: 1.2.0
message: "This is a CFF devoid authors or DOI"
"""
    # Deliberately malformed input: stray colon after the title value and a
    # first author entry that lacks the leading list marker.
    cff_file_bad_syntax = b"""
cff-version: 1.2.0
title: gimie :
authors:
family-names: Doe
given-names: John
- family-names: Smith
given-names:
Jane
orcid: 0000-0001-2345-6789
"""
    # Neither ORCID uses the https://orcid.org/ form: the first is a bare
    # identifier, the second uses an http://www. prefix.
    cff_file_broken_orcid = b"""
cff-version: 1.2.0
title: gimie
authors:
- family-names: Doe
given-names: John
orcid: 0000-0001-2345-6789
- family-names: Smith
given-names: Jane
orcid: http://www.orcid.org/0000-0001-2345-6789
"""
    # A single author entry with names but no orcid key.
    cff_file_author_without_orcid = b"""
cff-version: 1.2.0
title: gimie
authors:
- family-names: Doe
given-names: John
"""

    # empty graph = \n according to rdflib
    # Each input above is expected to produce no triples, so the Turtle
    # serialization of the parsed graph is compared against a lone newline.
    assert (
        CffParser(subject=URIRef("https://example.org/"))
        .parse(data=cff_file_emptyish)
        .serialize(format="ttl")
        == "\n"
    )
    assert (
        CffParser(subject=URIRef("https://example.org/"))
        .parse(data=cff_file_bad_syntax)
        .serialize(format="ttl")
        == "\n"
    )
    assert (
        CffParser(subject=URIRef("https://example.org/"))
        .parse(data=cff_file_broken_orcid)
        .serialize(format="ttl")
        == "\n"
    )
    assert (
        CffParser(subject=URIRef("https://example.org/"))
        .parse(data=cff_file_author_without_orcid)
        .serialize(format="ttl")
        == "\n"
    )


def test_doi():
    """Check that a bare DOI in a CFF file appears as a https://doi.org/ URL."""
    # Minimal CFF whose only identifier is a DOI without any URL prefix.
    cff_file = b"""
cff-version: 1.2.0
title: gimie
doi: 10.5281/zenodo.1234567
"""
    # The Turtle output of the parsed graph must contain the DOI in URL form.
    assert "https://doi.org/10.5281/zenodo.1234567" in CffParser(
        subject=URIRef("https://example.org/")
    ).parse(data=cff_file).serialize(format="ttl")

0 comments on commit ee9238e

Please sign in to comment.