fix(cff): enforce valid urls as doi (#108)

* fix(cff): parse yaml to handle quotes + prepend scheme to doi
* chore(fmt): quotes
* chore: update lock
* docs(readme): update obsolete env var name
* docs(readme): lighter fmt
* test(cff): update doctest to include scheme
* fix(cff): ensure doi is a valid url
* fix: drop unused import
* fix(cff): error handling on invalid yaml
* refactor(cff): use regex + defensive prog
* fix(cff): drop unused prefix var
* test(cff): update test cases with valid doi
* feat(cff): add warning when doi missing from cff
* fix(log): replace deprecated logger.warn -> logger.warning
* feat(log): format to include loglevel
sdsc-ordes · Feb 2, 2024 · e68513f · e68513f
1 parent 23c75dd
commit e68513f
Show file tree

Hide file tree

Showing 4 changed files with 573 additions and 640 deletions.
diff --git a/README.md b/README.md
@@ -12,7 +12,7 @@ Scientific code repositories contain valuable metadata which can be used to enri
 
 Using Gimie: easy peasy, it's a 3 step process.
 
-## STEP 1: Installation
+## 1: Installation
 
 To install the stable version on PyPI:
 
@@ -32,10 +32,10 @@ Gimie is also available as a docker container hosted on the [Github container re
 docker pull ghcr.io/sdsc-ordes/gimie:latest
 
 # The access token can be provided as an environment variable
-docker run -e ACCESS_TOKEN=$ACCESS_TOKEN ghcr.io/sdsc-ordes/gimie:latest gimie data <repo>
+docker run -e GITHUB_TOKEN=$GITHUB_TOKEN ghcr.io/sdsc-ordes/gimie:latest gimie data <repo>
 ```
 
-## STEP 2 : Set your credentials
+## 2 : Set your credentials
 
 In order to access the github api, you need to provide a github token with the `read:org` scope.
 
@@ -61,7 +61,7 @@ and/or your Gitlab token:
 export GITLAB_TOKEN=
 ```
 
-## STEP 3: GIMIE info ! Run Gimie
+## 3: GIMIE info ! Run Gimie
 
 ### As a command line tool
 

diff --git a/gimie/__init__.py b/gimie/__init__.py
@@ -22,4 +22,8 @@
 __version__ = importlib_metadata.version(__name__)
 
 logger = logging.getLogger()
-logger.setLevel(logging.WARNING)
+stdout_formatter = logging.Formatter("%(levelname)s :: %(message)s")
+stream_handler = logging.StreamHandler()
+stream_handler.setLevel(logging.WARNING)
+stream_handler.setFormatter(stdout_formatter)
+logger.addHandler(stream_handler)
diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
@@ -17,15 +17,17 @@
 from io import BytesIO
 import re
 from typing import List, Optional, Set
+import yaml
 
 from rdflib.term import URIRef
 
+from gimie import logger
 from gimie.graph.namespaces import SDO
 from gimie.parsers.abstract import Parser, Property
 
 
 class CffParser(Parser):
-    """Parse cff file to extract the doi into schema:citation <doi>."""
+    """Parse DOI from CITATION.cff into schema:citation <doi>."""
 
     def __init__(self):
         super().__init__()
@@ -43,30 +45,81 @@ def parse(self, data: bytes) -> Set[Property]:
         return props
 
 
+def doi_to_url(doi: str) -> str:
+    """Formats a doi to an https URL to doi.org.
+
+    Parameters
+    ----------
+    doi
+        doi where the scheme (e.g. https://) and
+        hostname (e.g. doi.org) may be missing.
+
+    Returns
+    -------
+    str
+        doi formatted as a valid url. Base url
+        is set to https://doi.org when missing.
+
+    Examples
+    --------
+    >>> doi_to_url("10.0000/example.abcd")
+    'https://doi.org/10.0000/example.abcd'
+    >>> doi_to_url("doi.org/10.0000/example.abcd")
+    'https://doi.org/10.0000/example.abcd'
+    >>> doi_to_url("https://doi.org/10.0000/example.abcd")
+    'https://doi.org/10.0000/example.abcd'
+    """
+
+    # regex from:
+    # https://www.crossref.org/blog/dois-and-matching-regular-expressions
+    doi_match = re.search(
+        r"10.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE
+    )
+
+    if doi_match is None:
+        raise ValueError(f"Not a valid DOI: {doi}")
+
+    short_doi = doi_match.group()
+
+    return f"https://doi.org/{short_doi}"
+
+
 def get_cff_doi(data: bytes) -> Optional[str]:
     """Given a CFF file, returns the DOI, if any.
 
     Parameters
     ----------
-    data:
+    data
         The cff file body as bytes.
 
+    Returns
+    -------
+    str, optional
+        doi formatted as a valid url
+
     Examples
     --------
     >>> get_cff_doi(bytes("doi:   10.5281/zenodo.1234", encoding="utf8"))
-    '10.5281/zenodo.1234'
+    'https://doi.org/10.5281/zenodo.1234'
     >>> get_cff_doi(bytes("abc: def", encoding="utf8"))
 
     """
 
-    matches = re.search(
-        r"^doi: *(.*)$",
-        data.decode(),
-        flags=re.IGNORECASE | re.MULTILINE,
-    )
     try:
-        doi = matches.groups()[0]
-    except AttributeError:
-        doi = None
+        cff = yaml.safe_load(data.decode())
+    except yaml.scanner.ScannerError:
+        logger.warning("cannot read CITATION.cff, skipped.")
+        return None
+
+    try:
+        doi_url = doi_to_url(cff["doi"])
+    # No doi in cff file
+    except (KeyError, TypeError):
+        logger.warning("CITATION.cff does not contain a 'doi' key.")
+        doi_url = None
+    # doi is malformed
+    except ValueError as err:
+        logger.warning(err)
+        doi_url = None
 
-    return doi
+    return doi_url