Skip to content

Commit

Permalink
fix(cff): enforce valid urls as doi (#108)
Browse files Browse the repository at this point in the history
* fix(cff): parse yaml to handle quotes + prepend scheme to doi

* chore(fmt): quotes

* chore: update lock

* docs(readme): update obsolete env var name

* docs(readme): lighter fmt

* test(cff): update doctest to include scheme

* fix(cff): ensure doi is a valid url

* fix: drop unused import

* fix(cff): error handling on invalid yaml

* refactor(cff): use regex + defensive prog

* fix(cff): drop unused prefix var

* test(cff): update test cases with valid doi

* feat(cff): add warning when doi missing from cff

* fix(log): replace deprecated logger.warn -> logger.warning

* feat(log): format to include loglevel
  • Loading branch information
cmdoret authored Feb 2, 2024
1 parent 23c75dd commit e68513f
Show file tree
Hide file tree
Showing 4 changed files with 573 additions and 640 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Scientific code repositories contain valuable metadata which can be used to enri

Using Gimie is easy: it's a 3-step process.

## STEP 1: Installation
## 1: Installation

To install the stable version on PyPI:

Expand All @@ -32,10 +32,10 @@ Gimie is also available as a docker container hosted on the [Github container re
docker pull ghcr.io/sdsc-ordes/gimie:latest

# The access token can be provided as an environment variable
docker run -e ACCESS_TOKEN=$ACCESS_TOKEN ghcr.io/sdsc-ordes/gimie:latest gimie data <repo>
docker run -e GITHUB_TOKEN=$GITHUB_TOKEN ghcr.io/sdsc-ordes/gimie:latest gimie data <repo>
```

## STEP 2 : Set your credentials
## 2 : Set your credentials

To access the GitHub API, you need to provide a GitHub token with the `read:org` scope.

Expand All @@ -61,7 +61,7 @@ and/or your Gitlab token:
export GITLAB_TOKEN=
```

## STEP 3: GIMIE info ! Run Gimie
## 3: GIMIE info ! Run Gimie

### As a command line tool

Expand Down
6 changes: 5 additions & 1 deletion gimie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,8 @@
__version__ = importlib_metadata.version(__name__)

logger = logging.getLogger()
logger.setLevel(logging.WARNING)
stdout_formatter = logging.Formatter("%(levelname)s :: %(message)s")
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.WARNING)
stream_handler.setFormatter(stdout_formatter)
logger.addHandler(stream_handler)
77 changes: 65 additions & 12 deletions gimie/parsers/cff.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,17 @@
from io import BytesIO
import re
from typing import List, Optional, Set
import yaml

from rdflib.term import URIRef

from gimie import logger
from gimie.graph.namespaces import SDO
from gimie.parsers.abstract import Parser, Property


class CffParser(Parser):
"""Parse cff file to extract the doi into schema:citation <doi>."""
"""Parse DOI from CITATION.cff into schema:citation <doi>."""

def __init__(self):
super().__init__()
Expand All @@ -43,30 +45,81 @@ def parse(self, data: bytes) -> Set[Property]:
return props


def doi_to_url(doi: str) -> str:
    """Format a DOI as an https URL on doi.org.

    The DOI is located at the end of the input string; whatever precedes
    it (scheme, hostname, ...) is discarded and replaced by
    ``https://doi.org/``.

    Parameters
    ----------
    doi
        DOI where the scheme (e.g. https://) and
        hostname (e.g. doi.org) may be missing.

    Returns
    -------
    str
        DOI formatted as a valid URL with base URL https://doi.org.

    Raises
    ------
    ValueError
        If the input does not end with a well-formed DOI.

    Examples
    --------
    >>> doi_to_url("10.0000/example.abcd")
    'https://doi.org/10.0000/example.abcd'
    >>> doi_to_url("doi.org/10.0000/example.abcd")
    'https://doi.org/10.0000/example.abcd'
    >>> doi_to_url("https://doi.org/10.0000/example.abcd")
    'https://doi.org/10.0000/example.abcd'
    """

    # regex from:
    # https://www.crossref.org/blog/dois-and-matching-regular-expressions
    # The dot after the "10" directory indicator is escaped so that only a
    # literal "." is accepted (an unescaped "." would match any character,
    # letting strings such as "10x1234/abc" pass as DOIs).
    doi_match = re.search(
        r"10\.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE
    )

    if doi_match is None:
        raise ValueError(f"Not a valid DOI: {doi}")

    short_doi = doi_match.group()

    return f"https://doi.org/{short_doi}"


def get_cff_doi(data: bytes) -> Optional[str]:
    """Given a CFF file, returns the DOI, if any.

    The file is parsed as YAML and the value of its top-level ``doi`` key
    is normalized to a https://doi.org URL. Every failure mode is logged
    as a warning and results in ``None`` rather than an exception.

    Parameters
    ----------
    data
        The cff file body as bytes.

    Returns
    -------
    str, optional
        doi formatted as a valid url, or None when the file cannot be
        read, has no usable 'doi' entry, or the doi is malformed.

    Examples
    --------
    >>> get_cff_doi(bytes("doi: 10.5281/zenodo.1234", encoding="utf8"))
    'https://doi.org/10.5281/zenodo.1234'
    >>> get_cff_doi(bytes("abc: def", encoding="utf8"))
    """

    try:
        cff = yaml.safe_load(data.decode())
    # yaml.YAMLError is the base class of PyYAML's errors, so scanner,
    # parser, composer and constructor failures are all handled here;
    # bytes that cannot be decoded are treated as unreadable too.
    except (UnicodeDecodeError, yaml.YAMLError):
        logger.warning("cannot read CITATION.cff, skipped.")
        return None

    try:
        doi_url = doi_to_url(cff["doi"])
    # No doi in cff file (TypeError covers a document that is not a
    # mapping, e.g. an empty file, or a doi value that is not a string)
    except (KeyError, TypeError):
        logger.warning("CITATION.cff does not contain a 'doi' key.")
        doi_url = None
    # doi is malformed
    except ValueError as err:
        logger.warning(err)
        doi_url = None

    return doi_url
Loading

0 comments on commit e68513f

Please sign in to comment.