feat: support multiple DOIs

sdsc-ordes · Dec 17, 2024 · a884cbc · a884cbc
1 parent 70bcac9
commit a884cbc
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 34 deletions.
diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
@@ -34,21 +34,22 @@ def __init__(self, subject: str):
         super().__init__(subject)
 
     def parse(self, data: bytes) -> Graph:
-        """Extracts a DOI link and list of authors from a CFF file and returns a
-        graph with a single triple <subject> <schema:citation> <doi>
+        """Extracts DOIs and list of authors from a CFF file and returns a
+        graph with triples <subject> <schema:citation> <doi>
         and a number of author objects with <schema:name> and <md4i:orcid> values.
-        If no DOI is found, it will not be included in the graph.
-        If no authors are found, it will not be included in the graph.
-        If neither authors nor DOI are found, an empty graph is returned.
+        If no DOIs are found, no citation triples are added to the graph.
+        If no authors are found, no author triples are added to the graph.
+        If neither authors nor DOIs are found, an empty graph is returned.
         """
         extracted_cff_triples = Graph()
-        doi = get_cff_doi(data)
+        dois = get_cff_doi(data)
         authors = get_cff_authors(data)
 
-        if doi:
-            extracted_cff_triples.add(
-                (self.subject, SDO.citation, URIRef(doi))
-            )
+        if dois:
+            for doi in dois:
+                extracted_cff_triples.add(
+                    (self.subject, SDO.citation, URIRef(doi))
+                )
         if not authors:
             return extracted_cff_triples
         for author in authors:
@@ -119,8 +120,8 @@ def doi_to_url(doi: str) -> str:
     return f"https://doi.org/{doi_match}"
 
 
-def get_cff_doi(data: bytes) -> Optional[str]:
-    """Given a CFF file, returns the DOI, if any.
+def get_cff_doi(data: bytes) -> Optional[List[str]]:
+    """Given a CFF file, returns a list of DOIs, if any.
 
     Parameters
     ----------
@@ -129,13 +130,15 @@ def get_cff_doi(data: bytes) -> Optional[str]:
 
     Returns
     -------
-    str, optional
-        doi formatted as a valid url
+    list of str, optional
+        DOIs formatted as valid URLs
 
     Examples
     --------
-    >>> get_cff_doi(bytes("identifiers:\\n    - type: doi\\n      value: 10.5281/zenodo.1234", encoding="utf8"))
-    'https://doi.org/10.5281/zenodo.1234'
+    >>> get_cff_doi(bytes("identifiers:\\n    - type: doi\\n      value: 10.5281/zenodo.1234\\n    - type: doi\\n      value: 10.5281/zenodo.5678", encoding="utf8"))
+    ['https://doi.org/10.5281/zenodo.1234', 'https://doi.org/10.5281/zenodo.5678']
+    >>> get_cff_doi(bytes("identifiers:\\n    - type: doi\\n      value: 10.5281/zenodo.9012", encoding="utf8"))
+    ['https://doi.org/10.5281/zenodo.9012']
     >>> get_cff_doi(bytes("abc: def", encoding="utf8"))
     """
 
@@ -144,25 +147,24 @@ def get_cff_doi(data: bytes) -> Optional[str]:
     except yaml.scanner.ScannerError:
         logger.warning("cannot read CITATION.cff, skipped.")
         return None
+
+    doi_urls = []
     try:
         identifiers = cff.get("identifiers", [])
-        doi_identifier = next(
-            (id for id in identifiers if id.get("type") == "doi"), None
-        )
-        if doi_identifier:
-            doi_url = doi_to_url(doi_identifier["value"])
-        else:
-            raise KeyError("No DOI found in identifiers")
-    # No doi in cff file
+        for identifier in identifiers:
+            if identifier.get("type") == "doi":
+                try:
+                    doi_url = doi_to_url(identifier["value"])
+                    doi_urls.append(doi_url)
+                except ValueError as err:
+                    logger.warning(err)
     except (KeyError, TypeError):
-        logger.warning("CITATION.cff does not contain a 'doi' key.")
-        doi_url = None
-    # doi is malformed
-    except ValueError as err:
-        logger.warning(err)
-        doi_url = None
-
-    return doi_url
+        logger.warning(
+            "CITATION.cff does not contain a valid 'identifiers' key."
+        )
+        return None
+
+    return doi_urls if doi_urls else None
 
 
 def get_cff_authors(data: bytes) -> Optional[List[dict[str, str]]]:

diff --git a/tests/test_cff.py b/tests/test_cff.py
@@ -78,10 +78,18 @@ def test_parse_doi():
     identifiers:
     - type: doi
       value: 10.5281/zenodo.3555620
+    - type: doi
+      value: 10.21105/joss.01274
     """
-    obj = next(
+    parsed_dois = list(
         CffParser(subject=URIRef("https://example.org/"))
         .parse(data=cff_file)
         .objects()
     )
-    assert URIRef("https://doi.org/10.5281/zenodo.3555620") == obj
+    expected_dois = [
+        URIRef("https://doi.org/10.5281/zenodo.3555620"),
+        URIRef("https://doi.org/10.21105/joss.01274"),
+    ]
+    # parsed_dois already contains all parsed DOI objects
+    for doi in expected_dois:
+        assert doi in parsed_dois