From fada9d4adae7de75d7960a2f3014bb90a6b15ab7 Mon Sep 17 00:00:00 2001
From: Robin Franken <77491494+rmfranken@users.noreply.github.com>
Date: Thu, 14 Nov 2024 15:42:41 +0100
Subject: [PATCH 01/31] docs: add CFF file (#111)

Adds a citation file (CITATION.cff) to the gimie repository for documentation and testing purposes.
---
 CITATION.cff | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 CITATION.cff

diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000..0f6d174
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,44 @@
+# This CITATION.cff file was generated with cffinit.
+# Visit https://bit.ly/cffinit to generate yours today!
+
+cff-version: 1.2.0
+title: gimie
+message: >-
+  If you use this software, please cite it using the
+  metadata from this file.
+type: software
+authors:
+  - given-names: Cyril
+    family-names: Matthey-Doret
+    affiliation: Swiss Data Science Center
+    orcid: 'https://orcid.org/0000-0002-1126-1535'
+  - given-names: Sabine
+    family-names: Maennel
+    orcid: 'https://orcid.org/0009-0001-3022-8239'
+    affiliation: Swiss Data Science Center
+  - given-names: Robin
+    family-names: Franken
+    orcid: 'https://orcid.org/0009-0008-0143-9118'
+    affiliation: Swiss Data Science Center
+  - given-names: Martin
+    family-names: Fontanet
+    orcid: 'https://orcid.org/0000-0002-6441-8540'
+    affiliation: Swiss Data Science Center
+  - given-names: Laure
+    family-names: Vancauwenberghe
+    affiliation: Swiss Data Science Center
+  - given-names: Stefan
+    family-names: Milosavljevic
+    email: supermegaiperste@hotmail.com
+    affiliation: Swiss Data Science Center
+repository-code: 'https://github.com/sdsc-ordes/gimie'
+abstract: Extract linked metadata from repositories
+keywords:
+  - git
+  - cli
+  - library
+  - linked-open-data
+  - metadata-extraction
+  - fair-data
+  - scientific-software
+license: Apache-2.0

From f1b7e0bb653ebdd32c6356c6f7273d2053ae2f54 Mon Sep 17 00:00:00 2001
From: Robin Franken <77491494+rmfranken@users.noreply.github.com>
Date: Tue, 19 Nov 2024 07:46:38 +0100
Subject: [PATCH 02/31] fix: spelling mistake in run as library docs (#113)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ef023b5..88477e9 100644
--- a/README.md
+++ b/README.md
@@ -74,7 +74,7 @@ gimie data https://github.com/numpy/numpy
 
 ```python
 from gimie.project import Project
-proj = Project("https://github.com/numpy/numpy)
+proj = Project("https://github.com/numpy/numpy")
 
 # To retrieve the rdflib.Graph object
 g = proj.extract()

From 5d0a1dc06b6b2234d1932c8f939738c9c6772933 Mon Sep 17 00:00:00 2001
From: raj kumar <79806602+raj921@users.noreply.github.com>
Date: Tue, 19 Nov 2024 14:21:28 +0530
Subject: [PATCH 03/31] Improve authentication error messages (#116)

This commit improves the error messages shown when authentication fails:

1. Added specific error messages for:
   - Missing GitHub token
   - Invalid GitHub token
   - API rate limit exceeded

2. Added clear guidance on how to fix authentication issues:
   - Instructions to set GITHUB_TOKEN/GITLAB_TOKEN
   - Suggestion to check token validity

3. Improved error handling in GitHub extractor:
   - Better token validation in _headers method
   - More descriptive error messages
   - Proper handling of network connection issues

Example error messages:
- 'GitHub token not found. Please set the GITHUB_TOKEN environment variable...'
- 'GitHub authentication failed. Please check that your GITHUB_TOKEN is valid'
- 'Authentication failed: API rate limit exceeded. Please check...'

This makes it easier for users to understand and fix authentication issues.

Co-authored-by: Robin Franken <77491494+rmfranken@users.noreply.github.com>
---
 gimie/extractors/common/queries.py | 16 ++++++++++++++--
 gimie/extractors/github.py         | 16 +++++++++++-----
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/gimie/extractors/common/queries.py b/gimie/extractors/common/queries.py
index 3414b95..dcc6d53 100644
--- a/gimie/extractors/common/queries.py
+++ b/gimie/extractors/common/queries.py
@@ -28,7 +28,13 @@ def send_rest_query(
     )
 
     if resp.status_code != 200:
-        raise ConnectionError(resp.json()["message"])
+        error_msg = resp.json().get("message", "")
+        if "API rate limit exceeded" in error_msg:
+            raise ConnectionError(
+                "Authentication failed: API rate limit exceeded. Please check that you have added "
+                "your GITHUB_TOKEN or GITLAB_TOKEN to your environment variables."
+            )
+        raise ConnectionError(f"API request failed: {error_msg}")
     return resp.json()
 
 
@@ -46,5 +52,11 @@ def send_graphql_query(
     )
 
     if resp.status_code != 200:
-        raise ConnectionError(resp.json()["message"])
+        error_msg = resp.json().get("message", "")
+        if "API rate limit exceeded" in error_msg:
+            raise ConnectionError(
+                "Authentication failed: API rate limit exceeded. Please check that you have added "
+                "your GITHUB_TOKEN or GITLAB_TOKEN to your environment variables."
+            )
+        raise ConnectionError(f"API request failed: {error_msg}")
     return resp.json()
diff --git a/gimie/extractors/github.py b/gimie/extractors/github.py
index 96d56f6..fb28afa 100644
--- a/gimie/extractors/github.py
+++ b/gimie/extractors/github.py
@@ -256,15 +256,21 @@ def _headers(self) -> Any:
         try:
             if not self.token:
                 self.token = os.environ.get("GITHUB_TOKEN")
-                assert self.token
+                if not self.token:
+                    raise ValueError(
+                        "GitHub token not found. Please set the GITHUB_TOKEN environment variable "
+                        "with your GitHub personal access token."
+                    )
             headers = {"Authorization": f"token {self.token}"}
 
             login = requests.get(f"{GH_API}/user", headers=headers)
-            assert login.json().get("login")
-        except AssertionError:
-            return {}
-        else:
+            if not login.ok or not login.json().get("login"):
+                raise ValueError(
+                    "GitHub authentication failed. Please check that your GITHUB_TOKEN is valid."
+                )
             return headers
+        except requests.exceptions.RequestException as e:
+            raise ConnectionError(f"Failed to connect to GitHub API: {str(e)}")
 
     def _get_keywords(self, *nodes: Dict[str, Any]) -> List[str]:
         """Extract names from GraphQL topic nodes."""

From 1866420f39b15312e6f73cab60c73f8e3ca677d3 Mon Sep 17 00:00:00 2001
From: Cyril Matthey-Doret <cyril.matthey-doret@epfl.ch>
Date: Tue, 19 Nov 2024 10:07:23 +0100
Subject: [PATCH 04/31] ci: make conventional PR title check optional (#117)

---
 .github/workflows/conventional-prs.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/conventional-prs.yml b/.github/workflows/conventional-prs.yml
index d45ea2e..1de34c7 100644
--- a/.github/workflows/conventional-prs.yml
+++ b/.github/workflows/conventional-prs.yml
@@ -1,4 +1,4 @@
-name: PR
+name: Conventional PR title
 on:
   pull_request_target:
     types:
@@ -12,7 +12,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       # https://github.com/amannn/action-semantic-pull-request
-      - uses: amannn/action-semantic-pull-request@v5.3.0
+      - name: PR title format check
+        uses: amannn/action-semantic-pull-request@v5.3.0
+        continue-on-error: true
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:

From a91b59c991277893d4dbd702112b9b8690e6c219 Mon Sep 17 00:00:00 2001
From: Robin Franken <77491494+rmfranken@users.noreply.github.com>
Date: Wed, 20 Nov 2024 09:44:17 +0100
Subject: [PATCH 05/31] fix: unused import

Co-authored-by: Cyril Matthey-Doret <cyril.matthey-doret@epfl.ch>
---
 gimie/parsers/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gimie/parsers/__init__.py b/gimie/parsers/__init__.py
index 599c49b..ef5a31c 100644
--- a/gimie/parsers/__init__.py
+++ b/gimie/parsers/__init__.py
@@ -24,7 +24,7 @@
 from gimie.parsers.license import LicenseParser, is_license_filename
 from gimie.parsers.cff import CffParser
 
-from rdflib import Graph, URIRef
+from rdflib import Graph
 
 
 class ParserInfo(NamedTuple):

From b088a451c4ebcea1afbe8c0d82e810b6651c187e Mon Sep 17 00:00:00 2001
From: Robin Franken <77491494+rmfranken@users.noreply.github.com>
Date: Wed, 20 Nov 2024 10:15:20 +0100
Subject: [PATCH 06/31] fix: typo

Co-authored-by: Cyril Matthey-Doret <cyril.matthey-doret@epfl.ch>
---
 gimie/parsers/cff.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index ba950ae..8bef575 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -27,7 +27,7 @@
 
 
 class CffParser(Parser):
-    """Parse DOI and authorsfrom CITATION.cff into schema:citation <doi>. and schema:"""
+    """Parse DOI and authors from CITATION.cff."""
 
     def __init__(self, subject: str):
         super().__init__(subject)

From d3eb1f4f17d30c8d596ab1c4fb0969cb76b621a3 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Wed, 20 Nov 2024 10:22:58 +0100
Subject: [PATCH 07/31] refactor: rename variable

---
 gimie/parsers/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gimie/parsers/__init__.py b/gimie/parsers/__init__.py
index ef5a31c..8abaa49 100644
--- a/gimie/parsers/__init__.py
+++ b/gimie/parsers/__init__.py
@@ -104,11 +104,11 @@ def parse_files(
     parsers:
         A set of parser names. If None, use the default collection.
     """
-    new_graph = Graph()
+    parsed_properties = Graph()
     for file in files:
         parser = select_parser(file.path, parsers)
         if not parser:
             continue
         data = file.open().read()
-        new_graph |= parser(subject).parse(data or b"")
-    return new_graph
+        parsed_properties |= parser(subject).parse(data or b"")
+    return parsed_properties

From b81b5b648a3230b6502b251abb84fc21e8357438 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Wed, 20 Nov 2024 10:26:01 +0100
Subject: [PATCH 08/31] docs: add docstring parameter for parser class

---
 gimie/parsers/abstract.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/gimie/parsers/abstract.py b/gimie/parsers/abstract.py
index b9508d9..3a06af4 100644
--- a/gimie/parsers/abstract.py
+++ b/gimie/parsers/abstract.py
@@ -22,11 +22,17 @@
 
 
 class Parser(ABC):
-    """Parser is an Abstract Base Class. It is only meant
+    """
+    Parser is an Abstract Base Class. It is only meant
     to define a standard interface for all parsers.
 
     All subclasses must implement parse(). A parser parses
     bytes data into a set of predicate-object tuples.
+
+    Parameters
+    ----------
+    subject : str
+    The subject of a triple (subject - predicate - object) to be used for writing parsed properties to.
     """
 
     def __init__(self, subject: str):

From 5238b6449dc478578b696abfb53fdcef3f537236 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Wed, 20 Nov 2024 10:28:05 +0100
Subject: [PATCH 09/31] refactor: rename variable

---
 gimie/parsers/cff.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index 8bef575..e0d1202 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -40,20 +40,22 @@ def parse(self, data: bytes) -> Graph:
         If no authors are found, it will not be included in the graph.
         If neither authors nor DOI are found, an empty graph is returned.
         """
-        rdf_graph = Graph()
+        extracted_cff_triples = Graph()
         doi = get_cff_doi(data)
         authors = get_cff_authors(data)
 
         if doi:
-            rdf_graph.add((self.subject, SDO.citation, URIRef(doi)))
+            extracted_cff_triples.add(
+                (self.subject, SDO.citation, URIRef(doi))
+            )
         if not authors:
-            return rdf_graph
+            return extracted_cff_triples
         for author in authors:
             if author["orcid"]:
-                rdf_graph.add(
+                extracted_cff_triples.add(
                     (self.subject, SDO.author, URIRef(author["orcid"]))
                 )
-                rdf_graph.add(
+                extracted_cff_triples.add(
                     (
                         URIRef(author["orcid"]),
                         SDO.name,
@@ -64,22 +66,24 @@ def parse(self, data: bytes) -> Graph:
                         ),
                     )
                 )
-                rdf_graph.add(
+                extracted_cff_triples.add(
                     (
                         URIRef(author["orcid"]),
                         MD4I.orcidId,
                         Literal(author["orcid"]),
                     )
                 )
-                rdf_graph.add(
+                extracted_cff_triples.add(
                     (
                         URIRef(author["orcid"]),
                         SDO.affiliation,
                         Literal(author["affiliation"]),
                     )
                 )
-                rdf_graph.add((URIRef(author["orcid"]), RDF.type, SDO.Person))
-        return rdf_graph
+                extracted_cff_triples.add(
+                    (URIRef(author["orcid"]), RDF.type, SDO.Person)
+                )
+        return extracted_cff_triples
 
 
 def doi_to_url(doi: str) -> str:

From 5645f6d7df6757652b838adca03e921b0cb88ca5 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Wed, 20 Nov 2024 11:28:44 +0100
Subject: [PATCH 10/31] feat: check if orcid is valid before writing

---
 gimie/parsers/cff.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index e0d1202..b6aa2e2 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -51,13 +51,16 @@ def parse(self, data: bytes) -> Graph:
         if not authors:
             return extracted_cff_triples
         for author in authors:
-            if author["orcid"]:
+            orcid = URIRef(author["orcid"])
+            if re.match(
+                r"https:\/\/orcid.org\/\d{4}-\d{4}-\d{4}-\d{4}", str(orcid)
+            ):
                 extracted_cff_triples.add(
-                    (self.subject, SDO.author, URIRef(author["orcid"]))
+                    (self.subject, SDO.author, URIRef(orcid))
                 )
                 extracted_cff_triples.add(
                     (
-                        URIRef(author["orcid"]),
+                        URIRef(orcid),
                         SDO.name,
                         Literal(
                             author["given-names"]
@@ -68,21 +71,19 @@ def parse(self, data: bytes) -> Graph:
                 )
                 extracted_cff_triples.add(
                     (
-                        URIRef(author["orcid"]),
+                        orcid,
                         MD4I.orcidId,
-                        Literal(author["orcid"]),
+                        Literal(orcid),
                     )
                 )
                 extracted_cff_triples.add(
                     (
-                        URIRef(author["orcid"]),
+                        orcid,
                         SDO.affiliation,
                         Literal(author["affiliation"]),
                     )
                 )
-                extracted_cff_triples.add(
-                    (URIRef(author["orcid"]), RDF.type, SDO.Person)
-                )
+                extracted_cff_triples.add((orcid, RDF.type, SDO.Person))
         return extracted_cff_triples
 
 

From 6a25951ffb262e2acf9be89d47a1424ebddb03e9 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Wed, 20 Nov 2024 11:31:47 +0100
Subject: [PATCH 11/31] refactor: rename variable

---
 gimie/parsers/license/__init__.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/gimie/parsers/license/__init__.py b/gimie/parsers/license/__init__.py
index d19eef4..fa4bedc 100644
--- a/gimie/parsers/license/__init__.py
+++ b/gimie/parsers/license/__init__.py
@@ -41,12 +41,14 @@ def parse(self, data: bytes) -> Graph:
         graph with a single triple <url> <schema:license> <spdx_url>.
         If no matching URL is found, an empty graph is returned.
         """
-        new_graph = Graph()
+        extracted_license_triple = Graph()
         license_url = match_license(data)
 
         if license_url:
-            new_graph.add((self.subject, SDO.license, URIRef(license_url)))
-        return new_graph
+            extracted_license_triple.add(
+                (self.subject, SDO.license, URIRef(license_url))
+            )
+        return extracted_license_triple
 
 
 def match_license(data: bytes, min_similarity: float = 0.9) -> Optional[str]:

From f7a1165d5a17245924c23b1604c7c1946ab46470 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Wed, 20 Nov 2024 11:44:12 +0100
Subject: [PATCH 12/31] chore: remove pyshacl

---
 poetry.lock          | 107 +------------------------------------------
 pyproject.toml       |   2 -
 tests/test_output.py |  24 +---------
 3 files changed, 3 insertions(+), 130 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index ae5ff2b..a76846e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -510,27 +510,6 @@ gitdb = ">=4.0.1,<5"
 doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"]
 test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"]
 
-[[package]]
-name = "html5lib"
-version = "1.1"
-description = "HTML parser based on the WHATWG HTML specification"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
-files = [
-    {file = "html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d"},
-    {file = "html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f"},
-]
-
-[package.dependencies]
-six = ">=1.9"
-webencodings = "*"
-
-[package.extras]
-all = ["chardet (>=2.2)", "genshi", "lxml"]
-chardet = ["chardet (>=2.2)"]
-genshi = ["genshi"]
-lxml = ["lxml"]
-
 [[package]]
 name = "identify"
 version = "2.6.2"
@@ -1085,20 +1064,6 @@ files = [
     {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
 ]
 
-[[package]]
-name = "owlrl"
-version = "6.0.2"
-description = "OWL-RL and RDFS based RDF Closure inferencing for Python"
-optional = false
-python-versions = "*"
-files = [
-    {file = "owlrl-6.0.2-py3-none-any.whl", hash = "sha256:57eca06b221edbbc682376c8d42e2ddffc99f61e82c0da02e26735592f08bacc"},
-    {file = "owlrl-6.0.2.tar.gz", hash = "sha256:904e3310ff4df15101475776693d2427d1f8244ee9a6a9f9e13c3c57fae90b74"},
-]
-
-[package.dependencies]
-rdflib = ">=6.0.2"
-
 [[package]]
 name = "packaging"
 version = "24.2"
@@ -1170,23 +1135,6 @@ nodeenv = ">=0.11.1"
 pyyaml = ">=5.1"
 virtualenv = ">=20.10.0"
 
-[[package]]
-name = "prettytable"
-version = "3.12.0"
-description = "A simple Python library for easily displaying tabular data in a visually appealing ASCII table format"
-optional = false
-python-versions = ">=3.9"
-files = [
-    {file = "prettytable-3.12.0-py3-none-any.whl", hash = "sha256:77ca0ad1c435b6e363d7e8623d7cc4fcf2cf15513bf77a1c1b2e814930ac57cc"},
-    {file = "prettytable-3.12.0.tar.gz", hash = "sha256:f04b3e1ba35747ac86e96ec33e3bb9748ce08e254dc2a1c6253945901beec804"},
-]
-
-[package.dependencies]
-wcwidth = "*"
-
-[package.extras]
-tests = ["pytest", "pytest-cov", "pytest-lazy-fixtures"]
-
 [[package]]
 name = "pydantic"
 version = "2.9.2"
@@ -1202,8 +1150,8 @@ files = [
 annotated-types = ">=0.6.0"
 pydantic-core = "2.23.4"
 typing-extensions = [
-    {version = ">=4.6.1", markers = "python_version < \"3.13\""},
     {version = ">=4.12.2", markers = "python_version >= \"3.13\""},
+    {version = ">=4.6.1", markers = "python_version < \"3.13\""},
 ]
 
 [package.extras]
@@ -1378,35 +1326,6 @@ files = [
 [package.extras]
 diagrams = ["jinja2", "railroad-diagrams"]
 
-[[package]]
-name = "pyshacl"
-version = "0.26.0"
-description = "Python SHACL Validator"
-optional = false
-python-versions = "<4.0.0,>=3.8.1"
-files = [
-    {file = "pyshacl-0.26.0-py3-none-any.whl", hash = "sha256:a4bef4296d56305a30e0a97509e541ebe4f2cc2d5da73536d0541233e28f2d22"},
-    {file = "pyshacl-0.26.0.tar.gz", hash = "sha256:48d44f317cd9aad8e3fdb5df8aa5706fa92dc6b2746419698035e84a320fb89d"},
-]
-
-[package.dependencies]
-html5lib = ">=1.1,<2"
-importlib-metadata = {version = ">6", markers = "python_version < \"3.12\""}
-owlrl = ">=6.0.2,<7"
-packaging = ">=21.3"
-prettytable = [
-    {version = ">=3.5.0", markers = "python_version >= \"3.8\" and python_version < \"3.12\""},
-    {version = ">=3.7.0", markers = "python_version >= \"3.12\""},
-]
-rdflib = {version = ">=6.3.2,<8.0", markers = "python_full_version >= \"3.8.1\""}
-
-[package.extras]
-dev-coverage = ["coverage (>6.1,!=6.1.1,<7)", "platformdirs", "pytest-cov (>=2.8.1,<3.0.0)"]
-dev-lint = ["black (==24.3.0)", "platformdirs", "ruff (>=0.1.5,<0.2.0)"]
-dev-type-checking = ["mypy (>=0.812,<0.900)", "mypy (>=0.900,<0.1000)", "platformdirs", "types-setuptools"]
-http = ["sanic (>=22.12,<23)", "sanic-cors (==2.2.0)", "sanic-ext (>=23.3,<23.6)"]
-js = ["pyduktape2 (>=0.4.6,<0.5.0)"]
-
 [[package]]
 name = "pytest"
 version = "7.4.4"
@@ -1979,28 +1898,6 @@ platformdirs = ">=3.9.1,<5"
 docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
 test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
 
-[[package]]
-name = "wcwidth"
-version = "0.2.13"
-description = "Measures the displayed width of unicode strings in a terminal"
-optional = false
-python-versions = "*"
-files = [
-    {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"},
-    {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"},
-]
-
-[[package]]
-name = "webencodings"
-version = "0.5.1"
-description = "Character encoding aliases for legacy web content"
-optional = false
-python-versions = "*"
-files = [
-    {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"},
-    {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"},
-]
-
 [[package]]
 name = "zipp"
 version = "3.21.0"
@@ -2023,4 +1920,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<4.0"
-content-hash = "0ef42bb4efef0321a95a0108c759ceb45de4f6702428e5869ebc18ced68cf3c0"
+content-hash = "ddcf7954deabd2ca49a8e05483c0017d1e910f7ce7caa166c7f237c0dc80ea10"
diff --git a/pyproject.toml b/pyproject.toml
index e5150e3..9872ae1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,8 +27,6 @@ classifiers = [
 python = ">=3.9,<4.0"
 gitpython = ">=3.1.35"
 PyDriller = "^2.5"
-pyshacl = "^0.26.0"
-# temporarily disabled due to installation problems
 typer = "^0.7.0"
 calamus = "^0.4.2"
 requests = "^2.28.2"
diff --git a/tests/test_output.py b/tests/test_output.py
index 991ab87..819ab80 100644
--- a/tests/test_output.py
+++ b/tests/test_output.py
@@ -15,8 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Test the gimie output"""
-# from pyshacl import validate
-# import pytest
+import pytest
 from rdflib import Graph
 
 from gimie.project import Project
@@ -33,24 +32,3 @@ def test_validate_output_is_linked_data():
     """Is output valid RDF?"""
     g = Graph().parse(format="ttl", data=OUT_TTL)
     assert g is not None
-
-
-# @pytest.mark.skip("not yet implemented")
-# def test_output_conforms_shapes():
-#     """Does graph conform SHACL shapes graph?"""
-#     with open("shaclgraph.ttl") as shapes:
-#         shapes_graph = Graph().parse(shapes.read())
-#     valid_graph, _, _ = validate(
-#         data_graph=Graph().parse(data=OUT_TTL),
-#         shacl_graph=shapes_graph,
-#         ont_graph=None,
-#         inference="rdfs",
-#         abort_on_first=False,
-#         allow_infos=False,
-#         allow_warnings=False,
-#         meta_shacl=False,
-#         advanced=False,
-#         js=False,
-#         debug=False,
-#     )
-#     assert valid_graph

From 5aba09d81481860ac7f0dd048f9c2c2bb126a256 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Wed, 20 Nov 2024 11:45:36 +0100
Subject: [PATCH 13/31] fix: typo

---
 tests/test_parsers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index 1e9de8f..566dd9c 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -28,5 +28,5 @@ def test_parse_license():
 
 def test_parse_nothing():
     folder = LocalResource("tests")
-    graph = parse_files(subject=URIRef("https://exmaple.org/"), files=[folder])
+    graph = parse_files(subject=URIRef("https://example.org/"), files=[folder])
     assert len(graph) == 0

From aef27ddce5f53da2d3502309276fa892990567c2 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Wed, 20 Nov 2024 11:47:16 +0100
Subject: [PATCH 14/31] fix: remove unused imports

---
 gimie/parsers/cff.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index b6aa2e2..51334e7 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -19,11 +19,11 @@
 from typing import List, Optional, Set
 import yaml
 from rdflib.term import URIRef
-from rdflib import Graph, BNode, URIRef, Literal
+from rdflib import Graph, URIRef, Literal
 from rdflib.namespace import RDF
 from gimie import logger
 from gimie.graph.namespaces import SDO, MD4I
-from gimie.parsers.abstract import Parser, Property
+from gimie.parsers.abstract import Parser
 
 
 class CffParser(Parser):

From ee9238e67f6a9d0f8178cb3df917913d808da4e9 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Fri, 22 Nov 2024 10:56:54 +0100
Subject: [PATCH 15/31] fix: tests for cff, add test for doi, move doi and
 orcid matchers to utils

---
 gimie/parsers/cff.py |  9 ++----
 gimie/utils/uri.py   | 60 +++++++++++++++++++++++++++++++++++
 tests/test_cff.py    | 75 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 138 insertions(+), 6 deletions(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index 51334e7..839ac87 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -24,6 +24,7 @@
 from gimie import logger
 from gimie.graph.namespaces import SDO, MD4I
 from gimie.parsers.abstract import Parser
+from gimie.utils.uri import is_valid_orcid, valid_doi_match_extractor
 
 
 class CffParser(Parser):
@@ -52,9 +53,7 @@ def parse(self, data: bytes) -> Graph:
             return extracted_cff_triples
         for author in authors:
             orcid = URIRef(author["orcid"])
-            if re.match(
-                r"https:\/\/orcid.org\/\d{4}-\d{4}-\d{4}-\d{4}", str(orcid)
-            ):
+            if is_valid_orcid(orcid):
                 extracted_cff_triples.add(
                     (self.subject, SDO.author, URIRef(orcid))
                 )
@@ -114,9 +113,7 @@ def doi_to_url(doi: str) -> str:
 
     # regex from:
     # https://www.crossref.org/blog/dois-and-matching-regular-expressions
-    doi_match = re.search(
-        r"10.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE
-    )
+    doi_match = valid_doi_match_extractor(doi)
 
     if doi_match is None:
         raise ValueError(f"Not a valid DOI: {doi}")
diff --git a/gimie/utils/uri.py b/gimie/utils/uri.py
index 738ce6f..232c307 100644
--- a/gimie/utils/uri.py
+++ b/gimie/utils/uri.py
@@ -18,6 +18,7 @@
 
 from typing import List, Literal
 from urllib.parse import urlparse
+import re
 
 from gimie.graph.namespaces import GIMIE
 
@@ -64,3 +65,62 @@ def generate_uri(ref: str):
     'https://sdsc-ordes.github.io/gimie/abc'
     """
     return str(GIMIE[ref])
+
+
+def is_valid_orcid(orcid):
+    """Check if the input is a valid ORCID according to definition from orcid.org [1]_.
+    .. [1] [https://support.orcid.org/hc/en-us/articles/360006897674-Structure-of-the-ORCID-Identifier](https://support.orcid.org/hc/en-us/articles/360006897674-Structure-of-the-ORCID-Identifier)
+
+    Parameters
+    ----------
+    orcid:
+        The ORCID to validate.
+
+    Returns
+    -------
+    bool:
+        True if the ORCID is valid, False otherwise.
+
+    Examples
+    --------
+    >>> is_valid_orcid("https://orcid.org/0000-0001-2345-6789")
+    True
+    >>> is_valid_orcid("0000-0001-2345-6789")
+    False
+    >>> is_valid_orcid("http://orcid.org/0000-0001-2345-6789")
+    False
+
+    """
+    return bool(
+        re.match(
+            r"(https:\/\/)?orcid.org\/\d{4}-\d{4}-\d{4}-\d{4}", str(orcid)
+        )
+    )
+
+
+def valid_doi_match_extractor(doi):
+    """Extracts doi from the input if it contains a valid DOI according to definition from crossref.org [1]_.
+    .. [1] [https://www.crossref.org/blog/dois-and-matching-regular-expressions](https://www.crossref.org/blog/dois-and-matching-regular-expressions)
+
+    Parameters
+    ----------
+    doi:
+        The DOI to validate.
+
+    Returns
+    -------
+    bool:
+        True if the DOI is valid, False otherwise.
+
+    Examples
+    --------
+    >>> is_valid_doi("10.0000/example.abcd")
+    True
+    >>> is_valid_doi("doi.org/10.0000/example.abcd")
+    False
+    >>> is_valid_doi("https://doi.org/10.0000/example.abcd")
+    False
+    """
+    return re.search(
+        r"10.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE
+    )
diff --git a/tests/test_cff.py b/tests/test_cff.py
index 88a21cf..e844b40 100644
--- a/tests/test_cff.py
+++ b/tests/test_cff.py
@@ -1,5 +1,7 @@
 from gimie.io import LocalResource
+from gimie.parsers import CffParser
 from gimie.parsers.cff import get_cff_authors
+from rdflib import URIRef
 
 
 def test_parse_cff():
@@ -8,3 +10,76 @@ def test_parse_cff():
         cff_content = f.read()
     authors = get_cff_authors(cff_content)
     assert authors is not None
+
+
+def test_broken_cff():
+    cff_file_emptyish = b"""
+    cff-version: 1.2.0
+    message: "This is a CFF devoid authors or DOI"
+    """
+    cff_file_bad_syntax = b"""
+    cff-version: 1.2.0
+    title: gimie :
+    authors:
+      family-names: Doe
+        given-names: John
+        - family-names: Smith
+    given-names:
+    Jane
+        orcid: 0000-0001-2345-6789
+    """
+    cff_file_broken_orcid = b"""
+    cff-version: 1.2.0
+    title: gimie
+    authors:
+      - family-names: Doe
+        given-names: John
+        orcid: 0000-0001-2345-6789
+      - family-names: Smith
+        given-names: Jane
+        orcid: http://www.orcid.org/0000-0001-2345-6789
+    """
+    cff_file_author_without_orcid = b"""
+    cff-version: 1.2.0
+    title: gimie
+    authors:
+      - family-names: Doe
+        given-names: John
+    """
+
+    # empty graph = \n according to rdflib
+    assert (
+        CffParser(subject=URIRef("https://example.org/"))
+        .parse(data=cff_file_emptyish)
+        .serialize(format="ttl")
+        == "\n"
+    )
+    assert (
+        CffParser(subject=URIRef("https://example.org/"))
+        .parse(data=cff_file_bad_syntax)
+        .serialize(format="ttl")
+        == "\n"
+    )
+    assert (
+        CffParser(subject=URIRef("https://example.org/"))
+        .parse(data=cff_file_broken_orcid)
+        .serialize(format="ttl")
+        == "\n"
+    )
+    assert (
+        CffParser(subject=URIRef("https://example.org/"))
+        .parse(data=cff_file_author_without_orcid)
+        .serialize(format="ttl")
+        == "\n"
+    )
+
+
+def test_doi():
+    cff_file = b"""
+    cff-version: 1.2.0
+    title: gimie
+    doi: 10.5281/zenodo.1234567
+    """
+    assert "https://doi.org/10.5281/zenodo.1234567" in CffParser(
+        subject=URIRef("https://example.org/")
+    ).parse(data=cff_file).serialize(format="ttl")

From 195c7781010ba2f006dc1152fad56fbd55941b94 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Fri, 22 Nov 2024 11:04:19 +0100
Subject: [PATCH 16/31] docs: fix docs of valid_doi_match_extractor

---
 gimie/utils/uri.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/gimie/utils/uri.py b/gimie/utils/uri.py
index 232c307..7fd0e25 100644
--- a/gimie/utils/uri.py
+++ b/gimie/utils/uri.py
@@ -109,17 +109,15 @@ def valid_doi_match_extractor(doi):
 
     Returns
     -------
-    bool:
-        True if the DOI is valid, False otherwise.
+    str:
+        The extracted DOI if it is valid, None otherwise.
 
     Examples
     --------
-    >>> is_valid_doi("10.0000/example.abcd")
-    True
-    >>> is_valid_doi("doi.org/10.0000/example.abcd")
-    False
-    >>> is_valid_doi("https://doi.org/10.0000/example.abcd")
-    False
+    >>> valid_doi_match_extractor("10.5281/zenodo.1234567")
+    '10.5281/zenodo.1234567'
+    >>> valid_doi_match_extractor("https://doi.org/10.5281/zenodo.1234567")
+    '10.5281/zenodo.1234567'
     """
     return re.search(
         r"10.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE

From 03adbf0a496708826c50b6f9d4b99a417ec06756 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Fri, 22 Nov 2024 11:11:44 +0100
Subject: [PATCH 17/31] refactor: doi re matcher

---
 gimie/parsers/cff.py | 4 +---
 gimie/utils/uri.py   | 6 ++++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index 839ac87..6f1211b 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -118,9 +118,7 @@ def doi_to_url(doi: str) -> str:
     if doi_match is None:
         raise ValueError(f"Not a valid DOI: {doi}")
 
-    short_doi = doi_match.group()
-
-    return f"https://doi.org/{short_doi}"
+    return f"https://doi.org/{doi_match}"
 
 
 def get_cff_doi(data: bytes) -> Optional[str]:
diff --git a/gimie/utils/uri.py b/gimie/utils/uri.py
index 7fd0e25..ca206a3 100644
--- a/gimie/utils/uri.py
+++ b/gimie/utils/uri.py
@@ -110,7 +110,7 @@ def valid_doi_match_extractor(doi):
     Returns
     -------
     str:
-        The extracted DOI if it is valid, None otherwise.
+        The extracted short DOI if it is valid, None otherwise.
 
     Examples
     --------
@@ -119,6 +119,8 @@ def valid_doi_match_extractor(doi):
     >>> valid_doi_match_extractor("https://doi.org/10.5281/zenodo.1234567")
     '10.5281/zenodo.1234567'
     """
-    return re.search(
+    match = re.search(
         r"10.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE
     )
+    if match:
+        return match.group()

From 9673aaca885d5fe7db32cb59ee23b11d4990419a Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Fri, 22 Nov 2024 11:12:35 +0100
Subject: [PATCH 18/31] chore: remove unnecessary comment

---
 gimie/parsers/cff.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index 6f1211b..60f5058 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -111,8 +111,6 @@ def doi_to_url(doi: str) -> str:
     'https://doi.org/10.0000/example.abcd'
     """
 
-    # regex from:
-    # https://www.crossref.org/blog/dois-and-matching-regular-expressions
     doi_match = valid_doi_match_extractor(doi)
 
     if doi_match is None:

From 420252ea032ea591940044fccbdadc5f6689e138 Mon Sep 17 00:00:00 2001
From: cmdoret <cyril.mattheydoret@gmail.com>
Date: Thu, 28 Nov 2024 13:22:54 +0100
Subject: [PATCH 19/31] chore(docker): bump base layer to python 3.13

---
 .docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.docker/Dockerfile b/.docker/Dockerfile
index 24e7c04..4ae7c0b 100644
--- a/.docker/Dockerfile
+++ b/.docker/Dockerfile
@@ -1,6 +1,6 @@
 ARG VERSION_BUILD
 
-FROM python:3.10-slim-bullseye as python
+FROM python:3.13-slim-bookworm as python
 ENV PYTHONUNBUFFERED=true
 WORKDIR /app
 

From 2a6272a7d4ce13cfc203e76e9a067f3de6e5ace2 Mon Sep 17 00:00:00 2001
From: Robin Franken <77491494+rmfranken@users.noreply.github.com>
Date: Thu, 28 Nov 2024 14:14:11 +0100
Subject: [PATCH 20/31] Update gimie/parsers/abstract.py

Co-authored-by: Cyril Matthey-Doret <cyril.matthey-doret@epfl.ch>
---
 gimie/parsers/abstract.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gimie/parsers/abstract.py b/gimie/parsers/abstract.py
index 3a06af4..e5376f7 100644
--- a/gimie/parsers/abstract.py
+++ b/gimie/parsers/abstract.py
@@ -31,8 +31,8 @@ class Parser(ABC):
 
     Parameters
     ----------
-    subject : str
-    The subject of a triple (subject - predicate - object) to be used for writing parsed properties to.
+    subject:
+        The subject of a triple (subject - predicate - object) to be used for writing parsed properties to.
     """
 
     def __init__(self, subject: str):

From a331a631a48a3a070f3b5f56b70b0567c4fbea6a Mon Sep 17 00:00:00 2001
From: Robin Franken <77491494+rmfranken@users.noreply.github.com>
Date: Thu, 28 Nov 2024 14:15:03 +0100
Subject: [PATCH 21/31] Update gimie/parsers/cff.py

Co-authored-by: Cyril Matthey-Doret <cyril.matthey-doret@epfl.ch>
---
 gimie/parsers/cff.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index 60f5058..514b15a 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -52,8 +52,8 @@ def parse(self, data: bytes) -> Graph:
         if not authors:
             return extracted_cff_triples
         for author in authors:
-            orcid = URIRef(author["orcid"])
-            if is_valid_orcid(orcid):
+            if is_valid_orcid(author["orcid"]):
+                orcid = URIRef(author["orcid"])
                 extracted_cff_triples.add(
                     (self.subject, SDO.author, URIRef(orcid))
                 )

From 9d6926742b2594afc84c2d713a144af8f2fd87fa Mon Sep 17 00:00:00 2001
From: cmdoret <cyril.mattheydoret@gmail.com>
Date: Thu, 28 Nov 2024 14:34:09 +0100
Subject: [PATCH 22/31] chore(docker): use python 3.12 base

---
 .docker/Dockerfile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.docker/Dockerfile b/.docker/Dockerfile
index 4ae7c0b..5f4d491 100644
--- a/.docker/Dockerfile
+++ b/.docker/Dockerfile
@@ -1,18 +1,18 @@
 ARG VERSION_BUILD
 
-FROM python:3.13-slim-bookworm as python
+FROM python:3.12-slim-bookworm AS python
 ENV PYTHONUNBUFFERED=true
 WORKDIR /app
 
 LABEL org.opencontainers.image.source=https://github.com/sdsc-ordes/gimie
 LABEL org.opencontainers.image.description="Extract linked metadata from repositories."
 LABEL org.opencontainers.image.licenses=Apache-2.0
-LABEL org.opencontainers.image.version ${VERSION_BUILD}
+LABEL org.opencontainers.image.version=${VERSION_BUILD}
 
 ##################################################
 # Poetry setup
 ##################################################
-FROM python as poetry
+FROM python AS poetry
 
 # Install poetry
 ENV POETRY_HOME=/opt/poetry
@@ -36,7 +36,7 @@ RUN poetry install --no-interaction --no-ansi -vvv
 ##################################################
 # Gimie setup
 ##################################################
-FROM python as runtime
+FROM python AS runtime
 ENV PATH="/app/.venv/bin:$PATH"
 RUN apt-get update && \
     apt-get install -y git libgomp1 libmagic-dev

From b91df2a4b4d12ba4afc43b956b325736f7640c91 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Thu, 28 Nov 2024 14:46:57 +0100
Subject: [PATCH 23/31] fix: improve tests, rename some variables

---
 gimie/parsers/cff.py              |  4 +-
 gimie/parsers/license/__init__.py |  8 ++--
 gimie/utils/uri.py                |  2 +-
 tests/test_cff.py                 | 67 +++++++++++++++----------------
 4 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index 60f5058..6480ba9 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -24,7 +24,7 @@
 from gimie import logger
 from gimie.graph.namespaces import SDO, MD4I
 from gimie.parsers.abstract import Parser
-from gimie.utils.uri import is_valid_orcid, valid_doi_match_extractor
+from gimie.utils.uri import is_valid_orcid, extract_doi_match
 
 
 class CffParser(Parser):
@@ -111,7 +111,7 @@ def doi_to_url(doi: str) -> str:
     'https://doi.org/10.0000/example.abcd'
     """
 
-    doi_match = valid_doi_match_extractor(doi)
+    doi_match = extract_doi_match(doi)
 
     if doi_match is None:
         raise ValueError(f"Not a valid DOI: {doi}")
diff --git a/gimie/parsers/license/__init__.py b/gimie/parsers/license/__init__.py
index fa4bedc..db0de2a 100644
--- a/gimie/parsers/license/__init__.py
+++ b/gimie/parsers/license/__init__.py
@@ -41,14 +41,12 @@ def parse(self, data: bytes) -> Graph:
         graph with a single triple <url> <schema:license> <spdx_url>.
         If no matching URL is found, an empty graph is returned.
         """
-        extracted_license_triple = Graph()
+        license_facts = Graph()
         license_url = match_license(data)
 
         if license_url:
-            extracted_license_triple.add(
-                (self.subject, SDO.license, URIRef(license_url))
-            )
-        return extracted_license_triple
+            license_facts.add((self.subject, SDO.license, URIRef(license_url)))
+        return license_facts
 
 
 def match_license(data: bytes, min_similarity: float = 0.9) -> Optional[str]:
diff --git a/gimie/utils/uri.py b/gimie/utils/uri.py
index ca206a3..c94bdf6 100644
--- a/gimie/utils/uri.py
+++ b/gimie/utils/uri.py
@@ -98,7 +98,7 @@ def is_valid_orcid(orcid):
     )
 
 
-def valid_doi_match_extractor(doi):
+def extract_doi_match(doi):
     """Extracts doi from the input if it contains a valid DOI according to definition from crossref.org [1]_.
     .. [1] [https://www.crossref.org/blog/dois-and-matching-regular-expressions](https://www.crossref.org/blog/dois-and-matching-regular-expressions)
 
diff --git a/tests/test_cff.py b/tests/test_cff.py
index e844b40..66444de 100644
--- a/tests/test_cff.py
+++ b/tests/test_cff.py
@@ -1,7 +1,8 @@
 from gimie.io import LocalResource
 from gimie.parsers import CffParser
 from gimie.parsers.cff import get_cff_authors
-from rdflib import URIRef
+from rdflib import URIRef, Literal
+import pytest
 
 
 def test_parse_cff():
@@ -12,12 +13,17 @@ def test_parse_cff():
     assert authors is not None
 
 
-def test_broken_cff():
-    cff_file_emptyish = b"""
+@pytest.mark.parametrize(
+    "cff_file",
+    [
+        (
+            b"""
     cff-version: 1.2.0
     message: "This is a CFF devoid authors or DOI"
     """
-    cff_file_bad_syntax = b"""
+        ),
+        (
+            b"""
     cff-version: 1.2.0
     title: gimie :
     authors:
@@ -28,7 +34,9 @@ def test_broken_cff():
     Jane
         orcid: 0000-0001-2345-6789
     """
-    cff_file_broken_orcid = b"""
+        ),
+        (
+            b"""
     cff-version: 1.2.0
     title: gimie
     authors:
@@ -39,47 +47,38 @@ def test_broken_cff():
         given-names: Jane
         orcid: http://www.orcid.org/0000-0001-2345-6789
     """
-    cff_file_author_without_orcid = b"""
+        ),
+        (
+            b"""
     cff-version: 1.2.0
     title: gimie
     authors:
       - family-names: Doe
         given-names: John
     """
-
-    # empty graph = \n according to rdflib
+        ),
+    ],
+)
+def test_broken_cff(cff_file):
     assert (
-        CffParser(subject=URIRef("https://example.org/"))
-        .parse(data=cff_file_emptyish)
-        .serialize(format="ttl")
-        == "\n"
-    )
-    assert (
-        CffParser(subject=URIRef("https://example.org/"))
-        .parse(data=cff_file_bad_syntax)
-        .serialize(format="ttl")
-        == "\n"
-    )
-    assert (
-        CffParser(subject=URIRef("https://example.org/"))
-        .parse(data=cff_file_broken_orcid)
-        .serialize(format="ttl")
-        == "\n"
-    )
-    assert (
-        CffParser(subject=URIRef("https://example.org/"))
-        .parse(data=cff_file_author_without_orcid)
-        .serialize(format="ttl")
-        == "\n"
+        len(
+            CffParser(subject=URIRef("https://example.org/")).parse(
+                data=cff_file
+            )
+        )
+        == 0
     )
 
 
-def test_doi():
+def test_parse_doi():
     cff_file = b"""
     cff-version: 1.2.0
     title: gimie
     doi: 10.5281/zenodo.1234567
     """
-    assert "https://doi.org/10.5281/zenodo.1234567" in CffParser(
-        subject=URIRef("https://example.org/")
-    ).parse(data=cff_file).serialize(format="ttl")
+    obj = next(
+        CffParser(subject=URIRef("https://example.org/"))
+        .parse(data=cff_file)
+        .objects()
+    )
+    assert URIRef("https://doi.org/10.5281/zenodo.1234567") == obj

From 477ab88b5b22a6b993642b675d11c73a944c5faa Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Thu, 28 Nov 2024 14:54:09 +0100
Subject: [PATCH 24/31] fix: rename the example in extract_doi_match

---
 gimie/utils/uri.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gimie/utils/uri.py b/gimie/utils/uri.py
index c94bdf6..163be3b 100644
--- a/gimie/utils/uri.py
+++ b/gimie/utils/uri.py
@@ -114,9 +114,9 @@ def extract_doi_match(doi):
 
     Examples
     --------
-    >>> valid_doi_match_extractor("10.5281/zenodo.1234567")
+    >>> extract_doi_match("10.5281/zenodo.1234567")
     '10.5281/zenodo.1234567'
-    >>> valid_doi_match_extractor("https://doi.org/10.5281/zenodo.1234567")
+    >>> extract_doi_match("https://doi.org/10.5281/zenodo.1234567")
     '10.5281/zenodo.1234567'
     """
     match = re.search(

From 0eb1423588d511e3ce5cfb73a972a3fd25f0202d Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Mon, 16 Dec 2024 16:24:09 +0100
Subject: [PATCH 25/31] fix: DOI from dict, not flat value

---
 gimie/parsers/cff.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index ef76ad2..7b8d935 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -145,9 +145,15 @@ def get_cff_doi(data: bytes) -> Optional[str]:
     except yaml.scanner.ScannerError:
         logger.warning("cannot read CITATION.cff, skipped.")
         return None
-
     try:
-        doi_url = doi_to_url(cff["doi"])
+        identifiers = cff.get("identifiers", [])
+        doi_identifier = next(
+            (id for id in identifiers if id.get("type") == "doi"), None
+        )
+        if doi_identifier:
+            doi_url = doi_to_url(doi_identifier["value"])
+        else:
+            raise KeyError("No DOI found in identifiers")
     # No doi in cff file
     except (KeyError, TypeError):
         logger.warning("CITATION.cff does not contain a 'doi' key.")

From 43fdf514ce082f4974ff255a9f48a700aa2e54b9 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Mon, 16 Dec 2024 16:31:47 +0100
Subject: [PATCH 26/31] fix:make cff example correct

---
 tests/test_cff.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/test_cff.py b/tests/test_cff.py
index 66444de..118b124 100644
--- a/tests/test_cff.py
+++ b/tests/test_cff.py
@@ -73,12 +73,15 @@ def test_broken_cff(cff_file):
 def test_parse_doi():
     cff_file = b"""
     cff-version: 1.2.0
-    title: gimie
-    doi: 10.5281/zenodo.1234567
+    message: If you use this software, please cite it using these metadata.
+    title: 'napari: a multi-dimensional image viewer for Python'
+    identifiers:
+    - type: doi
+      value: 10.5281/zenodo.3555620
     """
     obj = next(
         CffParser(subject=URIRef("https://example.org/"))
         .parse(data=cff_file)
         .objects()
     )
-    assert URIRef("https://doi.org/10.5281/zenodo.1234567") == obj
+    assert URIRef("https://doi.org/10.5281/zenodo.3555620") == obj

From 343defd5f6ea91f2520d74167fba5b40cdeed995 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Mon, 16 Dec 2024 16:40:11 +0100
Subject: [PATCH 27/31] docs: adapt docstring example to match real CFF
 structure

---
 gimie/parsers/cff.py | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index 7b8d935..30cb77d 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -122,21 +122,23 @@ def doi_to_url(doi: str) -> str:
 def get_cff_doi(data: bytes) -> Optional[str]:
     """Given a CFF file, returns the DOI, if any.
 
-    Parameters
-    ----------
-    data
-        The cff file body as bytes.
-
-    Returns
-    -------
-    str, optional
-        doi formatted as a valid url
-
-    Examples
-    --------
-    >>> get_cff_doi(bytes("doi:   10.5281/zenodo.1234", encoding="utf8"))
-    'https://doi.org/10.5281/zenodo.1234'
-    >>> get_cff_doi(bytes("abc: def", encoding="utf8"))
+        Parameters
+        ----------
+        data
+            The cff file body as bytes.
+
+        Returns
+        -------
+        str, optional
+            doi formatted as a valid url
+
+        Examples
+        --------
+        >>> get_cff_doi(bytes("identifiers:
+    - type: doi
+      value: 10.5281/zenodo.1234", encoding="utf8"))
+        'https://doi.org/10.5281/zenodo.1234'
+        >>> get_cff_doi(bytes("abc: def", encoding="utf8"))
 
     """
 

From 513a54648dd09e66faf2b3b6180267c8044c5200 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Mon, 16 Dec 2024 16:43:34 +0100
Subject: [PATCH 28/31] fix: docstring still broken

---
 gimie/parsers/cff.py | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index 30cb77d..f9e553a 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -122,23 +122,25 @@ def doi_to_url(doi: str) -> str:
 def get_cff_doi(data: bytes) -> Optional[str]:
     """Given a CFF file, returns the DOI, if any.
 
-        Parameters
-        ----------
-        data
-            The cff file body as bytes.
-
-        Returns
-        -------
-        str, optional
-            doi formatted as a valid url
-
-        Examples
-        --------
-        >>> get_cff_doi(bytes("identifiers:
+    Parameters
+    ----------
+    data
+        The cff file body as bytes.
+
+    Returns
+    -------
+    str, optional
+        doi formatted as a valid url
+
+    Examples
+    --------
+    >>> get_cff_doi(bytes(
+    "identifiers:
     - type: doi
       value: 10.5281/zenodo.1234", encoding="utf8"))
-        'https://doi.org/10.5281/zenodo.1234'
-        >>> get_cff_doi(bytes("abc: def", encoding="utf8"))
+        'https://doi.org/10.5281/zenodo.1234'", encoding="utf8"))
+    'https://doi.org/10.5281/zenodo.1234'
+    >>> get_cff_doi(bytes("abc: def", encoding="utf8"))
 
     """
 

From d650e6a50f84c578c962d3cd0a007c0bdbd169de Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Mon, 16 Dec 2024 16:46:10 +0100
Subject: [PATCH 29/31] fix:typo docstring

---
 gimie/parsers/cff.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index f9e553a..2af8676 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -138,7 +138,6 @@ def get_cff_doi(data: bytes) -> Optional[str]:
     "identifiers:
     - type: doi
       value: 10.5281/zenodo.1234", encoding="utf8"))
-        'https://doi.org/10.5281/zenodo.1234'", encoding="utf8"))
     'https://doi.org/10.5281/zenodo.1234'
     >>> get_cff_doi(bytes("abc: def", encoding="utf8"))
 

From 077ad377bc9ff03783c66839a6f3e0c8add28a14 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Mon, 16 Dec 2024 16:51:47 +0100
Subject: [PATCH 30/31] fix:chatGPT's suggestion for docstring formatting

---
 gimie/parsers/cff.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index 2af8676..cfec3bc 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -134,13 +134,9 @@ def get_cff_doi(data: bytes) -> Optional[str]:
 
     Examples
     --------
-    >>> get_cff_doi(bytes(
-    "identifiers:
-    - type: doi
-      value: 10.5281/zenodo.1234", encoding="utf8"))
+    >>> get_cff_doi(bytes("identifiers:\n- type: doi\n  value: 10.5281/zenodo.1234", encoding="utf8"))
     'https://doi.org/10.5281/zenodo.1234'
     >>> get_cff_doi(bytes("abc: def", encoding="utf8"))
-
     """
 
     try:

From 5a16e3c3f753f47cb55130473f077bc342986289 Mon Sep 17 00:00:00 2001
From: rmfranken <robin.franken@epfl.ch>
Date: Mon, 16 Dec 2024 17:08:05 +0100
Subject: [PATCH 31/31] fix:OK, no multiline, and double escape newlines

---
 gimie/parsers/cff.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
index cfec3bc..09ae71e 100644
--- a/gimie/parsers/cff.py
+++ b/gimie/parsers/cff.py
@@ -134,7 +134,7 @@ def get_cff_doi(data: bytes) -> Optional[str]:
 
     Examples
     --------
-    >>> get_cff_doi(bytes("identifiers:\n- type: doi\n  value: 10.5281/zenodo.1234", encoding="utf8"))
+    >>> get_cff_doi(bytes("identifiers:\\n    - type: doi\\n      value: 10.5281/zenodo.1234", encoding="utf8"))
     'https://doi.org/10.5281/zenodo.1234'
     >>> get_cff_doi(bytes("abc: def", encoding="utf8"))
     """