From fada9d4adae7de75d7960a2f3014bb90a6b15ab7 Mon Sep 17 00:00:00 2001 From: Robin Franken <77491494+rmfranken@users.noreply.github.com> Date: Thu, 14 Nov 2024 15:42:41 +0100 Subject: [PATCH 01/31] docs: add CFF file (#111) Adds citation file (cff) to gimie repository for documentation and testing purposes. --- CITATION.cff | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..0f6d174 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,44 @@ +# This CITATION.cff file was generated with cffinit. +# Visit https://bit.ly/cffinit to generate yours today! + +cff-version: 1.2.0 +title: gimie +message: >- + If you use this software, please cite it using the + metadata from this file. +type: software +authors: + - given-names: Cyril + family-names: Matthey-Doret + affiliation: Swiss Data Science Center + orcid: 'https://orcid.org/0000-0002-1126-1535' + - given-names: Sabine + family-names: Maennel + orcid: 'https://orcid.org/0009-0001-3022-8239' + affiliation: Swiss Data Science Center + - given-names: Robin + family-names: Franken + orcid: 'https://orcid.org/0009-0008-0143-9118' + affiliation: Swiss Data Science Center + - given-names: Martin + family-names: Fontanet + orcid: 'https://orcid.org/0000-0002-6441-8540' + affiliation: Swiss Data Science Center + - given-names: Laure + family-names: Vancauwenberghe + affiliation: Swiss Data Science Center + - given-names: Stefan + family-names: Milosavljevic + email: supermegaiperste@hotmail.com + affiliation: Swiss Data Science Center +repository-code: 'https://github.com/sdsc-ordes/gimie' +abstract: Extract linked metadata from repositories +keywords: + - git + - cli + - library + - linked-open-data + - metadata-extraction + - fair-data + - scientific-software +license: Apache-2.0 From f1b7e0bb653ebdd32c6356c6f7273d2053ae2f54 Mon Sep 17 00:00:00 2001 From: Robin Franken 
<77491494+rmfranken@users.noreply.github.com> Date: Tue, 19 Nov 2024 07:46:38 +0100 Subject: [PATCH 02/31] fix: spelling mistake in run as library docs (#113) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ef023b5..88477e9 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ gimie data https://github.com/numpy/numpy ```python from gimie.project import Project -proj = Project("https://github.com/numpy/numpy) +proj = Project("https://github.com/numpy/numpy") # To retrieve the rdflib.Graph object g = proj.extract() From 5d0a1dc06b6b2234d1932c8f939738c9c6772933 Mon Sep 17 00:00:00 2001 From: raj kumar <79806602+raj921@users.noreply.github.com> Date: Tue, 19 Nov 2024 14:21:28 +0530 Subject: [PATCH 03/31] Improve authentication error messages (#116) This commit improves the error messages shown when authentication fails: 1. Added specific error messages for: - Missing GitHub token - Invalid GitHub token - API rate limit exceeded 2. Added clear guidance on how to fix authentication issues: - Instructions to set GITHUB_TOKEN/GITLAB_TOKEN - Suggestion to check token validity 3. Improved error handling in GitHub extractor: - Better token validation in _headers method - More descriptive error messages - Proper handling of network connection issues Example error messages: - 'GitHub token not found. Please set the GITHUB_TOKEN environment variable...' - 'GitHub authentication failed. Please check that your GITHUB_TOKEN is valid' - 'Authentication failed: API rate limit exceeded. Please check...' This makes it easier for users to understand and fix authentication issues. 
Co-authored-by: Robin Franken <77491494+rmfranken@users.noreply.github.com> --- gimie/extractors/common/queries.py | 16 ++++++++++++++-- gimie/extractors/github.py | 16 +++++++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/gimie/extractors/common/queries.py b/gimie/extractors/common/queries.py index 3414b95..dcc6d53 100644 --- a/gimie/extractors/common/queries.py +++ b/gimie/extractors/common/queries.py @@ -28,7 +28,13 @@ def send_rest_query( ) if resp.status_code != 200: - raise ConnectionError(resp.json()["message"]) + error_msg = resp.json().get("message", "") + if "API rate limit exceeded" in error_msg: + raise ConnectionError( + "Authentication failed: API rate limit exceeded. Please check that you have added " + "your GITHUB_TOKEN or GITLAB_TOKEN to your environment variables." + ) + raise ConnectionError(f"API request failed: {error_msg}") return resp.json() @@ -46,5 +52,11 @@ def send_graphql_query( ) if resp.status_code != 200: - raise ConnectionError(resp.json()["message"]) + error_msg = resp.json().get("message", "") + if "API rate limit exceeded" in error_msg: + raise ConnectionError( + "Authentication failed: API rate limit exceeded. Please check that you have added " + "your GITHUB_TOKEN or GITLAB_TOKEN to your environment variables." + ) + raise ConnectionError(f"API request failed: {error_msg}") return resp.json() diff --git a/gimie/extractors/github.py b/gimie/extractors/github.py index 96d56f6..fb28afa 100644 --- a/gimie/extractors/github.py +++ b/gimie/extractors/github.py @@ -256,15 +256,21 @@ def _headers(self) -> Any: try: if not self.token: self.token = os.environ.get("GITHUB_TOKEN") - assert self.token + if not self.token: + raise ValueError( + "GitHub token not found. Please set the GITHUB_TOKEN environment variable " + "with your GitHub personal access token." 
+ ) headers = {"Authorization": f"token {self.token}"} login = requests.get(f"{GH_API}/user", headers=headers) - assert login.json().get("login") - except AssertionError: - return {} - else: + if not login.ok or not login.json().get("login"): + raise ValueError( + "GitHub authentication failed. Please check that your GITHUB_TOKEN is valid." + ) return headers + except requests.exceptions.RequestException as e: + raise ConnectionError(f"Failed to connect to GitHub API: {str(e)}") def _get_keywords(self, *nodes: Dict[str, Any]) -> List[str]: """Extract names from GraphQL topic nodes.""" From 1866420f39b15312e6f73cab60c73f8e3ca677d3 Mon Sep 17 00:00:00 2001 From: Cyril Matthey-Doret Date: Tue, 19 Nov 2024 10:07:23 +0100 Subject: [PATCH 04/31] ci: make conventional PR title check optional (#117) --- .github/workflows/conventional-prs.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/conventional-prs.yml b/.github/workflows/conventional-prs.yml index d45ea2e..1de34c7 100644 --- a/.github/workflows/conventional-prs.yml +++ b/.github/workflows/conventional-prs.yml @@ -1,4 +1,4 @@ -name: PR +name: Conventional PR title on: pull_request_target: types: @@ -12,7 +12,9 @@ jobs: runs-on: ubuntu-latest steps: # https://github.com/amannn/action-semantic-pull-request - - uses: amannn/action-semantic-pull-request@v5.3.0 + - name: PR title format check + uses: amannn/action-semantic-pull-request@v5.3.0 + continue-on-error: true env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: From a91b59c991277893d4dbd702112b9b8690e6c219 Mon Sep 17 00:00:00 2001 From: Robin Franken <77491494+rmfranken@users.noreply.github.com> Date: Wed, 20 Nov 2024 09:44:17 +0100 Subject: [PATCH 05/31] fix: unused import Co-authored-by: Cyril Matthey-Doret --- gimie/parsers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gimie/parsers/__init__.py b/gimie/parsers/__init__.py index 599c49b..ef5a31c 100644 --- a/gimie/parsers/__init__.py +++ 
b/gimie/parsers/__init__.py @@ -24,7 +24,7 @@ from gimie.parsers.license import LicenseParser, is_license_filename from gimie.parsers.cff import CffParser -from rdflib import Graph, URIRef +from rdflib import Graph class ParserInfo(NamedTuple): From b088a451c4ebcea1afbe8c0d82e810b6651c187e Mon Sep 17 00:00:00 2001 From: Robin Franken <77491494+rmfranken@users.noreply.github.com> Date: Wed, 20 Nov 2024 10:15:20 +0100 Subject: [PATCH 06/31] fix: typo Co-authored-by: Cyril Matthey-Doret --- gimie/parsers/cff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index ba950ae..8bef575 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -27,7 +27,7 @@ class CffParser(Parser): - """Parse DOI and authorsfrom CITATION.cff into schema:citation . and schema:""" + """Parse DOI and authors from CITATION.cff.""" def __init__(self, subject: str): super().__init__(subject) From d3eb1f4f17d30c8d596ab1c4fb0969cb76b621a3 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Wed, 20 Nov 2024 10:22:58 +0100 Subject: [PATCH 07/31] refactor: rename variable --- gimie/parsers/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gimie/parsers/__init__.py b/gimie/parsers/__init__.py index ef5a31c..8abaa49 100644 --- a/gimie/parsers/__init__.py +++ b/gimie/parsers/__init__.py @@ -104,11 +104,11 @@ def parse_files( parsers: A set of parser names. If None, use the default collection. 
""" - new_graph = Graph() + parsed_properties = Graph() for file in files: parser = select_parser(file.path, parsers) if not parser: continue data = file.open().read() - new_graph |= parser(subject).parse(data or b"") - return new_graph + parsed_properties |= parser(subject).parse(data or b"") + return parsed_properties From b81b5b648a3230b6502b251abb84fc21e8357438 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Wed, 20 Nov 2024 10:26:01 +0100 Subject: [PATCH 08/31] docs: add docstring parameter for parser class --- gimie/parsers/abstract.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gimie/parsers/abstract.py b/gimie/parsers/abstract.py index b9508d9..3a06af4 100644 --- a/gimie/parsers/abstract.py +++ b/gimie/parsers/abstract.py @@ -22,11 +22,17 @@ class Parser(ABC): - """Parser is an Abstract Base Class. It is only meant + """ + Parser is an Abstract Base Class. It is only meant to define a standard interface for all parsers. All subclasses must implement parse(). A parser parses bytes data into a set of predicate-object tuples. + + Parameters + ---------- + subject : str + The subject of a triple (subject - predicate - object) to be used for writing parsed properties to. """ def __init__(self, subject: str): From 5238b6449dc478578b696abfb53fdcef3f537236 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Wed, 20 Nov 2024 10:28:05 +0100 Subject: [PATCH 09/31] refactor: rename variable --- gimie/parsers/cff.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index 8bef575..e0d1202 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -40,20 +40,22 @@ def parse(self, data: bytes) -> Graph: If no authors are found, it will not be included in the graph. If neither authors nor DOI are found, an empty graph is returned. 
""" - rdf_graph = Graph() + extracted_cff_triples = Graph() doi = get_cff_doi(data) authors = get_cff_authors(data) if doi: - rdf_graph.add((self.subject, SDO.citation, URIRef(doi))) + extracted_cff_triples.add( + (self.subject, SDO.citation, URIRef(doi)) + ) if not authors: - return rdf_graph + return extracted_cff_triples for author in authors: if author["orcid"]: - rdf_graph.add( + extracted_cff_triples.add( (self.subject, SDO.author, URIRef(author["orcid"])) ) - rdf_graph.add( + extracted_cff_triples.add( ( URIRef(author["orcid"]), SDO.name, @@ -64,22 +66,24 @@ def parse(self, data: bytes) -> Graph: ), ) ) - rdf_graph.add( + extracted_cff_triples.add( ( URIRef(author["orcid"]), MD4I.orcidId, Literal(author["orcid"]), ) ) - rdf_graph.add( + extracted_cff_triples.add( ( URIRef(author["orcid"]), SDO.affiliation, Literal(author["affiliation"]), ) ) - rdf_graph.add((URIRef(author["orcid"]), RDF.type, SDO.Person)) - return rdf_graph + extracted_cff_triples.add( + (URIRef(author["orcid"]), RDF.type, SDO.Person) + ) + return extracted_cff_triples def doi_to_url(doi: str) -> str: From 5645f6d7df6757652b838adca03e921b0cb88ca5 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Wed, 20 Nov 2024 11:28:44 +0100 Subject: [PATCH 10/31] feat: check if orcid is valid before writing --- gimie/parsers/cff.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index e0d1202..b6aa2e2 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -51,13 +51,16 @@ def parse(self, data: bytes) -> Graph: if not authors: return extracted_cff_triples for author in authors: - if author["orcid"]: + orcid = URIRef(author["orcid"]) + if re.match( + r"https:\/\/orcid.org\/\d{4}-\d{4}-\d{4}-\d{4}", str(orcid) + ): extracted_cff_triples.add( - (self.subject, SDO.author, URIRef(author["orcid"])) + (self.subject, SDO.author, URIRef(orcid)) ) extracted_cff_triples.add( ( - URIRef(author["orcid"]), + URIRef(orcid), SDO.name, 
Literal( author["given-names"] @@ -68,21 +71,19 @@ def parse(self, data: bytes) -> Graph: ) extracted_cff_triples.add( ( - URIRef(author["orcid"]), + orcid, MD4I.orcidId, - Literal(author["orcid"]), + Literal(orcid), ) ) extracted_cff_triples.add( ( - URIRef(author["orcid"]), + orcid, SDO.affiliation, Literal(author["affiliation"]), ) ) - extracted_cff_triples.add( - (URIRef(author["orcid"]), RDF.type, SDO.Person) - ) + extracted_cff_triples.add((orcid, RDF.type, SDO.Person)) return extracted_cff_triples From 6a25951ffb262e2acf9be89d47a1424ebddb03e9 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Wed, 20 Nov 2024 11:31:47 +0100 Subject: [PATCH 11/31] refactor: rename variable --- gimie/parsers/license/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gimie/parsers/license/__init__.py b/gimie/parsers/license/__init__.py index d19eef4..fa4bedc 100644 --- a/gimie/parsers/license/__init__.py +++ b/gimie/parsers/license/__init__.py @@ -41,12 +41,14 @@ def parse(self, data: bytes) -> Graph: graph with a single triple . If no matching URL is found, an empty graph is returned. 
""" - new_graph = Graph() + extracted_license_triple = Graph() license_url = match_license(data) if license_url: - new_graph.add((self.subject, SDO.license, URIRef(license_url))) - return new_graph + extracted_license_triple.add( + (self.subject, SDO.license, URIRef(license_url)) + ) + return extracted_license_triple def match_license(data: bytes, min_similarity: float = 0.9) -> Optional[str]: From f7a1165d5a17245924c23b1604c7c1946ab46470 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Wed, 20 Nov 2024 11:44:12 +0100 Subject: [PATCH 12/31] chore: remove pyshacl --- poetry.lock | 107 +------------------------------------------ pyproject.toml | 2 - tests/test_output.py | 24 +--------- 3 files changed, 3 insertions(+), 130 deletions(-) diff --git a/poetry.lock b/poetry.lock index ae5ff2b..a76846e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -510,27 +510,6 @@ gitdb = ">=4.0.1,<5" doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"] test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"] -[[package]] -name = "html5lib" -version = "1.1" -description = "HTML parser based on the WHATWG HTML specification" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -files = [ - {file = "html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d"}, - {file = "html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f"}, -] - -[package.dependencies] -six = ">=1.9" -webencodings = "*" - -[package.extras] -all = ["chardet (>=2.2)", "genshi", "lxml"] -chardet = ["chardet (>=2.2)"] -genshi = ["genshi"] 
-lxml = ["lxml"] - [[package]] name = "identify" version = "2.6.2" @@ -1085,20 +1064,6 @@ files = [ {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, ] -[[package]] -name = "owlrl" -version = "6.0.2" -description = "OWL-RL and RDFS based RDF Closure inferencing for Python" -optional = false -python-versions = "*" -files = [ - {file = "owlrl-6.0.2-py3-none-any.whl", hash = "sha256:57eca06b221edbbc682376c8d42e2ddffc99f61e82c0da02e26735592f08bacc"}, - {file = "owlrl-6.0.2.tar.gz", hash = "sha256:904e3310ff4df15101475776693d2427d1f8244ee9a6a9f9e13c3c57fae90b74"}, -] - -[package.dependencies] -rdflib = ">=6.0.2" - [[package]] name = "packaging" version = "24.2" @@ -1170,23 +1135,6 @@ nodeenv = ">=0.11.1" pyyaml = ">=5.1" virtualenv = ">=20.10.0" -[[package]] -name = "prettytable" -version = "3.12.0" -description = "A simple Python library for easily displaying tabular data in a visually appealing ASCII table format" -optional = false -python-versions = ">=3.9" -files = [ - {file = "prettytable-3.12.0-py3-none-any.whl", hash = "sha256:77ca0ad1c435b6e363d7e8623d7cc4fcf2cf15513bf77a1c1b2e814930ac57cc"}, - {file = "prettytable-3.12.0.tar.gz", hash = "sha256:f04b3e1ba35747ac86e96ec33e3bb9748ce08e254dc2a1c6253945901beec804"}, -] - -[package.dependencies] -wcwidth = "*" - -[package.extras] -tests = ["pytest", "pytest-cov", "pytest-lazy-fixtures"] - [[package]] name = "pydantic" version = "2.9.2" @@ -1202,8 +1150,8 @@ files = [ annotated-types = ">=0.6.0" pydantic-core = "2.23.4" typing-extensions = [ - {version = ">=4.6.1", markers = "python_version < \"3.13\""}, {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, + {version = ">=4.6.1", markers = "python_version < \"3.13\""}, ] [package.extras] @@ -1378,35 +1326,6 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] -[[package]] -name = "pyshacl" -version = "0.26.0" -description = "Python SHACL Validator" -optional = false 
-python-versions = "<4.0.0,>=3.8.1" -files = [ - {file = "pyshacl-0.26.0-py3-none-any.whl", hash = "sha256:a4bef4296d56305a30e0a97509e541ebe4f2cc2d5da73536d0541233e28f2d22"}, - {file = "pyshacl-0.26.0.tar.gz", hash = "sha256:48d44f317cd9aad8e3fdb5df8aa5706fa92dc6b2746419698035e84a320fb89d"}, -] - -[package.dependencies] -html5lib = ">=1.1,<2" -importlib-metadata = {version = ">6", markers = "python_version < \"3.12\""} -owlrl = ">=6.0.2,<7" -packaging = ">=21.3" -prettytable = [ - {version = ">=3.5.0", markers = "python_version >= \"3.8\" and python_version < \"3.12\""}, - {version = ">=3.7.0", markers = "python_version >= \"3.12\""}, -] -rdflib = {version = ">=6.3.2,<8.0", markers = "python_full_version >= \"3.8.1\""} - -[package.extras] -dev-coverage = ["coverage (>6.1,!=6.1.1,<7)", "platformdirs", "pytest-cov (>=2.8.1,<3.0.0)"] -dev-lint = ["black (==24.3.0)", "platformdirs", "ruff (>=0.1.5,<0.2.0)"] -dev-type-checking = ["mypy (>=0.812,<0.900)", "mypy (>=0.900,<0.1000)", "platformdirs", "types-setuptools"] -http = ["sanic (>=22.12,<23)", "sanic-cors (==2.2.0)", "sanic-ext (>=23.3,<23.6)"] -js = ["pyduktape2 (>=0.4.6,<0.5.0)"] - [[package]] name = "pytest" version = "7.4.4" @@ -1979,28 +1898,6 @@ platformdirs = ">=3.9.1,<5" docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] -[[package]] -name = "wcwidth" -version = "0.2.13" -description = "Measures the displayed width of unicode strings in a terminal" -optional = false -python-versions = "*" -files = [ - {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = 
"sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, - {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, -] - -[[package]] -name = "webencodings" -version = "0.5.1" -description = "Character encoding aliases for legacy web content" -optional = false -python-versions = "*" -files = [ - {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, - {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, -] - [[package]] name = "zipp" version = "3.21.0" @@ -2023,4 +1920,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "0ef42bb4efef0321a95a0108c759ceb45de4f6702428e5869ebc18ced68cf3c0" +content-hash = "ddcf7954deabd2ca49a8e05483c0017d1e910f7ce7caa166c7f237c0dc80ea10" diff --git a/pyproject.toml b/pyproject.toml index e5150e3..9872ae1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,8 +27,6 @@ classifiers = [ python = ">=3.9,<4.0" gitpython = ">=3.1.35" PyDriller = "^2.5" -pyshacl = "^0.26.0" -# temporarily disabled due to installation problems typer = "^0.7.0" calamus = "^0.4.2" requests = "^2.28.2" diff --git a/tests/test_output.py b/tests/test_output.py index 991ab87..819ab80 100644 --- a/tests/test_output.py +++ b/tests/test_output.py @@ -15,8 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Test the gimie output""" -# from pyshacl import validate -# import pytest +import pytest from rdflib import Graph from gimie.project import Project @@ -33,24 +32,3 @@ def test_validate_output_is_linked_data(): """Is output valid RDF?""" g = Graph().parse(format="ttl", data=OUT_TTL) assert g is not None - - -# @pytest.mark.skip("not yet implemented") -# def test_output_conforms_shapes(): -# """Does graph conform SHACL shapes graph?""" -# with open("shaclgraph.ttl") as shapes: -# shapes_graph = Graph().parse(shapes.read()) -# valid_graph, _, _ = validate( -# data_graph=Graph().parse(data=OUT_TTL), -# shacl_graph=shapes_graph, -# ont_graph=None, -# inference="rdfs", -# abort_on_first=False, -# allow_infos=False, -# allow_warnings=False, -# meta_shacl=False, -# advanced=False, -# js=False, -# debug=False, -# ) -# assert valid_graph From 5aba09d81481860ac7f0dd048f9c2c2bb126a256 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Wed, 20 Nov 2024 11:45:36 +0100 Subject: [PATCH 13/31] fix: typo --- tests/test_parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 1e9de8f..566dd9c 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -28,5 +28,5 @@ def test_parse_license(): def test_parse_nothing(): folder = LocalResource("tests") - graph = parse_files(subject=URIRef("https://exmaple.org/"), files=[folder]) + graph = parse_files(subject=URIRef("https://example.org/"), files=[folder]) assert len(graph) == 0 From aef27ddce5f53da2d3502309276fa892990567c2 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Wed, 20 Nov 2024 11:47:16 +0100 Subject: [PATCH 14/31] fix: remove unused imports --- gimie/parsers/cff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index b6aa2e2..51334e7 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -19,11 +19,11 @@ from typing import List, Optional, Set import yaml from rdflib.term import 
URIRef -from rdflib import Graph, BNode, URIRef, Literal +from rdflib import Graph, URIRef, Literal from rdflib.namespace import RDF from gimie import logger from gimie.graph.namespaces import SDO, MD4I -from gimie.parsers.abstract import Parser, Property +from gimie.parsers.abstract import Parser class CffParser(Parser): From ee9238e67f6a9d0f8178cb3df917913d808da4e9 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Fri, 22 Nov 2024 10:56:54 +0100 Subject: [PATCH 15/31] fix: tests for cff, add test for doi, move doi and orcid matchers to utils --- gimie/parsers/cff.py | 9 ++---- gimie/utils/uri.py | 60 +++++++++++++++++++++++++++++++++++ tests/test_cff.py | 75 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 138 insertions(+), 6 deletions(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index 51334e7..839ac87 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -24,6 +24,7 @@ from gimie import logger from gimie.graph.namespaces import SDO, MD4I from gimie.parsers.abstract import Parser +from gimie.utils.uri import is_valid_orcid, valid_doi_match_extractor class CffParser(Parser): @@ -52,9 +53,7 @@ def parse(self, data: bytes) -> Graph: return extracted_cff_triples for author in authors: orcid = URIRef(author["orcid"]) - if re.match( - r"https:\/\/orcid.org\/\d{4}-\d{4}-\d{4}-\d{4}", str(orcid) - ): + if is_valid_orcid(orcid): extracted_cff_triples.add( (self.subject, SDO.author, URIRef(orcid)) ) @@ -114,9 +113,7 @@ def doi_to_url(doi: str) -> str: # regex from: # https://www.crossref.org/blog/dois-and-matching-regular-expressions - doi_match = re.search( - r"10.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE - ) + doi_match = valid_doi_match_extractor(doi) if doi_match is None: raise ValueError(f"Not a valid DOI: {doi}") diff --git a/gimie/utils/uri.py b/gimie/utils/uri.py index 738ce6f..232c307 100644 --- a/gimie/utils/uri.py +++ b/gimie/utils/uri.py @@ -18,6 +18,7 @@ from typing import List, Literal from urllib.parse import 
urlparse +import re from gimie.graph.namespaces import GIMIE @@ -64,3 +65,62 @@ def generate_uri(ref: str): 'https://sdsc-ordes.github.io/gimie/abc' """ return str(GIMIE[ref]) + + +def is_valid_orcid(orcid): + """Check if the input is a valid ORCID according to definition from orcid.org [1]_. + .. [1] [https://support.orcid.org/hc/en-us/articles/360006897674-Structure-of-the-ORCID-Identifier](https://support.orcid.org/hc/en-us/articles/360006897674-Structure-of-the-ORCID-Identifier) + + Parameters + ---------- + orcid: + The ORCID to validate. + + Returns + ------- + bool: + True if the ORCID is valid, False otherwise. + + Examples + -------- + >>> is_valid_orcid("https://orcid.org/0000-0001-2345-6789") + True + >>> is_valid_orcid("0000-0001-2345-6789") + False + >>> is_valid_orcid("http://orcid.org/0000-0001-2345-6789") + False + + """ + return bool( + re.match( + r"(https:\/\/)?orcid.org\/\d{4}-\d{4}-\d{4}-\d{4}", str(orcid) + ) + ) + + +def valid_doi_match_extractor(doi): + """Extracts doi from the input if it contains a valid DOI according to definition from crossref.org [1]_. + .. [1] [https://www.crossref.org/blog/dois-and-matching-regular-expressions](https://www.crossref.org/blog/dois-and-matching-regular-expressions) + + Parameters + ---------- + doi: + The DOI to validate. + + Returns + ------- + bool: + True if the DOI is valid, False otherwise. 
+ + Examples + -------- + >>> is_valid_doi("10.0000/example.abcd") + True + >>> is_valid_doi("doi.org/10.0000/example.abcd") + False + >>> is_valid_doi("https://doi.org/10.0000/example.abcd") + False + """ + return re.search( + r"10.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE + ) diff --git a/tests/test_cff.py b/tests/test_cff.py index 88a21cf..e844b40 100644 --- a/tests/test_cff.py +++ b/tests/test_cff.py @@ -1,5 +1,7 @@ from gimie.io import LocalResource +from gimie.parsers import CffParser from gimie.parsers.cff import get_cff_authors +from rdflib import URIRef def test_parse_cff(): @@ -8,3 +10,76 @@ def test_parse_cff(): cff_content = f.read() authors = get_cff_authors(cff_content) assert authors is not None + + +def test_broken_cff(): + cff_file_emptyish = b""" + cff-version: 1.2.0 + message: "This is a CFF devoid authors or DOI" + """ + cff_file_bad_syntax = b""" + cff-version: 1.2.0 + title: gimie : + authors: + family-names: Doe + given-names: John + - family-names: Smith + given-names: + Jane + orcid: 0000-0001-2345-6789 + """ + cff_file_broken_orcid = b""" + cff-version: 1.2.0 + title: gimie + authors: + - family-names: Doe + given-names: John + orcid: 0000-0001-2345-6789 + - family-names: Smith + given-names: Jane + orcid: http://www.orcid.org/0000-0001-2345-6789 + """ + cff_file_author_without_orcid = b""" + cff-version: 1.2.0 + title: gimie + authors: + - family-names: Doe + given-names: John + """ + + # empty graph = \n according to rdflib + assert ( + CffParser(subject=URIRef("https://example.org/")) + .parse(data=cff_file_emptyish) + .serialize(format="ttl") + == "\n" + ) + assert ( + CffParser(subject=URIRef("https://example.org/")) + .parse(data=cff_file_bad_syntax) + .serialize(format="ttl") + == "\n" + ) + assert ( + CffParser(subject=URIRef("https://example.org/")) + .parse(data=cff_file_broken_orcid) + .serialize(format="ttl") + == "\n" + ) + assert ( + CffParser(subject=URIRef("https://example.org/")) + 
.parse(data=cff_file_author_without_orcid) + .serialize(format="ttl") + == "\n" + ) + + +def test_doi(): + cff_file = b""" + cff-version: 1.2.0 + title: gimie + doi: 10.5281/zenodo.1234567 + """ + assert "https://doi.org/10.5281/zenodo.1234567" in CffParser( + subject=URIRef("https://example.org/") + ).parse(data=cff_file).serialize(format="ttl") From 195c7781010ba2f006dc1152fad56fbd55941b94 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Fri, 22 Nov 2024 11:04:19 +0100 Subject: [PATCH 16/31] docs:fix docs of valid_doi_extractor --- gimie/utils/uri.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/gimie/utils/uri.py b/gimie/utils/uri.py index 232c307..7fd0e25 100644 --- a/gimie/utils/uri.py +++ b/gimie/utils/uri.py @@ -109,17 +109,15 @@ def valid_doi_match_extractor(doi): Returns ------- - bool: - True if the DOI is valid, False otherwise. + str: + The extracted DOI if it is valid, None otherwise. Examples -------- - >>> is_valid_doi("10.0000/example.abcd") - True - >>> is_valid_doi("doi.org/10.0000/example.abcd") - False - >>> is_valid_doi("https://doi.org/10.0000/example.abcd") - False + >>> valid_doi_match_extractor("10.5281/zenodo.1234567") + '10.5281/zenodo.1234567' + >>> valid_doi_match_extractor("https://doi.org/10.5281/zenodo.1234567") + '10.5281/zenodo.1234567' """ return re.search( r"10.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE From 03adbf0a496708826c50b6f9d4b99a417ec06756 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Fri, 22 Nov 2024 11:11:44 +0100 Subject: [PATCH 17/31] refactor: doi re matcher --- gimie/parsers/cff.py | 4 +--- gimie/utils/uri.py | 6 ++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index 839ac87..6f1211b 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -118,9 +118,7 @@ def doi_to_url(doi: str) -> str: if doi_match is None: raise ValueError(f"Not a valid DOI: {doi}") - short_doi = doi_match.group() - - return 
f"https://doi.org/{short_doi}" + return f"https://doi.org/{doi_match}" def get_cff_doi(data: bytes) -> Optional[str]: diff --git a/gimie/utils/uri.py b/gimie/utils/uri.py index 7fd0e25..ca206a3 100644 --- a/gimie/utils/uri.py +++ b/gimie/utils/uri.py @@ -110,7 +110,7 @@ def valid_doi_match_extractor(doi): Returns ------- str: - The extracted DOI if it is valid, None otherwise. + The extracted short DOI if it is valid, None otherwise. Examples -------- @@ -119,6 +119,8 @@ def valid_doi_match_extractor(doi): >>> valid_doi_match_extractor("https://doi.org/10.5281/zenodo.1234567") '10.5281/zenodo.1234567' """ - return re.search( + match = re.search( r"10.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE ) + if match: + return match.group() From 9673aaca885d5fe7db32cb59ee23b11d4990419a Mon Sep 17 00:00:00 2001 From: rmfranken Date: Fri, 22 Nov 2024 11:12:35 +0100 Subject: [PATCH 18/31] chore: remove unneccessary comment --- gimie/parsers/cff.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index 6f1211b..60f5058 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -111,8 +111,6 @@ def doi_to_url(doi: str) -> str: 'https://doi.org/10.0000/example.abcd' """ - # regex from: - # https://www.crossref.org/blog/dois-and-matching-regular-expressions doi_match = valid_doi_match_extractor(doi) if doi_match is None: From 420252ea032ea591940044fccbdadc5f6689e138 Mon Sep 17 00:00:00 2001 From: cmdoret Date: Thu, 28 Nov 2024 13:22:54 +0100 Subject: [PATCH 19/31] chore(docker): bump base layer to python 3.13 --- .docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 24e7c04..4ae7c0b 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -1,6 +1,6 @@ ARG VERSION_BUILD -FROM python:3.10-slim-bullseye as python +FROM python:3.13-slim-bookworm as python ENV PYTHONUNBUFFERED=true WORKDIR /app From 2a6272a7d4ce13cfc203e76e9a067f3de6e5ace2 Mon Sep 
17 00:00:00 2001 From: Robin Franken <77491494+rmfranken@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:14:11 +0100 Subject: [PATCH 20/31] Update gimie/parsers/abstract.py Co-authored-by: Cyril Matthey-Doret --- gimie/parsers/abstract.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gimie/parsers/abstract.py b/gimie/parsers/abstract.py index 3a06af4..e5376f7 100644 --- a/gimie/parsers/abstract.py +++ b/gimie/parsers/abstract.py @@ -31,8 +31,8 @@ class Parser(ABC): Parameters ---------- - subject : str - The subject of a triple (subject - predicate - object) to be used for writing parsed properties to. + subject: + The subject of a triple (subject - predicate - object) to be used for writing parsed properties to. """ def __init__(self, subject: str): From a331a631a48a3a070f3b5f56b70b0567c4fbea6a Mon Sep 17 00:00:00 2001 From: Robin Franken <77491494+rmfranken@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:15:03 +0100 Subject: [PATCH 21/31] Update gimie/parsers/cff.py Co-authored-by: Cyril Matthey-Doret --- gimie/parsers/cff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index 60f5058..514b15a 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -52,8 +52,8 @@ def parse(self, data: bytes) -> Graph: if not authors: return extracted_cff_triples for author in authors: - orcid = URIRef(author["orcid"]) - if is_valid_orcid(orcid): + if is_valid_orcid(author["orcid"]): + orcid = URIRef(author["orcid"]) extracted_cff_triples.add( (self.subject, SDO.author, URIRef(orcid)) ) From 9d6926742b2594afc84c2d713a144af8f2fd87fa Mon Sep 17 00:00:00 2001 From: cmdoret Date: Thu, 28 Nov 2024 14:34:09 +0100 Subject: [PATCH 22/31] chore(docker): use python 3.12 base --- .docker/Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 4ae7c0b..5f4d491 100644 --- a/.docker/Dockerfile +++ 
b/.docker/Dockerfile @@ -1,18 +1,18 @@ ARG VERSION_BUILD -FROM python:3.13-slim-bookworm as python +FROM python:3.12-slim-bookworm AS python ENV PYTHONUNBUFFERED=true WORKDIR /app LABEL org.opencontainers.image.source=https://github.com/sdsc-ordes/gimie LABEL org.opencontainers.image.description="Extract linked metadata from repositories." LABEL org.opencontainers.image.licenses=Apache-2.0 -LABEL org.opencontainers.image.version ${VERSION_BUILD} +LABEL org.opencontainers.image.version=${VERSION_BUILD} ################################################## # Poetry setup ################################################## -FROM python as poetry +FROM python AS poetry # Install poetry ENV POETRY_HOME=/opt/poetry @@ -36,7 +36,7 @@ RUN poetry install --no-interaction --no-ansi -vvv ################################################## # Gimie setup ################################################## -FROM python as runtime +FROM python AS runtime ENV PATH="/app/.venv/bin:$PATH" RUN apt-get update && \ apt-get install -y git libgomp1 libmagic-dev From b91df2a4b4d12ba4afc43b956b325736f7640c91 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Thu, 28 Nov 2024 14:46:57 +0100 Subject: [PATCH 23/31] fix: improve tests, rename some variables --- gimie/parsers/cff.py | 4 +- gimie/parsers/license/__init__.py | 8 ++-- gimie/utils/uri.py | 2 +- tests/test_cff.py | 67 +++++++++++++++---------------- 4 files changed, 39 insertions(+), 42 deletions(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index 60f5058..6480ba9 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -24,7 +24,7 @@ from gimie import logger from gimie.graph.namespaces import SDO, MD4I from gimie.parsers.abstract import Parser -from gimie.utils.uri import is_valid_orcid, valid_doi_match_extractor +from gimie.utils.uri import is_valid_orcid, extract_doi_match class CffParser(Parser): @@ -111,7 +111,7 @@ def doi_to_url(doi: str) -> str: 'https://doi.org/10.0000/example.abcd' """ - doi_match = 
valid_doi_match_extractor(doi) + doi_match = extract_doi_match(doi) if doi_match is None: raise ValueError(f"Not a valid DOI: {doi}") diff --git a/gimie/parsers/license/__init__.py b/gimie/parsers/license/__init__.py index fa4bedc..db0de2a 100644 --- a/gimie/parsers/license/__init__.py +++ b/gimie/parsers/license/__init__.py @@ -41,14 +41,12 @@ def parse(self, data: bytes) -> Graph: graph with a single triple . If no matching URL is found, an empty graph is returned. """ - extracted_license_triple = Graph() + license_facts = Graph() license_url = match_license(data) if license_url: - extracted_license_triple.add( - (self.subject, SDO.license, URIRef(license_url)) - ) - return extracted_license_triple + license_facts.add((self.subject, SDO.license, URIRef(license_url))) + return license_facts def match_license(data: bytes, min_similarity: float = 0.9) -> Optional[str]: diff --git a/gimie/utils/uri.py b/gimie/utils/uri.py index ca206a3..c94bdf6 100644 --- a/gimie/utils/uri.py +++ b/gimie/utils/uri.py @@ -98,7 +98,7 @@ def is_valid_orcid(orcid): ) -def valid_doi_match_extractor(doi): +def extract_doi_match(doi): """Extracts doi from the input if it contains a valid DOI according to definition from crossref.org [1]_. .. 
[1] [https://www.crossref.org/blog/dois-and-matching-regular-expressions](https://www.crossref.org/blog/dois-and-matching-regular-expressions) diff --git a/tests/test_cff.py b/tests/test_cff.py index e844b40..66444de 100644 --- a/tests/test_cff.py +++ b/tests/test_cff.py @@ -1,7 +1,8 @@ from gimie.io import LocalResource from gimie.parsers import CffParser from gimie.parsers.cff import get_cff_authors -from rdflib import URIRef +from rdflib import URIRef, Literal +import pytest def test_parse_cff(): @@ -12,12 +13,17 @@ def test_parse_cff(): assert authors is not None -def test_broken_cff(): - cff_file_emptyish = b""" +@pytest.mark.parametrize( + "cff_file", + [ + ( + b""" cff-version: 1.2.0 message: "This is a CFF devoid authors or DOI" """ - cff_file_bad_syntax = b""" + ), + ( + b""" cff-version: 1.2.0 title: gimie : authors: @@ -28,7 +34,9 @@ def test_broken_cff(): Jane orcid: 0000-0001-2345-6789 """ - cff_file_broken_orcid = b""" + ), + ( + b""" cff-version: 1.2.0 title: gimie authors: @@ -39,47 +47,38 @@ def test_broken_cff(): given-names: Jane orcid: http://www.orcid.org/0000-0001-2345-6789 """ - cff_file_author_without_orcid = b""" + ), + ( + b""" cff-version: 1.2.0 title: gimie authors: - family-names: Doe given-names: John """ - - # empty graph = \n according to rdflib + ), + ], +) +def test_broken_cff(cff_file): assert ( - CffParser(subject=URIRef("https://example.org/")) - .parse(data=cff_file_emptyish) - .serialize(format="ttl") - == "\n" - ) - assert ( - CffParser(subject=URIRef("https://example.org/")) - .parse(data=cff_file_bad_syntax) - .serialize(format="ttl") - == "\n" - ) - assert ( - CffParser(subject=URIRef("https://example.org/")) - .parse(data=cff_file_broken_orcid) - .serialize(format="ttl") - == "\n" - ) - assert ( - CffParser(subject=URIRef("https://example.org/")) - .parse(data=cff_file_author_without_orcid) - .serialize(format="ttl") - == "\n" + len( + CffParser(subject=URIRef("https://example.org/")).parse( + data=cff_file + ) + ) + == 0 
) -def test_doi(): +def test_parse_doi(): cff_file = b""" cff-version: 1.2.0 title: gimie doi: 10.5281/zenodo.1234567 """ - assert "https://doi.org/10.5281/zenodo.1234567" in CffParser( - subject=URIRef("https://example.org/") - ).parse(data=cff_file).serialize(format="ttl") + obj = next( + CffParser(subject=URIRef("https://example.org/")) + .parse(data=cff_file) + .objects() + ) + assert URIRef("https://doi.org/10.5281/zenodo.1234567") == obj From 477ab88b5b22a6b993642b675d11c73a944c5faa Mon Sep 17 00:00:00 2001 From: rmfranken Date: Thu, 28 Nov 2024 14:54:09 +0100 Subject: [PATCH 24/31] fix:rename the example in extract_doi_march --- gimie/utils/uri.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gimie/utils/uri.py b/gimie/utils/uri.py index c94bdf6..163be3b 100644 --- a/gimie/utils/uri.py +++ b/gimie/utils/uri.py @@ -114,9 +114,9 @@ def extract_doi_match(doi): Examples -------- - >>> valid_doi_match_extractor("10.5281/zenodo.1234567") + >>> extract_doi_match("10.5281/zenodo.1234567") '10.5281/zenodo.1234567' - >>> valid_doi_match_extractor("https://doi.org/10.5281/zenodo.1234567") + >>> extract_doi_match("https://doi.org/10.5281/zenodo.1234567") '10.5281/zenodo.1234567' """ match = re.search( From 0eb1423588d511e3ce5cfb73a972a3fd25f0202d Mon Sep 17 00:00:00 2001 From: rmfranken Date: Mon, 16 Dec 2024 16:24:09 +0100 Subject: [PATCH 25/31] fix: DOI from dict, not flat value --- gimie/parsers/cff.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index ef76ad2..7b8d935 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -145,9 +145,15 @@ def get_cff_doi(data: bytes) -> Optional[str]: except yaml.scanner.ScannerError: logger.warning("cannot read CITATION.cff, skipped.") return None - try: - doi_url = doi_to_url(cff["doi"]) + identifiers = cff.get("identifiers", []) + doi_identifier = next( + (id for id in identifiers if id.get("type") == "doi"), None + ) + 
if doi_identifier: + doi_url = doi_to_url(doi_identifier["value"]) + else: + raise KeyError("No DOI found in identifiers") # No doi in cff file except (KeyError, TypeError): logger.warning("CITATION.cff does not contain a 'doi' key.") From 43fdf514ce082f4974ff255a9f48a700aa2e54b9 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Mon, 16 Dec 2024 16:31:47 +0100 Subject: [PATCH 26/31] fix:make cff example correct --- tests/test_cff.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_cff.py b/tests/test_cff.py index 66444de..118b124 100644 --- a/tests/test_cff.py +++ b/tests/test_cff.py @@ -73,12 +73,15 @@ def test_broken_cff(cff_file): def test_parse_doi(): cff_file = b""" cff-version: 1.2.0 - title: gimie - doi: 10.5281/zenodo.1234567 + message: If you use this software, please cite it using these metadata. + title: 'napari: a multi-dimensional image viewer for Python' + identifiers: + - type: doi + value: 10.5281/zenodo.3555620 """ obj = next( CffParser(subject=URIRef("https://example.org/")) .parse(data=cff_file) .objects() ) - assert URIRef("https://doi.org/10.5281/zenodo.1234567") == obj + assert URIRef("https://doi.org/10.5281/zenodo.3555620") == obj From 343defd5f6ea91f2520d74167fba5b40cdeed995 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Mon, 16 Dec 2024 16:40:11 +0100 Subject: [PATCH 27/31] docs: adapt docstring example to match real CFF structure --- gimie/parsers/cff.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index 7b8d935..30cb77d 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -122,21 +122,23 @@ def doi_to_url(doi: str) -> str: def get_cff_doi(data: bytes) -> Optional[str]: """Given a CFF file, returns the DOI, if any. - Parameters - ---------- - data - The cff file body as bytes. 
- - Returns - ------- - str, optional - doi formatted as a valid url - - Examples - -------- - >>> get_cff_doi(bytes("doi: 10.5281/zenodo.1234", encoding="utf8")) - 'https://doi.org/10.5281/zenodo.1234' - >>> get_cff_doi(bytes("abc: def", encoding="utf8")) + Parameters + ---------- + data + The cff file body as bytes. + + Returns + ------- + str, optional + doi formatted as a valid url + + Examples + -------- + >>> get_cff_doi(bytes("identifiers: + - type: doi + value: 10.5281/zenodo.1234", encoding="utf8")) + 'https://doi.org/10.5281/zenodo.1234' + >>> get_cff_doi(bytes("abc: def", encoding="utf8")) """ From 513a54648dd09e66faf2b3b6180267c8044c5200 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Mon, 16 Dec 2024 16:43:34 +0100 Subject: [PATCH 28/31] fix:docstring still fucked --- gimie/parsers/cff.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index 30cb77d..f9e553a 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -122,23 +122,25 @@ def doi_to_url(doi: str) -> str: def get_cff_doi(data: bytes) -> Optional[str]: """Given a CFF file, returns the DOI, if any. - Parameters - ---------- - data - The cff file body as bytes. - - Returns - ------- - str, optional - doi formatted as a valid url - - Examples - -------- - >>> get_cff_doi(bytes("identifiers: + Parameters + ---------- + data + The cff file body as bytes. 
+ + Returns + ------- + str, optional + doi formatted as a valid url + + Examples + -------- + >>> get_cff_doi(bytes( + "identifiers: - type: doi value: 10.5281/zenodo.1234", encoding="utf8")) - 'https://doi.org/10.5281/zenodo.1234' - >>> get_cff_doi(bytes("abc: def", encoding="utf8")) + 'https://doi.org/10.5281/zenodo.1234'", encoding="utf8")) + 'https://doi.org/10.5281/zenodo.1234' + >>> get_cff_doi(bytes("abc: def", encoding="utf8")) """ From d650e6a50f84c578c962d3cd0a007c0bdbd169de Mon Sep 17 00:00:00 2001 From: rmfranken Date: Mon, 16 Dec 2024 16:46:10 +0100 Subject: [PATCH 29/31] fix:typo docstring --- gimie/parsers/cff.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index f9e553a..2af8676 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -138,7 +138,6 @@ def get_cff_doi(data: bytes) -> Optional[str]: "identifiers: - type: doi value: 10.5281/zenodo.1234", encoding="utf8")) - 'https://doi.org/10.5281/zenodo.1234'", encoding="utf8")) 'https://doi.org/10.5281/zenodo.1234' >>> get_cff_doi(bytes("abc: def", encoding="utf8")) From 077ad377bc9ff03783c66839a6f3e0c8add28a14 Mon Sep 17 00:00:00 2001 From: rmfranken Date: Mon, 16 Dec 2024 16:51:47 +0100 Subject: [PATCH 30/31] fix:chatGPT's suggestion for docstring formatting --- gimie/parsers/cff.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index 2af8676..cfec3bc 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -134,13 +134,9 @@ def get_cff_doi(data: bytes) -> Optional[str]: Examples -------- - >>> get_cff_doi(bytes( - "identifiers: - - type: doi - value: 10.5281/zenodo.1234", encoding="utf8")) + >>> get_cff_doi(bytes("identifiers:\n- type: doi\n value: 10.5281/zenodo.1234", encoding="utf8")) 'https://doi.org/10.5281/zenodo.1234' >>> get_cff_doi(bytes("abc: def", encoding="utf8")) - """ try: From 5a16e3c3f753f47cb55130473f077bc342986289 Mon Sep 17 00:00:00 2001 From: 
rmfranken Date: Mon, 16 Dec 2024 17:08:05 +0100 Subject: [PATCH 31/31] fix:OK, no multiline, and double escape newlines --- gimie/parsers/cff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py index cfec3bc..09ae71e 100644 --- a/gimie/parsers/cff.py +++ b/gimie/parsers/cff.py @@ -134,7 +134,7 @@ def get_cff_doi(data: bytes) -> Optional[str]: Examples -------- - >>> get_cff_doi(bytes("identifiers:\n- type: doi\n value: 10.5281/zenodo.1234", encoding="utf8")) + >>> get_cff_doi(bytes("identifiers:\\n - type: doi\\n value: 10.5281/zenodo.1234", encoding="utf8")) 'https://doi.org/10.5281/zenodo.1234' >>> get_cff_doi(bytes("abc: def", encoding="utf8")) """