Merge branch 'main' into equate_spikeglx_in_spikeinterface
h-mayorquin committed Dec 9, 2024
2 parents 71b3299 + 4b3172c commit a965d23
Showing 10 changed files with 207 additions and 15 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/deploy-tests.yml
@@ -69,6 +69,9 @@ jobs:
if: ${{ needs.assess-file-changes.outputs.SOURCE_CHANGED == 'true' }}
uses: ./.github/workflows/live-service-testing.yml
secrets:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
S3_GIN_BUCKET: ${{ secrets.S3_GIN_BUCKET }}
DANDI_API_KEY: ${{ secrets.DANDI_API_KEY }}
with: # Ternary operator: condition && value_if_true || value_if_false
python-versions: ${{ github.event.pull_request.draft == true && '["3.9"]' || needs.load_python_and_os_versions.outputs.ALL_PYTHON_VERSIONS }}
16 changes: 16 additions & 0 deletions .github/workflows/live-service-testing.yml
@@ -13,6 +13,12 @@ on:
type: string

secrets:
AWS_ACCESS_KEY_ID:
required: true
AWS_SECRET_ACCESS_KEY:
required: true
S3_GIN_BUCKET:
required: true
DANDI_API_KEY:
required: true

@@ -45,7 +51,17 @@ jobs:
- name: Install full requirements
run: pip install .[test,full]

- name: Prepare data for tests
uses: ./.github/actions/load-data
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
s3-gin-bucket: ${{ secrets.S3_GIN_BUCKET }}
os: ${{ matrix.os }}

- name: Run subset of tests that use DANDI live services
run: pytest -rsx -n auto tests/test_minimal/test_tools/dandi_transfer_tools.py
- name: Run subset of tests that use DANDI live services with YAML
run: pytest -rsx -n auto tests/test_on_data/test_yaml/yaml_dandi_transfer_tools.py
- name: Run subset of tests that use Globus live services
run: pytest -rsx -n auto tests/test_minimal/test_tools/globus_transfer_tools.py
4 changes: 3 additions & 1 deletion CHANGELOG.md
@@ -1,7 +1,8 @@
# v0.6.6 (Upcoming)

## Deprecations
* Completely removed compression settings from most places [PR #1126](https://github.com/catalystneuro/neuroconv/pull/1126)
* Removed use of `jsonschema.RefResolver` as it will be deprecated from the jsonschema library [PR #1133](https://github.com/catalystneuro/neuroconv/pull/1133)
* Completely removed compression settings from most places[PR #1126](https://github.com/catalystneuro/neuroconv/pull/1126)

## Bug Fixes
* datetime objects now can be validated as conversion options [#1139](https://github.com/catalystneuro/neuroconv/pull/1126)
@@ -15,6 +16,7 @@
* `SpikeGLXRecordingInterface` now also accepts `folder_path` making its behavior equivalent to SpikeInterface [#1150](https://github.com/catalystneuro/neuroconv/pull/1150)
* Added the `rclone_transfer_batch_job` helper function for executing Rclone data transfers in AWS Batch jobs. [PR #1085](https://github.com/catalystneuro/neuroconv/pull/1085)
* Added the `deploy_neuroconv_batch_job` helper function for deploying NeuroConv AWS Batch jobs. [PR #1086](https://github.com/catalystneuro/neuroconv/pull/1086)
* YAML specification files now accept an outer keyword `upload_to_dandiset="< six-digit ID >"` to automatically upload the produced NWB files to the DANDI archive [PR #1089](https://github.com/catalystneuro/neuroconv/pull/1089)


## Improvements
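As context for the jsonschema.RefResolver deprecation noted above and the "referencing" dependency added below, here is a minimal, self-contained sketch of the registry-based validation pattern this commit adopts; the two schemas are hypothetical stand-ins, not NeuroConv's actual schema files.

# Sketch only: hypothetical schemas illustrating referencing.Registry in place of jsonschema.RefResolver.
from jsonschema import validate
from referencing import Registry, Resource

metadata_schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {"lab": {"type": "string"}},
}
specification_schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {"metadata": {"$ref": "metadata_schema.json#"}},
}

# Register the referenced schema under the same name used in the $ref.
registry = Registry().with_resource("metadata_schema.json", Resource.from_contents(metadata_schema))

# The registry resolves the $ref, so the deprecated RefResolver is no longer needed.
validate(instance={"metadata": {"lab": "My Lab"}}, schema=specification_schema, registry=registry)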
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -50,7 +50,8 @@ dependencies = [
"parse>=1.20.0",
"click",
"docstring-parser",
"packaging" # Issue 903
"packaging", # Issue 903
"referencing",
]


src/neuroconv/schemas/yaml_conversion_specification_schema.json
@@ -8,6 +8,7 @@
"required": ["experiments"],
"additionalProperties": false,
"properties": {
"upload_to_dandiset": {"type": "string"},
"metadata": {"$ref": "./metadata_schema.json#"},
"conversion_options": {"type": "object"},
"data_interfaces": {
src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py
@@ -1,12 +1,15 @@
import sys
import json
import os
from importlib import import_module
from pathlib import Path
from typing import Optional

import click
from jsonschema import RefResolver, validate
from jsonschema import validate
from pydantic import DirectoryPath, FilePath
from referencing import Registry, Resource

from ..data_transfers import automatic_dandi_upload
from ...nwbconverter import NWBConverter
from ...utils import dict_deep_update, load_dict_from_file

@@ -50,7 +53,7 @@ def run_conversion_from_yaml(
data_folder_path: Optional[DirectoryPath] = None,
output_folder_path: Optional[DirectoryPath] = None,
overwrite: bool = False,
):
) -> None:
"""
Run conversion to NWB given a yaml specification file.
Expand Down Expand Up @@ -85,14 +88,29 @@ def run_conversion_from_yaml(

specification = load_dict_from_file(file_path=specification_file_path)
schema_folder = Path(__file__).parent.parent.parent / "schemas"

# Load all required schemas
specification_schema = load_dict_from_file(file_path=schema_folder / "yaml_conversion_specification_schema.json")
sys_uri_base = "file:/" if sys.platform.startswith("win32") else "file://"
metadata_schema = load_dict_from_file(file_path=schema_folder / "metadata_schema.json")

# The yaml specification references the metadata schema, so we need to load it into the registry
registry = Registry().with_resource("metadata_schema.json", Resource.from_contents(metadata_schema))

# Validate using the registry
validate(
instance=specification,
schema=specification_schema,
resolver=RefResolver(base_uri=sys_uri_base + str(schema_folder) + "/", referrer=specification_schema),
registry=registry,
)

upload_to_dandiset = "upload_to_dandiset" in specification
if upload_to_dandiset and "DANDI_API_KEY" not in os.environ:
message = (
"The 'upload_to_dandiset' prompt was found in the YAML specification, "
"but the environment variable 'DANDI_API_KEY' was not set."
)
raise ValueError(message)

global_metadata = specification.get("metadata", dict())
global_conversion_options = specification.get("conversion_options", dict())
data_interfaces_spec = specification.get("data_interfaces")
@@ -108,28 +126,55 @@
experiment_metadata = experiment.get("metadata", dict())
for session in experiment["sessions"]:
file_counter += 1

source_data = session["source_data"]
for interface_name, interface_source_data in session["source_data"].items():
for key, value in interface_source_data.items():
if key == "file_paths":
source_data[interface_name].update({key: [str(Path(data_folder_path) / x) for x in value]})
elif key in ("file_path", "folder_path"):
source_data[interface_name].update({key: str(Path(data_folder_path) / value)})

converter = CustomNWBConverter(source_data=source_data)

metadata = converter.get_metadata()
for metadata_source in [global_metadata, experiment_metadata, session.get("metadata", dict())]:
metadata = dict_deep_update(metadata, metadata_source)
nwbfile_name = session.get("nwbfile_name", f"temp_nwbfile_name_{file_counter}").strip(".nwb")

session_id = session.get("metadata", dict()).get("NWBFile", dict()).get("session_id", None)
if upload_to_dandiset and session_id is None:
message = (
"The 'upload_to_dandiset' prompt was found in the YAML specification, "
"but the 'session_id' was not found for session with info block: "
f"\n\n {json.dumps(obj=session, indent=2)}\n\n"
"File intended for DANDI upload must include a session ID."
)
raise ValueError(message)

session_conversion_options = session.get("conversion_options", dict())
conversion_options = dict()
for key in converter.data_interface_objects:
conversion_options[key] = dict(session_conversion_options.get(key, dict()), **global_conversion_options)

nwbfile_name = session.get("nwbfile_name", f"temp_nwbfile_name_{file_counter}").strip(".nwb")
converter.run_conversion(
nwbfile_path=output_folder_path / f"{nwbfile_name}.nwb",
metadata=metadata,
overwrite=overwrite,
conversion_options=conversion_options,
)

if upload_to_dandiset:
dandiset_id = specification["upload_to_dandiset"]
staging = int(dandiset_id) >= 200_000
automatic_dandi_upload(
dandiset_id=dandiset_id,
nwb_folder_path=output_folder_path,
staging=staging,
)

return None # We can early return since organization below will occur within the upload step

# To properly mimic a true dandi organization, the full directory must be populated with NWBFiles.
all_nwbfile_paths = [nwbfile_path for nwbfile_path in output_folder_path.iterdir() if nwbfile_path.suffix == ".nwb"]
nwbfile_paths_to_set = [
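For reference, a hypothetical invocation of the updated entry point above; the file names and folder paths are placeholders, and DANDI_API_KEY must be set in the environment for the new upload step to run.

# Hypothetical usage sketch; paths and the specification file name are placeholders.
import os
from pathlib import Path

from neuroconv import run_conversion_from_yaml

# The upload branch added above raises if this environment variable is missing.
assert os.environ.get("DANDI_API_KEY"), "Set DANDI_API_KEY before using 'upload_to_dandiset'."

run_conversion_from_yaml(
    specification_file_path=Path("conversion_specification.yml"),  # spec containing upload_to_dandiset: "200560"
    data_folder_path=Path("data/raw"),
    output_folder_path=Path("data/nwb"),
    overwrite=True,
)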
1 change: 1 addition & 0 deletions tests/imports.py
@@ -68,6 +68,7 @@ def test_tools(self):
"get_package_version",
"is_package_installed",
"deploy_process",
"data_transfers",
"LocalPathExpander",
"get_module",
]
tests/test_on_data/test_yaml/conversion_specifications/GIN_conversion_specification_dandi_upload.yml
@@ -0,0 +1,66 @@
metadata:
NWBFile:
lab: My Lab
institution: My Institution

conversion_options:
stub_test: True

data_interfaces:
ap: SpikeGLXRecordingInterface
lf: SpikeGLXRecordingInterface
phy: PhySortingInterface

upload_to_dandiset: "200560"

experiments:
ymaze:
metadata:
NWBFile:
session_description: Subject navigating a Y-shaped maze.

sessions:
- nwbfile_name: example_converter_spec_1
source_data:
ap:
file_path: spikeglx/Noise4Sam_g0/Noise4Sam_g0_imec0/Noise4Sam_g0_t0.imec0.ap.bin
metadata:
NWBFile:
session_start_time: "2020-10-09T21:19:09+00:00"
session_id: "test-yaml-1"
Subject:
subject_id: "yaml-1"
sex: F
age: P35D
species: Mus musculus
- nwbfile_name: example_converter_spec_2.nwb
metadata:
NWBFile:
session_start_time: "2020-10-10T21:19:09+00:00"
session_id: "test-yaml-2"
Subject:
subject_id: "yaml-002"
sex: F
age: P35D
species: Mus musculus
source_data:
lf:
file_path: spikeglx/Noise4Sam_g0/Noise4Sam_g0_imec0/Noise4Sam_g0_t0.imec0.lf.bin

open_explore:
sessions:
- nwbfile_name: example_converter_spec_3
source_data:
lf:
file_path: spikeglx/Noise4Sam_g0/Noise4Sam_g0_imec0/Noise4Sam_g0_t0.imec0.lf.bin
phy:
folder_path: phy/phy_example_0/
metadata:
NWBFile:
session_start_time: "2020-10-11T21:19:09+00:00"
session_id: test YAML 3
Subject:
subject_id: YAML Subject Name
sex: F
age: P35D
species: Mus musculus
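As a side note on the example above, a short sketch of the staging heuristic from the converter change earlier in this commit, applied to the dandiset ID used in this specification file.

# Mirrors `staging = int(dandiset_id) >= 200_000` from the converter change above (assumption: IDs at or
# above 200000 belong to the staging archive, consistent with the staging API URL used in the test below).
dandiset_id = "200560"  # value of upload_to_dandiset in this example specification
staging = int(dandiset_id) >= 200_000
print(staging)  # True -> upload targets https://api-staging.dandiarchive.org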
18 changes: 11 additions & 7 deletions tests/test_on_data/test_yaml/test_yaml_conversion_specification.py
@@ -1,12 +1,12 @@
import sys
import unittest
from datetime import datetime
from pathlib import Path

import pytest
from hdmf.testing import TestCase
from jsonschema import RefResolver, validate
from jsonschema import validate
from pynwb import NWBHDF5IO
from referencing import Registry, Resource

from neuroconv import run_conversion_from_yaml
from neuroconv.utils import load_dict_from_file
@@ -19,6 +19,7 @@
"fname",
[
"GIN_conversion_specification.yml",
"GIN_conversion_specification_dandi_upload.yml",
"GIN_conversion_specification_missing_nwbfile_names.yml",
"GIN_conversion_specification_no_nwbfile_name_or_other_metadata.yml",
"GIN_conversion_specification_videos.yml",
@@ -27,16 +28,19 @@
def test_validate_example_specifications(fname):
path_to_test_yml_files = Path(__file__).parent / "conversion_specifications"
schema_folder = path_to_test_yml_files.parent.parent.parent.parent / "src" / "neuroconv" / "schemas"

# Load schemas
specification_schema = load_dict_from_file(file_path=schema_folder / "yaml_conversion_specification_schema.json")
sys_uri_base = "file://"
if sys.platform.startswith("win32"):
sys_uri_base = "file:/"
metadata_schema = load_dict_from_file(file_path=schema_folder / "metadata_schema.json")

# The yaml specification references the metadata schema, so we need to load it into the registry
registry = Registry().with_resource("metadata_schema.json", Resource.from_contents(metadata_schema))

yaml_file_path = path_to_test_yml_files / fname
validate(
instance=load_dict_from_file(file_path=yaml_file_path),
schema=load_dict_from_file(file_path=schema_folder / "yaml_conversion_specification_schema.json"),
resolver=RefResolver(base_uri=sys_uri_base + str(schema_folder) + "/", referrer=specification_schema),
schema=specification_schema,
registry=registry,
)


53 changes: 53 additions & 0 deletions tests/test_on_data/test_yaml/yaml_dandi_transfer_tools.py
@@ -0,0 +1,53 @@
import os
import platform
import time
from datetime import datetime, timedelta
from pathlib import Path

import dandi.dandiapi
import pytest
from packaging.version import Version

from neuroconv import run_conversion_from_yaml

from ..setup_paths import ECEPHY_DATA_PATH, OUTPUT_PATH

DANDI_API_KEY = os.getenv("DANDI_API_KEY")
HAVE_DANDI_KEY = DANDI_API_KEY is not None and DANDI_API_KEY != "" # can be "" from external forks
_PYTHON_VERSION = platform.python_version()


@pytest.mark.skipif(
not HAVE_DANDI_KEY or Version(".".join(_PYTHON_VERSION.split(".")[:2])) != Version("3.12"),
reason="You must set your DANDI_API_KEY to run this test!",
)
def test_run_conversion_from_yaml_with_dandi_upload():
path_to_test_yml_files = Path(__file__).parent / "conversion_specifications"
yaml_file_path = path_to_test_yml_files / "GIN_conversion_specification_dandi_upload.yml"
run_conversion_from_yaml(
specification_file_path=yaml_file_path,
data_folder_path=ECEPHY_DATA_PATH,
output_folder_path=OUTPUT_PATH,
overwrite=True,
)

time.sleep(60) # Give some buffer room for server to process before making assertions against DANDI API

client = dandi.dandiapi.DandiAPIClient(api_url="https://api-staging.dandiarchive.org/api")
dandiset = client.get_dandiset("200560")

expected_asset_paths = [
"sub-yaml-1/sub-yaml-1_ses-test-yaml-1_ecephys.nwb",
"sub-yaml-002/sub-yaml-002_ses-test-yaml-2_ecephys.nwb",
"sub-YAML-Subject-Name/sub-YAML-Subject-Name_ses-test-YAML-3_ecephys.nwb",
]
for asset_path in expected_asset_paths:
test_asset = dandiset.get_asset_by_path(path=asset_path) # Will error if not found
test_asset_metadata = test_asset.get_raw_metadata()

# Past uploads may have created the same apparent file, so look at the modification time to ensure
# this test is actually testing the most recent upload
date_modified = datetime.fromisoformat(
test_asset_metadata["dateModified"].split("Z")[0] # Timezones look a little messy
)
assert datetime.now() - date_modified < timedelta(minutes=10)
