tira-io · mam10eks · Mar 11, 2025 · Mar 4, 2025 · Mar 4, 2025 · Mar 5, 2025
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -55,7 +55,7 @@ jobs:
     timeout-minutes: 15
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
       - name: Checkout
         uses: actions/checkout@v4

diff --git a/documentation/participants/images/tira-code-submission.png b/documentation/participants/images/tira-code-submission.png
diff --git a/documentation/participants/images/tira-execute-sandboxed.png b/documentation/participants/images/tira-execute-sandboxed.png
diff --git a/documentation/participants/images/tira-verify-installation.png b/documentation/participants/images/tira-verify-installation.png
diff --git a/documentation/participants/participate.rst b/documentation/participants/participate.rst
@@ -31,32 +31,40 @@ out more):
 
 .. tab-set::
 
-    .. tab-item:: Upload
-        :sync: upload-submission
+    .. tab-item:: Code Submission
+        :sync: code-submission
 
-        The upload submission is the simplest form of submitting and requires you to run the evaluation yourself and
-        upload the :term:`runfile`. As such it has two notable drawbacks such that we discourage from using it:
+        Code submission is the recommended form of submitting to TIRA. Code submissions are compatible with CI/CD systems like `GitHub Actions <https://github.com/features/actions>`_ and build a docker image from a git repository while collecting important experimental metadata to improve transparency and reproducibility.
 
-        (1) Participants need access to the dataset. This may not be possible (e.g., due to legal reasons) or desirable
-            (e.g., to avoid that future models profit from the author's analysis of the dataset).
-        (2) The result is not verifiable -- the organizer can not ensure that your model actually produced the runfile.
+        The requirements for code submissions are:
+
+        (1) Your approach is in a git repository.
+        (2) Your git repository is complete, i.e., contains all code and a Dockerfile to bundle the code.
+        (3) Your git repository is clean.
+            E.g., ``git status`` reports "nothing to commit, working tree clean".
+
+        When those requirements are fulfilled, code submissions perform the following steps:
+
+        (1) Build the docker image from the git repository while `tracking important experimental metadata <https://github.com/tira-io/tirex-tracker>`_ (e.g., on git, code, etc.).
+        (2) Run the docker image on a small spot check dataset to ensure it produces valid outputs.
+        (3) Upload the docker image together with the metadata to TIRA.
 
     .. tab-item:: Docker Submission
         :sync: docker-submission
 
         .. todo:: TODO
 
-    .. tab-item:: Code Submission
-        :sync: code-submission
+    .. tab-item:: Run Upload
+        :sync: upload-submission
 
-        The code submission is the simplest (recommended) form of submitting.
-
-        .. todo:: TODO
+        The upload submission is the simplest form of submitting and requires you to run the evaluation yourself and
+        upload the :term:`runfile`. As such, it has two notable drawbacks, which is why we discourage using it:
 
-.. hint:: If you want to use the simplest type of submission, we recommend a **Code Submission**. Note however, that the
-    Code Submission requires some access to your GitHub Account to perform all the setup steps for you. Submissions via
-    uploads are generally discouraged since they can not be verified. Such that we *highly recommend*, you either go for
-    a **Code Submission** or a **Docker Submission**.
+        (1) Participants need access to the dataset. This may not be possible (e.g., due to legal reasons) or desirable
+            (e.g., to avoid that future models profit from the author's analysis of the dataset).
+        (2) The result is not verifiable -- the organizer can not ensure that your model actually produced the runfile.
+
+.. hint:: If you want to use the simplest type of submission, we recommend a **Code Submission** as this works with GitHub Actions or other CI/CD automations.
 
 
 .. _SubmitSubmission:
@@ -72,22 +80,48 @@ your leaderboard position.
 
     .. tab-set::
 
-        .. tab-item:: Upload
-            :sync: upload-submission
+        .. tab-item:: Code Submission
+            :sync: code-submission
 
-            .. todo:: TODO
+            (1) Please install the TIRA client via ``pip3 install tira``.
+
+            (2) Please authenticate your TIRA client using your API key. After registering on TIRA, you can find your API key on your submit page.
+
+            (3) Ensure that your TIRA installation is valid by running ``tira-cli verify-installation``. A valid output should look like:
+
+            .. figure:: images/tira-verify-installation.png
+                :width: 700
+                :align: center
+
+            (4) Now you are ready to upload your code submission to TIRA. Assuming that you want to upload your code in a directory ``some-directory`` to the task ``wows-eval``, the command ``tira-cli code-submission --path some-directory/ --task wows-eval`` would do the code submission. A valid output should look like:
+
+            .. figure:: images/tira-code-submission.png
+                :width: 900
+                :align: center
+
 
         .. tab-item:: Docker Submission
             :sync: docker-submission
 
             .. todo:: TODO
 
-        .. tab-item:: Code Submission
-            :sync: code-submission
+        .. tab-item:: Run Upload
+            :sync: upload-submission
 
             .. todo:: TODO
 
-.. todo:: For development: The "Country" field should probably be a dropdown
 
-.. todo:: The upload of artifacts should not be inside the file-upload-submission since it indicates that it would not
-    apply to docker- or code submissions, which it does.
+
+.. _ExecuteSubmission:
+
+Execute Your Submission
+-----------------------
+
+Now that you have uploaded your code or docker submission, you can execute it within TIRA (this is not needed for run uploads). Navigate to your task page and select your submission. Then, select the resources and the dataset on which your submission should run, and click "RUN":
+
+
+.. figure:: images/tira-execute-sandboxed.png
+   :width: 700
+   :align: center
+
+After your software was executed, you can directly see the outputs and evaluation scores for public training datasets. For private or test datasets, the organizers will manually review the output of your system and will contact you in case there are errors.
diff --git a/python-client/setup.cfg b/python-client/setup.cfg
@@ -33,6 +33,7 @@ install_requires =
     pandas
     packaging
     tqdm
+    gitpython
 
 [options.extras_require]
 test =
@@ -41,7 +42,7 @@ test =
     approvaltests
 dev =
     python-terrier==0.10.*
-    ir-datasets
+    ir-datasets==0.5.9
     trectools
 
 [options.entry_points]

diff --git a/python-client/tests/code_submission_test.py b/python-client/tests/code_submission_test.py
@@ -0,0 +1,46 @@
+import os
+import tempfile
+import unittest
+from pathlib import Path
+from zipfile import ZipFile
+
+from tira.rest_api_client import Client
+
+
+class CodeSubmissionTest(unittest.TestCase):
+    """Dry-run tests for ``Client.submit_code`` against the bundled example git repositories."""
+
+    def test_code_submission_fails_if_code_not_in_version_control(self):
+        """An empty temporary directory (no git repository) must raise ``ValueError``."""
+        tira = Client()
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with self.assertRaises(ValueError):
+                tira.submit_code(Path(tmp_dir), "wows-eval", dry_run=True)
+
+    def test_code_submission_fails_for_dirty_git_repo(self):
+        """The ``git-repo-dirty`` example repository must raise ``ValueError``."""
+        tira = Client()
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with ZipFile(Path("tests") / "resources" / "example-git-repositories.zip", "r") as zip_ref:
+                zip_ref.extractall(tmp_dir)
+            with self.assertRaises(ValueError):
+                tira.submit_code(Path(tmp_dir) / "git-repo-dirty" / "some-directory", "wows-eval", dry_run=True)
+
+    def test_code_submission_works(self):
+        """The ``git-repo-clean`` example repository yields the expected git metadata and code archive."""
+        tira = Client(tira_cache_dir="./tests/resources/local_cached_zip")
+        expected_code_files = ["some-directory/.gitignore", "some-directory/Dockerfile", "some-directory/script.sh"]
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with ZipFile(Path("tests") / "resources" / "example-git-repositories.zip", "r") as zip_ref:
+                zip_ref.extractall(tmp_dir)
+
+            # NOTE(review): presumably restores the file mode that zip extraction does not
+            # preserve, so that the repository is not reported as modified -- TODO confirm.
+            os.chmod(str(Path(tmp_dir) / "git-repo-clean" / "some-directory" / "script.sh"), 0o0766)
+            actual = tira.submit_code(
+                Path(tmp_dir) / "git-repo-clean" / "some-directory", "task-does-not-exist", dry_run=True
+            )
+
+        # Use a context manager so the archive handle is closed instead of leaked.
+        with ZipFile(actual["code"]) as code_zip:
+            files_in_zip = [i.filename for i in code_zip.infolist()]
+
+        self.assertEqual({"origin": "foo"}, actual["remotes"])
+        self.assertEqual("976c6949b9992aabc785ccb8544652dc3b149fb5", actual["commit"])
+        self.assertEqual("main", actual["active_branch"])
+        self.assertTrue(actual["image"].startswith("some-directory"))
+
+        self.assertEqual(expected_code_files, files_in_zip)
diff --git a/python-client/tests/ir_datasets_test.py b/python-client/tests/ir_datasets_test.py
@@ -111,7 +111,7 @@ def test_loading_raw_ir_datasets_01(self):
         ensure_pyterrier_is_loaded(patch_ir_datasets=False)
         ir_datasets = load_ir_datasets()
         dataset = ir_datasets.load("cranfield")
-        queries = {i.query_id: i.text for i in dataset.queries_iter()}
+        queries = {str(i.query_id): i.text for i in dataset.queries_iter()}
 
         assert len(list(dataset.queries_iter())) == 225
         assert queries["269"] == "has a criterion been established for determining the axial compressor\nchoking line ."
@@ -120,7 +120,7 @@ def test_loading_raw_ir_datasets_02(self):
         ensure_pyterrier_is_loaded(patch_ir_datasets=True)
         ir_datasets = load_ir_datasets()
         dataset = ir_datasets.load("cranfield")
-        queries = {i.query_id: i.text for i in dataset.queries_iter()}
+        queries = {str(i.query_id): i.text for i in dataset.queries_iter()}
 
         assert len(list(dataset.queries_iter())) == 225
         assert queries["269"] == "has a criterion been established for determining the axial compressor\nchoking line ."
@@ -215,7 +215,7 @@ def test_no_patching_for_pyterrier_datasets_01(self):
         import pyterrier as pt
 
         dataset = pt.get_dataset("irds:cranfield")
-        queries = {i["qid"]: i["query"] for _, i in dataset.get_topics().iterrows()}
+        queries = {str(i["qid"]): i["query"] for _, i in dataset.get_topics().iterrows()}
 
         assert len(dataset.get_topics()) == 225
         assert queries["269"] == "has a criterion been established for determining the axial compressor choking line"
@@ -225,7 +225,7 @@ def test_no_patching_for_pyterrier_datasets_02(self):
         import pyterrier as pt
 
         dataset = pt.get_dataset("irds:cranfield")
-        queries = {i["qid"]: i["query"] for _, i in dataset.get_topics().iterrows()}
+        queries = {str(i["qid"]): i["query"] for _, i in dataset.get_topics().iterrows()}
 
         assert len(dataset.get_topics()) == 225
         assert queries["269"] == "has a criterion been established for determining the axial compressor choking line"

diff --git a/python-client/tests/resources/example-git-repositories.zip b/python-client/tests/resources/example-git-repositories.zip
diff --git a/...lient/tests/resources/local_cached_zip/.archived/api/datasets_by_task/task-does-not-exist b/...lient/tests/resources/local_cached_zip/.archived/api/datasets_by_task/task-does-not-exist
@@ -0,0 +1,13 @@
+{
+  "status": "0",
+  "context": {
+    "include_navigation": false,
+    "user_id": null,
+    "role": "guest",
+    "organizer_teams": "[]",
+    "datasets": "{\"dataset-does-not-exist-20241201-training\": {\"display_name\": \"pairwise-smoke-test\", \"evaluator_id\": \"pairwise-smoke-test-20250210-training-evaluator\", \"dataset_id\": \"dataset-does-not-exist-20241201-training\", \"is_confidential\": false, \"is_deprecated\": false, \"year\": \"2025-02-10 14:31:11.079033\", \"task\": \"wows-eval\", \"organizer\": \"Webis\", \"organizer_id\": \"webis\", \"software_count\": 0, \"runs_count\": 0, \"evaluations_count\": 0, \"evaluations_public_count\": 0, \"default_upload_name\": \"predictions.jsonl\", \"created\": \"2025-02-10\", \"last_modified\": \"2025-02-10\", \"irds_docker_image\": null, \"irds_import_command\": null, \"irds_import_truth_command\": null, \"evaluator_git_runner_image\": \"ubuntu:18.04\", \"evaluator_git_runner_command\": \"echo 'this is no real evaluator'\", \"format\": [\"*.jsonl\"], \"description\": \"A pairwise smoke test dataset.\", \"chatnoir_id\": null, \"ir_datasets_id\": null, \"file_listing\": [{\"title\": \"$inputDir\", \"children\": [{\"title\": \"inputs.jsonl (size: 33549; md5sum: d7852dfdf100a3304c9786f1b7179d0b)\", \"size\": 33549, \"md5sum\": \"d7852dfdf100a3304c9786f1b7179d0b\"}]}, {\"title\": \"$inputDataset\", \"children\": [{\"title\": \"labels.jsonl (size: 37522; md5sum: 6f597478dbfc15fa787ba317b6612248)\", \"size\": 37522, \"md5sum\": \"6f597478dbfc15fa787ba317b6612248\"}]}]}}",
+    "selected_dataset_id": "",
+    "test_dataset_ids": "[]",
+    "training_dataset_ids": "[\"dataset-does-not-exist-20241201-training\"]"
+  }
+}
diff --git a/python-client/tests/resources/local_cached_zip/.archived/v1/datasets/all b/python-client/tests/resources/local_cached_zip/.archived/v1/datasets/all
@@ -7,6 +7,7 @@
     "display_name": "Display Name",
     "is_confidential": false,
     "is_deprecated": false,
+    "format": ["*.jsonl"],
     "mirrors": {
       "truths": {
         "Zenodo": "URL does not exist"

diff --git a/python-client/tira/__init__.py b/python-client/tira/__init__.py
@@ -1 +1 @@
-__version__ = "0.0.145"
+__version__ = "0.0.146"
diff --git a/python-client/tira/check_format.py b/python-client/tira/check_format.py
@@ -18,6 +18,24 @@ class FormatMsgType(Enum):
 _fmt = FormatMsgType
 
 
+def log_message(message: str, level: _fmt) -> None:
+    """
+    Print ``message`` to stdout, prefixed by a coloured status symbol.
+
+    Parameters:
+    - message (str): The text to print after the symbol.
+    - level (_fmt): One of _fmt.OK, _fmt.WARN, _fmt.ERROR; any other value raises ``KeyError``.
+    """
+    # ANSI colour escape + symbol + ANSI reset.
+    green_check = "\033[92m\u2713\033[0m"
+    yellow_warning = "\033[93m" + b"\xe2\x9a\xa0".decode("utf-8") + "\033[0m"
+    red_cross = "\033[91m" + b"\xe2\x9c\x96".decode("utf-8") + "\033[0m"
+
+    marker_by_level = {_fmt.OK: green_check, _fmt.WARN: yellow_warning, _fmt.ERROR: red_cross}
+    marker = marker_by_level[level]
+    print(f"{marker} {message}")
+
+
 class FormatBase:
     def all_lines(self, f: Path):
         try:

diff --git a/python-client/tira/io_utils.py b/python-client/tira/io_utils.py
@@ -8,6 +8,7 @@
 
 import pandas as pd
 
+from tira.check_format import _fmt, log_message
 from tira.tira_client import TiraClient
 
 
@@ -58,6 +59,55 @@ def dataset_as_iterator(
         yield i
 
 
+def verify_docker_installation():
+    """
+    Check whether ``LocalExecutionIntegration`` reports a usable Docker/Podman client.
+
+    Returns a ``(status, message)`` tuple: ``_fmt.OK`` when
+    ``docker_is_installed_failsave()`` is truthy, ``_fmt.ERROR`` when it is falsy or
+    when the import or the check raises an exception.
+    """
+    failure = _fmt.ERROR, "Docker/Podman is not installed. You can not run dockerized TIRA submissions."
+    try:
+        from tira.local_execution_integration import LocalExecutionIntegration
+
+        # Explicit check instead of ``assert``: assertions are removed under ``python -O``,
+        # which would make this verification succeed unconditionally.
+        if not LocalExecutionIntegration().docker_is_installed_failsave():
+            return failure
+    except Exception:
+        # Best-effort probe: any failure counts as "not installed". Unlike a bare
+        # ``except``, KeyboardInterrupt and SystemExit still propagate.
+        return failure
+
+    return _fmt.OK, "Docker/Podman is installed."
+
+
+def tira_home_exists():
+    """
+    Check that the cache directory of a default ``Client`` exists.
+
+    Returns a ``(status, message)`` tuple: ``_fmt.OK`` when ``Client().tira_cache_dir``
+    exists, ``_fmt.ERROR`` when it does not or when the import, the client construction
+    or the check raises an exception.
+    """
+    failure = _fmt.ERROR, "TIRA can not write data to disk, ensure that TIRA_CACHE_DIR is writable."
+    try:
+        from tira.rest_api_client import Client
+
+        # Explicit check instead of ``assert``: assertions are removed under ``python -O``.
+        # NOTE(review): only existence is tested here although the messages talk about
+        # writability -- confirm whether a write-permission check is wanted.
+        if not Path(Client().tira_cache_dir).exists():
+            return failure
+    except Exception:
+        # Best-effort probe; KeyboardInterrupt and SystemExit are intentionally not swallowed.
+        return failure
+
+    return _fmt.OK, "TIRA home is writable."
+
+
+def api_key_is_valid():
+    """
+    Check whether a default ``Client`` reports a valid API key.
+
+    Returns a ``(status, message)`` tuple: ``_fmt.OK`` when ``Client().api_key_is_valid()``
+    is truthy, ``_fmt.WARN`` when it is falsy or when the import, the client construction
+    or the check raises an exception.
+    """
+    not_authenticated = _fmt.WARN, "Your TIRA client is not authenticated. Please run 'tira-cli login'."
+    try:
+        from tira.rest_api_client import Client
+
+        # Explicit check instead of ``assert``: assertions are removed under ``python -O``,
+        # which would report every client as authenticated.
+        if not Client().api_key_is_valid():
+            return not_authenticated
+    except Exception:
+        # Best-effort probe; KeyboardInterrupt and SystemExit are intentionally not swallowed.
+        return not_authenticated
+
+    return _fmt.OK, "You are authenticated against www.tira.io."
+
+
+def verify_tirex_tracker():
+    """Return the tirex-tracker status tuple; currently always ``_fmt.OK`` (no check is performed)."""
+    status = _fmt.OK
+    message = "The tirex-tracker works and will track experimental metadata."
+    return status, message
+
+
+def verify_tira_installation():
+    """
+    Run all installation checks in order, log each result, and return the worst status.
+
+    The result is ``_fmt.ERROR`` if any check reported an error, otherwise ``_fmt.WARN``
+    if any check reported a warning, otherwise ``_fmt.OK``.
+    """
+    checks = (api_key_is_valid, tira_home_exists, verify_docker_installation, verify_tirex_tracker)
+    overall = _fmt.OK
+
+    for check in checks:
+        outcome, text = check()
+        log_message(text, outcome)
+
+        if outcome == _fmt.ERROR:
+            overall = _fmt.ERROR
+        elif outcome == _fmt.WARN and overall != _fmt.ERROR:
+            # A warning never downgrades an error that was already recorded.
+            overall = _fmt.WARN
+
+    return overall
+
+
 def parse_jsonl_line(input: Union[str, bytearray, bytes], load_default_text: bool) -> Dict:
     """
     Deseralizes the line using JSON deserialization. Optionally strips the 'original_query' and 'original_document'

diff --git a/python-client/tira/local_execution_integration.py b/python-client/tira/local_execution_integration.py
@@ -75,6 +75,16 @@ def construct_verbosity_output(self, input_dir, output_dir, image, command, orig
             ),
         }
 
+    def build_docker_image(self, path, tag, dockerfile):
+        """
+        Build a docker image by invoking ``docker build`` as a subprocess.
+
+        Args:
+            path: Passed (as string) as the final positional argument of ``docker build``.
+            tag: Passed (as string) to the ``-t`` option.
+            dockerfile: Passed (as string) to the ``-f`` option.
+
+        Raises:
+            ValueError: If ``docker build`` exits with a non-zero return code.
+        """
+        image_build_code = subprocess.call(["docker", "build", "-f", str(dockerfile), "-t", str(tag), str(path)])
+
+        if image_build_code != 0:
+            raise ValueError(
+                f"Building the docker image failed with error code {image_build_code}. See above for details."
+            )
+
+        print("\n\n Image built successfully.\n\n")
+
     def ensure_image_available_locally(self, image, client=None):
         try:
             output = subprocess.check_output(["docker", "images", "-q", image])
@@ -131,7 +141,9 @@ def extract_entrypoint(self, image):
         return self.make_command_absolute(image_name, " ".join(ret))
 
     def make_command_absolute(self, image_name, command):
-        from tira.third_party_integrations import extract_to_be_executed_notebook_from_command_or_none
+        from tira.third_party_integrations import (
+            extract_to_be_executed_notebook_from_command_or_none,
+        )
 
         executable = extract_to_be_executed_notebook_from_command_or_none(command)
 
@@ -153,6 +165,9 @@ def __docker_linux_sockets(self):
 
         return ret + ["/var/run/docker.sock"]
 
+    def docker_is_installed_failsave(self):
+        """Return ``True`` if ``__docker_client()`` yields something other than ``None``, else ``False``."""
+        client = self.__docker_client()
+        return client is not None
+
     def __docker_client(self):
         try:
             environ = os.environ.copy()