add ability to declare dependency in listing.yaml
dragonstyle committed Oct 4, 2024
1 parent f7c063e commit f568542
Showing 4 changed files with 34 additions and 13 deletions.
7 changes: 6 additions & 1 deletion src/inspect_evals/mathematics/README.md
@@ -16,7 +16,12 @@ First, install the inspect_evals Python package with:
pip install git+https://github.com/UKGovernmentBEIS/inspect_evals
```

Then, install the evaluation-specific dependencies:
```bash
pip install inspect_evals[math]@git+https://github.com/UKGovernmentBEIS/inspect_evals
```

Finally, evaluate against one or more models with:
```bash
inspect eval inspect_evals/math --model openai/gpt-4o
```
19 changes: 9 additions & 10 deletions src/inspect_evals/swe_bench/README.md
@@ -13,7 +13,12 @@ First, install the inspect_evals Python package with:
pip install git+https://github.com/UKGovernmentBEIS/inspect_evals
```

Then, install the evaluation-specific dependencies:
```bash
pip install inspect_evals[swe_bench]@git+https://github.com/UKGovernmentBEIS/inspect_evals
```

Finally, evaluate against one or more models with:
```bash
inspect eval inspect_evals/swe_bench --model openai/gpt-4o
```
@@ -27,13 +32,10 @@ ANTHROPIC_API_KEY=<anthropic-api-key>
<!-- /Usage: Automatically Generated -->

>[!NOTE]
>When first running the swe_bench task, it will build the necessary docker images. This can be resource intensive - for the full swe_bench split, up to several hours and ~100GB of storage.
>
>SWE-bench will take a while to run and uses a lot of tokens. If things are too slow, you should increase the level of parallelism - see https://inspect.ai-safety-institute.org.uk/parallelism.html. Note that running too many docker containers on your machine can also cause issues, most notably an 'ALL PREDEFINED ADDRESS POOLS HAVE BEEN FULLY SUBNETTED' error - we don't recommend running more than 32 containers at any one time.

<!-- Options: Automatically Generated -->
## Options
@@ -48,9 +50,6 @@ inspect eval inspect_evals/swe_bench --temperature 0.5
See `inspect eval --help` for all available options.
<!-- /Options: Automatically Generated -->

## Dataset


19 changes: 17 additions & 2 deletions tools/listing.py
@@ -112,15 +112,30 @@ def generate_options(task_metadata: dict[str, Any]) -> None:


def generate_usage(task_metadata: dict[str, Any]) -> None:
    # read the optional per-eval dependency declared in listing.yaml
    dependency = task_metadata.get("dependency")

    contents: list[str] = []
    contents.append("## Usage")
    contents.append("")
    contents.append("First, install the inspect_evals Python package with:")
    contents.append("```bash")
    contents.append("pip install git+https://github.com/UKGovernmentBEIS/inspect_evals")
    contents.append("```")

    if dependency is not None:
        contents.append("")
        contents.append("Then, install the evaluation-specific dependencies:")
        contents.append("```bash")
        contents.append(
            f"pip install inspect_evals[{dependency}]@git+https://github.com/UKGovernmentBEIS/inspect_evals"
        )
        contents.append("```")
        contents.append("")
        contents.append("Finally, evaluate against one or more models with:")
    else:
        contents.append("")
        contents.append("Then, evaluate against one or more models with:")

    contents.append("```bash")
    for index, task in enumerate(task_metadata["tasks"]):
        if index > 3:
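The branching in `generate_usage` can be exercised in isolation. The sketch below mirrors the commit's logic in a standalone helper that returns the rendered text instead of appending to a shared `contents` list; the function name `usage_section`, the `REPO` constant, and the hard-coded `--model openai/gpt-4o` example line are illustrative assumptions, not the repository's actual API:

```python
from typing import Any

REPO = "git+https://github.com/UKGovernmentBEIS/inspect_evals"


def usage_section(task_metadata: dict[str, Any]) -> str:
    """Render a '## Usage' README snippet for one listing.yaml entry."""
    dependency = task_metadata.get("dependency")
    lines = [
        "## Usage",
        "",
        "First, install the inspect_evals Python package with:",
        "```bash",
        f"pip install {REPO}",
        "```",
        "",
    ]
    if dependency is not None:
        # entry declares an extras group: emit the extra install step first
        lines += [
            "Then, install the evaluation-specific dependencies:",
            "```bash",
            f"pip install inspect_evals[{dependency}]@{REPO}",
            "```",
            "",
            "Finally, evaluate against one or more models with:",
        ]
    else:
        lines.append("Then, evaluate against one or more models with:")
    lines.append("```bash")
    for task in task_metadata["tasks"]:
        lines.append(f"inspect eval inspect_evals/{task} --model openai/gpt-4o")
    lines.append("```")
    return "\n".join(lines)
```

With a `dependency` key the snippet gains the extras install step; without one it falls back to the original two-step flow.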
2 changes: 2 additions & 0 deletions tools/listing.yaml
@@ -27,6 +27,7 @@
  group: Coding
  contributors: ["max-kaufmann"]
  tasks: ["swe_bench"]
  dependency: "swe_bench"

- title: "GAIA: A Benchmark for General AI Assistants"
  description: |
@@ -63,6 +64,7 @@
  group: Mathematics
  contributors: ["xeon27"]
  tasks: ["math"]
  dependency: "math"

- title: "GSM8K: Training Verifiers to Solve Math Word Problems"
  description: |
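For `pip install "inspect_evals[swe_bench]@..."` to resolve, each `dependency` key declared above must correspond to an extras group in the package metadata. A hypothetical `pyproject.toml` fragment is sketched below; the extras names must match the `dependency` values in `listing.yaml`, but the package lists shown are placeholders, not the repository's actual requirements:

```toml
# Hypothetical fragment: extras names mirror the `dependency` keys in
# tools/listing.yaml; the packages listed here are illustrative only.
[project.optional-dependencies]
swe_bench = ["swebench", "docker"]
math = ["sympy"]
```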
