Add HumanEval support via a new "simple" (function-level) dataset type

wenting-zhao · wenting-zhao · commit 38205e61dc2e · 2024-12-02T01:02:02.000Z
diff --git a/commit0/cli.py b/commit0/cli.py
@@ -118,7 +118,7 @@ def setup(
 ) -> None:
     """Commit0 clone a repo split."""
     check_commit0_path()
-    if "commit0" in dataset_name.lower():
+    if "commit0" in dataset_name.split('/')[-1].lower():
         check_valid(repo_split, SPLIT)
 
     base_dir = str(Path(base_dir).resolve())
@@ -169,7 +169,7 @@ def build(
     check_commit0_path()
 
     commit0_config = read_commit0_config_file(commit0_config_file)
-    if "commit0" in commit0_config["dataset_name"].lower():
+    if "commit0" in commit0_config["dataset_name"].split('/')[-1].lower():
         check_valid(commit0_config["repo_split"], SPLIT)
 
     typer.echo(
@@ -251,16 +251,20 @@ def test(
     commit0_config = read_commit0_config_file(commit0_config_file)
     if repo_or_repo_path.endswith("/"):
         repo_or_repo_path = repo_or_repo_path[:-1]
-    if "commit0" in commit0_config["dataset_name"].lower():
+    if "commit0" in commit0_config["dataset_name"].split('/')[-1].lower():
         check_valid(repo_or_repo_path.split("/")[-1], SPLIT)
 
     if reference:
         branch = "reference"
-    if branch is None and not reference:
-        git_path = os.path.join(
-            commit0_config["base_dir"], repo_or_repo_path.split("/")[-1]
-        )
-        branch = get_active_branch(git_path)
+    else:
+        if "humaneval" not in commit0_config["dataset_name"].split('/')[-1].lower():
+            if branch is None and not reference:
+                git_path = os.path.join(
+                    commit0_config["base_dir"], repo_or_repo_path.split("/")[-1]
+                )
+                branch = get_active_branch(git_path)
+        else:
+            branch = test_ids
 
     if stdin:
         # Read test names from stdin
@@ -317,7 +321,7 @@ def evaluate(
         branch = "reference"
 
     commit0_config = read_commit0_config_file(commit0_config_file)
-    if "commit0" in commit0_config["dataset_name"].lower():
+    if "commit0" in commit0_config["dataset_name"].split('/')[-1].lower():
         check_valid(commit0_config["repo_split"], SPLIT)
 
     typer.echo(f"Evaluating repository split: {commit0_config['repo_split']}")
@@ -393,7 +397,7 @@ def save(
     """Save Commit0 split you choose in Setup Stage to GitHub."""
     check_commit0_path()
     commit0_config = read_commit0_config_file(commit0_config_file)
-    if "commit0" in commit0_config["dataset_name"].lower():
+    if "commit0" in commit0_config["dataset_name"].split('/')[-1].lower():
         check_valid(commit0_config["repo_split"], SPLIT)
 
     typer.echo(f"Saving repository split: {commit0_config['repo_split']}")
diff --git a/commit0/harness/build.py b/commit0/harness/build.py
@@ -4,7 +4,7 @@
 from datasets import load_dataset
 from typing import Iterator
 
-from commit0.harness.constants import RepoInstance, SPLIT
+from commit0.harness.constants import RepoInstance, SimpleInstance, SPLIT
 from commit0.harness.docker_build import build_repo_images
 from commit0.harness.spec import make_spec
 
@@ -17,23 +17,25 @@
 def main(
     dataset_name: str,
     dataset_split: str,
-    repo_split: str,
+    split: str,
     num_workers: int,
     verbose: int,
 ) -> None:
-    dataset: Iterator[RepoInstance] = load_dataset(dataset_name, split=dataset_split)  # type: ignore
+    dataset: Iterator[Union[RepoInstance, SimpleInstance]] = load_dataset(dataset_name, split=dataset_split)  # type: ignore
     specs = []
     if "swe" in dataset_name.lower():
         dataset_type = "swebench"
+    elif "humaneval" in dataset_name.lower():
+        dataset_type = "simple"
     else:
         dataset_type = "commit0"
     for example in dataset:
-        repo_name = example["repo"].split("/")[-1]
-        if "swe" in dataset_name.lower():
-            if repo_split != "all" and repo_split not in example["instance_id"]:
+        if "swe" in dataset_name.lower() or dataset_type == "simple":
+            if split != "all" and split not in example["instance_id"]:
                 continue
         else:
-            if repo_split != "all" and repo_name not in SPLIT[repo_split]:
+            repo_name = example["repo"].split("/")[-1]
+            if split != "all" and repo_name not in SPLIT[split]:
                 continue
         spec = make_spec(example, dataset_type)
         specs.append(spec)
diff --git a/commit0/harness/constants.py b/commit0/harness/constants.py
@@ -13,6 +13,14 @@ class RepoInstance(TypedDict):
     src_dir: str
 
 
+class SimpleInstance(TypedDict):  # one example of a function-level ("simple") dataset such as HumanEval
+    instance_id: str  # example identifier; matched against the requested split / repo name
+    prompt: str  # prepended to the reference solution (and to a submission with no ```python fence)
+    canonical_solution: str  # appended to `prompt` when the "reference" branch is evaluated
+    test: str  # test code appended after the solution in the assembled program
+    entry_point: str  # name passed to `check(...)` at the end of the assembled program
+
+
 class Files(TypedDict):
     eval_script: Dict[str, Path]
     patch: Dict[str, Path]
diff --git a/commit0/harness/execution_context.py b/commit0/harness/execution_context.py
@@ -102,7 +102,7 @@ def __init__(
         self.client = docker.from_env()
         self.container = create_container(
             client=self.client,
-            image_name=spec.repo_image_tag,
+            image_name=spec.repo_image_key,
             container_name=spec.get_container_name(),
             nano_cpus=num_cpus,
             logger=logger,
diff --git a/commit0/harness/run_pytest_ids.py b/commit0/harness/run_pytest_ids.py
@@ -1,5 +1,6 @@
 import git
 import os
+import re
 import sys
 import traceback
 from datasets import load_dataset
@@ -11,6 +12,7 @@
     Files,
     RUN_PYTEST_LOG_DIR,
     RepoInstance,
+    SimpleInstance,
 )
 from commit0.harness.spec import make_spec
 from commit0.harness.utils import (
@@ -46,7 +48,7 @@ def main(
     Tests are run either locally through docker
     or remotely through Modal.
     """
-    dataset: Iterator[RepoInstance] = load_dataset(dataset_name, split=dataset_split)  # type: ignore
+    dataset: Iterator[Union[RepoInstance, SimpleInstance]] = load_dataset(dataset_name, split=dataset_split)  # type: ignore
     spec = None
     example = None
     repo_name = None
@@ -56,10 +58,13 @@ def main(
         if "swe" in dataset_name.lower():
             repo_name = example["instance_id"]
             dataset_type = "swebench"
+        elif "humaneval" in dataset_name.lower():
+            repo_name = example["instance_id"]
+            dataset_type = "simple"
         else:
             repo_name = example["repo"].split("/")[-1]
             dataset_type = "commit0"
-        if repo_name in os.path.basename(repo_or_repo_dir):
+        if repo_name in os.path.basename(repo_or_repo_dir) or repo_or_repo_dir.endswith(repo_name):
             spec = make_spec(example, dataset_type)
             break
     assert spec is not None, "No spec available"
@@ -73,46 +78,61 @@ def main(
     log_file = log_dir / "run_pytest.log"
     logger = setup_logger(repo_name, log_file, verbose=verbose)
 
-    try:
-        local_repo = git.Repo(repo_or_repo_dir)
-        logger.info(f"Loaded a git repo from {repo_or_repo_dir}")
-    except (git.exc.NoSuchPathError, git.exc.InvalidGitRepositoryError):  # type: ignore
-        repo_dir = os.path.join(base_dir, repo_name)
-        logger.error(f"{repo_or_repo_dir} is not a git dir, trying {repo_dir} again")
+    if dataset_type != "simple":  # if dataset_type is not simple, load git repo
         try:
-            local_repo = git.Repo(repo_dir)
-            logger.info(f"Retried succeeded. Loaded a git repo from {repo_dir}")
-        except git.exc.NoSuchPathError:  # type: ignore
-            raise Exception(
-                f"{repo_dir} and {repo_or_repo_dir} are not git directories.\nUsage: commit0 test {{repo_dir}} {{branch}} {{test_ids}}"
-            )
-        except Exception as e:
-            raise e
-    commit_id = ""
-    if branch == "reference":
-        commit_id = example["reference_commit"]
-    else:
-        # Check if it's a local branch
-        if branch in local_repo.branches:
-            commit_id = local_repo.commit(branch).hexsha
+            local_repo = git.Repo(repo_or_repo_dir)
+            logger.info(f"Loaded a git repo from {repo_or_repo_dir}")
+        except (git.exc.NoSuchPathError, git.exc.InvalidGitRepositoryError):  # type: ignore
+            repo_dir = os.path.join(base_dir, repo_name)
+            logger.error(f"{repo_or_repo_dir} is not a git dir, trying {repo_dir} again")
+            try:
+                local_repo = git.Repo(repo_dir)
+                logger.info(f"Retried succeeded. Loaded a git repo from {repo_dir}")
+            except git.exc.NoSuchPathError:  # type: ignore
+                raise Exception(
+                    f"{repo_dir} and {repo_or_repo_dir} are not git directories.\nUsage: commit0 test {{repo_dir}} {{branch}} {{test_ids}}"
+                )
+            except Exception as e:
+                raise e
+        commit_id = ""
+        if branch == "reference":
+            commit_id = example["reference_commit"]
         else:
-            found_remote_branch = False
-            for remote in local_repo.remotes:
-                remote.fetch()  # Fetch latest updates from each remote
+            # Check if it's a local branch
+            if branch in local_repo.branches:
+                commit_id = local_repo.commit(branch).hexsha
+            else:
+                found_remote_branch = False
+                for remote in local_repo.remotes:
+                    remote.fetch()  # Fetch latest updates from each remote
 
-                # Check if the branch exists in this remote
-                for ref in remote.refs:
-                    if (
-                        ref.remote_head == branch
-                    ):  # Compare branch name without remote prefix
-                        commit_id = local_repo.commit(ref.name).hexsha
-                        found_remote_branch = True
-                        break  # Branch found, no need to keep checking this remote
-                if found_remote_branch:
-                    break  # Stop checking other remotes if branch is found
-            if not found_remote_branch:
-                raise Exception(f"Branch {branch} does not exist locally or remotely.")
-    if "swe" in dataset_name.lower():
+                    # Check if the branch exists in this remote
+                    for ref in remote.refs:
+                        if (
+                            ref.remote_head == branch
+                        ):  # Compare branch name without remote prefix
+                            commit_id = local_repo.commit(ref.name).hexsha
+                            found_remote_branch = True
+                            break  # Branch found, no need to keep checking this remote
+                    if found_remote_branch:
+                        break  # Stop checking other remotes if branch is found
+                if not found_remote_branch:
+                    raise Exception(f"Branch {branch} does not exist locally or remotely.")
+    if dataset_type == "simple":  # build a single runnable program instead of a git patch
+        if branch == "reference":
+            patch = example["prompt"] + "\n\n" + example["canonical_solution"] + "\n\n" + example["test"]
+        else:
+            solution = Path(test_ids).read_text()  # test_ids is the solution file path here; read_text closes the handle
+            pattern = r"```python\n(.*?)```"
+            fenced = re.finditer(pattern, solution, re.DOTALL)
+            code_blocks = [match.group(1).strip() for match in fenced]
+            if code_blocks:  # keep only the fenced code when the file contains ```python blocks
+                solution = "\n\n".join(code_blocks)
+            else:  # no fence: treat the file as a bare completion of the prompt
+                solution = example["prompt"] + "\n\n" + solution
+            patch = solution + "\n\n" + example["test"]
+        patch = patch + "\n\n" + f"check({example['entry_point']})"
+    elif "swe" in dataset_name.lower():
         if branch == "reference":
             patch = example["test"]["patch"] + "\n\n" + example["test"]["test_patch"]
         else:
@@ -127,12 +147,15 @@ def main(
     patch_file = Path(log_dir / "patch.diff")
     patch_file.write_text(patch, encoding="utf-8", errors="ignore")
 
-    # make eval file
-    if coverage:
-        coverage_text = f" --cov={example['src_dir']} --cov-branch --cov-report json"
+    if dataset_type != "simple":
+        # make eval file
+        if coverage:
+            coverage_text = f" --cov={example['src_dir']} --cov-branch --cov-report json"
+        else:
+            coverage_text = ""
+        eval_script = spec.eval_script.format(test_ids=test_ids, coverage=coverage_text)
     else:
-        coverage_text = ""
-    eval_script = spec.eval_script.format(test_ids=test_ids, coverage=coverage_text)
+        eval_script = spec.eval_script
     eval_file = Path(log_dir / "eval.sh")
     eval_file.write_text(eval_script)
 
diff --git a/commit0/harness/setup.py b/commit0/harness/setup.py
@@ -23,6 +23,8 @@ def main(
     base_dir: str,
 ) -> None:
     dataset: Iterator[RepoInstance] = load_dataset(dataset_name, split=dataset_split)  # type: ignore
+    if "humaneval" in dataset_name.lower():
+        return
     for example in dataset:
         repo_name = example["repo"].split("/")[-1]
         clone_url = f"https://github.com/{example['repo']}.git"
diff --git a/commit0/harness/spec.py b/commit0/harness/spec.py
@@ -5,6 +5,7 @@
 
 from commit0.harness.constants import (
     RepoInstance,
+    SimpleInstance,
 )
 from commit0.harness.dockerfiles import (
     get_dockerfile_base,
@@ -19,7 +20,7 @@ class Spec(ABC):
     repo: str
     # repo dir on docker
     repo_directory: str
-    instance: RepoInstance
+    instance: Union[RepoInstance, SimpleInstance]
 
     @property
     def setup_script(self) -> str:
@@ -175,6 +176,31 @@ def make_eval_script_list(self) -> list[str]:
         return eval_script_list
 
 
+class SimpleSpec(Spec):  # spec used when dataset_type is "simple" (e.g. HumanEval)
+    def make_repo_script_list(self) -> list[str]:
+        """Create a list of bash commands to set up the repository for testing.
+        This is the setup script for the instance image.
+        """
+        setup_commands = [
+            f"mkdir {self.repo_directory} && cd {self.repo_directory}",
+            "uv venv --python 3.12",
+            "source .venv/bin/activate",
+            "which python",
+        ]
+        return setup_commands
+
+    def make_eval_script_list(self) -> list[str]:
+        """Return bash commands that run /patch.diff as test.py and record its output and exit code."""
+        eval_script_list = [
+            f"cd {self.repo_directory}",
+            "source .venv/bin/activate",
+            "cat /patch.diff > test.py",
+            "uv run test.py > test_output.txt 2>&1",
+            "echo $? > pytest_exit_code.txt",
+        ]
+        return eval_script_list
+
+
 class SWEBenchSpec(Spec):
     def make_repo_script_list(self) -> list[str]:
         """Create a list of bash commands to set up the repository for testing.
@@ -277,7 +303,7 @@ def make_eval_script_list(self) -> list[str]:
 
 
 def get_specs_from_dataset(
-    dataset: Union[list[RepoInstance], list[Spec]], dataset_type: str
+    dataset: Union[list[Union[RepoInstance, SimpleInstance]], list[Spec]], dataset_type: str
 ) -> list[Spec]:
     """Idempotent function that converts a list of RepoInstance objects to a list of Spec objects."""
     if isinstance(dataset[0], Spec):
@@ -290,7 +316,7 @@ def get_specs_from_dataset(
     )
 
 
-def make_spec(instance: RepoInstance, dataset_type: str) -> Spec:
+def make_spec(instance: Union[RepoInstance, SimpleInstance], dataset_type: str) -> Spec:
     if isinstance(instance, Spec):
         return instance
     repo_directory = "/testbed"
@@ -306,6 +332,12 @@ def make_spec(instance: RepoInstance, dataset_type: str) -> Spec:
             repo_directory=repo_directory,
             instance=instance,
         )
+    elif dataset_type == "simple":
+        return SimpleSpec(
+            repo="simple",  # all benchmarks with mere function writing will share the simple docker image
+            repo_directory=repo_directory,
+            instance=instance,
+        )
     else:
         raise NotImplementedError(
             f"{dataset_type} is not supported.\nWe only support commit0 and swebench instances for now."