fix: Add BRIGHT (long) and fix bug in TaskResult.filter_and_validate() (#2041)

* fix: Add BRIGHT Long

Fixes #1978

* fix: Add BRIGHT(long)

* fix bug in task results

* updated bright

* updated tests for TaskResults
KennethEnevoldsen authored Feb 13, 2025
1 parent 01fd6fb commit 3537223
Showing 4 changed files with 83 additions and 14 deletions.
26 changes: 23 additions & 3 deletions mteb/benchmarks/benchmarks.py
@@ -1060,9 +1060,7 @@

BRIGHT = Benchmark(
name="BRIGHT",
-    tasks=get_tasks(
-        tasks=["BrightRetrieval"],
-    ),
+    tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["standard"]),
description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
BRIGHT is the first text retrieval
benchmark that requires intensive reasoning to retrieve relevant documents with
@@ -1079,6 +1077,28 @@
}""",
)


+BRIGHT_LONG = Benchmark(
+    name="BRIGHT (long)",
+    tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["long"]),
+    description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
+BRIGHT is the first text retrieval
+benchmark that requires intensive reasoning to retrieve relevant documents with
+a dataset consisting of 1,384 real-world queries spanning diverse domains, such as
+economics, psychology, mathematics, and coding. These queries are drawn from
+naturally occurring and carefully curated human data.
+This is the long version of the benchmark, which only includes the longer documents.
+""",
+    reference="https://brightbenchmark.github.io/",
+    citation="""@article{su2024bright,
+title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
+author={Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
+journal={arXiv preprint arXiv:2407.12883},
+year={2024}
+}""",
+)

CODE_RAG = Benchmark(
name="CodeRAG",
tasks=get_tasks(
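For context: the two benchmark entries wrap the same BrightRetrieval task and differ only in the split requested via eval_splits ("standard" vs. "long"). A minimal sketch of how either variant might be run — assuming the usual mteb entry points (get_tasks, get_model, MTEB(...).run) and an arbitrarily chosen placeholder model, so treat it as an illustration rather than the project's documented workflow:

import mteb

# Same underlying task, two split selections (mirrors the two Benchmark definitions above).
standard_tasks = mteb.get_tasks(tasks=["BrightRetrieval"], eval_splits=["standard"])
long_tasks = mteb.get_tasks(tasks=["BrightRetrieval"], eval_splits=["long"])

# Assumed usage: evaluate a placeholder encoder on the long-document variant.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")  # any encoder; chosen arbitrarily
evaluation = mteb.MTEB(tasks=long_tasks)
results = evaluation.run(model, output_folder="results/bright_long")  # list of TaskResult objects

Assuming the benchmark registry picks up the new module-level object, the same selection should also be reachable by name via mteb.get_benchmark("BRIGHT (long)").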
2 changes: 1 addition & 1 deletion mteb/load_results/task_results.py
@@ -525,7 +525,7 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> TaskResult:
if task is None:
task = get_task(self.task_name)

-        splits = task.metadata.eval_splits
+        splits = task.eval_splits
hf_subsets = task.hf_subsets
hf_subsets = set(hf_subsets)

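Why this one-line change matters: a task's effective evaluation splits can be overridden on the instance (the updated tests below set task._eval_splits directly), whereas task.metadata.eval_splits always reflects the registered metadata, so filtering against the metadata silently ignores the override. A minimal sketch of the assumed override-then-fallback pattern, for illustration only rather than mteb's actual implementation:

class TaskSketch:
    """Hypothetical stand-in for an mteb task, for illustration only."""

    def __init__(self, metadata_eval_splits):
        self.metadata_eval_splits = metadata_eval_splits  # stands in for task.metadata.eval_splits
        self._eval_splits = None  # instance-level override, as used in the tests below

    @property
    def eval_splits(self):
        # The override wins when set; otherwise fall back to the metadata default.
        return self._eval_splits if self._eval_splits is not None else self.metadata_eval_splits


task = TaskSketch(metadata_eval_splits=["standard", "long"])
task._eval_splits = ["long"]
assert task.eval_splits == ["long"]  # what the fixed line reads
assert task.metadata_eval_splits == ["standard", "long"]  # what the old line read via the metadata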
1 change: 0 additions & 1 deletion mteb/tasks/Retrieval/eng/BrightRetrieval.py
@@ -50,7 +50,6 @@ class BrightRetrieval(MultilingualTask, AbsTaskRetrieval):
domains=["Non-fiction", "Written"],
task_subtypes=["Article retrieval"],
license="cc-by-4.0",
-        socioeconomic_status="low",
annotations_creators="derived",
dialect=[],
sample_creation="found",
TaskResult test module (file path not shown)
@@ -34,7 +34,6 @@ class DummyTask(AbsTask):
annotations_creators="derived",
dialect=[],
bibtex_citation="",
-        descriptive_stats={},
modalities=["text"],
sample_creation="created",
)
@@ -48,11 +47,11 @@ def _evaluate_subset(self, **kwargs):
def _calculate_metrics_from_split(
self, split: str, hf_subset: str | None = None, compute_overall=False
) -> dict[str, float]:
-        pass
+        return {}


-def test_mteb_results():
-    """Test TaskResult class (this is the same as the example in the docstring)"""
+@pytest.fixture()
+def task_result():
scores = {
"train": {
"en-de": {
@@ -66,13 +65,19 @@ def test_mteb_results():

evaluation_time = 100

-    mteb_results = TaskResult.from_task_results(
+    return TaskResult.from_task_results(
task=DummyTask(), scores=scores, evaluation_time=evaluation_time
)

-    assert mteb_results.get_score() == 0.55
-    assert mteb_results.get_score(languages=["eng"]) == 0.55
-    assert mteb_results.get_score(languages=["fra"]) == 0.6

+def test_task_results_get_score(task_result: TaskResult):
+    """Test TaskResult class (this is the same as the example in the docstring)"""
+    assert task_result.get_score() == 0.55
+    assert task_result.get_score(languages=["eng"]) == 0.55
+    assert task_result.get_score(languages=["fra"]) == 0.6


+def test_task_results_to_dict(task_result: TaskResult):
dict_repr = {
"dataset_revision": "1.0",
"task_name": "dummy_task",
@@ -94,7 +99,52 @@ def test_mteb_results():
]
},
}
-    assert mteb_results.to_dict() == dict_repr
+    assert task_result.to_dict() == dict_repr


+def test_task_results_validate_and_filter():
+    scores = {
+        "train": {
+            "en-de": {
+                "main_score": 0.5,
+            },
+            "en-fr": {
+                "main_score": 0.6,
+            },
+        },
+        "test": {
+            "en-de": {
+                "main_score": 0.3,
+            },
+            "en-fr": {
+                "main_score": 0.4,
+            },
+        },
+    }
+
+    evaluation_time = 100
+
+    res = TaskResult.from_task_results(
+        task=DummyTask(), scores=scores, evaluation_time=evaluation_time
+    )
+
+    task = DummyTask()
+    task._eval_splits = ["train", "test"]
+    res1 = res.validate_and_filter_scores(task=task)
+
+    assert res1.scores.keys() == {"train", "test"}
+    assert res1.get_score() == (0.5 + 0.6 + 0.3 + 0.4) / 4
+
+    task._eval_splits = ["test"]
+    res2 = res.validate_and_filter_scores(task=task)
+    assert res2.scores.keys() == {"test"}
+    assert res2.get_score() == (0.3 + 0.4) / 2  # only test scores
+
+    task.hf_subsets = ["en-de"]
+    task._eval_splits = ["train", "test"]
+    res3 = res.validate_and_filter_scores(task=task)
+    assert res3.scores.keys() == {"train", "test"}
+    assert res3.get_score() == (0.5 + 0.3) / 2  # only en-de scores


@pytest.mark.parametrize(
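The numeric assertions in the new and updated tests are consistent with a simple aggregation of main_score values: average over the retained hf_subsets within each split, then average over the retained splits, which reproduces 0.55 for the train-only fixture as well as (0.5 + 0.6 + 0.3 + 0.4) / 4, (0.3 + 0.4) / 2, and (0.5 + 0.3) / 2 above. A hand-rolled sketch of that aggregation — an assumption about what TaskResult.get_score computes, not its actual code:

from statistics import mean

def sketch_get_score(scores):
    # Mean main_score per split (over its retained hf_subsets), then mean over splits.
    # Flat averaging over all subsets would give the same numbers in these tests,
    # because every retained split keeps the same number of subsets.
    return mean(
        mean(subset["main_score"] for subset in subsets.values())
        for subsets in scores.values()
    )

filtered = {
    "train": {"en-de": {"main_score": 0.5}},
    "test": {"en-de": {"main_score": 0.3}},
}
assert abs(sketch_get_score(filtered) - (0.5 + 0.3) / 2) < 1e-12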
