push-fix #253

Open · wants to merge 1 commit into main

185 changes: 181 additions & 4 deletions scripts/evaluate_best_checkpoint.py
@@ -10,18 +10,21 @@
# Standard
from pathlib import Path
from typing import Optional
from typing_extensions import Annotated
import json

# Third Party
from rich import print
import typer

app = typer.Typer()


@app.command()
def main(
def best_checkpoint(
input_dir: Path = typer.Argument(..., help="Input directory to process"),
output_file: Optional[Path] = typer.Option(None, help="Optional output file path"),
tasks: Annotated[Optional[list[str]], typer.Option()] = None,
):
"""
Process files in the input directory and optionally save results to an output file.
@@ -54,6 +57,8 @@ def main(
evaluator = LeaderboardV2Evaluator(
model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8
)
if tasks:
evaluator.tasks = tasks
result = evaluator.run()
checkpoint_results[checkpoint.name] = result
typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}")
@@ -63,12 +68,37 @@ def main(
checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True
)
typer.echo("Sorted checkpoints by score:")
for checkpoint_name, result in sorted_checkpoints:
for i, (checkpoint_name, result) in enumerate(sorted_checkpoints):
typer.echo(f"{'=' * 100}")
typer.echo(json.dumps(result, indent=2))
# Add [BEST CHECKPOINT] label for the first checkpoint
if i == 0:
typer.echo(
f"[bold]Leaderboard results[/bold]: {checkpoint_name} [bold green][BEST CHECKPOINT][/bold green]"
)
else:
typer.echo(f"[bold]Leaderboard results[/bold]: {checkpoint_name}")
typer.echo(f"Overall: {result['overall_score'] * 100:.2f}%")
if "leaderboard_bbh" in result:
typer.echo(f"BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
if "leaderboard_gpqa" in result:
typer.echo(f"GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
if "leaderboard_ifeval" in result:
typer.echo(f"IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
if "leaderboard_math_hard" in result:
typer.echo(
f"MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%"
)
if "leaderboard_mmlu_pro" in result:
typer.echo(
f"MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
)
if "leaderboard_musr" in result:
typer.echo(f"MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")

typer.echo(f"{'=' * 100}")
typer.echo(f"Best checkpoint: {sorted_checkpoints[0][0]}")
typer.echo(
f"Best checkpoint: {sorted_checkpoints[0][0]} [bold green][BEST CHECKPOINT][/bold green]"
)

if output_file:
typer.echo(f"Output will be saved to: {output_file}")
@@ -80,5 +110,152 @@ def main(
typer.echo("Processing complete!")


@app.command()
def evaluate(
input_dir: Path = typer.Argument(..., help="Input directory to process"),
tasks: Annotated[Optional[list[str]], typer.Option()] = None,
):
"""
Process files in the input directory and optionally save results to an output file.
"""
if not input_dir.exists():
typer.echo(f"Error: Input directory '{input_dir}' does not exist")
raise typer.Exit(1)

if not input_dir.is_dir():
typer.echo(f"Error: '{input_dir}' is not a directory")
raise typer.Exit(1)

typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
# First Party
from instructlab.eval.leaderboard import LeaderboardV2Evaluator

typer.echo("done")

evaluator = LeaderboardV2Evaluator(
model_path=str(input_dir), num_gpus=8, eval_config={"batch_size": "auto"}
)
if tasks:
evaluator.tasks = tasks
result = evaluator.run()

# now just print out the checkpoint results
print(f"[bold]Leaderboard results[/bold]: {input_dir}")
print(f"Overall: {result['overall_score'] * 100:.2f}%")
if "leaderboard_bbh" in result:
print(f"BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
if "leaderboard_gpqa" in result:
print(f"GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
if "leaderboard_ifeval" in result:
print(f"IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
if "leaderboard_math_hard" in result:
print(f"MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%")
if "leaderboard_mmlu_pro" in result:
print(f"MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%")
if "leaderboard_musr" in result:
print(f"MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")

output_file = input_dir / "leaderboard_results.json"
output_file.write_text(json.dumps(result, indent=2))
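
For context, a minimal sketch of how the leaderboard_results.json written above could be consumed downstream. Only the overall_score key and the per-benchmark entries carrying a "score" field are taken from this diff; the checkpoint path below is purely illustrative.

# Sketch only: reads the JSON emitted by the `evaluate` command above.
# The path is hypothetical; the keys mirror what this script writes
# ("overall_score" plus one "leaderboard_*" entry per subtask with a "score" field).
import json
from pathlib import Path

results = json.loads(Path("training_output/samples_100/leaderboard_results.json").read_text())
print(f"overall: {results['overall_score'] * 100:.2f}%")
for name, entry in results.items():
    if name.startswith("leaderboard_") and isinstance(entry, dict) and "score" in entry:
        print(f"{name}: {entry['score'] * 100:.2f}%")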


@app.command()
def find_best(
input_dir: Path = typer.Argument(..., help="Input directory to process"),
show_all: bool = typer.Option(
False, "--show-all", help="Show scores for all checkpoints"
),
):
"""
Find the best checkpoint by looking through leaderboard_results.json files.
"""
if not input_dir.exists():
typer.echo(f"Error: Input directory '{input_dir}' does not exist")
raise typer.Exit(1)

if not input_dir.is_dir():
typer.echo(f"Error: '{input_dir}' is not a directory")
raise typer.Exit(1)

# Find all leaderboard_results.json files
result_files = list(input_dir.glob("**/leaderboard_results.json"))

if not result_files:
typer.echo("No leaderboard results found in any subdirectories")
raise typer.Exit(1)

# Load and compare results
best_score = -1
best_checkpoint = None
best_results = None
all_results = []

for result_file in result_files:
try:
results = json.loads(result_file.read_text())
score = results.get("overall_score", -1)
all_results.append((result_file.parent, score, results))

if score > best_score:
best_score = score
best_checkpoint = result_file.parent
best_results = results
except Exception as e:
typer.echo(f"Error reading {result_file}: {e}")
continue

if best_checkpoint is None:
typer.echo("No valid results found")
raise typer.Exit(1)

# Sort all results by score
all_results.sort(key=lambda x: x[1], reverse=True)

# Print all results if requested
if show_all:
print("\n[bold]All checkpoint results:[/bold]")
for checkpoint, score, results in all_results:
is_best = checkpoint == best_checkpoint
prefix = "→ " if is_best else " "
print(f"\n{prefix}Checkpoint: {checkpoint}")
print(f" Overall score: {score * 100:.2f}%")
if "leaderboard_bbh" in results:
print(f" BBH: {results['leaderboard_bbh']['score'] * 100:.2f}%")
if "leaderboard_gpqa" in results:
print(f" GPQA: {results['leaderboard_gpqa']['score'] * 100:.2f}%")
if "leaderboard_ifeval" in results:
print(f" IFEval: {results['leaderboard_ifeval']['score'] * 100:.2f}%")
if "leaderboard_math_hard" in results:
print(
f" MATH-Hard: {results['leaderboard_math_hard']['score'] * 100:.2f}%"
)
if "leaderboard_mmlu_pro" in results:
print(
f" MMLU-Pro: {results['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
)
if "leaderboard_musr" in results:
print(f" MUSR: {results['leaderboard_musr']['score'] * 100:.2f}%")
else:
# Print only best results
print(f"\n[bold]Best checkpoint found[/bold]: {best_checkpoint}")
print(f"Overall score: {best_score * 100:.2f}%")
if "leaderboard_bbh" in best_results:
print(f"BBH: {best_results['leaderboard_bbh']['score'] * 100:.2f}%")
if "leaderboard_gpqa" in best_results:
print(f"GPQA: {best_results['leaderboard_gpqa']['score'] * 100:.2f}%")
if "leaderboard_ifeval" in best_results:
print(f"IFEval: {best_results['leaderboard_ifeval']['score'] * 100:.2f}%")
if "leaderboard_math_hard" in best_results:
print(
f"MATH-Hard: {best_results['leaderboard_math_hard']['score'] * 100:.2f}%"
)
if "leaderboard_mmlu_pro" in best_results:
print(
f"MMLU-Pro: {best_results['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
)
if "leaderboard_musr" in best_results:
print(f"MUSR: {best_results['leaderboard_musr']['score'] * 100:.2f}%")


if __name__ == "__main__":
app()
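
Not part of this PR, but as a usage sketch: assuming the script can be imported as evaluate_best_checkpoint and the evaluation backend is available, the three subcommands could be driven through Typer's test runner roughly as follows. Typer exposes underscored function names as dash-separated command names by default; the paths and task names here are assumptions.

# Usage sketch only; module name, paths, and task names are illustrative.
from typer.testing import CliRunner

from evaluate_best_checkpoint import app

runner = CliRunner()

# Evaluate every checkpoint under a training output directory.
runner.invoke(app, ["best-checkpoint", "training_output/"])

# Evaluate a single checkpoint, restricted to two leaderboard subtasks.
runner.invoke(
    app,
    [
        "evaluate",
        "training_output/samples_100",
        "--tasks", "leaderboard_bbh",
        "--tasks", "leaderboard_gpqa",
    ],
)

# Compare checkpoints that already have leaderboard_results.json files.
result = runner.invoke(app, ["find-best", "training_output/", "--show-all"])
print(result.output)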
40 changes: 20 additions & 20 deletions src/instructlab/eval/leaderboard.py
@@ -234,9 +234,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
p.join()

# extract the result which is not None
assert (
len([res for res in results.values() if res is not None]) == 1
), "we expect exactly 1 process to return a results dict properly"
assert len([res for res in results.values() if res is not None]) == 1, (
"we expect exactly 1 process to return a results dict properly"
)
results_dict = [res for res in results.values() if res is not None][0]
return results_dict

Expand All @@ -251,8 +251,8 @@ def get_score_by_metric(score_dict: t.Dict[str, t.Any], metric: str) -> t.Any:
extracted_value = value
break

if not extracted_value:
if alias := score_dict.get("alias", None):
if extracted_value is None:
if alias := score_dict.get("alias", "[no-alias]"):
error_msg = (
f"Failed to find a metric matching '{metric}' for task '{alias}'."
)
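
The switch to an explicit `is None` check above matters because a valid score of 0.0 is falsy and would previously have been treated as missing; the "[no-alias]" default likewise keeps the error message from being skipped for tasks without an alias. A standalone illustration follows (not code from the library; the dict contents are invented).

# Illustration of the falsy-zero pitfall addressed above (dict contents are invented).
score_dict = {"exact_match": 0.0, "alias": "leaderboard_math_hard"}
extracted_value = score_dict.get("exact_match")

if not extracted_value:
    print("old check: a real 0.0 score is misread as missing")
if extracted_value is None:
    print("new check: only fires when the metric is truly absent")
else:
    print(f"score extracted correctly: {extracted_value}")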
@@ -302,9 +302,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
parsed_scores = parse_multitask_results(
result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
)
assert (
len(parsed_scores["subtasks"]) == 24
), "there should be 24 subtasks of bbh run"
assert len(parsed_scores["subtasks"]) == 24, (
"there should be 24 subtasks of bbh run"
)
return parsed_scores


@@ -355,9 +355,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
scores.append(value)
target_metrics.remove(metric)

assert (
len(scores) == 2
), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
assert len(scores) == 2, (
f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
)
return {
"score": sum(scores) / 2,
}
@@ -381,9 +381,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
parsed_scores = parse_multitask_results(
result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
)
assert (
len(parsed_scores["subtasks"]) == 3
), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
assert len(parsed_scores["subtasks"]) == 3, (
f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
)
return parsed_scores


@@ -394,9 +394,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
parsed_scores = parse_multitask_results(
result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
)
assert (
len(parsed_scores["subtasks"]) == 7
), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
assert len(parsed_scores["subtasks"]) == 7, (
f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
)
return parsed_scores


@@ -463,9 +463,9 @@ def get_scores_from_result_dicts(
# this is just a sanity check step
benchmarks_already_covered = set(parsed_scores.keys())
overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
assert (
len(benchmarks_already_covered & benchmarks_to_parse) == 0
), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
)

# now actually add them
for benchmark in benchmarks_to_parse: