push-fix #253

Open · wants to merge 1 commit into main

185 changes: 181 additions & 4 deletions scripts/evaluate_best_checkpoint.py
@@ -10,18 +10,21 @@
# Standard
from pathlib import Path
from typing import Optional
from typing_extensions import Annotated
import json

# Third Party
from rich import print
import typer

app = typer.Typer()


@app.command()
def main(
def best_checkpoint(
input_dir: Path = typer.Argument(..., help="Input directory to process"),
output_file: Optional[Path] = typer.Option(None, help="Optional output file path"),
tasks: Annotated[Optional[list[str]], typer.Option()] = None,
):
"""
Process files in the input directory and optionally save results to an output file.
@@ -54,6 +57,8 @@ def main(
evaluator = LeaderboardV2Evaluator(
model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8
)
if tasks:
evaluator.tasks = tasks
result = evaluator.run()
checkpoint_results[checkpoint.name] = result
typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}")
@@ -63,12 +68,37 @@ def main(
checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True
)
typer.echo("Sorted checkpoints by score:")
for checkpoint_name, result in sorted_checkpoints:
for i, (checkpoint_name, result) in enumerate(sorted_checkpoints):
typer.echo(f"{'=' * 100}")
typer.echo(json.dumps(result, indent=2))
# Add [BEST CHECKPOINT] label for the first checkpoint
if i == 0:
typer.echo(
f"[bold]Leaderboard results[/bold]: {checkpoint_name} [bold green][BEST CHECKPOINT][/bold green]"
)
else:
typer.echo(f"[bold]Leaderboard results[/bold]: {checkpoint_name}")
typer.echo(f"Overall: {result['overall_score'] * 100:.2f}%")
if "leaderboard_bbh" in result:
typer.echo(f"BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
if "leaderboard_gpqa" in result:
typer.echo(f"GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
if "leaderboard_ifeval" in result:
typer.echo(f"IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
if "leaderboard_math_hard" in result:
typer.echo(
f"MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%"
)
if "leaderboard_mmlu_pro" in result:
typer.echo(
f"MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
)
if "leaderboard_musr" in result:
typer.echo(f"MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")

typer.echo(f"{'=' * 100}")
typer.echo(f"Best checkpoint: {sorted_checkpoints[0][0]}")
typer.echo(
f"Best checkpoint: {sorted_checkpoints[0][0]} [bold green][BEST CHECKPOINT][/bold green]"
)

if output_file:
typer.echo(f"Output will be saved to: {output_file}")
@@ -80,5 +110,152 @@ def main(
typer.echo("Processing complete!")


@app.command()
def evaluate(
input_dir: Path = typer.Argument(..., help="Input directory to process"),
tasks: Annotated[Optional[list[str]], typer.Option()] = None,
):
"""
Process files in the input directory and optionally save results to an output file.
"""
if not input_dir.exists():
typer.echo(f"Error: Input directory '{input_dir}' does not exist")
raise typer.Exit(1)

if not input_dir.is_dir():
typer.echo(f"Error: '{input_dir}' is not a directory")
raise typer.Exit(1)

typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
# First Party
from instructlab.eval.leaderboard import LeaderboardV2Evaluator

typer.echo("done")

evaluator = LeaderboardV2Evaluator(
model_path=str(input_dir), num_gpus=8, eval_config={"batch_size": "auto"}
)
if tasks:
evaluator.tasks = tasks
result = evaluator.run()

# now just print out the checkpoint results
print(f"[bold]Leaderboard results[/bold]: {input_dir}")
print(f"Overall: {result['overall_score'] * 100:.2f}%")
if "leaderboard_bbh" in result:
print(f"BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
if "leaderboard_gpqa" in result:
print(f"GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
if "leaderboard_ifeval" in result:
print(f"IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
if "leaderboard_math_hard" in result:
print(f"MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%")
if "leaderboard_mmlu_pro" in result:
print(f"MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%")
if "leaderboard_musr" in result:
print(f"MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")

output_file = input_dir / "leaderboard_results.json"
output_file.write_text(json.dumps(result, indent=2))
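
For context, a minimal sketch of how the leaderboard_results.json written above could be consumed downstream. Only the overall_score key and the per-benchmark entries carrying a "score" field are taken from this diff; the checkpoint path below is purely illustrative.

# Sketch only: reads the JSON emitted by the `evaluate` command above.
# The path is hypothetical; the keys mirror what this script writes
# ("overall_score" plus one "leaderboard_*" entry per subtask with a "score" field).
import json
from pathlib import Path

results = json.loads(Path("training_output/samples_100/leaderboard_results.json").read_text())
print(f"overall: {results['overall_score'] * 100:.2f}%")
for name, entry in results.items():
    if name.startswith("leaderboard_") and isinstance(entry, dict) and "score" in entry:
        print(f"{name}: {entry['score'] * 100:.2f}%")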


@app.command()
def find_best(
input_dir: Path = typer.Argument(..., help="Input directory to process"),
show_all: bool = typer.Option(
False, "--show-all", help="Show scores for all checkpoints"
),
):
"""
Find the best checkpoint by looking through leaderboard_results.json files.
"""
if not input_dir.exists():
typer.echo(f"Error: Input directory '{input_dir}' does not exist")
raise typer.Exit(1)

if not input_dir.is_dir():
typer.echo(f"Error: '{input_dir}' is not a directory")
raise typer.Exit(1)

# Find all leaderboard_results.json files
result_files = list(input_dir.glob("**/leaderboard_results.json"))

if not result_files:
typer.echo("No leaderboard results found in any subdirectories")
raise typer.Exit(1)

# Load and compare results
best_score = -1
best_checkpoint = None
best_results = None
all_results = []

for result_file in result_files:
try:
results = json.loads(result_file.read_text())
score = results.get("overall_score", -1)
all_results.append((result_file.parent, score, results))

if score > best_score:
best_score = score
best_checkpoint = result_file.parent
best_results = results
except Exception as e:
typer.echo(f"Error reading {result_file}: {e}")
continue

if best_checkpoint is None:
typer.echo("No valid results found")
raise typer.Exit(1)

# Sort all results by score
all_results.sort(key=lambda x: x[1], reverse=True)

# Print all results if requested
if show_all:
print("\n[bold]All checkpoint results:[/bold]")
for checkpoint, score, results in all_results:
is_best = checkpoint == best_checkpoint
prefix = "→ " if is_best else " "
print(f"\n{prefix}Checkpoint: {checkpoint}")
print(f" Overall score: {score * 100:.2f}%")
if "leaderboard_bbh" in results:
print(f" BBH: {results['leaderboard_bbh']['score'] * 100:.2f}%")
if "leaderboard_gpqa" in results:
print(f" GPQA: {results['leaderboard_gpqa']['score'] * 100:.2f}%")
if "leaderboard_ifeval" in results:
print(f" IFEval: {results['leaderboard_ifeval']['score'] * 100:.2f}%")
if "leaderboard_math_hard" in results:
print(
f" MATH-Hard: {results['leaderboard_math_hard']['score'] * 100:.2f}%"
)
if "leaderboard_mmlu_pro" in results:
print(
f" MMLU-Pro: {results['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
)
if "leaderboard_musr" in results:
print(f" MUSR: {results['leaderboard_musr']['score'] * 100:.2f}%")
else:
# Print only best results
print(f"\n[bold]Best checkpoint found[/bold]: {best_checkpoint}")
print(f"Overall score: {best_score * 100:.2f}%")
if "leaderboard_bbh" in best_results:
print(f"BBH: {best_results['leaderboard_bbh']['score'] * 100:.2f}%")
if "leaderboard_gpqa" in best_results:
print(f"GPQA: {best_results['leaderboard_gpqa']['score'] * 100:.2f}%")
if "leaderboard_ifeval" in best_results:
print(f"IFEval: {best_results['leaderboard_ifeval']['score'] * 100:.2f}%")
if "leaderboard_math_hard" in best_results:
print(
f"MATH-Hard: {best_results['leaderboard_math_hard']['score'] * 100:.2f}%"
)
if "leaderboard_mmlu_pro" in best_results:
print(
f"MMLU-Pro: {best_results['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
)
if "leaderboard_musr" in best_results:
print(f"MUSR: {best_results['leaderboard_musr']['score'] * 100:.2f}%")


if __name__ == "__main__":
app()
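
Not part of this PR, but as a usage sketch: assuming the script can be imported as evaluate_best_checkpoint and the evaluation backend is available, the three subcommands could be driven through Typer's test runner roughly as follows. Typer exposes underscored function names as dash-separated command names by default; the paths and task names here are assumptions.

# Usage sketch only; module name, paths, and task names are illustrative.
from typer.testing import CliRunner

from evaluate_best_checkpoint import app

runner = CliRunner()

# Evaluate every checkpoint under a training output directory.
runner.invoke(app, ["best-checkpoint", "training_output/"])

# Evaluate a single checkpoint, restricted to two leaderboard subtasks.
runner.invoke(
    app,
    [
        "evaluate",
        "training_output/samples_100",
        "--tasks", "leaderboard_bbh",
        "--tasks", "leaderboard_gpqa",
    ],
)

# Compare checkpoints that already have leaderboard_results.json files.
result = runner.invoke(app, ["find-best", "training_output/", "--show-all"])
print(result.output)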
40 changes: 20 additions & 20 deletions src/instructlab/eval/leaderboard.py
@@ -234,9 +234,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
p.join()

# extract the result which is not None
assert (
len([res for res in results.values() if res is not None]) == 1
), "we expect exactly 1 process to return a results dict properly"
assert len([res for res in results.values() if res is not None]) == 1, (
"we expect exactly 1 process to return a results dict properly"
)
results_dict = [res for res in results.values() if res is not None][0]
return results_dict

Expand All @@ -251,8 +251,8 @@ def get_score_by_metric(score_dict: t.Dict[str, t.Any], metric: str) -> t.Any:
extracted_value = value
break

if not extracted_value:
if alias := score_dict.get("alias", None):
if extracted_value is None:
if alias := score_dict.get("alias", "[no-alias]"):
error_msg = (
f"Failed to find a metric matching '{metric}' for task '{alias}'."
)
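
The switch to an explicit `is None` check above matters because a valid score of 0.0 is falsy and would previously have been treated as missing; the "[no-alias]" default likewise keeps the error message from being skipped for tasks without an alias. A standalone illustration follows (not code from the library; the dict contents are invented).

# Illustration of the falsy-zero pitfall addressed above (dict contents are invented).
score_dict = {"exact_match": 0.0, "alias": "leaderboard_math_hard"}
extracted_value = score_dict.get("exact_match")

if not extracted_value:
    print("old check: a real 0.0 score is misread as missing")
if extracted_value is None:
    print("new check: only fires when the metric is truly absent")
else:
    print(f"score extracted correctly: {extracted_value}")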
@@ -302,9 +302,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
parsed_scores = parse_multitask_results(
result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
)
assert (
len(parsed_scores["subtasks"]) == 24
), "there should be 24 subtasks of bbh run"
assert len(parsed_scores["subtasks"]) == 24, (
"there should be 24 subtasks of bbh run"
)
return parsed_scores


@@ -355,9 +355,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
scores.append(value)
target_metrics.remove(metric)

assert (
len(scores) == 2
), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
assert len(scores) == 2, (
f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
)
return {
"score": sum(scores) / 2,
}
@@ -381,9 +381,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
parsed_scores = parse_multitask_results(
result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
)
assert (
len(parsed_scores["subtasks"]) == 3
), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
assert len(parsed_scores["subtasks"]) == 3, (
f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
)
return parsed_scores


@@ -394,9 +394,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
parsed_scores = parse_multitask_results(
result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
)
assert (
len(parsed_scores["subtasks"]) == 7
), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
assert len(parsed_scores["subtasks"]) == 7, (
f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
)
return parsed_scores


@@ -463,9 +463,9 @@ def get_scores_from_result_dicts(
# this is just a sanity check step
benchmarks_already_covered = set(parsed_scores.keys())
overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
assert (
len(benchmarks_already_covered & benchmarks_to_parse) == 0
), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
)

# now actually add them
for benchmark in benchmarks_to_parse: