fix: Adding detailed information on each metric evaluation #1480


Open · wants to merge 1 commit into main
7 changes: 2 additions & 5 deletions src/google/adk/cli/cli_eval.py
@@ -26,6 +26,7 @@

from ..agents import Agent
from ..artifacts.base_artifact_service import BaseArtifactService
from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
from ..evaluation.eval_case import EvalCase
from ..evaluation.eval_metrics import EvalMetric
from ..evaluation.eval_metrics import EvalMetricResult
@@ -38,10 +39,6 @@
logger = logging.getLogger("google_adk." + __name__)


MISSING_EVAL_DEPENDENCIES_MESSAGE = (
"Eval module is not installed, please install via `pip install"
" google-adk[eval]`."
)
TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
RESPONSE_MATCH_SCORE_KEY = "response_match_score"
# This evaluation is not very stable.
@@ -150,7 +147,7 @@ async def run_evals(
artifact_service: The artifact service to use during inferencing.
"""
try:
from ..evaluation.agent_evaluator import EvaluationGenerator
from ..evaluation.evaluation_generator import EvaluationGenerator
except ModuleNotFoundError as e:
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e

2 changes: 1 addition & 1 deletion src/google/adk/cli/cli_tools_click.py
@@ -31,10 +31,10 @@
from . import cli_create
from . import cli_deploy
from .. import version
from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
from ..evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager
from ..sessions.in_memory_session_service import InMemorySessionService
from .cli import run_cli
from .cli_eval import MISSING_EVAL_DEPENDENCIES_MESSAGE
from .fast_api import get_fast_api_app
from .utils import envs
from .utils import logs
109 changes: 99 additions & 10 deletions src/google/adk/evaluation/agent_evaluator.py
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import json
import logging
import os
@@ -23,16 +25,16 @@
from typing import Union
import uuid

from google.genai import types as genai_types
from pydantic import ValidationError

from .constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
from .eval_case import IntermediateData
from .eval_set import EvalSet
from .evaluation_generator import EvaluationGenerator
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import Evaluator
from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
from .response_evaluator import ResponseEvaluator
from .trajectory_evaluator import TrajectoryEvaluator

logger = logging.getLogger("google_adk." + __name__)

@@ -96,6 +98,7 @@ async def evaluate_eval_set(
criteria: dict[str, float],
num_runs=NUM_RUNS,
agent_name=None,
print_detailed_results: bool = True,
):
"""Evaluates an agent using the given EvalSet.

@@ -109,14 +112,22 @@
num_runs: Number of times all entries in the eval dataset should be
assessed.
agent_name: The name of the agent.
print_detailed_results: Whether to print detailed results for each metric
evaluation.
"""
try:
from .evaluation_generator import EvaluationGenerator
except ModuleNotFoundError as e:
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
eval_case_responses_list = await EvaluationGenerator.generate_responses(
eval_set=eval_set,
agent_module_path=agent_module,
repeat_num=num_runs,
agent_name=agent_name,
)

failures = []

for eval_case_responses in eval_case_responses_list:
actual_invocations = [
invocation
@@ -139,10 +150,25 @@
)
)

assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
f"{metric_name} for {agent_module} Failed. Expected {threshold},"
f" but got {evaluation_result.overall_score}."
)
if print_detailed_results:
AgentEvaluator._print_details(
evaluation_result=evaluation_result,
metric_name=metric_name,
threshold=threshold,
)

# Gather all the failures.
if evaluation_result.overall_eval_status != EvalStatus.PASSED:
failures.append(
f"{metric_name} for {agent_module} Failed. Expected {threshold},"
f" but got {evaluation_result.overall_score}."
)

assert not failures, (
"Following are all the test failures. If you are looking to get more"
" details on the failures, then please re-run this test with"
" `print_detailed_results` set to `True`.\n{}".format("\n".join(failures))
)

@staticmethod
async def evaluate(
@@ -158,9 +184,10 @@
agent_module: The path to python module that contains the definition of
the agent. There is convention in place here, where the code is going to
look for 'root_agent' in the loaded module.
eval_dataset_file_path_or_dir: The eval data set. This can be either a string representing
full path to the file containing eval dataset, or a directory that is
recursively explored for all files that have a `.test.json` suffix.
eval_dataset_file_path_or_dir: The eval data set. This can be either a
string representing full path to the file containing eval dataset, or a
directory that is recursively explored for all files that have a
`.test.json` suffix.
num_runs: Number of times all entries in the eval dataset should be
assessed.
agent_name: The name of the agent.
@@ -358,6 +385,11 @@ def _validate_input(eval_dataset, criteria):

@staticmethod
def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
try:
from .response_evaluator import ResponseEvaluator
from .trajectory_evaluator import TrajectoryEvaluator
except ModuleNotFoundError as e:
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
return TrajectoryEvaluator(threshold=threshold)
elif (
@@ -367,3 +399,60 @@ def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
return ResponseEvaluator(threshold=threshold, metric_name=metric_name)

raise ValueError(f"Unsupported eval metric: {metric_name}")

@staticmethod
def _print_details(
evaluation_result: EvaluationResult, metric_name: str, threshold: float
):
try:
from pandas import pandas as pd
from tabulate import tabulate
except ModuleNotFoundError as e:
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
print(
f"Summary: `{evaluation_result.overall_eval_status}` for Metric:"
f" `{metric_name}`. Expected threshold: `{threshold}`, actual value:"
f" `{evaluation_result.overall_score}`."
)

data = []
for per_invocation_result in evaluation_result.per_invocation_results:
data.append({
"eval_status": per_invocation_result.eval_status,
"score": per_invocation_result.score,
"threshold": threshold,
"prompt": AgentEvaluator._convert_content_to_text(
per_invocation_result.expected_invocation.user_content
),
"expected_response": AgentEvaluator._convert_content_to_text(
per_invocation_result.expected_invocation.final_response
),
"actual_response": AgentEvaluator._convert_content_to_text(
per_invocation_result.actual_invocation.final_response
),
"expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
per_invocation_result.expected_invocation.intermediate_data
),
"actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
per_invocation_result.actual_invocation.intermediate_data
),
})

print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="grid"))
print("\n\n") # Few empty lines for visual clarity

@staticmethod
def _convert_content_to_text(content: Optional[genai_types.Content]) -> str:
if content and content.parts:
return "\n".join([p.text for p in content.parts if p.text])

return ""

@staticmethod
def _convert_tool_calls_to_text(
intermediate_data: Optional[IntermediateData],
) -> str:
if intermediate_data and intermediate_data.tool_uses:
return "\n".join([str(t) for t in intermediate_data.tool_uses])

return ""
20 changes: 20 additions & 0 deletions src/google/adk/evaluation/constants.py
@@ -0,0 +1,20 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

MISSING_EVAL_DEPENDENCIES_MESSAGE = (
"Eval module is not installed, please install via `pip install"
" google-adk[eval]`."
)
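
The shared constant is meant to pair with the lazy-import pattern applied elsewhere in this diff. A small sketch of that pattern follows; the helper name is hypothetical, and only the constant and the `ModuleNotFoundError` re-raise mirror the change.

```python
from google.adk.evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE


def _load_tabulate():
  # Eval-only dependencies are imported lazily; a missing install is reported
  # with the shared message instead of a raw ImportError.
  try:
    from tabulate import tabulate
  except ModuleNotFoundError as e:
    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
  return tabulate
```
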
2 changes: 1 addition & 1 deletion src/google/adk/evaluation/evaluation_generator.py
@@ -182,7 +182,7 @@ async def _generate_inferences_from_root_agent(
tool_uses = []
invocation_id = ""

for event in runner.run(
async for event in runner.run_async(
user_id=user_id, session_id=session_id, new_message=user_content
):
invocation_id = (
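
Since `runner.run_async` yields events asynchronously, the enclosing generation code has to consume them with `async for` inside a coroutine. Below is a schematic sketch under that assumption; the event handling shown is illustrative, not the module's exact logic.

```python
async def collect_final_response(runner, user_id, session_id, user_content):
  # Illustrative consumer of the async event stream; the real
  # _generate_inferences_from_root_agent also tracks tool uses and
  # intermediate responses.
  final_text = None
  async for event in runner.run_async(
      user_id=user_id, session_id=session_id, new_message=user_content
  ):
    if event.content and event.content.parts:
      texts = [p.text for p in event.content.parts if p.text]
      if texts:
        final_text = "\n".join(texts)
  return final_text
```
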
13 changes: 0 additions & 13 deletions tests/integration/fixture/trip_planner_agent/initial.session.json

This file was deleted.

133 changes: 115 additions & 18 deletions tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json
@@ -1,19 +1,116 @@
[
{
"query": "Hi, who are you? What can you do?",
"expected_tool_use": [],
"reference": "I am trip_planner, and my goal is to plan the best trip ever. I can describe why a city was chosen, list its top attractions, and provide a detailed itinerary for each day of the trip.\n"
},
{
"query": "I want to travel from San Francisco to an European country in fall next year. I am considering London and Paris. What is your advice?",
"expected_tool_use": [
{
"tool_name": "transfer_to_agent",
"tool_input": {
"agent_name": "indentify_agent"
{
"eval_set_id": "e7996ccc-16bc-46bf-9a24-0a3ecc3dacd7",
"name": "e7996ccc-16bc-46bf-9a24-0a3ecc3dacd7",
"description": null,
"eval_cases": [
{
"eval_id": "/google/src/cloud/ankusharma/CS-agent_evaluator-2025-06-17_115009/google3/third_party/py/google/adk/open_source_workspace/tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json",
"conversation": [
{
"invocation_id": "d7ff8ec1-290b-48c5-b3aa-05cb8f27b8ae",
"user_content": {
"parts": [
{
"video_metadata": null,
"thought": null,
"inline_data": null,
"file_data": null,
"thought_signature": null,
"code_execution_result": null,
"executable_code": null,
"function_call": null,
"function_response": null,
"text": "Hi, who are you? What can you do?"
}
],
"role": "user"
},
"final_response": {
"parts": [
{
"video_metadata": null,
"thought": null,
"inline_data": null,
"file_data": null,
"thought_signature": null,
"code_execution_result": null,
"executable_code": null,
"function_call": null,
"function_response": null,
"text": "I am trip_planner, and my goal is to plan the best trip ever. I can describe why a city was chosen, list its top attractions, and provide a detailed itinerary for each day of the trip.\n"
}
],
"role": "model"
},
"intermediate_data": {
"tool_uses": [],
"intermediate_responses": []
},
"creation_timestamp": 1750190885.419684
},
{
"invocation_id": "f515ff57-ff21-488f-ab92-7d7de5bb76fe",
"user_content": {
"parts": [
{
"video_metadata": null,
"thought": null,
"inline_data": null,
"file_data": null,
"thought_signature": null,
"code_execution_result": null,
"executable_code": null,
"function_call": null,
"function_response": null,
"text": "I want to travel from San Francisco to an European country in fall next year. I am considering London and Paris. What is your advice?"
}
],
"role": "user"
},
"final_response": {
"parts": [
{
"video_metadata": null,
"thought": null,
"inline_data": null,
"file_data": null,
"thought_signature": null,
"code_execution_result": null,
"executable_code": null,
"function_call": null,
"function_response": null,
"text": "Okay, I can help you analyze London and Paris to determine which city is better for your trip next fall. I will consider weather patterns, seasonal events, travel costs (including flights from San Francisco), and your interests (food, shopping, and museums). After gathering this information, I'll provide a detailed report on my chosen city.\n"
}
],
"role": "model"
},
"intermediate_data": {
"tool_uses": [
{
"id": null,
"args": {
"agent_name": "indentify_agent"
},
"name": "transfer_to_agent"
}
],
"intermediate_responses": []
},
"creation_timestamp": 1750190885.4197457
}
}
],
"reference": "Okay, I can help you analyze London and Paris to determine which city is better for your trip next fall. I will consider weather patterns, seasonal events, travel costs (including flights from San Francisco), and your interests (food, shopping, and museums). After gathering this information, I'll provide a detailed report on my chosen city.\n"
}
]
],
"session_input": {
"app_name": "trip_planner_agent",
"user_id": "test_user",
"state": {
"origin": "San Francisco",
"interests": "Food, Shopping, Museums",
"range": "1000 miles",
"cities": ""
}
},
"creation_timestamp": 1750190885.4197533
}
],
"creation_timestamp": 1750190885.4197605
}