Allow definition of Model Settings for LLMJudge #1662


Merged
merged 2 commits into from May 13, 2025
8 changes: 6 additions & 2 deletions pydantic_evals/pydantic_evals/evaluators/common.py
@@ -5,6 +5,7 @@
from typing import Any, cast

from pydantic_ai import models
from pydantic_ai.settings import ModelSettings

from ..otel.span_tree import SpanQuery
from .context import EvaluatorContext
@@ -164,6 +165,7 @@ class LLMJudge(Evaluator[object, object, object]):
rubric: str
model: models.Model | models.KnownModelName | None = None
include_input: bool = False
model_settings: ModelSettings | None = None

async def evaluate(
self,
@@ -172,11 +174,13 @@ async def evaluate(
if self.include_input:
from .llm_as_a_judge import judge_input_output

grading_output = await judge_input_output(ctx.inputs, ctx.output, self.rubric, self.model)
grading_output = await judge_input_output(
ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
)
else:
from .llm_as_a_judge import judge_output

grading_output = await judge_output(ctx.output, self.rubric, self.model)
grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
return EvaluationReason(value=grading_output.pass_, reason=grading_output.reason)

def build_serialization_arguments(self):
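Taken together, the new field lets callers tune the judge model without changing the rubric. Below is a minimal, hypothetical usage sketch; the settings values and the `openai:gpt-4o` choice are illustrative assumptions, not part of this diff:

```python
from pydantic_ai.settings import ModelSettings
from pydantic_evals.evaluators import LLMJudge

# Illustrative settings: a deterministic, bounded judge call.
judge_settings = ModelSettings(temperature=0.0, max_tokens=256)

evaluator = LLMJudge(
    rubric='Output contains a polite greeting',
    model='openai:gpt-4o',          # optional; falls back to the default judge model
    include_input=True,             # also show the case inputs to the judge
    model_settings=judge_settings,  # new field introduced by this PR
)
```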
20 changes: 16 additions & 4 deletions pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
@@ -7,6 +7,7 @@
from pydantic_core import to_json

from pydantic_ai import Agent, models
from pydantic_ai.settings import ModelSettings

__all__ = ('GradingOutput', 'judge_input_output', 'judge_output', 'set_default_judge_model')

@@ -44,15 +45,20 @@ class GradingOutput(BaseModel, populate_by_name=True):


async def judge_output(
output: Any, rubric: str, model: models.Model | models.KnownModelName | None = None
output: Any,
rubric: str,
model: models.Model | models.KnownModelName | None = None,
model_settings: ModelSettings | None = None,
) -> GradingOutput:
"""Judge the output of a model based on a rubric.

If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
but this can be changed using the `set_default_judge_model` function.
"""
user_prompt = f'<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
return (await _judge_output_agent.run(user_prompt, model=model or _default_model)).output
return (
await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
).output


_judge_input_output_agent = Agent(
@@ -79,15 +85,21 @@ async def judge_output(


async def judge_input_output(
inputs: Any, output: Any, rubric: str, model: models.Model | models.KnownModelName | None = None
inputs: Any,
output: Any,
rubric: str,
model: models.Model | models.KnownModelName | None = None,
model_settings: ModelSettings | None = None,
) -> GradingOutput:
"""Judge the output of a model based on the inputs and a rubric.

If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
but this can be changed using the `set_default_judge_model` function.
"""
user_prompt = f'<Input>\n{_stringify(inputs)}\n</Input>\n<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
return (await _judge_input_output_agent.run(user_prompt, model=model or _default_model)).output
return (
await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
).output


def set_default_judge_model(model: models.Model | models.KnownModelName) -> None: # pragma: no cover
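The helper functions accept the same parameter directly. A small sketch, assuming a configured model provider (e.g. an OpenAI API key in the environment); the settings values here are illustrative:

```python
import asyncio

from pydantic_ai.settings import ModelSettings
from pydantic_evals.evaluators.llm_as_a_judge import judge_output


async def main() -> None:
    # Illustrative settings; whatever is passed here is forwarded to Agent.run().
    settings = ModelSettings(temperature=0.0)

    grading = await judge_output(
        'Hello world',
        'Content contains a greeting',
        model='openai:gpt-4o',  # optional; defaults to the module-level judge model
        model_settings=settings,
    )
    # GradingOutput exposes pass_, score, and reason.
    print(grading.pass_, grading.score, grading.reason)


asyncio.run(main())
```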
61 changes: 59 additions & 2 deletions tests/evals/test_evaluator_common.py
@@ -7,6 +7,8 @@
from inline_snapshot import snapshot
from pytest_mock import MockerFixture

from pydantic_ai.settings import ModelSettings

from ..conftest import try_import

with try_import() as imports_successful:
@@ -222,7 +224,7 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
assert result.value is True
assert result.reason == 'Test passed'

mock_judge_output.assert_called_once_with('Hello world', 'Content contains a greeting', None)
mock_judge_output.assert_called_once_with('Hello world', 'Content contains a greeting', None, None)

# Test with input
evaluator = LLMJudge(rubric='Output contains input', include_input=True, model='openai:gpt-4o')
@@ -232,7 +234,7 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
assert result.reason == 'Test passed'

mock_judge_input_output.assert_called_once_with(
{'prompt': 'Hello'}, 'Hello world', 'Output contains input', 'openai:gpt-4o'
{'prompt': 'Hello'}, 'Hello world', 'Output contains input', 'openai:gpt-4o', None
)

# Test with failing result
@@ -244,6 +246,61 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
assert result.reason == 'Test failed'


@pytest.mark.anyio
async def test_llm_judge_evaluator_with_model_settings(mocker: MockerFixture):
"""Test LLMJudge evaluator with specific model_settings."""
mock_grading_output = mocker.MagicMock()
mock_grading_output.pass_ = True
mock_grading_output.reason = 'Test passed with settings'

mock_judge_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_output')
mock_judge_output.return_value = mock_grading_output

mock_judge_input_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_input_output')
mock_judge_input_output.return_value = mock_grading_output

custom_model_settings = ModelSettings(temperature=0.77)

ctx = EvaluatorContext(
name='test_custom_settings',
inputs={'prompt': 'Hello Custom'},
metadata=None,
expected_output=None,
output='Hello world custom settings',
duration=0.0,
_span_tree=SpanTreeRecordingError('spans were not recorded'),
attributes={},
metrics={},
)

# Test without input, with custom model_settings
evaluator_no_input = LLMJudge(rubric='Greeting with custom settings', model_settings=custom_model_settings)
result_no_input = await evaluator_no_input.evaluate(ctx)
assert result_no_input.value is True
assert result_no_input.reason == 'Test passed with settings'
mock_judge_output.assert_called_once_with(
'Hello world custom settings', 'Greeting with custom settings', None, custom_model_settings
)

# Test with input, with custom model_settings
evaluator_with_input = LLMJudge(
rubric='Output contains input with custom settings',
include_input=True,
model='openai:gpt-3.5-turbo',
model_settings=custom_model_settings,
)
result_with_input = await evaluator_with_input.evaluate(ctx)
assert result_with_input.value is True
assert result_with_input.reason == 'Test passed with settings'
mock_judge_input_output.assert_called_once_with(
{'prompt': 'Hello Custom'},
'Hello world custom settings',
'Output contains input with custom settings',
'openai:gpt-3.5-turbo',
custom_model_settings,
)


async def test_python():
"""Test Python evaluator."""
evaluator = Python(expression='ctx.output > 0')
59 changes: 59 additions & 0 deletions tests/evals/test_llm_as_a_judge.py
@@ -6,6 +6,7 @@
from ..conftest import try_import

with try_import() as imports_successful:
from pydantic_ai.settings import ModelSettings
from pydantic_evals.evaluators.llm_as_a_judge import (
GradingOutput,
_stringify, # pyright: ignore[reportPrivateUsage]
@@ -87,6 +88,34 @@ async def test_judge_output_mock(mocker: MockerFixture):
assert '<Rubric>\nContent contains a greeting\n</Rubric>' in call_args[0]


@pytest.mark.anyio
async def test_judge_output_with_model_settings_mock(mocker: MockerFixture):
"""Test judge_output function with model_settings and mocked agent."""
mock_result = mocker.MagicMock()
mock_result.output = GradingOutput(reason='Test passed with settings', pass_=True, score=1.0)
mock_run = mocker.patch('pydantic_ai.Agent.run', return_value=mock_result)

test_model_settings = ModelSettings(temperature=1)

grading_output = await judge_output(
'Hello world settings',
'Content contains a greeting with settings',
model_settings=test_model_settings,
)
assert isinstance(grading_output, GradingOutput)
assert grading_output.reason == 'Test passed with settings'
assert grading_output.pass_ is True
assert grading_output.score == 1.0

mock_run.assert_called_once()
call_args, call_kwargs = mock_run.call_args
assert '<Output>\nHello world settings\n</Output>' in call_args[0]
assert '<Rubric>\nContent contains a greeting with settings\n</Rubric>' in call_args[0]
assert call_kwargs['model_settings'] == test_model_settings
# Check if 'model' kwarg is passed, its value will be the default model or None
assert 'model' in call_kwargs


@pytest.mark.anyio
async def test_judge_input_output_mock(mocker: MockerFixture):
"""Test judge_input_output function with mocked agent."""
@@ -108,3 +137,33 @@ async def test_judge_input_output_mock(mocker: MockerFixture):
assert '<Input>\nHello\n</Input>' in call_args[0]
assert '<Output>\nHello world\n</Output>' in call_args[0]
assert '<Rubric>\nOutput contains input\n</Rubric>' in call_args[0]


@pytest.mark.anyio
async def test_judge_input_output_with_model_settings_mock(mocker: MockerFixture):
"""Test judge_input_output function with model_settings and mocked agent."""
mock_result = mocker.MagicMock()
mock_result.output = GradingOutput(reason='Test passed with settings', pass_=True, score=1.0)
mock_run = mocker.patch('pydantic_ai.Agent.run', return_value=mock_result)

test_model_settings = ModelSettings(temperature=1)

result = await judge_input_output(
'Hello settings',
'Hello world with settings',
'Output contains input with settings',
model_settings=test_model_settings,
)
assert isinstance(result, GradingOutput)
assert result.reason == 'Test passed with settings'
assert result.pass_ is True
assert result.score == 1.0

mock_run.assert_called_once()
call_args, call_kwargs = mock_run.call_args
assert '<Input>\nHello settings\n</Input>' in call_args[0]
assert '<Output>\nHello world with settings\n</Output>' in call_args[0]
assert '<Rubric>\nOutput contains input with settings\n</Rubric>' in call_args[0]
assert call_kwargs['model_settings'] == test_model_settings
# Check if 'model' kwarg is passed, its value will be the default model or None
assert 'model' in call_kwargs