From 0b28bccfb0e611cb2457a80fe44db3ed9f02378c Mon Sep 17 00:00:00 2001
From: Assad Yousuf
Date: Wed, 7 May 2025 16:56:20 -0700
Subject: [PATCH] Allow definition of Model Settings for LLMJudge

---
 .../pydantic_evals/evaluators/common.py |  8 ++-
 .../evaluators/llm_as_a_judge.py        | 20 ++++--
 tests/evals/test_evaluator_common.py    | 61 ++++++++++++++++++-
 tests/evals/test_llm_as_a_judge.py      | 59 ++++++++++++++++++
 4 files changed, 140 insertions(+), 8 deletions(-)

diff --git a/pydantic_evals/pydantic_evals/evaluators/common.py b/pydantic_evals/pydantic_evals/evaluators/common.py
index 23aa7c03b..a4e938dfa 100644
--- a/pydantic_evals/pydantic_evals/evaluators/common.py
+++ b/pydantic_evals/pydantic_evals/evaluators/common.py
@@ -5,6 +5,7 @@
 from typing import Any, cast
 
 from pydantic_ai import models
+from pydantic_ai.settings import ModelSettings
 
 from ..otel.span_tree import SpanQuery
 from .context import EvaluatorContext
@@ -164,6 +165,7 @@ class LLMJudge(Evaluator[object, object, object]):
     rubric: str
     model: models.Model | models.KnownModelName | None = None
     include_input: bool = False
+    model_settings: ModelSettings | None = None
 
     async def evaluate(
         self,
@@ -172,11 +174,13 @@ async def evaluate(
         if self.include_input:
             from .llm_as_a_judge import judge_input_output
 
-            grading_output = await judge_input_output(ctx.inputs, ctx.output, self.rubric, self.model)
+            grading_output = await judge_input_output(
+                ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
+            )
         else:
             from .llm_as_a_judge import judge_output
 
-            grading_output = await judge_output(ctx.output, self.rubric, self.model)
+            grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
         return EvaluationReason(value=grading_output.pass_, reason=grading_output.reason)
 
     def build_serialization_arguments(self):
diff --git a/pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py b/pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
index 81f3bcf9d..066728084 100644
--- a/pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
+++ b/pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
@@ -7,6 +7,7 @@
 from pydantic_core import to_json
 
 from pydantic_ai import Agent, models
+from pydantic_ai.settings import ModelSettings
 
 __all__ = ('GradingOutput', 'judge_input_output', 'judge_output', 'set_default_judge_model')
 
@@ -44,7 +45,10 @@ class GradingOutput(BaseModel, populate_by_name=True):
 
 
 async def judge_output(
-    output: Any, rubric: str, model: models.Model | models.KnownModelName | None = None
+    output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on a rubric.
 
@@ -52,7 +56,9 @@ async def judge_output(
     but this can be changed using the `set_default_judge_model` function.
     """
     user_prompt = f'<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
-    return (await _judge_output_agent.run(user_prompt, model=model or _default_model)).output
+    return (
+        await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
+    ).output
 
 
 _judge_input_output_agent = Agent(
@@ -79,7 +85,11 @@ async def judge_output(
 
 
 async def judge_input_output(
-    inputs: Any, output: Any, rubric: str, model: models.Model | models.KnownModelName | None = None
+    inputs: Any,
+    output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the inputs and a rubric.
 
@@ -87,7 +97,9 @@ async def judge_input_output(
     but this can be changed using the `set_default_judge_model` function.
     """
     user_prompt = f'<Input>\n{_stringify(inputs)}\n</Input>\n<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
-    return (await _judge_input_output_agent.run(user_prompt, model=model or _default_model)).output
+    return (
+        await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
+    ).output
 
 
 def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
diff --git a/tests/evals/test_evaluator_common.py b/tests/evals/test_evaluator_common.py
index a3bdb995d..aa15f9f34 100644
--- a/tests/evals/test_evaluator_common.py
+++ b/tests/evals/test_evaluator_common.py
@@ -7,6 +7,8 @@
 from inline_snapshot import snapshot
 from pytest_mock import MockerFixture
 
+from pydantic_ai.settings import ModelSettings
+
 from ..conftest import try_import
 
 with try_import() as imports_successful:
@@ -222,7 +224,7 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
     assert result.value is True
     assert result.reason == 'Test passed'
 
-    mock_judge_output.assert_called_once_with('Hello world', 'Content contains a greeting', None)
+    mock_judge_output.assert_called_once_with('Hello world', 'Content contains a greeting', None, None)
 
     # Test with input
     evaluator = LLMJudge(rubric='Output contains input', include_input=True, model='openai:gpt-4o')
@@ -232,7 +234,7 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
     assert result.reason == 'Test passed'
 
     mock_judge_input_output.assert_called_once_with(
-        {'prompt': 'Hello'}, 'Hello world', 'Output contains input', 'openai:gpt-4o'
+        {'prompt': 'Hello'}, 'Hello world', 'Output contains input', 'openai:gpt-4o', None
     )
 
     # Test with failing result
@@ -244,6 +246,61 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
     assert result.reason == 'Test failed'
 
 
+@pytest.mark.anyio
+async def test_llm_judge_evaluator_with_model_settings(mocker: MockerFixture):
+    """Test LLMJudge evaluator with specific model_settings."""
+    mock_grading_output = mocker.MagicMock()
+    mock_grading_output.pass_ = True
+    mock_grading_output.reason = 'Test passed with settings'
+
+    mock_judge_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_output')
+    mock_judge_output.return_value = mock_grading_output
+
+    mock_judge_input_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_input_output')
+    mock_judge_input_output.return_value = mock_grading_output
+
+    custom_model_settings = ModelSettings(temperature=0.77)
+
+    ctx = EvaluatorContext(
+        name='test_custom_settings',
+        inputs={'prompt': 'Hello Custom'},
+        metadata=None,
+        expected_output=None,
+        output='Hello world custom settings',
+        duration=0.0,
+        _span_tree=SpanTreeRecordingError('spans were not recorded'),
+        attributes={},
+        metrics={},
+    )
+
+    # Test without input, with custom model_settings
+    evaluator_no_input = LLMJudge(rubric='Greeting with custom settings', model_settings=custom_model_settings)
+    result_no_input = await evaluator_no_input.evaluate(ctx)
+    assert result_no_input.value is True
+    assert result_no_input.reason == 'Test passed with settings'
+    mock_judge_output.assert_called_once_with(
+        'Hello world custom settings', 'Greeting with custom settings', None, custom_model_settings
+    )
+
+    # Test with input, with custom model_settings
+    evaluator_with_input = LLMJudge(
+        rubric='Output contains input with custom settings',
+        include_input=True,
+        model='openai:gpt-3.5-turbo',
+        model_settings=custom_model_settings,
+    )
+    result_with_input = await evaluator_with_input.evaluate(ctx)
+    assert result_with_input.value is True
+    assert result_with_input.reason == 'Test passed with settings'
+    mock_judge_input_output.assert_called_once_with(
+        {'prompt': 'Hello Custom'},
+        'Hello world custom settings',
+        'Output contains input with custom settings',
+        'openai:gpt-3.5-turbo',
+        custom_model_settings,
+    )
+
+
 async def test_python():
     """Test Python evaluator."""
     evaluator = Python(expression='ctx.output > 0')
diff --git a/tests/evals/test_llm_as_a_judge.py b/tests/evals/test_llm_as_a_judge.py
index d2a8fc643..7883f765c 100644
--- a/tests/evals/test_llm_as_a_judge.py
+++ b/tests/evals/test_llm_as_a_judge.py
@@ -6,6 +6,7 @@
 from ..conftest import try_import
 
 with try_import() as imports_successful:
+    from pydantic_ai.settings import ModelSettings
     from pydantic_evals.evaluators.llm_as_a_judge import (
         GradingOutput,
         _stringify,  # pyright: ignore[reportPrivateUsage]
@@ -87,6 +88,34 @@ async def test_judge_output_mock(mocker: MockerFixture):
     assert '<Rubric>\nContent contains a greeting\n</Rubric>' in call_args[0]
 
 
+@pytest.mark.anyio
+async def test_judge_output_with_model_settings_mock(mocker: MockerFixture):
+    """Test judge_output function with model_settings and mocked agent."""
+    mock_result = mocker.MagicMock()
+    mock_result.output = GradingOutput(reason='Test passed with settings', pass_=True, score=1.0)
+    mock_run = mocker.patch('pydantic_ai.Agent.run', return_value=mock_result)
+
+    test_model_settings = ModelSettings(temperature=1)
+
+    grading_output = await judge_output(
+        'Hello world settings',
+        'Content contains a greeting with settings',
+        model_settings=test_model_settings,
+    )
+    assert isinstance(grading_output, GradingOutput)
+    assert grading_output.reason == 'Test passed with settings'
+    assert grading_output.pass_ is True
+    assert grading_output.score == 1.0
+
+    mock_run.assert_called_once()
+    call_args, call_kwargs = mock_run.call_args
+    assert '<Output>\nHello world settings\n</Output>' in call_args[0]
+    assert '<Rubric>\nContent contains a greeting with settings\n</Rubric>' in call_args[0]
+    assert call_kwargs['model_settings'] == test_model_settings
+    # Check if 'model' kwarg is passed, its value will be the default model or None
+    assert 'model' in call_kwargs
+
+
 @pytest.mark.anyio
 async def test_judge_input_output_mock(mocker: MockerFixture):
     """Test judge_input_output function with mocked agent."""
@@ -108,3 +137,33 @@ async def test_judge_input_output_mock(mocker: MockerFixture):
     assert '<Input>\nHello\n</Input>' in call_args[0]
     assert '<Output>\nHello world\n</Output>' in call_args[0]
     assert '<Rubric>\nOutput contains input\n</Rubric>' in call_args[0]
+
+
+@pytest.mark.anyio
+async def test_judge_input_output_with_model_settings_mock(mocker: MockerFixture):
+    """Test judge_input_output function with model_settings and mocked agent."""
+    mock_result = mocker.MagicMock()
+    mock_result.output = GradingOutput(reason='Test passed with settings', pass_=True, score=1.0)
+    mock_run = mocker.patch('pydantic_ai.Agent.run', return_value=mock_result)
+
+    test_model_settings = ModelSettings(temperature=1)
+
+    result = await judge_input_output(
+        'Hello settings',
+        'Hello world with settings',
+        'Output contains input with settings',
+        model_settings=test_model_settings,
+    )
+    assert isinstance(result, GradingOutput)
+    assert result.reason == 'Test passed with settings'
+    assert result.pass_ is True
+    assert result.score == 1.0
+
+    mock_run.assert_called_once()
+    call_args, call_kwargs = mock_run.call_args
+    assert '<Input>\nHello settings\n</Input>' in call_args[0]
+    assert '<Output>\nHello world with settings\n</Output>' in call_args[0]
+    assert '<Rubric>\nOutput contains input with settings\n</Rubric>' in call_args[0]
+    assert call_kwargs['model_settings'] == test_model_settings
+    # Check if 'model' kwarg is passed, its value will be the default model or None
+    assert 'model' in call_kwargs
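
For reference, a minimal sketch of how the new `model_settings` parameter could be used once this patch is applied. The model name, rubric text, and settings values below are illustrative placeholders, and the snippet assumes credentials for the chosen model are configured in the environment.

import asyncio

from pydantic_ai.settings import ModelSettings
from pydantic_evals.evaluators.llm_as_a_judge import judge_output


async def main() -> None:
    # model_settings is forwarded to the judge agent's run() call, so sampling
    # can be pinned down (e.g. temperature=0.0) for more repeatable grading.
    grading = await judge_output(
        output='Hello there!',
        rubric='The response contains a polite greeting.',
        model='openai:gpt-4o',
        model_settings=ModelSettings(temperature=0.0, max_tokens=256),
    )
    print(grading.pass_, grading.score, grading.reason)


asyncio.run(main())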
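
A companion sketch of threading `model_settings` through the `LLMJudge` evaluator inside a pydantic_evals `Dataset`. The `Case`/`Dataset` wiring and the toy `answer` task are assumed boilerplate from pydantic_evals rather than part of this patch, and running it likewise requires model credentials.

from pydantic_ai.settings import ModelSettings
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge


async def answer(question: str) -> str:
    # Stand-in task under evaluation; a real task would call an agent or model.
    return 'Paris is the capital of France.'


dataset = Dataset(
    cases=[Case(name='capital', inputs='What is the capital of France?')],
    evaluators=[
        LLMJudge(
            rubric='The answer correctly names the capital of the country in the question.',
            include_input=True,
            model='openai:gpt-4o',
            # New in this patch: forwarded to judge_input_output()/judge_output().
            model_settings=ModelSettings(temperature=0.0),
        )
    ],
)

report = dataset.evaluate_sync(answer)
report.print()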