From 61671df0e7ad9eb9529e1dfb52507f764a052264 Mon Sep 17 00:00:00 2001 From: Giulio Leone Date: Thu, 9 Apr 2026 02:51:32 +0200 Subject: [PATCH 1/2] fix(eval): make _EvalMetricResultWithInvocation.expected_invocation optional conversation_scenario eval cases intentionally pass expected_invocation=None from local_eval_service (matching the public EvalMetricResultPerInvocation model), but the private _EvalMetricResultWithInvocation required a non-None Invocation, causing a pydantic ValidationError. Changes: - Make expected_invocation Optional[Invocation] with default None - Guard attribute access in _print_details when expected_invocation is None Fixes google/adk-python#5214 --- src/google/adk/evaluation/agent_evaluator.py | 8 +- ...test_eval_metric_result_with_invocation.py | 167 ++++++++++++++++++ 2 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 tests/unittests/evaluation/test_eval_metric_result_with_invocation.py diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index c0fc736340..cd05318fbd 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -90,7 +90,7 @@ class _EvalMetricResultWithInvocation(BaseModel): """ actual_invocation: Invocation - expected_invocation: Invocation + expected_invocation: Optional[Invocation] = None eval_metric_result: EvalMetricResult @@ -438,15 +438,21 @@ def _print_details( "threshold": threshold, "prompt": AgentEvaluator._convert_content_to_text( per_invocation_result.expected_invocation.user_content + if per_invocation_result.expected_invocation + else None ), "expected_response": AgentEvaluator._convert_content_to_text( per_invocation_result.expected_invocation.final_response + if per_invocation_result.expected_invocation + else None ), "actual_response": AgentEvaluator._convert_content_to_text( per_invocation_result.actual_invocation.final_response ), "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text( per_invocation_result.expected_invocation.intermediate_data + if per_invocation_result.expected_invocation + else None ), "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text( per_invocation_result.actual_invocation.intermediate_data diff --git a/tests/unittests/evaluation/test_eval_metric_result_with_invocation.py b/tests/unittests/evaluation/test_eval_metric_result_with_invocation.py new file mode 100644 index 0000000000..0cc3c1de08 --- /dev/null +++ b/tests/unittests/evaluation/test_eval_metric_result_with_invocation.py @@ -0,0 +1,167 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Regression tests for _EvalMetricResultWithInvocation None-handling. + +Covers the bug described in https://github.com/google/adk-python/issues/5214 +where passing expected_invocation=None (the normal path for +conversation_scenario eval cases) caused a pydantic ValidationError. +""" + +from __future__ import annotations + +from unittest.mock import patch + +import pytest + +from google.genai import types as genai_types + +from google.adk.evaluation.agent_evaluator import AgentEvaluator +from google.adk.evaluation.agent_evaluator import ( + _EvalMetricResultWithInvocation, +) +from google.adk.evaluation.eval_case import Invocation +from google.adk.evaluation.eval_metrics import EvalMetricResult +from google.adk.evaluation.eval_metrics import EvalMetricResultPerInvocation +from google.adk.evaluation.eval_metrics import EvalStatus +from google.adk.evaluation.eval_result import EvalCaseResult + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_invocation(**overrides) -> Invocation: + """Return a minimal Invocation instance.""" + defaults = { + "user_content": genai_types.Content( + role="user", parts=[genai_types.Part(text="hello")] + ), + } + defaults.update(overrides) + return Invocation(**defaults) + + +def _make_eval_metric_result( + metric_name: str = "test_metric", + score: float = 1.0, + status: EvalStatus = EvalStatus.PASSED, +) -> EvalMetricResult: + return EvalMetricResult( + metric_name=metric_name, + score=score, + eval_status=status, + ) + + +# --------------------------------------------------------------------------- +# Tests: _EvalMetricResultWithInvocation accepts None +# --------------------------------------------------------------------------- + +class TestEvalMetricResultWithInvocationNone: + """Regression: expected_invocation=None must be accepted (issue #5214).""" + + def test_construction_with_none_expected_invocation(self): + """_EvalMetricResultWithInvocation should accept None for expected_invocation.""" + result = _EvalMetricResultWithInvocation( + actual_invocation=_make_invocation(), + expected_invocation=None, + eval_metric_result=_make_eval_metric_result(), + ) + assert result.expected_invocation is None + + def test_construction_with_omitted_expected_invocation(self): + """expected_invocation should default to None when omitted.""" + result = _EvalMetricResultWithInvocation( + actual_invocation=_make_invocation(), + eval_metric_result=_make_eval_metric_result(), + ) + assert result.expected_invocation is None + + def test_construction_with_real_expected_invocation(self): + """Normal case: providing a real Invocation should still work.""" + inv = _make_invocation() + result = _EvalMetricResultWithInvocation( + actual_invocation=_make_invocation(), + expected_invocation=inv, + eval_metric_result=_make_eval_metric_result(), + ) + assert result.expected_invocation is inv + + +# --------------------------------------------------------------------------- +# Tests: _get_eval_metric_results_with_invocation passes None through +# --------------------------------------------------------------------------- + +class TestGetEvalMetricResultsWithNone: + """_get_eval_metric_results_with_invocation must propagate None.""" + + def test_none_expected_invocation_propagated(self): + actual = _make_invocation() + metric_result = _make_eval_metric_result(metric_name="m1") + + eval_case_result = EvalCaseResult( + eval_set_id="test_set", + eval_id="scenario_1", + final_eval_status=EvalStatus.PASSED, + overall_eval_metric_results=[metric_result], + eval_metric_result_per_invocation=[ + EvalMetricResultPerInvocation( + actual_invocation=actual, + expected_invocation=None, + eval_metric_results=[metric_result], + ) + ], + session_id="sess-1", + ) + + grouped = AgentEvaluator._get_eval_metric_results_with_invocation( + [eval_case_result] + ) + + assert "m1" in grouped + assert len(grouped["m1"]) == 1 + assert grouped["m1"][0].expected_invocation is None + assert grouped["m1"][0].actual_invocation is actual + + +# --------------------------------------------------------------------------- +# Tests: _print_details does not crash when expected_invocation is None +# --------------------------------------------------------------------------- + +class TestPrintDetailsNoneExpected: + """_print_details must handle None expected_invocation gracefully.""" + + def test_print_details_with_none_expected(self): + actual = _make_invocation() + metric_result = _make_eval_metric_result(score=0.9) + + items = [ + _EvalMetricResultWithInvocation( + actual_invocation=actual, + expected_invocation=None, + eval_metric_result=metric_result, + ) + ] + + # _print_details prints to stdout via tabulate/pandas — we just + # verify it doesn't raise. + with patch("builtins.print"): + AgentEvaluator._print_details( + eval_metric_result_with_invocations=items, + overall_eval_status=EvalStatus.PASSED, + overall_score=0.9, + metric_name="test_metric", + threshold=0.5, + ) From 974ac70cb5b863daaf33f5f0165c6c525cc1285c Mon Sep 17 00:00:00 2001 From: Giulio Leone Date: Tue, 14 Apr 2026 03:44:08 +0200 Subject: [PATCH 2/2] style(eval): fix pyink and isort formatting --- ...test_eval_metric_result_with_invocation.py | 193 +++++++++--------- 1 file changed, 96 insertions(+), 97 deletions(-) diff --git a/tests/unittests/evaluation/test_eval_metric_result_with_invocation.py b/tests/unittests/evaluation/test_eval_metric_result_with_invocation.py index 0cc3c1de08..9fff1b83a2 100644 --- a/tests/unittests/evaluation/test_eval_metric_result_with_invocation.py +++ b/tests/unittests/evaluation/test_eval_metric_result_with_invocation.py @@ -23,34 +23,30 @@ from unittest.mock import patch -import pytest - -from google.genai import types as genai_types - +from google.adk.evaluation.agent_evaluator import _EvalMetricResultWithInvocation from google.adk.evaluation.agent_evaluator import AgentEvaluator -from google.adk.evaluation.agent_evaluator import ( - _EvalMetricResultWithInvocation, -) from google.adk.evaluation.eval_case import Invocation from google.adk.evaluation.eval_metrics import EvalMetricResult from google.adk.evaluation.eval_metrics import EvalMetricResultPerInvocation from google.adk.evaluation.eval_metrics import EvalStatus from google.adk.evaluation.eval_result import EvalCaseResult - +from google.genai import types as genai_types +import pytest # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- + def _make_invocation(**overrides) -> Invocation: - """Return a minimal Invocation instance.""" - defaults = { - "user_content": genai_types.Content( - role="user", parts=[genai_types.Part(text="hello")] - ), - } - defaults.update(overrides) - return Invocation(**defaults) + """Return a minimal Invocation instance.""" + defaults = { + "user_content": genai_types.Content( + role="user", parts=[genai_types.Part(text="hello")] + ), + } + defaults.update(overrides) + return Invocation(**defaults) def _make_eval_metric_result( @@ -58,110 +54,113 @@ def _make_eval_metric_result( score: float = 1.0, status: EvalStatus = EvalStatus.PASSED, ) -> EvalMetricResult: - return EvalMetricResult( - metric_name=metric_name, - score=score, - eval_status=status, - ) + return EvalMetricResult( + metric_name=metric_name, + score=score, + eval_status=status, + ) # --------------------------------------------------------------------------- # Tests: _EvalMetricResultWithInvocation accepts None # --------------------------------------------------------------------------- -class TestEvalMetricResultWithInvocationNone: - """Regression: expected_invocation=None must be accepted (issue #5214).""" - def test_construction_with_none_expected_invocation(self): - """_EvalMetricResultWithInvocation should accept None for expected_invocation.""" - result = _EvalMetricResultWithInvocation( - actual_invocation=_make_invocation(), - expected_invocation=None, - eval_metric_result=_make_eval_metric_result(), - ) - assert result.expected_invocation is None +class TestEvalMetricResultWithInvocationNone: + """Regression: expected_invocation=None must be accepted (issue #5214).""" + + def test_construction_with_none_expected_invocation(self): + """_EvalMetricResultWithInvocation should accept None for expected_invocation.""" + result = _EvalMetricResultWithInvocation( + actual_invocation=_make_invocation(), + expected_invocation=None, + eval_metric_result=_make_eval_metric_result(), + ) + assert result.expected_invocation is None - def test_construction_with_omitted_expected_invocation(self): - """expected_invocation should default to None when omitted.""" - result = _EvalMetricResultWithInvocation( - actual_invocation=_make_invocation(), - eval_metric_result=_make_eval_metric_result(), - ) - assert result.expected_invocation is None - - def test_construction_with_real_expected_invocation(self): - """Normal case: providing a real Invocation should still work.""" - inv = _make_invocation() - result = _EvalMetricResultWithInvocation( - actual_invocation=_make_invocation(), - expected_invocation=inv, - eval_metric_result=_make_eval_metric_result(), - ) - assert result.expected_invocation is inv + def test_construction_with_omitted_expected_invocation(self): + """expected_invocation should default to None when omitted.""" + result = _EvalMetricResultWithInvocation( + actual_invocation=_make_invocation(), + eval_metric_result=_make_eval_metric_result(), + ) + assert result.expected_invocation is None + + def test_construction_with_real_expected_invocation(self): + """Normal case: providing a real Invocation should still work.""" + inv = _make_invocation() + result = _EvalMetricResultWithInvocation( + actual_invocation=_make_invocation(), + expected_invocation=inv, + eval_metric_result=_make_eval_metric_result(), + ) + assert result.expected_invocation is inv # --------------------------------------------------------------------------- # Tests: _get_eval_metric_results_with_invocation passes None through # --------------------------------------------------------------------------- + class TestGetEvalMetricResultsWithNone: - """_get_eval_metric_results_with_invocation must propagate None.""" - - def test_none_expected_invocation_propagated(self): - actual = _make_invocation() - metric_result = _make_eval_metric_result(metric_name="m1") - - eval_case_result = EvalCaseResult( - eval_set_id="test_set", - eval_id="scenario_1", - final_eval_status=EvalStatus.PASSED, - overall_eval_metric_results=[metric_result], - eval_metric_result_per_invocation=[ - EvalMetricResultPerInvocation( - actual_invocation=actual, - expected_invocation=None, - eval_metric_results=[metric_result], - ) - ], - session_id="sess-1", - ) + """_get_eval_metric_results_with_invocation must propagate None.""" + + def test_none_expected_invocation_propagated(self): + actual = _make_invocation() + metric_result = _make_eval_metric_result(metric_name="m1") + + eval_case_result = EvalCaseResult( + eval_set_id="test_set", + eval_id="scenario_1", + final_eval_status=EvalStatus.PASSED, + overall_eval_metric_results=[metric_result], + eval_metric_result_per_invocation=[ + EvalMetricResultPerInvocation( + actual_invocation=actual, + expected_invocation=None, + eval_metric_results=[metric_result], + ) + ], + session_id="sess-1", + ) - grouped = AgentEvaluator._get_eval_metric_results_with_invocation( - [eval_case_result] - ) + grouped = AgentEvaluator._get_eval_metric_results_with_invocation( + [eval_case_result] + ) - assert "m1" in grouped - assert len(grouped["m1"]) == 1 - assert grouped["m1"][0].expected_invocation is None - assert grouped["m1"][0].actual_invocation is actual + assert "m1" in grouped + assert len(grouped["m1"]) == 1 + assert grouped["m1"][0].expected_invocation is None + assert grouped["m1"][0].actual_invocation is actual # --------------------------------------------------------------------------- # Tests: _print_details does not crash when expected_invocation is None # --------------------------------------------------------------------------- + class TestPrintDetailsNoneExpected: - """_print_details must handle None expected_invocation gracefully.""" + """_print_details must handle None expected_invocation gracefully.""" - def test_print_details_with_none_expected(self): - actual = _make_invocation() - metric_result = _make_eval_metric_result(score=0.9) + def test_print_details_with_none_expected(self): + actual = _make_invocation() + metric_result = _make_eval_metric_result(score=0.9) - items = [ - _EvalMetricResultWithInvocation( - actual_invocation=actual, - expected_invocation=None, - eval_metric_result=metric_result, - ) - ] - - # _print_details prints to stdout via tabulate/pandas — we just - # verify it doesn't raise. - with patch("builtins.print"): - AgentEvaluator._print_details( - eval_metric_result_with_invocations=items, - overall_eval_status=EvalStatus.PASSED, - overall_score=0.9, - metric_name="test_metric", - threshold=0.5, - ) + items = [ + _EvalMetricResultWithInvocation( + actual_invocation=actual, + expected_invocation=None, + eval_metric_result=metric_result, + ) + ] + + # _print_details prints to stdout via tabulate/pandas — we just + # verify it doesn't raise. + with patch("builtins.print"): + AgentEvaluator._print_details( + eval_metric_result_with_invocations=items, + overall_eval_status=EvalStatus.PASSED, + overall_score=0.9, + metric_name="test_metric", + threshold=0.5, + )