Skip to content

Commit b453a08

Browse files
committed
feat(evaluation): add option to save eval results to CSV
Add an optional `output_file` parameter to `AgentEvaluator.evaluate` and `AgentEvaluator.evaluate_eval_set`. When set, per-invocation evaluation results for every metric (both passing and failing) are flattened and written to the given path as a CSV file, making it easy to persist and inspect results from pytest-based eval runs. The option is disabled by default, so existing behavior is unchanged. The parent directory is created if needed, and rows are appended so results from a directory of test files accumulate in a single file. CSV writing reuses the existing text/tool-call formatting helpers and relies on pandas, which is already part of the `eval` optional dependencies. Fixes #2652.
1 parent 6bc9c9f commit b453a08

2 files changed

Lines changed: 304 additions & 1 deletion

File tree

src/google/adk/evaluation/agent_evaluator.py

Lines changed: 106 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ async def evaluate_eval_set(
113113
num_runs: int = NUM_RUNS,
114114
agent_name: Optional[str] = None,
115115
print_detailed_results: bool = True,
116+
output_file: Optional[str] = None,
116117
):
117118
"""Evaluates an agent using the given EvalSet.
118119
@@ -130,6 +131,10 @@ async def evaluate_eval_set(
130131
than root agent. If left empty or none, then root agent is evaluated.
131132
print_detailed_results: Whether to print detailed results for each metric
132133
evaluation.
134+
output_file: If provided, per-invocation evaluation results (for both
135+
passing and failing metrics) are written to this path as a CSV file.
136+
Disabled by default. The parent directory is created if it does not
137+
already exist.
133138
"""
134139
if criteria:
135140
logger.warning(
@@ -169,7 +174,11 @@ async def evaluate_eval_set(
169174
# test failures. We track them and then report them towards the end.
170175
failures: list[str] = []
171176

172-
for _, eval_results_per_eval_id in eval_results_by_eval_id.items():
177+
# Optionally, we collect per-invocation results across all eval cases and
178+
# metrics so that they can be written out to a CSV file at the end.
179+
csv_rows: list[dict[str, Any]] = []
180+
181+
for eval_id, eval_results_per_eval_id in eval_results_by_eval_id.items():
173182
eval_metric_results = (
174183
AgentEvaluator._get_eval_metric_results_with_invocation(
175184
eval_results_per_eval_id
@@ -183,6 +192,20 @@ async def evaluate_eval_set(
183192

184193
failures.extend(failures_per_eval_case)
185194

195+
if output_file:
196+
csv_rows.extend(
197+
AgentEvaluator._get_results_as_rows(
198+
eval_set_id=eval_set.eval_set_id,
199+
eval_id=eval_id,
200+
eval_metric_results=eval_metric_results,
201+
)
202+
)
203+
204+
if output_file:
205+
AgentEvaluator._write_results_to_csv(
206+
rows=csv_rows, output_file=output_file
207+
)
208+
186209
failure_message = "Following are all the test failures."
187210
if not print_detailed_results:
188211
failure_message += (
@@ -200,6 +223,7 @@ async def evaluate(
200223
agent_name: Optional[str] = None,
201224
initial_session_file: Optional[str] = None,
202225
print_detailed_results: bool = True,
226+
output_file: Optional[str] = None,
203227
):
204228
"""Evaluates an Agent given eval data.
205229
@@ -218,6 +242,10 @@ async def evaluate(
218242
needed by all the evals in the eval dataset.
219243
print_detailed_results: Whether to print detailed results for each metric
220244
evaluation.
245+
output_file: If provided, per-invocation evaluation results are written to
246+
this path as a CSV file. Disabled by default. When the eval data spans
247+
multiple test files, results from all of them are appended to the same
248+
file.
221249
"""
222250
test_files = []
223251
if isinstance(eval_dataset_file_path_or_dir, str) and os.path.isdir(
@@ -245,6 +273,7 @@ async def evaluate(
245273
num_runs=num_runs,
246274
agent_name=agent_name,
247275
print_detailed_results=print_detailed_results,
276+
output_file=output_file,
248277
)
249278

250279
@staticmethod
@@ -698,3 +727,79 @@ def _process_metrics_and_get_failures(
698727
)
699728

700729
return failures
730+
731+
@staticmethod
def _get_results_as_rows(
    eval_set_id: str,
    eval_id: str,
    eval_metric_results: dict[str, list[_EvalMetricResultWithInvocation]],
) -> list[dict[str, Any]]:
  """Builds one flat record per (metric, invocation) pair.

  Each record carries the eval set id, eval case id and metric name alongside
  the threshold, score, status, prompt, responses and tool calls, so rows
  coming from different eval cases and metrics remain distinguishable once
  they are combined into a single CSV file.

  Args:
    eval_set_id: Identifier of the eval set the results belong to.
    eval_id: Identifier of the eval case the results belong to.
    eval_metric_results: Per-invocation results, keyed by metric name.

  Returns:
    A list of dicts, one per metric per invocation, in iteration order.
  """
  to_text = AgentEvaluator._convert_content_to_text
  tool_calls_to_text = AgentEvaluator._convert_tool_calls_to_text

  flattened: list[dict[str, Any]] = []
  for metric_name, entries in eval_metric_results.items():
    for entry in entries:
      metric_result = entry.eval_metric_result
      expected = entry.expected_invocation
      actual = entry.actual_invocation

      # Without an expected invocation the prompt is taken from the actual
      # invocation and the "expected_*" columns are derived from None.
      if expected:
        prompt_content = expected.user_content
        expected_response = expected.final_response
        expected_intermediate = expected.intermediate_data
      else:
        prompt_content = actual.user_content
        expected_response = None
        expected_intermediate = None

      flattened.append({
          "eval_set_id": eval_set_id,
          "eval_id": eval_id,
          "metric_name": metric_name,
          "threshold": metric_result.threshold,
          "score": metric_result.score,
          "eval_status": metric_result.eval_status.name,
          "prompt": to_text(prompt_content),
          "expected_response": to_text(expected_response),
          "actual_response": to_text(actual.final_response),
          "expected_tool_calls": tool_calls_to_text(expected_intermediate),
          "actual_tool_calls": tool_calls_to_text(actual.intermediate_data),
      })

  return flattened
780+
781+
@staticmethod
def _write_results_to_csv(
    rows: list[dict[str, Any]],
    output_file: str,
) -> None:
  """Appends the collected eval result rows to a CSV file.

  Rows are appended so that results from multiple eval sets (for example, when
  evaluating a directory of test files) can be accumulated in a single file.
  The header is written whenever the target file is missing or empty, so a
  pre-created empty file still ends up with a header row. When `rows` is
  empty, nothing is written and the file is left untouched.

  Args:
    rows: Flat records to append; every dict is expected to share the same
      keys, which become the CSV columns.
    output_file: Path of the CSV file. Its parent directory is created if it
      does not already exist.

  Raises:
    ModuleNotFoundError: If pandas is not installed.
  """
  try:
    import pandas as pd
  except ModuleNotFoundError as e:
    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e

  if not rows:
    # A DataFrame built from no rows has no columns, so writing it could not
    # emit the expected header row, yet it would still create the file and
    # cause later appends to skip the header.
    logger.info("No eval results to save to %s", output_file)
    return

  output_dir = os.path.dirname(output_file)
  if output_dir:
    os.makedirs(output_dir, exist_ok=True)

  # Write the header for a missing file and also for an existing zero-byte
  # file; only a file that already has content is treated as initialized.
  try:
    needs_header = os.path.getsize(output_file) == 0
  except OSError:
    needs_header = True

  pd.DataFrame(rows).to_csv(
      output_file, mode="a", header=needs_header, index=False
  )
  logger.info("Saved eval results to %s", output_file)
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import os
18+
19+
from google.adk.evaluation.agent_evaluator import _EvalMetricResultWithInvocation
20+
from google.adk.evaluation.agent_evaluator import AgentEvaluator
21+
from google.adk.evaluation.eval_case import Invocation
22+
from google.adk.evaluation.eval_metrics import EvalMetricResult
23+
from google.adk.evaluation.evaluator import EvalStatus
24+
from google.genai import types as genai_types
25+
import pandas as pd
26+
import pytest
27+
28+
29+
def _content(text: str) -> genai_types.Content:
  """Wraps `text` in a `Content` object holding a single text part."""
  text_part = genai_types.Part(text=text)
  return genai_types.Content(parts=[text_part])
31+
32+
33+
def _make_result_with_invocation(
    metric_name: str,
    score: float,
    threshold: float,
    eval_status: EvalStatus,
    prompt: str,
    expected_response: str,
    actual_response: str,
) -> _EvalMetricResultWithInvocation:
  """Builds a metric result paired with actual and expected invocations.

  Both invocations share the same prompt and differ only in their final
  response text.
  """
  actual = Invocation(
      user_content=_content(prompt),
      final_response=_content(actual_response),
  )
  expected = Invocation(
      user_content=_content(prompt),
      final_response=_content(expected_response),
  )
  metric_result = EvalMetricResult(
      metric_name=metric_name,
      threshold=threshold,
      score=score,
      eval_status=eval_status,
  )
  return _EvalMetricResultWithInvocation(
      actual_invocation=actual,
      expected_invocation=expected,
      eval_metric_result=metric_result,
  )
58+
59+
60+
def test_get_results_as_rows_flattens_metrics_and_invocations():
  """One row is produced per invocation, for passing and failing results."""
  metric = "response_match_score"
  passing = _make_result_with_invocation(
      metric_name=metric,
      score=1.0,
      threshold=0.8,
      eval_status=EvalStatus.PASSED,
      prompt="What is 2 + 2?",
      expected_response="4",
      actual_response="4",
  )
  failing = _make_result_with_invocation(
      metric_name=metric,
      score=0.0,
      threshold=0.8,
      eval_status=EvalStatus.FAILED,
      prompt="Capital of France?",
      expected_response="Paris",
      actual_response="London",
  )

  rows = AgentEvaluator._get_results_as_rows(
      eval_set_id="my_eval_set",
      eval_id="my_eval_case",
      eval_metric_results={metric: [passing, failing]},
  )

  assert len(rows) == 2

  passing_row, failing_row = rows
  expected_passing_values = {
      "eval_set_id": "my_eval_set",
      "eval_id": "my_eval_case",
      "metric_name": "response_match_score",
      "threshold": 0.8,
      "score": 1.0,
      "eval_status": "PASSED",
      "prompt": "What is 2 + 2?",
      "expected_response": "4",
      "actual_response": "4",
  }
  for column, expected_value in expected_passing_values.items():
    assert passing_row[column] == expected_value

  # Failing invocation should still be captured.
  assert failing_row["eval_status"] == "FAILED"
  assert failing_row["actual_response"] == "London"
105+
106+
107+
def test_get_results_as_rows_handles_missing_expected_invocation():
  """A result without an expected invocation still yields a usable row."""
  actual = Invocation(
      user_content=_content("hi"),
      final_response=_content("hello"),
  )
  metric_result = EvalMetricResult(
      metric_name="safety_v1",
      threshold=0.5,
      score=1.0,
      eval_status=EvalStatus.PASSED,
  )
  result_without_expected = _EvalMetricResultWithInvocation(
      actual_invocation=actual,
      expected_invocation=None,
      eval_metric_result=metric_result,
  )

  rows = AgentEvaluator._get_results_as_rows(
      eval_set_id="s",
      eval_id="c",
      eval_metric_results={"safety_v1": [result_without_expected]},
  )

  assert len(rows) == 1
  only_row = rows[0]
  # The prompt falls back to the actual invocation; the expected response
  # column is empty.
  assert only_row["prompt"] == "hi"
  assert only_row["expected_response"] == ""
  assert only_row["actual_response"] == "hello"
132+
133+
134+
def test_write_results_to_csv_writes_expected_file(tmp_path):
  """Writing to a path in a missing directory creates it and the CSV."""
  record = {
      "eval_set_id": "s",
      "eval_id": "c",
      "metric_name": "response_match_score",
      "threshold": 0.8,
      "score": 1.0,
      "eval_status": "PASSED",
      "prompt": "What is 2 + 2?",
      "expected_response": "4",
      "actual_response": "4",
      "expected_tool_calls": "",
      "actual_tool_calls": "",
  }
  rows = [record]
  output_file = os.path.join(str(tmp_path), "nested", "eval_results.csv")

  AgentEvaluator._write_results_to_csv(rows=rows, output_file=output_file)

  # The nested directory should have been created.
  assert os.path.isfile(output_file)

  df = pd.read_csv(output_file)
  assert list(df.columns) == list(record.keys())
  assert len(df) == 1

  written = df.iloc[0]
  assert written["metric_name"] == "response_match_score"
  assert written["eval_status"] == "PASSED"
  assert written["score"] == 1.0
163+
164+
165+
def test_write_results_to_csv_appends_without_duplicate_header(tmp_path):
  """Two consecutive writes accumulate rows under a single header."""
  output_file = os.path.join(str(tmp_path), "eval_results.csv")

  def _row(eval_id: str, score: float, status: str) -> dict:
    return {
        "eval_set_id": "s",
        "eval_id": eval_id,
        "metric_name": "response_match_score",
        "threshold": 0.8,
        "score": score,
        "eval_status": status,
        "prompt": "p",
        "expected_response": "e",
        "actual_response": "a",
        "expected_tool_calls": "",
        "actual_tool_calls": "",
    }

  # Each case is written by its own call so the second write is an append.
  cases = (("case_1", 1.0, "PASSED"), ("case_2", 0.0, "FAILED"))
  for eval_id, score, status in cases:
    AgentEvaluator._write_results_to_csv(
        rows=[_row(eval_id, score, status)], output_file=output_file
    )

  df = pd.read_csv(output_file)
  written_eval_ids = df["eval_id"].tolist()
  # Two appends should accumulate two rows, with the header written only once.
  assert len(df) == 2
  assert sorted(written_eval_ids) == ["case_1", "case_2"]
  assert "eval_id" not in written_eval_ids
195+
196+
197+
if __name__ == "__main__":
  # Run this file's tests verbosely and exit with pytest's status code.
  exit_code = pytest.main([__file__, "-v"])
  raise SystemExit(exit_code)

0 commit comments

Comments
 (0)