feat(evaluation): add option to save eval results to CSV

vaibhav-patel · vaibhav-patel · commit b453a089eb7a · 2026-06-22T14:59:37.000+05:30
Add an optional `output_file` parameter to `AgentEvaluator.evaluate` and `AgentEvaluator.evaluate_eval_set`. When set, per-invocation evaluation results for every metric (both passing and failing) are flattened and written to the given path as a CSV file, making it easy to persist and inspect results from pytest-based eval runs. The option is disabled by default, so existing behavior is unchanged. The parent directory is created if needed, and rows are appended so results from a directory of test files accumulate in a single file. CSV writing reuses the existing text/tool-call formatting helpers and relies on pandas, which is already part of the `eval` optional dependencies. Fixes #2652.
diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py
@@ -113,6 +113,7 @@ async def evaluate_eval_set(
       num_runs: int = NUM_RUNS,
       agent_name: Optional[str] = None,
       print_detailed_results: bool = True,
+      output_file: Optional[str] = None,
   ):
     """Evaluates an agent using the given EvalSet.
 
@@ -130,6 +131,10 @@ async def evaluate_eval_set(
         than root agent. If left empty or none, then root agent is evaluated.
       print_detailed_results: Whether to print detailed results for each metric
         evaluation.
+      output_file: If provided, per-invocation evaluation results (for both
+        passing and failing metrics) are appended to this path as CSV rows;
+        an existing file is not overwritten. Disabled by default. The parent
+        directory is created if it does not already exist.
     """
     if criteria:
       logger.warning(
@@ -169,7 +174,11 @@ async def evaluate_eval_set(
     # test failures. We track them and then report them towards the end.
     failures: list[str] = []
 
-    for _, eval_results_per_eval_id in eval_results_by_eval_id.items():
+    # Optionally, we collect per-invocation results across all eval cases and
+    # metrics so that they can be written out to a CSV file at the end.
+    csv_rows: list[dict[str, Any]] = []
+
+    for eval_id, eval_results_per_eval_id in eval_results_by_eval_id.items():
       eval_metric_results = (
           AgentEvaluator._get_eval_metric_results_with_invocation(
               eval_results_per_eval_id
@@ -183,6 +192,20 @@ async def evaluate_eval_set(
 
       failures.extend(failures_per_eval_case)
 
+      if output_file:
+        csv_rows.extend(
+            AgentEvaluator._get_results_as_rows(
+                eval_set_id=eval_set.eval_set_id,
+                eval_id=eval_id,
+                eval_metric_results=eval_metric_results,
+            )
+        )
+
+    if output_file:
+      AgentEvaluator._write_results_to_csv(
+          rows=csv_rows, output_file=output_file
+      )
+
     failure_message = "Following are all the test failures."
     if not print_detailed_results:
       failure_message += (
@@ -200,6 +223,7 @@ async def evaluate(
       agent_name: Optional[str] = None,
       initial_session_file: Optional[str] = None,
       print_detailed_results: bool = True,
+      output_file: Optional[str] = None,
   ):
     """Evaluates an Agent given eval data.
 
@@ -218,6 +242,10 @@ async def evaluate(
         needed by all the evals in the eval dataset.
       print_detailed_results: Whether to print detailed results for each metric
         evaluation.
+      output_file: If provided, per-invocation evaluation results are written to
+        this path as a CSV file. Disabled by default. When the eval data spans
+        multiple test files, results from all of them are appended to the same
+        file.
     """
     test_files = []
     if isinstance(eval_dataset_file_path_or_dir, str) and os.path.isdir(
@@ -245,6 +273,7 @@ async def evaluate(
           num_runs=num_runs,
           agent_name=agent_name,
           print_detailed_results=print_detailed_results,
+          output_file=output_file,
       )
 
   @staticmethod
@@ -698,3 +727,99 @@ def _process_metrics_and_get_failures(
         )
 
     return failures
+
+  @staticmethod
+  def _get_results_as_rows(
+      eval_set_id: str,
+      eval_id: str,
+      eval_metric_results: dict[str, list[_EvalMetricResultWithInvocation]],
+  ) -> list[dict[str, Any]]:
+    """Flattens eval results into one row per metric per invocation.
+
+    The columns mirror the ones used in `_print_details`, with additional
+    identifier columns so that rows from different eval cases and metrics can be
+    distinguished within a single CSV file.
+    """
+    rows: list[dict[str, Any]] = []
+    for metric_name, results_with_invocations in eval_metric_results.items():
+      for result_with_invocation in results_with_invocations:
+        eval_metric_result = result_with_invocation.eval_metric_result
+        expected_invocation = result_with_invocation.expected_invocation
+        actual_invocation = result_with_invocation.actual_invocation
+        rows.append({
+            "eval_set_id": eval_set_id,
+            "eval_id": eval_id,
+            "metric_name": metric_name,
+            "threshold": eval_metric_result.threshold,
+            "score": eval_metric_result.score,
+            "eval_status": eval_metric_result.eval_status.name,
+            "prompt": AgentEvaluator._convert_content_to_text(
+                expected_invocation.user_content
+                if expected_invocation
+                else actual_invocation.user_content
+            ),
+            "expected_response": AgentEvaluator._convert_content_to_text(
+                expected_invocation.final_response
+                if expected_invocation
+                else None
+            ),
+            "actual_response": AgentEvaluator._convert_content_to_text(
+                actual_invocation.final_response
+            ),
+            "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+                expected_invocation.intermediate_data
+                if expected_invocation
+                else None
+            ),
+            "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+                actual_invocation.intermediate_data
+            ),
+        })
+
+    return rows
+
+  @staticmethod
+  def _write_results_to_csv(
+      rows: list[dict[str, Any]],
+      output_file: str,
+  ) -> None:
+    """Appends the collected eval result rows to a CSV file.
+
+    Rows are appended so that results from multiple eval sets (for example, when
+    evaluating a directory of test files) can be accumulated in a single file.
+    The header is written only when the file is missing or empty, so repeated
+    appends never duplicate it.
+
+    Args:
+      rows: Flattened per-invocation results, one dict per CSV row. When empty,
+        nothing is written: an empty frame has no column names, so it would
+        create a header-less file and later appends would skip the header.
+      output_file: Path of the CSV file. Its parent directory is created if
+        needed.
+
+    Raises:
+      ModuleNotFoundError: If pandas (part of the eval dependencies) is not
+        installed.
+    """
+    try:
+      import pandas as pd
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
+
+    if not rows:
+      logger.info("No eval results to save to %s", output_file)
+      return
+
+    output_dir = os.path.dirname(output_file)
+    if output_dir:
+      os.makedirs(output_dir, exist_ok=True)
+
+    # A pre-existing but empty file (for example, one created by `touch`) still
+    # needs the header, so check the size rather than mere existence.
+    needs_header = (
+        not os.path.isfile(output_file) or os.path.getsize(output_file) == 0
+    )
+    pd.DataFrame(rows).to_csv(
+        output_file, mode="a", header=needs_header, index=False
+    )
+    logger.info("Saved eval results to %s", output_file)
diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py
@@ -0,0 +1,198 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import os
+
+from google.adk.evaluation.agent_evaluator import _EvalMetricResultWithInvocation
+from google.adk.evaluation.agent_evaluator import AgentEvaluator
+from google.adk.evaluation.eval_case import Invocation
+from google.adk.evaluation.eval_metrics import EvalMetricResult
+from google.adk.evaluation.evaluator import EvalStatus
+from google.genai import types as genai_types
+import pandas as pd
+import pytest
+
+
+def _content(text: str) -> genai_types.Content:
+  return genai_types.Content(parts=[genai_types.Part(text=text)])
+
+
+def _make_result_with_invocation(
+    metric_name: str,
+    score: float,
+    threshold: float,
+    eval_status: EvalStatus,
+    prompt: str,
+    expected_response: str,
+    actual_response: str,
+) -> _EvalMetricResultWithInvocation:
+  return _EvalMetricResultWithInvocation(
+      actual_invocation=Invocation(
+          user_content=_content(prompt),
+          final_response=_content(actual_response),
+      ),
+      expected_invocation=Invocation(
+          user_content=_content(prompt),
+          final_response=_content(expected_response),
+      ),
+      eval_metric_result=EvalMetricResult(
+          metric_name=metric_name,
+          threshold=threshold,
+          score=score,
+          eval_status=eval_status,
+      ),
+  )
+
+
+def test_get_results_as_rows_flattens_metrics_and_invocations():
+  eval_metric_results = {
+      "response_match_score": [
+          _make_result_with_invocation(
+              metric_name="response_match_score",
+              score=1.0,
+              threshold=0.8,
+              eval_status=EvalStatus.PASSED,
+              prompt="What is 2 + 2?",
+              expected_response="4",
+              actual_response="4",
+          ),
+          _make_result_with_invocation(
+              metric_name="response_match_score",
+              score=0.0,
+              threshold=0.8,
+              eval_status=EvalStatus.FAILED,
+              prompt="Capital of France?",
+              expected_response="Paris",
+              actual_response="London",
+          ),
+      ],
+  }
+
+  rows = AgentEvaluator._get_results_as_rows(
+      eval_set_id="my_eval_set",
+      eval_id="my_eval_case",
+      eval_metric_results=eval_metric_results,
+  )
+
+  assert len(rows) == 2
+  first = rows[0]
+  assert first["eval_set_id"] == "my_eval_set"
+  assert first["eval_id"] == "my_eval_case"
+  assert first["metric_name"] == "response_match_score"
+  assert first["threshold"] == 0.8
+  assert first["score"] == 1.0
+  assert first["eval_status"] == "PASSED"
+  assert first["prompt"] == "What is 2 + 2?"
+  assert first["expected_response"] == "4"
+  assert first["actual_response"] == "4"
+
+  # Failing invocation should still be captured.
+  assert rows[1]["eval_status"] == "FAILED"
+  assert rows[1]["actual_response"] == "London"
+
+
+def test_get_results_as_rows_handles_missing_expected_invocation():
+  result = _EvalMetricResultWithInvocation(
+      actual_invocation=Invocation(
+          user_content=_content("hi"),
+          final_response=_content("hello"),
+      ),
+      expected_invocation=None,
+      eval_metric_result=EvalMetricResult(
+          metric_name="safety_v1",
+          threshold=0.5,
+          score=1.0,
+          eval_status=EvalStatus.PASSED,
+      ),
+  )
+
+  rows = AgentEvaluator._get_results_as_rows(
+      eval_set_id="s",
+      eval_id="c",
+      eval_metric_results={"safety_v1": [result]},
+  )
+
+  assert len(rows) == 1
+  assert rows[0]["prompt"] == "hi"
+  assert rows[0]["expected_response"] == ""
+  assert rows[0]["actual_response"] == "hello"
+
+
+def test_write_results_to_csv_writes_expected_file(tmp_path):
+  rows = [
+      {
+          "eval_set_id": "s",
+          "eval_id": "c",
+          "metric_name": "response_match_score",
+          "threshold": 0.8,
+          "score": 1.0,
+          "eval_status": "PASSED",
+          "prompt": "What is 2 + 2?",
+          "expected_response": "4",
+          "actual_response": "4",
+          "expected_tool_calls": "",
+          "actual_tool_calls": "",
+      },
+  ]
+  output_file = os.path.join(str(tmp_path), "nested", "eval_results.csv")
+
+  AgentEvaluator._write_results_to_csv(rows=rows, output_file=output_file)
+
+  # The nested directory should have been created.
+  assert os.path.isfile(output_file)
+
+  df = pd.read_csv(output_file)
+  assert list(df.columns) == list(rows[0].keys())
+  assert len(df) == 1
+  assert df.iloc[0]["metric_name"] == "response_match_score"
+  assert df.iloc[0]["eval_status"] == "PASSED"
+  assert df.iloc[0]["score"] == 1.0
+
+
+def test_write_results_to_csv_appends_without_duplicate_header(tmp_path):
+  output_file = os.path.join(str(tmp_path), "eval_results.csv")
+
+  def _row(eval_id: str, score: float, status: str) -> dict:
+    return {
+        "eval_set_id": "s",
+        "eval_id": eval_id,
+        "metric_name": "response_match_score",
+        "threshold": 0.8,
+        "score": score,
+        "eval_status": status,
+        "prompt": "p",
+        "expected_response": "e",
+        "actual_response": "a",
+        "expected_tool_calls": "",
+        "actual_tool_calls": "",
+    }
+
+  AgentEvaluator._write_results_to_csv(
+      rows=[_row("case_1", 1.0, "PASSED")], output_file=output_file
+  )
+  AgentEvaluator._write_results_to_csv(
+      rows=[_row("case_2", 0.0, "FAILED")], output_file=output_file
+  )
+
+  df = pd.read_csv(output_file)
+  # Two appends should accumulate two rows, with the header written only once.
+  assert len(df) == 2
+  assert sorted(df["eval_id"].tolist()) == ["case_1", "case_2"]
+  assert "eval_id" not in df["eval_id"].tolist()
+
+
+if __name__ == "__main__":
+  raise SystemExit(pytest.main([__file__, "-v"]))