From 1d0a72123361178d990d9238bb38353778dd61f5 Mon Sep 17 00:00:00 2001
From: Sid Gupta
Date: Wed, 8 Apr 2026 15:12:01 -0700
Subject: [PATCH 1/2] feat(eval): add evaluate_full_response option to
 rubric-based evaluation

When an agent emits text before a tool call (e.g. presenting a plan),
then calls a tool, then emits more text (e.g. an explanation), the
rubric_based_final_response_quality_v1 metric sends only the
post-tool-call text to the judge. The pre-tool-call text is stored in
intermediate_data.invocation_events but is never included in the judge
prompt. As a result, rubrics that check for content in the pre-tool-call
text always fail, even though the agent correctly produced that content.

This commit adds an `evaluate_full_response` boolean field to
`RubricsBasedCriterion` (following the pattern of
`evaluate_intermediate_nl_responses` on `HallucinationsCriterion`).
When set to true, the evaluator concatenates all text from
invocation_events with the final_response before sending it to the
judge, giving the judge the complete picture of the agent's output.

Usage:

```json
{
  "rubric_based_final_response_quality_v1": {
    "threshold": 0.8,
    "evaluate_full_response": true,
    "rubrics": [...]
  }
}
```

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 src/google/adk/evaluation/eval_metrics.py     | 13 +++++++
 .../rubric_based_final_response_quality_v1.py | 38 ++++++++++++++++++-
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py
index 50c3473c3a..5895ad807a 100644
--- a/src/google/adk/evaluation/eval_metrics.py
+++ b/src/google/adk/evaluation/eval_metrics.py
@@ -144,6 +144,19 @@ class RubricsBasedCriterion(BaseCriterion):
       ),
   )
 
+  evaluate_full_response: bool = Field(
+      default=False,
+      description=(
+          "Whether to evaluate the full agent response including intermediate"
+          " natural language text (e.g. text emitted before tool calls) in"
+          " addition to the final response. By default, only the final"
+          " response text is sent to the judge. When True, text from all"
+          " intermediate invocation events is concatenated with the final"
+          " response before evaluation. This is useful for agents that emit"
+          " text both before and after tool calls within a single invocation."
+      ),
+  )
+
 
 class HallucinationsCriterion(BaseCriterion):
   """Criterion to use when evaluating agents response for hallucinations."""
diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
index df01aba4ff..9207749984 100644
--- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
+++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
@@ -274,7 +274,21 @@ def format_auto_rater_prompt(
     """Returns the autorater prompt."""
     self.create_effective_rubrics_list(actual_invocation.rubrics)
     user_input = get_text_from_content(actual_invocation.user_content)
-    final_response = get_text_from_content(actual_invocation.final_response)
+
+    # When evaluate_full_response is enabled, include text from intermediate
+    # invocation events (e.g. text emitted before tool calls) in addition to
+    # the final response. This is useful for agents that stream text, call
+    # tools, then stream more text within a single invocation.
+    criterion = self._eval_metric.criterion
+    evaluate_full = (
+        isinstance(criterion, RubricsBasedCriterion)
+        and criterion.evaluate_full_response
+    )
+
+    if evaluate_full:
+      final_response = self._get_full_response_text(actual_invocation)
+    else:
+      final_response = get_text_from_content(actual_invocation.final_response)
 
     rubrics_text = "\n".join([
         f"* {r.rubric_content.text_property}"
@@ -310,3 +324,25 @@ def format_auto_rater_prompt(
     )
 
     return auto_rater_prompt
+
+  @staticmethod
+  def _get_full_response_text(invocation: Invocation) -> str:
+    """Concatenates all NL text from invocation events and the final response.
+
+    When an agent emits text before a tool call (e.g. presenting a plan),
+    that text is stored in intermediate_data.invocation_events but not in
+    final_response. This method collects text from both sources to give the
+    judge a complete picture of the agent's output.
+    """
+    parts = []
+    if invocation.intermediate_data and isinstance(
+        invocation.intermediate_data, InvocationEvents
+    ):
+      for evt in invocation.intermediate_data.invocation_events:
+        text = get_text_from_content(evt.content)
+        if text:
+          parts.append(text)
+    final_text = get_text_from_content(invocation.final_response)
+    if final_text:
+      parts.append(final_text)
+    return "\n\n".join(parts)

From e8b16089dda73f8714c0e7f29eab26e45afc7a5e Mon Sep 17 00:00:00 2001
From: Sid Gupta
Date: Wed, 8 Apr 2026 16:05:38 -0700
Subject: [PATCH 2/2] fix: use getattr instead of isinstance for criterion
 check

The criterion may be deserialized as BaseCriterion (which accepts extra
fields via extra="allow") rather than RubricsBasedCriterion, so the
isinstance check fails even when evaluate_full_response is present.
Using getattr with a default handles both cases.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../adk/evaluation/rubric_based_final_response_quality_v1.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
index 9207749984..5dc6320da3 100644
--- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
+++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
@@ -280,10 +280,7 @@ def format_auto_rater_prompt(
     # the final response. This is useful for agents that stream text, call
     # tools, then stream more text within a single invocation.
     criterion = self._eval_metric.criterion
-    evaluate_full = (
-        isinstance(criterion, RubricsBasedCriterion)
-        and criterion.evaluate_full_response
-    )
+    evaluate_full = getattr(criterion, "evaluate_full_response", False)
 
     if evaluate_full:
       final_response = self._get_full_response_text(actual_invocation)
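
Note for reviewers: as a quick illustration of what patch 1/2 changes about the judge input, here is a minimal standalone sketch of the concatenation behavior. The dataclasses below are simplified stand-ins for the ADK `Invocation`/`InvocationEvents` models, not the real types; in ADK the event content goes through `get_text_from_content` rather than being a plain string.

```python
# Simplified stand-ins, just detailed enough to show what the judge
# receives when evaluate_full_response is enabled.
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class Event:
  content: str  # stand-in for a Content object; real code extracts its text


@dataclass
class InvocationEvents:
  invocation_events: list[Event] = field(default_factory=list)


@dataclass
class Invocation:
  final_response: str
  intermediate_data: Optional[InvocationEvents] = None


def get_full_response_text(invocation: Invocation) -> str:
  """Mirrors _get_full_response_text: intermediate texts first, final text last."""
  parts = []
  if invocation.intermediate_data:
    for evt in invocation.intermediate_data.invocation_events:
      if evt.content:
        parts.append(evt.content)
  if invocation.final_response:
    parts.append(invocation.final_response)
  return "\n\n".join(parts)


invocation = Invocation(
    final_response="Booked the 9am flight as planned.",
    intermediate_data=InvocationEvents(
        invocation_events=[Event(content="Plan: search flights, then book.")]
    ),
)
# The judge now sees both texts, separated by a blank line:
#   Plan: search flights, then book.
#
#   Booked the 9am flight as planned.
print(get_full_response_text(invocation))
```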
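
And a sketch of the deserialization behavior that motivated patch 2/2: when the criterion is parsed as the base class, pydantic's extra="allow" keeps evaluate_full_response around as an extra attribute, so getattr finds it where isinstance cannot. Again, the two models below are simplified stand-ins for the ADK classes, assuming pydantic v2 as the commit message's extra="allow" wording suggests.

```python
# Simplified stand-ins showing why the isinstance check from patch 1/2
# fails after generic deserialization while getattr still works.
from pydantic import BaseModel, ConfigDict


class BaseCriterion(BaseModel):
  # Per the commit message, the base class accepts unknown fields.
  model_config = ConfigDict(extra="allow")

  threshold: float = 0.0


class RubricsBasedCriterion(BaseCriterion):
  evaluate_full_response: bool = False


# A criterion deserialized generically lands on the base class. The extra
# field survives (extra="allow") but the object's type does not change.
criterion = BaseCriterion.model_validate(
    {"threshold": 0.8, "evaluate_full_response": True}
)

assert not isinstance(criterion, RubricsBasedCriterion)  # type check fails
assert getattr(criterion, "evaluate_full_response", False)  # getattr succeeds
```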