From 1d0a72123361178d990d9238bb38353778dd61f5 Mon Sep 17 00:00:00 2001
From: Sid Gupta
Date: Wed, 8 Apr 2026 15:12:01 -0700
Subject: [PATCH 1/2] feat(eval): add evaluate_full_response option to
 rubric-based evaluation

When an agent emits text before a tool call (e.g. presenting a plan),
then calls a tool, then emits more text (e.g. an explanation), the
rubric_based_final_response_quality_v1 metric sends only the
post-tool-call text to the judge. The pre-tool-call text is stored in
intermediate_data.invocation_events but is never included in the judge
prompt. As a result, rubrics that check for content in the pre-tool-call
text always fail, even though the agent correctly produced that content.

This commit adds an `evaluate_full_response` boolean field to
`RubricsBasedCriterion` (following the pattern of
`evaluate_intermediate_nl_responses` on `HallucinationsCriterion`).
When set to true, the evaluator concatenates all text from
invocation_events with the final_response before sending it to the
judge, giving the judge the complete picture of the agent's output.

Usage:

```json
{
  "rubric_based_final_response_quality_v1": {
    "threshold": 0.8,
    "evaluate_full_response": true,
    "rubrics": [...]
  }
}
```

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 src/google/adk/evaluation/eval_metrics.py     | 13 +++++++
 .../rubric_based_final_response_quality_v1.py | 38 ++++++++++++++++++-
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py
index 50c3473c3a..5895ad807a 100644
--- a/src/google/adk/evaluation/eval_metrics.py
+++ b/src/google/adk/evaluation/eval_metrics.py
@@ -144,6 +144,19 @@ class RubricsBasedCriterion(BaseCriterion):
       ),
   )
 
+  evaluate_full_response: bool = Field(
+      default=False,
+      description=(
+          "Whether to evaluate the full agent response including intermediate"
+          " natural language text (e.g. text emitted before tool calls) in"
+          " addition to the final response. By default, only the final"
+          " response text is sent to the judge. When True, text from all"
+          " intermediate invocation events is concatenated with the final"
+          " response before evaluation. This is useful for agents that emit"
+          " text both before and after tool calls within a single invocation."
+      ),
+  )
+
 
 class HallucinationsCriterion(BaseCriterion):
   """Criterion to use when evaluating agents response for hallucinations."""
diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
index df01aba4ff..9207749984 100644
--- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
+++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
@@ -274,7 +274,21 @@ def format_auto_rater_prompt(
     """Returns the autorater prompt."""
     self.create_effective_rubrics_list(actual_invocation.rubrics)
     user_input = get_text_from_content(actual_invocation.user_content)
-    final_response = get_text_from_content(actual_invocation.final_response)
+
+    # When evaluate_full_response is enabled, include text from intermediate
+    # invocation events (e.g. text emitted before tool calls) in addition to
+    # the final response. This is useful for agents that stream text, call
+    # tools, then stream more text within a single invocation.
+    criterion = self._eval_metric.criterion
+    evaluate_full = (
+        isinstance(criterion, RubricsBasedCriterion)
+        and criterion.evaluate_full_response
+    )
+
+    if evaluate_full:
+      final_response = self._get_full_response_text(actual_invocation)
+    else:
+      final_response = get_text_from_content(actual_invocation.final_response)
 
     rubrics_text = "\n".join([
         f"* {r.rubric_content.text_property}"
@@ -310,3 +324,25 @@ def format_auto_rater_prompt(
     )
 
     return auto_rater_prompt
+
+  @staticmethod
+  def _get_full_response_text(invocation: Invocation) -> str:
+    """Concatenates all NL text from invocation events and the final response.
+
+    When an agent emits text before a tool call (e.g. presenting a plan),
+    that text is stored in intermediate_data.invocation_events but not in
+    final_response. This method collects text from both sources to give the
+    judge a complete picture of the agent's output.
+    """
+    parts = []
+    if invocation.intermediate_data and isinstance(
+        invocation.intermediate_data, InvocationEvents
+    ):
+      for evt in invocation.intermediate_data.invocation_events:
+        text = get_text_from_content(evt.content)
+        if text:
+          parts.append(text)
+    final_text = get_text_from_content(invocation.final_response)
+    if final_text:
+      parts.append(final_text)
+    return "\n\n".join(parts)

From e8b16089dda73f8714c0e7f29eab26e45afc7a5e Mon Sep 17 00:00:00 2001
From: Sid Gupta
Date: Wed, 8 Apr 2026 16:05:38 -0700
Subject: [PATCH 2/2] fix: use getattr instead of isinstance for criterion
 check

The criterion may be deserialized as BaseCriterion (which accepts extra
fields via extra="allow") rather than RubricsBasedCriterion, so the
isinstance check fails even when evaluate_full_response is present.
Using getattr with a default handles both cases.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../adk/evaluation/rubric_based_final_response_quality_v1.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
index 9207749984..5dc6320da3 100644
--- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
+++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
@@ -280,10 +280,7 @@ def format_auto_rater_prompt(
     # the final response. This is useful for agents that stream text, call
     # tools, then stream more text within a single invocation.
     criterion = self._eval_metric.criterion
-    evaluate_full = (
-        isinstance(criterion, RubricsBasedCriterion)
-        and criterion.evaluate_full_response
-    )
+    evaluate_full = getattr(criterion, "evaluate_full_response", False)
 
     if evaluate_full:
       final_response = self._get_full_response_text(actual_invocation)
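
Note for reviewers: as a quick illustration of what patch 1/2 changes about the judge input, here is a minimal standalone sketch of the concatenation behavior. The dataclasses below are simplified stand-ins for the ADK `Invocation`/`InvocationEvents` models, not the real types; in ADK the event content goes through `get_text_from_content` rather than being a plain string.

```python
# Simplified stand-ins, just detailed enough to show what the judge
# receives when evaluate_full_response is enabled.
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class Event:
  content: str  # stand-in for a Content object; real code extracts its text


@dataclass
class InvocationEvents:
  invocation_events: list[Event] = field(default_factory=list)


@dataclass
class Invocation:
  final_response: str
  intermediate_data: Optional[InvocationEvents] = None


def get_full_response_text(invocation: Invocation) -> str:
  """Mirrors _get_full_response_text: intermediate texts first, final text last."""
  parts = []
  if invocation.intermediate_data:
    for evt in invocation.intermediate_data.invocation_events:
      if evt.content:
        parts.append(evt.content)
  if invocation.final_response:
    parts.append(invocation.final_response)
  return "\n\n".join(parts)


invocation = Invocation(
    final_response="Booked the 9am flight as planned.",
    intermediate_data=InvocationEvents(
        invocation_events=[Event(content="Plan: search flights, then book.")]
    ),
)
# The judge now sees both texts, separated by a blank line:
#   Plan: search flights, then book.
#
#   Booked the 9am flight as planned.
print(get_full_response_text(invocation))
```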
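
And a sketch of the deserialization behavior that motivated patch 2/2: when the criterion is parsed as the base class, pydantic's extra="allow" keeps evaluate_full_response around as an extra attribute, so getattr finds it where isinstance cannot. Again, the two models below are simplified stand-ins for the ADK classes, assuming pydantic v2 as the commit message's extra="allow" wording suggests.

```python
# Simplified stand-ins showing why the isinstance check from patch 1/2
# fails after generic deserialization while getattr still works.
from pydantic import BaseModel, ConfigDict


class BaseCriterion(BaseModel):
  # Per the commit message, the base class accepts unknown fields.
  model_config = ConfigDict(extra="allow")

  threshold: float = 0.0


class RubricsBasedCriterion(BaseCriterion):
  evaluate_full_response: bool = False


# A criterion deserialized generically lands on the base class. The extra
# field survives (extra="allow") but the object's type does not change.
criterion = BaseCriterion.model_validate(
    {"threshold": 0.8, "evaluate_full_response": True}
)

assert not isinstance(criterion, RubricsBasedCriterion)  # type check fails
assert getattr(criterion, "evaluate_full_response", False)  # getattr succeeds
```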