Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/google/adk/evaluation/eval_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,19 @@ class RubricsBasedCriterion(BaseCriterion):
),
)

evaluate_full_response: bool = Field(
default=False,
description=(
"Whether to evaluate the full agent response including intermediate"
" natural language text (e.g. text emitted before tool calls) in"
" addition to the final response. By default, only the final"
" response text is sent to the judge. When True, text from all"
" intermediate invocation events is concatenated with the final"
" response before evaluation. This is useful for agents that emit"
" text both before and after tool calls within a single invocation."
),
)


class HallucinationsCriterion(BaseCriterion):
"""Criterion to use when evaluating agents response for hallucinations."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,18 @@ def format_auto_rater_prompt(
"""Returns the autorater prompt."""
self.create_effective_rubrics_list(actual_invocation.rubrics)
user_input = get_text_from_content(actual_invocation.user_content)
final_response = get_text_from_content(actual_invocation.final_response)

# When evaluate_full_response is enabled, include text from intermediate
# invocation events (e.g. text emitted before tool calls) in addition to
# the final response. This is useful for agents that stream text, call
# tools, then stream more text within a single invocation.
criterion = self._eval_metric.criterion
evaluate_full = getattr(criterion, "evaluate_full_response", False)

if evaluate_full:
final_response = self._get_full_response_text(actual_invocation)
else:
final_response = get_text_from_content(actual_invocation.final_response)

rubrics_text = "\n".join([
f"* {r.rubric_content.text_property}"
Expand Down Expand Up @@ -310,3 +321,25 @@ def format_auto_rater_prompt(
)

return auto_rater_prompt

@staticmethod
def _get_full_response_text(invocation: Invocation) -> str:
  """Builds the agent's complete natural-language output as one string.

  An agent may emit text before a tool call (e.g. announcing a plan);
  that text is recorded in intermediate_data.invocation_events rather
  than in final_response. Text from those events is gathered first, then
  the final response is appended, so the judge sees the whole output.

  Args:
    invocation: The invocation whose text content should be collected.

  Returns:
    All non-empty text segments joined by blank lines; an empty string
    when the invocation produced no text at all.
  """
  segments: list[str] = []
  intermediate = invocation.intermediate_data
  # Only InvocationEvents carries per-event content; other (or missing)
  # intermediate_data shapes contribute nothing.
  if intermediate and isinstance(intermediate, InvocationEvents):
    segments.extend(
        event_text
        for event in intermediate.invocation_events
        if (event_text := get_text_from_content(event.content))
    )
  if final_text := get_text_from_content(invocation.final_response):
    segments.append(final_text)
  return "\n\n".join(segments)