Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion python/packages/core/agent_framework/_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1659,6 +1659,7 @@ async def evaluate_workflow(
workflow: Workflow,
workflow_result: WorkflowRunResult | None = None,
queries: str | Sequence[str] | None = None,
expected_output: str | Sequence[str] | None = None,
evaluators: Evaluator | Callable[..., Any] | Sequence[Evaluator | Callable[..., Any]],
eval_name: str | None = None,
include_overall: bool = True,
Expand All @@ -1683,6 +1684,11 @@ async def evaluate_workflow(
workflow: The workflow instance.
workflow_result: A completed ``WorkflowRunResult``.
queries: Test queries to run through the workflow.
expected_output: Ground-truth expected output(s), one per query. A
single string is wrapped into a one-element list. When provided,
must be the same length as ``queries``. Each value is stamped on
the corresponding ``EvalItem.expected_output`` for evaluators
that compare against a reference answer (e.g. similarity).
evaluators: One or more ``Evaluator`` instances.
eval_name: Display name for the evaluation.
include_overall: Whether to evaluate the workflow's final output.
Expand Down Expand Up @@ -1720,10 +1726,20 @@ async def evaluate_workflow(
# Normalize singular query to list
if isinstance(queries, str):
queries = [queries]
if isinstance(expected_output, str):
expected_output = [expected_output]

if workflow_result is None and queries is None:
raise ValueError("Provide either 'workflow_result' or 'queries'.")

Comment thread
chetantoshniwal marked this conversation as resolved.
if expected_output is not None and queries is None:
raise ValueError(
"Provide 'queries' when using 'expected_output';"
" 'expected_output' is not supported with 'workflow_result' only."
)
if expected_output is not None and queries is not None and len(expected_output) != len(queries):
raise ValueError(f"Got {len(queries)} queries but {len(expected_output)} expected_output values.")

if num_repetitions < 1:
raise ValueError(f"num_repetitions must be >= 1, got {num_repetitions}.")

Expand All @@ -1737,7 +1753,7 @@ async def evaluate_workflow(
if queries is not None:
results_list: list[WRR] = []
for _rep in range(num_repetitions):
for q in queries:
for qi, q in enumerate(queries):
result = await workflow.run(q)
if not isinstance(result, WRR):
raise TypeError(f"Expected WorkflowRunResult from workflow.run(), got {type(result).__name__}.")
Expand All @@ -1746,6 +1762,8 @@ async def evaluate_workflow(
if include_overall:
overall_item = _build_overall_item(q, result)
if overall_item:
if expected_output is not None:
overall_item.expected_output = expected_output[qi]
overall_items.append(overall_item)
else:
assert workflow_result is not None # noqa: S101 # nosec B101
Expand Down
20 changes: 18 additions & 2 deletions python/packages/foundry/agent_framework_foundry/_foundry_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@
"builtin.tool_call_success",
}

# Evaluators that require a ground_truth / expected_output field.
_GROUND_TRUTH_EVALUATORS: set[str] = {
"builtin.similarity",
}

_BUILTIN_EVALUATORS: dict[str, str] = {
# Agent behavior
"intent_resolution": "builtin.intent_resolution",
Expand Down Expand Up @@ -196,6 +201,8 @@ def _build_testing_criteria(
}
if qualified == "builtin.groundedness":
mapping["context"] = "{{item.context}}"
if qualified in _GROUND_TRUTH_EVALUATORS:
mapping["ground_truth"] = "{{item.ground_truth}}"
if qualified in _TOOL_EVALUATORS:
mapping["tool_definitions"] = "{{item.tool_definitions}}"
entry["data_mapping"] = mapping
Expand All @@ -204,7 +211,9 @@ def _build_testing_criteria(
return criteria


def _build_item_schema(*, has_context: bool = False, has_tools: bool = False) -> dict[str, Any]:
def _build_item_schema(
*, has_context: bool = False, has_tools: bool = False, has_ground_truth: bool = False
) -> dict[str, Any]:
"""Build the ``item_schema`` for custom JSONL eval definitions."""
properties: dict[str, Any] = {
"query": {"type": "string"},
Expand All @@ -214,6 +223,8 @@ def _build_item_schema(*, has_context: bool = False, has_tools: bool = False) ->
}
if has_context:
properties["context"] = {"type": "string"}
if has_ground_truth:
properties["ground_truth"] = {"type": "string"}
if has_tools:
properties["tool_definitions"] = {"type": "array"}
return {
Expand Down Expand Up @@ -681,16 +692,21 @@ async def _evaluate_via_dataset(
]
if item.context:
d["context"] = item.context
if item.expected_output is not None:
d["ground_truth"] = item.expected_output
dicts.append(d)

has_context = any("context" in d for d in dicts)
has_ground_truth = any("ground_truth" in d for d in dicts)
has_tools = any("tool_definitions" in d for d in dicts)

eval_obj = await self._client.evals.create(
name=eval_name,
data_source_config={ # type: ignore[arg-type] # pyright: ignore[reportArgumentType]
"type": "custom",
"item_schema": _build_item_schema(has_context=has_context, has_tools=has_tools),
"item_schema": _build_item_schema(
has_context=has_context, has_ground_truth=has_ground_truth, has_tools=has_tools
),
"include_sample_schema": True,
},
testing_criteria=_build_testing_criteria( # type: ignore[arg-type] # pyright: ignore[reportArgumentType]
Expand Down
148 changes: 148 additions & 0 deletions python/packages/foundry/tests/test_foundry_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,6 +769,10 @@ def test_quality_evaluators_use_strings(self) -> None:
assert c["data_mapping"]["query"] == "{{item.query}}", f"{c['name']}"
assert c["data_mapping"]["response"] == "{{item.response}}", f"{c['name']}"

def test_similarity_includes_ground_truth(self) -> None:
    """The similarity evaluator's data mapping references ``item.ground_truth``."""
    built = _build_testing_criteria(["similarity"], "gpt-4o", include_data_mapping=True)
    mapping = built[0]["data_mapping"]
    assert mapping["ground_truth"] == "{{item.ground_truth}}"

def test_all_tool_evaluators_include_tool_definitions(self) -> None:
tool_evals = [
"tool_call_accuracy",
Expand Down Expand Up @@ -801,6 +805,10 @@ def test_with_tools(self) -> None:
schema = _build_item_schema(has_tools=True)
assert "tool_definitions" in schema["properties"]

def test_with_ground_truth(self) -> None:
    """``has_ground_truth=True`` adds a ``ground_truth`` property to the item schema."""
    properties = _build_item_schema(has_ground_truth=True)["properties"]
    assert "ground_truth" in properties

def test_with_context_and_tools(self) -> None:
schema = _build_item_schema(has_context=True, has_tools=True)
assert "context" in schema["properties"]
Expand Down Expand Up @@ -1015,6 +1023,50 @@ async def test_evaluate_with_tool_items_uses_dataset_path(self) -> None:
assert ds["type"] == "jsonl"
assert "tool_definitions" in ds["source"]["content"][0]["item"]

async def test_evaluate_ground_truth_in_dataset(self) -> None:
    """An item's ``expected_output`` is emitted as ``ground_truth`` in the dataset.

    Checks both the JSONL row passed to ``evals.runs.create`` and the
    ``item_schema`` passed to ``evals.create``.
    """
    client = MagicMock()

    # Objects handed back by the (mocked) eval / run creation calls.
    created_eval = MagicMock()
    created_eval.id = "eval_gt"
    client.evals.create = AsyncMock(return_value=created_eval)

    created_run = MagicMock()
    created_run.id = "run_gt"
    client.evals.runs.create = AsyncMock(return_value=created_run)

    # Run state returned by ``evals.runs.retrieve``: already completed.
    finished_run = MagicMock()
    finished_run.status = "completed"
    finished_run.result_counts = _rc(passed=1)
    finished_run.report_url = None
    finished_run.per_testing_criteria_results = None
    client.evals.runs.retrieve = AsyncMock(return_value=finished_run)

    conversation = [Message("user", ["What is 2+2?"]), Message("assistant", ["4"])]
    eval_items = [EvalItem(conversation=conversation, expected_output="4")]

    evaluator = FoundryEvals(client=client, model="gpt-4o", evaluators=[FoundryEvals.SIMILARITY])
    await evaluator.evaluate(eval_items)

    # The JSONL row sent with the run carries the reference answer.
    data_source = client.evals.runs.create.call_args.kwargs["data_source"]
    assert data_source["type"] == "jsonl"
    first_row = data_source["source"]["content"][0]["item"]
    assert first_row["ground_truth"] == "4"

    # The eval definition's item schema declares the field as well.
    item_schema = client.evals.create.call_args.kwargs["data_source_config"]["item_schema"]
    assert "ground_truth" in item_schema["properties"]

async def test_evaluate_image_content_in_dataset(self) -> None:
"""Image content in conversations is preserved in the JSONL payload."""
mock_client = MagicMock()
Expand Down Expand Up @@ -1988,6 +2040,102 @@ async def test_per_agent_excludes_tool_evaluators_when_no_tools(self) -> None:
"researcher has tools — should get tool_call_accuracy"
)

async def test_expected_output_stamps_overall_items(self) -> None:
    """``expected_output`` reaches the overall item's ``ground_truth`` in the dataset."""
    oai_client = self._mock_oai_client()

    # Canned run: one agent executor followed by the workflow's final output.
    agent_response = _make_agent_exec_response("agent", "Response", ["Query"])
    final_messages = [Message("assistant", ["Final answer"])]
    canned_result = WorkflowRunResult(
        [
            WorkflowEvent.executor_invoked("agent", "Test query"),
            WorkflowEvent.executor_completed("agent", [agent_response]),
            WorkflowEvent.output("end", final_messages),
        ],
        [],
    )

    workflow = MagicMock()
    workflow.executors = {}
    workflow.run = AsyncMock(return_value=canned_result)

    evaluator = FoundryEvals(
        client=oai_client,
        model="gpt-4o",
        evaluators=[FoundryEvals.SIMILARITY],
    )
    results = await evaluate_workflow(
        workflow=workflow,
        queries=["Test query"],
        expected_output=["Expected answer"],
        evaluators=evaluator,
    )

    assert results[0].status == "completed"

    # The overall eval is the last evals.runs.create call; its first dataset
    # row must carry the reference answer.
    last_run_call = oai_client.evals.runs.create.call_args_list[-1]
    row = last_run_call.kwargs["data_source"]["source"]["content"][0]["item"]
    assert row["ground_truth"] == "Expected answer"

async def test_expected_output_with_num_repetitions(self) -> None:
    """Each query keeps its own ``expected_output`` across repetitions.

    Two queries with distinct reference answers are used on purpose: with a
    single query, an index mix-up between ``queries`` and ``expected_output``
    (or between repetitions) would go unnoticed because every item would
    carry the same value.
    """
    mock_oai = self._mock_oai_client()

    aer = _make_agent_exec_response("agent", "Response", ["Query"])
    final_output = [Message("assistant", ["Final answer"])]

    events = [
        WorkflowEvent.executor_invoked("agent", "Test query"),
        WorkflowEvent.executor_completed("agent", [aer]),
        WorkflowEvent.output("end", final_output),
    ]
    wf_result = WorkflowRunResult(events, [])

    mock_workflow = MagicMock()
    mock_workflow.executors = {}
    mock_workflow.run = AsyncMock(return_value=wf_result)

    queries = ["First query", "Second query"]
    expected = ["First answer", "Second answer"]
    repetitions = 2

    results = await evaluate_workflow(
        workflow=mock_workflow,
        queries=queries,
        expected_output=expected,
        evaluators=FoundryEvals(
            client=mock_oai,
            model="gpt-4o",
            evaluators=[FoundryEvals.SIMILARITY],
        ),
        num_repetitions=repetitions,
    )

    assert results[0].status == "completed"

    # workflow.run is awaited once per query per repetition, in query order.
    assert mock_workflow.run.call_count == len(queries) * repetitions
    assert [c.args[0] for c in mock_workflow.run.call_args_list] == queries * repetitions

    # The overall eval is the last evals.runs.create call.
    calls = mock_oai.evals.runs.create.call_args_list
    overall_call = calls[-1]
    ds = overall_call.kwargs["data_source"]
    items = ds["source"]["content"]
    assert len(items) == len(queries) * repetitions

    # Overall items are collected in run order (each repetition walks the
    # queries in order), so the reference answers must repeat in that order.
    assert [row["item"]["ground_truth"] for row in items] == expected * repetitions

async def test_expected_output_length_mismatch_raises(self) -> None:
    """A ``ValueError`` is raised when ``queries`` and ``expected_output`` differ in length."""
    oai_client = MagicMock()
    workflow = MagicMock()

    # Two queries but only one reference answer.
    two_queries = ["q1", "q2"]
    one_expected = ["e1"]

    with pytest.raises(ValueError, match="expected_output"):
        await evaluate_workflow(
            workflow=workflow,
            queries=two_queries,
            expected_output=one_expected,
            evaluators=FoundryEvals(client=oai_client, model="gpt-4o"),
        )


# ---------------------------------------------------------------------------
# EvalItemResult and EvalScoreResult
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

"""Evaluate an agent using Azure AI Foundry's built-in evaluators.

This sample demonstrates two patterns:
This sample demonstrates three patterns:
1. evaluate_agent(responses=...) — Evaluate a response you already have.
2. evaluate_agent(queries=...) — Run the agent against test queries and evaluate in one call.
3. Similarity — Compare agent output against ground-truth reference answers.

See ``evaluate_tool_calls_sample.py`` for tool-call accuracy evaluation.

Expand Down Expand Up @@ -149,6 +150,41 @@ async def main() -> None:
else:
print(f"[FAIL] {r.failed} failed")

# =========================================================================
# Pattern 3: Similarity — compare agent output to ground-truth answers
# =========================================================================
print()
print("=" * 60)
print("Pattern 3: Similarity evaluation with ground truth")
print("=" * 60)

# Similarity requires expected_output — a reference answer per query
# that the evaluator compares against the agent's actual response.
results = await evaluate_agent(
agent=agent,
queries=[
"What's the weather like in Seattle?",
"How much does a flight from Seattle to Paris cost?",
],
expected_output=[
"62°F, cloudy with a chance of rain",
"Flights from Seattle to Paris: $450 round-trip",
],
evaluators=FoundryEvals(
client=chat_client,
evaluators=[FoundryEvals.SIMILARITY],
),
)

for r in results:
print(f"Status: {r.status}")
print(f"Results: {r.passed}/{r.total} passed")
print(f"Portal: {r.report_url}")
if r.all_passed:
print("[PASS] All passed")
else:
print(f"[FAIL] {r.failed} failed")


if __name__ == "__main__":
asyncio.run(main())
Loading
Loading