-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Expand file tree
/
Copy pathevaluate_agent_sample.py
More file actions
190 lines (160 loc) · 6.57 KB
/
evaluate_agent_sample.py
File metadata and controls
190 lines (160 loc) · 6.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# Copyright (c) Microsoft. All rights reserved.
"""Evaluate an agent using Azure AI Foundry's built-in evaluators.
This sample demonstrates three patterns:
1. evaluate_agent(responses=...) — Evaluate a response you already have.
2. evaluate_agent(queries=...) — Run the agent against test queries and evaluate in one call.
3. Similarity — Compare agent output against ground-truth reference answers.
See ``evaluate_tool_calls_sample.py`` for tool-call accuracy evaluation.
Prerequisites:
- An Azure AI Foundry project with a deployed model
- Set FOUNDRY_PROJECT_ENDPOINT and FOUNDRY_MODEL in .env
"""
import asyncio
import os
from agent_framework import Agent, ConversationSplit, evaluate_agent
from agent_framework.foundry import FoundryChatClient, FoundryEvals
from azure.identity import AzureCliCredential
from dotenv import load_dotenv
load_dotenv()
# Define a simple tool for the agent
def get_weather(location: str) -> str:
    """Get the current weather for a location.

    Returns a canned conditions string for a small set of known cities,
    or a not-available message for anything else.
    """
    known_conditions = {
        "seattle": "62°F, cloudy with a chance of rain",
        "london": "55°F, overcast",
        "paris": "68°F, partly sunny",
    }
    # Lookup is case-insensitive; the fallback echoes the caller's original spelling.
    key = location.lower()
    if key in known_conditions:
        return known_conditions[key]
    return f"Weather data not available for {location}"
def get_flight_price(origin: str, destination: str) -> str:
    """Get the price of a flight between two cities.

    Demo stub: always quotes the same flat round-trip fare.
    """
    route = f"Flights from {origin} to {destination}"
    return f"{route}: $450 round-trip"
def _print_banner(title: str) -> None:
    """Print a section banner for one demo pattern."""
    print("=" * 60)
    print(title)
    print("=" * 60)


def _print_results(results) -> None:
    """Print a status/pass-count/portal summary for each evaluation result.

    NOTE(review): assumes each item exposes .status, .passed, .total,
    .report_url, .all_passed, .failed — matches the evaluate_agent result
    objects used throughout this sample.
    """
    for r in results:
        print(f"Status: {r.status}")
        print(f"Results: {r.passed}/{r.total} passed")
        print(f"Portal: {r.report_url}")
        if r.all_passed:
            print("[PASS] All passed")
        else:
            print(f"[FAIL] {r.failed} failed")


async def main() -> None:
    """Run the four evaluation demo patterns end to end.

    Requires FOUNDRY_PROJECT_ENDPOINT (and optionally FOUNDRY_MODEL) in the
    environment; raises KeyError if the endpoint is missing.
    """
    # 1. Set up the FoundryChatClient
    chat_client = FoundryChatClient(
        project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"],
        model=os.environ.get("FOUNDRY_MODEL", "gpt-4o"),
        credential=AzureCliCredential(),
    )
    # 2. Create an agent with tools
    agent = Agent(
        client=chat_client,
        name="travel-assistant",
        instructions=(
            "You are a helpful travel assistant. Use your tools to answer questions about weather and flights."
        ),
        tools=[get_weather, get_flight_price],
    )
    # 3. Create the evaluator — provider config goes here, once
    evals = FoundryEvals(client=chat_client)

    # =========================================================================
    # Pattern 1: evaluate_agent(responses=...) — evaluate a response you already have
    # =========================================================================
    _print_banner("Pattern 1: evaluate_agent(responses=...) — evaluate existing response")
    query = "How much does a flight from Seattle to Paris cost?"
    response = await agent.run(query)
    print(f"Agent said: {response.text[:100]}...")
    # Pass agent= so tool definitions are extracted, queries= for the eval item context
    results = await evaluate_agent(
        agent=agent,
        responses=response,
        queries=[query],
        evaluators=FoundryEvals(
            client=chat_client,
            evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY],
        ),
    )
    _print_results(results)

    # =========================================================================
    # Pattern 2a: evaluate_agent() — batch test queries
    # =========================================================================
    print()
    _print_banner("Pattern 2a: evaluate_agent()")
    # Calls agent.run() under the covers for each query, then evaluates
    results = await evaluate_agent(
        agent=agent,
        queries=[
            "What's the weather like in Seattle?",
            "How much does a flight from Seattle to Paris cost?",
            "What should I pack for London?",
        ],
        evaluators=evals,  # uses smart defaults (auto-adds tool_call_accuracy)
    )
    _print_results(results)

    # =========================================================================
    # Pattern 2b: evaluate_agent() — with conversation split override
    # =========================================================================
    print()
    _print_banner("Pattern 2b: evaluate_agent() with conversation_split")
    # conversation_split forces all evaluators to use the same split strategy.
    # FULL evaluates the entire conversation trajectory against the original query.
    results = await evaluate_agent(
        agent=agent,
        queries=[
            "What's the weather like in Seattle?",
            "What should I pack for London?",
        ],
        evaluators=evals,
        conversation_split=ConversationSplit.FULL,  # overrides evaluator defaults
    )
    _print_results(results)

    # =========================================================================
    # Pattern 3: Similarity — compare agent output to ground-truth answers
    # =========================================================================
    print()
    _print_banner("Pattern 3: Similarity evaluation with ground truth")
    # Similarity requires expected_output — a reference answer per query
    # that the evaluator compares against the agent's actual response.
    results = await evaluate_agent(
        agent=agent,
        queries=[
            "What's the weather like in Seattle?",
            "How much does a flight from Seattle to Paris cost?",
        ],
        expected_output=[
            "62°F, cloudy with a chance of rain",
            "Flights from Seattle to Paris: $450 round-trip",
        ],
        evaluators=FoundryEvals(
            client=chat_client,
            evaluators=[FoundryEvals.SIMILARITY],
        ),
    )
    _print_results(results)


if __name__ == "__main__":
    asyncio.run(main())