Skip to content

Commit 8a20b02

Browse files
committed
klavis MCP
1 parent c3de2a2 commit 8a20b02

File tree

3 files changed

+46
-32
lines changed

3 files changed

+46
-32
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Find the first 5 emails title in my inbox." } ], "ground_truth": "The first 5 emails contain meeting between Benny and Zheng"}
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
{
22
"mcpServers": {
3-
"klavis-strata": {
4-
"url": "https://strata.klavis.ai/mcp/",
5-
"authorization": "Bearer ${KLAVIS_API_KEY}"
6-
}
3+
"klavis-strata": {
4+
"url": "https://strata.klavis.ai/mcp/",
5+
"authorization": "Bearer ${KLAVIS_API_KEY}"
76
}
7+
}
88
}
Lines changed: 41 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,54 @@
11
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
22
from eval_protocol.pytest import AgentRolloutProcessor, evaluation_test
3+
from openai import AsyncOpenAI
4+
import json
5+
from pydantic import BaseModel
6+
import logging
7+
8+
logger = logging.getLogger(__name__)
9+
import os
10+
11+
12+
class ResponseFormat(BaseModel):
    # Structured-output schema handed to the judge model via
    # `response_format={"type": "json_schema", ...}`.
    # NOTE: deliberately no class docstring — pydantic would embed it as the
    # schema "description" and change the payload sent to the judge.
    # score: 1 when the model output contains the ground truth, 0 otherwise
    # (per the judge's system prompt below).
    score: float
314

415

516
@evaluation_test(
    input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"],
    rollout_processor=AgentRolloutProcessor(),
    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
    mode="pointwise",
    mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json",
)
async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow:
    """Score a Klavis-MCP Gmail rollout with an LLM judge.

    The judge model is shown the assistant's final message and the row's
    ground truth, and must reply with structured JSON matching
    ``ResponseFormat`` (``{"score": 0 or 1}``).

    Args:
        row: Evaluated rollout; ``row.messages`` holds the transcript and
            ``row.ground_truth`` the expected inbox contents.

    Returns:
        The same row with ``row.evaluation_result`` populated.

    Raises:
        KeyError: if ``FIREWORKS_API_KEY`` is not set in the environment.
    """
    ground_truth = row.ground_truth

    async with AsyncOpenAI(
        api_key=os.environ["FIREWORKS_API_KEY"], base_url="https://api.fireworks.ai/inference/v1"
    ) as client:
        response = await client.chat.completions.create(
            model="accounts/fireworks/models/kimi-k2-instruct-0905",
            messages=[
                {
                    "role": "system",
                    "content": "You are judging the output of the model versus the ground truth. Return score = 1 if the output contains the ground truth, 0 otherwise.",
                },
                {
                    "role": "user",
                    # BUG FIX: this was a plain string, so the judge saw the
                    # literal text "{row.messages[-1].content}" instead of the
                    # actual transcript/ground truth. Must be an f-string.
                    "content": f"Final model output: {row.messages[-1].content}\nGround truth: {ground_truth}",
                },
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {"name": "ResponseFormat", "schema": ResponseFormat.model_json_schema()},
            },
        )
        response_text = response.choices[0].message.content
        logger.info("response_text: %s", response_text)
        # BUG FIX: `["score"]` raised KeyError on the very "{}" fallback built
        # for a None response; default to 0.0 (judge failed -> no credit).
        score = json.loads(response_text or "{}").get("score", 0.0)

    row.evaluation_result = EvaluateResult(
        score=score,
        reason=response_text,
    )
    return row

0 commit comments

Comments
 (0)