Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/pytest/datasets/gmail_inbox.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Find the titles of the first 5 emails in my inbox." } ], "ground_truth": "The first 5 emails contain a meeting between Benny and Zheng"}
8 changes: 4 additions & 4 deletions tests/pytest/mcp_configurations/klavis_strata_mcp.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
  "mcpServers": {
    "klavis-strata": {
      "url": "https://strata.klavis.ai/mcp/",
      "authorization": "Bearer ${KLAVIS_API_KEY}"
    }
  }
}
69 changes: 41 additions & 28 deletions tests/pytest/test_pytest_klavis_mcp.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,54 @@
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
from eval_protocol.pytest import AgentRolloutProcessor, evaluation_test
from openai import AsyncOpenAI
import json
from pydantic import BaseModel
import logging

logger = logging.getLogger(__name__)
import os


class ResponseFormat(BaseModel):
    """Structured-output schema for the LLM judge's reply.

    The judge is instructed to return a single numeric ``score``
    (1 when the model output contains the ground truth, 0 otherwise).
    """

    score: float


@evaluation_test(
    input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"],
    rollout_processor=AgentRolloutProcessor(),
    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
    mode="pointwise",
    mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json",
)
async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow:
    """Score a Klavis-MCP Gmail rollout with an LLM judge.

    Sends the rollout's final message and the row's ground truth to a
    Fireworks-hosted judge model, which returns a JSON object matching
    ``ResponseFormat`` (score 1 if the output contains the ground truth,
    0 otherwise). The score and the judge's raw response are stored on
    ``row.evaluation_result``.

    Args:
        row: The evaluation row produced by the agent rollout; its last
            message is assumed to be the model's final answer.

    Returns:
        The same ``row`` with ``evaluation_result`` populated.

    Raises:
        KeyError: If the ``FIREWORKS_API_KEY`` environment variable is unset.
    """
    ground_truth = row.ground_truth

    # Use structured output (json_schema) so the judge's reply is
    # guaranteed to be parseable JSON with a numeric "score" field.
    async with AsyncOpenAI(
        api_key=os.environ["FIREWORKS_API_KEY"],
        base_url="https://api.fireworks.ai/inference/v1",
    ) as client:
        response = await client.chat.completions.create(
            model="accounts/fireworks/models/kimi-k2-instruct-0905",
            messages=[
                {
                    "role": "system",
                    "content": "You are judging the output of the model versus the ground truth. Return score = 1 if the output contains the ground truth, 0 otherwise.",
                },
                {
                    "role": "user",
                    "content": f"Final model output: {row.messages[-1].content}\nGround truth: {ground_truth}",
                },
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {"name": "ResponseFormat", "schema": ResponseFormat.model_json_schema()},
            },
        )

    response_text = response.choices[0].message.content
    logger.info("response_text: %s", response_text)
    # Default to 0.0 when the judge returns no content or omits "score",
    # instead of failing the whole evaluation with a KeyError.
    score = json.loads(response_text or "{}").get("score", 0.0)
    row.evaluation_result = EvaluateResult(
        score=score,
        reason=response_text or "judge returned no content",
    )
    return row
Loading