From 8a20b0240248ff5cc2da1389720283f8e4e3ecf2 Mon Sep 17 00:00:00 2001 From: Benny Chen Date: Mon, 3 Nov 2025 16:33:43 -0800 Subject: [PATCH 1/2] klavis MCP --- tests/pytest/datasets/gmail_inbox.jsonl | 1 + .../mcp_configurations/klavis_strata_mcp.json | 8 +-- tests/pytest/test_pytest_klavis_mcp.py | 69 +++++++++++-------- 3 files changed, 46 insertions(+), 32 deletions(-) create mode 100644 tests/pytest/datasets/gmail_inbox.jsonl diff --git a/tests/pytest/datasets/gmail_inbox.jsonl b/tests/pytest/datasets/gmail_inbox.jsonl new file mode 100644 index 00000000..789a7dde --- /dev/null +++ b/tests/pytest/datasets/gmail_inbox.jsonl @@ -0,0 +1 @@ +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Find the first 5 emails title in my inbox." } ], "ground_truth": "The first 5 emails contain meeting between Benny and Zheng"} diff --git a/tests/pytest/mcp_configurations/klavis_strata_mcp.json b/tests/pytest/mcp_configurations/klavis_strata_mcp.json index 02bf542d..fd9e6923 100644 --- a/tests/pytest/mcp_configurations/klavis_strata_mcp.json +++ b/tests/pytest/mcp_configurations/klavis_strata_mcp.json @@ -1,8 +1,8 @@ { "mcpServers": { - "klavis-strata": { - "url": "https://strata.klavis.ai/mcp/", - "authorization": "Bearer ${KLAVIS_API_KEY}" - } + "klavis-strata": { + "url": "https://strata.klavis.ai/mcp/", + "authorization": "Bearer ${KLAVIS_API_KEY}" } + } } diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py index bec5b4b0..9db74154 100644 --- a/tests/pytest/test_pytest_klavis_mcp.py +++ b/tests/pytest/test_pytest_klavis_mcp.py @@ -1,41 +1,54 @@ from eval_protocol.models import EvaluateResult, EvaluationRow, Message from eval_protocol.pytest import AgentRolloutProcessor, evaluation_test +from openai import AsyncOpenAI +import json +from pydantic import BaseModel +import logging + +logger = logging.getLogger(__name__) +import os + + +class ResponseFormat(BaseModel): + score: float @evaluation_test( - input_messages=[ - [ - [ - Message( - role="system", - content=( - "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information.\n" - ), - ), - Message( - role="user", - content=("Find the first 5 emails title in my inbox."), - ), - ] - ] - ], + input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"], rollout_processor=AgentRolloutProcessor(), completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}], mode="pointwise", mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json", ) -def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow: - # filter for all tool calls - tool_calls = [msg for msg in row.messages if msg.role == "tool"] - if len(tool_calls) == 0: +async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow: + ground_truth = row.ground_truth + # check if the final messages contains the ground truth + + async with AsyncOpenAI( + api_key=os.environ["FIREWORKS_API_KEY"], base_url="https://api.fireworks.ai/inference/v1" + ) as client: + response = await client.chat.completions.create( + model="accounts/fireworks/models/kimi-k2-instruct-0905", + messages=[ + { + "role": "system", + "content": "You are judging the output of the model versus the ground truth. Return score = 1 if the output contains the ground truth, 0 otherwise.", + }, + { + "role": "user", + "content": "Final model output: {row.messages[-1].content}\nGround truth: {ground_truth}", + }, + ], + response_format={ + "type": "json_schema", + "json_schema": {"name": "ResponseFormat", "schema": ResponseFormat.model_json_schema()}, + }, + ) + response_text = response.choices[0].message.content + logger.info("response_text: %s", response_text) + score = json.loads(response_text or "{}")["score"] row.evaluation_result = EvaluateResult( - score=0, - reason="No tool calls made", + score=score, + reason=response_text, ) - return row - - row.evaluation_result = EvaluateResult( - score=1, - reason="At least one tool call was made", - ) return row From e2d8427989b5a7f1bf23b91aa4d4ac594cabbedd Mon Sep 17 00:00:00 2001 From: Benny Chen Date: Mon, 3 Nov 2025 17:46:03 -0800 Subject: [PATCH 2/2] format string fix --- tests/pytest/test_pytest_klavis_mcp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py index 9db74154..00f48c9c 100644 --- a/tests/pytest/test_pytest_klavis_mcp.py +++ b/tests/pytest/test_pytest_klavis_mcp.py @@ -36,7 +36,7 @@ async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow: }, { "role": "user", - "content": "Final model output: {row.messages[-1].content}\nGround truth: {ground_truth}", + "content": f"Final model output: {row.messages[-1].content}\nGround truth: {ground_truth}", }, ], response_format={