Skip to content

Commit f162be3

Browse files
Pr 330 (#357)
* First Commit of Klavis Strata MCP * Add auth header for Klavis MCP * Update the simple email use case * Add notion task * change dataset file name * Add more use cases * change the API key to auth token * Modify test case * updated klavis * Update klavis_strata_mcp.json --------- Co-authored-by: zhengliu <lliuzzheng@gmail.com> Co-authored-by: LLiuZheng <zhengliu@klavis.ai>
1 parent e0923cf commit f162be3

File tree

4 files changed

+26
-5
lines changed

4 files changed

+26
-5
lines changed

tests/pytest/datasets/gmail_inbox.jsonl

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to Gmail to help you find information." }, { "role": "user", "content": "Fetch the first 5 emails in my gmail inbox and get the sender. Output the sender only." } ], "ground_truth": "The response contains 5 email addresses." }
2+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion tools to help you find information." }, { "role": "user", "content": "How many Notion pages are in MCPMark Source Hub?" } ], "ground_truth": "10" }
3+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion tools to help you find information." }, { "role": "user", "content": "In the Notion Toronto guide, help me find a pizza restaurant that offers takeout." } ], "ground_truth": "Pizzeria Badiali" }
4+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion tools to help you find information." }, { "role": "user", "content": "Check the Japan Travel Planner page and help me calculate how much I spent on accommodation." } ], "ground_truth": "$373.63" }
5+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion tools to help you find information." }, { "role": "user", "content": "Check the Japan Travel Planner page and help me check how many Tokyo attractions I've visited." } ], "ground_truth": "2" }
6+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion tools to help you find information." }, { "role": "user", "content": "Check the Company In A Box page and help me find how many presses we had during 2018. You can find the presses in the company wiki." } ], "ground_truth": "3" }
7+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion tools to help you find information." }, { "role": "user", "content": "Check the Company In A Box page and help me figure out how many FAQ items are under the training & upskilling category."} ], "ground_truth": "4" }
8+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have today?" } ], "ground_truth": "3" }
9+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days this week?" } ], "ground_truth": "8" }
10+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How much available time (in hours) do I have during my business hours on the next working day?" } ], "ground_truth": "2 hour" }
11+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on Oct 15 2025?" } ], "ground_truth": "4" }
12+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days of the week of Oct 15 2025?" } ], "ground_truth": "9" }
13+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on next week's Thursday?" } ], "ground_truth": "2" }
14+
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on next week's business days?" } ], "ground_truth": "5" }
15+

tests/pytest/mcp_configurations/klavis_strata_mcp.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"mcpServers": {
33
"klavis-strata": {
44
"url": "https://strata.klavis.ai/mcp/",
5-
"authorization": "Bearer ${KLAVIS_API_KEY}"
5+
"authorization": "Bearer ${KLAVIS_AUTH_TOKEN}"
66
}
77
}
88
}

tests/pytest/test_pytest_klavis_mcp.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,16 @@ class ResponseFormat(BaseModel):
1313
score: float
1414

1515

16+
"""
17+
Setup: duplicate https://painted-tennis-ebc.notion.site/MCPMark-Source-Hub-23181626b6d7805fb3a7d59c63033819
18+
into your own Notion workspace before running the Notion test cases.
19+
"""
20+
21+
1622
@evaluation_test(
17-
input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"],
23+
input_dataset=["tests/pytest/datasets/klavis_mcp_test.jsonl"],
1824
rollout_processor=AgentRolloutProcessor(),
19-
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}],
25+
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-thinking"}],
2026
mode="pointwise",
2127
mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json",
2228
)
@@ -28,7 +34,7 @@ async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow:
2834
api_key=os.environ["FIREWORKS_API_KEY"], base_url="https://api.fireworks.ai/inference/v1"
2935
) as client:
3036
response = await client.chat.completions.create(
31-
model="accounts/fireworks/models/kimi-k2-instruct-0905",
37+
model="accounts/fireworks/models/kimi-k2-thinking",
3238
messages=[
3339
{
3440
"role": "system",
@@ -47,6 +53,7 @@ async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow:
4753
response_text = response.choices[0].message.content
4854
logger.info("response_text: %s", response_text)
4955
score = json.loads(response_text or "{}")["score"]
56+
5057
row.evaluation_result = EvaluateResult(
5158
score=score,
5259
reason=response_text,

0 commit comments

Comments
 (0)