diff --git a/tests/pytest/datasets/gmail_inbox.jsonl b/tests/pytest/datasets/gmail_inbox.jsonl deleted file mode 100644 index 789a7dde..00000000 --- a/tests/pytest/datasets/gmail_inbox.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Find the first 5 emails title in my inbox." } ], "ground_truth": "The first 5 emails contain meeting between Benny and Zheng"} diff --git a/tests/pytest/datasets/klavis_mcp_test.jsonl b/tests/pytest/datasets/klavis_mcp_test.jsonl new file mode 100644 index 00000000..9cee59a7 --- /dev/null +++ b/tests/pytest/datasets/klavis_mcp_test.jsonl @@ -0,0 +1,15 @@ +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to Gmail to help you find information." }, { "role": "user", "content": "Fetch the first 5 emails in my gmail inbox and get the sender. Output the sender only." } ], "ground_truth": "The response contains 5 email addresses." } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion to help you find information." }, { "role": "user", "content": "How many notion pages are in MCPMark Source Hub?" } ], "ground_truth": "10" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion to help you find information." }, { "role": "user", "content": "In the notion Toronto guide, help me to find a pizza restaurant which is able to takeout." } ], "ground_truth": "Pizzeria Badiali" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion to help you find information." 
}, { "role": "user", "content": "Check Japan Travel Planner page, help me to calculate how much did I spend in accommodation." } ], "ground_truth": "$373.63" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion to help you find information." }, { "role": "user", "content": "Check Japan Travel Planner page, help me to check how many tokyo attractions I've visited." } ], "ground_truth": "2" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion to help you find information." }, { "role": "user", "content": "Check Company In A Box page, help me to find how many presses did we have during 2018. You can find the presses in company wiki." } ], "ground_truth": "3" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion to help you find information." }, { "role": "user", "content": "Check Company In A Box page, help me to figure out how many FAQ items under training & upskilling category."} ], "ground_truth": "4" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have today?" } ], "ground_truth": "3" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days this week?" } ], "ground_truth": "8" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." 
}, { "role": "user", "content": "How many available time(in hour) do I have during my business hour the next working day?" } ], "ground_truth": "2 hour" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on Oct 15 2025?" } ], "ground_truth": "4" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days of the week of Oct 15 2025?" } ], "ground_truth": "9" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on next week's Thursday?" } ], "ground_truth": "2" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on next week's business day?" 
} ], "ground_truth": "5" } + diff --git a/tests/pytest/mcp_configurations/klavis_strata_mcp.json b/tests/pytest/mcp_configurations/klavis_strata_mcp.json index fd9e6923..b6becb87 100644 --- a/tests/pytest/mcp_configurations/klavis_strata_mcp.json +++ b/tests/pytest/mcp_configurations/klavis_strata_mcp.json @@ -2,7 +2,7 @@ "mcpServers": { "klavis-strata": { "url": "https://strata.klavis.ai/mcp/", - "authorization": "Bearer ${KLAVIS_API_KEY}" + "authorization": "Bearer ${KLAVIS_AUTH_TOKEN}" } } } diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py index 67ff6de4..c7e92b06 100644 --- a/tests/pytest/test_pytest_klavis_mcp.py +++ b/tests/pytest/test_pytest_klavis_mcp.py @@ -13,10 +13,16 @@ class ResponseFormat(BaseModel): score: float +""" +You should copy https://painted-tennis-ebc.notion.site/MCPMark-Source-Hub-23181626b6d7805fb3a7d59c63033819 +into your Notion for the notion test. +""" + + @evaluation_test( - input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"], + input_dataset=["tests/pytest/datasets/klavis_mcp_test.jsonl"], rollout_processor=AgentRolloutProcessor(), - completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-thinking"}], mode="pointwise", mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json", ) @@ -28,7 +34,7 @@ async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow: api_key=os.environ["FIREWORKS_API_KEY"], base_url="https://api.fireworks.ai/inference/v1" ) as client: response = await client.chat.completions.create( - model="accounts/fireworks/models/kimi-k2-instruct-0905", + model="accounts/fireworks/models/kimi-k2-thinking", messages=[ { "role": "system", @@ -47,6 +53,7 @@ async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow: response_text = response.choices[0].message.content logger.info("response_text: %s", response_text) score = 
json.loads(response_text or "{}")["score"] + row.evaluation_result = EvaluateResult( score=score, reason=response_text,