|
1 | | -{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Fetch the first 5 emails in my gmail inbox and get the sender. Output the sender only." } ], "ground_truth": "The response contains 5 email addresses." } |
2 | | -{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to help you find information." }, { "role": "user", "content": "In the notion Toronto guide, help me to find a pizza restaurant which is able to takeout" } ], "ground_truth": "Pizzeria Badiali" } |
| 1 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to Gmail to help you find information." }, { "role": "user", "content": "Fetch the first 5 emails in my gmail inbox and get the sender. Output the sender only." } ], "ground_truth": "The response contains 5 email addresses." } |
| 2 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to Notion you find information." }, { "role": "user", "content": "How many notion pages are in MCPMark Source Hub?" } ], "ground_truth": "10" } |
| 3 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to Notion you find information." }, { "role": "user", "content": "In the notion Toronto guide, help me to find a pizza restaurant which is able to takeout." } ], "ground_truth": "Pizzeria Badiali" } |
| 4 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to Notion you find information." }, { "role": "user", "content": "Check Japan Travel Planner page, help me to calculate how much did I spend in accomondation." } ], "ground_truth": "$373.63" } |
| 5 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to Notion you find information." }, { "role": "user", "content": "Check Japan Travel Planner page, help me to check how many tokyo attractions I've visited." } ], "ground_truth": "2" } |
| 6 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to Notion you find information." }, { "role": "user", "content": "Check Company In A Box page, help me to find how many presses did we have during 2018. You can find the presses in company wiki." } ], "ground_truth": "3" } |
| 7 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to Notion you find information." }, { "role": "user", "content": "Check Company In A Box page, help me to figure out how many FAQ items under training & upskilling category."} ], "ground_truth": "4" } |
| 8 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have today?" } ], "ground_truth": "3" } |
| 9 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days this week?" } ], "ground_truth": "8" } |
| 10 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many available time(in hour) do I have during my business hour tomorrow?" } ], "ground_truth": "3 hour" } |
| 11 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on Oct 15 2025?" } ], "ground_truth": "4" } |
| 12 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days of the week of Oct 15 2025?" } ], "ground_truth": "9" } |
| 13 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on next week's Thursday?" } ], "ground_truth": "2" } |
| 14 | +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on next week's buisiness day?" } ], "ground_truth": "5" } |
| 15 | + |
0 commit comments