From 8a20b0240248ff5cc2da1389720283f8e4e3ecf2 Mon Sep 17 00:00:00 2001
From: Benny Chen <youfychenbc5000@gmail.com>
Date: Mon, 3 Nov 2025 16:33:43 -0800
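Subject: [PATCH 1/2] klavis MCP

Load the Gmail inbox prompt from tests/pytest/datasets/gmail_inbox.jsonl
instead of inline input_messages, re-indent the klavis-strata MCP
configuration, and score each rollout with an LLM judge that compares the
final message against the row's ground_truth (previously the test only
checked that at least one tool call was made).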

---
 tests/pytest/datasets/gmail_inbox.jsonl       |  1 +
 .../mcp_configurations/klavis_strata_mcp.json |  8 +--
 tests/pytest/test_pytest_klavis_mcp.py        | 69 +++++++++++--------
 3 files changed, 46 insertions(+), 32 deletions(-)
 create mode 100644 tests/pytest/datasets/gmail_inbox.jsonl

diff --git a/tests/pytest/datasets/gmail_inbox.jsonl b/tests/pytest/datasets/gmail_inbox.jsonl
new file mode 100644
index 00000000..789a7dde
--- /dev/null
+++ b/tests/pytest/datasets/gmail_inbox.jsonl
@@ -0,0 +1 @@
+{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Find the first 5 emails title in my inbox." } ], "ground_truth": "The first 5 emails contain meeting between Benny and Zheng"}
diff --git a/tests/pytest/mcp_configurations/klavis_strata_mcp.json b/tests/pytest/mcp_configurations/klavis_strata_mcp.json
index 02bf542d..fd9e6923 100644
--- a/tests/pytest/mcp_configurations/klavis_strata_mcp.json
+++ b/tests/pytest/mcp_configurations/klavis_strata_mcp.json
@@ -1,8 +1,8 @@
 {
   "mcpServers": {
-      "klavis-strata": {
-        "url": "https://strata.klavis.ai/mcp/",
-        "authorization": "Bearer ${KLAVIS_API_KEY}"
-      }
+    "klavis-strata": {
+      "url": "https://strata.klavis.ai/mcp/",
+      "authorization": "Bearer ${KLAVIS_API_KEY}"
     }
+  }
 }
diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py
index bec5b4b0..9db74154 100644
--- a/tests/pytest/test_pytest_klavis_mcp.py
+++ b/tests/pytest/test_pytest_klavis_mcp.py
@@ -1,41 +1,54 @@
 from eval_protocol.models import EvaluateResult, EvaluationRow, Message
 from eval_protocol.pytest import AgentRolloutProcessor, evaluation_test
+from openai import AsyncOpenAI
+import json
+from pydantic import BaseModel
+import logging
+
+logger = logging.getLogger(__name__)
+import os
+
+
+class ResponseFormat(BaseModel):
+    score: float
 
 
 @evaluation_test(
-    input_messages=[
-        [
-            [
-                Message(
-                    role="system",
-                    content=(
-                        "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information.\n"
-                    ),
-                ),
-                Message(
-                    role="user",
-                    content=("Find the first 5 emails title in my inbox."),
-                ),
-            ]
-        ]
-    ],
+    input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"],
     rollout_processor=AgentRolloutProcessor(),
     completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
     mode="pointwise",
     mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json",
 )
-def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow:
-    # filter for all tool calls
-    tool_calls = [msg for msg in row.messages if msg.role == "tool"]
-    if len(tool_calls) == 0:
+async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow:
+    ground_truth = row.ground_truth
+    # Ask an LLM judge whether the final message contains the ground truth.
+
+    async with AsyncOpenAI(
+        api_key=os.environ["FIREWORKS_API_KEY"], base_url="https://api.fireworks.ai/inference/v1"
+    ) as client:
+        response = await client.chat.completions.create(
+            model="accounts/fireworks/models/kimi-k2-instruct-0905",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are judging the output of the model versus the ground truth. Return score = 1 if the output contains the ground truth, 0 otherwise.",
+                },
+                {
+                    "role": "user",
+                    "content": "Final model output: {row.messages[-1].content}\nGround truth: {ground_truth}",
+                },
+            ],
+            response_format={
+                "type": "json_schema",
+                "json_schema": {"name": "ResponseFormat", "schema": ResponseFormat.model_json_schema()},
+            },
+        )
+        response_text = response.choices[0].message.content
+        logger.info("response_text: %s", response_text)
+        score = json.loads(response_text or "{}")["score"]
         row.evaluation_result = EvaluateResult(
-            score=0,
-            reason="No tool calls made",
+            score=score,
+            reason=response_text,
         )
-        return row
-
-    row.evaluation_result = EvaluateResult(
-        score=1,
-        reason="At least one tool call was made",
-    )
     return row

From e2d8427989b5a7f1bf23b91aa4d4ac594cabbedd Mon Sep 17 00:00:00 2001
From: Benny Chen <youfychenbc5000@gmail.com>
Date: Mon, 3 Nov 2025 17:46:03 -0800
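Subject: [PATCH 2/2] format string fix

Add the missing f-string prefix to the judge's user prompt so that the
final model output and the ground truth are interpolated instead of being
sent as literal "{...}" placeholders.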

---
 tests/pytest/test_pytest_klavis_mcp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py
index 9db74154..00f48c9c 100644
--- a/tests/pytest/test_pytest_klavis_mcp.py
+++ b/tests/pytest/test_pytest_klavis_mcp.py
@@ -36,7 +36,7 @@ async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow:
                 },
                 {
                     "role": "user",
-                    "content": "Final model output: {row.messages[-1].content}\nGround truth: {ground_truth}",
+                    "content": f"Final model output: {row.messages[-1].content}\nGround truth: {ground_truth}",
                 },
             ],
             response_format={