Skip to content

Commit 7f35e24

Browse files
committed
update logic
1 parent a5f3cb3 commit 7f35e24

File tree

3 files changed

+97
-42
lines changed

3 files changed

+97
-42
lines changed

eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ def __init__(
3030
self.server_name = server_name
3131
self.initialize_data_factory = initialize_data_factory
3232
self.klavis_client = Klavis(api_key=os.environ.get("KLAVIS_API_KEY"))
33-
self.sandbox = self._init_sandbox()
3433

3534
def _init_sandbox(self) -> CreateSandboxResponse:
3635
try:
@@ -63,27 +62,46 @@ def __call__(
6362
self, rows: List[EvaluationRow], config: RolloutProcessorConfig
6463
) -> List[asyncio.Task[EvaluationRow]]:
6564
"""Process evaluation rows with Klavis sandbox lifecycle management"""
66-
if not self.sandbox:
67-
raise RuntimeError("Sandbox not initialized")
68-
6965
semaphore = config.semaphore
7066

7167
async def process_row(row: EvaluationRow) -> EvaluationRow:
7268
"""Process a single row with complete sandbox lifecycle"""
7369

7470
start_time = time.perf_counter()
71+
agent: Agent | None = None
72+
temp_config_path: str | None = None
73+
sandbox: CreateSandboxResponse | None = None
7574

7675
try:
76+
# Step 0: Create a sandbox for this row
77+
sandbox = self._init_sandbox()
78+
logger.info(f"Sandbox created: {sandbox}")
79+
7780
# Step 1: Initialize data in the sandbox
81+
init_data: Dict[str, Any] | None = None
7882
if self.initialize_data_factory:
79-
logger.info(f"Initializing {self.server_name} sandbox {self.sandbox.sandbox_id}")
8083
init_data = self.initialize_data_factory(row)
81-
initialize_method = getattr(self.klavis_client.sandbox, f"initialize_{self.sandbox.server_name}_sandbox")
82-
initialize_method(sandbox_id=self.sandbox.sandbox_id, **init_data)
83-
logger.info(f"Sandbox initialized successfully")
84-
84+
else:
85+
# Allow datasets to provide initialization payload directly
86+
init_data = (
87+
(row.input_metadata.session_data or {}).get("initialize_data")
88+
if row.input_metadata is not None
89+
else None
90+
)
91+
92+
if init_data:
93+
logger.info(f"Initializing {self.server_name} sandbox {sandbox.sandbox_id}")
94+
initialize_method = getattr(
95+
self.klavis_client.sandbox, f"initialize_{sandbox.server_name.value}_sandbox"
96+
)
97+
init_response = initialize_method(sandbox_id=sandbox.sandbox_id, **init_data)
98+
logger.info(f"Initialization response: {init_response}")
99+
85100
# Step 2: Create temporary MCP config with sandbox URL
86-
temp_config_path = self.create_mcp_config(server_url=self.sandbox.server_url, server_key=self.sandbox.server_name)
101+
temp_config_path = self.create_mcp_config(
102+
server_url=sandbox.server_url, server_key=sandbox.server_name.value
103+
)
104+
logger.info(f"MCP config created: {temp_config_path}")
87105

88106
# Step 3: Run agent with sandbox MCP server
89107
logger.info(f"Running agent for row {row.execution_metadata.rollout_id} with {self.server_name} sandbox")
@@ -106,16 +124,16 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
106124
logger.info(f"Agent execution completed for row {row.execution_metadata.rollout_id}")
107125

108126
# Step 4: Export sandbox data
109-
logger.info(f"Exporting {self.server_name} sandbox data")
110-
dump_method = getattr(self.klavis_client.sandbox, f"dump_{self.sandbox.server_name}_sandbox")
111-
dump_response = dump_method(sandbox_id=self.sandbox.sandbox_id)
127+
dump_method = getattr(self.klavis_client.sandbox, f"dump_{sandbox.server_name.value}_sandbox")
128+
dump_response = dump_method(sandbox_id=sandbox.sandbox_id)
112129
sandbox_data = dump_response.data
130+
logger.info(f"Sandbox data: {sandbox_data}")
113131

114132
# Store sandbox data in row metadata for evaluation
115133
if not row.execution_metadata.extra:
116134
row.execution_metadata.extra = {}
117135
row.execution_metadata.extra["sandbox_data"] = sandbox_data
118-
row.execution_metadata.extra["sandbox_id"] = self.sandbox.sandbox_id
136+
row.execution_metadata.extra["sandbox_id"] = sandbox.sandbox_id
119137
row.execution_metadata.extra["server_name"] = self.server_name
120138

121139
except Exception as e:
@@ -133,15 +151,14 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
133151
os.unlink(temp_config_path)
134152

135153
# Release sandbox
136-
if self.sandbox.sandbox_id:
154+
if sandbox and sandbox.sandbox_id:
137155
try:
138-
logger.info(f"Releasing {self.server_name} sandbox {self.sandbox.sandbox_id}")
139156
self.klavis_client.sandbox.delete_sandbox(
140-
server_name=self.sandbox.server_name, sandbox_id=self.sandbox.sandbox_id
157+
server_name=sandbox.server_name, sandbox_id=sandbox.sandbox_id
141158
)
142-
logger.info(f"Sandbox {self.sandbox.sandbox_id} released successfully")
159+
logger.info(f"Sandbox {sandbox.sandbox_id} released successfully")
143160
except Exception as e:
144-
logger.error(f"Error releasing sandbox {self.sandbox.sandbox_id}: {str(e)}", exc_info=True)
161+
logger.error(f"Error releasing sandbox {sandbox.sandbox_id}: {str(e)}", exc_info=True)
145162

146163
row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
147164

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
{"messages": [{"role": "system", "content": "You are a helpful assistant with access to Gmail. You can send emails, draft emails, and manage messages."}, {"role": "user", "content": "Send an email to john@example.com with subject 'Meeting Tomorrow' and body 'Hi John, Just confirming our meeting tomorrow at 2pm. Best regards.'"}], "ground_truth": "One email sent to john@example.com with subject 'Meeting Tomorrow' containing meeting confirmation"}
2-
{"messages": [{"role": "system", "content": "You are a helpful assistant with access to Gmail. You can send emails, draft emails, and manage messages."}, {"role": "user", "content": "Draft an email to sarah@company.com with subject 'Project Update' and body 'Hi Sarah, The project is progressing well. I will send you the detailed report by Friday.'"}], "ground_truth": "One draft email created for sarah@company.com with subject 'Project Update' about project progress"}
1+
{"initialize_data": {"messages": [{"subject": "Project Update", "to": "zihao@klavisai.com", "body": "The project is progressing well. We should have the final deliverables by next week.", "cc": "", "bcc": "", "from": "sarah@klavisai.com", "reply_to": "", "labels": ["INBOX"]}, {"subject": "Spam Newsletter", "to": "zihao@klavisai.com", "body": "Check out our amazing deals! Click here now!", "cc": "", "bcc": "", "from": "marketing@spammy.com", "reply_to": "", "labels": ["INBOX"]}], "drafts": []}, "messages": "Please delete the email with subject \"Spam Newsletter\" from my inbox.", "ground_truth": {"messages": [{"subject": "Project Update", "to": "zihao@klavisai.com", "body": "The project is progressing well. We should have the final deliverables by next week.", "cc": "", "bcc": "", "from": "sarah@klavisai.com", "reply_to": "", "labels": ["INBOX"]}], "drafts": []}}
2+
{"initialize_data": {"messages": [], "drafts": []}, "messages": "Please directly send an email to zihao@klavisai.com with subject \"Meeting Tomorrow\" and body \"Hi Zihao, just confirming our meeting tomorrow at 2pm. Best regards.\"", "ground_truth": {"messages": [{"subject": "Meeting Tomorrow", "to": "zihao@klavisai.com", "body": "Hi Zihao, just confirming our meeting tomorrow at 2pm. Best regards.", "cc": "", "bcc": "", "from": "", "reply_to": "", "labels": ["SENT"]}], "drafts": []}}

tests/pytest/test_pytest_klavis_sandbox.py

Lines changed: 59 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import logging
33
import os
44

5-
from eval_protocol.models import EvaluateResult, EvaluationRow
5+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
66
from eval_protocol.pytest import KlavisSandboxRolloutProcessor, evaluation_test
77
from openai import AsyncOpenAI
88
from pydantic import BaseModel
@@ -12,29 +12,68 @@
1212

1313
class ResponseFormat(BaseModel):
1414
score: float
15-
reasoning: str
15+
16+
17+
def klavis_gmail_sandbox_dataset_adapter(rows: list[dict]) -> list[EvaluationRow]:
18+
"""Dataset adapter for sandbox JSONL rows.
19+
20+
Supports the new schema:
21+
- initialize_data: dict (passed to Klavis sandbox initializer)
22+
- messages: str (task instruction)
23+
- ground_truth: dict (expected final sandbox state)
24+
25+
"""
26+
adapted: list[EvaluationRow] = []
27+
system_prompt = (
28+
"You are a helpful assistant with access to Gmail. "
29+
"You can send emails, draft emails, and manage messages, etc."
30+
)
31+
32+
for r in rows:
33+
if isinstance(r.get("messages"), str) and "initialize_data" in r:
34+
init_data = r.get("initialize_data") or {}
35+
task = r.get("messages") or ""
36+
ground_truth = r.get("ground_truth")
37+
38+
row = EvaluationRow(
39+
messages=[
40+
Message(role="system", content=system_prompt),
41+
Message(role="user", content=task),
42+
],
43+
ground_truth=ground_truth,
44+
)
45+
row.input_metadata.session_data = {
46+
"initialize_data": init_data,
47+
"task": task,
48+
}
49+
adapted.append(row)
50+
else:
51+
adapted.append(EvaluationRow(**r))
52+
53+
return adapted
1654

1755

1856
@evaluation_test(
1957
input_dataset=["tests/pytest/datasets/klavis_gmail_sandbox_test.jsonl"],
2058
rollout_processor=KlavisSandboxRolloutProcessor(
2159
server_name="gmail",
22-
# Optional: provide custom initialization data factory
23-
# initialize_data_factory=lambda row: {"messages": [], "drafts": []},
2460
),
25-
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p2"}],
61+
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-thinking"}],
2662
mode="pointwise",
63+
dataset_adapter=klavis_gmail_sandbox_dataset_adapter,
2764
)
2865
async def test_pytest_gmail_sandbox(row: EvaluationRow) -> EvaluationRow:
2966
"""
3067
Evaluate Gmail sandbox results by comparing with ground truth using LLM judge.
3168
3269
The sandbox data is exported after agent execution and compared with expected output.
33-
Sandbox data is available in row.execution_metadata.metadata["sandbox_data"].
70+
Sandbox data is available in row.execution_metadata.extra["sandbox_data"].
3471
"""
3572
ground_truth = row.ground_truth
3673
sandbox_data = row.execution_metadata.extra.get("sandbox_data", {}) if row.execution_metadata.extra else {}
3774
final_message = row.messages[-1].content if row.messages else ""
75+
initialize_data = (row.input_metadata.session_data or {}).get("initialize_data", {})
76+
task = (row.input_metadata.session_data or {}).get("task", "")
3877

3978
logger.info(f"Evaluating row {row.execution_metadata.rollout_id}")
4079
logger.info(f"Final message: {final_message}")
@@ -44,31 +83,34 @@ async def test_pytest_gmail_sandbox(row: EvaluationRow) -> EvaluationRow:
4483
async with AsyncOpenAI(
4584
api_key=os.environ["FIREWORKS_API_KEY"], base_url="https://api.fireworks.ai/inference/v1"
4685
) as client:
47-
# Use LLM to judge if the sandbox data matches the ground truth
48-
evaluation_prompt = f"""You are evaluating an AI agent's performance on a Gmail task.
4986

50-
Task: {row.messages[0].content if row.messages else 'N/A'}
87+
evaluation_prompt = f"""You are evaluating an AI agent's performance on a Gmail sandbox task.
88+
89+
Task:
90+
{task or (row.messages[-1].content if row.messages else 'N/A')}
5191
52-
Ground Truth: {ground_truth}
92+
Initial Gmail Sandbox State (initialize_data):
93+
{json.dumps(initialize_data, indent=2, default=str)}
5394
54-
Agent's Final Response: {final_message}
95+
Expected Final Gmail Sandbox State (ground_truth):
96+
{json.dumps(ground_truth, indent=2, default=str)}
5597
5698
Gmail Sandbox State After Execution:
5799
{json.dumps(sandbox_data, indent=2, default=str)}
58100
59101
Evaluate whether the agent successfully completed the task by checking:
60-
1. Did the agent understand and attempt the task?
61-
2. Does the sandbox data reflect the expected outcome described in the ground truth?
62-
3. Are there any emails sent/drafted that match the task requirements?
102+
1. Does the final sandbox state match the expected ground_truth state?
103+
2. If there are small formatting differences, judge semantically
104+
3. Use the initial state only as context; the key is whether the correct changes happened.
63105
64106
Return:
65107
- score: 1.0 if task completed successfully, 0.5 if partially completed, 0.0 if failed
66-
- reasoning: Explain your evaluation in 1-2 sentences
108+
67109
"""
68110

69111
try:
70112
response = await client.chat.completions.create(
71-
model="accounts/fireworks/models/deepseek-v3p2",
113+
model="accounts/fireworks/models/kimi-k2-thinking",
72114
messages=[
73115
{
74116
"role": "system",
@@ -88,12 +130,8 @@ async def test_pytest_gmail_sandbox(row: EvaluationRow) -> EvaluationRow:
88130

89131
parsed = json.loads(response_text or "{}")
90132
score = parsed.get("score", 0.0)
91-
reasoning = parsed.get("reasoning", "No reasoning provided")
92133

93-
row.evaluation_result = EvaluateResult(
94-
score=score,
95-
reason=reasoning,
96-
)
134+
row.evaluation_result = EvaluateResult(score=score)
97135
except Exception as e:
98136
logger.error(f"Error during LLM evaluation: {str(e)}", exc_info=True)
99137
row.evaluation_result = EvaluateResult(

0 commit comments

Comments
 (0)