Skip to content

Commit cf6625d

Browse files
author
Dylan Huang
committed
Merge branch 'main' into dhuang/dxe-442-refactor-ep-upload-to-use-fireworks-sdk-auto-generate-flags
2 parents 1d7d807 + ac376b7 commit cf6625d

File tree

9 files changed

+398
-6
lines changed

9 files changed

+398
-6
lines changed

eval_protocol/cli_commands/export_docs.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,25 @@
1313
logger = logging.getLogger(__name__)
1414

1515

16+
def _escape_mdx_text(text: str) -> str:
17+
"""
18+
Escape text that will be emitted as the *children* of an MDX/JSX component.
19+
20+
In MDX, `{` and `}` can start JS expressions even in otherwise plain text,
21+
which can break parsing when help strings include JSON examples.
22+
"""
23+
if not text:
24+
return ""
25+
# IMPORTANT: escape '&' first to avoid double-escaping.
26+
return (
27+
text.replace("&", "&")
28+
.replace("<", "&lt;")
29+
.replace(">", "&gt;")
30+
.replace("{", "&#123;")
31+
.replace("}", "&#125;")
32+
)
33+
34+
1635
def _get_parser_info(parser: argparse.ArgumentParser, subparser_help: str = "") -> Dict:
1736
"""Extract information from an ArgumentParser."""
1837
info = {
@@ -110,10 +129,19 @@ def _format_argument_item(arg: Dict) -> List[str]:
110129
if arg["required"]:
111130
attrs.append("required")
112131

113-
# Build description with short alias mention
114-
help_text = (arg["help"] or "").replace("<", "&lt;").replace(">", "&gt;")
115-
if short_opts:
116-
alias_note = f"Short: `{short_opts[0]}`"
132+
# Build description with alias mention (short + additional long aliases)
133+
help_text = _escape_mdx_text(arg["help"] or "")
134+
135+
aliases: List[str] = []
136+
if arg["option_strings"]:
137+
aliases = [o for o in arg["option_strings"] if o != primary]
138+
139+
if aliases:
140+
# Put long aliases first, then short ones for readability.
141+
long_aliases = [a for a in aliases if a.startswith("--")]
142+
short_aliases = [a for a in aliases if not a.startswith("--")]
143+
aliases_fmt = ", ".join([f"`{a}`" for a in (long_aliases + short_aliases)])
144+
alias_note = f"Aliases: {aliases_fmt}"
117145
if help_text:
118146
help_text = f"{help_text} ({alias_note})"
119147
else:

eval_protocol/pytest/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,15 @@
1111
from .rollout_result_post_processor import RolloutResultPostProcessor, NoOpRolloutResultPostProcessor
1212
from .types import RolloutProcessorConfig
1313

14+
# Klavis is an optional extra: expose the processor only when its module imports cleanly.
try:
    from .default_klavis_sandbox_rollout_processor import KlavisSandboxRolloutProcessor
except ImportError:
    # Keep the name defined so `from eval_protocol.pytest import ...` callers can test it for None.
    KlavisSandboxRolloutProcessor = None
    KLAVIS_AVAILABLE = False
else:
    KLAVIS_AVAILABLE = True
22+
1423
# Conditional import for optional dependencies
1524
try:
1625
from .default_pydantic_ai_rollout_processor import PydanticAgentRolloutProcessor
@@ -47,6 +56,10 @@
4756
"NoOpRolloutResultPostProcessor",
4857
]
4958

59+
# Only add to __all__ if available
60+
if KLAVIS_AVAILABLE:
61+
__all__.append("KlavisSandboxRolloutProcessor")
62+
5063
# Only add to __all__ if available
5164
if PYDANTIC_AI_AVAILABLE:
5265
__all__.append("PydanticAgentRolloutProcessor")
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
import asyncio
2+
import json
3+
import logging
4+
import os
5+
import tempfile
6+
import time
7+
from typing import Any, Callable, Dict, List, Optional
8+
9+
from pydantic import BaseModel, Field
10+
11+
from eval_protocol.models import EvaluationRow
12+
from eval_protocol.pytest.rollout_processor import RolloutProcessor
13+
from eval_protocol.pytest.types import RolloutProcessorConfig
14+
15+
from eval_protocol.pytest.default_agent_rollout_processor import Agent
16+
from klavis import Klavis
17+
from klavis.types import CreateSandboxResponse, SandboxMcpServer
18+
from openai.types import CompletionUsage
19+
20+
logger = logging.getLogger(__name__)
21+
22+
23+
class KlavisSandboxRolloutProcessor(RolloutProcessor):
    """Rollout processor that runs each row's agent against its own Klavis sandbox.

    For every row the processor creates a sandbox, optionally initializes it
    with row-specific data, writes a temporary MCP config pointing at the
    sandbox URL, runs an ``Agent`` with that config, dumps the sandbox state
    into ``row.execution_metadata.extra`` and finally releases all resources.
    """

    def __init__(
        self,
        server_name: str,
        initialize_data_factory: Optional[Callable[[EvaluationRow], Dict[str, Any]]] = None,
    ):
        """
        Args:
            server_name: Value convertible to ``SandboxMcpServer``; selects the sandbox type.
            initialize_data_factory: Optional callable producing the keyword
                arguments for the sandbox initialize call from a row. When
                omitted, ``row.input_metadata.session_data["initialize_data"]``
                is used if present.
        """
        super().__init__()
        self.server_name = server_name
        self.initialize_data_factory = initialize_data_factory
        # The API key is read from the environment; it is None when KLAVIS_API_KEY is unset.
        self.klavis_client = Klavis(api_key=os.environ.get("KLAVIS_API_KEY"))

    def _init_sandbox(self) -> CreateSandboxResponse:
        """Create a sandbox for ``self.server_name``; log and re-raise on any failure."""
        try:
            server_name_enum = SandboxMcpServer(self.server_name)
            return self.klavis_client.sandbox.create_sandbox(server_name=server_name_enum)
        except Exception as e:
            logger.error(f"Error creating sandbox: {str(e)}", exc_info=True)
            raise

    @staticmethod
    def create_mcp_config(server_url: str, server_key: str = "main", auth_token: str | None = None) -> str:
        """Create a temporary MCP config file and return its path.

        Args:
            server_url: URL of the MCP server to register.
            server_key: Key under ``mcpServers`` for this server.
            auth_token: When given, written as a ``Bearer`` authorization entry.

        Returns:
            Path of the JSON file. The caller is responsible for deleting it.
        """
        server_entry: Dict[str, Any] = {
            "url": server_url,
            "transport": "streamable_http",
        }
        if auth_token:
            server_entry["authorization"] = f"Bearer {auth_token}"
        config = {"mcpServers": {server_key: server_entry}}

        # mkstemp does not auto-delete: the file persists until the caller unlinks it.
        fd, path = tempfile.mkstemp(suffix=".json", prefix="mcp_config_")
        with os.fdopen(fd, "w") as f:
            json.dump(config, f)
        return path

    def __call__(
        self, rows: List[EvaluationRow], config: RolloutProcessorConfig
    ) -> List[asyncio.Task[EvaluationRow]]:
        """Process evaluation rows with Klavis sandbox lifecycle management.

        Args:
            rows: Rows to roll out; one sandbox is created per row.
            config: Supplies the concurrency ``semaphore`` and the ``logger`` handed to the agent.

        Returns:
            One asyncio task per row. A task re-raises any processing error
            after recording ``str(error)`` in ``row.execution_metadata.extra["error"]``.
        """
        semaphore = config.semaphore

        async def process_row(row: EvaluationRow) -> EvaluationRow:
            """Process a single row with complete sandbox lifecycle."""
            start_time = time.perf_counter()
            agent: Agent | None = None
            temp_config_path: str | None = None
            sandbox: CreateSandboxResponse | None = None

            # NOTE(review): the klavis_client calls below are invoked without
            # `await`, i.e. synchronously inside this coroutine. If they block on
            # network I/O they stall the event loop; consider asyncio.to_thread.
            try:
                # Step 0: Create a sandbox for this row
                sandbox = self._init_sandbox()
                logger.info(f"Sandbox created: {sandbox}")

                # Step 1: Initialize data in the sandbox
                init_data: Dict[str, Any] | None = None
                if self.initialize_data_factory:
                    init_data = self.initialize_data_factory(row)
                elif row.input_metadata is not None:
                    # Allow datasets to provide initialization payload directly
                    init_data = (row.input_metadata.session_data or {}).get("initialize_data")

                if init_data:
                    logger.info(f"Initializing {self.server_name} sandbox {sandbox.sandbox_id}")
                    # The SDK exposes one initialize method per server type; resolve it by name.
                    initialize_method = getattr(
                        self.klavis_client.sandbox, f"initialize_{sandbox.server_name.value}_sandbox"
                    )
                    init_response = initialize_method(sandbox_id=sandbox.sandbox_id, **init_data)
                    logger.info(f"Initialization response: {init_response}")

                # Step 2: Create temporary MCP config with sandbox URL
                temp_config_path = self.create_mcp_config(
                    server_url=sandbox.server_url, server_key=sandbox.server_name.value
                )
                logger.info(f"MCP config created: {temp_config_path}")

                # Step 3: Run agent with sandbox MCP server
                logger.info(f"Running agent for row {row.execution_metadata.rollout_id} with {self.server_name} sandbox")
                agent = Agent(
                    model=row.input_metadata.completion_params["model"],
                    row=row,
                    config_path=temp_config_path,
                    logger=config.logger,
                )
                await agent.setup()
                await agent.call_agent()

                # Rebind to the agent's row first so the usage metadata is
                # guaranteed to land on the row that is actually returned.
                row = agent.evaluation_row
                row.execution_metadata.usage = CompletionUsage(
                    prompt_tokens=agent.usage.get("prompt_tokens", 0),
                    completion_tokens=agent.usage.get("completion_tokens", 0),
                    total_tokens=agent.usage.get("total_tokens", 0),
                )
                logger.info(f"Agent execution completed for row {row.execution_metadata.rollout_id}")

                # Step 4: Export sandbox data
                dump_method = getattr(self.klavis_client.sandbox, f"dump_{sandbox.server_name.value}_sandbox")
                dump_response = dump_method(sandbox_id=sandbox.sandbox_id)
                sandbox_data = dump_response.data
                logger.info(f"Sandbox data: {sandbox_data}")

                # Store sandbox data in row metadata for evaluation
                if not row.execution_metadata.extra:
                    row.execution_metadata.extra = {}
                row.execution_metadata.extra["sandbox_data"] = sandbox_data
                row.execution_metadata.extra["sandbox_id"] = sandbox.sandbox_id
                row.execution_metadata.extra["server_name"] = self.server_name

            except Exception as e:
                logger.error(f"Error processing row {row.execution_metadata.rollout_id}: {str(e)}", exc_info=True)
                if not row.execution_metadata.extra:
                    row.execution_metadata.extra = {}
                row.execution_metadata.extra["error"] = str(e)
                raise

            finally:
                # Every cleanup step is isolated: a failure in one must not skip
                # the others (leaking the temp file or the remote sandbox) and
                # must not mask an exception raised by the rollout itself.
                if agent and agent.mcp_client:
                    try:
                        await agent.mcp_client.cleanup()
                    except Exception as e:
                        logger.error(f"Error cleaning up MCP client: {str(e)}", exc_info=True)

                if temp_config_path:
                    try:
                        os.unlink(temp_config_path)
                    except FileNotFoundError:
                        # Already gone; nothing to clean up.
                        pass
                    except OSError as e:
                        logger.error(f"Error removing MCP config {temp_config_path}: {str(e)}", exc_info=True)

                # Release sandbox
                if sandbox and sandbox.sandbox_id:
                    try:
                        self.klavis_client.sandbox.delete_sandbox(
                            server_name=sandbox.server_name, sandbox_id=sandbox.sandbox_id
                        )
                        logger.info(f"Sandbox {sandbox.sandbox_id} released successfully")
                    except Exception as e:
                        logger.error(f"Error releasing sandbox {sandbox.sandbox_id}: {str(e)}", exc_info=True)

                # Duration covers the whole lifecycle, cleanup included.
                row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time

            return row

        async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
            """Run ``process_row`` while holding the shared concurrency semaphore."""
            async with semaphore:
                return await process_row(r)

        # Create and return tasks
        return [asyncio.create_task(_sem_wrapper(row)) for row in rows]

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,9 @@ openenv = [
134134
dspy = [
135135
"dspy>=3.0.0",
136136
]
137+
klavis = [
138+
"klavis>=2.18.0",
139+
]
137140

138141
# Optional deps for LangGraph example/tests
139142
langgraph = [

tests/chinook/langgraph/test_langgraph_chinook.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
import pytest
22

3+
pytest.skip(
4+
"Skipping Chinook langgraph integration tests (requires external services/credentials).",
5+
allow_module_level=True,
6+
)
7+
38
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
49
from eval_protocol.pytest import evaluation_test
510

tests/chinook/pydantic/test_pydantic_chinook.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
1+
import pytest
2+
3+
pytest.skip(
4+
"Skipping Chinook pydantic-ai integration tests (requires external LLM access/credentials).",
5+
allow_module_level=True,
6+
)
7+
18
from pydantic import BaseModel
29
from pydantic_ai import Agent
3-
import pytest
410

511
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
612
from eval_protocol.pytest import evaluation_test
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"initialize_data": {"messages": [{"subject": "Project Update", "to": "zihao@klavisai.com", "body": "The project is progressing well. We should have the final deliverables by next week.", "cc": "", "bcc": "", "from": "sarah@klavisai.com", "reply_to": "", "labels": ["INBOX"]}, {"subject": "Spam Newsletter", "to": "zihao@klavisai.com", "body": "Check out our amazing deals! Click here now!", "cc": "", "bcc": "", "from": "marketing@spammy.com", "reply_to": "", "labels": ["INBOX"]}], "drafts": []}, "messages": "Please delete the email with subject \"Spam Newsletter\" from my inbox.", "ground_truth": {"messages": [{"subject": "Project Update", "to": "zihao@klavisai.com", "body": "The project is progressing well. We should have the final deliverables by next week.", "cc": "", "bcc": "", "from": "sarah@klavisai.com", "reply_to": "", "labels": ["INBOX"]}], "drafts": []}}
2+
{"initialize_data": {"messages": [], "drafts": []}, "messages": "Please directly send an email to zihao@klavisai.com with subject \"Meeting Tomorrow\" and body \"Hi Zihao, just confirming our meeting tomorrow at 2pm. Best regards.\"", "ground_truth": {"messages": [{"subject": "Meeting Tomorrow", "to": "zihao@klavisai.com", "body": "Hi Zihao, just confirming our meeting tomorrow at 2pm. Best regards.", "cc": "", "bcc": "", "from": "", "reply_to": "", "labels": ["SENT"]}], "drafts": []}}

0 commit comments

Comments
 (0)