Merge branch 'main' into dhuang/dxe-442-refactor-ep-upload-to-use-fireworks-sdk-auto-generate-flags

Dylan Huang · Dylan Huang · commit cf6625d9d8e6 · 2025-12-30T09:10:16.000-08:00
diff --git a/eval_protocol/cli_commands/export_docs.py b/eval_protocol/cli_commands/export_docs.py
@@ -13,6 +13,25 @@
 logger = logging.getLogger(__name__)
 
 
+def _escape_mdx_text(text: str) -> str:
+    """
+    Return *text* made safe for use as the children of an MDX/JSX component.
+
+    `&`, `<`, `>`, `{` and `}` are replaced by HTML entities. The braces matter
+    because MDX treats them as the start/end of a JS expression even inside
+    plain text, so help strings containing JSON examples would otherwise break
+    parsing. Falsy input (empty string, `None`) yields an empty string.
+    """
+    if not text:
+        return ""
+    # A single translate() pass maps each source character exactly once, so the
+    # '&' produced by one entity can never be escaped a second time.
+    entity_for = {
+        "&": "&amp;",
+        "<": "&lt;",
+        ">": "&gt;",
+        "{": "&#123;",
+        "}": "&#125;",
+    }
+    return text.translate(str.maketrans(entity_for))
+
+
 def _get_parser_info(parser: argparse.ArgumentParser, subparser_help: str = "") -> Dict:
     """Extract information from an ArgumentParser."""
     info = {
@@ -110,10 +129,19 @@ def _format_argument_item(arg: Dict) -> List[str]:
     if arg["required"]:
         attrs.append("required")
 
-    # Build description with short alias mention
-    help_text = (arg["help"] or "").replace("<", "&lt;").replace(">", "&gt;")
-    if short_opts:
-        alias_note = f"Short: `{short_opts[0]}`"
+    # Build description with alias mention (short + additional long aliases)
+    help_text = _escape_mdx_text(arg["help"] or "")
+
+    aliases: List[str] = []
+    if arg["option_strings"]:
+        aliases = [o for o in arg["option_strings"] if o != primary]
+
+    if aliases:
+        # Put long aliases first, then short ones for readability.
+        long_aliases = [a for a in aliases if a.startswith("--")]
+        short_aliases = [a for a in aliases if not a.startswith("--")]
+        aliases_fmt = ", ".join([f"`{a}`" for a in (long_aliases + short_aliases)])
+        alias_note = f"Aliases: {aliases_fmt}"
         if help_text:
             help_text = f"{help_text} ({alias_note})"
         else:
diff --git a/eval_protocol/pytest/__init__.py b/eval_protocol/pytest/__init__.py
@@ -11,6 +11,15 @@
 from .rollout_result_post_processor import RolloutResultPostProcessor, NoOpRolloutResultPostProcessor
 from .types import RolloutProcessorConfig
 
+# Conditional import for optional Klavis dependency
+try:
+    from .default_klavis_sandbox_rollout_processor import KlavisSandboxRolloutProcessor
+
+    KLAVIS_AVAILABLE = True
+except ImportError:
+    KLAVIS_AVAILABLE = False
+    KlavisSandboxRolloutProcessor = None
+
 # Conditional import for optional dependencies
 try:
     from .default_pydantic_ai_rollout_processor import PydanticAgentRolloutProcessor
@@ -47,6 +56,10 @@
     "NoOpRolloutResultPostProcessor",
 ]
 
+# Only add to __all__ if available
+if KLAVIS_AVAILABLE:
+    __all__.append("KlavisSandboxRolloutProcessor")
+
 # Only add to __all__ if available
 if PYDANTIC_AI_AVAILABLE:
     __all__.append("PydanticAgentRolloutProcessor")
diff --git a/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py b/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py
@@ -0,0 +1,174 @@
+import asyncio
+import json
+import logging
+import os
+import tempfile
+import time
+from typing import Any, Callable, Dict, List, Optional
+
+from pydantic import BaseModel, Field
+
+from eval_protocol.models import EvaluationRow
+from eval_protocol.pytest.rollout_processor import RolloutProcessor
+from eval_protocol.pytest.types import RolloutProcessorConfig
+
+from eval_protocol.pytest.default_agent_rollout_processor import Agent
+from klavis import Klavis
+from klavis.types import CreateSandboxResponse, SandboxMcpServer
+from openai.types import CompletionUsage
+
+logger = logging.getLogger(__name__)
+
+
+class KlavisSandboxRolloutProcessor(RolloutProcessor):
+    """Rollout processor that runs every evaluation row in its own Klavis sandbox.
+
+    Per row, the processor creates a sandbox for ``server_name``, optionally
+    seeds it with initialization data, points an ``Agent`` at the sandbox via a
+    temporary MCP config file, dumps the sandbox state into
+    ``row.execution_metadata.extra`` and finally deletes the sandbox.
+
+    Args:
+        server_name: Value accepted by ``SandboxMcpServer``; selects which
+            sandbox type is created.
+        initialize_data_factory: Optional callable mapping a row to the keyword
+            arguments forwarded to the sandbox's ``initialize_*_sandbox`` call.
+            When omitted, ``row.input_metadata.session_data["initialize_data"]``
+            is used if it is present.
+    """
+
+    def __init__(
+        self,
+        server_name: str,
+        initialize_data_factory: Optional[Callable[[EvaluationRow], Dict[str, Any]]] = None,
+    ):
+        super().__init__()
+        self.server_name = server_name
+        self.initialize_data_factory = initialize_data_factory
+        # The API key is read from the environment; it is passed through as-is
+        # (possibly None) and validated by the Klavis client, not here.
+        self.klavis_client = Klavis(api_key=os.environ.get("KLAVIS_API_KEY"))
+
+    def _init_sandbox(self) -> CreateSandboxResponse:
+        """Create a sandbox for ``self.server_name``.
+
+        Raises:
+            Exception: Whatever ``SandboxMcpServer(...)`` or ``create_sandbox``
+                raises; the error is logged and re-raised unchanged.
+        """
+        try:
+            server_name_enum = SandboxMcpServer(self.server_name)
+            return self.klavis_client.sandbox.create_sandbox(server_name=server_name_enum)
+        except Exception as e:
+            logger.error(f"Error creating sandbox: {str(e)}", exc_info=True)
+            raise
+
+    @staticmethod
+    def create_mcp_config(server_url: str, server_key: str = "main", auth_token: str | None = None) -> str:
+        """Write a temporary MCP config file and return its path.
+
+        Args:
+            server_url: URL stored under the server entry's ``"url"`` key.
+            server_key: Name of the entry inside ``"mcpServers"``.
+            auth_token: When given, stored as ``"authorization": "Bearer <token>"``.
+
+        Returns:
+            Path of the created JSON file. The caller owns the file and is
+            responsible for deleting it.
+        """
+        server_entry: Dict[str, Any] = {
+            "url": server_url,
+            "transport": "streamable_http",
+        }
+        if auth_token:
+            server_entry["authorization"] = f"Bearer {auth_token}"
+        config = {"mcpServers": {server_key: server_entry}}
+
+        # mkstemp (not NamedTemporaryFile) so the file outlives this function.
+        fd, path = tempfile.mkstemp(suffix=".json", prefix="mcp_config_")
+        with os.fdopen(fd, "w") as f:
+            json.dump(config, f)
+        return path
+
+    def __call__(
+        self, rows: List[EvaluationRow], config: RolloutProcessorConfig
+    ) -> List[asyncio.Task[EvaluationRow]]:
+        """Schedule one sandboxed rollout task per row.
+
+        Args:
+            rows: Rows to process.
+            config: Supplies the concurrency ``semaphore`` and the ``logger``
+                handed to the agent.
+
+        Returns:
+            One ``asyncio.Task`` per input row, in input order. A task re-raises
+            any processing error after recording it under
+            ``row.execution_metadata.extra["error"]``.
+        """
+        semaphore = config.semaphore
+        sandbox_api = self.klavis_client.sandbox
+
+        async def process_row(row: EvaluationRow) -> EvaluationRow:
+            """Process a single row with complete sandbox lifecycle"""
+
+            start_time = time.perf_counter()
+            agent: Agent | None = None
+            temp_config_path: str | None = None
+            sandbox: CreateSandboxResponse | None = None
+
+            try:
+                # Step 0: Create a sandbox for this row.
+                # The Klavis client calls are synchronous; run them in a worker
+                # thread so concurrent rollouts are not serialized behind a
+                # blocked event loop.
+                sandbox = await asyncio.to_thread(self._init_sandbox)
+                logger.info(f"Sandbox created: {sandbox}")
+
+                # Step 1: Initialize data in the sandbox
+                init_data: Dict[str, Any] | None = None
+                if self.initialize_data_factory:
+                    init_data = self.initialize_data_factory(row)
+                elif row.input_metadata is not None:
+                    # Allow datasets to provide initialization payload directly
+                    init_data = (row.input_metadata.session_data or {}).get("initialize_data")
+
+                if init_data:
+                    logger.info(f"Initializing {self.server_name} sandbox {sandbox.sandbox_id}")
+                    initialize_method = getattr(sandbox_api, f"initialize_{sandbox.server_name.value}_sandbox")
+                    init_response = await asyncio.to_thread(
+                        initialize_method, sandbox_id=sandbox.sandbox_id, **init_data
+                    )
+                    logger.info(f"Initialization response: {init_response}")
+
+                # Step 2: Create temporary MCP config with sandbox URL
+                temp_config_path = self.create_mcp_config(
+                    server_url=sandbox.server_url, server_key=sandbox.server_name.value
+                )
+                logger.info(f"MCP config created: {temp_config_path}")
+
+                # Step 3: Run agent with sandbox MCP server
+                logger.info(f"Running agent for row {row.execution_metadata.rollout_id} with {self.server_name} sandbox")
+                agent = Agent(
+                    model=row.input_metadata.completion_params["model"],
+                    row=row,
+                    config_path=temp_config_path,
+                    logger=config.logger,
+                )
+                await agent.setup()
+                await agent.call_agent()
+
+                # Rebind first, then record usage, so the usage always lands on
+                # the row that is actually returned.
+                row = agent.evaluation_row
+                row.execution_metadata.usage = CompletionUsage(
+                    prompt_tokens=agent.usage.get("prompt_tokens", 0),
+                    completion_tokens=agent.usage.get("completion_tokens", 0),
+                    total_tokens=agent.usage.get("total_tokens", 0),
+                )
+                logger.info(f"Agent execution completed for row {row.execution_metadata.rollout_id}")
+
+                # Step 4: Export sandbox data
+                dump_method = getattr(sandbox_api, f"dump_{sandbox.server_name.value}_sandbox")
+                dump_response = await asyncio.to_thread(dump_method, sandbox_id=sandbox.sandbox_id)
+                sandbox_data = dump_response.data
+                logger.info(f"Sandbox data: {sandbox_data}")
+
+                # Store sandbox data in row metadata for evaluation
+                if not row.execution_metadata.extra:
+                    row.execution_metadata.extra = {}
+                row.execution_metadata.extra["sandbox_data"] = sandbox_data
+                row.execution_metadata.extra["sandbox_id"] = sandbox.sandbox_id
+                row.execution_metadata.extra["server_name"] = self.server_name
+
+            except Exception as e:
+                logger.error(f"Error processing row {row.execution_metadata.rollout_id}: {str(e)}", exc_info=True)
+                if not row.execution_metadata.extra:
+                    row.execution_metadata.extra = {}
+                row.execution_metadata.extra["error"] = str(e)
+                raise
+
+            finally:
+                # Each cleanup step is isolated: a failure in one must neither
+                # skip the remaining steps (leaking the temp file or sandbox)
+                # nor mask an exception raised by the rollout itself.
+                if agent and agent.mcp_client:
+                    try:
+                        await agent.mcp_client.cleanup()
+                    except Exception as e:
+                        logger.error(f"Error cleaning up MCP client: {str(e)}", exc_info=True)
+
+                if temp_config_path:
+                    try:
+                        os.unlink(temp_config_path)
+                    except FileNotFoundError:
+                        pass  # already gone; nothing to clean up
+                    except OSError as e:
+                        logger.error(f"Error removing MCP config {temp_config_path}: {str(e)}", exc_info=True)
+
+                # Release sandbox
+                if sandbox and sandbox.sandbox_id:
+                    try:
+                        await asyncio.to_thread(
+                            sandbox_api.delete_sandbox,
+                            server_name=sandbox.server_name,
+                            sandbox_id=sandbox.sandbox_id,
+                        )
+                        logger.info(f"Sandbox {sandbox.sandbox_id} released successfully")
+                    except Exception as e:
+                        logger.error(f"Error releasing sandbox {sandbox.sandbox_id}: {str(e)}", exc_info=True)
+
+                row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
+
+            return row
+
+        async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
+            # Bound the number of rows (and therefore live sandboxes) in flight.
+            async with semaphore:
+                return await process_row(r)
+
+        # Create and return tasks
+        return [asyncio.create_task(_sem_wrapper(row)) for row in rows]
diff --git a/pyproject.toml b/pyproject.toml
@@ -134,6 +134,9 @@ openenv = [
 dspy = [
     "dspy>=3.0.0",
 ]
+klavis = [
+    "klavis>=2.18.0",
+]
 
 # Optional deps for LangGraph example/tests
 langgraph = [
diff --git a/tests/chinook/langgraph/test_langgraph_chinook.py b/tests/chinook/langgraph/test_langgraph_chinook.py
@@ -1,5 +1,10 @@
 import pytest
 
+pytest.skip(
+    "Skipping Chinook langgraph integration tests (requires external services/credentials).",
+    allow_module_level=True,
+)
+
 from eval_protocol.models import EvaluateResult, EvaluationRow, Message
 from eval_protocol.pytest import evaluation_test
 
diff --git a/tests/chinook/pydantic/test_pydantic_chinook.py b/tests/chinook/pydantic/test_pydantic_chinook.py
@@ -1,6 +1,12 @@
+import pytest
+
+pytest.skip(
+    "Skipping Chinook pydantic-ai integration tests (requires external LLM access/credentials).",
+    allow_module_level=True,
+)
+
 from pydantic import BaseModel
 from pydantic_ai import Agent
-import pytest
 
 from eval_protocol.models import EvaluateResult, EvaluationRow, Message
 from eval_protocol.pytest import evaluation_test
diff --git a/tests/pytest/datasets/klavis_gmail_sandbox_test.jsonl b/tests/pytest/datasets/klavis_gmail_sandbox_test.jsonl
@@ -0,0 +1,2 @@
+{"initialize_data": {"messages": [{"subject": "Project Update", "to": "zihao@klavisai.com", "body": "The project is progressing well. We should have the final deliverables by next week.", "cc": "", "bcc": "", "from": "sarah@klavisai.com", "reply_to": "", "labels": ["INBOX"]}, {"subject": "Spam Newsletter", "to": "zihao@klavisai.com", "body": "Check out our amazing deals! Click here now!", "cc": "", "bcc": "", "from": "marketing@spammy.com", "reply_to": "", "labels": ["INBOX"]}], "drafts": []}, "messages": "Please delete the email with subject \"Spam Newsletter\" from my inbox.", "ground_truth": {"messages": [{"subject": "Project Update", "to": "zihao@klavisai.com", "body": "The project is progressing well. We should have the final deliverables by next week.", "cc": "", "bcc": "", "from": "sarah@klavisai.com", "reply_to": "", "labels": ["INBOX"]}], "drafts": []}}
+{"initialize_data": {"messages": [], "drafts": []}, "messages": "Please directly send an email to zihao@klavisai.com with subject \"Meeting Tomorrow\" and body \"Hi Zihao, just confirming our meeting tomorrow at 2pm. Best regards.\"", "ground_truth": {"messages": [{"subject": "Meeting Tomorrow", "to": "zihao@klavisai.com", "body": "Hi Zihao, just confirming our meeting tomorrow at 2pm. Best regards.", "cc": "", "bcc": "", "from": "", "reply_to": "", "labels": ["SENT"]}], "drafts": []}}
diff --git a/tests/pytest/test_pytest_klavis_sandbox.py b/tests/pytest/test_pytest_klavis_sandbox.py
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -134,6 +134,9 @@ openenv = [`
`134`	`134`	`dspy = [`
`135`	`135`	`"dspy>=3.0.0",`
`136`	`136`	`]`
	`137`	`+klavis = [`
	`138`	`+ "klavis>=2.18.0",`
	`139`	`+]`
`137`	`140`
`138`	`141`	`# Optional deps for LangGraph example/tests`
`139`	`142`	`langgraph = [`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	+{"initialize_data": {"messages": [{"subject": "Project Update", "to": "zihao@klavisai.com", "body": "The project is progressing well. We should have the final deliverables by next week.", "cc": "", "bcc": "", "from": "sarah@klavisai.com", "reply_to": "", "labels": ["INBOX"]}, {"subject": "Spam Newsletter", "to": "zihao@klavisai.com", "body": "Check out our amazing deals! Click here now!", "cc": "", "bcc": "", "from": "marketing@spammy.com", "reply_to": "", "labels": ["INBOX"]}], "drafts": []}, "messages": "Please delete the email with subject \"Spam Newsletter\" from my inbox.", "ground_truth": {"messages": [{"subject": "Project Update", "to": "zihao@klavisai.com", "body": "The project is progressing well. We should have the final deliverables by next week.", "cc": "", "bcc": "", "from": "sarah@klavisai.com", "reply_to": "", "labels": ["INBOX"]}], "drafts": []}}
	`2`	`+{"initialize_data": {"messages": [], "drafts": []}, "messages": "Please directly send an email to zihao@klavisai.com with subject \"Meeting Tomorrow\" and body \"Hi Zihao, just confirming our meeting tomorrow at 2pm. Best regards.\"", "ground_truth": {"messages": [{"subject": "Meeting Tomorrow", "to": "zihao@klavisai.com", "body": "Hi Zihao, just confirming our meeting tomorrow at 2pm. Best regards.", "cc": "", "bcc": "", "from": "", "reply_to": "", "labels": ["SENT"]}], "drafts": []}}`