diff --git a/eval_protocol/mcp_servers/frozen_lake/server.py b/eval_protocol/mcp_servers/frozen_lake/server.py index a5ddff45..43c59b5e 100755 --- a/eval_protocol/mcp_servers/frozen_lake/server.py +++ b/eval_protocol/mcp_servers/frozen_lake/server.py @@ -14,8 +14,13 @@ import sys from pathlib import Path -# Add root directory to path so we can import eval_protocol -sys.path.insert(0, str(Path(__file__).parent.parent.parent)) +# Add current directory first for local imports (frozen_lake_mcp) +sys.path.insert(0, str(Path(__file__).parent)) + +# Add eval_protocol parent to path, but use append to avoid priority conflicts +parent_dir = str(Path(__file__).parent.parent.parent) +if parent_dir not in sys.path: + sys.path.append(parent_dir) from frozen_lake_mcp import FrozenLakeMcp diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py index 42428b5c..9173e6f9 100644 --- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py @@ -25,7 +25,7 @@ class MCPServerManager: def __init__(self, server_script: str, port: int = 8000, **kwargs): self.server_script = server_script self.port = port - self.domain = str(kwargs.get("domain", "airline")) + self.domain = kwargs.get("domain", None) self.process: Optional[subprocess.Popen] = None self.base_dir = Path(".").resolve() self._log_file = None @@ -59,11 +59,14 @@ def start(self) -> None: env = os.environ.copy() env["PORT"] = str(self.port) - # Start server process (no domain argument needed for tau2_mcp server) - cmd = ["python", self.server_script, "--port", str(self.port), "--domain", self.domain] + # Build command, add --domain only if provided (e.g. tau2 needs it, frozen_lake doesn't) + cmd = ["python", self.server_script, "--port", str(self.port)] + if self.domain: + cmd.extend(["--domain", self.domain]) # Setup log file with cleanup - log_file_path = os.path.join(self.base_dir, f"server_output_{self.domain}_{self.port}.log") + domain_part = self.domain if self.domain else "server" + log_file_path = os.path.join(self.base_dir, f"server_output_{domain_part}_{self.port}.log") if os.path.exists(log_file_path): os.remove(log_file_path) diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py index 88eb06e9..6a707adf 100644 --- a/tests/pytest/test_tau_bench_airline.py +++ b/tests/pytest/test_tau_bench_airline.py @@ -73,6 +73,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval } ], rollout_processor=MCPGymRolloutProcessor(), + rollout_processor_kwargs={"domain": "airline"}, passed_threshold={"success": 0.4, "standard_error": 0.02}, num_runs=8, mode="pointwise", diff --git a/uv.lock b/uv.lock index 6c194582..e75a7c55 100644 --- a/uv.lock +++ b/uv.lock @@ -1263,6 +1263,7 @@ dev = [ { name = "build" }, { name = "docker" }, { name = "e2b" }, + { name = "gymnasium" }, { name = "haikus" }, { name = "ipykernel" }, { name = "jupyter" }, @@ -1358,6 +1359,7 @@ requires-dist = [ { name = "fireworks-ai", marker = "extra == 'fireworks'", specifier = ">=0.19.19" }, { name = "google-auth", marker = "extra == 'bigquery'", specifier = ">=2.0.0" }, { name = "google-cloud-bigquery", marker = "extra == 'bigquery'", specifier = ">=3.0.0" }, + { name = "gymnasium", marker = "extra == 'dev'", specifier = ">=1.2.0" }, { name = "gymnasium", extras = ["box2d"], marker = "extra == 'box2d'", specifier = ">=0.29.0" }, { name = "haikus", marker = "extra == 'dev'", specifier = "==0.3.8" }, { name = "httpx", specifier = ">=0.24.0" },