From db701aba2e7eaeec38152a3a56beb2e55fbc9ca0 Mon Sep 17 00:00:00 2001 From: "Yufei (Benny) Chen" <1585539+benjibc@users.noreply.github.com> Date: Thu, 18 Sep 2025 16:31:33 -0700 Subject: [PATCH 1/5] Cache airline flight DB and harden adapters --- eval_protocol/adapters/langfuse.py | 17 +++++++++---- eval_protocol/adapters/langsmith.py | 22 +++++++++++++---- .../airline_environment.py | 24 ++++++++++++------- tests/pytest/test_mcp_session_autocreate.py | 8 ++++++- 4 files changed, 52 insertions(+), 19 deletions(-) diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py index 44c43fe2..c1cea52f 100644 --- a/eval_protocol/adapters/langfuse.py +++ b/eval_protocol/adapters/langfuse.py @@ -9,7 +9,7 @@ import random import time from datetime import datetime, timedelta -from typing import Any, Dict, List, Optional, Protocol +from typing import Any, Callable, Dict, List, Optional, Protocol, cast from eval_protocol.models import EvaluationRow, InputMetadata, Message from .base import BaseAdapter @@ -44,14 +44,19 @@ def __call__( ... +LangfuseClient = Any + +_get_langfuse_client: Callable[[], Any] | None + try: - from langfuse import get_client # pyright: ignore[reportPrivateImportUsage] + from langfuse import get_client as _get_langfuse_client # type: ignore[attr-defined, reportPrivateImportUsage] from langfuse.api.resources.trace.types.traces import Traces from langfuse.api.resources.commons.types.trace import Trace from langfuse.api.resources.commons.types.trace_with_full_details import TraceWithFullDetails LANGFUSE_AVAILABLE = True -except ImportError: +except ImportError: # pragma: no cover - optional dependency + _get_langfuse_client = None LANGFUSE_AVAILABLE = False @@ -219,7 +224,11 @@ def __init__(self): if not LANGFUSE_AVAILABLE: raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'") - self.client = get_client() + if _get_langfuse_client is None: + raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'") + + client_factory = cast(Callable[[], LangfuseClient], _get_langfuse_client) + self.client: LangfuseClient = client_factory() def get_evaluation_rows( self, diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py index fc1daf71..eb23e4f1 100644 --- a/eval_protocol/adapters/langsmith.py +++ b/eval_protocol/adapters/langsmith.py @@ -10,18 +10,23 @@ from __future__ import annotations import logging -from typing import Any, Dict, List, Optional, Iterable +from typing import Any, Callable, Dict, Iterable, List, Optional, cast from eval_protocol.models import EvaluationRow, InputMetadata, Message from .base import BaseAdapter logger = logging.getLogger(__name__) +LangSmithClient = Any + +_LANGSMITH_CLIENT_CTOR: Callable[..., LangSmithClient] | None + try: - from langsmith import Client # type: ignore + from langsmith import Client as _LANGSMITH_CLIENT_CTOR # type: ignore[attr-defined] LANGSMITH_AVAILABLE = True -except ImportError: +except ImportError: # pragma: no cover - optional dependency + _LANGSMITH_CLIENT_CTOR = None LANGSMITH_AVAILABLE = False @@ -35,10 +40,17 @@ class LangSmithAdapter(BaseAdapter): - outputs: { messages: [...] } | { content } | { result } | { answer } | { output } | str | list[dict] """ - def __init__(self, client: Optional[Client] = None) -> None: + def __init__(self, client: Optional[LangSmithClient] = None) -> None: if not LANGSMITH_AVAILABLE: raise ImportError("LangSmith not installed. Install with: pip install 'eval-protocol[langsmith]'") - self.client = client or Client() + if client is not None: + self.client = client + return + + if _LANGSMITH_CLIENT_CTOR is None: + raise ImportError("LangSmith client constructor unavailable despite successful import check") + + self.client: LangSmithClient = cast(LangSmithClient, _LANGSMITH_CLIENT_CTOR()) def get_evaluation_rows( self, diff --git a/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py b/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py index 1e9b00dd..ce9d905a 100644 --- a/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +++ b/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py @@ -9,8 +9,7 @@ import json import logging import os -import time -from copy import deepcopy +from functools import lru_cache from dataclasses import dataclass, field from enum import Enum from pathlib import Path @@ -24,6 +23,16 @@ from vendor.tau2.domains.airline.utils import AIRLINE_DB_PATH +@lru_cache(maxsize=1) +def _load_flight_db(path: str) -> FlightDB: + """Load and cache the flight database for reuse across resets.""" + + logger.info("🗂️ Loading airline database from disk (cached)") + db_loaded = FlightDB.load(path) + assert isinstance(db_loaded, FlightDB) + return db_loaded + + class AirlineEnvironment: """ Airline environment that integrates τ²-Bench simulation pattern @@ -37,13 +46,10 @@ def __init__(self, config: Optional[Dict[str, Any]] = None): def reset(self, seed: Optional[int] = None) -> Tuple[Dict[str, Any], Dict[str, Any]]: """Reset the environment to initial state""" - logger.info("🔄 Resetting airline environment - reloading database from disk") - # FlightDB.load expects a str path - # Ensure type matches expected FlightDB - # FlightDB.load returns vendor.tau2.domains.airline.data_model.FlightDB which is compatible - db_loaded = FlightDB.load(str(AIRLINE_DB_PATH)) - assert isinstance(db_loaded, FlightDB) - self.db = db_loaded + logger.info("🔄 Resetting airline environment - using cached airline database") + cached_db = _load_flight_db(str(AIRLINE_DB_PATH)) + # Provide a fresh copy for each environment reset without re-reading from disk. + self.db = cached_db.model_copy(deep=True) self.airline_tools = AirlineTools(self.db) return {}, {} diff --git a/tests/pytest/test_mcp_session_autocreate.py b/tests/pytest/test_mcp_session_autocreate.py index df816f55..dd755bbe 100644 --- a/tests/pytest/test_mcp_session_autocreate.py +++ b/tests/pytest/test_mcp_session_autocreate.py @@ -31,11 +31,14 @@ async def test_tool_call_returns_json_without_prior_initial_state(): try: base_url = "http://127.0.0.1:9780/mcp" client = httpx.Client(timeout=1.0) - deadline = time.time() + 20 + start_time = time.time() + deadline = start_time + 20 + ready_time = None while time.time() < deadline: try: r = client.get(base_url) if r.status_code in (200, 307, 406): + ready_time = time.time() break except Exception: pass @@ -43,6 +46,9 @@ async def test_tool_call_returns_json_without_prior_initial_state(): else: pytest.fail("Server did not start on port 9780 in time") + assert ready_time is not None, "Server did not return a successful status before exiting loop" + assert ready_time - start_time < 20, f"Server took too long to respond: {ready_time - start_time:.2f}s" + session = MCPSession(base_url=base_url, session_id="test-autocreate", seed=None, model_id="test-model") mgr = MCPConnectionManager() From 09c5d4957f4b04db62033749f0251383a73ee005 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Thu, 18 Sep 2025 18:08:35 -0700 Subject: [PATCH 2/5] revert langfuse / langsmith changes --- eval_protocol/adapters/langfuse.py | 17 ++++------------- eval_protocol/adapters/langsmith.py | 22 +++++----------------- 2 files changed, 9 insertions(+), 30 deletions(-) diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py index c1cea52f..44c43fe2 100644 --- a/eval_protocol/adapters/langfuse.py +++ b/eval_protocol/adapters/langfuse.py @@ -9,7 +9,7 @@ import random import time from datetime import datetime, timedelta -from typing import Any, Callable, Dict, List, Optional, Protocol, cast +from typing import Any, Dict, List, Optional, Protocol from eval_protocol.models import EvaluationRow, InputMetadata, Message from .base import BaseAdapter @@ -44,19 +44,14 @@ def __call__( ... -LangfuseClient = Any - -_get_langfuse_client: Callable[[], Any] | None - try: - from langfuse import get_client as _get_langfuse_client # type: ignore[attr-defined, reportPrivateImportUsage] + from langfuse import get_client # pyright: ignore[reportPrivateImportUsage] from langfuse.api.resources.trace.types.traces import Traces from langfuse.api.resources.commons.types.trace import Trace from langfuse.api.resources.commons.types.trace_with_full_details import TraceWithFullDetails LANGFUSE_AVAILABLE = True -except ImportError: # pragma: no cover - optional dependency - _get_langfuse_client = None +except ImportError: LANGFUSE_AVAILABLE = False @@ -224,11 +219,7 @@ def __init__(self): if not LANGFUSE_AVAILABLE: raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'") - if _get_langfuse_client is None: - raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'") - - client_factory = cast(Callable[[], LangfuseClient], _get_langfuse_client) - self.client: LangfuseClient = client_factory() + self.client = get_client() def get_evaluation_rows( self, diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py index eb23e4f1..fc1daf71 100644 --- a/eval_protocol/adapters/langsmith.py +++ b/eval_protocol/adapters/langsmith.py @@ -10,23 +10,18 @@ from __future__ import annotations import logging -from typing import Any, Callable, Dict, Iterable, List, Optional, cast +from typing import Any, Dict, List, Optional, Iterable from eval_protocol.models import EvaluationRow, InputMetadata, Message from .base import BaseAdapter logger = logging.getLogger(__name__) -LangSmithClient = Any - -_LANGSMITH_CLIENT_CTOR: Callable[..., LangSmithClient] | None - try: - from langsmith import Client as _LANGSMITH_CLIENT_CTOR # type: ignore[attr-defined] + from langsmith import Client # type: ignore LANGSMITH_AVAILABLE = True -except ImportError: # pragma: no cover - optional dependency - _LANGSMITH_CLIENT_CTOR = None +except ImportError: LANGSMITH_AVAILABLE = False @@ -40,17 +35,10 @@ class LangSmithAdapter(BaseAdapter): - outputs: { messages: [...] } | { content } | { result } | { answer } | { output } | str | list[dict] """ - def __init__(self, client: Optional[LangSmithClient] = None) -> None: + def __init__(self, client: Optional[Client] = None) -> None: if not LANGSMITH_AVAILABLE: raise ImportError("LangSmith not installed. Install with: pip install 'eval-protocol[langsmith]'") - if client is not None: - self.client = client - return - - if _LANGSMITH_CLIENT_CTOR is None: - raise ImportError("LangSmith client constructor unavailable despite successful import check") - - self.client: LangSmithClient = cast(LangSmithClient, _LANGSMITH_CLIENT_CTOR()) + self.client = client or Client() def get_evaluation_rows( self, From cc288b22930032248dfcccf14d79bb04511445ec Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Fri, 19 Sep 2025 12:48:02 -0700 Subject: [PATCH 3/5] try diff prots --- tests/pytest/test_mcp_session_autocreate.py | 30 ++++++++++++--------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/tests/pytest/test_mcp_session_autocreate.py b/tests/pytest/test_mcp_session_autocreate.py index dd755bbe..1c1ec42c 100644 --- a/tests/pytest/test_mcp_session_autocreate.py +++ b/tests/pytest/test_mcp_session_autocreate.py @@ -3,8 +3,9 @@ without requiring a prior initial state fetch, and returns JSON. """ +import socket import time -from multiprocessing import Process +from multiprocessing import Process, Queue import httpx import pytest @@ -13,10 +14,16 @@ from eval_protocol.types import MCPSession -def _run_airline_server(): +def _run_airline_server(port_queue): import os - os.environ["PORT"] = "9780" + # Use dynamic port allocation to avoid conflicts in parallel CI runs + with socket.socket() as s: + s.bind(("", 0)) + port = s.getsockname()[1] + + port_queue.put(port) # Send the port back to the test + os.environ["PORT"] = str(port) from eval_protocol.mcp_servers.tau2.tau2_mcp import AirlineDomainMcp server = AirlineDomainMcp(seed=None) @@ -25,29 +32,26 @@ def _run_airline_server(): @pytest.mark.asyncio async def test_tool_call_returns_json_without_prior_initial_state(): - proc = Process(target=_run_airline_server, daemon=True) + port_queue = Queue() + proc = Process(target=_run_airline_server, args=(port_queue,), daemon=True) proc.start() try: - base_url = "http://127.0.0.1:9780/mcp" + # Get the dynamically assigned port + port = port_queue.get(timeout=10) + base_url = f"http://127.0.0.1:{port}/mcp" client = httpx.Client(timeout=1.0) - start_time = time.time() - deadline = start_time + 20 - ready_time = None + deadline = time.time() + 20 while time.time() < deadline: try: r = client.get(base_url) if r.status_code in (200, 307, 406): - ready_time = time.time() break except Exception: pass time.sleep(0.2) else: - pytest.fail("Server did not start on port 9780 in time") - - assert ready_time is not None, "Server did not return a successful status before exiting loop" - assert ready_time - start_time < 20, f"Server took too long to respond: {ready_time - start_time:.2f}s" + pytest.fail(f"Server did not start on port {port} in time") session = MCPSession(base_url=base_url, session_id="test-autocreate", seed=None, model_id="test-model") From e8477cf1eff052578dba8b4cc0af9b62cdda6626 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Fri, 19 Sep 2025 13:11:54 -0700 Subject: [PATCH 4/5] test --- tests/pytest/test_mcp_session_autocreate.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/pytest/test_mcp_session_autocreate.py b/tests/pytest/test_mcp_session_autocreate.py index 1c1ec42c..bac0a77f 100644 --- a/tests/pytest/test_mcp_session_autocreate.py +++ b/tests/pytest/test_mcp_session_autocreate.py @@ -3,7 +3,6 @@ without requiring a prior initial state fetch, and returns JSON. """ -import socket import time from multiprocessing import Process, Queue @@ -17,13 +16,12 @@ def _run_airline_server(port_queue): import os - # Use dynamic port allocation to avoid conflicts in parallel CI runs - with socket.socket() as s: - s.bind(("", 0)) - port = s.getsockname()[1] + # Use different ports based on Python version to avoid conflicts in parallel CI runs + python_version = os.environ.get("PYTHON_VERSION", "3.10").replace(".", "") + port = str(9780 + int(python_version[-2:])) # 9780, 9781, 9782 + os.environ["PORT"] = port - port_queue.put(port) # Send the port back to the test - os.environ["PORT"] = str(port) + port_queue.put(int(port)) # Send the port back to the test from eval_protocol.mcp_servers.tau2.tau2_mcp import AirlineDomainMcp server = AirlineDomainMcp(seed=None) From 638bbf1cd475b19b0b1a9c91f758549e21722f40 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Fri, 19 Sep 2025 13:24:56 -0700 Subject: [PATCH 5/5] hard code port --- tests/pytest/test_mcp_session_autocreate.py | 29 ++++++++++++--------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/tests/pytest/test_mcp_session_autocreate.py b/tests/pytest/test_mcp_session_autocreate.py index bac0a77f..6cc76f50 100644 --- a/tests/pytest/test_mcp_session_autocreate.py +++ b/tests/pytest/test_mcp_session_autocreate.py @@ -4,7 +4,7 @@ """ import time -from multiprocessing import Process, Queue +from multiprocessing import Process import httpx import pytest @@ -13,15 +13,12 @@ from eval_protocol.types import MCPSession -def _run_airline_server(port_queue): +def _run_airline_server(): import os - # Use different ports based on Python version to avoid conflicts in parallel CI runs python_version = os.environ.get("PYTHON_VERSION", "3.10").replace(".", "") - port = str(9780 + int(python_version[-2:])) # 9780, 9781, 9782 + port = str(9780 + int(python_version[-1:])) os.environ["PORT"] = port - - port_queue.put(int(port)) # Send the port back to the test from eval_protocol.mcp_servers.tau2.tau2_mcp import AirlineDomainMcp server = AirlineDomainMcp(seed=None) @@ -30,26 +27,34 @@ def _run_airline_server(port_queue): @pytest.mark.asyncio async def test_tool_call_returns_json_without_prior_initial_state(): - port_queue = Queue() - proc = Process(target=_run_airline_server, args=(port_queue,), daemon=True) + import os + + proc = Process(target=_run_airline_server, daemon=True) proc.start() try: - # Get the dynamically assigned port - port = port_queue.get(timeout=10) + python_version = os.environ.get("PYTHON_VERSION", "3.10").replace(".", "") + port = str(9780 + int(python_version[-1:])) + base_url = f"http://127.0.0.1:{port}/mcp" client = httpx.Client(timeout=1.0) - deadline = time.time() + 20 + start_time = time.time() + deadline = start_time + 20 + ready_time = None while time.time() < deadline: try: r = client.get(base_url) if r.status_code in (200, 307, 406): + ready_time = time.time() break except Exception: pass time.sleep(0.2) else: - pytest.fail(f"Server did not start on port {port} in time") + pytest.fail("Server did not start on port 9780 in time") + + assert ready_time is not None, "Server did not return a successful status before exiting loop" + assert ready_time - start_time < 20, f"Server took too long to respond: {ready_time - start_time:.2f}s" session = MCPSession(base_url=base_url, session_id="test-autocreate", seed=None, model_id="test-model")