diff --git a/eval_protocol/__init__.py b/eval_protocol/__init__.py index b713441a..bbbe53e6 100644 --- a/eval_protocol/__init__.py +++ b/eval_protocol/__init__.py @@ -23,15 +23,6 @@ test_mcp, ) from .data_loader import DynamicDataLoader, InlineDataLoader - -# Try to import FireworksPolicy if available -try: - from .mcp_env import FireworksPolicy - - _FIREWORKS_AVAILABLE = True -except (ImportError, AttributeError): - _FIREWORKS_AVAILABLE = False -# Import submodules to make them available via eval_protocol.rewards, etc. from . import mcp, rewards from .models import EvaluateResult, Message, MetricResult, EvaluationRow, InputMetadata from .playback_policy import PlaybackPolicyBase @@ -42,6 +33,13 @@ from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor from .pytest.parameterize import DefaultParameterIdGenerator +from .types.remote_rollout_processor import ( + InitRequest, + RolloutMetadata, + StatusResponse, + create_langfuse_config_tags, +) + try: from .adapters import OpenAIResponsesAdapter except ImportError: @@ -62,14 +60,6 @@ except ImportError: LangSmithAdapter = None -# Remote server types -from .types.remote_rollout_processor import ( - InitRequest, - RolloutMetadata, - StatusResponse, - create_langfuse_config_tags, -) - warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol") __all__ = [ diff --git a/eval_protocol/adapters/huggingface.py b/eval_protocol/adapters/huggingface.py index 1f740ba2..afed2e42 100644 --- a/eval_protocol/adapters/huggingface.py +++ b/eval_protocol/adapters/huggingface.py @@ -13,12 +13,9 @@ logger = logging.getLogger(__name__) try: - from datasets import Dataset, DatasetDict, load_dataset - - DATASETS_AVAILABLE = True + from datasets import Dataset, DatasetDict, load_dataset # pyright: ignore[reportAttributeAccessIssue] except ImportError: - DATASETS_AVAILABLE = False - logger.warning("HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'") + raise ImportError("HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'") # Type alias for transformation function TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]] @@ -80,11 +77,6 @@ def __init__( revision: Optional dataset revision/commit hash **load_dataset_kwargs: Additional arguments to pass to load_dataset """ - if not DATASETS_AVAILABLE: - raise ImportError( - "HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'" - ) - self.dataset_id = dataset_id self.transform_fn = transform_fn self.config_name = config_name diff --git a/eval_protocol/execution/pipeline.py b/eval_protocol/execution/pipeline.py index b61b43b8..283f102d 100644 --- a/eval_protocol/execution/pipeline.py +++ b/eval_protocol/execution/pipeline.py @@ -12,7 +12,6 @@ import aiohttp import hydra -from datasets import Dataset, DatasetDict from hydra.errors import InstantiationException from omegaconf import DictConfig, OmegaConf @@ -24,6 +23,14 @@ from eval_protocol.utils.module_loader import load_function as load_reward_function from eval_protocol.utils.packaging_utils import install_requirements +try: + from datasets import Dataset, DatasetDict # pyright: ignore[reportAttributeAccessIssue] +except ImportError: + raise ImportError( + "The 'datasets' package is required to use this function. " + "Please install it with 'pip install \"eval-protocol[huggingface]\"'" + ) + logger = logging.getLogger(__name__) diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py index 85784295..f62134b2 100644 --- a/eval_protocol/mcp/execution/manager.py +++ b/eval_protocol/mcp/execution/manager.py @@ -17,9 +17,6 @@ import anyio from openai.types import CompletionUsage -from vendor.tau2.data_model.message import AssistantMessage, UserMessage -from vendor.tau2.user.user_simulator import UserSimulator - from ...models import EvaluationRow, InputMetadata, Message, Status from ...types import TerminationReason, Trajectory, NonSkippableException @@ -234,6 +231,10 @@ def extract_text_content(msg_dict): # If user simulation is enabled, initial message is from the simulated user if dataset_row.user_simulation and dataset_row.user_simulation.get("enabled", False): + # Lazy import vendor.tau2 - only load when user simulation is actually used + from vendor.tau2.data_model.message import AssistantMessage, UserMessage + from vendor.tau2.user.user_simulator import UserSimulator + user_simulator = UserSimulator( instructions=dataset_row.user_simulation.get("system_prompt"), llm=dataset_row.user_simulation.get("llm", "gpt-4.1"), @@ -598,6 +599,9 @@ def _get_user_simulator_messages(self, conversation_history: List[Dict[str, Any] """ Filter conversation history for user simulator and convert to tau2-bench format. """ + # Lazy import vendor.tau2 types + from vendor.tau2.data_model.message import AssistantMessage, UserMessage + tau2_messages = [] for message in conversation_history: diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py index 557f5be2..2417d8b9 100644 --- a/eval_protocol/pytest/utils.py +++ b/eval_protocol/pytest/utils.py @@ -29,7 +29,8 @@ import logging import json -import pandas as pd +import random +import statistics AggregationMethod = Literal["mean", "max", "min", "bootstrap"] @@ -122,30 +123,25 @@ async def execute_run_with_progress(run_idx: int, config: RolloutProcessorConfig raise -def calculate_bootstrap_scores(all_scores: list[float]) -> float: +def calculate_bootstrap_scores(all_scores: list[float], n_boot: int = 100, seed: int | None = None) -> float: """ - Calculate bootstrap confidence intervals for individual scores. + Calculate the mean of bootstrap sample means for a list of scores. Args: - all_scores: List of individual scores from all rows + all_scores: List of individual scores from all rows. + n_boot: Number of bootstrap resamples to draw (default 100). + seed: Optional RNG seed for reproducibility. Returns: - Mean bootstrap score + Mean bootstrap score (float). Returns 0.0 if all_scores is empty. """ if not all_scores: return 0.0 - # Create DataFrame (single column of scores) - battles = pd.DataFrame({"score": all_scores}) - - # Bootstrap sampling for calculating relative performance - bootstrap_means = [battles.sample(frac=1.0, replace=True)["score"].mean() for _ in range(100)] - - # Calculate final scores - bootstraps = pd.Series(bootstrap_means) - mean_score = bootstraps.mean() - - return float(mean_score) + rng = random.Random(seed) if seed is not None else random + k = len(all_scores) + bootstrap_means = [statistics.fmean(rng.choices(all_scores, k=k)) for _ in range(n_boot)] + return float(statistics.fmean(bootstrap_means)) def aggregate(scores: list[float], method: AggregationMethod) -> float: diff --git a/eval_protocol/quickstart/utils.py b/eval_protocol/quickstart/utils.py index a5ab49d5..36685425 100644 --- a/eval_protocol/quickstart/utils.py +++ b/eval_protocol/quickstart/utils.py @@ -3,15 +3,10 @@ """ import os -from datetime import datetime import re -from typing import List, Dict, Any, Optional -from openai import AsyncOpenAI -import pandas as pd +from typing import Dict, Any, Optional -from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult -import asyncio -from openai import OpenAI +from eval_protocol.models import EvaluationRow, Message OG_ARENA_HARD_PROMPT = """Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better. diff --git a/pyproject.toml b/pyproject.toml index bb2051c9..c4188c0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,33 +27,26 @@ dependencies = [ "aiohttp", "mcp>=1.9.2", "PyYAML>=5.0", - # Pin minimum datasets to avoid pyarrow API mismatch (PyExtensionType removal in pyarrow>=21) - "datasets>=3.0.0", - "fsspec", "hydra-core>=1.3.2", "omegaconf>=2.3.0", - "gymnasium>=0.29.0", "httpx>=0.24.0", "anthropic>=0.59.0", - "ipykernel>=6.30.0", - "jupyter>=1.1.1", + "litellm<1.75.0", + "pytest>=6.0.0", + "pytest-asyncio>=0.21.0", + "peewee>=3.18.2", + "backoff>=2.2.0", + "questionary>=2.0.0", # Dependencies for vendored tau2 package "toml>=0.10.0", "loguru>=0.6.0", "docstring-parser>=0.15", "rich>=12.0.0", "psutil>=5.8.0", - "litellm<1.75.0", "addict>=2.4.0", "deepdiff>=6.0.0", - "pandas>=1.5.0", "websockets>=15.0.1", "fastapi>=0.116.1", - "pytest>=6.0.0", - "pytest-asyncio>=0.21.0", - "peewee>=3.18.2", - "backoff>=2.2.0", - "questionary>=2.0.0", ] [project.urls] @@ -67,6 +60,7 @@ dev = [ "werkzeug>=2.0.0", "ruff>=0.5.0", "transformers>=4.0.0", + "pandas>=1.5.0", "types-setuptools", "types-requests", "types-PyYAML", @@ -110,12 +104,6 @@ huggingface = [ "datasets>=3.0.0", "transformers>=4.0.0", ] -adapters = [ - "langfuse>=2.0.0", - # Keep in sync with core dependency to ensure compatibility with latest pyarrow - "datasets>=3.0.0", - "transformers>=4.0.0", -] langsmith = [ "langsmith>=0.1.86", ] diff --git a/tests/test_evaluation_postprocess.py b/tests/test_evaluation_postprocess.py index 7d18205c..4bc790f9 100644 --- a/tests/test_evaluation_postprocess.py +++ b/tests/test_evaluation_postprocess.py @@ -208,6 +208,33 @@ def test_all_invalid_scores(self): assert mock_logger.log.call_count == 2 +class TestBootstrapEquivalence: + def test_bootstrap_equivalence_pandas_vs_pure_python(self): + import random + import pandas as pd + from eval_protocol.pytest.utils import calculate_bootstrap_scores as py_bootstrap + + # Deterministic synthetic scores + rng = random.Random(123) + scores = [rng.random() for _ in range(100)] + + n_boot = 1000 + seed = 42 + + # Old (pandas) style bootstrap: resample full column with replacement + df = pd.DataFrame({"score": scores}) + pandas_means = [ + df.sample(frac=1.0, replace=True, random_state=seed + i)["score"].mean() for i in range(n_boot) + ] + pandas_boot_mean = sum(pandas_means) / len(pandas_means) + + # New pure-python implementation + py_boot_mean = py_bootstrap(scores, n_boot=n_boot, seed=seed) + + # They estimate the same quantity; allow small Monte Carlo tolerance + assert abs(pandas_boot_mean - py_boot_mean) < 0.02 + + class TestComputeFixedSetMuCi: """Tests for compute_fixed_set_mu_ci function.""" diff --git a/uv.lock b/uv.lock index 27750bb5..6c194582 100644 --- a/uv.lock +++ b/uv.lock @@ -1218,22 +1218,16 @@ dependencies = [ { name = "anthropic" }, { name = "backoff" }, { name = "dataclasses-json" }, - { name = "datasets" }, { name = "deepdiff" }, { name = "docstring-parser" }, { name = "fastapi" }, - { name = "fsspec" }, - { name = "gymnasium" }, { name = "httpx" }, { name = "hydra-core" }, - { name = "ipykernel" }, - { name = "jupyter" }, { name = "litellm" }, { name = "loguru" }, { name = "mcp" }, { name = "omegaconf" }, { name = "openai" }, - { name = "pandas" }, { name = "peewee" }, { name = "psutil" }, { name = "pydantic" }, @@ -1250,11 +1244,6 @@ dependencies = [ ] [package.optional-dependencies] -adapters = [ - { name = "datasets" }, - { name = "langfuse" }, - { name = "transformers" }, -] bigquery = [ { name = "google-auth" }, { name = "google-cloud-bigquery" }, @@ -1278,6 +1267,7 @@ dev = [ { name = "ipykernel" }, { name = "jupyter" }, { name = "openai" }, + { name = "pandas" }, { name = "pip" }, { name = "pre-commit" }, { name = "pytest-cov" }, @@ -1359,8 +1349,6 @@ requires-dist = [ { name = "braintrust", extras = ["otel"], marker = "extra == 'braintrust'" }, { name = "build", marker = "extra == 'dev'" }, { name = "dataclasses-json", specifier = ">=0.5.7" }, - { name = "datasets", specifier = ">=3.0.0" }, - { name = "datasets", marker = "extra == 'adapters'", specifier = ">=3.0.0" }, { name = "datasets", marker = "extra == 'huggingface'", specifier = ">=3.0.0" }, { name = "deepdiff", specifier = ">=6.0.0" }, { name = "docker", marker = "extra == 'dev'", specifier = "==7.1.0" }, @@ -1368,23 +1356,18 @@ requires-dist = [ { name = "e2b", marker = "extra == 'dev'" }, { name = "fastapi", specifier = ">=0.116.1" }, { name = "fireworks-ai", marker = "extra == 'fireworks'", specifier = ">=0.19.19" }, - { name = "fsspec" }, { name = "google-auth", marker = "extra == 'bigquery'", specifier = ">=2.0.0" }, { name = "google-cloud-bigquery", marker = "extra == 'bigquery'", specifier = ">=3.0.0" }, - { name = "gymnasium", specifier = ">=0.29.0" }, { name = "gymnasium", extras = ["box2d"], marker = "extra == 'box2d'", specifier = ">=0.29.0" }, { name = "haikus", marker = "extra == 'dev'", specifier = "==0.3.8" }, { name = "httpx", specifier = ">=0.24.0" }, { name = "hydra-core", specifier = ">=1.3.2" }, - { name = "ipykernel", specifier = ">=6.30.0" }, { name = "ipykernel", marker = "extra == 'dev'", specifier = ">=6.30.0" }, - { name = "jupyter", specifier = ">=1.1.1" }, { name = "jupyter", marker = "extra == 'dev'", specifier = ">=1.1.1" }, { name = "langchain", marker = "extra == 'langgraph-tools'", specifier = ">=0.3.0" }, { name = "langchain-core", marker = "extra == 'langchain'", specifier = ">=0.3.0" }, { name = "langchain-core", marker = "extra == 'langgraph'", specifier = ">=0.3.75" }, { name = "langchain-fireworks", marker = "extra == 'langgraph-tools'", specifier = ">=0.3.0" }, - { name = "langfuse", marker = "extra == 'adapters'", specifier = ">=2.0.0" }, { name = "langfuse", marker = "extra == 'langfuse'", specifier = ">=2.0.0" }, { name = "langgraph", marker = "extra == 'langgraph'", specifier = ">=0.6.7" }, { name = "langgraph", marker = "extra == 'langgraph-tools'", specifier = ">=0.6.7" }, @@ -1396,7 +1379,7 @@ requires-dist = [ { name = "openai", specifier = ">=1.78.1" }, { name = "openai", marker = "extra == 'dev'", specifier = ">=1.78.1" }, { name = "openevals", marker = "extra == 'openevals'", specifier = ">=0.1.0" }, - { name = "pandas", specifier = ">=1.5.0" }, + { name = "pandas", marker = "extra == 'dev'", specifier = ">=1.5.0" }, { name = "peewee", specifier = ">=3.18.2" }, { name = "peft", marker = "extra == 'trl'", specifier = ">=0.7.0" }, { name = "pillow", marker = "extra == 'box2d'" }, @@ -1423,7 +1406,6 @@ requires-dist = [ { name = "syrupy", marker = "extra == 'dev'", specifier = ">=4.0.0" }, { name = "toml", specifier = ">=0.10.0" }, { name = "torch", marker = "extra == 'trl'", specifier = ">=1.9" }, - { name = "transformers", marker = "extra == 'adapters'", specifier = ">=4.0.0" }, { name = "transformers", marker = "extra == 'dev'", specifier = ">=4.0.0" }, { name = "transformers", marker = "extra == 'huggingface'", specifier = ">=4.0.0" }, { name = "transformers", marker = "extra == 'trl'", specifier = ">=4.0.0" }, @@ -1438,7 +1420,7 @@ requires-dist = [ { name = "websockets", specifier = ">=15.0.1" }, { name = "werkzeug", marker = "extra == 'dev'", specifier = ">=2.0.0" }, ] -provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "langsmith", "bigquery", "svgbench", "pydantic", "supabase", "chinook", "langchain", "braintrust", "langgraph", "langgraph-tools"] +provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "langsmith", "bigquery", "svgbench", "pydantic", "supabase", "chinook", "langchain", "braintrust", "langgraph", "langgraph-tools"] [package.metadata.requires-dev] dev = [ @@ -5399,15 +5381,16 @@ wheels = [ [[package]] name = "pytest-asyncio" -version = "1.1.0" +version = "1.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "backports-asyncio-runner", marker = "python_full_version < '3.11'" }, { name = "pytest" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/4e/51/f8794af39eeb870e87a8c8068642fc07bce0c854d6865d7dd0f2a9d338c2/pytest_asyncio-1.1.0.tar.gz", hash = "sha256:796aa822981e01b68c12e4827b8697108f7205020f24b5793b3c41555dab68ea", size = 46652, upload-time = "2025-07-16T04:29:26.393Z" } +sdist = { url = "https://files.pythonhosted.org/packages/42/86/9e3c5f48f7b7b638b216e4b9e645f54d199d7abbbab7a64a13b4e12ba10f/pytest_asyncio-1.2.0.tar.gz", hash = "sha256:c609a64a2a8768462d0c99811ddb8bd2583c33fd33cf7f21af1c142e824ffb57", size = 50119, upload-time = "2025-09-12T07:33:53.816Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/9d/bf86eddabf8c6c9cb1ea9a869d6873b46f105a5d292d3a6f7071f5b07935/pytest_asyncio-1.1.0-py3-none-any.whl", hash = "sha256:5fe2d69607b0bd75c656d1211f969cadba035030156745ee09e7d71740e58ecf", size = 15157, upload-time = "2025-07-16T04:29:24.929Z" }, + { url = "https://files.pythonhosted.org/packages/04/93/2fa34714b7a4ae72f2f8dad66ba17dd9a2c793220719e736dda28b7aec27/pytest_asyncio-1.2.0-py3-none-any.whl", hash = "sha256:8e17ae5e46d8e7efe51ab6494dd2010f4ca8dae51652aa3c8d55acf50bfb2e99", size = 15095, upload-time = "2025-09-12T07:33:52.639Z" }, ] [[package]]