24 changes: 7 additions & 17 deletions eval_protocol/__init__.py
@@ -23,15 +23,6 @@
test_mcp,
)
from .data_loader import DynamicDataLoader, InlineDataLoader

# Try to import FireworksPolicy if available
try:
from .mcp_env import FireworksPolicy

_FIREWORKS_AVAILABLE = True
except (ImportError, AttributeError):
_FIREWORKS_AVAILABLE = False
# Import submodules to make them available via eval_protocol.rewards, etc.
from . import mcp, rewards
from .models import EvaluateResult, Message, MetricResult, EvaluationRow, InputMetadata
from .playback_policy import PlaybackPolicyBase
@@ -42,6 +33,13 @@
from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor
from .pytest.parameterize import DefaultParameterIdGenerator

from .types.remote_rollout_processor import (
InitRequest,
RolloutMetadata,
StatusResponse,
create_langfuse_config_tags,
)

try:
from .adapters import OpenAIResponsesAdapter
except ImportError:
@@ -62,14 +60,6 @@
except ImportError:
LangSmithAdapter = None

# Remote server types
from .types.remote_rollout_processor import (
InitRequest,
RolloutMetadata,
StatusResponse,
create_langfuse_config_tags,
)

warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")

__all__ = [
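For context, a minimal sketch of how the relocated re-exports might be consumed after this change; it assumes these names stay available at the package root as the new import block suggests, and that FireworksPolicy, whose conditional top-level export is removed here, would be imported from its own module when needed:

# Sketch of downstream usage; assumes the names remain re-exported at the top level.
from eval_protocol import (
    InitRequest,
    RolloutMetadata,
    StatusResponse,
    create_langfuse_config_tags,
)

# With the conditional export gone, FireworksPolicy would presumably be imported
# directly from its module, with the caller handling an unavailable install:
try:
    from eval_protocol.mcp_env import FireworksPolicy
except (ImportError, AttributeError):
    FireworksPolicy = None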
12 changes: 2 additions & 10 deletions eval_protocol/adapters/huggingface.py
@@ -13,12 +13,9 @@
logger = logging.getLogger(__name__)

try:
from datasets import Dataset, DatasetDict, load_dataset

DATASETS_AVAILABLE = True
from datasets import Dataset, DatasetDict, load_dataset # pyright: ignore[reportAttributeAccessIssue]
except ImportError:
DATASETS_AVAILABLE = False
logger.warning("HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'")
raise ImportError("HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'")

# Type alias for transformation function
TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
@@ -80,11 +77,6 @@ def __init__(
revision: Optional dataset revision/commit hash
**load_dataset_kwargs: Additional arguments to pass to load_dataset
"""
if not DATASETS_AVAILABLE:
raise ImportError(
"HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'"
)

self.dataset_id = dataset_id
self.transform_fn = transform_fn
self.config_name = config_name
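Because the soft DATASETS_AVAILABLE flag is gone and the module now fails at import time, callers that still want optional behaviour would guard the import themselves. A rough sketch of that pattern (the adapter class name below is hypothetical, used only for illustration):

# Hypothetical caller-side guard; HuggingFaceDatasetAdapter is an assumed name,
# not confirmed by this diff.
try:
    from eval_protocol.adapters.huggingface import HuggingFaceDatasetAdapter
except ImportError:
    # datasets extra not installed: pip install 'eval-protocol[huggingface]'
    HuggingFaceDatasetAdapter = None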
9 changes: 8 additions & 1 deletion eval_protocol/execution/pipeline.py
@@ -12,7 +12,6 @@

import aiohttp
import hydra
from datasets import Dataset, DatasetDict
from hydra.errors import InstantiationException
from omegaconf import DictConfig, OmegaConf

@@ -24,6 +23,14 @@
from eval_protocol.utils.module_loader import load_function as load_reward_function
from eval_protocol.utils.packaging_utils import install_requirements

try:
from datasets import Dataset, DatasetDict # pyright: ignore[reportAttributeAccessIssue]
except ImportError:
raise ImportError(
"The 'datasets' package is required to use this function. "
"Please install it with 'pip install \"eval-protocol[huggingface]\"'"
)

logger = logging.getLogger(__name__)


10 changes: 7 additions & 3 deletions eval_protocol/mcp/execution/manager.py
@@ -17,9 +17,6 @@
import anyio
from openai.types import CompletionUsage

from vendor.tau2.data_model.message import AssistantMessage, UserMessage
from vendor.tau2.user.user_simulator import UserSimulator

from ...models import EvaluationRow, InputMetadata, Message, Status
from ...types import TerminationReason, Trajectory, NonSkippableException

@@ -234,6 +231,10 @@ def extract_text_content(msg_dict):

# If user simulation is enabled, initial message is from the simulated user
if dataset_row.user_simulation and dataset_row.user_simulation.get("enabled", False):
# Lazy import vendor.tau2 - only load when user simulation is actually used
from vendor.tau2.data_model.message import AssistantMessage, UserMessage
from vendor.tau2.user.user_simulator import UserSimulator

user_simulator = UserSimulator(
instructions=dataset_row.user_simulation.get("system_prompt"),
llm=dataset_row.user_simulation.get("llm", "gpt-4.1"),
@@ -598,6 +599,9 @@ def _get_user_simulator_messages(self, conversation_history: List[Dict[str, Any]
"""
Filter conversation history for user simulator and convert to tau2-bench format.
"""
# Lazy import vendor.tau2 types
from vendor.tau2.data_model.message import AssistantMessage, UserMessage

tau2_messages = []

for message in conversation_history:
28 changes: 12 additions & 16 deletions eval_protocol/pytest/utils.py
@@ -29,7 +29,8 @@

import logging
import json
import pandas as pd
import random
import statistics


AggregationMethod = Literal["mean", "max", "min", "bootstrap"]
@@ -122,30 +123,25 @@ async def execute_run_with_progress(run_idx: int, config: RolloutProcessorConfig
raise


def calculate_bootstrap_scores(all_scores: list[float]) -> float:
def calculate_bootstrap_scores(all_scores: list[float], n_boot: int = 100, seed: int | None = None) -> float:
"""
Calculate bootstrap confidence intervals for individual scores.
Calculate the mean of bootstrap sample means for a list of scores.

Args:
all_scores: List of individual scores from all rows
all_scores: List of individual scores from all rows.
n_boot: Number of bootstrap resamples to draw (default 100).
seed: Optional RNG seed for reproducibility.

Returns:
Mean bootstrap score
Mean bootstrap score (float). Returns 0.0 if all_scores is empty.
"""
if not all_scores:
return 0.0

# Create DataFrame (single column of scores)
battles = pd.DataFrame({"score": all_scores})

# Bootstrap sampling for calculating relative performance
bootstrap_means = [battles.sample(frac=1.0, replace=True)["score"].mean() for _ in range(100)]

# Calculate final scores
bootstraps = pd.Series(bootstrap_means)
mean_score = bootstraps.mean()

return float(mean_score)
rng = random.Random(seed) if seed is not None else random
k = len(all_scores)
bootstrap_means = [statistics.fmean(rng.choices(all_scores, k=k)) for _ in range(n_boot)]
return float(statistics.fmean(bootstrap_means))


def aggregate(scores: list[float], method: AggregationMethod) -> float:
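A short usage sketch of the reworked helper (the function and parameter names are taken from the diff; the scores themselves are illustrative):

from eval_protocol.pytest.utils import calculate_bootstrap_scores

scores = [0.2, 0.4, 0.4, 0.7, 0.9]

# A fixed seed makes the estimate reproducible across runs.
estimate = calculate_bootstrap_scores(scores, n_boot=500, seed=7)
print(f"bootstrap mean of means: {estimate:.3f}")

# Empty input is defined to return 0.0 rather than raising.
assert calculate_bootstrap_scores([]) == 0.0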
9 changes: 2 additions & 7 deletions eval_protocol/quickstart/utils.py
@@ -3,15 +3,10 @@
"""

import os
from datetime import datetime
import re
from typing import List, Dict, Any, Optional
from openai import AsyncOpenAI
import pandas as pd
from typing import Dict, Any, Optional

from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
import asyncio
from openai import OpenAI
from eval_protocol.models import EvaluationRow, Message

OG_ARENA_HARD_PROMPT = """Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.
26 changes: 7 additions & 19 deletions pyproject.toml
@@ -27,33 +27,26 @@ dependencies = [
"aiohttp",
"mcp>=1.9.2",
"PyYAML>=5.0",
# Pin minimum datasets to avoid pyarrow API mismatch (PyExtensionType removal in pyarrow>=21)
"datasets>=3.0.0",
"fsspec",
"hydra-core>=1.3.2",
"omegaconf>=2.3.0",
"gymnasium>=0.29.0",
"httpx>=0.24.0",
"anthropic>=0.59.0",
"ipykernel>=6.30.0",
"jupyter>=1.1.1",
"litellm<1.75.0",
"pytest>=6.0.0",
"pytest-asyncio>=0.21.0",
"peewee>=3.18.2",
"backoff>=2.2.0",
"questionary>=2.0.0",
# Dependencies for vendored tau2 package
"toml>=0.10.0",
"loguru>=0.6.0",
"docstring-parser>=0.15",
"rich>=12.0.0",
"psutil>=5.8.0",
"litellm<1.75.0",
"addict>=2.4.0",
"deepdiff>=6.0.0",
"pandas>=1.5.0",
"websockets>=15.0.1",
"fastapi>=0.116.1",
"pytest>=6.0.0",
"pytest-asyncio>=0.21.0",
"peewee>=3.18.2",
"backoff>=2.2.0",
"questionary>=2.0.0",
]

[project.urls]
@@ -67,6 +60,7 @@ dev = [
"werkzeug>=2.0.0",
"ruff>=0.5.0",
"transformers>=4.0.0",
"pandas>=1.5.0",
"types-setuptools",
"types-requests",
"types-PyYAML",
@@ -110,12 +104,6 @@ huggingface = [
"datasets>=3.0.0",
"transformers>=4.0.0",
]
adapters = [
"langfuse>=2.0.0",
# Keep in sync with core dependency to ensure compatibility with latest pyarrow
"datasets>=3.0.0",
"transformers>=4.0.0",
]
langsmith = [
"langsmith>=0.1.86",
]
27 changes: 27 additions & 0 deletions tests/test_evaluation_postprocess.py
@@ -208,6 +208,33 @@ def test_all_invalid_scores(self):
assert mock_logger.log.call_count == 2


class TestBootstrapEquivalence:
def test_bootstrap_equivalence_pandas_vs_pure_python(self):
import random
import pandas as pd
from eval_protocol.pytest.utils import calculate_bootstrap_scores as py_bootstrap

# Deterministic synthetic scores
rng = random.Random(123)
scores = [rng.random() for _ in range(100)]

n_boot = 1000
seed = 42

# Old (pandas) style bootstrap: resample full column with replacement
df = pd.DataFrame({"score": scores})
pandas_means = [
df.sample(frac=1.0, replace=True, random_state=seed + i)["score"].mean() for i in range(n_boot)
]
pandas_boot_mean = sum(pandas_means) / len(pandas_means)

# New pure-python implementation
py_boot_mean = py_bootstrap(scores, n_boot=n_boot, seed=seed)

# They estimate the same quantity; allow small Monte Carlo tolerance
assert abs(pandas_boot_mean - py_boot_mean) < 0.02


class TestComputeFixedSetMuCi:
"""Tests for compute_fixed_set_mu_ci function."""
