24 changes: 7 additions & 17 deletions eval_protocol/__init__.py
@@ -23,15 +23,6 @@
test_mcp,
)
from .data_loader import DynamicDataLoader, InlineDataLoader

# Try to import FireworksPolicy if available
try:
from .mcp_env import FireworksPolicy

_FIREWORKS_AVAILABLE = True
except (ImportError, AttributeError):
_FIREWORKS_AVAILABLE = False
# Import submodules to make them available via eval_protocol.rewards, etc.
from . import mcp, rewards
from .models import EvaluateResult, Message, MetricResult, EvaluationRow, InputMetadata
from .playback_policy import PlaybackPolicyBase
@@ -42,6 +33,13 @@
from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor
from .pytest.parameterize import DefaultParameterIdGenerator

from .types.remote_rollout_processor import (
InitRequest,
RolloutMetadata,
StatusResponse,
create_langfuse_config_tags,
)

try:
from .adapters import OpenAIResponsesAdapter
except ImportError:
@@ -62,14 +60,6 @@
except ImportError:
LangSmithAdapter = None

# Remote server types
from .types.remote_rollout_processor import (
InitRequest,
RolloutMetadata,
StatusResponse,
create_langfuse_config_tags,
)

warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")

__all__ = [
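For context, a minimal sketch of how the relocated re-exports might be consumed after this change; it assumes these names stay available at the package root as the new import block suggests, and that FireworksPolicy, whose conditional top-level export is removed here, would be imported from its own module when needed:

# Sketch of downstream usage; assumes the names remain re-exported at the top level.
from eval_protocol import (
    InitRequest,
    RolloutMetadata,
    StatusResponse,
    create_langfuse_config_tags,
)

# With the conditional export gone, FireworksPolicy would presumably be imported
# directly from its module, with the caller handling an unavailable install:
try:
    from eval_protocol.mcp_env import FireworksPolicy
except (ImportError, AttributeError):
    FireworksPolicy = None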
12 changes: 2 additions & 10 deletions eval_protocol/adapters/huggingface.py
@@ -13,12 +13,9 @@
logger = logging.getLogger(__name__)

try:
from datasets import Dataset, DatasetDict, load_dataset

DATASETS_AVAILABLE = True
from datasets import Dataset, DatasetDict, load_dataset # pyright: ignore[reportAttributeAccessIssue]
except ImportError:
DATASETS_AVAILABLE = False
logger.warning("HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'")
raise ImportError("HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'")

# Type alias for transformation function
TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
@@ -80,11 +77,6 @@ def __init__(
revision: Optional dataset revision/commit hash
**load_dataset_kwargs: Additional arguments to pass to load_dataset
"""
if not DATASETS_AVAILABLE:
raise ImportError(
"HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'"
)

self.dataset_id = dataset_id
self.transform_fn = transform_fn
self.config_name = config_name
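Because the soft DATASETS_AVAILABLE flag is gone and the module now fails at import time, callers that still want optional behaviour would guard the import themselves. A rough sketch of that pattern (the adapter class name below is hypothetical, used only for illustration):

# Hypothetical caller-side guard; HuggingFaceDatasetAdapter is an assumed name,
# not confirmed by this diff.
try:
    from eval_protocol.adapters.huggingface import HuggingFaceDatasetAdapter
except ImportError:
    # datasets extra not installed: pip install 'eval-protocol[huggingface]'
    HuggingFaceDatasetAdapter = None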
9 changes: 8 additions & 1 deletion eval_protocol/execution/pipeline.py
@@ -12,7 +12,6 @@

import aiohttp
import hydra
from datasets import Dataset, DatasetDict
from hydra.errors import InstantiationException
from omegaconf import DictConfig, OmegaConf

@@ -24,6 +23,14 @@
from eval_protocol.utils.module_loader import load_function as load_reward_function
from eval_protocol.utils.packaging_utils import install_requirements

try:
from datasets import Dataset, DatasetDict # pyright: ignore[reportAttributeAccessIssue]
except ImportError:
raise ImportError(
"The 'datasets' package is required to use this function. "
"Please install it with 'pip install \"eval-protocol[huggingface]\"'"
)

logger = logging.getLogger(__name__)


10 changes: 7 additions & 3 deletions eval_protocol/mcp/execution/manager.py
@@ -17,9 +17,6 @@
import anyio
from openai.types import CompletionUsage

from vendor.tau2.data_model.message import AssistantMessage, UserMessage
from vendor.tau2.user.user_simulator import UserSimulator

from ...models import EvaluationRow, InputMetadata, Message, Status
from ...types import TerminationReason, Trajectory, NonSkippableException

@@ -234,6 +231,10 @@ def extract_text_content(msg_dict):

# If user simulation is enabled, initial message is from the simulated user
if dataset_row.user_simulation and dataset_row.user_simulation.get("enabled", False):
# Lazy import vendor.tau2 - only load when user simulation is actually used
from vendor.tau2.data_model.message import AssistantMessage, UserMessage
from vendor.tau2.user.user_simulator import UserSimulator

user_simulator = UserSimulator(
instructions=dataset_row.user_simulation.get("system_prompt"),
llm=dataset_row.user_simulation.get("llm", "gpt-4.1"),
@@ -598,6 +599,9 @@ def _get_user_simulator_messages(self, conversation_history: List[Dict[str, Any]
"""
Filter conversation history for user simulator and convert to tau2-bench format.
"""
# Lazy import vendor.tau2 types
from vendor.tau2.data_model.message import AssistantMessage, UserMessage

tau2_messages = []

for message in conversation_history:
28 changes: 12 additions & 16 deletions eval_protocol/pytest/utils.py
@@ -29,7 +29,8 @@

import logging
import json
import pandas as pd
import random
import statistics


AggregationMethod = Literal["mean", "max", "min", "bootstrap"]
@@ -122,30 +123,25 @@ async def execute_run_with_progress(run_idx: int, config: RolloutProcessorConfig
raise


def calculate_bootstrap_scores(all_scores: list[float]) -> float:
def calculate_bootstrap_scores(all_scores: list[float], n_boot: int = 100, seed: int | None = None) -> float:
"""
Calculate bootstrap confidence intervals for individual scores.
Calculate the mean of bootstrap sample means for a list of scores.

Args:
all_scores: List of individual scores from all rows
all_scores: List of individual scores from all rows.
n_boot: Number of bootstrap resamples to draw (default 100).
seed: Optional RNG seed for reproducibility.

Returns:
Mean bootstrap score
Mean bootstrap score (float). Returns 0.0 if all_scores is empty.
"""
if not all_scores:
return 0.0

# Create DataFrame (single column of scores)
battles = pd.DataFrame({"score": all_scores})

# Bootstrap sampling for calculating relative performance
bootstrap_means = [battles.sample(frac=1.0, replace=True)["score"].mean() for _ in range(100)]

# Calculate final scores
bootstraps = pd.Series(bootstrap_means)
mean_score = bootstraps.mean()

return float(mean_score)
rng = random.Random(seed) if seed is not None else random
k = len(all_scores)
bootstrap_means = [statistics.fmean(rng.choices(all_scores, k=k)) for _ in range(n_boot)]
return float(statistics.fmean(bootstrap_means))


def aggregate(scores: list[float], method: AggregationMethod) -> float:
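A short usage sketch of the reworked helper (the function and parameter names are taken from the diff; the scores themselves are illustrative):

from eval_protocol.pytest.utils import calculate_bootstrap_scores

scores = [0.2, 0.4, 0.4, 0.7, 0.9]

# A fixed seed makes the estimate reproducible across runs.
estimate = calculate_bootstrap_scores(scores, n_boot=500, seed=7)
print(f"bootstrap mean of means: {estimate:.3f}")

# Empty input is defined to return 0.0 rather than raising.
assert calculate_bootstrap_scores([]) == 0.0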
9 changes: 2 additions & 7 deletions eval_protocol/quickstart/utils.py
@@ -3,15 +3,10 @@
"""

import os
from datetime import datetime
import re
from typing import List, Dict, Any, Optional
from openai import AsyncOpenAI
import pandas as pd
from typing import Dict, Any, Optional

from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
import asyncio
from openai import OpenAI
from eval_protocol.models import EvaluationRow, Message

OG_ARENA_HARD_PROMPT = """Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.
26 changes: 7 additions & 19 deletions pyproject.toml
@@ -27,33 +27,26 @@ dependencies = [
"aiohttp",
"mcp>=1.9.2",
"PyYAML>=5.0",
# Pin minimum datasets to avoid pyarrow API mismatch (PyExtensionType removal in pyarrow>=21)
"datasets>=3.0.0",
"fsspec",
"hydra-core>=1.3.2",
"omegaconf>=2.3.0",
"gymnasium>=0.29.0",
"httpx>=0.24.0",
"anthropic>=0.59.0",
"ipykernel>=6.30.0",
"jupyter>=1.1.1",
"litellm<1.75.0",
"pytest>=6.0.0",
"pytest-asyncio>=0.21.0",
"peewee>=3.18.2",
"backoff>=2.2.0",
"questionary>=2.0.0",
# Dependencies for vendored tau2 package
"toml>=0.10.0",
"loguru>=0.6.0",
"docstring-parser>=0.15",
"rich>=12.0.0",
"psutil>=5.8.0",
"litellm<1.75.0",
"addict>=2.4.0",
"deepdiff>=6.0.0",
"pandas>=1.5.0",
"websockets>=15.0.1",
"fastapi>=0.116.1",
"pytest>=6.0.0",
"pytest-asyncio>=0.21.0",
"peewee>=3.18.2",
"backoff>=2.2.0",
"questionary>=2.0.0",
]

[project.urls]
@@ -67,6 +60,7 @@ dev = [
"werkzeug>=2.0.0",
"ruff>=0.5.0",
"transformers>=4.0.0",
"pandas>=1.5.0",
"types-setuptools",
"types-requests",
"types-PyYAML",
@@ -110,12 +104,6 @@ huggingface = [
"datasets>=3.0.0",
"transformers>=4.0.0",
]
adapters = [
"langfuse>=2.0.0",
# Keep in sync with core dependency to ensure compatibility with latest pyarrow
"datasets>=3.0.0",
"transformers>=4.0.0",
]
langsmith = [
"langsmith>=0.1.86",
]
27 changes: 27 additions & 0 deletions tests/test_evaluation_postprocess.py
@@ -208,6 +208,33 @@ def test_all_invalid_scores(self):
assert mock_logger.log.call_count == 2


class TestBootstrapEquivalence:
def test_bootstrap_equivalence_pandas_vs_pure_python(self):
import random
import pandas as pd
from eval_protocol.pytest.utils import calculate_bootstrap_scores as py_bootstrap

# Deterministic synthetic scores
rng = random.Random(123)
scores = [rng.random() for _ in range(100)]

n_boot = 1000
seed = 42

# Old (pandas) style bootstrap: resample full column with replacement
df = pd.DataFrame({"score": scores})
pandas_means = [
df.sample(frac=1.0, replace=True, random_state=seed + i)["score"].mean() for i in range(n_boot)
]
pandas_boot_mean = sum(pandas_means) / len(pandas_means)

# New pure-python implementation
py_boot_mean = py_bootstrap(scores, n_boot=n_boot, seed=seed)

# They estimate the same quantity; allow small Monte Carlo tolerance
assert abs(pandas_boot_mean - py_boot_mean) < 0.02


class TestComputeFixedSetMuCi:
"""Tests for compute_fixed_set_mu_ci function."""
