Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions eval_protocol/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,18 +37,18 @@
from .resources import create_llm_resource
from .reward_function import RewardFunction
from .typed_interface import reward_function
from .quickstart import aha_judge, split_multi_turn_rows
from .quickstart import aha_judge, multi_turn_assistant_to_ground_truth, assistant_to_ground_truth
from .pytest import evaluation_test, SingleTurnRolloutProcessor

from .adapters import OpenAIResponsesAdapter

try:
from .adapters import LangfuseAdapter
from .adapters import LangfuseAdapter, create_langfuse_adapter
except ImportError:
LangfuseAdapter = None

try:
from .adapters import BraintrustAdapter
from .adapters import BraintrustAdapter, create_braintrust_adapter
except ImportError:
BraintrustAdapter = None

Expand All @@ -62,12 +62,15 @@

__all__ = [
"aha_judge",
"split_multi_turn_rows",
"multi_turn_assistant_to_ground_truth",
"assistant_to_ground_truth",
"evaluation_test",
"SingleTurnRolloutProcessor",
"OpenAIResponsesAdapter",
"LangfuseAdapter",
"create_langfuse_adapter",
"BraintrustAdapter",
"create_braintrust_adapter",
"LangSmithAdapter",
# Core interfaces
"Message",
Expand Down
4 changes: 4 additions & 0 deletions eval_protocol/adapters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,7 @@ def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
"""Upload evaluation scores back to the data source for tracking and analysis."""
pass

def upload_score(self, row: EvaluationRow, model_name: str) -> None:
    """Upload evaluation score for a single row back to the data source.

    Default implementation is a no-op so adapters that do not support
    per-row score upload can inherit it unchanged; concrete adapters
    override this to push ``row.evaluation_result`` upstream.

    Args:
        row: EvaluationRow whose evaluation result should be uploaded.
        model_name: Name of the model; used as the score name in the
            destination system.
    """
    pass
34 changes: 34 additions & 0 deletions eval_protocol/adapters/braintrust.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,40 @@ def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score:
except Exception as e:
logger.warning("Failed to push scores to Braintrust: %s", e)

def upload_score(self, row: EvaluationRow, model_name: str) -> None:
    """Upload evaluation score for a single row back to Braintrust.

    Posts the row's score to the Braintrust feedback endpoint, keyed by the
    trace id stored in ``row.input_metadata.session_data``. Rows without a
    valid score or without a trace id are skipped; any failure is logged as
    a warning instead of being raised.

    Args:
        row: Single EvaluationRow with evaluation_result and session_data
            containing the Braintrust trace ID.
        model_name: Name of the model (used as the score name in Braintrust).
    """
    try:
        result = row.evaluation_result
        if result is None or not result.is_score_valid:
            return

        metadata = row.input_metadata
        if not (metadata and metadata.session_data):
            return
        if "braintrust_trace_id" not in metadata.session_data:
            return

        trace_id = metadata.session_data["braintrust_trace_id"]
        if not trace_id:
            return

        payload = {
            "feedback": [{"id": trace_id, "scores": {model_name: result.score}}],
        }
        response = requests.post(
            f"{self.api_url}/v1/feedback",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json=payload,
            timeout=30,
        )
        response.raise_for_status()
    except Exception as e:
        logger.warning("Failed to upload single score to Braintrust: %s", e)


def create_braintrust_adapter(
api_key: Optional[str] = None,
Expand Down
28 changes: 25 additions & 3 deletions eval_protocol/adapters/langfuse.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,9 +445,6 @@ def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score:
rows: List of EvaluationRow objects with session_data containing trace IDs
model_name: Name of the model (used as the score name in Langfuse)
mean_score: The calculated mean score to push to Langfuse

Note:
Silently handles errors if rows lack session data
"""
try:
for trace_id in set(
Expand All @@ -464,6 +461,31 @@ def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score:
except Exception as e:
logger.warning("Failed to push scores to Langfuse: %s", e)

def upload_score(self, row: EvaluationRow, model_name: str) -> None:
    """Upload evaluation score for a single row back to Langfuse.

    Creates a Langfuse score on the trace identified by
    ``row.input_metadata.session_data["langfuse_trace_id"]``. Rows without
    a valid score or without a trace id are skipped; any failure is logged
    as a warning instead of being raised.

    Args:
        row: Single EvaluationRow with evaluation_result and session_data
            containing the Langfuse trace ID.
        model_name: Name of the model (used as the score name in Langfuse).
    """
    try:
        result = row.evaluation_result
        metadata = row.input_metadata
        has_score = result is not None and result.is_score_valid
        has_session = metadata is not None and bool(metadata.session_data)
        if not (has_score and has_session):
            return

        # Missing key and falsy trace id are treated the same: skip silently.
        trace_id = metadata.session_data.get("langfuse_trace_id")
        if trace_id:
            self.client.create_score(
                trace_id=trace_id,
                name=model_name,
                value=result.score,
            )
    except Exception as e:
        logger.warning("Failed to push score to Langfuse: %s", e)


def create_langfuse_adapter() -> LangfuseAdapter:
"""Factory function to create a Langfuse adapter."""
Expand Down
12 changes: 10 additions & 2 deletions eval_protocol/dataset_logger/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,19 @@ def read(self, rollout_id=None):

# Lazy property that creates the logger only when accessed
class _LazyLogger(DatasetLogger):
    """Proxy logger that defers creating the real logger until first use.

    ``log``/``read`` lazily construct the default logger on the first call
    and cache it for every call after that.
    """

    def __init__(self):
        # Underlying logger; populated on first log()/read() call.
        self._logger: DatasetLogger | None = None

    def _get_logger(self):
        """Create the default logger on first access and cache it."""
        cached = self._logger
        if cached is None:
            cached = _get_default_logger()
            self._logger = cached
        return cached

    def log(self, row):
        """Forward ``row`` to the lazily-created default logger."""
        return self._get_logger().log(row)

    def read(self, rollout_id=None):
        """Read rows from the lazily-created default logger."""
        return self._get_logger().read(rollout_id)


default_logger: DatasetLogger = _LazyLogger()
12 changes: 7 additions & 5 deletions eval_protocol/dataset_logger/sqlite_evaluation_row_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class SqliteEvaluationRowStore:
def __init__(self, db_path: str):
os.makedirs(os.path.dirname(db_path), exist_ok=True)
self._db_path = db_path
self._db = SqliteDatabase(self._db_path)
self._db = SqliteDatabase(self._db_path, pragmas={"journal_mode": "wal"})

class BaseModel(Model):
class Meta:
Expand All @@ -41,10 +41,12 @@ def upsert_row(self, data: dict) -> None:
rollout_id = data["execution_metadata"]["rollout_id"]
if rollout_id is None:
raise ValueError("execution_metadata.rollout_id is required to upsert a row")
if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
else:
self._EvaluationRow.create(rollout_id=rollout_id, data=data)

with self._db.atomic("EXCLUSIVE"):
if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
else:
self._EvaluationRow.create(rollout_id=rollout_id, data=data)

def read_rows(self, rollout_id: Optional[str] = None) -> List[dict]:
if rollout_id is None:
Expand Down
15 changes: 12 additions & 3 deletions eval_protocol/pytest/evaluation_test_postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from eval_protocol.models import CompletionParams, EvaluationRow, EvaluationThreshold
from eval_protocol.pytest.handle_persist_flow import handle_persist_flow
from eval_protocol.pytest.types import EvaluationTestMode
from eval_protocol.pytest.utils import AggregationMethod, aggregate, extract_effort_tag, sanitize_filename # pyright: ignore[reportUnknownVariableType]
from eval_protocol.pytest.utils import AggregationMethod, aggregate, extract_effort_tag, sanitize_filename
from eval_protocol.stats.confidence_intervals import compute_fixed_set_mu_ci


Expand All @@ -25,9 +25,18 @@ def postprocess(
num_runs: int,
experiment_duration_seconds: float,
):
scores = [
sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result) for result in all_results
valid_results = [
[r for r in result if r.evaluation_result and r.evaluation_result.is_score_valid] for result in all_results
]

if aggregation_method == "bootstrap":
scores = [r.evaluation_result.score for result in valid_results for r in result if r.evaluation_result]
else:
scores = [
sum(r.evaluation_result.score for r in result if r.evaluation_result) / len(result)
for result in valid_results
if result
]
agg_score = aggregate(scores, aggregation_method)

# Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
Expand Down
2 changes: 1 addition & 1 deletion eval_protocol/pytest/handle_persist_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
row_data["evals"] = {"score": 0}
row_data["eval_details"] = {
"score": 0,
"is_score_valid": True,
"is_score_valid": False,
"reason": "No evaluation result",
"metrics": {},
}
Expand Down
32 changes: 30 additions & 2 deletions eval_protocol/pytest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,36 @@

import logging
import json
import pandas as pd


AggregationMethod = Literal["mean", "max", "min"]
AggregationMethod = Literal["mean", "max", "min", "bootstrap"]


def calculate_bootstrap_scores(all_scores: list[float], n_resamples: int = 100) -> float:
    """Estimate the mean score via bootstrap resampling.

    Draws ``n_resamples`` bootstrap samples (same size as the input, with
    replacement) from ``all_scores``, takes the mean of each sample, and
    returns the mean of those bootstrap means. Note this is a point
    estimate, not a confidence interval, and it is stochastic: repeated
    calls on the same input may differ slightly.

    Args:
        all_scores: Individual scores from all evaluation rows.
        n_resamples: Number of bootstrap resamples to draw. Defaults to 100.

    Returns:
        Mean of the bootstrap sample means, or 0.0 for empty input.
    """
    if not all_scores:
        return 0.0

    # A Series is sufficient here; no need for a full DataFrame per call.
    scores = pd.Series(all_scores)

    # Each resample keeps the original sample size (frac=1.0, with replacement).
    bootstrap_means = [
        scores.sample(frac=1.0, replace=True).mean() for _ in range(n_resamples)
    ]

    return float(pd.Series(bootstrap_means).mean())


def aggregate(scores: list[float], method: AggregationMethod) -> float:
Expand All @@ -41,7 +68,8 @@ def aggregate(scores: list[float], method: AggregationMethod) -> float:
return max(scores)
if method == "min":
return min(scores)
raise ValueError(f"Unknown aggregation method: {method}") # pyright: ignore[reportUnreachable]
if method == "bootstrap":
return calculate_bootstrap_scores(scores)


def log_eval_status_and_rows(
Expand Down
4 changes: 2 additions & 2 deletions eval_protocol/quickstart/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .llm_judge import aha_judge
from .utils import split_multi_turn_rows
from .utils import multi_turn_assistant_to_ground_truth, assistant_to_ground_truth

__all__ = ["aha_judge"]
__all__ = ["aha_judge", "multi_turn_assistant_to_ground_truth", "assistant_to_ground_truth"]
Loading
Loading