14 changes: 10 additions & 4 deletions eval_protocol/adapters/__init__.py
@@ -4,20 +4,26 @@
and converting them to EvaluationRow format for use in evaluation pipelines.

Available adapters:
- BaseAdapter: Abstract base class for all adapters
- LangfuseAdapter: Pull data from Langfuse deployments
- HuggingFaceAdapter: Load datasets from HuggingFace Hub
- BigQueryAdapter: Query data from Google BigQuery
- Braintrust integration (legacy)
- TRL integration (legacy)
"""

+# Always available
+from .base import BaseAdapter
+
+__all__ = ["BaseAdapter"]

# Conditional imports based on available dependencies
try:
from .langfuse import LangfuseAdapter, create_langfuse_adapter

__all__ = ["LangfuseAdapter", "create_langfuse_adapter"]
__all__.extend(["LangfuseAdapter", "create_langfuse_adapter"])
except ImportError:
__all__ = []
pass

try:
from .huggingface import (
@@ -55,9 +61,9 @@

# Legacy adapters (always available)
try:
-    from .braintrust import reward_fn_to_scorer, scorer_to_reward_fn
+    from .braintrust import BraintrustAdapter, create_braintrust_adapter, reward_fn_to_scorer, scorer_to_reward_fn

-    __all__.extend(["scorer_to_reward_fn", "reward_fn_to_scorer"])
+    __all__.extend(["BraintrustAdapter", "create_braintrust_adapter", "scorer_to_reward_fn", "reward_fn_to_scorer"])
except ImportError:
pass

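Since exports now land in `__all__` only when their optional dependencies import cleanly, downstream code can guard its own imports the same way. A minimal consumer-side sketch (the fallback name and error message are illustrative, not part of this PR):

```python
# Hypothetical guard mirroring the conditional-export pattern above.
try:
    from eval_protocol.adapters import LangfuseAdapter
except ImportError:
    LangfuseAdapter = None  # langfuse extra not installed


def require_langfuse_adapter():
    if LangfuseAdapter is None:
        raise RuntimeError("LangfuseAdapter is unavailable; install the langfuse extra")
    return LangfuseAdapter
```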
21 changes: 21 additions & 0 deletions eval_protocol/adapters/base.py
@@ -0,0 +1,21 @@
"""
Base adapter interface for Eval Protocol.
"""

from abc import ABC, abstractmethod
from typing import List

from eval_protocol.models import EvaluationRow


class BaseAdapter(ABC):
"""Abstract base class for all Eval Protocol adapters."""

@abstractmethod
def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
"""Get evaluation rows from the data source."""
pass

def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
"""Push evaluation scores back to the data source for tracking and analysis."""
pass
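For context, a minimal sketch of a third-party adapter built on this interface. The class and its in-memory data source are hypothetical; the `EvaluationRow` and `Message` field names follow their usage elsewhere in this diff:

```python
from typing import List

from eval_protocol.adapters.base import BaseAdapter
from eval_protocol.models import EvaluationRow, Message


class InMemoryAdapter(BaseAdapter):
    """Hypothetical adapter that serves rows from a list of prompt strings."""

    def __init__(self, prompts: List[str]):
        self.prompts = prompts

    def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
        # Wrap each prompt as a single-turn user conversation.
        return [EvaluationRow(messages=[Message(role="user", content=p)]) for p in self.prompts]

    # push_scores is inherited as a no-op, so this adapter can be passed
    # anywhere a BaseAdapter is accepted without supporting write-back.
```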
3 changes: 2 additions & 1 deletion eval_protocol/adapters/bigquery.py
@@ -10,6 +10,7 @@
from typing import Any, Callable, Dict, Iterator, List, Optional, TypeAlias

from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
from .base import BaseAdapter

logger = logging.getLogger(__name__)

@@ -42,7 +43,7 @@
TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]


-class BigQueryAdapter:
+class BigQueryAdapter(BaseAdapter):
"""Adapter to query data from Google BigQuery and convert to EvaluationRow format.

This adapter connects to Google BigQuery, executes SQL queries, and applies
46 changes: 45 additions & 1 deletion eval_protocol/adapters/braintrust.py
@@ -14,6 +14,7 @@
import requests

from eval_protocol.models import EvaluationRow, InputMetadata, Message
from .base import BaseAdapter
from .utils import extract_messages_from_data

# Keep backward compatibility
@@ -128,7 +129,7 @@ def extract_messages_from_trace(trace: Dict[str, Any], include_tool_calls: bool
return messages


-class BraintrustAdapter:
+class BraintrustAdapter(BaseAdapter):
"""Adapter to pull data from Braintrust and convert to EvaluationRow format.

This adapter can pull both chat conversations and tool calling traces from
@@ -223,6 +224,49 @@ def get_evaluation_rows(
logger.info("Successfully processed %d BTQL results into %d evaluation rows", len(all_traces), len(eval_rows))
return eval_rows

def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
"""Push evaluation scores back to Braintrust traces for tracking and analysis.

Creates score entries in Braintrust for each unique trace_id found in the evaluation
rows' session data. This allows you to see evaluation results directly in the
Braintrust UI alongside the original traces.

Args:
rows: List of EvaluationRow objects with session_data containing trace IDs
model_name: Name of the model (used as the score name in Braintrust)
mean_score: The calculated mean score to push to Braintrust

Note:
            Failures (e.g. rows lacking session data or a rejected request) are caught and logged as warnings rather than raised.
"""
try:
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}

feedback_items = []
for trace_id in set(
row.input_metadata.session_data["braintrust_trace_id"]
for row in rows
if row.evaluation_result and row.input_metadata and row.input_metadata.session_data
):
if trace_id:
feedback_items.append({"id": trace_id, "scores": {model_name: mean_score}})

if feedback_items:
payload = {"feedback": feedback_items}

response = requests.post(
f"{self.api_url}/v1/project_logs/{self.project_id}/feedback",
headers=headers,
json=payload,
)
response.raise_for_status()

except Exception as e:
logger.warning("Failed to push scores to Braintrust: %s", e)


def create_braintrust_adapter(
api_key: Optional[str] = None,
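A usage sketch of the new write-back path. Only `api_key` is visible in `create_braintrust_adapter`'s signature here, so the remaining factory arguments are elided, and the model name and score handling are illustrative:

```python
from eval_protocol.adapters.braintrust import create_braintrust_adapter

adapter = create_braintrust_adapter(api_key="...")  # other factory arguments elided above

rows = adapter.get_evaluation_rows()
# ... run an evaluation that populates row.evaluation_result ...

scored = [r for r in rows if r.evaluation_result]
mean_score = sum(r.evaluation_result.score for r in scored) / len(scored)

# Posts one feedback entry per unique braintrust_trace_id, with the score
# keyed by model name, via the /feedback endpoint used in push_scores.
adapter.push_scores(rows, model_name="my-model", mean_score=mean_score)
```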
3 changes: 2 additions & 1 deletion eval_protocol/adapters/huggingface.py
@@ -8,6 +8,7 @@
from typing import Any, Callable, Dict, Iterator, List, Optional

from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
from .base import BaseAdapter

logger = logging.getLogger(__name__)

@@ -23,7 +24,7 @@
TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]


-class HuggingFaceAdapter:
+class HuggingFaceAdapter(BaseAdapter):
"""Generic adapter to load HuggingFace datasets with custom transformations.

This adapter loads datasets from HuggingFace Hub and applies a user-provided
3 changes: 2 additions & 1 deletion eval_protocol/adapters/langfuse.py
@@ -12,6 +12,7 @@
from typing import Any, Dict, List, Optional, Protocol

from eval_protocol.models import EvaluationRow, InputMetadata, Message
from .base import BaseAdapter
from .utils import extract_messages_from_data

logger = logging.getLogger(__name__)
@@ -188,7 +189,7 @@ def get_final_generation_in_span(trace: TraceWithFullDetails, span_name: str) ->
return generations[-1]


-class LangfuseAdapter:
+class LangfuseAdapter(BaseAdapter):
"""Adapter to pull data from Langfuse and convert to EvaluationRow format.

This adapter can pull both chat conversations and tool calling traces from
3 changes: 2 additions & 1 deletion eval_protocol/adapters/langsmith.py
@@ -13,6 +13,7 @@
from typing import Any, Dict, List, Optional, Iterable

from eval_protocol.models import EvaluationRow, InputMetadata, Message
from .base import BaseAdapter

logger = logging.getLogger(__name__)

@@ -24,7 +25,7 @@
LANGSMITH_AVAILABLE = False


-class LangSmithAdapter:
+class LangSmithAdapter(BaseAdapter):
"""Adapter to pull data from LangSmith and convert to EvaluationRow format.

By default, fetches root runs from a project and maps inputs/outputs into
3 changes: 2 additions & 1 deletion eval_protocol/adapters/openai_responses.py
@@ -21,14 +21,15 @@
from openai.types.responses.tool import Tool

from eval_protocol.models import EvaluationRow, InputMetadata, Message
from .base import BaseAdapter

logger = logging.getLogger(__name__)


from openai import OpenAI


-class OpenAIResponsesAdapter:
+class OpenAIResponsesAdapter(BaseAdapter):
"""Adapter to pull data from OpenAI Responses API and convert to EvaluationRow format.

This adapter can pull both chat conversations and tool calling traces from
61 changes: 11 additions & 50 deletions eval_protocol/quickstart/llm_judge.py
@@ -2,65 +2,23 @@
Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto.
"""

-from collections.abc import Awaitable, Callable
import os
-from datetime import datetime
-from typing import List, Dict, Any, Optional
-from typing_extensions import cast
-from tqdm import tqdm
+from typing import Optional

-import pytest

-from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult
-from eval_protocol.pytest import evaluation_test
-from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
+from eval_protocol.models import EvaluationRow
+from eval_protocol.adapters.base import BaseAdapter
from eval_protocol.quickstart.utils import (
    split_multi_turn_rows,
    JUDGE_CONFIGS,
    calculate_bootstrap_scores,
    run_judgment_async,
)
-import asyncio
-from openai import AsyncOpenAI
-from eval_protocol.adapters.langfuse import create_langfuse_adapter
-
-adapter = create_langfuse_adapter()
-
-
-@pytest.mark.asyncio
-@evaluation_test(
-    input_rows=[
-        adapter.get_evaluation_rows(
-            to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
-            limit=711,
-            sample_size=50,
-            sleep_between_gets=3.0,
-            max_retries=5,
-        )
-    ],
-    completion_params=[
-        {"model": "gpt-4.1"},
-        {
-            "max_tokens": 131000,
-            "extra_body": {"reasoning_effort": "medium"},
-            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
-        },
-        {
-            "max_tokens": 131000,
-            "extra_body": {"reasoning_effort": "low"},
-            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
-        },
-    ],
-    rollout_processor=SingleTurnRolloutProcessor(),
-    preprocess_fn=split_multi_turn_rows,
-    max_concurrent_rollouts=64,
-    mode="all",
-)
-async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
-    return await aha_judge(rows)


-async def aha_judge(rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro") -> list[EvaluationRow]:
+async def aha_judge(
+    rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro", adapter: Optional[BaseAdapter] = None
+) -> list[EvaluationRow]:
"""
LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons.

@@ -73,6 +31,8 @@ async def aha_judge(rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro

Args:
rows: List of EvaluationRow objects with messages, ground_truth, and tools
judge_name: Name of the judge configuration to use
adapter: Optional adapter to push scores back to (if provided)

Returns:
Same rows with updated evaluation_result containing scores and judgments
@@ -133,7 +93,8 @@ async def run_judgment(row):
if row.evaluation_result:
row.evaluation_result.score = mean_score

-    # Optional, push scores back to Langfuse. Note that one score per model will be pushed back onto same trace.
-    adapter.push_scores(rows, model_name, mean_score)
+    # Push scores back to the adapter if provided. Note that one score per model
+    # will be pushed back onto the same trace.
+    if adapter:
+        adapter.push_scores(rows, model_name, mean_score)

return rows
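With the decorator-driven test removed, callers now wire the adapter in explicitly. A minimal sketch of the new flow, reusing the Langfuse factory that the deleted code imported (the `limit` argument mirrors the removed `get_evaluation_rows` call; the value is illustrative):

```python
import asyncio

from eval_protocol.adapters.langfuse import create_langfuse_adapter
from eval_protocol.quickstart.llm_judge import aha_judge


async def main():
    adapter = create_langfuse_adapter()
    rows = adapter.get_evaluation_rows(limit=50)
    # Passing the adapter enables the write-back above; omitting it
    # (adapter=None) skips push_scores entirely.
    await aha_judge(rows, judge_name="gemini-2.5-pro", adapter=adapter)


asyncio.run(main())
```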