Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 0 additions & 49 deletions docs/integrations/braintrust_integration.mdx

This file was deleted.

4 changes: 0 additions & 4 deletions eval_protocol/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@

import warnings

from eval_protocol.adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn

from .auth import get_fireworks_account_id, get_fireworks_api_key
from .common_utils import load_jsonl
from .config import RewardKitConfig, get_config, load_config
Expand Down Expand Up @@ -49,8 +47,6 @@
"EvaluateResult",
"reward_function",
"RewardFunction",
"scorer_to_reward_fn",
"reward_fn_to_scorer",
# Authentication
"get_fireworks_api_key",
"get_fireworks_account_id",
Expand Down
18 changes: 12 additions & 6 deletions eval_protocol/adapters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,25 @@
and converting them to EvaluationRow format for use in evaluation pipelines.

Available adapters:
- BaseAdapter: Abstract base class for all adapters
- LangfuseAdapter: Pull data from Langfuse deployments
- HuggingFaceAdapter: Load datasets from HuggingFace Hub
- BigQueryAdapter: Query data from Google BigQuery
- BraintrustAdapter: Pull data from Braintrust deployments
- TRL integration (legacy)
"""

# Always available
from .base import BaseAdapter

__all__ = ["BaseAdapter"]

# Conditional imports based on available dependencies
try:
from .langfuse import LangfuseAdapter, create_langfuse_adapter

__all__ = ["LangfuseAdapter", "create_langfuse_adapter"]
__all__.extend(["LangfuseAdapter", "create_langfuse_adapter"])
except ImportError:
__all__ = []
pass

try:
from .huggingface import (
Expand Down Expand Up @@ -53,14 +58,15 @@
except ImportError:
pass

# Braintrust adapter (optional dependency)
try:
from .braintrust import reward_fn_to_scorer, scorer_to_reward_fn
from .braintrust import BraintrustAdapter, create_braintrust_adapter

__all__.extend(["scorer_to_reward_fn", "reward_fn_to_scorer"])
__all__.extend(["BraintrustAdapter", "create_braintrust_adapter"])
except ImportError:
pass

# Legacy adapters (always available)

try:
from .trl import create_trl_adapter

Expand Down
21 changes: 21 additions & 0 deletions eval_protocol/adapters/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""
Base adapter interface for Eval Protocol.
"""

from abc import ABC, abstractmethod
from typing import List

from eval_protocol.models import EvaluationRow


class BaseAdapter(ABC):
    """Abstract base class for all Eval Protocol adapters.

    Subclasses must implement ``get_evaluation_rows`` to fetch data from their
    backing source and may override ``upload_scores`` to write results back.
    """

    @abstractmethod
    def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
        """Get evaluation rows from the data source.

        Args:
            *args: Source-specific positional arguments (defined by each subclass).
            **kwargs: Source-specific keyword arguments (defined by each subclass).

        Returns:
            A list of EvaluationRow objects built from the data source.
        """
        pass

    def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
        """Upload evaluation scores back to the data source for tracking and analysis.

        The default implementation is a deliberate no-op so that adapters for
        read-only sources do not have to override it.

        Args:
            rows: Evaluation rows whose scores should be reported.
            model_name: Name of the evaluated model (used to label the score).
            mean_score: Aggregate score to report for these rows.
        """
        pass
3 changes: 2 additions & 1 deletion eval_protocol/adapters/bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing import Any, Callable, Dict, Iterator, List, Optional, TypeAlias

from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
from .base import BaseAdapter

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -42,7 +43,7 @@
TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]


class BigQueryAdapter:
class BigQueryAdapter(BaseAdapter):
"""Adapter to query data from Google BigQuery and convert to EvaluationRow format.

This adapter connects to Google BigQuery, executes SQL queries, and applies
Expand Down
51 changes: 46 additions & 5 deletions eval_protocol/adapters/braintrust.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,9 @@
import requests

from eval_protocol.models import EvaluationRow, InputMetadata, Message
from .base import BaseAdapter
from .utils import extract_messages_from_data

# Keep backward compatibility
from ..integrations.braintrust import reward_fn_to_scorer, scorer_to_reward_fn


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -128,7 +126,7 @@ def extract_messages_from_trace(trace: Dict[str, Any], include_tool_calls: bool
return messages


class BraintrustAdapter:
class BraintrustAdapter(BaseAdapter):
"""Adapter to pull data from Braintrust and convert to EvaluationRow format.

This adapter can pull both chat conversations and tool calling traces from
Expand Down Expand Up @@ -223,6 +221,49 @@ def get_evaluation_rows(
logger.info("Successfully processed %d BTQL results into %d evaluation rows", len(all_traces), len(eval_rows))
return eval_rows

def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
    """Upload evaluation scores back to Braintrust traces for tracking and analysis.

    Creates score entries in Braintrust for each unique trace_id found in the evaluation
    rows' session data. This allows you to see evaluation results directly in the
    Braintrust UI alongside the original traces.

    Args:
        rows: List of EvaluationRow objects with session_data containing trace IDs
        model_name: Name of the model (used as the score name in Braintrust)
        mean_score: The calculated mean score to push to Braintrust

    Note:
        Silently handles errors if rows lack session data; failures are logged
        as warnings and never raised to the caller.
    """
    try:
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        # Use .get() so a row whose session_data lacks the trace-id key is
        # skipped instead of raising KeyError inside the comprehension and
        # aborting the upload for every other row.
        trace_ids = {
            row.input_metadata.session_data.get("braintrust_trace_id")
            for row in rows
            if row.evaluation_result and row.input_metadata and row.input_metadata.session_data
        }
        feedback_items = [
            {"id": trace_id, "scores": {model_name: mean_score}}
            for trace_id in trace_ids
            if trace_id  # drop None / empty trace ids
        ]

        if feedback_items:
            payload = {"feedback": feedback_items}

            response = requests.post(
                f"{self.api_url}/v1/project_logs/{self.project_id}/feedback",
                headers=headers,
                json=payload,
            )
            response.raise_for_status()

    except Exception as e:
        # Best-effort upload: never let score reporting break the evaluation run.
        logger.warning("Failed to push scores to Braintrust: %s", e)


def create_braintrust_adapter(
api_key: Optional[str] = None,
Expand All @@ -237,4 +278,4 @@ def create_braintrust_adapter(
)


__all__ = ["scorer_to_reward_fn", "reward_fn_to_scorer", "BraintrustAdapter", "create_braintrust_adapter"]
__all__ = ["BraintrustAdapter", "create_braintrust_adapter"]
3 changes: 2 additions & 1 deletion eval_protocol/adapters/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import Any, Callable, Dict, Iterator, List, Optional

from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
from .base import BaseAdapter

logger = logging.getLogger(__name__)

Expand All @@ -23,7 +24,7 @@
TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]


class HuggingFaceAdapter:
class HuggingFaceAdapter(BaseAdapter):
"""Generic adapter to load HuggingFace datasets with custom transformations.

This adapter loads datasets from HuggingFace Hub and applies a user-provided
Expand Down
7 changes: 4 additions & 3 deletions eval_protocol/adapters/langfuse.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from typing import Any, Dict, List, Optional, Protocol

from eval_protocol.models import EvaluationRow, InputMetadata, Message
from .base import BaseAdapter
from .utils import extract_messages_from_data

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -188,7 +189,7 @@ def get_final_generation_in_span(trace: TraceWithFullDetails, span_name: str) ->
return generations[-1]


class LangfuseAdapter:
class LangfuseAdapter(BaseAdapter):
"""Adapter to pull data from Langfuse and convert to EvaluationRow format.

This adapter can pull both chat conversations and tool calling traces from
Expand Down Expand Up @@ -433,8 +434,8 @@ def get_evaluation_rows_by_ids(
continue
return eval_rows

def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
"""Push evaluation scores back to Langfuse traces for tracking and analysis.
def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
"""Upload evaluation scores back to Langfuse traces for tracking and analysis.

Creates a score entry in Langfuse for each unique trace_id found in the evaluation
rows' session data. This allows you to see evaluation results directly in the
Expand Down
3 changes: 2 additions & 1 deletion eval_protocol/adapters/langsmith.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from typing import Any, Dict, List, Optional, Iterable

from eval_protocol.models import EvaluationRow, InputMetadata, Message
from .base import BaseAdapter

logger = logging.getLogger(__name__)

Expand All @@ -24,7 +25,7 @@
LANGSMITH_AVAILABLE = False


class LangSmithAdapter:
class LangSmithAdapter(BaseAdapter):
"""Adapter to pull data from LangSmith and convert to EvaluationRow format.

By default, fetches root runs from a project and maps inputs/outputs into
Expand Down
3 changes: 2 additions & 1 deletion eval_protocol/adapters/openai_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,15 @@
from openai.types.responses.tool import Tool

from eval_protocol.models import EvaluationRow, InputMetadata, Message
from .base import BaseAdapter

logger = logging.getLogger(__name__)


from openai import OpenAI


class OpenAIResponsesAdapter:
class OpenAIResponsesAdapter(BaseAdapter):
"""Adapter to pull data from OpenAI Responses API and convert to EvaluationRow format.

This adapter can pull both chat conversations and tool calling traces from
Expand Down
3 changes: 0 additions & 3 deletions eval_protocol/integrations/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
"""Integration helpers for Eval Protocol."""

from .braintrust import reward_fn_to_scorer, scorer_to_reward_fn
from .openeval import adapt
from .trl import create_trl_adapter

__all__ = [
"adapt",
"scorer_to_reward_fn",
"reward_fn_to_scorer",
"create_trl_adapter",
]
54 changes: 0 additions & 54 deletions eval_protocol/integrations/braintrust.py

This file was deleted.

Loading
Loading