Skip to content

Commit bcda711

Browse files
authored
Various small refactors (#182)
Summary of changes:
* Various small refactors
* Rename `push_scores` to `upload_scores`
* Remove the old Braintrust integration
* Address review comments; remove a stale comment
1 parent 71faaf6 commit bcda711

23 files changed

+158
-499
lines changed

docs/integrations/braintrust_integration.mdx

Lines changed: 0 additions & 49 deletions
This file was deleted.

eval_protocol/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010

1111
import warnings
1212

13-
from eval_protocol.adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
14-
1513
from .auth import get_fireworks_account_id, get_fireworks_api_key
1614
from .common_utils import load_jsonl
1715
from .config import RewardKitConfig, get_config, load_config
@@ -49,8 +47,6 @@
4947
"EvaluateResult",
5048
"reward_function",
5149
"RewardFunction",
52-
"scorer_to_reward_fn",
53-
"reward_fn_to_scorer",
5450
# Authentication
5551
"get_fireworks_api_key",
5652
"get_fireworks_account_id",

eval_protocol/adapters/__init__.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,25 @@
44
and converting them to EvaluationRow format for use in evaluation pipelines.
55
66
Available adapters:
7+
- BaseAdapter: Abstract base class for all adapters
78
- LangfuseAdapter: Pull data from Langfuse deployments
89
- HuggingFaceAdapter: Load datasets from HuggingFace Hub
910
- BigQueryAdapter: Query data from Google BigQuery
10-
- Braintrust integration (legacy)
1111
- TRL integration (legacy)
1212
"""
1313

14+
# Always available
15+
from .base import BaseAdapter
16+
17+
__all__ = ["BaseAdapter"]
18+
1419
# Conditional imports based on available dependencies
1520
try:
1621
from .langfuse import LangfuseAdapter, create_langfuse_adapter
1722

18-
__all__ = ["LangfuseAdapter", "create_langfuse_adapter"]
23+
__all__.extend(["LangfuseAdapter", "create_langfuse_adapter"])
1924
except ImportError:
20-
__all__ = []
25+
pass
2126

2227
try:
2328
from .huggingface import (
@@ -53,14 +58,15 @@
5358
except ImportError:
5459
pass
5560

56-
# Legacy adapters (always available)
5761
try:
58-
from .braintrust import reward_fn_to_scorer, scorer_to_reward_fn
62+
from .braintrust import BraintrustAdapter, create_braintrust_adapter
5963

60-
__all__.extend(["scorer_to_reward_fn", "reward_fn_to_scorer"])
64+
__all__.extend(["BraintrustAdapter", "create_braintrust_adapter"])
6165
except ImportError:
6266
pass
6367

68+
# Legacy adapters (always available)
69+
6470
try:
6571
from .trl import create_trl_adapter
6672

eval_protocol/adapters/base.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
"""
Base adapter interface for Eval Protocol.
"""

from abc import ABC, abstractmethod
from typing import List

from eval_protocol.models import EvaluationRow


class BaseAdapter(ABC):
    """Common interface shared by every Eval Protocol adapter.

    Concrete adapters must implement :meth:`get_evaluation_rows`; overriding
    :meth:`upload_scores` is optional — the default implementation is a no-op
    for sources that do not support score write-back.
    """

    @abstractmethod
    def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
        """Fetch evaluation rows from the underlying data source.

        Returns:
            A list of ``EvaluationRow`` objects ready for evaluation.
        """
        ...

    def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
        """Push computed evaluation scores back to the data source.

        Args:
            rows: The rows that were evaluated.
            model_name: Model identifier used to label the score entry.
            mean_score: Aggregate score to record.
        """
        return None

eval_protocol/adapters/bigquery.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from typing import Any, Callable, Dict, Iterator, List, Optional, TypeAlias
1111

1212
from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
13+
from .base import BaseAdapter
1314

1415
logger = logging.getLogger(__name__)
1516

@@ -42,7 +43,7 @@
4243
TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
4344

4445

45-
class BigQueryAdapter:
46+
class BigQueryAdapter(BaseAdapter):
4647
"""Adapter to query data from Google BigQuery and convert to EvaluationRow format.
4748
4849
This adapter connects to Google BigQuery, executes SQL queries, and applies

eval_protocol/adapters/braintrust.py

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,9 @@
1414
import requests
1515

1616
from eval_protocol.models import EvaluationRow, InputMetadata, Message
17+
from .base import BaseAdapter
1718
from .utils import extract_messages_from_data
1819

19-
# Keep backward compatibility
20-
from ..integrations.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
21-
2220

2321
logger = logging.getLogger(__name__)
2422

@@ -128,7 +126,7 @@ def extract_messages_from_trace(trace: Dict[str, Any], include_tool_calls: bool
128126
return messages
129127

130128

131-
class BraintrustAdapter:
129+
class BraintrustAdapter(BaseAdapter):
132130
"""Adapter to pull data from Braintrust and convert to EvaluationRow format.
133131
134132
This adapter can pull both chat conversations and tool calling traces from
@@ -223,6 +221,49 @@ def get_evaluation_rows(
223221
logger.info("Successfully processed %d BTQL results into %d evaluation rows", len(all_traces), len(eval_rows))
224222
return eval_rows
225223

224+
def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
225+
"""Upload evaluation scores back to Braintrust traces for tracking and analysis.
226+
227+
Creates score entries in Braintrust for each unique trace_id found in the evaluation
228+
rows' session data. This allows you to see evaluation results directly in the
229+
Braintrust UI alongside the original traces.
230+
231+
Args:
232+
rows: List of EvaluationRow objects with session_data containing trace IDs
233+
model_name: Name of the model (used as the score name in Braintrust)
234+
mean_score: The calculated mean score to push to Braintrust
235+
236+
Note:
237+
Silently handles errors if rows lack session data
238+
"""
239+
try:
240+
headers = {
241+
"Authorization": f"Bearer {self.api_key}",
242+
"Content-Type": "application/json",
243+
}
244+
245+
feedback_items = []
246+
for trace_id in set(
247+
row.input_metadata.session_data["braintrust_trace_id"]
248+
for row in rows
249+
if row.evaluation_result and row.input_metadata and row.input_metadata.session_data
250+
):
251+
if trace_id:
252+
feedback_items.append({"id": trace_id, "scores": {model_name: mean_score}})
253+
254+
if feedback_items:
255+
payload = {"feedback": feedback_items}
256+
257+
response = requests.post(
258+
f"{self.api_url}/v1/project_logs/{self.project_id}/feedback",
259+
headers=headers,
260+
json=payload,
261+
)
262+
response.raise_for_status()
263+
264+
except Exception as e:
265+
logger.warning("Failed to push scores to Braintrust: %s", e)
266+
226267

227268
def create_braintrust_adapter(
228269
api_key: Optional[str] = None,
@@ -237,4 +278,4 @@ def create_braintrust_adapter(
237278
)
238279

239280

240-
__all__ = ["scorer_to_reward_fn", "reward_fn_to_scorer", "BraintrustAdapter", "create_braintrust_adapter"]
281+
__all__ = ["BraintrustAdapter", "create_braintrust_adapter"]

eval_protocol/adapters/huggingface.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from typing import Any, Callable, Dict, Iterator, List, Optional
99

1010
from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
11+
from .base import BaseAdapter
1112

1213
logger = logging.getLogger(__name__)
1314

@@ -23,7 +24,7 @@
2324
TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
2425

2526

26-
class HuggingFaceAdapter:
27+
class HuggingFaceAdapter(BaseAdapter):
2728
"""Generic adapter to load HuggingFace datasets with custom transformations.
2829
2930
This adapter loads datasets from HuggingFace Hub and applies a user-provided

eval_protocol/adapters/langfuse.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from typing import Any, Dict, List, Optional, Protocol
1313

1414
from eval_protocol.models import EvaluationRow, InputMetadata, Message
15+
from .base import BaseAdapter
1516
from .utils import extract_messages_from_data
1617

1718
logger = logging.getLogger(__name__)
@@ -188,7 +189,7 @@ def get_final_generation_in_span(trace: TraceWithFullDetails, span_name: str) ->
188189
return generations[-1]
189190

190191

191-
class LangfuseAdapter:
192+
class LangfuseAdapter(BaseAdapter):
192193
"""Adapter to pull data from Langfuse and convert to EvaluationRow format.
193194
194195
This adapter can pull both chat conversations and tool calling traces from
@@ -433,8 +434,8 @@ def get_evaluation_rows_by_ids(
433434
continue
434435
return eval_rows
435436

436-
def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
437-
"""Push evaluation scores back to Langfuse traces for tracking and analysis.
437+
def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
438+
"""Upload evaluation scores back to Langfuse traces for tracking and analysis.
438439
439440
Creates a score entry in Langfuse for each unique trace_id found in the evaluation
440441
rows' session data. This allows you to see evaluation results directly in the

eval_protocol/adapters/langsmith.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from typing import Any, Dict, List, Optional, Iterable
1414

1515
from eval_protocol.models import EvaluationRow, InputMetadata, Message
16+
from .base import BaseAdapter
1617

1718
logger = logging.getLogger(__name__)
1819

@@ -24,7 +25,7 @@
2425
LANGSMITH_AVAILABLE = False
2526

2627

27-
class LangSmithAdapter:
28+
class LangSmithAdapter(BaseAdapter):
2829
"""Adapter to pull data from LangSmith and convert to EvaluationRow format.
2930
3031
By default, fetches root runs from a project and maps inputs/outputs into

eval_protocol/adapters/openai_responses.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,15 @@
2121
from openai.types.responses.tool import Tool
2222

2323
from eval_protocol.models import EvaluationRow, InputMetadata, Message
24+
from .base import BaseAdapter
2425

2526
logger = logging.getLogger(__name__)
2627

2728

2829
from openai import OpenAI
2930

3031

31-
class OpenAIResponsesAdapter:
32+
class OpenAIResponsesAdapter(BaseAdapter):
3233
"""Adapter to pull data from OpenAI Responses API and convert to EvaluationRow format.
3334
3435
This adapter can pull both chat conversations and tool calling traces from

0 commit comments

Comments
 (0)