Skip to content

Commit 19d491a

Browse files
committed
Various small refactors
1 parent 71faaf6 commit 19d491a

File tree

11 files changed

+153
-149
lines changed

11 files changed

+153
-149
lines changed

eval_protocol/adapters/__init__.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,26 @@
44
and converting them to EvaluationRow format for use in evaluation pipelines.
55
66
Available adapters:
7+
- BaseAdapter: Abstract base class for all adapters
78
- LangfuseAdapter: Pull data from Langfuse deployments
89
- HuggingFaceAdapter: Load datasets from HuggingFace Hub
910
- BigQueryAdapter: Query data from Google BigQuery
1011
- Braintrust integration (legacy)
1112
- TRL integration (legacy)
1213
"""
1314

15+
# Always available
16+
from .base import BaseAdapter
17+
18+
__all__ = ["BaseAdapter"]
19+
1420
# Conditional imports based on available dependencies
1521
try:
1622
from .langfuse import LangfuseAdapter, create_langfuse_adapter
1723

18-
__all__ = ["LangfuseAdapter", "create_langfuse_adapter"]
24+
__all__.extend(["LangfuseAdapter", "create_langfuse_adapter"])
1925
except ImportError:
20-
__all__ = []
26+
pass
2127

2228
try:
2329
from .huggingface import (
@@ -55,9 +61,9 @@
5561

5662
# Legacy adapters (exported only when their imports succeed)
5763
try:
58-
from .braintrust import reward_fn_to_scorer, scorer_to_reward_fn
64+
from .braintrust import BraintrustAdapter, create_braintrust_adapter, reward_fn_to_scorer, scorer_to_reward_fn
5965

60-
__all__.extend(["scorer_to_reward_fn", "reward_fn_to_scorer"])
66+
__all__.extend(["BraintrustAdapter", "create_braintrust_adapter", "scorer_to_reward_fn", "reward_fn_to_scorer"])
6167
except ImportError:
6268
pass
6369

eval_protocol/adapters/base.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
"""
2+
Base adapter interface for Eval Protocol.
3+
"""
4+
5+
from abc import ABC, abstractmethod
6+
from typing import List
7+
8+
from eval_protocol.models import EvaluationRow
9+
10+
11+
class BaseAdapter(ABC):
    """Common interface shared by every Eval Protocol adapter.

    Subclasses must implement ``get_evaluation_rows``. ``push_scores`` is an
    optional hook: the default implementation does nothing.
    """

    @abstractmethod
    def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
        """Return the evaluation rows read from the adapter's data source.

        Positional and keyword arguments are adapter-specific.
        """
        ...

    def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
        """Report evaluation scores back to the data source.

        The base implementation is a no-op; adapters that can write results
        back override it.

        Args:
            rows: Evaluation rows the score was computed from.
            model_name: Name of the model the score belongs to.
            mean_score: Mean score to report.
        """
        return None

eval_protocol/adapters/bigquery.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from typing import Any, Callable, Dict, Iterator, List, Optional, TypeAlias
1111

1212
from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
13+
from .base import BaseAdapter
1314

1415
logger = logging.getLogger(__name__)
1516

@@ -42,7 +43,7 @@
4243
TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
4344

4445

45-
class BigQueryAdapter:
46+
class BigQueryAdapter(BaseAdapter):
4647
"""Adapter to query data from Google BigQuery and convert to EvaluationRow format.
4748
4849
This adapter connects to Google BigQuery, executes SQL queries, and applies

eval_protocol/adapters/braintrust.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import requests
1515

1616
from eval_protocol.models import EvaluationRow, InputMetadata, Message
17+
from .base import BaseAdapter
1718
from .utils import extract_messages_from_data
1819

1920
# Keep backward compatibility
@@ -128,7 +129,7 @@ def extract_messages_from_trace(trace: Dict[str, Any], include_tool_calls: bool
128129
return messages
129130

130131

131-
class BraintrustAdapter:
132+
class BraintrustAdapter(BaseAdapter):
132133
"""Adapter to pull data from Braintrust and convert to EvaluationRow format.
133134
134135
This adapter can pull both chat conversations and tool calling traces from
@@ -223,6 +224,49 @@ def get_evaluation_rows(
223224
logger.info("Successfully processed %d BTQL results into %d evaluation rows", len(all_traces), len(eval_rows))
224225
return eval_rows
225226

227+
def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
    """Push evaluation scores back to Braintrust traces for tracking and analysis.

    Creates one feedback entry per unique trace ID found in the evaluation
    rows' session data and sends them to Braintrust in a single request, so
    evaluation results show up in the Braintrust UI alongside the original
    traces.

    Args:
        rows: List of EvaluationRow objects with session_data containing trace IDs
        model_name: Name of the model (used as the score name in Braintrust)
        mean_score: The calculated mean score to push to Braintrust

    Note:
        This is best-effort: rows without an evaluation result, session data,
        or a ``braintrust_trace_id`` are skipped, and any failure (including a
        request timeout) is logged as a warning rather than raised.
    """
    # Upper bound (seconds) on the HTTP call so a stalled connection cannot
    # block the evaluation run indefinitely.
    request_timeout = 30

    try:
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        # Collect unique trace IDs in first-seen order. Rows whose session
        # data has no trace ID are skipped individually; previously a single
        # such row raised KeyError and aborted the push for every row.
        # NOTE(review): assumes session_data is a dict (supports .get) — confirm.
        trace_ids = {}
        for row in rows:
            if not (row.evaluation_result and row.input_metadata and row.input_metadata.session_data):
                continue
            trace_id = row.input_metadata.session_data.get("braintrust_trace_id")
            if trace_id:
                trace_ids[trace_id] = None

        if not trace_ids:
            # Nothing to report; avoid sending an empty request.
            return

        payload = {
            "feedback": [{"id": trace_id, "scores": {model_name: mean_score}} for trace_id in trace_ids],
        }

        response = requests.post(
            f"{self.api_url}/v1/project_logs/{self.project_id}/feedback",
            headers=headers,
            json=payload,
            timeout=request_timeout,
        )
        response.raise_for_status()

    except Exception as e:
        # Deliberate catch-all: score reporting must never break an evaluation.
        logger.warning("Failed to push scores to Braintrust: %s", e)
269+
226270

227271
def create_braintrust_adapter(
228272
api_key: Optional[str] = None,

eval_protocol/adapters/huggingface.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from typing import Any, Callable, Dict, Iterator, List, Optional
99

1010
from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
11+
from .base import BaseAdapter
1112

1213
logger = logging.getLogger(__name__)
1314

@@ -23,7 +24,7 @@
2324
TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
2425

2526

26-
class HuggingFaceAdapter:
27+
class HuggingFaceAdapter(BaseAdapter):
2728
"""Generic adapter to load HuggingFace datasets with custom transformations.
2829
2930
This adapter loads datasets from HuggingFace Hub and applies a user-provided

eval_protocol/adapters/langfuse.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from typing import Any, Dict, List, Optional, Protocol
1313

1414
from eval_protocol.models import EvaluationRow, InputMetadata, Message
15+
from .base import BaseAdapter
1516
from .utils import extract_messages_from_data
1617

1718
logger = logging.getLogger(__name__)
@@ -188,7 +189,7 @@ def get_final_generation_in_span(trace: TraceWithFullDetails, span_name: str) ->
188189
return generations[-1]
189190

190191

191-
class LangfuseAdapter:
192+
class LangfuseAdapter(BaseAdapter):
192193
"""Adapter to pull data from Langfuse and convert to EvaluationRow format.
193194
194195
This adapter can pull both chat conversations and tool calling traces from

eval_protocol/adapters/langsmith.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from typing import Any, Dict, List, Optional, Iterable
1414

1515
from eval_protocol.models import EvaluationRow, InputMetadata, Message
16+
from .base import BaseAdapter
1617

1718
logger = logging.getLogger(__name__)
1819

@@ -24,7 +25,7 @@
2425
LANGSMITH_AVAILABLE = False
2526

2627

27-
class LangSmithAdapter:
28+
class LangSmithAdapter(BaseAdapter):
2829
"""Adapter to pull data from LangSmith and convert to EvaluationRow format.
2930
3031
By default, fetches root runs from a project and maps inputs/outputs into

eval_protocol/adapters/openai_responses.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,15 @@
2121
from openai.types.responses.tool import Tool
2222

2323
from eval_protocol.models import EvaluationRow, InputMetadata, Message
24+
from .base import BaseAdapter
2425

2526
logger = logging.getLogger(__name__)
2627

2728

2829
from openai import OpenAI
2930

3031

31-
class OpenAIResponsesAdapter:
32+
class OpenAIResponsesAdapter(BaseAdapter):
3233
"""Adapter to pull data from OpenAI Responses API and convert to EvaluationRow format.
3334
3435
This adapter can pull both chat conversations and tool calling traces from

eval_protocol/quickstart/llm_judge.py

Lines changed: 11 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -2,65 +2,23 @@
22
Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto.
33
"""
44

5-
from collections.abc import Awaitable, Callable
6-
import os
7-
from datetime import datetime
8-
from typing import List, Dict, Any, Optional
9-
from typing_extensions import cast
105
from tqdm import tqdm
6+
from typing import Optional
117

12-
import pytest
13-
14-
from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult
15-
from eval_protocol.pytest import evaluation_test
16-
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
8+
from eval_protocol.models import EvaluationRow
9+
from eval_protocol.adapters.base import BaseAdapter
1710
from eval_protocol.quickstart.utils import (
18-
split_multi_turn_rows,
1911
JUDGE_CONFIGS,
2012
calculate_bootstrap_scores,
2113
run_judgment_async,
2214
)
2315
import asyncio
2416
from openai import AsyncOpenAI
25-
from eval_protocol.adapters.langfuse import create_langfuse_adapter
26-
27-
adapter = create_langfuse_adapter()
28-
29-
30-
@pytest.mark.asyncio
31-
@evaluation_test(
32-
input_rows=[
33-
adapter.get_evaluation_rows(
34-
to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
35-
limit=711,
36-
sample_size=50,
37-
sleep_between_gets=3.0,
38-
max_retries=5,
39-
)
40-
],
41-
completion_params=[
42-
{"model": "gpt-4.1"},
43-
{
44-
"max_tokens": 131000,
45-
"extra_body": {"reasoning_effort": "medium"},
46-
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
47-
},
48-
{
49-
"max_tokens": 131000,
50-
"extra_body": {"reasoning_effort": "low"},
51-
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
52-
},
53-
],
54-
rollout_processor=SingleTurnRolloutProcessor(),
55-
preprocess_fn=split_multi_turn_rows,
56-
max_concurrent_rollouts=64,
57-
mode="all",
58-
)
59-
async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
60-
return await aha_judge(rows)
6117

6218

63-
async def aha_judge(rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro") -> list[EvaluationRow]:
19+
async def aha_judge(
20+
rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro", adapter: Optional[BaseAdapter] = None
21+
) -> list[EvaluationRow]:
6422
"""
6523
LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons.
6624
@@ -73,6 +31,8 @@ async def aha_judge(rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro
7331
7432
Args:
7533
rows: List of EvaluationRow objects with messages, ground_truth, and tools
34+
judge_name: Name of the judge configuration to use
35+
adapter: Optional adapter to push scores back to (if provided)
7636
7737
Returns:
7838
Same rows with updated evaluation_result containing scores and judgments
@@ -133,7 +93,8 @@ async def run_judgment(row):
13393
if row.evaluation_result:
13494
row.evaluation_result.score = mean_score
13595

136-
# Optional, push scores back to Langfuse. Note that one score per model will be pushed back onto same trace.
137-
adapter.push_scores(rows, model_name, mean_score)
96+
# Push scores back to adapter if provided. Note that one score per model will be pushed back onto same trace.
97+
if adapter:
98+
adapter.push_scores(rows, model_name, mean_score)
13899

139100
return rows

0 commit comments

Comments
 (0)