Skip to content

Commit 520bd1c

Browse files
committed
braintrust example
1 parent 1e9472c commit 520bd1c

File tree

6 files changed

+447
-272
lines changed

6 files changed

+447
-272
lines changed

eval_protocol/adapters/braintrust.py

Lines changed: 181 additions & 152 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,158 @@
11
"""Braintrust adapter for Eval Protocol.
22
3-
This adapter pulls traces from Braintrust projects and converts them
4-
to EvaluationRow format for evaluation pipelines.
3+
This adapter allows pulling data from Braintrust deployments and converting it
4+
to EvaluationRow format for use in evaluation pipelines.
55
"""
66

7+
import logging
78
import os
8-
from datetime import datetime
9-
from typing import Any, Dict, Iterator, List, Optional
9+
import random
10+
import time
11+
from datetime import datetime, timedelta
12+
from typing import Any, Dict, List, Optional, Protocol
1013

1114
import requests
1215

1316
from eval_protocol.models import EvaluationRow, InputMetadata, Message
17+
from .utils import extract_messages_from_data
1418

1519
# Keep backward compatibility
1620
from ..integrations.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
1721

1822

23+
logger = logging.getLogger(__name__)
24+
25+
26+
class TraceConverter(Protocol):
    """Structural type for callables that turn a Braintrust trace into a row.

    Any callable with a matching signature satisfies this protocol; it does
    not need to inherit from it. Returning ``None`` tells the caller to skip
    the trace.
    """

    def __call__(
        self,
        trace: Dict[str, Any],
        include_tool_calls: bool,
    ) -> Optional[EvaluationRow]:
        """Build an EvaluationRow from a single Braintrust trace.

        Args:
            trace: The Braintrust trace object to convert.
            include_tool_calls: Whether tool calling information should be
                carried over into the resulting row.

        Returns:
            The converted EvaluationRow, or None when the trace should be
            skipped.
        """
        ...
48+
49+
50+
def convert_trace_to_evaluation_row(trace: Dict[str, Any], include_tool_calls: bool = True) -> Optional[EvaluationRow]:
    """Convert a Braintrust trace to EvaluationRow format.

    Messages are taken from the trace via ``extract_messages_from_trace``.
    When ``include_tool_calls`` is set, tool definitions are read from
    ``metadata["tools"]`` and, if that is empty, from
    ``metadata["hidden_params"]["optional_params"]["tools"]``.

    Args:
        trace: Braintrust trace object
        include_tool_calls: Whether to include tool calling information

    Returns:
        EvaluationRow, or None if the trace yields no messages or the
        conversion fails (failures are logged, not raised).
    """
    try:
        messages = extract_messages_from_trace(trace, include_tool_calls)

        # Nothing to evaluate: bail out before doing any metadata work.
        if not messages:
            return None

        tools = None
        if include_tool_calls:
            # `dict.get(key, {})` only applies the default when the key is
            # absent. A key that is present with a null value would yield
            # None and make the next `.get` raise, discarding an otherwise
            # valid trace, so normalise with `or {}` at every level.
            metadata = trace.get("metadata") or {}
            tools = metadata.get("tools")
            if not tools:
                hidden_params = metadata.get("hidden_params") or {}
                optional_params = hidden_params.get("optional_params") or {}
                tools = optional_params.get("tools")

        return EvaluationRow(
            messages=messages,
            tools=tools,
            input_metadata=InputMetadata(
                session_data={
                    "braintrust_trace_id": trace.get("id"),
                }
            ),
        )

    except (AttributeError, ValueError, KeyError) as e:
        logger.error("Error converting trace %s: %s", trace.get("id", "unknown"), e)
        return None
90+
91+
92+
def extract_messages_from_trace(trace: Dict[str, Any], include_tool_calls: bool = True) -> List[Message]:
    """Extract messages from Braintrust trace input and output.

    The conversation is assembled from ``trace["input"]`` followed by the
    ``"message"`` entry of the first element of ``trace["output"]``. Both
    sides must be present; otherwise the trace is treated as having no
    conversation.

    Args:
        trace: Braintrust trace object
        include_tool_calls: Whether to include tool calling information

    Returns:
        List of Message objects. Empty when the trace has no usable input or
        output, or when message extraction fails (the failure is logged).
    """
    input_data = trace.get("input")
    output = trace.get("output")

    # Only a non-empty list whose first element is a dict carries a message.
    # Checking the type explicitly keeps scalar outputs (which have no len())
    # and dict outputs (which have no [0]) from raising; they simply mean
    # "no conversation here".
    output_data = None
    if isinstance(output, list) and output and isinstance(output[0], dict):
        output_data = output[0].get("message")

    # Skip spans without meaningful conversation data
    if not input_data or not output_data:
        return []

    try:
        messages = list(extract_messages_from_data(input_data, include_tool_calls))
        messages.extend(extract_messages_from_data(output_data, include_tool_calls))
    except (AttributeError, ValueError, KeyError) as e:
        logger.warning("Error processing trace %s: %s", trace.get("id", "unknown"), e)
        # Do not hand back a half-built conversation: the guard above requires
        # both sides, so a failure on either side means no usable messages.
        return []

    return messages
129+
130+
19131
class BraintrustAdapter:
20-
"""Minimal adapter to pull traces from Braintrust."""
132+
"""Adapter to pull data from Braintrust and convert to EvaluationRow format.
133+
134+
This adapter can pull both chat conversations and tool calling traces from
135+
Braintrust deployments and convert them into the EvaluationRow format expected
136+
by the evaluation protocol.
137+
138+
Examples:
139+
Basic usage:
140+
>>> adapter = BraintrustAdapter(
141+
... api_key="your_api_key",
142+
... project_id="your_project_id"
143+
... )
144+
>>> btql_query = "select: * from: project_logs('your_project_id') traces limit: 10"
145+
>>> rows = adapter.get_evaluation_rows(btql_query)
146+
147+
Using BTQL for custom queries:
148+
>>> btql_query = '''
149+
... select: *
150+
... from: project_logs('your_project_id') traces
151+
... filter: metadata.agent_name = 'agent_instance'
152+
... limit: 50
153+
... '''
154+
>>> rows = adapter.get_evaluation_rows(btql_query)
155+
"""
21156

22157
def __init__(
23158
self,
@@ -30,177 +165,71 @@ def __init__(
30165
Args:
31166
api_key: Braintrust API key (defaults to BRAINTRUST_API_KEY env var)
32167
api_url: Braintrust API URL (defaults to BRAINTRUST_API_URL env var)
33-
project_id: Project ID to fetch logs from
168+
project_id: Project ID to fetch logs from (defaults to BRAINTRUST_PROJECT_ID env var)
34169
"""
35170
self.api_key = api_key or os.getenv("BRAINTRUST_API_KEY")
36171
self.api_url = api_url or os.getenv("BRAINTRUST_API_URL", "https://api.braintrust.dev")
37-
self.project_id = project_id
172+
self.project_id = project_id or os.getenv("BRAINTRUST_PROJECT_ID")
38173

39174
if not self.api_key:
40175
raise ValueError("BRAINTRUST_API_KEY environment variable or api_key parameter required")
176+
if not self.project_id:
177+
raise ValueError("BRAINTRUST_PROJECT_ID environment variable or project_id parameter required")
41178

42179
def get_evaluation_rows(
43180
self,
44-
project_id: Optional[str] = None,
45-
limit: Optional[int] = None,
46-
from_timestamp: Optional[datetime] = None,
47-
to_timestamp: Optional[datetime] = None,
48-
) -> Iterator[EvaluationRow]:
49-
"""Fetch traces from Braintrust and convert to EvaluationRow format."""
50-
project_id = project_id or self.project_id
51-
if not project_id:
52-
raise ValueError("project_id required")
53-
54-
# Prepare query parameters for GET request
55-
params = {"limit": 1000}
56-
if from_timestamp:
57-
params["from_timestamp"] = int(from_timestamp.timestamp())
58-
if to_timestamp:
59-
params["to_timestamp"] = int(to_timestamp.timestamp())
60-
61-
# Fetch logs from Braintrust using GET endpoint
62-
headers = {"Authorization": f"Bearer {self.api_key}"}
63-
64-
url = f"{self.api_url}/v1/project_logs/{project_id}/fetch"
65-
66-
response = requests.get(url, headers=headers, params=params)
67-
response.raise_for_status()
68-
69-
logs = response.json()
70-
71-
# Convert each log to EvaluationRow
72-
for log in logs.get("events", []):
73-
if log.get("metadata", {}).get("agent_name") == "agent_instance":
74-
try:
75-
eval_row = self._convert_log_to_evaluation_row(log)
76-
if eval_row:
77-
yield eval_row
78-
except Exception as e:
79-
print(f"Warning: Failed to convert log {log.get('id', 'unknown')}: {e}")
80-
continue
81-
82-
def _convert_log_to_evaluation_row(self, log: Dict[str, Any]) -> Optional[EvaluationRow]:
83-
"""Convert a Braintrust log to EvaluationRow format."""
84-
# Extract messages from the log
85-
messages = self._extract_messages(log)
86-
if not messages:
87-
return None
181+
btql_query: str,
182+
include_tool_calls: bool = True,
183+
converter: Optional[TraceConverter] = None,
184+
) -> List[EvaluationRow]:
185+
"""Get evaluation rows using a custom BTQL query.
88186
89-
# Extract metadata (pulling nothing currently)
90-
input_metadata = InputMetadata(
91-
row_id=log.get("id"),
92-
completion_params=log.get("metadata", {}),
93-
dataset_info={
94-
"braintrust_log_id": log.get("id"),
95-
"braintrust_project_id": self.project_id,
96-
"span_id": log.get("span_id"),
97-
"trace_id": log.get("root_span_id"),
98-
},
99-
)
100-
101-
# Extract ground truth from metadata
102-
metadata = log.get("metadata", {})
103-
ground_truth = metadata.get("ground_truth")
187+
Args:
188+
btql_query: The BTQL query string to execute
189+
include_tool_calls: Whether to include tool calling information
190+
converter: Optional custom converter implementing TraceConverter protocol
104191
105-
return EvaluationRow(
106-
messages=messages,
107-
input_metadata=input_metadata,
108-
ground_truth=str(ground_truth) if ground_truth else None,
109-
)
192+
Returns:
193+
List[EvaluationRow]: Converted evaluation rows
194+
"""
195+
eval_rows = []
110196

111-
def _extract_messages(self, log: Dict[str, Any]) -> List[Message]:
112-
"""Extract conversation messages from a Braintrust log."""
113-
messages = []
197+
headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
114198

115-
# Look for complete conversations (input + output arrays)
116-
input_data = log.get("input")
117-
output_data = log.get("output")
199+
response = requests.post(f"{self.api_url}/btql", headers=headers, json={"query": btql_query, "fmt": "json"})
200+
response.raise_for_status()
201+
query_response = response.json()
118202

119-
# Skip spans without meaningful conversation data
120-
if not input_data or not output_data:
121-
return []
122-
123-
# Extract input messages (usually just user message)
124-
if isinstance(input_data, list):
125-
for msg in input_data:
126-
if isinstance(msg, dict) and "role" in msg and "content" in msg:
127-
messages.append(Message(role=msg["role"], content=str(msg["content"])))
128-
129-
# Extract output messages (assistant + tool responses)
130-
if isinstance(output_data, list):
131-
for msg in output_data:
132-
if isinstance(msg, dict) and "role" in msg:
133-
# Handle tool calls in assistant messages
134-
tool_calls = msg.get("tool_calls") if msg["role"] == "assistant" else None
135-
tool_call_id = msg.get("tool_call_id") if msg["role"] == "tool" else None
136-
name = msg.get("name") if msg["role"] == "tool" else None
137-
138-
messages.append(
139-
Message(
140-
role=msg["role"],
141-
content=str(msg.get("content", "")),
142-
tool_calls=tool_calls,
143-
tool_call_id=tool_call_id,
144-
name=name,
145-
)
146-
)
147-
148-
return messages
149-
150-
def create_score(
151-
self,
152-
log_id: str,
153-
name: str,
154-
value: float,
155-
comment: Optional[str] = None,
156-
project_id: Optional[str] = None,
157-
) -> bool:
158-
"""Create a score/feedback for a Braintrust log entry.
203+
if not query_response or not query_response.get("data"):
204+
logger.debug("No data returned from BTQL query")
205+
return eval_rows
159206

160-
Args:
161-
log_id: The ID of the log entry to score
162-
name: The score name/type
163-
value: The score value
164-
comment: Optional comment explaining the score
165-
project_id: Project ID (overrides instance default)
207+
all_traces = query_response["data"]
208+
logger.debug("BTQL query returned %d traces", len(all_traces))
166209

167-
Returns:
168-
True if successful, False otherwise
169-
"""
170-
project_id = project_id or self.project_id
171-
if not project_id:
172-
raise ValueError("project_id required")
173-
174-
# Prepare feedback data - API expects "feedback" array
175-
feedback_item = {
176-
"id": log_id,
177-
"name": name,
178-
"value": value,
179-
}
180-
if comment:
181-
feedback_item["comment"] = comment
182-
183-
feedback_data = {"feedback": [feedback_item]}
184-
185-
# Post feedback to Braintrust
186-
headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
210+
# Process each selected trace
211+
for trace in all_traces:
212+
try:
213+
if converter:
214+
eval_row = converter(trace, include_tool_calls)
215+
else:
216+
eval_row = convert_trace_to_evaluation_row(trace, include_tool_calls)
217+
if eval_row:
218+
eval_rows.append(eval_row)
219+
except (AttributeError, ValueError, KeyError) as e:
220+
logger.warning("Failed to convert trace %s: %s", trace.get("id", "unknown"), e)
221+
continue
187222

188-
try:
189-
url = f"{self.api_url}/v1/project_logs/{project_id}/feedback"
190-
response = requests.post(url, headers=headers, json=feedback_data)
191-
response.raise_for_status()
192-
return True
193-
except Exception as e:
194-
print(f"Error creating Braintrust score: {e}")
195-
return False
223+
logger.info("Successfully processed %d BTQL results into %d evaluation rows", len(all_traces), len(eval_rows))
224+
return eval_rows
196225

197226

198227
def create_braintrust_adapter(
199228
api_key: Optional[str] = None,
200229
api_url: Optional[str] = None,
201230
project_id: Optional[str] = None,
202231
) -> BraintrustAdapter:
203-
"""Create a BraintrustAdapter instance."""
232+
"""Factory function to create a Braintrust adapter."""
204233
return BraintrustAdapter(
205234
api_key=api_key,
206235
api_url=api_url,

0 commit comments

Comments
 (0)