11"""Braintrust adapter for Eval Protocol.
22
3- This adapter pulls traces from Braintrust projects and converts them
4- to EvaluationRow format for evaluation pipelines.
3+ This adapter allows pulling data from Braintrust deployments and converting it
4+ to EvaluationRow format for use in evaluation pipelines.
55"""
66
7+ import logging
78import os
8- from datetime import datetime
9- from typing import Any , Dict , Iterator , List , Optional
9+ import random
10+ import time
11+ from datetime import datetime , timedelta
12+ from typing import Any , Dict , List , Optional , Protocol
1013
1114import requests
1215
1316from eval_protocol .models import EvaluationRow , InputMetadata , Message
17+ from .utils import extract_messages_from_data
1418
1519# Keep backward compatibility
1620from ..integrations .braintrust import reward_fn_to_scorer , scorer_to_reward_fn
1721
1822
23+ logger = logging .getLogger (__name__ )
24+
25+
class TraceConverter(Protocol):
    """Structural type for pluggable trace-to-row conversion callables.

    Any callable that accepts a Braintrust trace dictionary together with the
    ``include_tool_calls`` flag and returns either an ``EvaluationRow`` or
    ``None`` (meaning "skip this trace") satisfies this protocol.
    """

    def __call__(
        self,
        trace: Dict[str, Any],
        include_tool_calls: bool,
    ) -> Optional[EvaluationRow]:
        """Turn a single Braintrust trace into an EvaluationRow.

        Args:
            trace: The Braintrust trace object to convert.
            include_tool_calls: Whether tool calling information should be
                included in the result.

        Returns:
            The converted EvaluationRow, or None if the trace should be
            skipped.
        """
        ...
48+
49+
def convert_trace_to_evaluation_row(trace: Dict[str, Any], include_tool_calls: bool = True) -> Optional[EvaluationRow]:
    """Convert a Braintrust trace to EvaluationRow format.

    Messages are obtained through ``extract_messages_from_trace``. When
    ``include_tool_calls`` is set, tool definitions are read from
    ``metadata["tools"]`` and, if that is empty, from
    ``metadata["hidden_params"]["optional_params"]["tools"]``.

    Args:
        trace: Braintrust trace object.
        include_tool_calls: Whether to include tool calling information.

    Returns:
        An EvaluationRow whose ``input_metadata.session_data`` stores the
        trace id under ``"braintrust_trace_id"``, or None when the trace
        yields no messages or conversion fails.
    """
    try:
        messages = extract_messages_from_trace(trace, include_tool_calls)

        # Nothing to evaluate: bail out before touching the metadata at all.
        if not messages:
            return None

        tools = None
        if include_tool_calls:
            # ``dict.get(key, {})`` only falls back when the key is absent.
            # A key that is present with a None value would make the chained
            # ``.get`` calls raise and drop an otherwise valid trace, so
            # normalise with ``or {}`` at every level.
            metadata = trace.get("metadata") or {}
            tools = metadata.get("tools")
            if not tools:
                hidden_params = metadata.get("hidden_params") or {}
                optional_params = hidden_params.get("optional_params") or {}
                tools = optional_params.get("tools")

        return EvaluationRow(
            messages=messages,
            tools=tools,
            input_metadata=InputMetadata(
                session_data={
                    "braintrust_trace_id": trace.get("id"),
                }
            ),
        )

    except (AttributeError, ValueError, KeyError) as e:
        # The trace itself may be the malformed object; never let the log
        # call raise a second exception from inside the handler.
        trace_id = trace.get("id", "unknown") if isinstance(trace, dict) else "unknown"
        logger.error("Error converting trace %s: %s", trace_id, e)
        return None
90+
91+
def extract_messages_from_trace(trace: Dict[str, Any], include_tool_calls: bool = True) -> List[Message]:
    """Extract messages from Braintrust trace input and output.

    The trace's ``"input"`` value and the ``"message"`` field of the first
    entry of its ``"output"`` list are each passed to
    ``extract_messages_from_data``; the results are concatenated input-first.

    Args:
        trace: Braintrust trace object.
        include_tool_calls: Whether to include tool calling information.

    Returns:
        List of Message objects. The list is empty when the trace has no
        input, or when ``"output"`` is not a non-empty list whose first entry
        is a dict with a non-empty ``"message"``. If extraction raises one of
        the handled errors, a warning is logged and whatever was collected so
        far is returned.
    """
    messages: List[Message] = []

    try:
        input_data = trace.get("input")

        # Only the first output entry is consulted. Require an actual list so
        # that scalar or dict-shaped outputs are treated as "no conversation"
        # instead of raising TypeError/KeyError from len() or [0].
        output_data = None
        output_list = trace.get("output")
        if isinstance(output_list, list) and output_list:
            first_output = output_list[0]
            if isinstance(first_output, dict):
                output_data = first_output.get("message")

        # Skip spans without meaningful conversation data
        if not input_data or not output_data:
            return messages

        messages.extend(extract_messages_from_data(input_data, include_tool_calls))
        messages.extend(extract_messages_from_data(output_data, include_tool_calls))

    except (AttributeError, ValueError, KeyError) as e:
        # Guard the id lookup: if ``trace`` is not a dict, calling ``.get`` here
        # would raise again from inside the handler.
        trace_id = trace.get("id", "unknown") if isinstance(trace, dict) else "unknown"
        logger.warning("Error processing trace %s: %s", trace_id, e)

    return messages
129+
130+
19131class BraintrustAdapter :
20- """Minimal adapter to pull traces from Braintrust."""
132+ """Adapter to pull data from Braintrust and convert to EvaluationRow format.
133+
134+ This adapter can pull both chat conversations and tool calling traces from
135+ Braintrust deployments and convert them into the EvaluationRow format expected
136+ by the evaluation protocol.
137+
138+ Examples:
139+ Basic usage:
140+ >>> adapter = BraintrustAdapter(
141+ ... api_key="your_api_key",
142+ ... project_id="your_project_id"
143+ ... )
144+ >>> btql_query = "select: * from: project_logs('your_project_id') traces limit: 10"
145+ >>> rows = adapter.get_evaluation_rows(btql_query)
146+
147+ Using BTQL for custom queries:
148+ >>> btql_query = '''
149+ ... select: *
150+ ... from: project_logs('your_project_id') traces
151+ ... filter: metadata.agent_name = 'agent_instance'
152+ ... limit: 50
153+ ... '''
154+ >>> rows = adapter.get_evaluation_rows(btql_query)
155+ """
21156
22157 def __init__ (
23158 self ,
@@ -30,177 +165,71 @@ def __init__(
30165 Args:
31166 api_key: Braintrust API key (defaults to BRAINTRUST_API_KEY env var)
32167 api_url: Braintrust API URL (defaults to BRAINTRUST_API_URL env var)
33- project_id: Project ID to fetch logs from
168+ project_id: Project ID to fetch logs from (defaults to BRAINTRUST_PROJECT_ID env var)
34169 """
35170 self .api_key = api_key or os .getenv ("BRAINTRUST_API_KEY" )
36171 self .api_url = api_url or os .getenv ("BRAINTRUST_API_URL" , "https://api.braintrust.dev" )
37- self .project_id = project_id
172+ self .project_id = project_id or os . getenv ( "BRAINTRUST_PROJECT_ID" )
38173
39174 if not self .api_key :
40175 raise ValueError ("BRAINTRUST_API_KEY environment variable or api_key parameter required" )
176+ if not self .project_id :
177+ raise ValueError ("BRAINTRUST_PROJECT_ID environment variable or project_id parameter required" )
41178
42179 def get_evaluation_rows (
43180 self ,
44- project_id : Optional [str ] = None ,
45- limit : Optional [int ] = None ,
46- from_timestamp : Optional [datetime ] = None ,
47- to_timestamp : Optional [datetime ] = None ,
48- ) -> Iterator [EvaluationRow ]:
49- """Fetch traces from Braintrust and convert to EvaluationRow format."""
50- project_id = project_id or self .project_id
51- if not project_id :
52- raise ValueError ("project_id required" )
53-
54- # Prepare query parameters for GET request
55- params = {"limit" : 1000 }
56- if from_timestamp :
57- params ["from_timestamp" ] = int (from_timestamp .timestamp ())
58- if to_timestamp :
59- params ["to_timestamp" ] = int (to_timestamp .timestamp ())
60-
61- # Fetch logs from Braintrust using GET endpoint
62- headers = {"Authorization" : f"Bearer { self .api_key } " }
63-
64- url = f"{ self .api_url } /v1/project_logs/{ project_id } /fetch"
65-
66- response = requests .get (url , headers = headers , params = params )
67- response .raise_for_status ()
68-
69- logs = response .json ()
70-
71- # Convert each log to EvaluationRow
72- for log in logs .get ("events" , []):
73- if log .get ("metadata" , {}).get ("agent_name" ) == "agent_instance" :
74- try :
75- eval_row = self ._convert_log_to_evaluation_row (log )
76- if eval_row :
77- yield eval_row
78- except Exception as e :
79- print (f"Warning: Failed to convert log { log .get ('id' , 'unknown' )} : { e } " )
80- continue
81-
82- def _convert_log_to_evaluation_row (self , log : Dict [str , Any ]) -> Optional [EvaluationRow ]:
83- """Convert a Braintrust log to EvaluationRow format."""
84- # Extract messages from the log
85- messages = self ._extract_messages (log )
86- if not messages :
87- return None
181+ btql_query : str ,
182+ include_tool_calls : bool = True ,
183+ converter : Optional [TraceConverter ] = None ,
184+ ) -> List [EvaluationRow ]:
185+ """Get evaluation rows using a custom BTQL query.
88186
89- # Extract metadata (pulling nothing currently)
90- input_metadata = InputMetadata (
91- row_id = log .get ("id" ),
92- completion_params = log .get ("metadata" , {}),
93- dataset_info = {
94- "braintrust_log_id" : log .get ("id" ),
95- "braintrust_project_id" : self .project_id ,
96- "span_id" : log .get ("span_id" ),
97- "trace_id" : log .get ("root_span_id" ),
98- },
99- )
100-
101- # Extract ground truth from metadata
102- metadata = log .get ("metadata" , {})
103- ground_truth = metadata .get ("ground_truth" )
187+ Args:
188+ btql_query: The BTQL query string to execute
189+ include_tool_calls: Whether to include tool calling information
190+ converter: Optional custom converter implementing TraceConverter protocol
104191
105- return EvaluationRow (
106- messages = messages ,
107- input_metadata = input_metadata ,
108- ground_truth = str (ground_truth ) if ground_truth else None ,
109- )
192+ Returns:
193+ List[EvaluationRow]: Converted evaluation rows
194+ """
195+ eval_rows = []
110196
111- def _extract_messages (self , log : Dict [str , Any ]) -> List [Message ]:
112- """Extract conversation messages from a Braintrust log."""
113- messages = []
197+ headers = {"Authorization" : f"Bearer { self .api_key } " , "Content-Type" : "application/json" }
114198
115- # Look for complete conversations (input + output arrays )
116- input_data = log . get ( "input" )
117- output_data = log . get ( "output" )
199+ response = requests . post ( f" { self . api_url } /btql" , headers = headers , json = { "query" : btql_query , "fmt" : "json" } )
200+ response . raise_for_status ( )
201+ query_response = response . json ( )
118202
119- # Skip spans without meaningful conversation data
120- if not input_data or not output_data :
121- return []
122-
123- # Extract input messages (usually just user message)
124- if isinstance (input_data , list ):
125- for msg in input_data :
126- if isinstance (msg , dict ) and "role" in msg and "content" in msg :
127- messages .append (Message (role = msg ["role" ], content = str (msg ["content" ])))
128-
129- # Extract output messages (assistant + tool responses)
130- if isinstance (output_data , list ):
131- for msg in output_data :
132- if isinstance (msg , dict ) and "role" in msg :
133- # Handle tool calls in assistant messages
134- tool_calls = msg .get ("tool_calls" ) if msg ["role" ] == "assistant" else None
135- tool_call_id = msg .get ("tool_call_id" ) if msg ["role" ] == "tool" else None
136- name = msg .get ("name" ) if msg ["role" ] == "tool" else None
137-
138- messages .append (
139- Message (
140- role = msg ["role" ],
141- content = str (msg .get ("content" , "" )),
142- tool_calls = tool_calls ,
143- tool_call_id = tool_call_id ,
144- name = name ,
145- )
146- )
147-
148- return messages
149-
150- def create_score (
151- self ,
152- log_id : str ,
153- name : str ,
154- value : float ,
155- comment : Optional [str ] = None ,
156- project_id : Optional [str ] = None ,
157- ) -> bool :
158- """Create a score/feedback for a Braintrust log entry.
203+ if not query_response or not query_response .get ("data" ):
204+ logger .debug ("No data returned from BTQL query" )
205+ return eval_rows
159206
160- Args:
161- log_id: The ID of the log entry to score
162- name: The score name/type
163- value: The score value
164- comment: Optional comment explaining the score
165- project_id: Project ID (overrides instance default)
207+ all_traces = query_response ["data" ]
208+ logger .debug ("BTQL query returned %d traces" , len (all_traces ))
166209
167- Returns:
168- True if successful, False otherwise
169- """
170- project_id = project_id or self .project_id
171- if not project_id :
172- raise ValueError ("project_id required" )
173-
174- # Prepare feedback data - API expects "feedback" array
175- feedback_item = {
176- "id" : log_id ,
177- "name" : name ,
178- "value" : value ,
179- }
180- if comment :
181- feedback_item ["comment" ] = comment
182-
183- feedback_data = {"feedback" : [feedback_item ]}
184-
185- # Post feedback to Braintrust
186- headers = {"Authorization" : f"Bearer { self .api_key } " , "Content-Type" : "application/json" }
210+ # Process each selected trace
211+ for trace in all_traces :
212+ try :
213+ if converter :
214+ eval_row = converter (trace , include_tool_calls )
215+ else :
216+ eval_row = convert_trace_to_evaluation_row (trace , include_tool_calls )
217+ if eval_row :
218+ eval_rows .append (eval_row )
219+ except (AttributeError , ValueError , KeyError ) as e :
220+ logger .warning ("Failed to convert trace %s: %s" , trace .get ("id" , "unknown" ), e )
221+ continue
187222
188- try :
189- url = f"{ self .api_url } /v1/project_logs/{ project_id } /feedback"
190- response = requests .post (url , headers = headers , json = feedback_data )
191- response .raise_for_status ()
192- return True
193- except Exception as e :
194- print (f"Error creating Braintrust score: { e } " )
195- return False
223+ logger .info ("Successfully processed %d BTQL results into %d evaluation rows" , len (all_traces ), len (eval_rows ))
224+ return eval_rows
196225
197226
198227def create_braintrust_adapter (
199228 api_key : Optional [str ] = None ,
200229 api_url : Optional [str ] = None ,
201230 project_id : Optional [str ] = None ,
202231) -> BraintrustAdapter :
203- """Create a BraintrustAdapter instance ."""
232+ """Factory function to create a Braintrust adapter ."""
204233 return BraintrustAdapter (
205234 api_key = api_key ,
206235 api_url = api_url ,
0 commit comments