eval-protocol
diff --git a/‎eval_protocol/adapters/braintrust.py‎
Lines changed: 181 additions & 152 deletions b/‎eval_protocol/adapters/braintrust.py‎
Lines changed: 181 additions & 152 deletions
@@ -1,23 +1,158 @@
 """Braintrust adapter for Eval Protocol.
 
-This adapter pulls traces from Braintrust projects and converts them
-to EvaluationRow format for evaluation pipelines.
+This adapter allows pulling data from Braintrust deployments and converting it
+to EvaluationRow format for use in evaluation pipelines.
 """
 
+import logging
 import os
-from datetime import datetime
-from typing import Any, Dict, Iterator, List, Optional
+import random
+import time
+from datetime import datetime, timedelta
+from typing import Any, Dict, List, Optional, Protocol
 
 import requests
 
 from eval_protocol.models import EvaluationRow, InputMetadata, Message
+from .utils import extract_messages_from_data
 
 # Keep backward compatibility
 from ..integrations.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
 
 
+logger = logging.getLogger(__name__)
+
+
+class TraceConverter(Protocol):
+    """Structural type for custom trace-to-EvaluationRow converter callables.
+
+    Any callable accepting ``(trace, include_tool_calls)`` positionally and
+    returning an EvaluationRow, or None to skip the trace, satisfies it.
+    """
+
+    def __call__(
+        self,
+        trace: Dict[str, Any],
+        include_tool_calls: bool,
+    ) -> Optional[EvaluationRow]:
+        """Convert one Braintrust trace into an EvaluationRow.
+
+        Args:
+            trace: The Braintrust trace object to convert
+            include_tool_calls: Whether to include tool calling information
+
+        Returns:
+            The converted EvaluationRow, or None to leave the trace out
+        """
+        ...
+
+
+def convert_trace_to_evaluation_row(trace: Dict[str, Any], include_tool_calls: bool = True) -> Optional[EvaluationRow]:
+    """Convert a Braintrust trace to EvaluationRow format.
+
+    Args:
+        trace: Braintrust trace object
+        include_tool_calls: Whether to include tool calling information (messages and tools)
+
+    Returns:
+        EvaluationRow, or None if the trace has no messages or conversion fails
+    """
+    try:
+        messages = extract_messages_from_trace(trace, include_tool_calls)
+        # Nothing to evaluate without a conversation; skip before touching metadata
+        if not messages:
+            return None
+
+        # Tools live in metadata["tools"], falling back to
+        # metadata["hidden_params"]["optional_params"]["tools"]
+        tools = None
+        if include_tool_calls:
+            # "or {}" because a key that is present with a null value bypasses the
+            # .get() default and would otherwise discard the whole trace
+            metadata = trace.get("metadata") or {}
+            hidden_params = metadata.get("hidden_params") or {}
+            optional_params = hidden_params.get("optional_params") or {}
+            tools = metadata.get("tools") or optional_params.get("tools")
+
+        return EvaluationRow(
+            messages=messages,
+            tools=tools,
+            input_metadata=InputMetadata(
+                session_data={
+                    "braintrust_trace_id": trace.get("id"),
+                }
+            ),
+        )
+
+    except (AttributeError, ValueError, KeyError) as e:
+        logger.error("Error converting trace %s: %s", trace.get("id", "unknown"), e)
+        return None
+
+
+def extract_messages_from_trace(trace: Dict[str, Any], include_tool_calls: bool = True) -> List[Message]:
+    """Extract messages from Braintrust trace input and output.
+
+    Args:
+        trace: Braintrust trace object
+        include_tool_calls: Whether to include tool calling information
+
+    Returns:
+        List of Message objects; empty when the trace lacks an input or an
+        output message
+    """
+    messages: List[Message] = []
+
+    try:
+        # Look for complete conversations (input + output arrays)
+        input_data = trace.get("input")
+
+        # Only a non-empty list whose first element is a dict can carry a
+        # "message"; the isinstance guard keeps scalar outputs (e.g. a numeric
+        # score) from raising an uncaught TypeError in len()/indexing
+        output_data = None
+        output_list = trace.get("output")
+        if isinstance(output_list, list) and output_list and isinstance(output_list[0], dict):
+            output_data = output_list[0].get("message")
+
+        # Skip spans without meaningful conversation data
+        if not input_data or not output_data:
+            return messages
+
+        # Both halves are non-empty here, so no further checks are needed
+        messages.extend(extract_messages_from_data(input_data, include_tool_calls))
+        messages.extend(extract_messages_from_data(output_data, include_tool_calls))
+
+    except (AttributeError, ValueError, KeyError) as e:
+        logger.warning("Error processing trace %s: %s", trace.get("id", "unknown"), e)
+
+    return messages
+
+
 class BraintrustAdapter:
-    """Minimal adapter to pull traces from Braintrust."""
+    """Adapter to pull data from Braintrust and convert to EvaluationRow format.
+
+    This adapter can pull both chat conversations and tool calling traces from
+    Braintrust deployments and convert them into the EvaluationRow format expected
+    by the evaluation protocol.
+
+    Examples:
+        Basic usage:
+        >>> adapter = BraintrustAdapter(
+        ...     api_key="your_api_key",
+        ...     project_id="your_project_id"
+        ... )
+        >>> btql_query = "select: * from: project_logs('your_project_id') traces limit: 10"
+        >>> rows = adapter.get_evaluation_rows(btql_query)
+
+        Using BTQL for custom queries:
+        >>> btql_query = '''
+        ... select: *
+        ... from: project_logs('your_project_id') traces
+        ... filter: metadata.agent_name = 'agent_instance'
+        ... limit: 50
+        ... '''
+        >>> rows = adapter.get_evaluation_rows(btql_query)
+    """
 
     def __init__(
         self,
@@ -30,177 +165,71 @@ def __init__(
         Args:
             api_key: Braintrust API key (defaults to BRAINTRUST_API_KEY env var)
             api_url: Braintrust API URL (defaults to BRAINTRUST_API_URL env var)
-            project_id: Project ID to fetch logs from
+            project_id: Project ID to fetch logs from (defaults to BRAINTRUST_PROJECT_ID env var)
         """
         self.api_key = api_key or os.getenv("BRAINTRUST_API_KEY")
         self.api_url = api_url or os.getenv("BRAINTRUST_API_URL", "https://api.braintrust.dev")
-        self.project_id = project_id
+        self.project_id = project_id or os.getenv("BRAINTRUST_PROJECT_ID")
 
         if not self.api_key:
             raise ValueError("BRAINTRUST_API_KEY environment variable or api_key parameter required")
+        if not self.project_id:
+            raise ValueError("BRAINTRUST_PROJECT_ID environment variable or project_id parameter required")
 
     def get_evaluation_rows(
         self,
-        project_id: Optional[str] = None,
-        limit: Optional[int] = None,
-        from_timestamp: Optional[datetime] = None,
-        to_timestamp: Optional[datetime] = None,
-    ) -> Iterator[EvaluationRow]:
-        """Fetch traces from Braintrust and convert to EvaluationRow format."""
-        project_id = project_id or self.project_id
-        if not project_id:
-            raise ValueError("project_id required")
-
-        # Prepare query parameters for GET request
-        params = {"limit": 1000}
-        if from_timestamp:
-            params["from_timestamp"] = int(from_timestamp.timestamp())
-        if to_timestamp:
-            params["to_timestamp"] = int(to_timestamp.timestamp())
-
-        # Fetch logs from Braintrust using GET endpoint
-        headers = {"Authorization": f"Bearer {self.api_key}"}
-
-        url = f"{self.api_url}/v1/project_logs/{project_id}/fetch"
-
-        response = requests.get(url, headers=headers, params=params)
-        response.raise_for_status()
-
-        logs = response.json()
-
-        # Convert each log to EvaluationRow
-        for log in logs.get("events", []):
-            if log.get("metadata", {}).get("agent_name") == "agent_instance":
-                try:
-                    eval_row = self._convert_log_to_evaluation_row(log)
-                    if eval_row:
-                        yield eval_row
-                except Exception as e:
-                    print(f"Warning: Failed to convert log {log.get('id', 'unknown')}: {e}")
-                    continue
-
-    def _convert_log_to_evaluation_row(self, log: Dict[str, Any]) -> Optional[EvaluationRow]:
-        """Convert a Braintrust log to EvaluationRow format."""
-        # Extract messages from the log
-        messages = self._extract_messages(log)
-        if not messages:
-            return None
+        btql_query: str,
+        include_tool_calls: bool = True,
+        converter: Optional[TraceConverter] = None,
+    ) -> List[EvaluationRow]:
+        """Get evaluation rows using a custom BTQL query.
 
-        # Extract metadata (pulling nothing currently)
-        input_metadata = InputMetadata(
-            row_id=log.get("id"),
-            completion_params=log.get("metadata", {}),
-            dataset_info={
-                "braintrust_log_id": log.get("id"),
-                "braintrust_project_id": self.project_id,
-                "span_id": log.get("span_id"),
-                "trace_id": log.get("root_span_id"),
-            },
-        )
-
-        # Extract ground truth from metadata
-        metadata = log.get("metadata", {})
-        ground_truth = metadata.get("ground_truth")
+        Args:
+            btql_query: The BTQL query string to execute
+            include_tool_calls: Whether to include tool calling information
+            converter: Optional custom TraceConverter; defaults to convert_trace_to_evaluation_row
 
-        return EvaluationRow(
-            messages=messages,
-            input_metadata=input_metadata,
-            ground_truth=str(ground_truth) if ground_truth else None,
-        )
+        Returns:
+            List[EvaluationRow]: Converted rows; traces that fail to convert are skipped
+        """
+        eval_rows = []
 
-    def _extract_messages(self, log: Dict[str, Any]) -> List[Message]:
-        """Extract conversation messages from a Braintrust log."""
-        messages = []
+        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
 
-        # Look for complete conversations (input + output arrays)
-        input_data = log.get("input")
-        output_data = log.get("output")
+        # (connect, read) timeout so a stalled connection cannot hang the caller forever
+        response = requests.post(f"{self.api_url}/btql", headers=headers, json={"query": btql_query, "fmt": "json"}, timeout=(10, 300))
+        response.raise_for_status()
+        query_response = response.json()
 
-        # Skip spans without meaningful conversation data
-        if not input_data or not output_data:
-            return []
-
-        # Extract input messages (usually just user message)
-        if isinstance(input_data, list):
-            for msg in input_data:
-                if isinstance(msg, dict) and "role" in msg and "content" in msg:
-                    messages.append(Message(role=msg["role"], content=str(msg["content"])))
-
-        # Extract output messages (assistant + tool responses)
-        if isinstance(output_data, list):
-            for msg in output_data:
-                if isinstance(msg, dict) and "role" in msg:
-                    # Handle tool calls in assistant messages
-                    tool_calls = msg.get("tool_calls") if msg["role"] == "assistant" else None
-                    tool_call_id = msg.get("tool_call_id") if msg["role"] == "tool" else None
-                    name = msg.get("name") if msg["role"] == "tool" else None
-
-                    messages.append(
-                        Message(
-                            role=msg["role"],
-                            content=str(msg.get("content", "")),
-                            tool_calls=tool_calls,
-                            tool_call_id=tool_call_id,
-                            name=name,
-                        )
-                    )
-
-        return messages
-
-    def create_score(
-        self,
-        log_id: str,
-        name: str,
-        value: float,
-        comment: Optional[str] = None,
-        project_id: Optional[str] = None,
-    ) -> bool:
-        """Create a score/feedback for a Braintrust log entry.
+        if not query_response or not query_response.get("data"):
+            logger.debug("No data returned from BTQL query")
+            return eval_rows
 
-        Args:
-            log_id: The ID of the log entry to score
-            name: The score name/type
-            value: The score value
-            comment: Optional comment explaining the score
-            project_id: Project ID (overrides instance default)
+        all_traces = query_response["data"]
+        logger.debug("BTQL query returned %d traces", len(all_traces))
 
-        Returns:
-            True if successful, False otherwise
-        """
-        project_id = project_id or self.project_id
-        if not project_id:
-            raise ValueError("project_id required")
-
-        # Prepare feedback data - API expects "feedback" array
-        feedback_item = {
-            "id": log_id,
-            "name": name,
-            "value": value,
-        }
-        if comment:
-            feedback_item["comment"] = comment
-
-        feedback_data = {"feedback": [feedback_item]}
-
-        # Post feedback to Braintrust
-        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
+        # Convert each trace; one malformed trace must not abort the whole batch
+        for trace in all_traces:
+            try:
+                if converter:
+                    eval_row = converter(trace, include_tool_calls)
+                else:
+                    eval_row = convert_trace_to_evaluation_row(trace, include_tool_calls)
+                if eval_row:
+                    eval_rows.append(eval_row)
+            except (AttributeError, TypeError, ValueError, KeyError) as e:
+                logger.warning("Failed to convert trace %s: %s", trace.get("id", "unknown"), e)
 
-        try:
-            url = f"{self.api_url}/v1/project_logs/{project_id}/feedback"
-            response = requests.post(url, headers=headers, json=feedback_data)
-            response.raise_for_status()
-            return True
-        except Exception as e:
-            print(f"Error creating Braintrust score: {e}")
-            return False
+        logger.info("Successfully processed %d BTQL results into %d evaluation rows", len(all_traces), len(eval_rows))
+        return eval_rows
 
 
 def create_braintrust_adapter(
     api_key: Optional[str] = None,
     api_url: Optional[str] = None,
     project_id: Optional[str] = None,
 ) -> BraintrustAdapter:
-    """Create a BraintrustAdapter instance."""
+    """Factory function to create a Braintrust adapter."""
     return BraintrustAdapter(
         api_key=api_key,
         api_url=api_url,