From 67ac6190e41dbb25cb559805b7ad3403e06df4b6 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 8 Sep 2025 01:57:10 -0700
Subject: [PATCH 1/3] braintrust example

---
 eval_protocol/adapters/braintrust.py          | 209 +++++++++++++++-
 pyproject.toml                                |   3 +
 tests/chinook/braintrust/generate_traces.py   | 225 ++++++++++++++++++
 .../braintrust/test_braintrust_chinook.py     | 133 +++++++++++
 tests/chinook/langfuse/generate_traces.py     |   8 -
 uv.lock                                       | 105 +++++++-
 6 files changed, 671 insertions(+), 12 deletions(-)
 create mode 100644 tests/chinook/braintrust/generate_traces.py
 create mode 100644 tests/chinook/braintrust/test_braintrust_chinook.py

diff --git a/eval_protocol/adapters/braintrust.py b/eval_protocol/adapters/braintrust.py
index bc444cfa..b4d34921 100644
--- a/eval_protocol/adapters/braintrust.py
+++ b/eval_protocol/adapters/braintrust.py
@@ -1,8 +1,211 @@
-"""Deprecated adapter wrappers for Braintrust.
+"""Braintrust adapter for Eval Protocol.
 
-This module forwards imports to :mod:`eval_protocol.integrations.braintrust`.
+This adapter pulls traces from Braintrust projects and converts them
+to EvaluationRow format for evaluation pipelines.
 """
 
+import os
+from datetime import datetime
+from typing import Any, Dict, Iterator, List, Optional
+
+import requests
+
+from eval_protocol.models import EvaluationRow, InputMetadata, Message
+
+# Keep backward compatibility
 from ..integrations.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
 
-__all__ = ["scorer_to_reward_fn", "reward_fn_to_scorer"]
+
+class BraintrustAdapter:
+    """Minimal adapter to pull traces from Braintrust and push scores back."""
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        api_url: Optional[str] = None,
+        project_id: Optional[str] = None,
+    ):
+        """Initialize the Braintrust adapter.
+
+        Args:
+            api_key: Braintrust API key (defaults to BRAINTRUST_API_KEY env var)
+            api_url: Braintrust API URL (defaults to BRAINTRUST_API_URL env var)
+            project_id: Project ID to fetch logs from
+
+        Raises:
+            ValueError: If no API key is passed and BRAINTRUST_API_KEY is unset
+        """
+        self.api_key = api_key or os.getenv("BRAINTRUST_API_KEY")
+        # Strip trailing slashes so f"{api_url}/v1/..." never contains "//".
+        self.api_url = (api_url or os.getenv("BRAINTRUST_API_URL", "https://api.braintrust.dev")).rstrip("/")
+        self.project_id = project_id
+
+        if not self.api_key:
+            raise ValueError("BRAINTRUST_API_KEY environment variable or api_key parameter required")
+
+    def get_evaluation_rows(
+        self,
+        project_id: Optional[str] = None,
+        limit: Optional[int] = None,
+        from_timestamp: Optional[datetime] = None,
+        to_timestamp: Optional[datetime] = None,
+    ) -> Iterator[EvaluationRow]:
+        """Fetch traces from Braintrust and convert to EvaluationRow format.
+
+        Only events whose metadata "agent_name" is "agent_instance" are converted;
+        ``limit`` caps the events requested (1000 when omitted). HTTP errors propagate.
+        """
+        project_id = project_id or self.project_id
+        if not project_id:
+            raise ValueError("project_id required")
+
+        params: Dict[str, Any] = {"limit": 1000 if limit is None else limit}
+        # NOTE(review): confirm the fetch endpoint honors these timestamp filters.
+        if from_timestamp:
+            params["from_timestamp"] = int(from_timestamp.timestamp())
+        if to_timestamp:
+            params["to_timestamp"] = int(to_timestamp.timestamp())
+
+        headers = {"Authorization": f"Bearer {self.api_key}"}
+        url = f"{self.api_url}/v1/project_logs/{project_id}/fetch"
+        response = requests.get(url, headers=headers, params=params, timeout=30)
+        response.raise_for_status()
+
+        for log in response.json().get("events", []):
+            # "metadata" may be present but null, so fall back to an empty dict.
+            if (log.get("metadata") or {}).get("agent_name") != "agent_instance":
+                continue
+            try:
+                eval_row = self._convert_log_to_evaluation_row(log, project_id)
+            except Exception as e:
+                print(f"Warning: Failed to convert log {log.get('id', 'unknown')}: {e}")
+                continue
+            if eval_row:
+                yield eval_row
+
+    def _convert_log_to_evaluation_row(
+        self, log: Dict[str, Any], project_id: Optional[str] = None
+    ) -> Optional[EvaluationRow]:
+        """Convert a Braintrust log to EvaluationRow format.
+
+        Returns None when the log has no messages; ``project_id`` defaults to the instance's.
+        """
+        messages = self._extract_messages(log)
+        if not messages:
+            return None
+
+        # "metadata" may be present but null; normalize once and reuse it.
+        metadata = log.get("metadata") or {}
+        input_metadata = InputMetadata(
+            row_id=log.get("id"),
+            completion_params=metadata,
+            dataset_info={
+                "braintrust_log_id": log.get("id"),
+                "braintrust_project_id": project_id or self.project_id,
+                "span_id": log.get("span_id"),
+                "trace_id": log.get("root_span_id"),
+            },
+        )
+
+        # Keep falsy-but-valid answers such as 0 or False; only a missing value maps to None.
+        ground_truth = metadata.get("ground_truth")
+        return EvaluationRow(
+            messages=messages,
+            input_metadata=input_metadata,
+            ground_truth=None if ground_truth is None else str(ground_truth),
+        )
+
+    def _extract_messages(self, log: Dict[str, Any]) -> List[Message]:
+        """Extract conversation messages; a null "content" becomes "" (not "None")."""
+        input_data = log.get("input")
+        output_data = log.get("output")
+        # Skip spans without meaningful conversation data
+        if not input_data or not output_data:
+            return []
+
+        def as_text(content: Any) -> str:
+            # str(None) would leak the literal text "None" into the transcript.
+            return "" if content is None else str(content)
+
+        messages: List[Message] = []
+
+        # Input messages (usually just the user message) need role and content.
+        if isinstance(input_data, list):
+            for msg in input_data:
+                if isinstance(msg, dict) and "role" in msg and "content" in msg:
+                    messages.append(Message(role=msg["role"], content=as_text(msg["content"])))
+
+        # Output: tool_calls only on assistant turns; tool_call_id/name only on tool turns.
+        if isinstance(output_data, list):
+            for msg in output_data:
+                if not isinstance(msg, dict) or "role" not in msg:
+                    continue
+                role = msg["role"]
+                messages.append(
+                    Message(
+                        role=role,
+                        content=as_text(msg.get("content")),
+                        tool_calls=msg.get("tool_calls") if role == "assistant" else None,
+                        tool_call_id=msg.get("tool_call_id") if role == "tool" else None,
+                        name=msg.get("name") if role == "tool" else None,
+                    )
+                )
+
+        return messages
+
+    def create_score(
+        self,
+        log_id: str,
+        name: str,
+        value: float,
+        comment: Optional[str] = None,
+        project_id: Optional[str] = None,
+    ) -> bool:
+        """Create a score/feedback for a Braintrust log entry.
+
+        Args:
+            log_id: The ID of the log entry to score
+            name: The score name/type
+            value: The score value
+            comment: Optional comment explaining the score
+            project_id: Project ID (overrides instance default)
+
+        Returns:
+            True if successful, False otherwise
+
+        Raises:
+            ValueError: If no project ID is given or set on the instance
+        """
+        project_id = project_id or self.project_id
+        if not project_id:
+            raise ValueError("project_id required")
+
+        # Feedback items carry scores as a {name: value} mapping (see the Braintrust REST API reference).
+        feedback_item: Dict[str, Any] = {"id": log_id, "scores": {name: value}}
+        if comment:
+            feedback_item["comment"] = comment
+        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
+        url = f"{self.api_url}/v1/project_logs/{project_id}/feedback"
+        try:
+            response = requests.post(url, headers=headers, json={"feedback": [feedback_item]}, timeout=30)
+            response.raise_for_status()
+        except Exception as e:
+            print(f"Error creating Braintrust score: {e}")
+            return False
+        return True
+
+
+def create_braintrust_adapter(
+    api_key: Optional[str] = None,
+    api_url: Optional[str] = None,
+    project_id: Optional[str] = None,
+) -> BraintrustAdapter:
+    """Build a BraintrustAdapter from the given (optional) settings.
+
+    Each argument is forwarded unchanged to the BraintrustAdapter constructor.
+    """
+    settings = {"api_key": api_key, "api_url": api_url, "project_id": project_id}
+    return BraintrustAdapter(**settings)
+
+
+__all__ = ["scorer_to_reward_fn", "reward_fn_to_scorer", "BraintrustAdapter", "create_braintrust_adapter"]
diff --git a/pyproject.toml b/pyproject.toml
index 256b8e40..0cd9678c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -133,6 +133,9 @@ chinook = [
 langchain = [
     "langchain-core>=0.3.0",
 ]
+braintrust = [
+    "braintrust[otel]",
+]
 
 [tool.pytest.ini_options]
 addopts = "-q"
diff --git a/tests/chinook/braintrust/generate_traces.py b/tests/chinook/braintrust/generate_traces.py
new file mode 100644
index 00000000..1e95b598
--- /dev/null
+++ b/tests/chinook/braintrust/generate_traces.py
@@ -0,0 +1,225 @@
+import os
+import pytest
+
+from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.pytest import evaluation_test
+from eval_protocol.pytest.default_pydantic_ai_rollout_processor import PydanticAgentRolloutProcessor
+
+from tests.chinook.dataset import collect_dataset
+from tests.chinook.pydantic.agent import setup_agent
+
+dataset = collect_dataset()
+current_idx = 0
+
+
+class SpanIDCapturingProcessor:
+    """Span processor that tags each "agent run" span with the next dataset ground truth."""
+
+    def on_start(self, span, parent_context=None):
+        """Tag "agent run" spans in start order; spans beyond the dataset stay untagged."""
+        global current_idx
+        if span.name == "agent run" and current_idx < len(dataset):
+            span.set_attribute("ground_truth", dataset[current_idx].ground_truth)
+            current_idx += 1
+
+    def on_end(self, span):
+        pass
+
+    def shutdown(self):
+        pass
+
+    def force_flush(self, timeout_millis=30000):
+        return True  # SpanProcessor.force_flush reports success as a bool; None reads as failure
+
+
+try:
+    from braintrust.otel import BraintrustSpanProcessor
+    from opentelemetry import trace
+    from opentelemetry.sdk.trace import TracerProvider
+    from pydantic_ai.agent import Agent
+    from braintrust import init_logger
+
+    BRAINTRUST_AVAILABLE = True  # every optional tracing import above succeeded
+    # Register a global tracer provider with both span processors attached.
+    provider = TracerProvider()
+    trace.set_tracer_provider(provider)
+    provider.add_span_processor(BraintrustSpanProcessor())  # pyright: ignore[reportArgumentType]
+    provider.add_span_processor(SpanIDCapturingProcessor())  # pyright: ignore[reportArgumentType]
+
+    logger = init_logger(project="default-otel-project")
+    # NOTE(review): presumably turns on instrumentation for all PydanticAI agents - confirm.
+    Agent.instrument_all()
+
+except ImportError:  # at least one optional tracing import failed
+    BRAINTRUST_AVAILABLE = False
+    # NOTE(review): nothing in this file calls setup_braintrust.
+    def setup_braintrust():
+        pass
+
+
+@pytest.mark.skipif(
+    os.environ.get("CI") == "true",
+    reason="Only run this test locally (skipped in CI)",
+)
+@pytest.mark.asyncio
+@evaluation_test(
+    input_rows=[collect_dataset()[0:1]],
+    completion_params=[
+        {
+            "model": {
+                "orchestrator_agent_model": {
+                    "model": "accounts/fireworks/models/kimi-k2-instruct",
+                    "provider": "fireworks",
+                }
+            }
+        },
+    ],
+    rollout_processor=PydanticAgentRolloutProcessor(),
+    rollout_processor_kwargs={"agent": setup_agent},
+    mode="pointwise",
+)
+async def test_complex_query_0(row: EvaluationRow) -> EvaluationRow:
+    """
+    Complex query 0 - ground truth is set by the span processor at span start; row returned as-is.
+    """
+    return row
+
+
+@pytest.mark.skipif(
+    os.environ.get("CI") == "true",
+    reason="Only run this test locally (skipped in CI)",
+)
+@pytest.mark.asyncio
+@evaluation_test(
+    input_rows=[collect_dataset()[1:2]],
+    completion_params=[
+        {
+            "model": {
+                "orchestrator_agent_model": {
+                    "model": "accounts/fireworks/models/kimi-k2-instruct",
+                    "provider": "fireworks",
+                }
+            }
+        },
+    ],
+    rollout_processor=PydanticAgentRolloutProcessor(),
+    rollout_processor_kwargs={"agent": setup_agent},
+    mode="pointwise",
+)
+async def test_complex_query_1(row: EvaluationRow) -> EvaluationRow:
+    """
+    Complex query 1 - PydanticAI automatically creates rich Braintrust traces; row returned as-is.
+    """
+    return row
+
+
+@pytest.mark.skipif(
+    os.environ.get("CI") == "true",
+    reason="Only run this test locally (skipped in CI)",
+)
+@pytest.mark.asyncio
+@evaluation_test(
+    input_rows=[collect_dataset()[2:3]],
+    completion_params=[
+        {
+            "model": {
+                "orchestrator_agent_model": {
+                    "model": "accounts/fireworks/models/kimi-k2-instruct",
+                    "provider": "fireworks",
+                }
+            }
+        },
+    ],
+    rollout_processor=PydanticAgentRolloutProcessor(),
+    rollout_processor_kwargs={"agent": setup_agent},
+    mode="pointwise",
+)
+async def test_complex_query_2(row: EvaluationRow) -> EvaluationRow:
+    """
+    Complex query 2 - PydanticAI automatically creates rich Braintrust traces; row returned as-is.
+    """
+    return row
+
+
+@pytest.mark.skipif(
+    os.environ.get("CI") == "true",
+    reason="Only run this test locally (skipped in CI)",
+)
+@pytest.mark.asyncio
+@evaluation_test(
+    input_rows=[collect_dataset()[3:4]],
+    completion_params=[
+        {
+            "model": {
+                "orchestrator_agent_model": {
+                    "model": "accounts/fireworks/models/kimi-k2-instruct",
+                    "provider": "fireworks",
+                }
+            }
+        },
+    ],
+    rollout_processor=PydanticAgentRolloutProcessor(),
+    rollout_processor_kwargs={"agent": setup_agent},
+    mode="pointwise",
+)
+async def test_complex_query_3(row: EvaluationRow) -> EvaluationRow:
+    """
+    Complex query 3 - PydanticAI automatically creates rich Braintrust traces; row returned as-is.
+    """
+    return row
+
+
+@pytest.mark.skipif(
+    os.environ.get("CI") == "true",
+    reason="Only run this test locally (skipped in CI)",
+)
+@pytest.mark.asyncio
+@evaluation_test(
+    input_rows=[collect_dataset()[4:5]],
+    completion_params=[
+        {
+            "model": {
+                "orchestrator_agent_model": {
+                    "model": "accounts/fireworks/models/kimi-k2-instruct",
+                    "provider": "fireworks",
+                }
+            }
+        },
+    ],
+    rollout_processor=PydanticAgentRolloutProcessor(),
+    rollout_processor_kwargs={"agent": setup_agent},
+    mode="pointwise",
+)
+async def test_complex_query_4(row: EvaluationRow) -> EvaluationRow:
+    """
+    Complex query 4 - PydanticAI automatically creates rich Braintrust traces; row returned as-is.
+    """
+    return row
+
+
+@pytest.mark.skipif(
+    os.environ.get("CI") == "true",
+    reason="Only run this test locally (skipped in CI)",
+)
+@pytest.mark.asyncio
+@evaluation_test(
+    input_rows=[collect_dataset()[5:6]],
+    completion_params=[
+        {
+            "model": {
+                "orchestrator_agent_model": {
+                    "model": "accounts/fireworks/models/kimi-k2-instruct",
+                    "provider": "fireworks",
+                }
+            }
+        },
+    ],
+    rollout_processor=PydanticAgentRolloutProcessor(),
+    rollout_processor_kwargs={"agent": setup_agent},
+    mode="pointwise",
+)
+async def test_complex_query_5(row: EvaluationRow) -> EvaluationRow:
+    """
+    Complex query 5 - PydanticAI automatically creates rich Braintrust traces; row returned as-is.
+    """
+    return row
diff --git a/tests/chinook/braintrust/test_braintrust_chinook.py b/tests/chinook/braintrust/test_braintrust_chinook.py
new file mode 100644
index 00000000..d9c2a77d
--- /dev/null
+++ b/tests/chinook/braintrust/test_braintrust_chinook.py
@@ -0,0 +1,133 @@
+import os
+from datetime import datetime, timedelta
+from typing import List, Any, Dict
+
+import pytest
+from pydantic import BaseModel
+from pydantic_ai import Agent
+from pydantic_ai.models.openai import OpenAIModel
+
+from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata
+from eval_protocol.pytest import evaluation_test, NoOpRolloutProcessor
+
+try:
+    from eval_protocol.adapters.braintrust import create_braintrust_adapter
+
+    BRAINTRUST_AVAILABLE = True
+except ImportError:
+    BRAINTRUST_AVAILABLE = False
+    create_braintrust_adapter = None
+
+
+class Response(BaseModel):  # NOTE(review): unused here - the test defines its own local Response
+    score: float  # judge score
+    reason: str  # judge explanation
+
+
+LLM_JUDGE_PROMPT = (
+    "Your job is to compare the response to the expected answer.\n"
+    "The response will be a narrative report of the query results.\n"
+    "If the response contains the same or well summarized information as the expected answer, return 1.0.\n"
+    "If the response does not contain the same information or is missing information, return 0.0."
+)
+
+
+def fetch_braintrust_traces_as_evaluation_rows(
+    hours_back: int = 24,
+    project_id: str = "df6863de-6ce2-4fcc-9995-1fa6605f8623",
+) -> List[EvaluationRow]:
+    """
+    Dataset adapter: Use BraintrustAdapter to fetch traces from project logs.
+
+    Covers the last ``hours_back`` hours of ``project_id`` (default: the example
+    project). Best effort: returns [] if the adapter is unavailable or fails.
+    """
+    if not BRAINTRUST_AVAILABLE or not create_braintrust_adapter:
+        print("⚠️ Braintrust unavailable - no traces to evaluate")
+        return []
+
+    try:
+        print("🧠 Using BraintrustAdapter to fetch Chinook traces")
+        adapter = create_braintrust_adapter(project_id=project_id)
+
+        # Query window: [now - hours_back, now]
+        now = datetime.now()
+        evaluation_rows = list(
+            adapter.get_evaluation_rows(
+                from_timestamp=now - timedelta(hours=hours_back),
+                to_timestamp=now,
+            )
+        )
+
+        print(f"✅ BraintrustAdapter extracted {len(evaluation_rows)} evaluation rows")
+        return evaluation_rows
+    except Exception as e:
+        print(f"❌ BraintrustAdapter failed: {e}")
+        return []
+
+
+@pytest.mark.skipif(
+    os.environ.get("CI") == "true",
+    reason="Only run this test locally (skipped in CI)",
+)
+@pytest.mark.asyncio
+@evaluation_test(
+    input_rows=[fetch_braintrust_traces_as_evaluation_rows(hours_back=168)],  # 1 week back
+    rollout_processor=NoOpRolloutProcessor(),  # No-op since traces already exist
+    mode="pointwise",
+)
+async def test_braintrust_trace_evaluation(row: EvaluationRow) -> EvaluationRow:
+    """
+    This test acts as an external evaluation pipeline for Braintrust traces.
+    It:
+    1. Gets traces from Braintrust (via dataset adapter)
+    2. Uses NoOpRolloutProcessor (traces already exist)
+    3. Evaluates each trace using same LLM judge as PydanticAI test
+    4. Stores the verdict on row.evaluation_result (nothing is pushed to Braintrust here)
+    """
+    # Same eval logic as PydanticAI example
+    last_assistant_message = row.last_assistant_message()
+    if last_assistant_message is None:
+        row.evaluation_result = EvaluateResult(
+            score=0.0,
+            reason="No assistant message found",
+        )
+    elif not last_assistant_message.content:
+        # Distinguish an empty reply from a missing one so failures are diagnosable.
+        row.evaluation_result = EvaluateResult(
+            score=0.0,
+            reason="Assistant message has no content",
+        )
+    else:
+        model = OpenAIModel(
+            "accounts/fireworks/models/kimi-k2-instruct",
+            provider="fireworks",
+        )
+
+        class Response(BaseModel):
+            """
+            A score between 0.0 and 1.0 indicating whether the response is correct.
+            """
+
+            # NOTE(review): shadows the module-level Response; consider keeping only one.
+            score: float
+
+            # A short explanation of why the response is correct or incorrect.
+            reason: str
+
+        comparison_agent = Agent(
+            model=model,
+            system_prompt=LLM_JUDGE_PROMPT,
+            output_type=Response,
+            output_retries=5,
+        )
+
+        result = await comparison_agent.run(
+            f"Expected answer: {row.ground_truth}\nResponse: {last_assistant_message.content}"
+        )
+        row.evaluation_result = EvaluateResult(
+            score=result.output.score,
+            reason=result.output.reason,
+        )
+
+    return row
diff --git a/tests/chinook/langfuse/generate_traces.py b/tests/chinook/langfuse/generate_traces.py
index e2d7d011..bb3cc299 100644
--- a/tests/chinook/langfuse/generate_traces.py
+++ b/tests/chinook/langfuse/generate_traces.py
@@ -29,14 +29,6 @@ def decorator(func):
         return decorator if args and callable(args[0]) else decorator
 
 
-LLM_JUDGE_PROMPT = (
-    "Your job is to compare the response to the expected answer.\n"
-    "The response will be a narrative report of the query results.\n"
-    "If the response contains the same or well summarized information as the expected answer, return 1.0.\n"
-    "If the response does not contain the same information or is missing information, return 0.0."
-)
-
-
 @pytest.mark.skipif(
     os.environ.get("CI") == "true",
     reason="Only run this test locally (skipped in CI)",
diff --git a/uv.lock b/uv.lock
index 5a01c075..36a84820 100644
--- a/uv.lock
+++ b/uv.lock
@@ -479,6 +479,33 @@ version = "2.3.5"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/dd/5a/ad8d3ef9c13d5afcc1e44a77f11792ee717f6727b3320bddbc607e935e2a/box2d-py-2.3.5.tar.gz", hash = "sha256:b37dc38844bcd7def48a97111d2b082e4f81cca3cece7460feb3eacda0da2207", size = 374446, upload-time = "2018-10-02T01:03:23.527Z" }
 
+[[package]]
+name = "braintrust"
+version = "0.2.6"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "chevron" },
+    { name = "exceptiongroup" },
+    { name = "gitpython" },
+    { name = "python-dotenv" },
+    { name = "python-slugify" },
+    { name = "requests" },
+    { name = "sseclient-py" },
+    { name = "tqdm" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e8/66/d8bf0a99a29ec7c6cbf47a1e9416ea2f9dcab7bf6d6539cd6a8e755ec22f/braintrust-0.2.6.tar.gz", hash = "sha256:e9c2ae3cce09a8562fa436bc6ec20039c495a6f82928ac36050b5fbc7b89743a", size = 180921, upload-time = "2025-08-27T18:13:01.804Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c2/86/8d4bb3b0ab6dd40d2d428bb0b48d29601ed2a344aa6c618378311a73844a/braintrust-0.2.6-py3-none-any.whl", hash = "sha256:d07f1666881c25a8d65a04cfb84f45a0016531928eb3133172942fae49f9a8f8", size = 210465, upload-time = "2025-08-27T18:13:00.503Z" },
+]
+
+[package.optional-dependencies]
+otel = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-exporter-otlp-proto-http" },
+    { name = "opentelemetry-sdk" },
+]
+
 [[package]]
 name = "brotli"
 version = "1.1.0"
@@ -732,6 +759,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload-time = "2025-05-02T08:34:40.053Z" },
 ]
 
+[[package]]
+name = "chevron"
+version = "0.14.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/15/1f/ca74b65b19798895d63a6e92874162f44233467c9e7c1ed8afd19016ebe9/chevron-0.14.0.tar.gz", hash = "sha256:87613aafdf6d77b6a90ff073165a61ae5086e21ad49057aa0e53681601800ebf", size = 11440, upload-time = "2021-01-02T22:47:59.233Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/52/93/342cc62a70ab727e093ed98e02a725d85b746345f05d2b5e5034649f4ec8/chevron-0.14.0-py3-none-any.whl", hash = "sha256:fbf996a709f8da2e745ef763f482ce2d311aa817d287593a5b990d6d6e4f0443", size = 11595, upload-time = "2021-01-02T22:47:57.847Z" },
+]
+
 [[package]]
 name = "click"
 version = "8.2.1"
@@ -1215,6 +1251,9 @@ box2d = [
     { name = "pillow" },
     { name = "swig" },
 ]
+braintrust = [
+    { name = "braintrust", extra = ["otel"] },
+]
 chinook = [
     { name = "psycopg2-binary" },
 ]
@@ -1291,6 +1330,7 @@ requires-dist = [
     { name = "aiosqlite" },
     { name = "anthropic", specifier = ">=0.59.0" },
     { name = "backoff", specifier = ">=2.2.0" },
+    { name = "braintrust", extras = ["otel"], marker = "extra == 'braintrust'" },
     { name = "build", marker = "extra == 'dev'" },
     { name = "dataclasses-json", specifier = ">=0.5.7" },
     { name = "datasets", specifier = ">=3.0.0" },
@@ -1364,7 +1404,7 @@ requires-dist = [
     { name = "websockets", specifier = ">=15.0.1" },
     { name = "werkzeug", marker = "extra == 'dev'", specifier = ">=2.0.0" },
 ]
-provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "bigquery", "svgbench", "pydantic", "supabase", "chinook", "langchain"]
+provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "bigquery", "svgbench", "pydantic", "supabase", "chinook", "langchain", "braintrust"]
 
 [package.metadata.requires-dev]
 dev = [
@@ -1705,6 +1745,30 @@ http = [
     { name = "aiohttp" },
 ]
 
+[[package]]
+name = "gitdb"
+version = "4.0.12"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "smmap" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/63b0fc47eb32792c7ba1fe1b694daec9a63620db1e313033d18140c2320a/gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571", size = 394684, upload-time = "2025-01-02T07:20:46.413Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a0/61/5c78b91c3143ed5c14207f463aecfc8f9dbb5092fb2869baf37c273b2705/gitdb-4.0.12-py3-none-any.whl", hash = "sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf", size = 62794, upload-time = "2025-01-02T07:20:43.624Z" },
+]
+
+[[package]]
+name = "gitpython"
+version = "3.1.45"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "gitdb" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9a/c8/dd58967d119baab745caec2f9d853297cec1989ec1d63f677d3880632b88/gitpython-3.1.45.tar.gz", hash = "sha256:85b0ee964ceddf211c41b9f27a49086010a190fd8132a24e21f362a4b36a791c", size = 215076, upload-time = "2025-07-24T03:45:54.871Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" },
+]
+
 [[package]]
 name = "google-api-core"
 version = "2.25.1"
@@ -5174,6 +5238,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/45/58/38b5afbc1a800eeea951b9285d3912613f2603bdf897a4ab0f4bd7f405fc/python_multipart-0.0.20-py3-none-any.whl", hash = "sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104", size = 24546, upload-time = "2024-12-16T19:45:44.423Z" },
 ]
 
+[[package]]
+name = "python-slugify"
+version = "8.0.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "text-unidecode" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/87/c7/5e1547c44e31da50a460df93af11a535ace568ef89d7a811069ead340c4a/python-slugify-8.0.4.tar.gz", hash = "sha256:59202371d1d05b54a9e7720c5e038f928f45daaffe41dd10822f3907b937c856", size = 10921, upload-time = "2024-02-08T18:32:45.488Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a4/62/02da182e544a51a5c3ccf4b03ab79df279f9c60c5e82d5e8bec7ca26ac11/python_slugify-8.0.4-py2.py3-none-any.whl", hash = "sha256:276540b79961052b66b7d116620b36518847f52d5fd9e3a70164fc8c50faa6b8", size = 10051, upload-time = "2024-02-08T18:32:43.911Z" },
+]
+
 [[package]]
 name = "pytz"
 version = "2025.2"
@@ -5951,6 +6027,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
 ]
 
+[[package]]
+name = "smmap"
+version = "5.0.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/44/cd/a040c4b3119bbe532e5b0732286f805445375489fceaec1f48306068ee3b/smmap-5.0.2.tar.gz", hash = "sha256:26ea65a03958fa0c8a1c7e8c7a58fdc77221b8910f6be2131affade476898ad5", size = 22329, upload-time = "2025-01-02T07:14:40.909Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303, upload-time = "2025-01-02T07:14:38.724Z" },
+]
+
 [[package]]
 name = "sniffio"
 version = "1.3.1"
@@ -6035,6 +6120,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e4/f1/6c7eaa8187ba789a6dd6d74430307478d2a91c23a5452ab339b6fbe15a08/sse_starlette-2.4.1-py3-none-any.whl", hash = "sha256:08b77ea898ab1a13a428b2b6f73cfe6d0e607a7b4e15b9bb23e4a37b087fd39a", size = 10824, upload-time = "2025-07-06T09:41:32.321Z" },
 ]
 
+[[package]]
+name = "sseclient-py"
+version = "1.8.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e8/ed/3df5ab8bb0c12f86c28d0cadb11ed1de44a92ed35ce7ff4fd5518a809325/sseclient-py-1.8.0.tar.gz", hash = "sha256:c547c5c1a7633230a38dc599a21a2dc638f9b5c297286b48b46b935c71fac3e8", size = 7791, upload-time = "2023-09-01T19:39:20.45Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/49/58/97655efdfeb5b4eeab85b1fc5d3fa1023661246c2ab2a26ea8e47402d4f2/sseclient_py-1.8.0-py2.py3-none-any.whl", hash = "sha256:4ecca6dc0b9f963f8384e9d7fd529bf93dd7d708144c4fb5da0e0a1a926fee83", size = 8828, upload-time = "2023-09-01T19:39:17.627Z" },
+]
+
 [[package]]
 name = "stack-data"
 version = "0.6.3"
@@ -6186,6 +6280,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6a/9e/2064975477fdc887e47ad42157e214526dcad8f317a948dee17e1659a62f/terminado-0.18.1-py3-none-any.whl", hash = "sha256:a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0", size = 14154, upload-time = "2024-03-12T14:34:36.569Z" },
 ]
 
+[[package]]
+name = "text-unidecode"
+version = "1.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ab/e2/e9a00f0ccb71718418230718b3d900e71a5d16e701a3dae079a21e9cd8f8/text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93", size = 76885, upload-time = "2019-08-30T21:36:45.405Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a6/a5/c0b6468d3824fe3fde30dbb5e1f687b291608f9473681bbf7dabbf5a87d7/text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8", size = 78154, upload-time = "2019-08-30T21:37:03.543Z" },
+]
+
 [[package]]
 name = "tiktoken"
 version = "0.9.0"

From ad0e059e00ffc81918054640d22b896dccf12e5a Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 15 Sep 2025 17:45:16 -0700
Subject: [PATCH 2/3] fix uv lock

---
 uv.lock | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 105 insertions(+), 2 deletions(-)

diff --git a/uv.lock b/uv.lock
index 227436c7..cea73ed6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 3
+revision = 2
 requires-python = ">=3.10"
 resolution-markers = [
     "python_full_version >= '3.13'",
@@ -491,6 +491,33 @@ version = "2.3.5"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/dd/5a/ad8d3ef9c13d5afcc1e44a77f11792ee717f6727b3320bddbc607e935e2a/box2d-py-2.3.5.tar.gz", hash = "sha256:b37dc38844bcd7def48a97111d2b082e4f81cca3cece7460feb3eacda0da2207", size = 374446, upload-time = "2018-10-02T01:03:23.527Z" }
 
+[[package]]
+name = "braintrust"
+version = "0.2.7"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "chevron" },
+    { name = "exceptiongroup" },
+    { name = "gitpython" },
+    { name = "python-dotenv" },
+    { name = "python-slugify" },
+    { name = "requests" },
+    { name = "sseclient-py" },
+    { name = "tqdm" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9e/ab/199161c7810f9c22fd04dff374075536fc66aabcca5ea522296aedeb6378/braintrust-0.2.7.tar.gz", hash = "sha256:faa9d54c2d6dac30b11d9b4b68817aa1258aeab5945758159107fb6402ac5b80", size = 184823, upload-time = "2025-09-11T23:44:58.661Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/82/02/f704c8ea68622286dd7aaa16a3a223a9ee2f8b337c86c652d111aa05b442/braintrust-0.2.7-py3-none-any.whl", hash = "sha256:735e1b32a785e144756c4821e0515dd40dca921c86c417000f4b5617024f1349", size = 214417, upload-time = "2025-09-11T23:44:57.028Z" },
+]
+
+[package.optional-dependencies]
+otel = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-exporter-otlp-proto-http" },
+    { name = "opentelemetry-sdk" },
+]
+
 [[package]]
 name = "brotli"
 version = "1.1.0"
@@ -744,6 +771,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload-time = "2025-05-02T08:34:40.053Z" },
 ]
 
+[[package]]
+name = "chevron"
+version = "0.14.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/15/1f/ca74b65b19798895d63a6e92874162f44233467c9e7c1ed8afd19016ebe9/chevron-0.14.0.tar.gz", hash = "sha256:87613aafdf6d77b6a90ff073165a61ae5086e21ad49057aa0e53681601800ebf", size = 11440, upload-time = "2021-01-02T22:47:59.233Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/52/93/342cc62a70ab727e093ed98e02a725d85b746345f05d2b5e5034649f4ec8/chevron-0.14.0-py3-none-any.whl", hash = "sha256:fbf996a709f8da2e745ef763f482ce2d311aa817d287593a5b990d6d6e4f0443", size = 11595, upload-time = "2021-01-02T22:47:57.847Z" },
+]
+
 [[package]]
 name = "click"
 version = "8.2.1"
@@ -1227,6 +1263,9 @@ box2d = [
     { name = "pillow" },
     { name = "swig" },
 ]
+braintrust = [
+    { name = "braintrust", extra = ["otel"] },
+]
 chinook = [
     { name = "psycopg2-binary" },
 ]
@@ -1312,6 +1351,7 @@ requires-dist = [
     { name = "aiosqlite" },
     { name = "anthropic", specifier = ">=0.59.0" },
     { name = "backoff", specifier = ">=2.2.0" },
+    { name = "braintrust", extras = ["otel"], marker = "extra == 'braintrust'" },
     { name = "build", marker = "extra == 'dev'" },
     { name = "dataclasses-json", specifier = ">=0.5.7" },
     { name = "datasets", specifier = ">=3.0.0" },
@@ -1390,7 +1430,7 @@ requires-dist = [
     { name = "websockets", specifier = ">=15.0.1" },
     { name = "werkzeug", marker = "extra == 'dev'", specifier = ">=2.0.0" },
 ]
-provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "bigquery", "svgbench", "pydantic", "supabase", "chinook", "langchain", "langgraph", "langgraph-tools"]
+provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "bigquery", "svgbench", "pydantic", "supabase", "chinook", "langchain", "braintrust", "langgraph", "langgraph-tools"]
 
 [package.metadata.requires-dev]
 dev = [
@@ -1746,6 +1786,30 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/86/12/41fcfba4ae0f6b4805f09d11f0e6d6417df2572cea13208c0f439170ee0c/genai_prices-0.0.25-py3-none-any.whl", hash = "sha256:47b412e6927787caa00717a5d99b2e4c0858bed507bb16473b1bcaff48d5aae9", size = 47002, upload-time = "2025-09-01T17:30:41.012Z" },
 ]
 
+[[package]]
+name = "gitdb"
+version = "4.0.12"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "smmap" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/63b0fc47eb32792c7ba1fe1b694daec9a63620db1e313033d18140c2320a/gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571", size = 394684, upload-time = "2025-01-02T07:20:46.413Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a0/61/5c78b91c3143ed5c14207f463aecfc8f9dbb5092fb2869baf37c273b2705/gitdb-4.0.12-py3-none-any.whl", hash = "sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf", size = 62794, upload-time = "2025-01-02T07:20:43.624Z" },
+]
+
+[[package]]
+name = "gitpython"
+version = "3.1.45"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "gitdb" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9a/c8/dd58967d119baab745caec2f9d853297cec1989ec1d63f677d3880632b88/gitpython-3.1.45.tar.gz", hash = "sha256:85b0ee964ceddf211c41b9f27a49086010a190fd8132a24e21f362a4b36a791c", size = 215076, upload-time = "2025-07-24T03:45:54.871Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" },
+]
+
 [[package]]
 name = "google-api-core"
 version = "2.25.1"
@@ -5416,6 +5480,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/45/58/38b5afbc1a800eeea951b9285d3912613f2603bdf897a4ab0f4bd7f405fc/python_multipart-0.0.20-py3-none-any.whl", hash = "sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104", size = 24546, upload-time = "2024-12-16T19:45:44.423Z" },
 ]
 
+[[package]]
+name = "python-slugify"
+version = "8.0.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "text-unidecode" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/87/c7/5e1547c44e31da50a460df93af11a535ace568ef89d7a811069ead340c4a/python-slugify-8.0.4.tar.gz", hash = "sha256:59202371d1d05b54a9e7720c5e038f928f45daaffe41dd10822f3907b937c856", size = 10921, upload-time = "2024-02-08T18:32:45.488Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a4/62/02da182e544a51a5c3ccf4b03ab79df279f9c60c5e82d5e8bec7ca26ac11/python_slugify-8.0.4-py2.py3-none-any.whl", hash = "sha256:276540b79961052b66b7d116620b36518847f52d5fd9e3a70164fc8c50faa6b8", size = 10051, upload-time = "2024-02-08T18:32:43.911Z" },
+]
+
 [[package]]
 name = "pytz"
 version = "2025.2"
@@ -6164,6 +6240,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
 ]
 
+[[package]]
+name = "smmap"
+version = "5.0.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/44/cd/a040c4b3119bbe532e5b0732286f805445375489fceaec1f48306068ee3b/smmap-5.0.2.tar.gz", hash = "sha256:26ea65a03958fa0c8a1c7e8c7a58fdc77221b8910f6be2131affade476898ad5", size = 22329, upload-time = "2025-01-02T07:14:40.909Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303, upload-time = "2025-01-02T07:14:38.724Z" },
+]
+
 [[package]]
 name = "sniffio"
 version = "1.3.1"
@@ -6248,6 +6333,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e4/f1/6c7eaa8187ba789a6dd6d74430307478d2a91c23a5452ab339b6fbe15a08/sse_starlette-2.4.1-py3-none-any.whl", hash = "sha256:08b77ea898ab1a13a428b2b6f73cfe6d0e607a7b4e15b9bb23e4a37b087fd39a", size = 10824, upload-time = "2025-07-06T09:41:32.321Z" },
 ]
 
+[[package]]
+name = "sseclient-py"
+version = "1.8.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e8/ed/3df5ab8bb0c12f86c28d0cadb11ed1de44a92ed35ce7ff4fd5518a809325/sseclient-py-1.8.0.tar.gz", hash = "sha256:c547c5c1a7633230a38dc599a21a2dc638f9b5c297286b48b46b935c71fac3e8", size = 7791, upload-time = "2023-09-01T19:39:20.45Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/49/58/97655efdfeb5b4eeab85b1fc5d3fa1023661246c2ab2a26ea8e47402d4f2/sseclient_py-1.8.0-py2.py3-none-any.whl", hash = "sha256:4ecca6dc0b9f963f8384e9d7fd529bf93dd7d708144c4fb5da0e0a1a926fee83", size = 8828, upload-time = "2023-09-01T19:39:17.627Z" },
+]
+
 [[package]]
 name = "stack-data"
 version = "0.6.3"
@@ -6419,6 +6513,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6a/9e/2064975477fdc887e47ad42157e214526dcad8f317a948dee17e1659a62f/terminado-0.18.1-py3-none-any.whl", hash = "sha256:a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0", size = 14154, upload-time = "2024-03-12T14:34:36.569Z" },
 ]
 
+[[package]]
+name = "text-unidecode"
+version = "1.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ab/e2/e9a00f0ccb71718418230718b3d900e71a5d16e701a3dae079a21e9cd8f8/text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93", size = 76885, upload-time = "2019-08-30T21:36:45.405Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a6/a5/c0b6468d3824fe3fde30dbb5e1f687b291608f9473681bbf7dabbf5a87d7/text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8", size = 78154, upload-time = "2019-08-30T21:37:03.543Z" },
+]
+
 [[package]]
 name = "tiktoken"
 version = "0.9.0"

From 520bd1c23672ce22d16fb4565ad79fcedd65e425 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 16 Sep 2025 17:36:01 -0700
Subject: [PATCH 3/3] braintrust example

---
 eval_protocol/adapters/braintrust.py          | 333 ++++++++++--------
 eval_protocol/adapters/langfuse.py            | 116 ++----
 eval_protocol/adapters/utils.py               |  98 ++++++
 eval_protocol/quickstart/llm_judge.py         |   3 +-
 .../quickstart/llm_judge_braintrust.py        | 133 +++++++
 eval_protocol/quickstart/utils.py             |  36 --
 6 files changed, 447 insertions(+), 272 deletions(-)
 create mode 100644 eval_protocol/adapters/utils.py
 create mode 100644 eval_protocol/quickstart/llm_judge_braintrust.py

diff --git a/eval_protocol/adapters/braintrust.py b/eval_protocol/adapters/braintrust.py
index b4d34921..979d4d52 100644
--- a/eval_protocol/adapters/braintrust.py
+++ b/eval_protocol/adapters/braintrust.py
@@ -1,23 +1,158 @@
 """Braintrust adapter for Eval Protocol.
 
-This adapter pulls traces from Braintrust projects and converts them
-to EvaluationRow format for evaluation pipelines.
+This adapter allows pulling data from Braintrust deployments and converting it
+to EvaluationRow format for use in evaluation pipelines.
 """
 
+import logging
 import os
-from datetime import datetime
-from typing import Any, Dict, Iterator, List, Optional
+import random
+import time
+from datetime import datetime, timedelta
+from typing import Any, Dict, List, Optional, Protocol
 
 import requests
 
 from eval_protocol.models import EvaluationRow, InputMetadata, Message
+from .utils import extract_messages_from_data
 
 # Keep backward compatibility
 from ..integrations.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
 
 
+logger = logging.getLogger(__name__)
+
+
+class TraceConverter(Protocol):
+    """Protocol for custom trace-to-EvaluationRow converter functions.
+
+    A converter function should take a Braintrust trace along with processing
+    options and return an EvaluationRow or None to skip the trace.
+    """
+
+    def __call__(
+        self,
+        trace: Dict[str, Any],
+        include_tool_calls: bool,
+    ) -> Optional[EvaluationRow]:
+        """Convert a Braintrust trace to an EvaluationRow.
+
+        Args:
+            trace: The Braintrust trace object to convert
+            include_tool_calls: Whether to include tool calling information
+
+        Returns:
+            EvaluationRow or None if the trace should be skipped
+        """
+        ...
+
+
+def convert_trace_to_evaluation_row(trace: Dict[str, Any], include_tool_calls: bool = True) -> Optional[EvaluationRow]:
+    """Convert a Braintrust trace to EvaluationRow format.
+
+    Args:
+        trace: Braintrust trace object
+        include_tool_calls: Whether to include tool calling information
+
+    Returns:
+        EvaluationRow, or None if no messages were extracted or conversion fails
+    """
+    try:
+        # Extract messages from the trace
+        messages = extract_messages_from_trace(trace, include_tool_calls)
+
+        # Extract tools if available
+        tools = None
+        if include_tool_calls:
+            metadata = trace.get("metadata", {})
+            tools = metadata.get("tools")
+            if not tools:
+                hidden_params = metadata.get("hidden_params", {})
+                optional_params = hidden_params.get("optional_params", {})
+                tools = optional_params.get("tools")
+
+        if not messages:
+            return None
+
+        return EvaluationRow(
+            messages=messages,
+            tools=tools,
+            input_metadata=InputMetadata(
+                session_data={
+                    "braintrust_trace_id": trace.get("id"),
+                }
+            ),
+        )
+
+    except (AttributeError, ValueError, KeyError) as e:
+        logger.error("Error converting trace %s: %s", trace.get("id", "unknown"), e)
+        return None
+
+
+def extract_messages_from_trace(trace: Dict[str, Any], include_tool_calls: bool = True) -> List[Message]:
+    """Extract messages from Braintrust trace input and output.
+
+    Args:
+        trace: Braintrust trace object
+        include_tool_calls: Whether to include tool calling information
+
+    Returns:
+        List of Message objects (empty if the trace lacks input or an output message)
+    """
+    messages = []
+
+    try:
+        # Use the trace input plus the "message" of the first output entry
+        input_data = trace.get("input")
+
+        output_data = None
+        output_list = trace.get("output", [])
+        if output_list and len(output_list) > 0:
+            first_output = output_list[0]
+            if isinstance(first_output, dict):
+                output_data = first_output.get("message")
+
+        # Skip spans without meaningful conversation data
+        if not input_data or not output_data:
+            return messages
+
+        # Extract messages from input and output
+        if input_data:
+            messages.extend(extract_messages_from_data(input_data, include_tool_calls))
+        if output_data:
+            messages.extend(extract_messages_from_data(output_data, include_tool_calls))
+
+    except (AttributeError, ValueError, KeyError) as e:
+        logger.warning("Error processing trace %s: %s", trace.get("id", "unknown"), e)
+
+    return messages
+
+
 class BraintrustAdapter:
-    """Minimal adapter to pull traces from Braintrust."""
+    """Adapter to pull data from Braintrust and convert to EvaluationRow format.
+
+    This adapter can pull both chat conversations and tool calling traces from
+    Braintrust deployments and convert them into the EvaluationRow format expected
+    by the evaluation protocol.
+
+    Examples:
+        Basic usage:
+        >>> adapter = BraintrustAdapter(
+        ...     api_key="your_api_key",
+        ...     project_id="your_project_id"
+        ... )
+        >>> btql_query = "select: * from: project_logs('your_project_id') traces limit: 10"
+        >>> rows = adapter.get_evaluation_rows(btql_query)
+
+        Using BTQL for custom queries:
+        >>> btql_query = '''
+        ... select: *
+        ... from: project_logs('your_project_id') traces
+        ... filter: metadata.agent_name = 'agent_instance'
+        ... limit: 50
+        ... '''
+        >>> rows = adapter.get_evaluation_rows(btql_query)
+    """
 
     def __init__(
         self,
@@ -30,169 +165,63 @@ def __init__(
         Args:
             api_key: Braintrust API key (defaults to BRAINTRUST_API_KEY env var)
             api_url: Braintrust API URL (defaults to BRAINTRUST_API_URL env var)
-            project_id: Project ID to fetch logs from
+            project_id: Project ID to fetch logs from (defaults to BRAINTRUST_PROJECT_ID env var)
         """
         self.api_key = api_key or os.getenv("BRAINTRUST_API_KEY")
         self.api_url = api_url or os.getenv("BRAINTRUST_API_URL", "https://api.braintrust.dev")
-        self.project_id = project_id
+        self.project_id = project_id or os.getenv("BRAINTRUST_PROJECT_ID")
 
         if not self.api_key:
             raise ValueError("BRAINTRUST_API_KEY environment variable or api_key parameter required")
+        if not self.project_id:
+            raise ValueError("BRAINTRUST_PROJECT_ID environment variable or project_id parameter required")
 
     def get_evaluation_rows(
         self,
-        project_id: Optional[str] = None,
-        limit: Optional[int] = None,
-        from_timestamp: Optional[datetime] = None,
-        to_timestamp: Optional[datetime] = None,
-    ) -> Iterator[EvaluationRow]:
-        """Fetch traces from Braintrust and convert to EvaluationRow format."""
-        project_id = project_id or self.project_id
-        if not project_id:
-            raise ValueError("project_id required")
-
-        # Prepare query parameters for GET request
-        params = {"limit": 1000}
-        if from_timestamp:
-            params["from_timestamp"] = int(from_timestamp.timestamp())
-        if to_timestamp:
-            params["to_timestamp"] = int(to_timestamp.timestamp())
-
-        # Fetch logs from Braintrust using GET endpoint
-        headers = {"Authorization": f"Bearer {self.api_key}"}
-
-        url = f"{self.api_url}/v1/project_logs/{project_id}/fetch"
-
-        response = requests.get(url, headers=headers, params=params)
-        response.raise_for_status()
-
-        logs = response.json()
-
-        # Convert each log to EvaluationRow
-        for log in logs.get("events", []):
-            if log.get("metadata", {}).get("agent_name") == "agent_instance":
-                try:
-                    eval_row = self._convert_log_to_evaluation_row(log)
-                    if eval_row:
-                        yield eval_row
-                except Exception as e:
-                    print(f"Warning: Failed to convert log {log.get('id', 'unknown')}: {e}")
-                    continue
-
-    def _convert_log_to_evaluation_row(self, log: Dict[str, Any]) -> Optional[EvaluationRow]:
-        """Convert a Braintrust log to EvaluationRow format."""
-        # Extract messages from the log
-        messages = self._extract_messages(log)
-        if not messages:
-            return None
+        btql_query: str,
+        include_tool_calls: bool = True,
+        converter: Optional[TraceConverter] = None,
+    ) -> List[EvaluationRow]:
+        """Get evaluation rows using a custom BTQL query.
 
-        # Extract metadata (pulling nothing currently)
-        input_metadata = InputMetadata(
-            row_id=log.get("id"),
-            completion_params=log.get("metadata", {}),
-            dataset_info={
-                "braintrust_log_id": log.get("id"),
-                "braintrust_project_id": self.project_id,
-                "span_id": log.get("span_id"),
-                "trace_id": log.get("root_span_id"),
-            },
-        )
-
-        # Extract ground truth from metadata
-        metadata = log.get("metadata", {})
-        ground_truth = metadata.get("ground_truth")
+        Args:
+            btql_query: The BTQL query string to execute
+            include_tool_calls: Whether to include tool calling information
+            converter: Optional custom converter implementing TraceConverter protocol
 
-        return EvaluationRow(
-            messages=messages,
-            input_metadata=input_metadata,
-            ground_truth=str(ground_truth) if ground_truth else None,
-        )
+        Returns:
+            List[EvaluationRow]: Converted evaluation rows
+        """
+        eval_rows = []
 
-    def _extract_messages(self, log: Dict[str, Any]) -> List[Message]:
-        """Extract conversation messages from a Braintrust log."""
-        messages = []
+        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
 
-        # Look for complete conversations (input + output arrays)
-        input_data = log.get("input")
-        output_data = log.get("output")
+        response = requests.post(f"{self.api_url}/btql", headers=headers, json={"query": btql_query, "fmt": "json"})
+        response.raise_for_status()
+        query_response = response.json()
 
-        # Skip spans without meaningful conversation data
-        if not input_data or not output_data:
-            return []
-
-        # Extract input messages (usually just user message)
-        if isinstance(input_data, list):
-            for msg in input_data:
-                if isinstance(msg, dict) and "role" in msg and "content" in msg:
-                    messages.append(Message(role=msg["role"], content=str(msg["content"])))
-
-        # Extract output messages (assistant + tool responses)
-        if isinstance(output_data, list):
-            for msg in output_data:
-                if isinstance(msg, dict) and "role" in msg:
-                    # Handle tool calls in assistant messages
-                    tool_calls = msg.get("tool_calls") if msg["role"] == "assistant" else None
-                    tool_call_id = msg.get("tool_call_id") if msg["role"] == "tool" else None
-                    name = msg.get("name") if msg["role"] == "tool" else None
-
-                    messages.append(
-                        Message(
-                            role=msg["role"],
-                            content=str(msg.get("content", "")),
-                            tool_calls=tool_calls,
-                            tool_call_id=tool_call_id,
-                            name=name,
-                        )
-                    )
-
-        return messages
-
-    def create_score(
-        self,
-        log_id: str,
-        name: str,
-        value: float,
-        comment: Optional[str] = None,
-        project_id: Optional[str] = None,
-    ) -> bool:
-        """Create a score/feedback for a Braintrust log entry.
+        if not query_response or not query_response.get("data"):
+            logger.debug("No data returned from BTQL query")
+            return eval_rows
 
-        Args:
-            log_id: The ID of the log entry to score
-            name: The score name/type
-            value: The score value
-            comment: Optional comment explaining the score
-            project_id: Project ID (overrides instance default)
+        all_traces = query_response["data"]
+        logger.debug("BTQL query returned %d traces", len(all_traces))
 
-        Returns:
-            True if successful, False otherwise
-        """
-        project_id = project_id or self.project_id
-        if not project_id:
-            raise ValueError("project_id required")
-
-        # Prepare feedback data - API expects "feedback" array
-        feedback_item = {
-            "id": log_id,
-            "name": name,
-            "value": value,
-        }
-        if comment:
-            feedback_item["comment"] = comment
-
-        feedback_data = {"feedback": [feedback_item]}
-
-        # Post feedback to Braintrust
-        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
+        # Process each selected trace
+        for trace in all_traces:
+            try:
+                if converter:
+                    eval_row = converter(trace, include_tool_calls)
+                else:
+                    eval_row = convert_trace_to_evaluation_row(trace, include_tool_calls)
+                if eval_row:
+                    eval_rows.append(eval_row)
+            except (AttributeError, ValueError, KeyError) as e:
+                logger.warning("Failed to convert trace %s: %s", trace.get("id", "unknown"), e)
+                continue
 
-        try:
-            url = f"{self.api_url}/v1/project_logs/{project_id}/feedback"
-            response = requests.post(url, headers=headers, json=feedback_data)
-            response.raise_for_status()
-            return True
-        except Exception as e:
-            print(f"Error creating Braintrust score: {e}")
-            return False
+        logger.info("Successfully processed %d BTQL results into %d evaluation rows", len(all_traces), len(eval_rows))
+        return eval_rows
 
 
 def create_braintrust_adapter(
@@ -200,7 +229,7 @@ def create_braintrust_adapter(
     api_url: Optional[str] = None,
     project_id: Optional[str] = None,
 ) -> BraintrustAdapter:
-    """Create a BraintrustAdapter instance."""
+    """Factory function to create a Braintrust adapter."""
     return BraintrustAdapter(
         api_key=api_key,
         api_url=api_url,
diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py
index e3f3144a..115448dd 100644
--- a/eval_protocol/adapters/langfuse.py
+++ b/eval_protocol/adapters/langfuse.py
@@ -12,6 +12,7 @@
 from typing import Any, Dict, List, Optional, Protocol
 
 from eval_protocol.models import EvaluationRow, InputMetadata, Message
+from .utils import extract_messages_from_data
 
 logger = logging.getLogger(__name__)
 
@@ -112,7 +113,7 @@ def extract_messages_from_trace(
     if span_name:  # Look for a generation tied to a span name
         try:
             # Find the final generation in the named span
-            gen: ObservationsView | None = find_final_generation_in_span(trace, span_name)
+            gen: ObservationsView | None = get_final_generation_in_span(trace, span_name)
             if not gen:
                 return messages
 
@@ -141,87 +142,8 @@ def extract_messages_from_trace(
     return messages
 
 
-def extract_messages_from_data(data, include_tool_calls: bool) -> List[Message]:
-    """Extract messages from data (works for both input and output).
-
-    Args:
-        data: Data from trace or generation (input or output)
-        include_tool_calls: Whether to include tool calling information
-
-    Returns:
-        List of Message objects
-    """
-    messages = []
-
-    if isinstance(data, dict):
-        if "messages" in data:
-            # OpenAI-style messages format
-            for msg in data["messages"]:
-                messages.append(dict_to_message(msg, include_tool_calls))
-        elif "role" in data:
-            # Single message format
-            messages.append(dict_to_message(data, include_tool_calls))
-        elif "prompt" in data:
-            # Simple prompt format
-            messages.append(Message(role="user", content=str(data["prompt"])))
-        elif "content" in data:
-            # Simple content format
-            messages.append(Message(role="assistant", content=str(data["content"])))
-        else:
-            # Fallback: treat as single message
-            messages.append(dict_to_message(data, include_tool_calls))
-    elif isinstance(data, list):
-        # Direct list of message dicts
-        for msg in data:
-            if isinstance(msg, dict):
-                messages.append(dict_to_message(msg, include_tool_calls))
-    elif isinstance(data, str):
-        # Simple string - role depends on context, default to user
-        messages.append(Message(role="user", content=data))
-
-    return messages
-
-
-def dict_to_message(msg_dict: Dict[str, Any], include_tool_calls: bool = True) -> Message:
-    """Convert a dictionary to a Message object.
-
-    Args:
-        msg_dict: Dictionary containing message data
-        include_tool_calls: Whether to include tool calling information
-
-    Returns:
-        Message object
-    """
-    # Extract basic message components
-    role = msg_dict.get("role", "assistant")
-    content = msg_dict.get("content")
-    name = msg_dict.get("name")
-
-    # Handle tool calls if enabled
-    tool_calls = None
-    tool_call_id = None
-    function_call = None
-
-    if include_tool_calls:
-        if "tool_calls" in msg_dict:
-            tool_calls = msg_dict["tool_calls"]
-        if "tool_call_id" in msg_dict:
-            tool_call_id = msg_dict["tool_call_id"]
-        if "function_call" in msg_dict:
-            function_call = msg_dict["function_call"]
-
-    return Message(
-        role=role,
-        content=content,
-        name=name,
-        tool_call_id=tool_call_id,
-        tool_calls=tool_calls,
-        function_call=function_call,
-    )
-
-
-def find_final_generation_in_span(trace: TraceWithFullDetails, span_name: str) -> ObservationsView | None:
-    """Find the final generation within a named span that contains full message history.
+def get_final_generation_in_span(trace: TraceWithFullDetails, span_name: str) -> ObservationsView | None:
+    """Get the final generation within a named span that contains full message history.
 
     Args:
         trace: Langfuse trace object
@@ -511,6 +433,36 @@ def get_evaluation_rows_by_ids(
                 continue
         return eval_rows
 
+    def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
+        """Push evaluation scores back to Langfuse traces for tracking and analysis.
+
+        Creates a score entry in Langfuse for each unique trace_id found in the evaluation
+        rows' session data. This allows you to see evaluation results directly in the
+        Langfuse UI alongside the original traces.
+
+        Args:
+            rows: List of EvaluationRow objects with session_data containing trace IDs
+            model_name: Name of the model (used as the score name in Langfuse)
+            mean_score: The calculated mean score to push to Langfuse
+
+        Note:
+            Skips rows lacking an evaluation result or session data; any exception is logged as a warning
+        """
+        try:
+            for trace_id in set(
+                row.input_metadata.session_data["langfuse_trace_id"]
+                for row in rows
+                if row.evaluation_result and row.input_metadata and row.input_metadata.session_data
+            ):
+                if trace_id:
+                    self.client.create_score(
+                        trace_id=trace_id,
+                        name=model_name,
+                        value=mean_score,
+                    )
+        except Exception as e:
+            logger.warning("Failed to push scores to Langfuse: %s", e)
+
 
 def create_langfuse_adapter() -> LangfuseAdapter:
     """Factory function to create a Langfuse adapter."""
diff --git a/eval_protocol/adapters/utils.py b/eval_protocol/adapters/utils.py
new file mode 100644
index 00000000..0ccf6caf
--- /dev/null
+++ b/eval_protocol/adapters/utils.py
@@ -0,0 +1,98 @@
+"""Common utilities for adapter implementations.
+
+This module contains shared functions and utilities used across different
+adapter implementations to avoid code duplication.
+"""
+
+import logging
+import time
+from typing import Any, Dict, List
+
+from eval_protocol.models import Message
+
+logger = logging.getLogger(__name__)
+
+
+def extract_messages_from_data(data, include_tool_calls: bool) -> List[Message]:
+    """Extract messages from data (works for both input and output).
+
+    Shared by multiple adapters to parse dict, list, or string payloads into Message objects.
+
+    Args:
+        data: Data from trace/log (input or output) - can be dict, list, or string
+        include_tool_calls: Whether to include tool calling information
+
+    Returns:
+        List of Message objects (empty if nothing could be extracted)
+    """
+    messages = []
+
+    if isinstance(data, dict):
+        if "messages" in data:
+            # OpenAI-style messages format; non-dict entries are skipped, as for a bare list
+            for msg in data["messages"]:
+                if isinstance(msg, dict):
+                    messages.append(dict_to_message(msg, include_tool_calls))
+        elif "role" in data:
+            # Single message format
+            messages.append(dict_to_message(data, include_tool_calls))
+        elif "prompt" in data:
+            # Simple prompt format
+            messages.append(Message(role="user", content=str(data["prompt"])))
+        elif "content" in data:
+            # Simple content format
+            messages.append(Message(role="assistant", content=str(data["content"])))
+        else:
+            # Fallback: treat as single message
+            messages.append(dict_to_message(data, include_tool_calls))
+    elif isinstance(data, list):
+        # Direct list of message dicts
+        for msg in data:
+            if isinstance(msg, dict):
+                messages.append(dict_to_message(msg, include_tool_calls))
+    elif isinstance(data, str):
+        # Simple string - role depends on context, default to user
+        messages.append(Message(role="user", content=data))
+
+    return messages
+
+
+def dict_to_message(msg_dict: Dict[str, Any], include_tool_calls: bool = True) -> Message:
+    """Build a Message object from its dictionary representation.
+
+    Shared helper that lets the adapters turn dictionary-shaped messages into
+    standardized Message objects.
+
+    Absent keys are tolerated: ``role`` falls back to "assistant" while
+    ``content``, ``name`` and the tool-related fields fall back to None.
+
+    Args:
+        msg_dict: Dictionary containing message data
+        include_tool_calls: Whether to copy ``tool_calls``, ``tool_call_id`` and
+            ``function_call`` from msg_dict (they are left as None otherwise)
+
+    Returns:
+        Message object
+    """
+    # Tool-related fields are copied only on request; otherwise they stay
+    # None, exactly as if the keys were absent.
+    if include_tool_calls:
+        tool_calls = msg_dict.get("tool_calls")
+        tool_call_id = msg_dict.get("tool_call_id")
+        function_call = msg_dict.get("function_call")
+    else:
+        tool_calls = None
+        tool_call_id = None
+        function_call = None
+
+    # A missing role defaults to "assistant"; missing content or name become None.
+    fields = {
+        "role": msg_dict.get("role", "assistant"),
+        "content": msg_dict.get("content"),
+        "name": msg_dict.get("name"),
+        "tool_call_id": tool_call_id,
+        "tool_calls": tool_calls,
+        "function_call": function_call,
+    }
+
+    return Message(**fields)
diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py
index 7c1be954..3978d46e 100644
--- a/eval_protocol/quickstart/llm_judge.py
+++ b/eval_protocol/quickstart/llm_judge.py
@@ -16,7 +16,6 @@
     split_multi_turn_rows,
     JUDGE_CONFIGS,
     calculate_bootstrap_scores,
-    push_scores_to_langfuse,
     run_judgment_async,
 )
 import asyncio
@@ -131,6 +130,6 @@ async def run_judgment(row):
             row.evaluation_result.score = mean_score
 
     # Optional, push scores back to Langfuse. Note that one score per model will be pushed back onto same trace.
-    push_scores_to_langfuse(rows, model_name, mean_score)
+    adapter.push_scores(rows, model_name, mean_score)
 
     return rows
diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py
new file mode 100644
index 00000000..a1902cf7
--- /dev/null
+++ b/eval_protocol/quickstart/llm_judge_braintrust.py
@@ -0,0 +1,133 @@
+"""
+Default LLM judge for Eval Protocol using Braintrust. Inspired by Arena-Hard-Auto.
+"""
+
+import os
+from datetime import datetime
+from typing import List, Dict, Any, Optional
+from tqdm import tqdm
+
+import pytest
+
+from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult
+from eval_protocol.pytest import evaluation_test
+from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
+from eval_protocol.quickstart.utils import (
+    split_multi_turn_rows,
+    JUDGE_CONFIGS,
+    calculate_bootstrap_scores,
+    run_judgment_async,
+)
+import asyncio
+from openai import AsyncOpenAI
+from eval_protocol.adapters.braintrust import create_braintrust_adapter
+
+adapter = create_braintrust_adapter()  # module-level: the decorator arguments below call it at import time
+
+
+@pytest.mark.asyncio
+@evaluation_test(
+    input_rows=[
+        adapter.get_evaluation_rows(
+            btql_query=f"""
+select: *
+from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
+filter: is_root = true
+limit: 5
+"""
+        )
+    ],
+    completion_params=[
+        {"model": "gpt-4.1"},
+        {
+            "max_tokens": 131000,
+            "extra_body": {"reasoning_effort": "medium"},
+            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
+        },
+        {
+            "max_tokens": 131000,
+            "extra_body": {"reasoning_effort": "low"},
+            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
+        },
+    ],
+    rollout_processor=SingleTurnRolloutProcessor(),
+    preprocess_fn=split_multi_turn_rows,
+    max_concurrent_rollouts=64,
+    mode="all",
+)
+async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
+    """
+    LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons.
+
+    Compares model responses against ground truth using an LLM judge. For each row:
+    1. Extracts the question from messages[:-1]
+    2. Compares messages[-1] (new model response) vs ground_truth (baseline response)
+    3. Runs two judgment rounds (A vs B, B vs A) to reduce position bias
+    4. Calculates bootstrap scores across all comparisons
+    5. Sets evaluation_result.score to the bootstrap mean (the confidence interval is only printed)
+
+    Args:
+        rows: List of EvaluationRow objects with messages, ground_truth, and tools
+
+    Returns:
+        Same rows, with each existing evaluation_result.score set to the bootstrap mean score
+    """
+
+    judge_name = "gemini-2.5-pro"  # Edit to which judge you'd like to use. Configs are in utils.py.
+
+    if not rows:
+        print("❌ No evaluation rows provided")
+        return rows
+
+    print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging...")
+
+    model_name = rows[0].input_metadata.completion_params.get("model", "unknown_model")
+
+    judgments = []
+    max_concurrency = JUDGE_CONFIGS[judge_name]["max_concurrency"]
+
+    judge_config = JUDGE_CONFIGS[judge_name]
+
+    async with AsyncOpenAI(
+        api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url")
+    ) as shared_client:
+        semaphore = asyncio.Semaphore(max_concurrency)  # caps how many judge calls run at once
+
+        async def run_judgment(row):
+            async with semaphore:
+                return await run_judgment_async(row, model_name, judge_name, shared_client)
+
+        tasks = [run_judgment(row) for row in rows]
+
+        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Generating judgments"):
+            result = await coro
+            if result and result["games"][0] and result["games"][1]:  # keep only results with both games
+                judgments.append(result)
+
+    if not judgments:
+        print("❌ No valid judgments generated")
+        return rows
+
+    print(f"✅ Generated {len(judgments)} valid judgments")
+
+    # Bootstrap the judgments into a mean score plus lower/upper bounds
+    result = calculate_bootstrap_scores(judgments)
+    if not result:
+        print("❌ No valid scores extracted")
+        return rows
+
+    mean_score, lower_score, upper_score = result
+
+    # Print leaderboard
+    print("\n##### LLM Judge Results (90th percentile CI) #####")
+
+    clean_model_name = model_name.split("/")[-1]  # Keep only the part after the last "/"
+
+    print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
+    print("original: 50.0% (CI: 50.0% - 50.0%)")
+
+    for row in rows:
+        if row.evaluation_result:
+            row.evaluation_result.score = mean_score
+
+    return rows
diff --git a/eval_protocol/quickstart/utils.py b/eval_protocol/quickstart/utils.py
index d862a472..9fda11b5 100644
--- a/eval_protocol/quickstart/utils.py
+++ b/eval_protocol/quickstart/utils.py
@@ -280,39 +280,3 @@ def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> Optional[tupl
     upper_score = bootstraps.quantile(0.95)
 
     return mean_score, lower_score, upper_score
-
-
-def push_scores_to_langfuse(rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
-    """
-    Push evaluation scores back to Langfuse traces for tracking and analysis.
-
-    Creates a score entry in Langfuse for each unique trace_id found in the evaluation
-    rows' session data. This allows you to see evaluation results directly in the
-    Langfuse UI alongside the original traces.
-
-    Args:
-        rows: List of EvaluationRow objects with session_data containing trace IDs
-        model_name: Name of the model (used as the score name in Langfuse)
-        mean_score: The calculated mean score to push to Langfuse
-
-    Note:
-        Silently handles errors if Langfuse is unavailable or if rows lack session data
-    """
-    try:
-        from eval_protocol.adapters.langfuse import create_langfuse_adapter
-
-        langfuse = create_langfuse_adapter().client
-
-        for trace_id in set(
-            row.input_metadata.session_data["langfuse_trace_id"]
-            for row in rows
-            if row.evaluation_result and row.input_metadata and row.input_metadata.session_data
-        ):
-            if trace_id:
-                langfuse.create_score(
-                    trace_id=trace_id,
-                    name=model_name,
-                    value=mean_score,
-                )
-    except Exception as e:
-        print(f"⚠️ Failed to push scores to Langfuse: {e}")