1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -88,6 +88,7 @@ jobs:

- name: Run Core Tests with pytest-xdist
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -7,9 +7,9 @@ repos:
rev: v6.0.0
hooks:
- id: trailing-whitespace
exclude: "(^vite-app/|\\.snap$)"
exclude: "(^vite-app/|\\.snap$|\\.ambr$)"
- id: end-of-file-fixer
exclude: "(^vite-app/|\\.snap$)"
exclude: "(^vite-app/|\\.snap$|\\.ambr$)"
- id: check-yaml
- id: check-added-large-files
- id: check-merge-conflict
189 changes: 189 additions & 0 deletions eval_protocol/adapters/openai_responses.py
@@ -0,0 +1,189 @@
"""Langfuse adapter for Eval Protocol.

This adapter allows pulling data from Langfuse deployments and converting it
to EvaluationRow format for use in evaluation pipelines.
"""

from collections.abc import Iterable, Sequence
import logging
from typing import List
from typing_extensions import Any

from openai.pagination import SyncCursorPage
from openai.types.chat.chat_completion_function_tool_param import ChatCompletionFunctionToolParam
from openai.types.chat.chat_completion_message import FunctionCall
from openai.types.responses import Response
from openai.types.responses.response_item import ResponseItem
from openai.types.chat.chat_completion_message_function_tool_call import (
ChatCompletionMessageFunctionToolCall,
Function,
)
from openai.types.responses.tool import Tool

from openai import OpenAI

from eval_protocol.models import EvaluationRow, InputMetadata, Message

logger = logging.getLogger(__name__)


class OpenAIResponsesAdapter:
"""Adapter to pull data from OpenAI Responses API and convert to EvaluationRow format.

This adapter can pull both chat conversations and tool-calling traces from
the OpenAI Responses API and convert them into the EvaluationRow format
expected by the evaluation protocol.

Examples:
Basic usage:
>>> adapter = OpenAIResponsesAdapter(
... api_key="your_api_key",
... )
>>> rows = adapter.get_evaluation_rows(response_ids=["response_id_1", "response_id_2"])
"""

def __init__(self, api_key: str | None = None, base_url: str | None = None):
"""Initialize the OpenAI Responses adapter."""
self.openai = OpenAI(api_key=api_key, base_url=base_url)

def get_evaluation_rows(
self,
response_ids: List[str],
) -> List[EvaluationRow]:
"""Pull responses from OpenAI Responses API and convert to EvaluationRow format.

Args:
response_ids: List of response IDs to fetch
Returns:
List[EvaluationRow]: Converted evaluation rows
"""
eval_rows: list[EvaluationRow] = []

for response_id in response_ids:
input_items = self.openai.responses.input_items.list(response_id=response_id)
response = self.openai.responses.retrieve(response_id=response_id)
eval_rows.append(self._create_evaluation_row(input_items, response))

logger.info(
"Successfully processed %d selected traces into %d evaluation rows", len(response_ids), len(eval_rows)
)
return eval_rows

def _create_evaluation_row(self, input_items: SyncCursorPage[ResponseItem], response: Response) -> EvaluationRow:
"""Convert a response to an evaluation row."""
messages: list[Message] = []
if response.instructions:
if isinstance(response.instructions, list):
raise NotImplementedError("List of instructions is not supported")
else:
messages.append(Message(role="system", content=response.instructions))
messages.extend(self._create_messages(input_items))
if response.output_text:
messages.append(Message(role="assistant", content=response.output_text))
tools = self._responses_tools_to_chat_completion_tools(response.tools)
tool_dicts = [dict(tool) for tool in tools]
return EvaluationRow(
messages=messages,
tools=tool_dicts,
input_metadata=InputMetadata(
completion_params={
"model": response.model,
"temperature": response.temperature,
"max_output_tokens": response.max_output_tokens,
"max_tool_calls": response.max_tool_calls,
"parallel_tool_calls": response.parallel_tool_calls,
"""
We have to manually extract the reasoning effort and summary
from the response.reasoning object because the openai-python
causes an issue with model_dump() which is used for testing.

https://github.com/openai/openai-python/issues/1306#issuecomment-2966267356
"""
"reasoning": {
"effort": response.reasoning.effort,
"summary": response.reasoning.summary,
}
if response.reasoning
else None,
"top_logprobs": response.top_logprobs,
"truncation": response.truncation,
"top_p": response.top_p,
}
),
)

def _responses_tools_to_chat_completion_tools(
self, tools: List[Tool]
) -> Sequence[ChatCompletionFunctionToolParam]:
"""Convert OpenAI Responses API tools to chat completion message function tool calls."""
chat_completion_tools: List[ChatCompletionFunctionToolParam] = []
for tool in tools:
if tool.type == "function":
chat_completion_tools.append(
{
"type": "function",
"function": {
"name": tool.name,
"parameters": tool.parameters or {},
"strict": tool.strict,
"description": tool.description or "",
},
}
)
else:
raise NotImplementedError("Only function tools are supported")
return chat_completion_tools

def _create_messages(self, input_items: SyncCursorPage[ResponseItem]) -> Iterable[Message]:
"""Create messages from input items.

Converts OpenAI Responses API input items to chat completion message format.
Handles different types of response items including messages and tool calls.
Groups parallel tool calls under a single assistant message.
Since we iterate backwards and reverse at the end, tool call outputs should
be added before the assistant message with tool calls.
"""
messages: list[Message] = []
current_tool_calls: list[ChatCompletionMessageFunctionToolCall] = []
tool_call_outputs: list[Message] = []

for item in input_items:
if item.type == "message":
# If we have accumulated tool calls, create an assistant message with them
if current_tool_calls:
# Add tool call outputs first (since we reverse at the end)
messages.extend(tool_call_outputs)
tool_call_outputs = []
# Then add the assistant message with tool calls
messages.append(Message(role="assistant", tool_calls=current_tool_calls))
current_tool_calls = []

# This is a message item (input or output)
content = item.content
for content_item in content:
if content_item.type == "input_text":
text_content = content_item.text
# Create new message
messages.append(Message(role=item.role, content=text_content))
else:
raise NotImplementedError(f"Unsupported content type: {content_item.type}")
elif item.type == "function_call_output":
# Collect tool call outputs to add before assistant message
tool_call_outputs.append(Message(role="tool", content=item.output, tool_call_id=item.call_id))
elif item.type == "function_call":
tool_call = ChatCompletionMessageFunctionToolCall(
id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments)
)
current_tool_calls.append(tool_call)
else:
raise NotImplementedError(f"Unsupported item type: {item.type}")

# If we have remaining tool calls, create an assistant message with them
if current_tool_calls:
# Add tool call outputs first (since we reverse at the end)
messages.extend(tool_call_outputs)
# Then add the assistant message with tool calls
messages.append(Message(role="assistant", tool_calls=current_tool_calls))

return reversed(messages)
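For reviewers, a minimal usage sketch of the new adapter; the response IDs below are placeholders and the printed fields are just one way to inspect the result:

from eval_protocol.adapters.openai_responses import OpenAIResponsesAdapter

# OPENAI_API_KEY is read from the environment when api_key is not passed.
adapter = OpenAIResponsesAdapter()
rows = adapter.get_evaluation_rows(response_ids=["resp_abc123", "resp_def456"])  # placeholder IDs
for row in rows:
    # Each row carries the reconstructed conversation, the converted tools,
    # and the completion params recorded from the Responses API call.
    print(row.input_metadata.completion_params["model"], len(row.messages))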
7 changes: 7 additions & 0 deletions eval_protocol/models.py
@@ -658,6 +658,13 @@ def last_assistant_message(self) -> Optional[Message]:
return None
return assistant_messages[-1]

def get_first_user_message(self) -> Optional[Message]:
"""Returns the first user message from the conversation. Returns None if none found."""
user_messages = self.get_user_messages()
if not user_messages:
return None
return user_messages[0]

def get_user_messages(self) -> List[Message]:
"""Returns only the user messages from the conversation."""
return [msg for msg in self.messages if msg.role == "user"]
32 changes: 28 additions & 4 deletions eval_protocol/pytest/default_pydantic_ai_rollout_processor.py
@@ -13,7 +13,7 @@
from openai.types.chat import ChatCompletion, ChatCompletionMessage, ChatCompletionMessageParam
from openai.types.chat.chat_completion import Choice as ChatCompletionChoice
from pydantic import TypeAdapter
from pydantic_ai import Agent
from pydantic_ai import Agent, ModelSettings
from pydantic_ai._utils import generate_tool_call_id
from pydantic_ai.messages import ModelMessage
from pydantic_ai.messages import (
@@ -22,7 +22,7 @@
ToolReturnPart,
UserPromptPart,
)
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.models.openai import OpenAIChatModel, OpenAIResponsesModel
from pydantic_ai.providers.openai import OpenAIProvider

logger = logging.getLogger(__name__)
@@ -46,7 +46,6 @@ def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) ->
"""Create agent rollout tasks and return them for external handling."""

semaphore = config.semaphore

agent = self._setup_agent(config)

async def process_row(row: EvaluationRow) -> EvaluationRow:
@@ -70,7 +69,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
row.tools = tools

model_messages = [self.convert_ep_message_to_pyd_message(m, row) for m in row.messages]
response = await agent.run(message_history=model_messages, usage_limits=config.kwargs.get("usage_limits"))
settings = self.construct_model_settings(agent, row)
response = await agent.run(
message_history=model_messages, usage_limits=config.kwargs.get("usage_limits"), model_settings=settings
)
row.messages = await self.convert_pyd_message_to_ep_message(response.all_messages())

# TODO: pydantic-ai accumulates usage info across all models in a multi-agent setup, so this simple tracking doesn't work for cost. To discuss with @dphuang2 when he's back.
@@ -98,6 +100,28 @@ async def convert_pyd_message_to_ep_message(self, messages: list[ModelMessage]) ->
oai_messages: list[ChatCompletionMessageParam] = await self._util._map_messages(messages)
return [Message(**m) for m in oai_messages] # pyright: ignore[reportArgumentType]

def construct_model_settings(self, agent: Agent, row: EvaluationRow) -> ModelSettings:
model = agent.model
settings = None
if model and not isinstance(model, str) and model.settings:
# Copy the model settings so concurrent rollouts don't mutate the same object in place
settings = model.settings.copy()
if settings is None:
settings = ModelSettings()
settings["extra_body"] = settings.get("extra_body", {})
extra_body = settings["extra_body"]

# Only store metadata for ResponsesModel, not for ChatModel
if isinstance(extra_body, dict) and isinstance(model, OpenAIResponsesModel):
extra_body["metadata"] = settings.get("metadata", {})
extra_body["metadata"]["row_id"] = row.input_metadata.row_id
extra_body["metadata"]["invocation_id"] = row.execution_metadata.invocation_id
extra_body["metadata"]["rollout_id"] = row.execution_metadata.rollout_id
extra_body["metadata"]["run_id"] = row.execution_metadata.run_id
extra_body["metadata"]["experiment_id"] = row.execution_metadata.experiment_id

return settings

def convert_ep_message_to_pyd_message(self, message: Message, row: EvaluationRow) -> ModelMessage:
if message.role == "assistant":
type_adapter = TypeAdapter(ChatCompletionMessage)
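To make the new metadata plumbing concrete, here is a hedged sketch of the settings that construct_model_settings is expected to return for an OpenAIResponsesModel; the identifier values are placeholders, and Chat models receive no metadata block:

from pydantic_ai import ModelSettings

# Illustrative shape only: construct_model_settings copies any existing model
# settings and, for a Responses model, nests the row/run identifiers under
# extra_body["metadata"] so each request can be traced back to its row.
expected: ModelSettings = {
    "extra_body": {
        "metadata": {
            "row_id": "row-123",  # placeholder identifiers
            "invocation_id": "inv-456",
            "rollout_id": "rollout-789",
            "run_id": "run-000",
            "experiment_id": "exp-abc",
        }
    }
}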
17 changes: 10 additions & 7 deletions eval_protocol/pytest/parameterize.py
@@ -33,7 +33,7 @@ def generate_id(self, combo: CombinationTuple) -> str | None:
class DefaultParameterIdGenerator:
"""Default ID generator that creates meaningful IDs from parameter combinations."""

def __init__(self, max_length: int = 50):
def __init__(self, max_length: int = 200):
"""Initialize the ID generator with configuration options.

Args:
@@ -45,13 +45,16 @@ def generate_id(self, combo: CombinationTuple) -> str | None:
"""Generate an ID for a parameter combination."""
dataset, completion_params, messages, rows, evaluation_test_kwargs = combo

# Add model name if available
if completion_params:
model = completion_params.get("model")
if model:
# Extract just the model name, not the full path
model_name = model.split("/")[-1] if "/" in model else model
id_str = f"model-{model_name}"
# Get all string, numeric, and boolean values from completion_params, sorted by key
str_values = []
for key in sorted(completion_params.keys()):
value = completion_params[key]
if isinstance(value, (str, int, float, bool)):
str_values.append(str(value))

if str_values:
id_str = ":".join(str_values)

# Truncate if too long
if len(id_str) > self.max_length:
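To make the new ID scheme concrete, a small sketch of how generate_id now builds parameter IDs (the completion_params values here are examples, not taken from the repository's test suite):

# Example only: mirrors the new logic of joining scalar completion_params
# values sorted by key, instead of using just the model name.
completion_params = {"model": "gpt-4o-mini", "temperature": 0.2, "max_tokens": 256}

str_values = [
    str(completion_params[key])
    for key in sorted(completion_params)
    if isinstance(completion_params[key], (str, int, float, bool))
]
id_str = ":".join(str_values)
print(id_str)  # -> 256:gpt-4o-mini:0.2 (keys sorted: max_tokens, model, temperature)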
4 changes: 4 additions & 0 deletions eval_protocol/quickstart/__init__.py
@@ -0,0 +1,4 @@
from .llm_judge import aha_judge
from .utils import split_multi_turn_rows

__all__ = ["aha_judge"]
8 changes: 6 additions & 2 deletions eval_protocol/quickstart/llm_judge.py
@@ -2,9 +2,11 @@
Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto.
"""

from collections.abc import Awaitable, Callable
import os
from datetime import datetime
from typing import List, Dict, Any, Optional
from typing_extensions import cast
from tqdm import tqdm

import pytest
@@ -55,6 +57,10 @@
mode="all",
)
async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
return await aha_judge(rows)


async def aha_judge(rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro") -> list[EvaluationRow]:
"""
LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons.

@@ -72,8 +78,6 @@ async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
Same rows with updated evaluation_result containing scores and judgments
"""

judge_name = "gemini-2.5-pro" # Edit to which judge you'd like to use. Configs are in utils.py.

if not rows:
print("❌ No evaluation rows provided")
return rows
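Because aha_judge is now a standalone coroutine re-exported from eval_protocol.quickstart, it can also be called outside the pytest entry point. A minimal sketch, assuming the rows have already been loaded elsewhere:

import asyncio

from eval_protocol.quickstart import aha_judge

async def judge_rows(rows):
    # judge_name defaults to "gemini-2.5-pro"; pass another name to swap judges.
    judged = await aha_judge(rows, judge_name="gemini-2.5-pro")
    for row in judged:
        print(row.evaluation_result)

# asyncio.run(judge_rows(rows))  # rows: list[EvaluationRow] prepared beforehand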