1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -88,6 +88,7 @@ jobs:

- name: Run Core Tests with pytest-xdist
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -7,9 +7,9 @@ repos:
rev: v6.0.0
hooks:
- id: trailing-whitespace
exclude: "(^vite-app/|\\.snap$)"
exclude: "(^vite-app/|\\.snap$|\\.ambr$)"
- id: end-of-file-fixer
exclude: "(^vite-app/|\\.snap$)"
exclude: "(^vite-app/|\\.snap$|\\.ambr$)"
- id: check-yaml
- id: check-added-large-files
- id: check-merge-conflict
189 changes: 189 additions & 0 deletions eval_protocol/adapters/openai_responses.py
@@ -0,0 +1,189 @@
"""Langfuse adapter for Eval Protocol.

This adapter allows pulling data from Langfuse deployments and converting it
to EvaluationRow format for use in evaluation pipelines.
"""

from collections.abc import Iterable, Sequence
import logging
from typing import List
from typing_extensions import Any

from openai.pagination import SyncCursorPage
from openai.types.chat.chat_completion_function_tool_param import ChatCompletionFunctionToolParam
from openai.types.chat.chat_completion_message import FunctionCall
from openai.types.responses import Response
from openai.types.responses.response_item import ResponseItem
from openai.types.chat.chat_completion_message_function_tool_call import (
ChatCompletionMessageFunctionToolCall,
Function,
)
from openai.types.responses.tool import Tool

from openai import OpenAI

from eval_protocol.models import EvaluationRow, InputMetadata, Message

logger = logging.getLogger(__name__)


class OpenAIResponsesAdapter:
"""Adapter to pull data from OpenAI Responses API and convert to EvaluationRow format.

This adapter can pull both chat conversations and tool-calling traces from
the OpenAI Responses API and convert them into the EvaluationRow format
expected by the evaluation protocol.

Examples:
Basic usage:
>>> adapter = OpenAIResponsesAdapter(
... api_key="your_api_key",
... )
>>> rows = adapter.get_evaluation_rows(response_ids=["response_id_1", "response_id_2"])
"""

def __init__(self, api_key: str | None = None, base_url: str | None = None):
"""Initialize the OpenAI Responses adapter."""
self.openai = OpenAI(api_key=api_key, base_url=base_url)

def get_evaluation_rows(
self,
response_ids: List[str],
) -> List[EvaluationRow]:
"""Pull responses from OpenAI Responses API and convert to EvaluationRow format.

Args:
response_ids: List of response IDs to fetch
Returns:
List[EvaluationRow]: Converted evaluation rows
"""
eval_rows: list[EvaluationRow] = []

for response_id in response_ids:
input_items = self.openai.responses.input_items.list(response_id=response_id)
response = self.openai.responses.retrieve(response_id=response_id)
eval_rows.append(self._create_evaluation_row(input_items, response))

logger.info(
"Successfully processed %d selected traces into %d evaluation rows", len(response_ids), len(eval_rows)
)
return eval_rows

def _create_evaluation_row(self, input_items: SyncCursorPage[ResponseItem], response: Response) -> EvaluationRow:
"""Convert a response to an evaluation row."""
messages: list[Message] = []
if response.instructions:
if isinstance(response.instructions, list):
raise NotImplementedError("List of instructions is not supported")
else:
messages.append(Message(role="system", content=response.instructions))
messages.extend(self._create_messages(input_items))
if response.output_text:
messages.append(Message(role="assistant", content=response.output_text))
tools = self._responses_tools_to_chat_completion_tools(response.tools)
tool_dicts = [dict(tool) for tool in tools]
return EvaluationRow(
messages=messages,
tools=tool_dicts,
input_metadata=InputMetadata(
completion_params={
"model": response.model,
"temperature": response.temperature,
"max_output_tokens": response.max_output_tokens,
"max_tool_calls": response.max_tool_calls,
"parallel_tool_calls": response.parallel_tool_calls,
"""
We have to manually extract the reasoning effort and summary
from the response.reasoning object because the openai-python
causes an issue with model_dump() which is used for testing.

https://github.com/openai/openai-python/issues/1306#issuecomment-2966267356
"""
"reasoning": {
"effort": response.reasoning.effort,
"summary": response.reasoning.summary,
}
if response.reasoning
else None,
"top_logprobs": response.top_logprobs,
"truncation": response.truncation,
"top_p": response.top_p,
}
),
)

def _responses_tools_to_chat_completion_tools(
self, tools: List[Tool]
) -> Sequence[ChatCompletionFunctionToolParam]:
"""Convert OpenAI Responses API tools to chat completion message function tool calls."""
chat_completion_tools: List[ChatCompletionFunctionToolParam] = []
for tool in tools:
if tool.type == "function":
chat_completion_tools.append(
{
"type": "function",
"function": {
"name": tool.name,
"parameters": tool.parameters or {},
"strict": tool.strict,
"description": tool.description or "",
},
}
)
else:
raise NotImplementedError("Only function tools are supported")
return chat_completion_tools

def _create_messages(self, input_items: SyncCursorPage[ResponseItem]) -> Iterable[Message]:
"""Create messages from input items.

Converts OpenAI Responses API input items to chat completion message format.
Handles different types of response items including messages and tool calls.
Groups parallel tool calls under a single assistant message.
Since we iterate backwards and reverse at the end, tool call outputs should
be added before the assistant message with tool calls.
"""
messages: list[Message] = []
current_tool_calls: list[ChatCompletionMessageFunctionToolCall] = []
tool_call_outputs: list[Message] = []

for item in input_items:
if item.type == "message":
# If we have accumulated tool calls, create an assistant message with them
if current_tool_calls:
# Add tool call outputs first (since we reverse at the end)
messages.extend(tool_call_outputs)
tool_call_outputs = []
# Then add the assistant message with tool calls
messages.append(Message(role="assistant", tool_calls=current_tool_calls))
current_tool_calls = []

# This is a message item (input or output)
content = item.content
for content_item in content:
if content_item.type == "input_text":
text_content = content_item.text
# Create new message
messages.append(Message(role=item.role, content=text_content))
else:
raise NotImplementedError(f"Unsupported content type: {content_item.type}")
elif item.type == "function_call_output":
# Collect tool call outputs to add before assistant message
tool_call_outputs.append(Message(role="tool", content=item.output, tool_call_id=item.call_id))
elif item.type == "function_call":
tool_call = ChatCompletionMessageFunctionToolCall(
id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments)
)
current_tool_calls.append(tool_call)
else:
raise NotImplementedError(f"Unsupported item type: {item.type}")

# If we have remaining tool calls, create an assistant message with them
if current_tool_calls:
# Add tool call outputs first (since we reverse at the end)
messages.extend(tool_call_outputs)
# Then add the assistant message with tool calls
messages.append(Message(role="assistant", tool_calls=current_tool_calls))

return reversed(messages)
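For reviewers, a minimal usage sketch of the new adapter; the response IDs below are placeholders and the printed fields are just one way to inspect the result:

from eval_protocol.adapters.openai_responses import OpenAIResponsesAdapter

# OPENAI_API_KEY is read from the environment when api_key is not passed.
adapter = OpenAIResponsesAdapter()
rows = adapter.get_evaluation_rows(response_ids=["resp_abc123", "resp_def456"])  # placeholder IDs
for row in rows:
    # Each row carries the reconstructed conversation, the converted tools,
    # and the completion params recorded from the Responses API call.
    print(row.input_metadata.completion_params["model"], len(row.messages))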
7 changes: 7 additions & 0 deletions eval_protocol/models.py
@@ -658,6 +658,13 @@ def last_assistant_message(self) -> Optional[Message]:
return None
return assistant_messages[-1]

def get_first_user_message(self) -> Optional[Message]:
"""Returns the first user message from the conversation. Returns None if none found."""
user_messages = self.get_user_messages()
if not user_messages:
return None
return user_messages[0]

def get_user_messages(self) -> List[Message]:
"""Returns only the user messages from the conversation."""
return [msg for msg in self.messages if msg.role == "user"]
32 changes: 28 additions & 4 deletions eval_protocol/pytest/default_pydantic_ai_rollout_processor.py
@@ -13,7 +13,7 @@
from openai.types.chat import ChatCompletion, ChatCompletionMessage, ChatCompletionMessageParam
from openai.types.chat.chat_completion import Choice as ChatCompletionChoice
from pydantic import TypeAdapter
from pydantic_ai import Agent
from pydantic_ai import Agent, ModelSettings
from pydantic_ai._utils import generate_tool_call_id
from pydantic_ai.messages import ModelMessage
from pydantic_ai.messages import (
@@ -22,7 +22,7 @@
ToolReturnPart,
UserPromptPart,
)
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.models.openai import OpenAIChatModel, OpenAIResponsesModel
from pydantic_ai.providers.openai import OpenAIProvider

logger = logging.getLogger(__name__)
@@ -46,7 +46,6 @@ def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) ->
"""Create agent rollout tasks and return them for external handling."""

semaphore = config.semaphore

agent = self._setup_agent(config)

async def process_row(row: EvaluationRow) -> EvaluationRow:
@@ -70,7 +69,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
row.tools = tools

model_messages = [self.convert_ep_message_to_pyd_message(m, row) for m in row.messages]
response = await agent.run(message_history=model_messages, usage_limits=config.kwargs.get("usage_limits"))
settings = self.construct_model_settings(agent, row)
response = await agent.run(
message_history=model_messages, usage_limits=config.kwargs.get("usage_limits"), model_settings=settings
)
row.messages = await self.convert_pyd_message_to_ep_message(response.all_messages())

# TODO: pydantic-ai accumulates usage info across all models in a multi-agent setup, so this simple tracking doesn't work for cost. To discuss with @dphuang2 when he's back.
@@ -98,6 +100,28 @@ async def convert_pyd_message_to_ep_message(self, messages: list[ModelMessage]) ->
oai_messages: list[ChatCompletionMessageParam] = await self._util._map_messages(messages)
return [Message(**m) for m in oai_messages] # pyright: ignore[reportArgumentType]

def construct_model_settings(self, agent: Agent, row: EvaluationRow) -> ModelSettings:
model = agent.model
settings = None
if model and not isinstance(model, str) and model.settings:
# Copy the model settings so concurrent rollouts don't mutate the same object in place
settings = model.settings.copy()
if settings is None:
settings = ModelSettings()
settings["extra_body"] = settings.get("extra_body", {})
extra_body = settings["extra_body"]

# Only store metadata for ResponsesModel, not for ChatModel
if isinstance(extra_body, dict) and isinstance(model, OpenAIResponsesModel):
extra_body["metadata"] = settings.get("metadata", {})
extra_body["metadata"]["row_id"] = row.input_metadata.row_id
extra_body["metadata"]["invocation_id"] = row.execution_metadata.invocation_id
extra_body["metadata"]["rollout_id"] = row.execution_metadata.rollout_id
extra_body["metadata"]["run_id"] = row.execution_metadata.run_id
extra_body["metadata"]["experiment_id"] = row.execution_metadata.experiment_id

return settings

def convert_ep_message_to_pyd_message(self, message: Message, row: EvaluationRow) -> ModelMessage:
if message.role == "assistant":
type_adapter = TypeAdapter(ChatCompletionMessage)
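To make the new metadata plumbing concrete, here is a hedged sketch of the settings that construct_model_settings is expected to return for an OpenAIResponsesModel; the identifier values are placeholders, and Chat models receive no metadata block:

from pydantic_ai import ModelSettings

# Illustrative shape only: construct_model_settings copies any existing model
# settings and, for a Responses model, nests the row/run identifiers under
# extra_body["metadata"] so each request can be traced back to its row.
expected: ModelSettings = {
    "extra_body": {
        "metadata": {
            "row_id": "row-123",  # placeholder identifiers
            "invocation_id": "inv-456",
            "rollout_id": "rollout-789",
            "run_id": "run-000",
            "experiment_id": "exp-abc",
        }
    }
}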
17 changes: 10 additions & 7 deletions eval_protocol/pytest/parameterize.py
@@ -33,7 +33,7 @@ def generate_id(self, combo: CombinationTuple) -> str | None:
class DefaultParameterIdGenerator:
"""Default ID generator that creates meaningful IDs from parameter combinations."""

def __init__(self, max_length: int = 50):
def __init__(self, max_length: int = 200):
"""Initialize the ID generator with configuration options.

Args:
@@ -45,13 +45,16 @@ def generate_id(self, combo: CombinationTuple) -> str | None:
"""Generate an ID for a parameter combination."""
dataset, completion_params, messages, rows, evaluation_test_kwargs = combo

# Add model name if available
if completion_params:
model = completion_params.get("model")
if model:
# Extract just the model name, not the full path
model_name = model.split("/")[-1] if "/" in model else model
id_str = f"model-{model_name}"
# Get all string, numeric, and boolean values from completion_params, sorted by key
str_values = []
for key in sorted(completion_params.keys()):
value = completion_params[key]
if isinstance(value, (str, int, float, bool)):
str_values.append(str(value))

if str_values:
id_str = ":".join(str_values)

# Truncate if too long
if len(id_str) > self.max_length:
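To make the new ID scheme concrete, a small sketch of how generate_id now builds parameter IDs (the completion_params values here are examples, not taken from the repository's test suite):

# Example only: mirrors the new logic of joining scalar completion_params
# values sorted by key, instead of using just the model name.
completion_params = {"model": "gpt-4o-mini", "temperature": 0.2, "max_tokens": 256}

str_values = [
    str(completion_params[key])
    for key in sorted(completion_params)
    if isinstance(completion_params[key], (str, int, float, bool))
]
id_str = ":".join(str_values)
print(id_str)  # -> 256:gpt-4o-mini:0.2 (keys sorted: max_tokens, model, temperature)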
4 changes: 4 additions & 0 deletions eval_protocol/quickstart/__init__.py
@@ -0,0 +1,4 @@
from .llm_judge import aha_judge
from .utils import split_multi_turn_rows

__all__ = ["aha_judge"]
8 changes: 6 additions & 2 deletions eval_protocol/quickstart/llm_judge.py
@@ -2,9 +2,11 @@
Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto.
"""

from collections.abc import Awaitable, Callable
import os
from datetime import datetime
from typing import List, Dict, Any, Optional
from typing_extensions import cast
from tqdm import tqdm

import pytest
@@ -55,6 +57,10 @@
mode="all",
)
async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
return await aha_judge(rows)


async def aha_judge(rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro") -> list[EvaluationRow]:
"""
LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons.

@@ -72,8 +78,6 @@ async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
Same rows with updated evaluation_result containing scores and judgments
"""

judge_name = "gemini-2.5-pro" # Edit to which judge you'd like to use. Configs are in utils.py.

if not rows:
print("❌ No evaluation rows provided")
return rows
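Because aha_judge is now a standalone coroutine re-exported from eval_protocol.quickstart, it can also be called outside the pytest entry point. A minimal sketch, assuming the rows have already been loaded elsewhere:

import asyncio

from eval_protocol.quickstart import aha_judge

async def judge_rows(rows):
    # judge_name defaults to "gemini-2.5-pro"; pass another name to swap judges.
    judged = await aha_judge(rows, judge_name="gemini-2.5-pro")
    for row in judged:
        print(row.evaluation_result)

# asyncio.run(judge_rows(rows))  # rows: list[EvaluationRow] prepared beforehand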