Skip to content

Commit 71faaf6

Browse files
author
Dylan Huang
authored
Responses api part 2 (#174)
* try reasoning effort high * task 7 * tasks 8-12 * generically come up with IDs * fix folder name * save * fix metadata storing logic based on chatcompletions vs. responses for pydantic because of "metadata perameter is only allowed when 'store' is enabled" and pydantic has no option to set "store" = True * Add method to retrieve the first user message in EvaluationRow class * better test name * better LLM judge * fix llm_judge_langsmith.py * Update run_judgment_async to use AsyncOpenAI client for improved async handling * snapshot test the simple example * update snapshot * fix handling of parallel tool calls * update snapshot * add pyright ignore statements * add __init__ and export judge as aha_judge * properly convert responses tools to chat completion tools * update snapshots * remove duplicate test * add responses example * add OPENAI_API_KEY to testing workflow
1 parent df0fc3a commit 71faaf6

31 files changed

+2668
-55
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ jobs:
8888

8989
- name: Run Core Tests with pytest-xdist
9090
env:
91+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
9192
E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
9293
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
9394
FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ repos:
77
rev: v6.0.0
88
hooks:
99
- id: trailing-whitespace
10-
exclude: "(^vite-app/|\\.snap$)"
10+
exclude: "(^vite-app/|\\.snap$|\\.ambr$)"
1111
- id: end-of-file-fixer
12-
exclude: "(^vite-app/|\\.snap$)"
12+
exclude: "(^vite-app/|\\.snap$|\\.ambr$)"
1313
- id: check-yaml
1414
- id: check-added-large-files
1515
- id: check-merge-conflict
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
"""Langfuse adapter for Eval Protocol.
2+
3+
This adapter allows pulling data from the OpenAI Responses API and converting it
4+
to EvaluationRow format for use in evaluation pipelines.
5+
"""
6+
7+
from collections.abc import Iterable, Sequence
8+
import logging
9+
from typing import List
10+
from typing_extensions import Any
11+
12+
from openai.pagination import SyncCursorPage
13+
from openai.types.chat.chat_completion_function_tool_param import ChatCompletionFunctionToolParam
14+
from openai.types.chat.chat_completion_message import FunctionCall
15+
from openai.types.responses import Response
16+
from openai.types.responses.response_item import ResponseItem
17+
from openai.types.chat.chat_completion_message_function_tool_call import (
18+
ChatCompletionMessageFunctionToolCall,
19+
Function,
20+
)
21+
from openai.types.responses.tool import Tool
22+
23+
from eval_protocol.models import EvaluationRow, InputMetadata, Message
24+
25+
logger = logging.getLogger(__name__)
26+
27+
28+
from openai import OpenAI
29+
30+
31+
class OpenAIResponsesAdapter:
    """Adapter that pulls data from the OpenAI Responses API and converts it to EvaluationRow format.

    The adapter fetches stored responses (and their input items) by ID and
    converts both chat conversations and tool-calling traces into the
    EvaluationRow format expected by the evaluation protocol.

    Examples:
        Basic usage:
        >>> adapter = OpenAIResponsesAdapter(
        ...     api_key="your_api_key",
        ... )
        >>> rows = adapter.get_evaluation_rows(response_ids=["response_id_1", "response_id_2"])
    """

    def __init__(self, api_key: str | None = None, base_url: str | None = None):
        """Initialize the OpenAI Responses adapter.

        Args:
            api_key: OpenAI API key; when None the client uses its own
                defaults (e.g. the OPENAI_API_KEY environment variable).
            base_url: Optional override for the OpenAI API base URL.
        """
        self.openai = OpenAI(api_key=api_key, base_url=base_url)

    def get_evaluation_rows(
        self,
        response_ids: List[str],
    ) -> List[EvaluationRow]:
        """Pull responses from the OpenAI Responses API and convert to EvaluationRow format.

        Args:
            response_ids: List of response IDs to fetch.

        Returns:
            List[EvaluationRow]: Converted evaluation rows, one per response ID.
        """
        eval_rows: list[EvaluationRow] = []

        for response_id in response_ids:
            input_items = self.openai.responses.input_items.list(response_id=response_id)
            response = self.openai.responses.retrieve(response_id=response_id)
            eval_rows.append(self._create_evaluation_row(input_items, response))

        logger.info(
            "Successfully processed %d selected traces into %d evaluation rows", len(response_ids), len(eval_rows)
        )
        return eval_rows

    def _create_evaluation_row(self, input_items: SyncCursorPage[ResponseItem], response: Response) -> EvaluationRow:
        """Convert a single response plus its input items to an evaluation row.

        Raises:
            NotImplementedError: If ``response.instructions`` is a list (only a
                single system-instruction string is supported).
        """
        messages: list[Message] = []
        if response.instructions:
            if isinstance(response.instructions, list):
                raise NotImplementedError("List of instructions is not supported")
            else:
                messages.append(Message(role="system", content=response.instructions))
        messages.extend(self._create_messages(input_items))
        if response.output_text:
            messages.append(Message(role="assistant", content=response.output_text))
        tools = self._responses_tools_to_chat_completion_tools(response.tools)
        tool_dicts = [dict(tool) for tool in tools]
        return EvaluationRow(
            messages=messages,
            tools=tool_dicts,
            input_metadata=InputMetadata(
                completion_params={
                    "model": response.model,
                    "temperature": response.temperature,
                    "max_output_tokens": response.max_output_tokens,
                    "max_tool_calls": response.max_tool_calls,
                    "parallel_tool_calls": response.parallel_tool_calls,
                    # We manually extract effort/summary from response.reasoning
                    # because openai-python's reasoning object breaks
                    # model_dump(), which is used for testing:
                    # https://github.com/openai/openai-python/issues/1306#issuecomment-2966267356
                    # NOTE: this explanation must stay a '#' comment. The
                    # original code used a triple-quoted string here, which
                    # implicitly concatenated with the "reasoning" key literal
                    # below and corrupted the dict key.
                    "reasoning": {
                        "effort": response.reasoning.effort,
                        "summary": response.reasoning.summary,
                    }
                    if response.reasoning
                    else None,
                    "top_logprobs": response.top_logprobs,
                    "truncation": response.truncation,
                    "top_p": response.top_p,
                }
            ),
        )

    def _responses_tools_to_chat_completion_tools(
        self, tools: List[Tool]
    ) -> Sequence[ChatCompletionFunctionToolParam]:
        """Convert OpenAI Responses API tools to chat-completion function tool params.

        Raises:
            NotImplementedError: If any tool is not a function tool.
        """
        chat_completion_tools: List[ChatCompletionFunctionToolParam] = []
        for tool in tools:
            if tool.type == "function":
                chat_completion_tools.append(
                    {
                        "type": "function",
                        "function": {
                            "name": tool.name,
                            "parameters": tool.parameters or {},
                            "strict": tool.strict,
                            "description": tool.description or "",
                        },
                    }
                )
            else:
                raise NotImplementedError("Only function tools are supported")
        return chat_completion_tools

    def _create_messages(self, input_items: SyncCursorPage[ResponseItem]) -> Iterable[Message]:
        """Create messages from input items.

        Converts OpenAI Responses API input items to chat completion message
        format, handling message items, function calls, and function-call
        outputs. Parallel tool calls are grouped under a single assistant
        message. The page iterates items newest-first, so we accumulate in
        that order and reverse at the end; consequently tool-call outputs are
        appended *before* the assistant message that carries the tool calls.

        Raises:
            NotImplementedError: For unsupported item or content types.
        """
        messages: list[Message] = []
        current_tool_calls: list[ChatCompletionMessageFunctionToolCall] = []
        tool_call_outputs: list[Message] = []

        for item in input_items:
            if item.type == "message":
                # If we have accumulated tool calls, flush them as a single
                # assistant message.
                if current_tool_calls:
                    # Outputs go first because the whole list is reversed below.
                    messages.extend(tool_call_outputs)
                    tool_call_outputs = []
                    messages.append(Message(role="assistant", tool_calls=current_tool_calls))
                    current_tool_calls = []

                # This is a message item (input or output).
                content = item.content
                for content_item in content:
                    if content_item.type == "input_text":
                        text_content = content_item.text
                        messages.append(Message(role=item.role, content=text_content))
                    else:
                        raise NotImplementedError(f"Unsupported content type: {content_item.type}")
            elif item.type == "function_call_output":
                # Collect tool call outputs to add before the assistant message.
                tool_call_outputs.append(Message(role="tool", content=item.output, tool_call_id=item.call_id))
            elif item.type == "function_call":
                tool_call = ChatCompletionMessageFunctionToolCall(
                    id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments)
                )
                current_tool_calls.append(tool_call)
            else:
                raise NotImplementedError(f"Unsupported item type: {item.type}")

        # Flush any remaining tool calls into a trailing assistant message.
        if current_tool_calls:
            messages.extend(tool_call_outputs)
            messages.append(Message(role="assistant", tool_calls=current_tool_calls))

        return reversed(messages)

eval_protocol/models.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,13 @@ def last_assistant_message(self) -> Optional[Message]:
658658
return None
659659
return assistant_messages[-1]
660660

661+
def get_first_user_message(self) -> Optional[Message]:
    """Return the first user message in the conversation, or ``None`` when there is none."""
    return next(iter(self.get_user_messages()), None)
667+
661668
def get_user_messages(self) -> List[Message]:
    """Return only the messages in the conversation whose role is ``user``."""
    return [message for message in self.messages if message.role == "user"]

eval_protocol/pytest/default_pydantic_ai_rollout_processor.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from openai.types.chat import ChatCompletion, ChatCompletionMessage, ChatCompletionMessageParam
1414
from openai.types.chat.chat_completion import Choice as ChatCompletionChoice
1515
from pydantic import TypeAdapter
16-
from pydantic_ai import Agent
16+
from pydantic_ai import Agent, ModelSettings
1717
from pydantic_ai._utils import generate_tool_call_id
1818
from pydantic_ai.messages import ModelMessage
1919
from pydantic_ai.messages import (
@@ -22,7 +22,7 @@
2222
ToolReturnPart,
2323
UserPromptPart,
2424
)
25-
from pydantic_ai.models.openai import OpenAIChatModel
25+
from pydantic_ai.models.openai import OpenAIChatModel, OpenAIResponsesModel
2626
from pydantic_ai.providers.openai import OpenAIProvider
2727

2828
logger = logging.getLogger(__name__)
@@ -46,7 +46,6 @@ def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) ->
4646
"""Create agent rollout tasks and return them for external handling."""
4747

4848
semaphore = config.semaphore
49-
5049
agent = self._setup_agent(config)
5150

5251
async def process_row(row: EvaluationRow) -> EvaluationRow:
@@ -70,7 +69,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
7069
row.tools = tools
7170

7271
model_messages = [self.convert_ep_message_to_pyd_message(m, row) for m in row.messages]
73-
response = await agent.run(message_history=model_messages, usage_limits=config.kwargs.get("usage_limits"))
72+
settings = self.construct_model_settings(agent, row)
73+
response = await agent.run(
74+
message_history=model_messages, usage_limits=config.kwargs.get("usage_limits"), model_settings=settings
75+
)
7476
row.messages = await self.convert_pyd_message_to_ep_message(response.all_messages())
7577

7678
# TODO: pydantic ai accumulates usage info across all models in multi-agent setup, so this simple tracking doesn't work for cost. to discuss with @dphuang2 when he's back.
@@ -98,6 +100,28 @@ async def convert_pyd_message_to_ep_message(self, messages: list[ModelMessage])
98100
oai_messages: list[ChatCompletionMessageParam] = await self._util._map_messages(messages)
99101
return [Message(**m) for m in oai_messages] # pyright: ignore[reportArgumentType]
100102

103+
def construct_model_settings(self, agent: Agent, row: EvaluationRow) -> ModelSettings:
    """Build per-run ModelSettings, attaching EP tracking metadata via ``extra_body``.

    Metadata (row/invocation/rollout/run/experiment IDs) is only attached for
    OpenAIResponsesModel: the Chat Completions path rejects the "metadata"
    parameter unless ``store=True`` is enabled, and pydantic-ai has no option
    to set ``store=True``.

    Args:
        agent: The pydantic-ai agent whose model settings seed the result.
        row: The evaluation row supplying the tracking IDs.

    Returns:
        ModelSettings: A settings object safe to mutate per-run.
    """
    model = agent.model
    settings = None
    if model and not isinstance(model, str) and model.settings:
        # Copy so concurrent rollouts don't mutate the shared settings object.
        settings = model.settings.copy()
    if settings is None:
        settings = ModelSettings()
    # The copy above is shallow, so the nested extra_body dict would still be
    # shared across concurrent rows; copy it too before mutating.
    extra_body = settings.get("extra_body", {})
    if isinstance(extra_body, dict):
        extra_body = dict(extra_body)
    settings["extra_body"] = extra_body

    # Only store metadata for ResponsesModel, not for ChatModel.
    if isinstance(extra_body, dict) and isinstance(model, OpenAIResponsesModel):
        # Preserve metadata already present in extra_body (it lives under
        # extra_body["metadata"], not at the top level of settings), and copy
        # it so concurrent rows never share the same metadata dict.
        metadata = dict(extra_body.get("metadata") or {})
        metadata["row_id"] = row.input_metadata.row_id
        metadata["invocation_id"] = row.execution_metadata.invocation_id
        metadata["rollout_id"] = row.execution_metadata.rollout_id
        metadata["run_id"] = row.execution_metadata.run_id
        metadata["experiment_id"] = row.execution_metadata.experiment_id
        extra_body["metadata"] = metadata

    return settings
124+
101125
def convert_ep_message_to_pyd_message(self, message: Message, row: EvaluationRow) -> ModelMessage:
102126
if message.role == "assistant":
103127
type_adapter = TypeAdapter(ChatCompletionMessage)

eval_protocol/pytest/parameterize.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def generate_id(self, combo: CombinationTuple) -> str | None:
3333
class DefaultParameterIdGenerator:
3434
"""Default ID generator that creates meaningful IDs from parameter combinations."""
3535

36-
def __init__(self, max_length: int = 50):
36+
def __init__(self, max_length: int = 200):
3737
"""Initialize the ID generator with configuration options.
3838
3939
Args:
@@ -45,13 +45,16 @@ def generate_id(self, combo: CombinationTuple) -> str | None:
4545
"""Generate an ID for a parameter combination."""
4646
dataset, completion_params, messages, rows, evaluation_test_kwargs = combo
4747

48-
# Add model name if available
4948
if completion_params:
50-
model = completion_params.get("model")
51-
if model:
52-
# Extract just the model name, not the full path
53-
model_name = model.split("/")[-1] if "/" in model else model
54-
id_str = f"model-{model_name}"
49+
# Get all string, numeric, and boolean values from completion_params, sorted by key
50+
str_values = []
51+
for key in sorted(completion_params.keys()):
52+
value = completion_params[key]
53+
if isinstance(value, (str, int, float, bool)):
54+
str_values.append(str(value))
55+
56+
if str_values:
57+
id_str = ":".join(str_values)
5558

5659
# Truncate if too long
5760
if len(id_str) > self.max_length:
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from .llm_judge import aha_judge
from .utils import split_multi_turn_rows

# Declare the package's public API; split_multi_turn_rows is imported for
# re-export, so it must appear in __all__ (otherwise the import is flagged
# as unused and star-imports would not expose it).
__all__ = ["aha_judge", "split_multi_turn_rows"]

eval_protocol/quickstart/llm_judge.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto.
33
"""
44

5+
from collections.abc import Awaitable, Callable
56
import os
67
from datetime import datetime
78
from typing import List, Dict, Any, Optional
9+
from typing_extensions import cast
810
from tqdm import tqdm
911

1012
import pytest
@@ -55,6 +57,10 @@
5557
mode="all",
5658
)
5759
async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
60+
return await aha_judge(rows)
61+
62+
63+
async def aha_judge(rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro") -> list[EvaluationRow]:
5864
"""
5965
LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons.
6066
@@ -72,8 +78,6 @@ async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
7278
Same rows with updated evaluation_result containing scores and judgments
7379
"""
7480

75-
judge_name = "gemini-2.5-pro" # Edit to which judge you'd like to use. Configs are in utils.py.
76-
7781
if not rows:
7882
print("❌ No evaluation rows provided")
7983
return rows

0 commit comments

Comments
 (0)