Skip to content

Commit f585ce0

Browse files
author
Dylan Huang
committed
add responses example
1 parent 7175428 commit f585ce0

File tree

1 file changed

+55
-0
lines changed

1 file changed

+55
-0
lines changed
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
"""
2+
LLM Judge quickstart that PULLS DATA FROM OpenAI Responses API and persists results locally via Eval Protocol.
3+
4+
This mirrors `eval_protocol/quickstart/llm_judge.py` (Langfuse source), but uses
5+
OpenAI Responses API as the source of evaluation rows.
6+
7+
Env vars:
8+
export OPENAI_API_KEY=... # required to fetch examples
9+
10+
Judge model keys:
11+
- Default judge is "gemini-2.5-pro" from utils; requires GEMINI_API_KEY
12+
- Or set judge in the code to "gpt-4.1" and export OPENAI_API_KEY
13+
14+
Run:
15+
pytest python-sdk/eval_protocol/quickstart/llm_judge_openai_responses.py -q -s
16+
"""
17+
18+
import os
19+
from typing import List
20+
21+
import pytest
22+
23+
from eval_protocol.models import EvaluationRow
24+
from eval_protocol.pytest import evaluation_test
25+
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
26+
from eval_protocol.quickstart import aha_judge, split_multi_turn_rows
27+
from eval_protocol.adapters.openai_responses import OpenAIResponsesAdapter
28+
29+
# IDs of stored OpenAI Responses to pull in as evaluation rows.
_RESPONSE_IDS = [
    "resp_0e1b7db5d96e92470068c99506443c819e9305e92915d2405f",
    "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c",
]

# Fetch the evaluation dataset from the OpenAI Responses API
# (requires OPENAI_API_KEY, per the module docstring).
adapter = OpenAIResponsesAdapter()
input_rows = adapter.get_evaluation_rows(response_ids=_RESPONSE_IDS)
36+
37+
38+
# Skipped in CI: the eval needs live API keys and network access.
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") # pyright: ignore[reportAttributeAccessIssue]
@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue]
@evaluation_test(
    # One dataset: the rows pulled from the OpenAI Responses API at module level.
    input_rows=[input_rows],
    # Rollouts are produced against each model listed here.
    completion_params=[
        {
            "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
        },
        {
            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
        },
    ],
    # Single-turn rollouts: one completion per row.
    rollout_processor=SingleTurnRolloutProcessor(),
    # Splits multi-turn conversations into individual rows before rollout.
    preprocess_fn=split_multi_turn_rows,
    # NOTE(review): presumably runs every dataset/model combination —
    # confirm against the `evaluation_test` docs.
    mode="all",
)
async def test_llm_judge_openai_responses(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    """Delegate scoring of the rolled-out rows to the `aha_judge` LLM judge.

    Args:
        rows: Evaluation rows produced by the rollout processor.

    Returns:
        The rows as returned by `aha_judge` (scored by the judge model).
    """
    return await aha_judge(rows)

0 commit comments

Comments
 (0)