|
| 1 | +""" |
| 2 | +LLM Judge quickstart that PULLS DATA FROM OpenAI Responses API and persists results locally via Eval Protocol. |
| 3 | +
|
| 4 | +This mirrors `eval_protocol/quickstart/llm_judge.py` (Langfuse source), but uses |
| 5 | +OpenAI Responses API as the source of evaluation rows. |
| 6 | +
|
| 7 | +Env vars: |
| 8 | + export OPENAI_API_KEY=... # required to fetch examples |
| 9 | +
|
| 10 | +Judge model keys: |
| 11 | + - Default judge is "gemini-2.5-pro" from utils; requires GEMINI_API_KEY |
| 12 | + - Or set judge in the code to "gpt-4.1" and export OPENAI_API_KEY |
| 13 | +
|
| 14 | +Run: |
| 15 | + pytest python-sdk/eval_protocol/quickstart/llm_judge_openai_responses.py -q -s |
| 16 | +""" |
| 17 | + |
| 18 | +import os |
| 19 | +from typing import List |
| 20 | + |
| 21 | +import pytest |
| 22 | + |
| 23 | +from eval_protocol.models import EvaluationRow |
| 24 | +from eval_protocol.pytest import evaluation_test |
| 25 | +from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor |
| 26 | +from eval_protocol.quickstart import aha_judge, split_multi_turn_rows |
| 27 | +from eval_protocol.adapters.openai_responses import OpenAIResponsesAdapter |
| 28 | + |
# IDs of previously-created OpenAI Responses to evaluate. Named here so the
# fetch call below stays short and the fixture IDs are easy to swap out.
_RESPONSE_IDS = [
    "resp_0e1b7db5d96e92470068c99506443c819e9305e92915d2405f",
    "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c",
]

# NOTE(review): this fetch runs at import time and presumably hits the OpenAI
# API (requires OPENAI_API_KEY) — confirm that is intended for this quickstart.
adapter = OpenAIResponsesAdapter()
input_rows = adapter.get_evaluation_rows(response_ids=_RESPONSE_IDS)
| 36 | + |
| 37 | + |
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")  # pyright: ignore[reportAttributeAccessIssue]
@pytest.mark.asyncio  # pyright: ignore[reportAttributeAccessIssue]
@evaluation_test(
    input_rows=[input_rows],
    completion_params=[
        {"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1"},
        {"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"},
    ],
    rollout_processor=SingleTurnRolloutProcessor(),
    preprocess_fn=split_multi_turn_rows,
    mode="all",
)
async def test_llm_judge_openai_responses(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    """Score each rolled-out row with the shared aha-moment LLM judge.

    The heavy lifting (fetching rows, rollouts, preprocessing) is handled by
    the `evaluation_test` decorator; this body only applies the judge.
    """
    judged = await aha_judge(rows)
    return judged
0 commit comments