Skip to content

Commit 549b396

Browse files
committed
ok wtf
1 parent 96b0d34 commit 549b396

File tree

1 file changed

+26
-18
lines changed

1 file changed

+26
-18
lines changed

eval_protocol/quickstart/llm_judge.py

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
import pytest
1010

11-
from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult
11+
from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult, Message
1212
from eval_protocol.pytest import evaluation_test
1313
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
1414
from eval_protocol.quickstart.utils import (
@@ -23,36 +23,44 @@
2323
from openai import AsyncOpenAI
2424

2525

26+
def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """Convert raw AIME2025 JSONL records into ``EvaluationRow`` objects.

    Each record is expected to carry a ``"question"`` and an ``"answer"``
    key (the AIME2025 JSONL schema); missing keys degrade gracefully to an
    empty question / ``None`` ground truth rather than raising.

    Args:
        rows: Raw dataset records, e.g. parsed from the
            ``opencompass/AIME2025`` JSONL files.

    Returns:
        One ``EvaluationRow`` per input record, containing a system + user
        message pair and the stringified answer as ``ground_truth``.
    """
    converted: List[EvaluationRow] = []
    for r in rows:
        question = r.get("question", "")
        # .get() already defaults to None; no explicit default needed.
        answer = r.get("answer")
        messages = [
            # Fix: the previous system prompt was a meaningless placeholder
            # ("hi"); give the model an actual instruction suited to
            # AIME-style competition problems.
            Message(
                role="system",
                content=(
                    "You are a careful competition mathematician. "
                    "Solve the problem and state the final numeric answer."
                ),
            ),
            # Coerce to str defensively: JSONL fields may deserialize as
            # non-string types.
            Message(role="user", content=str(question)),
        ]
        converted.append(
            EvaluationRow(
                messages=messages,
                ground_truth=str(answer) if answer is not None else None,
            )
        )
    return converted
37+
38+
2639
@pytest.mark.asyncio
2740
@evaluation_test(
28-
input_rows=[
29-
fetch_langfuse_traces_as_evaluation_rows(
30-
hours_back=24,
31-
limit=1,
32-
page_size=10,
33-
sleep_between_gets=3.0,
34-
max_retries=5,
35-
)
41+
input_dataset=[
42+
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
43+
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
3644
],
45+
dataset_adapter=aime2025_dataset_adapter,
3746
completion_params=[
38-
# {
39-
# "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
40-
# },
41-
{"model": "gpt-4.1"},
4247
{
4348
"max_tokens": 131000,
44-
"extra_body": {"reasoning_effort": "medium"},
49+
"extra_body": {"reasoning_effort": "low"},
4550
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
4651
},
4752
{
4853
"max_tokens": 131000,
49-
"extra_body": {"reasoning_effort": "low"},
50-
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
54+
"extra_body": {"reasoning_effort": "medium"},
55+
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
5156
},
5257
],
5358
rollout_processor=SingleTurnRolloutProcessor(),
54-
# preprocess_fn=split_multi_turn_rows,
55-
max_concurrent_rollouts=64,
59+
aggregation_method="mean",
60+
passed_threshold=0.8,
61+
num_runs=1,
62+
max_dataset_rows=1,
63+
max_concurrent_rollouts=4,
5664
mode="pointwise",
5765
)
5866
async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:

0 commit comments

Comments
 (0)