Skip to content

Commit 009b0bb

Browse files
committed
same as aime now
1 parent 088dea6 commit 009b0bb

File tree

1 file changed

+6
-10
lines changed

1 file changed

+6
-10
lines changed

eval_protocol/quickstart/llm_judge.py

Lines changed: 6 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -39,34 +39,30 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
39 39
return converted
40 40

41 41

42-
@pytest.mark.asyncio
43 42
@evaluation_test(
44 43
input_dataset=[
45 44
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
46 45
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
47 46
],
48 47
dataset_adapter=aime2025_dataset_adapter,
49 48
completion_params=[
50-
# {
51-
# "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
52-
# },
53-
{"model": "gpt-4.1"},
54 49
{
55 50
"max_tokens": 131000,
56-
"extra_body": {"reasoning_effort": "medium"},
51+
"extra_body": {"reasoning_effort": "low"},
57 52
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
58 53
},
59 54
{
60 55
"max_tokens": 131000,
61-
"extra_body": {"reasoning_effort": "low"},
62-
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
56+
"extra_body": {"reasoning_effort": "medium"},
57+
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
63 58
},
64 59
],
65 60
rollout_processor=SingleTurnRolloutProcessor(),
66-
# preprocess_fn=split_multi_turn_rows,
61+
aggregation_method="mean",
62+
passed_threshold=0.8,
67 63
num_runs=1,
68 64
max_dataset_rows=1,
69-
max_concurrent_rollouts=64,
65+
max_concurrent_rollouts=4,
70 66
mode="pointwise",
71 67
)
72 68
async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:

0 commit comments

Comments
 (0)