Skip to content

Commit 8c62b6b

Browse files
committed
try something else
1 parent 009b0bb commit 8c62b6b

File tree

1 file changed

+9
-4
lines changed

1 file changed

+9
-4
lines changed

eval_protocol/quickstart/llm_judge.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -39,30 +39,35 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
3939
return converted
4040

4141

42+
@pytest.mark.asyncio
4243
@evaluation_test(
4344
input_dataset=[
4445
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
4546
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
4647
],
4748
dataset_adapter=aime2025_dataset_adapter,
4849
completion_params=[
50+
# {
51+
# "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
52+
# },
53+
{"model": "gpt-4.1"},
4954
{
5055
"max_tokens": 131000,
51-
"extra_body": {"reasoning_effort": "low"},
56+
"extra_body": {"reasoning_effort": "medium"},
5257
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
5358
},
5459
{
5560
"max_tokens": 131000,
56-
"extra_body": {"reasoning_effort": "medium"},
57-
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
61+
"extra_body": {"reasoning_effort": "low"},
62+
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
5863
},
5964
],
6065
rollout_processor=SingleTurnRolloutProcessor(),
6166
aggregation_method="mean",
6267
passed_threshold=0.8,
6368
num_runs=1,
6469
max_dataset_rows=1,
65-
max_concurrent_rollouts=4,
70+
max_concurrent_rollouts=64,
6671
mode="pointwise",
6772
)
6873
async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:

0 commit comments

Comments
 (0)