Skip to content

Commit d9ea133

Browse files
committed
try something else
1 parent 549b396 commit d9ea133

File tree

1 file changed

+13
-9
lines changed

1 file changed

+13
-9
lines changed

eval_protocol/quickstart/llm_judge.py

Lines changed: 13 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,10 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
29 29
question = r.get("question", "")
30 30
answer = r.get("answer", None)
31 31
messages = [
32-
Message(role="system", content="hi"),
32+
Message(
33+
role="system",
34+
content="You are a helpful math assistant. Please reason step by step, and put your final answer within \\boxed{...}.",
35+
),
33 36
Message(role="user", content=str(question)),
34 37
]
35 38
converted.append(EvaluationRow(messages=messages, ground_truth=str(answer) if answer is not None else None))
@@ -44,23 +47,24 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
44 47
],
45 48
dataset_adapter=aime2025_dataset_adapter,
46 49
completion_params=[
50+
# {
51+
# "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
52+
# },
53+
{"model": "gpt-4.1"},
47 54
{
48 55
"max_tokens": 131000,
49-
"extra_body": {"reasoning_effort": "low"},
56+
"extra_body": {"reasoning_effort": "medium"},
5057
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
5158
},
5259
{
5360
"max_tokens": 131000,
54-
"extra_body": {"reasoning_effort": "medium"},
55-
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
61+
"extra_body": {"reasoning_effort": "low"},
62+
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
56 63
},
57 64
],
58 65
rollout_processor=SingleTurnRolloutProcessor(),
59-
aggregation_method="mean",
60-
passed_threshold=0.8,
61-
num_runs=1,
62-
max_dataset_rows=1,
63-
max_concurrent_rollouts=4,
66+
preprocess_fn=split_multi_turn_rows,
67+
max_concurrent_rollouts=64,
64 68
mode="pointwise",
65 69
)
66 70
async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:

0 commit comments

Comments
 (0)