@@ -29,7 +29,10 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
2929 question = r .get ("question" , "" )
3030 answer = r .get ("answer" , None )
3131 messages = [
32- Message (role = "system" , content = "hi" ),
32+ Message (
33+ role = "system" ,
34+ content = "You are a helpful math assistant. Please reason step by step, and put your final answer within \\ boxed{...}." ,
35+ ),
3336 Message (role = "user" , content = str (question )),
3437 ]
3538 converted .append (EvaluationRow (messages = messages , ground_truth = str (answer ) if answer is not None else None ))
@@ -44,23 +47,24 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
4447 ],
4548 dataset_adapter = aime2025_dataset_adapter ,
4649 completion_params = [
50+ # {
51+ # "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
52+ # },
53+ {"model" : "gpt-4.1" },
4754 {
4855 "max_tokens" : 131000 ,
49- "extra_body" : {"reasoning_effort" : "low " },
56+ "extra_body" : {"reasoning_effort" : "medium " },
5057 "model" : "fireworks_ai/accounts/fireworks/models/gpt-oss-120b" ,
5158 },
5259 {
5360 "max_tokens" : 131000 ,
54- "extra_body" : {"reasoning_effort" : "medium " },
55- "model" : "fireworks_ai/accounts/fireworks/models/gpt-oss-120b " ,
61+ "extra_body" : {"reasoning_effort" : "low " },
62+ "model" : "fireworks_ai/accounts/fireworks/models/gpt-oss-20b " ,
5663 },
5764 ],
5865 rollout_processor = SingleTurnRolloutProcessor (),
59- aggregation_method = "mean" ,
60- passed_threshold = 0.8 ,
61- num_runs = 1 ,
62- max_dataset_rows = 1 ,
63- max_concurrent_rollouts = 4 ,
66+ preprocess_fn = split_multi_turn_rows ,
67+ max_concurrent_rollouts = 64 ,
6468 mode = "pointwise" ,
6569)
6670async def test_llm_judge (row : EvaluationRow ) -> EvaluationRow :
0 commit comments