@@ -39,34 +39,30 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
3939 return converted
4040
4141
42- @pytest .mark .asyncio
4342@evaluation_test (
4443 input_dataset = [
4544 "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl" ,
4645 "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl" ,
4746 ],
4847 dataset_adapter = aime2025_dataset_adapter ,
4948 completion_params = [
50- # {
51- # "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
52- # },
53- {"model" : "gpt-4.1" },
5449 {
5550 "max_tokens" : 131000 ,
56- "extra_body" : {"reasoning_effort" : "medium " },
51+ "extra_body" : {"reasoning_effort" : "low " },
5752 "model" : "fireworks_ai/accounts/fireworks/models/gpt-oss-120b" ,
5853 },
5954 {
6055 "max_tokens" : 131000 ,
61- "extra_body" : {"reasoning_effort" : "low " },
62- "model" : "fireworks_ai/accounts/fireworks/models/gpt-oss-20b " ,
56+ "extra_body" : {"reasoning_effort" : "medium " },
57+ "model" : "fireworks_ai/accounts/fireworks/models/gpt-oss-120b " ,
6358 },
6459 ],
6560 rollout_processor = SingleTurnRolloutProcessor (),
66- # preprocess_fn=split_multi_turn_rows,
61+ aggregation_method = "mean" ,
62+ passed_threshold = 0.8 ,
6763 num_runs = 1 ,
6864 max_dataset_rows = 1 ,
69- max_concurrent_rollouts = 64 ,
65+ max_concurrent_rollouts = 4 ,
7066 mode = "pointwise" ,
7167)
7268async def test_llm_judge (row : EvaluationRow ) -> EvaluationRow :
0 commit comments