@@ -59,11 +59,6 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
5959 return None
6060
6161
62- def _get_aime_dataset_path () -> str :
63- """Get the AIME dataset file path."""
64- return str (Path (__file__ ).parent / "data" / "aime.jsonl" )
65-
66-
6762def aime2025_dataset_adapter (rows : List [Dict [str , Any ]]) -> List [EvaluationRow ]:
6863 converted : List [EvaluationRow ] = []
6964 for r in rows :
@@ -79,9 +74,8 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
7974
8075@evaluation_test (
8176 input_dataset = [
82- _get_aime_dataset_path (),
83- # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
84- # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
77+ "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl" ,
78+ "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl" ,
8579 ],
8680 dataset_adapter = aime2025_dataset_adapter ,
8781 completion_params = [
@@ -95,9 +89,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
9589 rollout_processor = SingleTurnRolloutProcessor (),
9690 aggregation_method = "mean" ,
9791 passed_threshold = 0.8 ,
98- num_runs = 1 ,
99- max_dataset_rows = 1 ,
100- max_concurrent_rollouts = 1 ,
92+ num_runs = 8 ,
93+ max_dataset_rows = 2 ,
94+ max_concurrent_rollouts = 4 ,
10195 mode = "pointwise" ,
10296)
10397def test_aime25_pointwise (row : EvaluationRow ) -> EvaluationRow :
0 commit comments