Skip to content

Commit a3e7941

Browse files
committed
test on full dataset
1 parent af137b3 commit a3e7941

File tree

1 file changed

+5
-11
lines changed

1 file changed

+5
-11
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 5 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -59,11 +59,6 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
59 59
return None
60 60

61 61

62-
def _get_aime_dataset_path() -> str:
63-
"""Get the AIME dataset file path."""
64-
return str(Path(__file__).parent / "data" / "aime.jsonl")
65-
66-
67 62
def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
68 63
converted: List[EvaluationRow] = []
69 64
for r in rows:
@@ -79,9 +74,8 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
79 74

80 75
@evaluation_test(
81 76
input_dataset=[
82-
_get_aime_dataset_path(),
83-
# "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
84-
# "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
77+
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
78+
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
85 79
],
86 80
dataset_adapter=aime2025_dataset_adapter,
87 81
completion_params=[
@@ -95,9 +89,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
95 89
rollout_processor=SingleTurnRolloutProcessor(),
96 90
aggregation_method="mean",
97 91
passed_threshold=0.8,
98-
num_runs=1,
99-
max_dataset_rows=1,
100-
max_concurrent_rollouts=1,
92+
num_runs=8,
93+
max_dataset_rows=2,
94+
max_concurrent_rollouts=4,
101 95
mode="pointwise",
102 96
)
103 97
def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:

0 commit comments

Comments
 (0)