Skip to content

Commit a4f599f

Browse files
committed
revert test to original
1 parent 0c8642d commit a4f599f

File tree

2 files changed

+5
-10
lines changed

2 files changed

+5
-10
lines changed

eval_protocol/benchmarks/data/aime.jsonl

Lines changed: 0 additions & 1 deletion
This file was deleted.

eval_protocol/benchmarks/test_aime25.py

Lines changed: 5 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -79,27 +79,23 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
7979

8080
@evaluation_test(
8181
input_dataset=[
82-
# _get_aime_dataset_path(),
8382
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
8483
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
8584
],
8685
dataset_adapter=aime2025_dataset_adapter,
8786
completion_params=[
8887
{
8988
"max_tokens": 131000,
90-
# "extra_body": {"reasoning_effort": "low"},
91-
"model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
92-
# "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
93-
"stream": True,
94-
# "timeout": 2400,
89+
"extra_body": {"reasoning_effort": "low"},
90+
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
9591
}
9692
],
9793
rollout_processor=SingleTurnRolloutProcessor(),
9894
aggregation_method="mean",
9995
passed_threshold=0.8,
100-
num_runs=1,
101-
max_dataset_rows=30,
102-
max_concurrent_rollouts=1,
96+
num_runs=8,
97+
max_dataset_rows=2,
98+
max_concurrent_rollouts=4,
10399
mode="pointwise",
104100
)
105101
def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:

0 commit comments

Comments
 (0)