We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 684131d, commit 4aca272. Copy full SHA for 4aca272
eval_protocol/benchmarks/test_aime25.py
@@ -90,7 +90,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
90
rollout_processor=SingleTurnRolloutProcessor(),
91
aggregation_method="mean",
92
passed_threshold=0.8,
93
- num_runs=2,
+ num_runs=1,
94
max_concurrent_rollouts=16,
95
mode="pointwise",
96
)
0 commit comments