Skip to content

Commit f7a648c

Browse files
committed
Route benchmark datasets through data loaders
1 parent 76cc1e7 commit f7a648c

File tree

3 files changed

+51
-9
lines changed

3 files changed

+51
-9
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from typing import Any, Dict, List, Optional
22

3+
from eval_protocol.common_utils import load_jsonl
4+
from eval_protocol.data_loader import DynamicDataLoader
35
from eval_protocol.models import (
46
EvaluateResult,
57
EvaluationRow,
@@ -11,6 +13,7 @@
1113
SingleTurnRolloutProcessor,
1214
)
1315
from eval_protocol.pytest.evaluation_test import evaluation_test
16+
from eval_protocol.pytest.utils import parse_ep_max_rows
1417

1518
SYSTEM_PROMPT = (
1619
"You are a helpful math assistant. Please reason step by step, and put your final answer within \\boxed{...}."
@@ -71,12 +74,29 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
7174
return converted
7275

7376

77+
_AIME2025_DATASET_URLS: List[str] = [
78+
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
79+
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
80+
]
81+
82+
83+
def aime2025_data_generator() -> List[EvaluationRow]:
84+
"""Load the AIME 2025 datasets and convert them into evaluation rows."""
85+
dataset_rows: List[Dict[str, Any]] = []
86+
for dataset_url in _AIME2025_DATASET_URLS:
87+
dataset_rows.extend(load_jsonl(dataset_url))
88+
89+
max_rows = parse_ep_max_rows(2)
90+
if max_rows is not None:
91+
dataset_rows = dataset_rows[:max_rows]
92+
93+
return aime2025_dataset_adapter(dataset_rows)
94+
95+
7496
@evaluation_test(
75-
input_dataset=[
76-
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
77-
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
78-
],
79-
dataset_adapter=aime2025_dataset_adapter,
97+
data_loaders=DynamicDataLoader(
98+
generators=[aime2025_data_generator],
99+
),
80100
completion_params=[
81101
{
82102
"max_tokens": 131000,

eval_protocol/benchmarks/test_tau_bench_airline.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
from pathlib import Path
1111
from typing import Any, Dict, List
1212

13+
from eval_protocol.common_utils import load_jsonl
14+
from eval_protocol.data_loader import DynamicDataLoader
1315
from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message
1416
from eval_protocol.pytest import evaluation_test, ExceptionHandlerConfig
1517
from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor
@@ -69,6 +71,14 @@ def _get_airline_dataset_path() -> str:
6971
return str(Path(__file__).parent / "data" / "airline_dataset.jsonl")
7072

7173

74+
def tau_bench_airline_data_generator() -> List[EvaluationRow]:
75+
"""Load and adapt the airline dataset into evaluation rows."""
76+
dataset_rows: List[Dict[str, Any]] = []
77+
for dataset_path in [_get_airline_dataset_path()]:
78+
dataset_rows.extend(load_jsonl(dataset_path))
79+
return tau_bench_airline_to_evaluation_row(dataset_rows)
80+
81+
7282
def _get_server_script_path() -> str:
7383
"""Get the tau2 mcp server script path."""
7484
from eval_protocol.mcp_servers.tau2 import get_server_script_path
@@ -107,8 +117,9 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
107117

108118

109119
@evaluation_test(
110-
input_dataset=[_get_airline_dataset_path()],
111-
dataset_adapter=tau_bench_airline_to_evaluation_row,
120+
data_loaders=DynamicDataLoader(
121+
generators=[tau_bench_airline_data_generator],
122+
),
112123
completion_params=[
113124
{
114125
"temperature": 0.8,

eval_protocol/benchmarks/test_tau_bench_retail.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
from pathlib import Path
1111
from typing import Any, Dict, List
1212

13+
from eval_protocol.common_utils import load_jsonl
14+
from eval_protocol.data_loader import DynamicDataLoader
1315
from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message
1416
from eval_protocol.pytest import evaluation_test, ExceptionHandlerConfig
1517
from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor
@@ -69,6 +71,14 @@ def _get_retail_dataset_path() -> str:
6971
return str(Path(__file__).parent / "data" / "retail_dataset.jsonl")
7072

7173

74+
def tau_bench_retail_data_generator() -> List[EvaluationRow]:
75+
"""Load and adapt the retail dataset into evaluation rows."""
76+
dataset_rows: List[Dict[str, Any]] = []
77+
for dataset_path in [_get_retail_dataset_path()]:
78+
dataset_rows.extend(load_jsonl(dataset_path))
79+
return tau_bench_retail_to_evaluation_row(dataset_rows)
80+
81+
7282
def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
7383
"""
7484
Convert entries from retail dataset to EvaluationRow objects.
@@ -98,8 +108,9 @@ def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
98108

99109

100110
@evaluation_test(
101-
input_dataset=[_get_retail_dataset_path()],
102-
dataset_adapter=tau_bench_retail_to_evaluation_row,
111+
data_loaders=DynamicDataLoader(
112+
generators=[tau_bench_retail_data_generator],
113+
),
103114
completion_params=[
104115
{
105116
"temperature": 0.8,

0 commit comments

Comments (0)