Skip to content

Commit 210efa4

Browse files
committed
Enhance IFEval benchmark processing by introducing IFEvalGroundTruthRolloutProcessor for improved ground truth extraction and updating pyproject.toml to include new benchmark data files.
1 parent 0b19932 commit 210efa4

File tree

3 files changed

+40
-19
lines changed

3 files changed

+40
-19
lines changed

eval_protocol/benchmarks/ifeval/ifbench_instructions.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,10 @@
3636
import csv
3737
import io
3838

39-
import ifbench_util as instructions_util
39+
try:
40+
from . import ifbench_util as instructions_util
41+
except ImportError:
42+
import ifbench_util as instructions_util
4043

4144
logger = logging.getLogger(__name__)
4245

eval_protocol/benchmarks/ifeval/test_ifeval.py

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
pytest eval_protocol/benchmarks/ifeval/test_ifeval.py -v
99
"""
1010

11+
import asyncio
1112
import json
1213
from pathlib import Path
13-
from typing import Any
1414

15-
from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message, MetricResult
15+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
1616
from eval_protocol.pytest import evaluation_test
1717
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
18+
from eval_protocol.pytest.rollout_processor import RolloutProcessor
19+
from eval_protocol.pytest.types import RolloutProcessorConfig
1820

1921
from .reward import ifeval_partial_credit_reward
2022

@@ -46,29 +48,44 @@ def _coerce_content_to_str(content: str | list | None) -> str:
4648
_IFBENCH_MESSAGES = _load_ifbench_messages()
4749

4850

49-
class IFEvalRolloutProcessor(SingleTurnRolloutProcessor):
    """Single-turn rollout processor that lifts ground truth out of ``__GT__:`` system messages."""

    def preprocess_row(self, row: EvaluationRow) -> EvaluationRow:
        """Move the ``__GT__:`` payload into ``row.ground_truth`` and drop the marker messages.

        A marker is a system message whose text starts with ``__GT__:``. The
        text after the first colon, stripped of surrounding whitespace, becomes
        the row's ground truth; if several markers are present the last one
        wins. All other messages are kept in their original order. The row is
        modified in place and also returned.
        """
        kept: list[Message] = []
        for message in row.messages:
            text = _coerce_content_to_str(message.content)
            is_marker = message.role == "system" and text.startswith("__GT__:")
            if not is_marker:
                kept.append(message)
                continue
            # Everything after the first colon is the ground-truth payload.
            _, _, payload = text.partition(":")
            row.ground_truth = payload.strip()
        row.messages = kept
        return row
51+
class IFEvalGroundTruthRolloutProcessor(RolloutProcessor):
    """Pull ground truth out of ``__GT__:`` system messages, then delegate to a single-turn rollout.

    The marker messages are removed from each row before the rows are handed
    to the wrapped ``SingleTurnRolloutProcessor``.
    """

    def __init__(self) -> None:
        super().__init__()
        # The actual rollout is performed by this wrapped processor.
        self.single_turn_processor = SingleTurnRolloutProcessor()

    def __call__(
        self, rows: list[EvaluationRow], config: RolloutProcessorConfig
    ) -> list[asyncio.Task[EvaluationRow]]:
        """Prepare every row and forward the batch to the single-turn processor.

        For each row, system messages whose text starts with ``__GT__:`` are
        stripped from ``messages``. If at least one such marker exists, the
        text after the first colon of the *last* marker (whitespace-stripped)
        is stored in ``ground_truth``; otherwise ``ground_truth`` is left
        untouched. Rows are modified in place.

        Returns:
            Whatever the wrapped single-turn processor returns for the
            prepared rows and ``config``.
        """
        prepared: list[EvaluationRow] = []
        for row in rows:
            kept: list[Message] = []
            last_marker: str | None = None
            for message in row.messages:
                if message.role == "system":
                    text = _coerce_content_to_str(message.content)
                    if text.startswith("__GT__:"):
                        # Remember the most recent marker and drop it from the prompt.
                        last_marker = text
                        continue
                kept.append(message)
            if last_marker is not None:
                row.ground_truth = last_marker.split(":", 1)[1].strip()
            row.messages = kept
            prepared.append(row)
        return self.single_turn_processor(prepared, config)
6481

6582

6683
@evaluation_test(
6784
input_messages=_IFBENCH_MESSAGES,
6885
completion_params=[
6986
{"model": "fireworks_ai/accounts/fireworks/models/qwen3-8b"}
7087
],
71-
rollout_processor=IFEvalRolloutProcessor(),
88+
rollout_processor=IFEvalGroundTruthRolloutProcessor(),
7289
aggregation_method="mean",
7390
passed_threshold=0.5,
7491
num_runs=1,

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ include = ["eval_protocol*", "development*", "vendor*"]
170170
"eval_protocol" = ["../vite-app/dist/**/*"]
171171
"eval_protocol.mcp_servers.tau2" = ["*.md", "tests/system_prompts/*.md"]
172172
"eval_protocol.benchmarks" = ["data/*.jsonl"]
173+
"eval_protocol.benchmarks.ifeval" = ["data/*.jsonl"]
173174
"vendor.tau2" = ["data/**/*.md"]
174175

175176

0 commit comments

Comments
 (0)