Skip to content

Commit f0180c4

Browse files
committed
Merge branch 'derekx/gepa-part-1' of github.com:eval-protocol/python-sdk into derekx/gepa-part-1
2 parents c04acf8 + 7ddfceb commit f0180c4

File tree

1 file changed: +4 additions, −82 deletions

eval_protocol/benchmarks/test_aime25.py

Lines changed: 4 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
     SingleTurnRolloutProcessor,
 )
 from eval_protocol.pytest.evaluation_test import evaluation_test
-from eval_protocol.training import GEPATrainer
-from eval_protocol.training.gepa_utils import build_reflection_lm
 
 SYSTEM_PROMPT = (
     "You are a helpful math assistant. Please reason step by step, and put your final answer within \\boxed{...}."
@@ -63,44 +61,6 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
     return None
 
 
-def _build_feedback_text(
-    *,
-    extracted_int: Optional[int],
-    gt_int: Optional[int],
-    is_valid: bool,
-    raw_model_answer: str,
-    ground_truth: Optional[str],
-) -> str:
-    """
-    Build a feedback string similar in spirit to the GEPA `metric_with_feedback`.
-
-    Cases:
-    - Parse failure (model or gold): explain integer formatting and show correct answer.
-    - Correct: "Your answer is correct. The correct answer is '...'."
-    - Incorrect: "Your answer is incorrect. The correct answer is '...'."
-    """
-    correct_answer_display = str(gt_int if gt_int is not None else (ground_truth or ""))
-
-    if not is_valid:
-        # Could not parse either the model answer or the gold answer as an integer.
-        feedback_text = (
-            "The final answer must be a valid integer and nothing else. "
-            f"You responded with '{raw_model_answer}', which couldn't be parsed as a python integer. "
-            "Please ensure your answer is a valid integer without any additional text or formatting."
-        )
-        if correct_answer_display:
-            feedback_text += f" The correct answer is '{correct_answer_display}'."
-        return feedback_text
-
-    if extracted_int == gt_int:
-        return f"Your answer is correct. The correct answer is '{correct_answer_display}'."
-    else:
-        return f"Your answer is incorrect. The correct answer is '{correct_answer_display}'."
-
-
-# TODO: our dataset does not contain written solutions, so we cannot provide feedback on the solution. maybe need to add it later.
-# they're using https://huggingface.co/datasets/AI-MO/aimo-validation-aime
-
-
 def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     converted: List[EvaluationRow] = []
     for r in rows:
@@ -123,14 +83,15 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     completion_params=[
         {
             "max_tokens": 131000,
-            "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1-terminus",
+            "extra_body": {"reasoning_effort": "low"},
+            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
         }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
     passed_threshold=0.8,
     num_runs=8,
-    max_dataset_rows=None,  # Use full dataset
+    max_dataset_rows=2,
     max_concurrent_rollouts=4,
     mode="pointwise",
 )
@@ -163,49 +124,10 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
         )
     }
 
-    feedback_text = _build_feedback_text(
-        extracted_int=extracted_int,
-        gt_int=gt_int,
-        is_valid=is_valid,
-        raw_model_answer=content_str,
-        ground_truth=str(row.ground_truth),
-    )
-
     row.evaluation_result = EvaluateResult(
         score=score,
-        reason=feedback_text,
+        reason=("Answer correct" if score == 1.0 else "Answer incorrect"),
         is_score_valid=is_valid,
         metrics=metrics,
     )
     return row
-
-
-if __name__ == "__main__":
-    import asyncio
-
-    trainer = GEPATrainer(
-        test_aime25_pointwise,
-        train_ratio=0.5,  # 50% for training (15 problems)
-        val_ratio=0.3,  # 30% for validation (9 problems)
-        # test_ratio = 20% (6 problems) - calculated automatically
-    )
-
-    # Use same Fireworks model for both main and reflection
-    reflection_lm = build_reflection_lm("fireworks_ai/accounts/fireworks/models/deepseek-v3p1-terminus")
-
-    optimized_program = trainer.train(
-        num_threads=4,  # Reduced from 32 to avoid API timeouts
-        track_stats=True,
-        reflection_minibatch_size=5,  # Reduced to limit concurrent requests
-        reflection_lm=reflection_lm,
-    )
-
-    # Option 1: Quick DSPy evaluation (doesn't use EP infrastructure)
-    print("\n=== DSPy Evaluation ===")
-    print(trainer.evaluate(optimized_program))
-
-    # Option 2: Full EP evaluation (uses LLM proxy, Fireworks tracing, etc.)
-    # This goes through the normal @evaluation_test pipeline
-    print("\n=== EP Evaluation (with tracing) ===")
-    results = trainer.run_ep_evaluation(optimized_program)
-    print(f"Final EP Score: {results['score']:.3f}")

0 commit comments

Comments
 (0)