1212 SingleTurnRolloutProcessor ,
1313)
1414from eval_protocol .pytest .evaluation_test import evaluation_test
15- from eval_protocol .training import GEPATrainer
16- from eval_protocol .training .gepa_utils import build_reflection_lm
1715
1816SYSTEM_PROMPT = (
1917 "You are a helpful math assistant. Please reason step by step, and put your final answer within \\ boxed{...}."
@@ -63,44 +61,6 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
6361 return None
6462
6563
66- def _build_feedback_text (
67- * ,
68- extracted_int : Optional [int ],
69- gt_int : Optional [int ],
70- is_valid : bool ,
71- raw_model_answer : str ,
72- ground_truth : Optional [str ],
73- ) -> str :
74- """
75- Build a feedback string similar in spirit to the GEPA `metric_with_feedback`.
76-
77- Cases:
78- - Parse failure (model or gold): explain integer formatting and show correct answer.
79- - Correct: "Your answer is correct. The correct answer is '...'."
80- - Incorrect: "Your answer is incorrect. The correct answer is '...'."
81- """
82- correct_answer_display = str (gt_int if gt_int is not None else (ground_truth or "" ))
83-
84- if not is_valid :
85- # Could not parse either the model answer or the gold answer as an integer.
86- feedback_text = (
87- "The final answer must be a valid integer and nothing else. "
88- f"You responded with '{ raw_model_answer } ', which couldn't be parsed as a python integer. "
89- "Please ensure your answer is a valid integer without any additional text or formatting."
90- )
91- if correct_answer_display :
92- feedback_text += f" The correct answer is '{ correct_answer_display } '."
93- return feedback_text
94-
95- if extracted_int == gt_int :
96- return f"Your answer is correct. The correct answer is '{ correct_answer_display } '."
97- else :
98- return f"Your answer is incorrect. The correct answer is '{ correct_answer_display } '."
99-
100- # TODO: our dataset does not contain written solutions, so we cannot provide feedback on the solution. maybe need to add it later.
101- # they're using https://huggingface.co/datasets/AI-MO/aimo-validation-aime
102-
103-
10464def aime2025_dataset_adapter (rows : List [Dict [str , Any ]]) -> List [EvaluationRow ]:
10565 converted : List [EvaluationRow ] = []
10666 for r in rows :
@@ -123,14 +83,15 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
12383 completion_params = [
12484 {
12585 "max_tokens" : 131000 ,
126- "model" : "fireworks_ai/accounts/fireworks/models/deepseek-v3p1-terminus" ,
86+ "extra_body" : {"reasoning_effort" : "low" },
87+ "model" : "fireworks_ai/accounts/fireworks/models/gpt-oss-120b" ,
12788 }
12889 ],
12990 rollout_processor = SingleTurnRolloutProcessor (),
13091 aggregation_method = "mean" ,
13192 passed_threshold = 0.8 ,
13293 num_runs = 8 ,
133- max_dataset_rows = None , # Use full dataset
94+ max_dataset_rows = 2 ,
13495 max_concurrent_rollouts = 4 ,
13596 mode = "pointwise" ,
13697)
@@ -163,49 +124,10 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
163124 )
164125 }
165126
166- feedback_text = _build_feedback_text (
167- extracted_int = extracted_int ,
168- gt_int = gt_int ,
169- is_valid = is_valid ,
170- raw_model_answer = content_str ,
171- ground_truth = str (row .ground_truth ),
172- )
173-
174127 row .evaluation_result = EvaluateResult (
175128 score = score ,
176- reason = feedback_text ,
129+ reason = ( "Answer correct" if score == 1.0 else "Answer incorrect" ) ,
177130 is_score_valid = is_valid ,
178131 metrics = metrics ,
179132 )
180133 return row
181-
182-
if __name__ == "__main__":
    # Optimize the AIME prompt with GEPA: split the dataset, train an optimized
    # program, then report both a quick DSPy score and a full eval-protocol score.
    # NOTE(review): an unused `import asyncio` (leftover from an earlier async
    # entry point) was removed — nothing in this block awaits or runs a loop.
    trainer = GEPATrainer(
        test_aime25_pointwise,
        train_ratio=0.5,  # 50% for training (15 problems)
        val_ratio=0.3,  # 30% for validation (9 problems)
        # test_ratio = 20% (6 problems) - calculated automatically
    )

    # Use same Fireworks model for both main and reflection
    reflection_lm = build_reflection_lm("fireworks_ai/accounts/fireworks/models/deepseek-v3p1-terminus")

    optimized_program = trainer.train(
        num_threads=4,  # Reduced from 32 to avoid API timeouts
        track_stats=True,
        reflection_minibatch_size=5,  # Reduced to limit concurrent requests
        reflection_lm=reflection_lm,
    )

    # Option 1: Quick DSPy evaluation (doesn't use EP infrastructure)
    print("\n=== DSPy Evaluation ===")
    print(trainer.evaluate(optimized_program))

    # Option 2: Full EP evaluation (uses LLM proxy, Fireworks tracing, etc.)
    # This goes through the normal @evaluation_test pipeline
    print("\n=== EP Evaluation (with tracing) ===")
    results = trainer.run_ep_evaluation(optimized_program)
    print(f"Final EP Score: {results['score']:.3f}")
0 commit comments