@@ -123,15 +123,14 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     completion_params=[
         {
             "max_tokens": 131000,
-            "extra_body": {"reasoning_effort": "low"},
-            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
+            "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1-terminus",
         }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
     passed_threshold=0.8,
     num_runs=8,
-    max_dataset_rows=2,
+    max_dataset_rows=None,  # Use full dataset
     max_concurrent_rollouts=4,
     mode="pointwise",
 )
@@ -182,14 +181,31 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
 
 
 if __name__ == "__main__":
-    trainer = GEPATrainer(test_aime25_pointwise)
-    reflection_lm = build_reflection_lm("gpt-5")
+    import asyncio
+
+    trainer = GEPATrainer(
+        test_aime25_pointwise,
+        train_ratio=0.5,  # 50% for training (15 problems)
+        val_ratio=0.3,  # 30% for validation (9 problems)
+        # test_ratio = 20% (6 problems) - calculated automatically
+    )
+
+    # Use same Fireworks model for both main and reflection
+    reflection_lm = build_reflection_lm("fireworks_ai/accounts/fireworks/models/deepseek-v3p1-terminus")
 
     optimized_program = trainer.train(
-        num_threads=32,
+        num_threads=4,  # Reduced from 32 to avoid API timeouts
         track_stats=True,
-        reflection_minibatch_size=3,
+        reflection_minibatch_size=5,  # Reduced to limit concurrent requests
         reflection_lm=reflection_lm,
     )
 
+    # Option 1: Quick DSPy evaluation (doesn't use EP infrastructure)
+    print("\n=== DSPy Evaluation ===")
     print(trainer.evaluate(optimized_program))
+
+    # Option 2: Full EP evaluation (uses LLM proxy, Fireworks tracing, etc.)
+    # This goes through the normal @evaluation_test pipeline
+    print("\n=== EP Evaluation (with tracing) ===")
+    results = trainer.run_ep_evaluation(optimized_program)
+    print(f"Final EP Score: {results['score']:.3f}")
0 commit comments