Skip to content

Commit 35a3267

Browse files
committed
gepa working
1 parent 693274e commit 35a3267

File tree

4 files changed

+925
-64
lines changed

4 files changed

+925
-64
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -123,15 +123,14 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
123123
completion_params=[
124124
{
125125
"max_tokens": 131000,
126-
"extra_body": {"reasoning_effort": "low"},
127-
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
126+
"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1-terminus",
128127
}
129128
],
130129
rollout_processor=SingleTurnRolloutProcessor(),
131130
aggregation_method="mean",
132131
passed_threshold=0.8,
133132
num_runs=8,
134-
max_dataset_rows=2,
133+
max_dataset_rows=None, # Use full dataset
135134
max_concurrent_rollouts=4,
136135
mode="pointwise",
137136
)
@@ -182,14 +181,31 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
182181

183182

184183
if __name__ == "__main__":
185-
trainer = GEPATrainer(test_aime25_pointwise)
186-
reflection_lm = build_reflection_lm("gpt-5")
184+
import asyncio
185+
186+
trainer = GEPATrainer(
187+
test_aime25_pointwise,
188+
train_ratio=0.5, # 50% for training (15 problems)
189+
val_ratio=0.3, # 30% for validation (9 problems)
190+
# test_ratio = 20% (6 problems) - calculated automatically
191+
)
192+
193+
# Use same Fireworks model for both main and reflection
194+
reflection_lm = build_reflection_lm("fireworks_ai/accounts/fireworks/models/deepseek-v3p1-terminus")
187195

188196
optimized_program = trainer.train(
189-
num_threads=32,
197+
num_threads=4, # Reduced from 32 to avoid API timeouts
190198
track_stats=True,
191-
reflection_minibatch_size=3,
199+
reflection_minibatch_size=5, # Reduced to limit concurrent requests
192200
reflection_lm=reflection_lm,
193201
)
194202

203+
# Option 1: Quick DSPy evaluation (doesn't use EP infrastructure)
204+
print("\n=== DSPy Evaluation ===")
195205
print(trainer.evaluate(optimized_program))
206+
207+
# Option 2: Full EP evaluation (uses LLM proxy, Fireworks tracing, etc.)
208+
# This goes through the normal @evaluation_test pipeline
209+
print("\n=== EP Evaluation (with tracing) ===")
210+
results = trainer.run_ep_evaluation(optimized_program)
211+
print(f"Final EP Score: {results['score']:.3f}")

eval_protocol/training/__init__.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1-
from gepa_trainer import GEPATrainer
1+
from .gepa_trainer import GEPATrainer
2+
from .gepa_utils import (
3+
DSPyModuleType,
4+
DSPyModuleFactory,
5+
create_single_turn_program,
6+
create_signature,
7+
build_reflection_lm,
8+
)
29

3-
__all__ = ["GEPATrainer"]
10+
__all__ = [
11+
"GEPATrainer",
12+
# DSPy module creation utilities
13+
"DSPyModuleType",
14+
"DSPyModuleFactory",
15+
"create_single_turn_program",
16+
"create_signature",
17+
# Reflection LM helpers
18+
"build_reflection_lm",
19+
]

0 commit comments

Comments
 (0)