From 0a261158c6e67eb041ffb7ee98c047cb7f219157 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Fri, 5 Dec 2025 18:26:43 -0800 Subject: [PATCH 01/15] gepa integration part 1 --- eval_protocol/models.py | 31 +++- eval_protocol/pytest/evaluation_test.py | 35 ++++- eval_protocol/trainable_gepa_design.md | 201 ++++++++++++++++++++++++ eval_protocol/training/utils.py | 19 +++ tests/test_models.py | 32 ++++ tests/test_training_utils.py | 32 ++++ 6 files changed, 342 insertions(+), 8 deletions(-) create mode 100644 eval_protocol/trainable_gepa_design.md create mode 100644 eval_protocol/training/utils.py create mode 100644 tests/test_training_utils.py diff --git a/eval_protocol/models.py b/eval_protocol/models.py index 3ff0613e..911c13b9 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -3,7 +3,7 @@ import importlib from datetime import datetime, timezone from enum import Enum -from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union +from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union, Callable, Sequence JSONType = Union[Dict[str, Any], List[Any], str, int, float, bool, None] @@ -1190,3 +1190,32 @@ class MCPMultiClientConfiguration(BaseModel): """Represents a MCP configuration.""" mcpServers: Dict[str, Union[MCPConfigurationServerStdio, MCPConfigurationServerUrl]] + + +class EPParameters(BaseModel): + """The parameters of an `@evaluation_test`. 
Used for trainable integrations.""" + + completion_params: Any = None + input_messages: Any = None + input_dataset: Any = None + input_rows: Any = None + data_loaders: Any = None + dataset_adapter: Optional[Callable[..., Any]] = None + rollout_processor: Any = None + rollout_processor_kwargs: Dict[str, Any] | None = None + aggregation_method: Any = Field(default="mean") + passed_threshold: Any = None + disable_browser_open: bool = False + num_runs: int = 1 + filtered_row_ids: Optional[Sequence[str]] = None + max_dataset_rows: Optional[int] = None + mcp_config_path: Optional[str] = None + max_concurrent_rollouts: int = 8 + max_concurrent_evaluations: int = 64 + server_script_path: Optional[str] = None + steps: int = 30 + mode: Any = Field(default="pointwise") + combine_datasets: bool = True + preprocess_fn: Optional[Callable[[list[EvaluationRow]], list[EvaluationRow]]] = None + logger: Any = None + exception_handler_config: Any = None diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index f7fb16b3..82065517 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -21,6 +21,7 @@ EvaluationThresholdDict, EvaluateResult, Status, + EPParameters, ) from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper from eval_protocol.pytest.evaluation_test_postprocess import postprocess @@ -695,13 +696,33 @@ async def _collect_result(config, lst): ) pytest_wrapper = pytest.mark.asyncio(pytest_wrapper) - ep_params: dict[str, Any] = { - "rollout_processor": rollout_processor, - "server_script_path": server_script_path, - "mcp_config_path": mcp_config_path, - "rollout_processor_kwargs": rollout_processor_kwargs, - "mode": mode, - } + # Attach full evaluation parameter metadata for training integrations + ep_params: EPParameters = EPParameters( + completion_params=completion_params, + input_messages=input_messages, + input_dataset=input_dataset, + input_rows=input_rows, + 
data_loaders=data_loaders, + dataset_adapter=dataset_adapter, + rollout_processor=rollout_processor, + rollout_processor_kwargs=rollout_processor_kwargs, + aggregation_method=aggregation_method, + passed_threshold=passed_threshold, + disable_browser_open=disable_browser_open, + num_runs=num_runs, + filtered_row_ids=filtered_row_ids, + max_dataset_rows=max_dataset_rows, + mcp_config_path=mcp_config_path, + max_concurrent_rollouts=max_concurrent_rollouts, + max_concurrent_evaluations=max_concurrent_evaluations, + server_script_path=server_script_path, + steps=steps, + mode=mode, + combine_datasets=combine_datasets, + preprocess_fn=preprocess_fn, + logger=logger, + exception_handler_config=exception_handler_config, + ) # Create the dual mode wrapper dual_mode_wrapper = create_dual_mode_wrapper( diff --git a/eval_protocol/trainable_gepa_design.md b/eval_protocol/trainable_gepa_design.md new file mode 100644 index 00000000..398840f8 --- /dev/null +++ b/eval_protocol/trainable_gepa_design.md @@ -0,0 +1,201 @@ +## GEPA-Trainable Interface Design for Eval Protocol + +### Goals + +- **Tunable prompts for existing benchmarks**: Allow benchmarks like `test_aime25.py` and `test_gpqa.py` to expose parts of their configuration (e.g., system prompts) as trainable parameters, without changing their core evaluation logic. +- **Tight coupling with `@evaluation_test`**: Reuse the same rollout configuration, datasets, and metrics that are already defined via `evaluation_test`, instead of duplicating that configuration in a separate training API. +- **GEPA as one optimizer backend**: Provide a clean integration point for GEPA (and potentially other optimizers later) without requiring benchmarks to depend on DSPy or GEPA directly. + +### High-Level Architecture + +- **Benchmark file (e.g., `test_aime25.py`)** + - Continues to define: + - Dataset adapter (`aime2025_dataset_adapter`). 
+ - `@evaluation_test(...)`-decorated function (e.g., `test_aime25_pointwise`) that: + - Uses `SingleTurnRolloutProcessor` (or another processor). + - Computes per-row metrics and sets `row.evaluation_result`. + - Adds *optional* trainable wiring at the bottom, under `if __name__ == "__main__":`, that: + - Imports a trainable/core API from `eval_protocol.trainable`. + - Specifies what is tunable (e.g., the system prompt) and how to adapt rows using a candidate. + - Invokes a train routine (GEPA-based or otherwise). + +- **Trainable core** + - Provides a single central abstraction: + - **`EPParameters`**: Encapsulates everything `evaluation_test` knows about the eval in a structured form: + - One field for every parameter that `evaluation_test` accepts (dataset sources, adapters, completion params, rollout processor, aggregation, thresholds, etc.), after parsing/env overrides. + - **Candidate representation**: Start with `dict[str, str]` (e.g., `{"system_prompt": "..."}`), anticipating future extensions (few-shot examples, tool docs, etc.). + - Includes helper utilities to: + - Build an `EPParameters` instance by introspecting an `@evaluation_test`-decorated function. + - Run a single candidate or a batch of candidates through the full rollout + evaluation pipeline, returning aggregate scores (and optionally per-row scores). + +- **GEPA adapter (e.g., `eval_protocol/trainable/gepa_adapter.py`)** + - Wraps the trainable core and GEPA’s API: + - Accepts: + - An `EPConfig`. + - A candidate space definition (for now, implicit via `dict[str, str]` keys). + - GEPA configuration (budget, reflection model, seed, component selection strategy, etc.). + - Provides: + - A GEPA-compatible metric interface that: + - Given a candidate, uses `EPConfig` (and benchmark-specific logic such as a custom `dataset_adapter`) to: + - Construct or adapt rows for that candidate. + - Run rollouts (reusing the same processors and params as the test). 
+ - Compute scalar scores (e.g., mean exact-match over a batch). + - A training routine that returns: + - A `best_candidate: dict[str, str]`. + - Optional rich result object (e.g., mapping to `GEPAResult`, additional stats). + +### Relationship to `evaluation_test` and `__ep_params__` + +- Existing `evaluation_test` code will attach: + +```python +ep_params: dict[str, Any] = { + "rollout_processor": rollout_processor, + "server_script_path": server_script_path, + "mcp_config_path": mcp_config_path, + "rollout_processor_kwargs": rollout_processor_kwargs, + "mode": mode, +} +setattr(dual_mode_wrapper, "__ep_params__", ep_params) +``` + +- Design direction: + - **Use `__ep_params__` as the single source of truth**. + - **`__ep_params__` should contain all effective `evaluation_test` parameters**, including: + - Parsed `completion_params` (after env overrides). + - Dataset sources (`input_dataset`, `input_rows`, dataloaders, and `dataset_adapter`), after `parse_ep_*` transforms. + - `aggregation_method`, `num_runs`, `max_dataset_rows`, etc. + - Rollout and mode information (processor, kwargs, concurrency limits, mode). + - The trainable core can then **directly convert `__ep_params__` into an `EPParameters` instance** without maintaining a separate trainable-only config. + +- Trainable core will expose: + - A factory like: + + ```python + def build_ep_parameters_from_test( + test_fn: TestFunction, + ) -> EPParameters: + ... + ``` + + - This function: + - Reads `test_fn.__ep_params__`. + - Reconstructs how to: + - Load and preprocess the dataset. + - Configure the rollout processor (`RolloutProcessorConfig`). + - Run rollouts and then apply the row-level metric (by calling the decorated test function in a library mode). + +- Training code (e.g., `python test_aime25.py`) then becomes: + - Import the test function (e.g., `test_aime25_pointwise`). + - Build an `EPParameters` from it. + - Call into a GEPA-based trainer that uses the `EPParameters`. 
+ +### Open Questions + +- **Where tuned prompts live (storage format and location)**: + - GEPA already supports a `run_dir` for logging and checkpoints. + - We need to decide: + - Whether EP should: + - Treat `run_dir` as the canonical store and optionally add a small `best_candidate.json` there; or + - Provide an additional EP-level artifact format. + - For now, storage is left as an **explicit design TODO** and can be finalized once we have the core/adapter in place. + +### Work Split: Person A vs Person B + +#### Person A – Trainable Core & `evaluation_test` Integration + +- **1. Extend `evaluation_test` metadata (no behavior change)** + - Populate a single `__ep_config__` dict on the decorated test function that includes: + - Dataset specification (paths / input_rows / dataloaders, `dataset_adapter`, `max_dataset_rows`, etc.) after `parse_ep_*`. + - Parsed `completion_params` (after env overrides like `parse_ep_completion_params_overwrite`). + - Rollout settings (`rollout_processor`, `rollout_processor_kwargs`, `mode`, `max_concurrent_rollouts`, `max_concurrent_evaluations`). + - Aggregation and threshold metadata. + - Ensure: + - Backwards compatibility for existing tests. + - Clear typing and docstrings to guide future use. + +- **2. Define core trainable abstractions in `eval_protocol/trainable/core.py`** + - Define: + - `EPConfig`: + - A field for every parameter `evaluation_test` accepts (dataset, adapters, completion params, rollout processor, aggregation, thresholds, etc.). + - Can be serialized/inspected for external tooling. + - Candidate type alias (initially `Candidate = dict[str, str]`). + - Implement: + - `build_ep_config_from_test(test_fn: TestFunction) -> EPConfig`. + - Reads `__ep_config__`. + - Reuses the same dataset and rollout logic as pytest, but in a library-friendly way (no pytest invocation). 
+ - Helper(s) to: + - Run a single candidate over the dataset, possibly with: + - A subset of rows (train vs val split initially determined by the benchmark or EPConfig). + - A configurable aggregation method (mean score to start). + +- **3. Minimal tests and documentation for the core** + - Add unit/integration tests that: + - Use a tiny fake `@evaluation_test` function. + - Confirm `build_ep_config_from_test` produces a config that can: + - Load mock rows. + - Run a dummy rollout processor. + - Apply a simple metric to produce scores. + - Document (in this design file or a short README) how benchmarks should think about exposing tunable pieces (e.g., via custom dataset adapters or other wiring). + +#### Person B – GEPA Adapter & Benchmark Wiring + +- **4. Implement GEPA integration in `eval_protocol/trainable/gepa_adapter.py`** + - Define a small adapter API, e.g.: + +```python +class GEPATrainer: + def __init__(self, spec: TrainableBenchmarkSpec, inject_fn: InjectFn, ...gepa_config...): + ... + + def train(self) -> tuple[Candidate, Any]: + """Run GEPA and return best candidate plus optional rich result.""" +``` + + - Inside, implement: + - Conversion from `(spec, inject_fn)` into a GEPA metric: + - For each candidate: + - Clone or map the base dataset rows, applying `inject_fn(candidate, row)`. + - Use the spec’s rollout runner + metric runner to compute per-example and aggregate scores. + - Return the aggregate score (and optional textual feedback) to GEPA. + - The call to `gepa.optimize(...)` with: + - `seed_candidate` constructed from the baseline configuration (e.g., default system prompt). + - Budget configuration (max metric calls / auto presets). + - Reflection config (reflection LM or other knobs) passed in via constructor. + - Mapping from `GEPAResult` (or equivalent) back into: + - `best_candidate: Candidate`. + - Optional rich result object (e.g., exposing Pareto-front stats). + +- **5. 
Wire a first benchmark: AIME 2025** + - In `eval_protocol/benchmarks/test_aime25.py`: + - Factor the row-scoring logic inside `test_aime25_pointwise` into a **reusable metric function** (pure function that sets `row.evaluation_result` given a rolled-out row). + - Decide how candidates should influence the evaluation: + - For example, by making the dataset adapter or message-construction logic candidate-aware (e.g., changing the system prompt). + - Add a `if __name__ == "__main__":` block that: + - Imports `test_aime25_pointwise` and builds an `EPConfig` via `build_ep_config_from_test`. + - Instantiates `GEPATrainer` with: + - The `EPConfig`. + - Initial GEPA config (budget, reflection model placeholder, seed). + - Calls `trainer.train()` and prints/logs the resulting `best_candidate` for now. + - Keep storage of tuned prompts as a TODO/extension point to be resolved later. + +- **6. Optional second benchmark: GPQA** + - Repeat step 5 for `test_gpqa.py`: + - Identify what’s tunable (system prompt, possibly chain-of-thought instructions). + - Extract metric logic into a reusable function. + - Add candidate-aware wiring (e.g., via dataset adapters) and an optional `__main__` entrypoint calling the same GEPA trainer. + - This will validate that: + - The abstractions generalize across tasks. + - No DSPy/GEPA-specific imports leak into benchmark files (other than a small, well-defined trainable API). + +### Coordination Notes + +- **Order of work** + - Person A should go first (or in parallel up to the point where `EPConfig` and `build_ep_config_from_test` are usable). + - Person B can stub against interfaces and adjust once Person A’s core is available. +- **Integration checkpoints** + - After Person A lands the core + tests: + - Person B wires AIME with a very simple “optimizer” (even random search) to smoke-test the path before hooking up real GEPA. 
+ - After GEPA integration works for AIME: + - Decide on the canonical way to treat GEPA’s `run_dir` and/or additional artifacts for tuned prompts. + - Optionally add a small helper that knows how to “run evaluation once with best GEPA candidate” for CI workflows. diff --git a/eval_protocol/training/utils.py b/eval_protocol/training/utils.py new file mode 100644 index 00000000..10457aa0 --- /dev/null +++ b/eval_protocol/training/utils.py @@ -0,0 +1,19 @@ +from typing import Any + +from eval_protocol.models import EPParameters + + +def build_ep_parameters_from_test(test_fn: Any) -> EPParameters: + """ + Build an `EPParameters` instance from an `@evaluation_test`-decorated function. + + The decorator is responsible for attaching a `__ep_params__` attribute that + contains all effective evaluation parameters after parsing/env overrides. + """ + if not hasattr(test_fn, "__ep_params__"): + raise ValueError( + "The provided test function does not have `__ep_params__` attached. " + "Ensure it is decorated with `@evaluation_test` from eval_protocol.pytest." + ) + + return getattr(test_fn, "__ep_params__") diff --git a/tests/test_models.py b/tests/test_models.py index 723685b8..27529829 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -11,6 +11,7 @@ Message, MetricResult, StepOutput, + EPParameters, ) @@ -721,3 +722,34 @@ def test_message_dump_for_chat_completion_request(): assert "weight" not in dictionary assert "reasoning_content" not in dictionary assert dictionary["content"] == "Hello, how are you?" 
+ + +def test_ep_parameters_defaults(): + """EPParameters should have sensible defaults for core fields.""" + params = EPParameters() + + assert params.completion_params is None + assert params.num_runs == 1 + assert params.disable_browser_open is False + assert params.max_concurrent_rollouts == 8 + assert params.max_concurrent_evaluations == 64 + assert params.mode == "pointwise" + assert params.combine_datasets is True + + +def test_ep_parameters_accepts_arbitrary_types(): + """EPParameters should allow rich Python types for callable/logger fields.""" + + def dummy_preprocess(rows): + return rows + + def dummy_adapter(*args, **kwargs): + return None + + logger = logging.getLogger("ep-params-test") + + params = EPParameters(dataset_adapter=dummy_adapter, preprocess_fn=dummy_preprocess, logger=logger) + + assert params.dataset_adapter is dummy_adapter + assert params.preprocess_fn is dummy_preprocess + assert params.logger is logger diff --git a/tests/test_training_utils.py b/tests/test_training_utils.py new file mode 100644 index 00000000..084ff9a9 --- /dev/null +++ b/tests/test_training_utils.py @@ -0,0 +1,32 @@ +import pytest + +from eval_protocol.models import EPParameters +from eval_protocol.training.utils import build_ep_parameters_from_test + + +def test_build_ep_parameters_from_test_returns_attached_model(): + """build_ep_parameters_from_test should return the EPParameters attached to the test function.""" + + def dummy_test() -> None: + pass + + params = EPParameters(num_runs=3, completion_params={"model": "gpt-4"}) + setattr(dummy_test, "__ep_params__", params) + + result = build_ep_parameters_from_test(dummy_test) + + assert result is params + assert result.num_runs == 3 + assert result.completion_params == {"model": "gpt-4"} + + +def test_build_ep_parameters_from_test_missing_attr_raises(): + """build_ep_parameters_from_test should raise when __ep_params__ is missing.""" + + def dummy_test_no_attr() -> None: + pass + + with pytest.raises(ValueError) as 
exc_info: + build_ep_parameters_from_test(dummy_test_no_attr) + + assert "__ep_params__" in str(exc_info.value) From 42e0b0889c9888b1c6e736593516f39aa388777d Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Fri, 5 Dec 2025 18:29:49 -0800 Subject: [PATCH 02/15] update --- eval_protocol/trainable_gepa_design.md | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/eval_protocol/trainable_gepa_design.md b/eval_protocol/trainable_gepa_design.md index 398840f8..5859fba4 100644 --- a/eval_protocol/trainable_gepa_design.md +++ b/eval_protocol/trainable_gepa_design.md @@ -1,8 +1,8 @@ -## GEPA-Trainable Interface Design for Eval Protocol +## GEPA-training Interface Design for Eval Protocol ### Goals -- **Tunable prompts for existing benchmarks**: Allow benchmarks like `test_aime25.py` and `test_gpqa.py` to expose parts of their configuration (e.g., system prompts) as trainable parameters, without changing their core evaluation logic. +- **Tunable prompts for existing benchmarks**: Allow benchmarks like `test_aime25.py` and `test_gpqa.py` to expose parts of their configuration (e.g., system prompts) as training parameters, without changing their core evaluation logic. - **Tight coupling with `@evaluation_test`**: Reuse the same rollout configuration, datasets, and metrics that are already defined via `evaluation_test`, instead of duplicating that configuration in a separate training API. - **GEPA as one optimizer backend**: Provide a clean integration point for GEPA (and potentially other optimizers later) without requiring benchmarks to depend on DSPy or GEPA directly. @@ -14,12 +14,12 @@ - `@evaluation_test(...)`-decorated function (e.g., `test_aime25_pointwise`) that: - Uses `SingleTurnRolloutProcessor` (or another processor). - Computes per-row metrics and sets `row.evaluation_result`. 
- - Adds *optional* trainable wiring at the bottom, under `if __name__ == "__main__":`, that: - - Imports a trainable/core API from `eval_protocol.trainable`. + - Adds *optional* training wiring at the bottom, under `if __name__ == "__main__":`, that: + - Imports a training/core API from `eval_protocol.training`. - Specifies what is tunable (e.g., the system prompt) and how to adapt rows using a candidate. - Invokes a train routine (GEPA-based or otherwise). -- **Trainable core** +- **training core** - Provides a single central abstraction: - **`EPParameters`**: Encapsulates everything `evaluation_test` knows about the eval in a structured form: - One field for every parameter that `evaluation_test` accepts (dataset sources, adapters, completion params, rollout processor, aggregation, thresholds, etc.), after parsing/env overrides. @@ -28,8 +28,8 @@ - Build an `EPParameters` instance by introspecting an `@evaluation_test`-decorated function. - Run a single candidate or a batch of candidates through the full rollout + evaluation pipeline, returning aggregate scores (and optionally per-row scores). -- **GEPA adapter (e.g., `eval_protocol/trainable/gepa_adapter.py`)** - - Wraps the trainable core and GEPA’s API: +- **GEPA adapter (e.g., `eval_protocol/training/gepa_adapter.py`)** + - Wraps the training core and GEPA’s API: - Accepts: - An `EPConfig`. - A candidate space definition (for now, implicit via `dict[str, str]` keys). @@ -66,9 +66,9 @@ setattr(dual_mode_wrapper, "__ep_params__", ep_params) - Dataset sources (`input_dataset`, `input_rows`, dataloaders, and `dataset_adapter`), after `parse_ep_*` transforms. - `aggregation_method`, `num_runs`, `max_dataset_rows`, etc. - Rollout and mode information (processor, kwargs, concurrency limits, mode). - - The trainable core can then **directly convert `__ep_params__` into an `EPParameters` instance** without maintaining a separate trainable-only config. 
+ - The training core can then **directly convert `__ep_params__` into an `EPParameters` instance** without maintaining a separate training-only config. -- Trainable core will expose: +- training core will expose: - A factory like: ```python @@ -90,7 +90,7 @@ setattr(dual_mode_wrapper, "__ep_params__", ep_params) - Build an `EPParameters` from it. - Call into a GEPA-based trainer that uses the `EPParameters`. -### Open Questions +### TODO for derek to figure out: how to store the changing system prompts. - **Where tuned prompts live (storage format and location)**: - GEPA already supports a `run_dir` for logging and checkpoints. @@ -102,7 +102,7 @@ setattr(dual_mode_wrapper, "__ep_params__", ep_params) ### Work Split: Person A vs Person B -#### Person A – Trainable Core & `evaluation_test` Integration +#### Person A – training Core & `evaluation_test` Integration - **1. Extend `evaluation_test` metadata (no behavior change)** - Populate a single `__ep_config__` dict on the decorated test function that includes: @@ -114,7 +114,7 @@ setattr(dual_mode_wrapper, "__ep_params__", ep_params) - Backwards compatibility for existing tests. - Clear typing and docstrings to guide future use. -- **2. Define core trainable abstractions in `eval_protocol/trainable/core.py`** +- **2. Define core training abstractions in `eval_protocol/training/core.py`** - Define: - `EPConfig`: - A field for every parameter `evaluation_test` accepts (dataset, adapters, completion params, rollout processor, aggregation, thresholds, etc.). @@ -140,12 +140,12 @@ setattr(dual_mode_wrapper, "__ep_params__", ep_params) #### Person B – GEPA Adapter & Benchmark Wiring -- **4. Implement GEPA integration in `eval_protocol/trainable/gepa_adapter.py`** +- **4. 
Implement GEPA integration in `eval_protocol/training/gepa_adapter.py`** - Define a small adapter API, e.g.: ```python class GEPATrainer: - def __init__(self, spec: TrainableBenchmarkSpec, inject_fn: InjectFn, ...gepa_config...): + def __init__(self, spec: trainingBenchmarkSpec, inject_fn: InjectFn, ...gepa_config...): ... def train(self) -> tuple[Candidate, Any]: @@ -186,7 +186,7 @@ class GEPATrainer: - Add candidate-aware wiring (e.g., via dataset adapters) and an optional `__main__` entrypoint calling the same GEPA trainer. - This will validate that: - The abstractions generalize across tasks. - - No DSPy/GEPA-specific imports leak into benchmark files (other than a small, well-defined trainable API). + - No DSPy/GEPA-specific imports leak into benchmark files (other than a small, well-defined training API). ### Coordination Notes From 4fa4162b6588978d2222b85cb0fc6e9f5db32c51 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Sat, 6 Dec 2025 00:13:07 -0800 Subject: [PATCH 03/15] skeleton of gepa trainer --- eval_protocol/benchmarks/test_aime25.py | 16 +++ eval_protocol/trainable_gepa_design.md | 39 ++++++- eval_protocol/training/__init__.py | 3 + eval_protocol/training/gepa_adapter.py | 138 ++++++++++++++++++++++++ eval_protocol/training/gepa_utils.py | 32 ++++++ pyproject.toml | 1 + 6 files changed, 227 insertions(+), 2 deletions(-) create mode 100644 eval_protocol/training/__init__.py create mode 100644 eval_protocol/training/gepa_adapter.py create mode 100644 eval_protocol/training/gepa_utils.py diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 6eb785a7..debd9fad 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -12,6 +12,8 @@ SingleTurnRolloutProcessor, ) from eval_protocol.pytest.evaluation_test import evaluation_test +from eval_protocol.training import GEPATrainer +from eval_protocol.training.gepa_utils import build_reflection_lm SYSTEM_PROMPT = ( "You are a 
helpful math assistant. Please reason step by step, and put your final answer within \\boxed{...}." @@ -131,3 +133,17 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow: metrics=metrics, ) return row + + +if __name__ == "__main__": + trainer = GEPATrainer(test_aime25_pointwise) + reflection_lm = build_reflection_lm("gpt-5") + + optimized_program = trainer.train( + num_threads=32, + track_stats=True, + reflection_minibatch_size=3, + reflection_lm=reflection_lm, + ) + + print(trainer.evaluate(optimized_program)) diff --git a/eval_protocol/trainable_gepa_design.md b/eval_protocol/trainable_gepa_design.md index 5859fba4..b66fb7e0 100644 --- a/eval_protocol/trainable_gepa_design.md +++ b/eval_protocol/trainable_gepa_design.md @@ -19,7 +19,7 @@ - Specifies what is tunable (e.g., the system prompt) and how to adapt rows using a candidate. - Invokes a train routine (GEPA-based or otherwise). -- **training core** +- **Training core** - Provides a single central abstraction: - **`EPParameters`**: Encapsulates everything `evaluation_test` knows about the eval in a structured form: - One field for every parameter that `evaluation_test` accepts (dataset sources, adapters, completion params, rollout processor, aggregation, thresholds, etc.), after parsing/env overrides. @@ -68,7 +68,7 @@ setattr(dual_mode_wrapper, "__ep_params__", ep_params) - Rollout and mode information (processor, kwargs, concurrency limits, mode). - The training core can then **directly convert `__ep_params__` into an `EPParameters` instance** without maintaining a separate training-only config. -- training core will expose: +- Training core will expose: - A factory like: ```python @@ -199,3 +199,38 @@ class GEPATrainer: - After GEPA integration works for AIME: - Decide on the canonical way to treat GEPA’s `run_dir` and/or additional artifacts for tuned prompts. - Optionally add a small helper that knows how to “run evaluation once with best GEPA candidate” for CI workflows. 
+ + +future: + +this is how gepa defines eval: + +def metric( + gold: Example, + pred: Prediction, + trace: Optional[DSPyTrace] = None, + pred_name: Optional[str] = None, + pred_trace: Optional[DSPyTrace] = None, +) -> float | ScoreWithFeedback: + """ + This function is called with the following arguments: + - gold: The gold example. + - pred: The predicted output. + - trace: Optional. The trace of the program's execution. + - pred_name: Optional. The name of the target predictor currently being optimized by GEPA, for which + the feedback is being requested. + - pred_trace: Optional. The trace of the target predictor's execution GEPA is seeking feedback for. + + Note the `pred_name` and `pred_trace` arguments. During optimization, GEPA will call the metric to obtain + feedback for individual predictors being optimized. GEPA provides the name of the predictor in `pred_name` + and the sub-trace (of the trace) corresponding to the predictor in `pred_trace`. + If available at the predictor level, the metric should return {'score': float, 'feedback': str} corresponding + to the predictor. + If not available at the predictor level, the metric can also return a text feedback at the program level + (using just the gold, pred and trace). + If no feedback is returned, GEPA will use a simple text feedback consisting of just the score: + f"This trajectory got a score of {score}." + """ + ... + +ideally generic way to turn evaluation_test into this. 
diff --git a/eval_protocol/training/__init__.py b/eval_protocol/training/__init__.py new file mode 100644 index 00000000..998c5e9c --- /dev/null +++ b/eval_protocol/training/__init__.py @@ -0,0 +1,3 @@ +from gepa_adapter import GEPATrainer + +__all__ = ["GEPATrainer"] diff --git a/eval_protocol/training/gepa_adapter.py b/eval_protocol/training/gepa_adapter.py new file mode 100644 index 00000000..47b35267 --- /dev/null +++ b/eval_protocol/training/gepa_adapter.py @@ -0,0 +1,138 @@ +from typing import Any, Dict, Literal + +import dspy +from dspy.clients.lm import LM +from dspy.primitives import Module +from dspy.teleprompt.gepa.gepa import GEPA +from gepa.core.adapter import ProposalFn +from gepa.proposer.reflective_mutation.base import ReflectionComponentSelector + +from eval_protocol.models import EPParameters, EvaluationRow +from eval_protocol.pytest.types import TestFunction +from eval_protocol.training.gepa_utils import REFLECTION_LM_CONFIGS +from eval_protocol.training.utils import build_ep_parameters_from_test + + +class GEPATrainer: + """ + High-level entrypoint for running GEPA-style training against an existing + `@evaluation_test`-decorated function. + + This class is intentionally minimal for now: + - It captures `EPParameters` from the provided test function via + `build_ep_parameters_from_test`. + - It stores any GEPA-related configuration kwargs for future use. + - The actual GEPA optimization loop is left as a TODO. + """ + + def __init__(self, test_fn: TestFunction) -> None: + """ + Args: + test_fn: The `@evaluation_test`-decorated function defining the eval. + """ + self.test_fn = test_fn + self.ep_params: EPParameters = build_ep_parameters_from_test(test_fn) + + self.metric = ( + test_fn # TODO: need to convert our ep test_fn to a GEPA metric. also need to inject the feedback text. + ) + + self.program = ... # TODO: converting between a program (dspy.Module) and an @evaluation_test is a bit tricky. 
+ + self.train_set, self.val_set, self.test_set = ( + ..., + ..., + ..., + ) # TODO: need to convert our input_dataset to a train set + + def train( + self, + auto: Literal["light", "medium", "heavy"] | None = None, + max_full_evals: int | None = None, + max_metric_calls: int | None = None, + reflection_minibatch_size: int = 3, + candidate_selection_strategy: Literal["pareto", "current_best"] = "pareto", + reflection_lm: LM | None = None, + skip_perfect_score: bool = True, + add_format_failure_as_feedback: bool = False, + instruction_proposer: ProposalFn | None = None, + component_selector: ReflectionComponentSelector | str = "round_robin", + use_merge: bool = True, + max_merge_invocations: int | None = 5, + num_threads: int | None = None, + failure_score: float = 0.0, + perfect_score: float = 1.0, + log_dir: str | None = None, + track_stats: bool = False, + use_wandb: bool = False, + wandb_api_key: str | None = None, + wandb_init_kwargs: dict[str, Any] | None = None, + track_best_outputs: bool = False, + warn_on_score_mismatch: bool = True, + enable_tool_optimization: bool = False, + use_mlflow: bool = False, + seed: int | None = 0, + gepa_kwargs: dict | None = None, + ) -> Module: + """ + Run GEPA to optimize over candidates. 
+ """ + gepa_args: dict[str, Any] = { + "auto": auto, + "max_full_evals": max_full_evals, + "max_metric_calls": max_metric_calls, + "reflection_minibatch_size": reflection_minibatch_size, + "candidate_selection_strategy": candidate_selection_strategy, + "reflection_lm": reflection_lm, + "skip_perfect_score": skip_perfect_score, + "add_format_failure_as_feedback": add_format_failure_as_feedback, + "instruction_proposer": instruction_proposer, + "component_selector": component_selector, + "use_merge": use_merge, + "max_merge_invocations": max_merge_invocations, + "num_threads": num_threads, + "failure_score": failure_score, + "perfect_score": perfect_score, + "log_dir": log_dir, + "track_stats": track_stats, + "use_wandb": use_wandb, + "wandb_api_key": wandb_api_key, + "wandb_init_kwargs": wandb_init_kwargs, + "track_best_outputs": track_best_outputs, + "warn_on_score_mismatch": warn_on_score_mismatch, + "enable_tool_optimization": enable_tool_optimization, + "use_mlflow": use_mlflow, + "seed": seed, + } + gepa_args.update(gepa_kwargs or {}) + + optimizer = GEPA( + metric=self.metric, + **gepa_args, + ) + + optimized_program = optimizer.compile( + self.program, + trainset=self.train_set, + valset=self.val_set, + ) + + return optimized_program + + def evaluate(self, optimized_program: Module) -> list[EvaluationRow]: + # convert back to EP + + # and then just run our evaluation_test function on the optimized program. + + # OR we can evaluate using dspy.Evaluate + + # evaluate = dspy.Evaluate( + # devset=self.test_set, + # metric=self.metric, + # num_threads=32, + # display_table=True, + # display_progress=True + # ) + + # return evaluate(self.optimized_program) + ... 
diff --git a/eval_protocol/training/gepa_utils.py b/eval_protocol/training/gepa_utils.py new file mode 100644 index 00000000..520c7de4 --- /dev/null +++ b/eval_protocol/training/gepa_utils.py @@ -0,0 +1,32 @@ +import os + +import dspy +from dspy.clients.lm import LM + +REFLECTION_LM_CONFIGS = { + "gpt-5": { + "model": "gpt-5", + "temperature": 1.0, + "max_tokens": 32000, + "api_key": os.getenv("OPENAI_API_KEY"), + "base_url": "https://api.openai.com/v1", + }, + "kimi-k2-instruct-0905": { + "model": "accounts/fireworks/models/kimi-k2-instruct-0905", + "temperature": 0.6, # Kimi recommended temperature + "max_tokens": 131000, + "api_key": os.getenv("FIREWORKS_API_KEY"), + "base_url": "https://api.fireworks.ai/inference/v1", + }, +} + + +def build_reflection_lm(reflection_lm_name: str) -> LM: + reflection_lm_config = REFLECTION_LM_CONFIGS[reflection_lm_name] + return dspy.LM( + model=reflection_lm_config["model"], + temperature=reflection_lm_config["temperature"], + max_tokens=reflection_lm_config["max_tokens"], + api_key=reflection_lm_config["api_key"], + base_url=reflection_lm_config["base_url"], + ) diff --git a/pyproject.toml b/pyproject.toml index a43f773a..ceea22cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ dependencies = [ "deepdiff>=6.0.0", "websockets>=15.0.1", "fastapi>=0.116.1", + "dspy>=3.0.0", ] [project.urls] From d6eb57844be5c06376faacad7b3f02331737c689 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Sat, 6 Dec 2025 00:19:23 -0800 Subject: [PATCH 04/15] abc trainer --- eval_protocol/training/__init__.py | 2 +- .../{gepa_adapter.py => gepa_trainer.py} | 6 +++--- eval_protocol/training/trainer.py | 16 ++++++++++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) rename eval_protocol/training/{gepa_adapter.py => gepa_trainer.py} (97%) create mode 100644 eval_protocol/training/trainer.py diff --git a/eval_protocol/training/__init__.py b/eval_protocol/training/__init__.py index 998c5e9c..fcb904c1 100644 --- 
a/eval_protocol/training/__init__.py +++ b/eval_protocol/training/__init__.py @@ -1,3 +1,3 @@ -from gepa_adapter import GEPATrainer +from .gepa_trainer import GEPATrainer __all__ = ["GEPATrainer"] diff --git a/eval_protocol/training/gepa_adapter.py b/eval_protocol/training/gepa_trainer.py similarity index 97% rename from eval_protocol/training/gepa_adapter.py rename to eval_protocol/training/gepa_trainer.py index 47b35267..b2956ce1 100644 --- a/eval_protocol/training/gepa_adapter.py +++ b/eval_protocol/training/gepa_trainer.py @@ -9,11 +9,11 @@ from eval_protocol.models import EPParameters, EvaluationRow from eval_protocol.pytest.types import TestFunction -from eval_protocol.training.gepa_utils import REFLECTION_LM_CONFIGS +from eval_protocol.training.trainer import Trainer from eval_protocol.training.utils import build_ep_parameters_from_test -class GEPATrainer: +class GEPATrainer(Trainer): """ High-level entrypoint for running GEPA-style training against an existing `@evaluation_test`-decorated function. @@ -30,7 +30,7 @@ def __init__(self, test_fn: TestFunction) -> None: Args: test_fn: The `@evaluation_test`-decorated function defining the eval. """ - self.test_fn = test_fn + super().__init__(test_fn) self.ep_params: EPParameters = build_ep_parameters_from_test(test_fn) self.metric = ( diff --git a/eval_protocol/training/trainer.py b/eval_protocol/training/trainer.py new file mode 100644 index 00000000..1008bb41 --- /dev/null +++ b/eval_protocol/training/trainer.py @@ -0,0 +1,16 @@ +from abc import ABC, abstractmethod + +from eval_protocol.pytest.types import TestFunction + + +class Trainer(ABC): + def __init__(self, test_fn: TestFunction): + self.test_fn = test_fn + + @abstractmethod + def train(self, *args, **kwargs): ... + + @abstractmethod + def evaluate(self, *args, **kwargs): + # evaluation logic possibly can be shared since it's EP. TBD + ...
From 9ef49a0d93bfe6214befaec1b1802d3a11808700 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Sat, 6 Dec 2025 00:20:38 -0800 Subject: [PATCH 05/15] assign --- eval_protocol/training/gepa_trainer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/eval_protocol/training/gepa_trainer.py b/eval_protocol/training/gepa_trainer.py index b2956ce1..8c05f824 100644 --- a/eval_protocol/training/gepa_trainer.py +++ b/eval_protocol/training/gepa_trainer.py @@ -33,17 +33,17 @@ def __init__(self, test_fn: TestFunction) -> None: super().__init__(test_fn) self.ep_params: EPParameters = build_ep_parameters_from_test(test_fn) - self.metric = ( - test_fn # TODO: need to convert our ep test_fn to a GEPA metric. also need to inject the feedback text. - ) + self.metric = test_fn # TODO @derek. need to convert our ep test_fn to a GEPA metric. also need to inject the feedback text. - self.program = ... # TODO: converting between a program (dspy.Module) and an @evaluation_test is a bit tricky. + self.program = ( + ... + ) # TODO @shreymodi1: converting between a program (dspy.Module) and an @evaluation_test is a bit tricky. self.train_set, self.val_set, self.test_set = ( ..., ..., ..., - ) # TODO: need to convert our input_dataset to a train set + ) # TODO @shreymodi1. 
need to convert our input_dataset to a train set def train( self, From c61de5bd77083aa4fdb81eeb430aeb69890fe2cb Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Sat, 6 Dec 2025 00:21:59 -0800 Subject: [PATCH 06/15] fix lock --- uv.lock | 251 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) diff --git a/uv.lock b/uv.lock index 38b07c4a..2a7ae8f0 100644 --- a/uv.lock +++ b/uv.lock @@ -187,6 +187,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f5/10/6c25ed6de94c49f88a91fa5018cb4c0f3625f31d5be9f771ebe5cc7cd506/aiosqlite-0.21.0-py3-none-any.whl", hash = "sha256:2549cf4057f95f53dcba16f2b64e8e2791d7e1adedb13197dd8ed77bb226d7d0", size = 15792, upload-time = "2025-02-03T07:30:13.6Z" }, ] +[[package]] +name = "alembic" +version = "1.17.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mako" }, + { name = "sqlalchemy" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/02/a6/74c8cadc2882977d80ad756a13857857dbcf9bd405bc80b662eb10651282/alembic-1.17.2.tar.gz", hash = "sha256:bbe9751705c5e0f14877f02d46c53d10885e377e3d90eda810a016f9baa19e8e", size = 1988064, upload-time = "2025-11-14T20:35:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ba/88/6237e97e3385b57b5f1528647addea5cc03d4d65d5979ab24327d41fb00d/alembic-1.17.2-py3-none-any.whl", hash = "sha256:f483dd1fe93f6c5d49217055e4d15b905b425b6af906746abb35b69c1996c4e6", size = 248554, upload-time = "2025-11-14T20:35:05.699Z" }, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -329,6 +344,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/fa/e01228c2938de91d47b307831c62ab9e4001e747789d0b05baf779a6488c/async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028", size = 5721, upload-time = "2023-08-10T16:35:55.203Z" }, ] 
+[[package]] +name = "asyncer" +version = "0.0.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ff/67/7ea59c3e69eaeee42e7fc91a5be67ca5849c8979acac2b920249760c6af2/asyncer-0.0.8.tar.gz", hash = "sha256:a589d980f57e20efb07ed91d0dbe67f1d2fd343e7142c66d3a099f05c620739c", size = 18217, upload-time = "2024-08-24T23:15:36.449Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/04/15b6ca6b7842eda2748bda0a0af73f2d054e9344320f8bba01f994294bcb/asyncer-0.0.8-py3-none-any.whl", hash = "sha256:5920d48fc99c8f8f0f1576e1882f5022885589c5fcbc46ce4224ec3e53776eeb", size = 9209, upload-time = "2024-08-24T23:15:35.317Z" }, +] + [[package]] name = "asyncstdlib-fw" version = "3.13.2" @@ -830,6 +857,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "colorlog" +version = "6.10.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a2/61/f083b5ac52e505dfc1c624eafbf8c7589a0d7f32daa398d2e7590efa5fda/colorlog-6.10.1.tar.gz", hash = "sha256:eb4ae5cb65fe7fec7773c2306061a8e63e02efc2c72eba9d27b0fa23c94f1321", size = 17162, upload-time = "2025-10-16T16:14:11.978Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/c1/e419ef3723a074172b68aaa89c9f3de486ed4c2399e2dbd8113a4fdcaf9e/colorlog-6.10.1-py3-none-any.whl", hash = "sha256:2d7e8348291948af66122cff006c9f8da6255d224e7cf8e37d8de2df3bad8c9c", size = 11743, upload-time = "2025-10-16T16:14:10.512Z" }, +] + [[package]] name = "comm" version = "0.2.3" @@ -1118,6 +1157,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7", size = 116252, upload-time = "2024-01-27T23:42:14.239Z" }, ] +[[package]] +name = "diskcache" +version = "5.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3f/21/1c1ffc1a039ddcc459db43cc108658f32c57d271d7289a2794e401d0fdb6/diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc", size = 67916, upload-time = "2023-08-31T06:12:00.316Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19", size = 45550, upload-time = "2023-08-31T06:11:58.822Z" }, +] + [[package]] name = "distlib" version = "0.4.0" @@ -1177,6 +1225,41 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408, upload-time = "2024-04-23T18:57:14.835Z" }, ] +[[package]] +name = "dspy" +version = "3.0.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "asyncer" }, + { name = "backoff" }, + { name = "cachetools" }, + { name = "cloudpickle" }, + { name = "diskcache" }, + { name = "gepa" }, + { name = "joblib" }, + { name = "json-repair" }, + { name = "litellm" }, + { name = "magicattr" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.3.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version 
>= '3.11'" }, + { name = "openai" }, + { name = "optuna" }, + { name = "orjson" }, + { name = "pillow" }, + { name = "pydantic" }, + { name = "regex" }, + { name = "requests" }, + { name = "rich" }, + { name = "tenacity" }, + { name = "tqdm" }, + { name = "xxhash" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8e/18/0042d299cd5e85fdb381568f0cfcc7769122e8f70ea0a2d33e12fd63e705/dspy-3.0.4.tar.gz", hash = "sha256:cb4529df9a91353a16144d9d94ba6ff25f36fc5adfd921f127f4c49d0e309fb8", size = 236376, upload-time = "2025-11-10T17:43:37.619Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/52/56eed4828175f48f712a50a994293065afa7cc98cb112992a0b071179b6c/dspy-3.0.4-py3-none-any.whl", hash = "sha256:c0a88c7936f41f6f613ee6ca8cd92e63746ff2bd780e3896615ade7628eb6a6a", size = 285224, upload-time = "2025-11-10T17:43:36.263Z" }, +] + [[package]] name = "e2b" version = "1.3.3" @@ -1220,6 +1303,7 @@ dependencies = [ { name = "dataclasses-json" }, { name = "deepdiff" }, { name = "docstring-parser" }, + { name = "dspy" }, { name = "fastapi" }, { name = "httpx" }, { name = "hydra-core" }, @@ -1362,6 +1446,7 @@ requires-dist = [ { name = "deepdiff", specifier = ">=6.0.0" }, { name = "docker", marker = "extra == 'dev'", specifier = "==7.1.0" }, { name = "docstring-parser", specifier = ">=0.15" }, + { name = "dspy", specifier = ">=3.0.0" }, { name = "e2b", marker = "extra == 'dev'" }, { name = "fastapi", specifier = ">=0.116.1" }, { name = "fireworks-ai", marker = "extra == 'fireworks'", specifier = ">=0.19.19" }, @@ -1790,6 +1875,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/12/41fcfba4ae0f6b4805f09d11f0e6d6417df2572cea13208c0f439170ee0c/genai_prices-0.0.25-py3-none-any.whl", hash = "sha256:47b412e6927787caa00717a5d99b2e4c0858bed507bb16473b1bcaff48d5aae9", size = 47002, upload-time = "2025-09-01T17:30:41.012Z" }, ] +[[package]] +name = "gepa" +version = "0.0.17" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/61/f0/fe312ed4405ddc2ca97dc1ce8915c4dd707e413503e6832910ab088fceb6/gepa-0.0.17.tar.gz", hash = "sha256:641ed46f8127618341b66ee82a87fb46a21c5d2d427a5e0b91c850a7f7f64e7f", size = 99816, upload-time = "2025-09-25T22:13:45.476Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/dc/2bc81a01caa887ed58db3c725bebf1e98f37807a4d06c51ecaa85a7cabe0/gepa-0.0.17-py3-none-any.whl", hash = "sha256:0ea98f4179dbc8dd83bdf53494f302e663ee1da8300d086c4cc8ce4aefa4042c", size = 110464, upload-time = "2025-09-25T22:13:44.14Z" }, +] + [[package]] name = "gitdb" version = "4.0.12" @@ -1959,6 +2053,61 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8", size = 294530, upload-time = "2025-04-14T10:17:01.271Z" }, ] +[[package]] +name = "greenlet" +version = "3.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/e5/40dbda2736893e3e53d25838e0f19a2b417dfc122b9989c91918db30b5d3/greenlet-3.3.0.tar.gz", hash = "sha256:a82bb225a4e9e4d653dd2fb7b8b2d36e4fb25bc0165422a11e48b88e9e6f78fb", size = 190651, upload-time = "2025-12-04T14:49:44.05Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/6a/33d1702184d94106d3cdd7bfb788e19723206fce152e303473ca3b946c7b/greenlet-3.3.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:6f8496d434d5cb2dce025773ba5597f71f5410ae499d5dd9533e0653258cdb3d", size = 273658, upload-time = "2025-12-04T14:23:37.494Z" }, + { url = "https://files.pythonhosted.org/packages/d6/b7/2b5805bbf1907c26e434f4e448cd8b696a0b71725204fa21a211ff0c04a7/greenlet-3.3.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b96dc7eef78fd404e022e165ec55327f935b9b52ff355b067eb4a0267fc1cffb", size = 574810, upload-time = 
"2025-12-04T14:50:04.154Z" }, + { url = "https://files.pythonhosted.org/packages/94/38/343242ec12eddf3d8458c73f555c084359883d4ddc674240d9e61ec51fd6/greenlet-3.3.0-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:73631cd5cccbcfe63e3f9492aaa664d278fda0ce5c3d43aeda8e77317e38efbd", size = 586248, upload-time = "2025-12-04T14:57:39.35Z" }, + { url = "https://files.pythonhosted.org/packages/f0/d0/0ae86792fb212e4384041e0ef8e7bc66f59a54912ce407d26a966ed2914d/greenlet-3.3.0-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b299a0cb979f5d7197442dccc3aee67fce53500cd88951b7e6c35575701c980b", size = 597403, upload-time = "2025-12-04T15:07:10.831Z" }, + { url = "https://files.pythonhosted.org/packages/b6/a8/15d0aa26c0036a15d2659175af00954aaaa5d0d66ba538345bd88013b4d7/greenlet-3.3.0-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7dee147740789a4632cace364816046e43310b59ff8fb79833ab043aefa72fd5", size = 586910, upload-time = "2025-12-04T14:25:59.705Z" }, + { url = "https://files.pythonhosted.org/packages/e1/9b/68d5e3b7ccaba3907e5532cf8b9bf16f9ef5056a008f195a367db0ff32db/greenlet-3.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:39b28e339fc3c348427560494e28d8a6f3561c8d2bcf7d706e1c624ed8d822b9", size = 1547206, upload-time = "2025-12-04T15:04:21.027Z" }, + { url = "https://files.pythonhosted.org/packages/66/bd/e3086ccedc61e49f91e2cfb5ffad9d8d62e5dc85e512a6200f096875b60c/greenlet-3.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b3c374782c2935cc63b2a27ba8708471de4ad1abaa862ffdb1ef45a643ddbb7d", size = 1613359, upload-time = "2025-12-04T14:27:26.548Z" }, + { url = "https://files.pythonhosted.org/packages/f4/6b/d4e73f5dfa888364bbf02efa85616c6714ae7c631c201349782e5b428925/greenlet-3.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:b49e7ed51876b459bd645d83db257f0180e345d3f768a35a85437a24d5a49082", size = 300740, upload-time = "2025-12-04T14:47:52.773Z" }, + { url = 
"https://files.pythonhosted.org/packages/1f/cb/48e964c452ca2b92175a9b2dca037a553036cb053ba69e284650ce755f13/greenlet-3.3.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:e29f3018580e8412d6aaf5641bb7745d38c85228dacf51a73bd4e26ddf2a6a8e", size = 274908, upload-time = "2025-12-04T14:23:26.435Z" }, + { url = "https://files.pythonhosted.org/packages/28/da/38d7bff4d0277b594ec557f479d65272a893f1f2a716cad91efeb8680953/greenlet-3.3.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a687205fb22794e838f947e2194c0566d3812966b41c78709554aa883183fb62", size = 577113, upload-time = "2025-12-04T14:50:05.493Z" }, + { url = "https://files.pythonhosted.org/packages/3c/f2/89c5eb0faddc3ff014f1c04467d67dee0d1d334ab81fadbf3744847f8a8a/greenlet-3.3.0-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4243050a88ba61842186cb9e63c7dfa677ec146160b0efd73b855a3d9c7fcf32", size = 590338, upload-time = "2025-12-04T14:57:41.136Z" }, + { url = "https://files.pythonhosted.org/packages/80/d7/db0a5085035d05134f8c089643da2b44cc9b80647c39e93129c5ef170d8f/greenlet-3.3.0-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:670d0f94cd302d81796e37299bcd04b95d62403883b24225c6b5271466612f45", size = 601098, upload-time = "2025-12-04T15:07:11.898Z" }, + { url = "https://files.pythonhosted.org/packages/dc/a6/e959a127b630a58e23529972dbc868c107f9d583b5a9f878fb858c46bc1a/greenlet-3.3.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6cb3a8ec3db4a3b0eb8a3c25436c2d49e3505821802074969db017b87bc6a948", size = 590206, upload-time = "2025-12-04T14:26:01.254Z" }, + { url = "https://files.pythonhosted.org/packages/48/60/29035719feb91798693023608447283b266b12efc576ed013dd9442364bb/greenlet-3.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2de5a0b09eab81fc6a382791b995b1ccf2b172a9fec934747a7a23d2ff291794", size = 1550668, upload-time = "2025-12-04T15:04:22.439Z" }, + { url = 
"https://files.pythonhosted.org/packages/0a/5f/783a23754b691bfa86bd72c3033aa107490deac9b2ef190837b860996c9f/greenlet-3.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4449a736606bd30f27f8e1ff4678ee193bc47f6ca810d705981cfffd6ce0d8c5", size = 1615483, upload-time = "2025-12-04T14:27:28.083Z" }, + { url = "https://files.pythonhosted.org/packages/1d/d5/c339b3b4bc8198b7caa4f2bd9fd685ac9f29795816d8db112da3d04175bb/greenlet-3.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:7652ee180d16d447a683c04e4c5f6441bae7ba7b17ffd9f6b3aff4605e9e6f71", size = 301164, upload-time = "2025-12-04T14:42:51.577Z" }, + { url = "https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" }, + { url = "https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" }, + { url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" }, + { url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" }, + { url = 
"https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" }, + { url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" }, + { url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" }, + { url = "https://files.pythonhosted.org/packages/6c/79/3912a94cf27ec503e51ba493692d6db1e3cd8ac7ac52b0b47c8e33d7f4f9/greenlet-3.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a7a34b13d43a6b78abf828a6d0e87d3385680eaf830cd60d20d52f249faabf39", size = 301964, upload-time = "2025-12-04T14:36:58.316Z" }, + { url = "https://files.pythonhosted.org/packages/02/2f/28592176381b9ab2cafa12829ba7b472d177f3acc35d8fbcf3673d966fff/greenlet-3.3.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:a1e41a81c7e2825822f4e068c48cb2196002362619e2d70b148f20a831c00739", size = 275140, upload-time = "2025-12-04T14:23:01.282Z" }, + { url = "https://files.pythonhosted.org/packages/2c/80/fbe937bf81e9fca98c981fe499e59a3f45df2a04da0baa5c2be0dca0d329/greenlet-3.3.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f515a47d02da4d30caaa85b69474cec77b7929b2e936ff7fb853d42f4bf8808", size = 599219, upload-time = "2025-12-04T14:50:08.309Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/ff/7c985128f0514271b8268476af89aee6866df5eec04ac17dcfbc676213df/greenlet-3.3.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d2d9fd66bfadf230b385fdc90426fcd6eb64db54b40c495b72ac0feb5766c54", size = 610211, upload-time = "2025-12-04T14:57:43.968Z" }, + { url = "https://files.pythonhosted.org/packages/79/07/c47a82d881319ec18a4510bb30463ed6891f2ad2c1901ed5ec23d3de351f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:30a6e28487a790417d036088b3bcb3f3ac7d8babaa7d0139edbaddebf3af9492", size = 624311, upload-time = "2025-12-04T15:07:14.697Z" }, + { url = "https://files.pythonhosted.org/packages/fd/8e/424b8c6e78bd9837d14ff7df01a9829fc883ba2ab4ea787d4f848435f23f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:087ea5e004437321508a8d6f20efc4cfec5e3c30118e1417ea96ed1d93950527", size = 612833, upload-time = "2025-12-04T14:26:03.669Z" }, + { url = "https://files.pythonhosted.org/packages/b5/ba/56699ff9b7c76ca12f1cdc27a886d0f81f2189c3455ff9f65246780f713d/greenlet-3.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ab97cf74045343f6c60a39913fa59710e4bd26a536ce7ab2397adf8b27e67c39", size = 1567256, upload-time = "2025-12-04T15:04:25.276Z" }, + { url = "https://files.pythonhosted.org/packages/1e/37/f31136132967982d698c71a281a8901daf1a8fbab935dce7c0cf15f942cc/greenlet-3.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5375d2e23184629112ca1ea89a53389dddbffcf417dad40125713d88eb5f96e8", size = 1636483, upload-time = "2025-12-04T14:27:30.804Z" }, + { url = "https://files.pythonhosted.org/packages/7e/71/ba21c3fb8c5dce83b8c01f458a42e99ffdb1963aeec08fff5a18588d8fd7/greenlet-3.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:9ee1942ea19550094033c35d25d20726e4f1c40d59545815e1128ac58d416d38", size = 301833, upload-time = "2025-12-04T14:32:23.929Z" }, + { url = 
"https://files.pythonhosted.org/packages/d7/7c/f0a6d0ede2c7bf092d00bc83ad5bafb7e6ec9b4aab2fbdfa6f134dc73327/greenlet-3.3.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:60c2ef0f578afb3c8d92ea07ad327f9a062547137afe91f38408f08aacab667f", size = 275671, upload-time = "2025-12-04T14:23:05.267Z" }, + { url = "https://files.pythonhosted.org/packages/44/06/dac639ae1a50f5969d82d2e3dd9767d30d6dbdbab0e1a54010c8fe90263c/greenlet-3.3.0-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a5d554d0712ba1de0a6c94c640f7aeba3f85b3a6e1f2899c11c2c0428da9365", size = 646360, upload-time = "2025-12-04T14:50:10.026Z" }, + { url = "https://files.pythonhosted.org/packages/e0/94/0fb76fe6c5369fba9bf98529ada6f4c3a1adf19e406a47332245ef0eb357/greenlet-3.3.0-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3a898b1e9c5f7307ebbde4102908e6cbfcb9ea16284a3abe15cab996bee8b9b3", size = 658160, upload-time = "2025-12-04T14:57:45.41Z" }, + { url = "https://files.pythonhosted.org/packages/93/79/d2c70cae6e823fac36c3bbc9077962105052b7ef81db2f01ec3b9bf17e2b/greenlet-3.3.0-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:dcd2bdbd444ff340e8d6bdf54d2f206ccddbb3ccfdcd3c25bf4afaa7b8f0cf45", size = 671388, upload-time = "2025-12-04T15:07:15.789Z" }, + { url = "https://files.pythonhosted.org/packages/b8/14/bab308fc2c1b5228c3224ec2bf928ce2e4d21d8046c161e44a2012b5203e/greenlet-3.3.0-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5773edda4dc00e173820722711d043799d3adb4f01731f40619e07ea2750b955", size = 660166, upload-time = "2025-12-04T14:26:05.099Z" }, + { url = "https://files.pythonhosted.org/packages/4b/d2/91465d39164eaa0085177f61983d80ffe746c5a1860f009811d498e7259c/greenlet-3.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ac0549373982b36d5fd5d30beb8a7a33ee541ff98d2b502714a09f1169f31b55", size = 1615193, upload-time = "2025-12-04T15:04:27.041Z" }, + { url = 
"https://files.pythonhosted.org/packages/42/1b/83d110a37044b92423084d52d5d5a3b3a73cafb51b547e6d7366ff62eff1/greenlet-3.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d198d2d977460358c3b3a4dc844f875d1adb33817f0613f663a656f463764ccc", size = 1683653, upload-time = "2025-12-04T14:27:32.366Z" }, + { url = "https://files.pythonhosted.org/packages/7c/9a/9030e6f9aa8fd7808e9c31ba4c38f87c4f8ec324ee67431d181fe396d705/greenlet-3.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:73f51dd0e0bdb596fb0417e475fa3c5e32d4c83638296e560086b8d7da7c4170", size = 305387, upload-time = "2025-12-04T14:26:51.063Z" }, + { url = "https://files.pythonhosted.org/packages/a0/66/bd6317bc5932accf351fc19f177ffba53712a202f9df10587da8df257c7e/greenlet-3.3.0-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:d6ed6f85fae6cdfdb9ce04c9bf7a08d666cfcfb914e7d006f44f840b46741931", size = 282638, upload-time = "2025-12-04T14:25:20.941Z" }, + { url = "https://files.pythonhosted.org/packages/30/cf/cc81cb030b40e738d6e69502ccbd0dd1bced0588e958f9e757945de24404/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d9125050fcf24554e69c4cacb086b87b3b55dc395a8b3ebe6487b045b2614388", size = 651145, upload-time = "2025-12-04T14:50:11.039Z" }, + { url = "https://files.pythonhosted.org/packages/9c/ea/1020037b5ecfe95ca7df8d8549959baceb8186031da83d5ecceff8b08cd2/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:87e63ccfa13c0a0f6234ed0add552af24cc67dd886731f2261e46e241608bee3", size = 654236, upload-time = "2025-12-04T14:57:47.007Z" }, + { url = "https://files.pythonhosted.org/packages/69/cc/1e4bae2e45ca2fa55299f4e85854606a78ecc37fead20d69322f96000504/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2662433acbca297c9153a4023fe2161c8dcfdcc91f10433171cf7e7d94ba2221", size = 662506, upload-time = "2025-12-04T15:07:16.906Z" }, + { url = 
"https://files.pythonhosted.org/packages/57/b9/f8025d71a6085c441a7eaff0fd928bbb275a6633773667023d19179fe815/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3c6e9b9c1527a78520357de498b0e709fb9e2f49c3a513afd5a249007261911b", size = 653783, upload-time = "2025-12-04T14:26:06.225Z" }, + { url = "https://files.pythonhosted.org/packages/f6/c7/876a8c7a7485d5d6b5c6821201d542ef28be645aa024cfe1145b35c120c1/greenlet-3.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:286d093f95ec98fdd92fcb955003b8a3d054b4e2cab3e2707a5039e7b50520fd", size = 1614857, upload-time = "2025-12-04T15:04:28.484Z" }, + { url = "https://files.pythonhosted.org/packages/4f/dc/041be1dff9f23dac5f48a43323cd0789cb798342011c19a248d9c9335536/greenlet-3.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c10513330af5b8ae16f023e8ddbfb486ab355d04467c4679c5cfe4659975dd9", size = 1676034, upload-time = "2025-12-04T14:27:33.531Z" }, +] + [[package]] name = "griffe" version = "1.12.1" @@ -2618,6 +2767,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" }, ] +[[package]] +name = "joblib" +version = "1.5.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/5d/447af5ea094b9e4c4054f82e223ada074c552335b9b4b2d14bd9b35a67c4/joblib-1.5.2.tar.gz", hash = "sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55", size = 331077, upload-time = "2025-08-27T12:15:46.575Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/e8/685f47e0d754320684db4425a0967f7d3fa70126bffd76110b7009a0090f/joblib-1.5.2-py3-none-any.whl", hash = "sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241", size = 308396, upload-time = 
"2025-08-27T12:15:45.188Z" }, +] + +[[package]] +name = "json-repair" +version = "0.54.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ff/05/9fbcd5ffab9c41455e7d80af65a90876718b8ea2fb4525e187ab11836dd4/json_repair-0.54.2.tar.gz", hash = "sha256:4b6b62ce17f1a505b220fa4aadba1fc37dc9c221544f158471efe3775620bad6", size = 38575, upload-time = "2025-11-25T19:31:22.768Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/3a/1b4df9adcd69fee9c9e4b439c13e8c866f2fae520054aede7030b2278be9/json_repair-0.54.2-py3-none-any.whl", hash = "sha256:be51cce5dca97e0c24ebdf61a1ede2449a8a7666012de99467bb7b0afb35179b", size = 29322, upload-time = "2025-11-25T19:31:21.492Z" }, +] + [[package]] name = "json5" version = "0.12.0" @@ -3141,6 +3308,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" }, ] +[[package]] +name = "magicattr" +version = "0.1.6" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/7e/76b7e0c391bee7e9273725c29c8fe41c4df62a215ce58aa8e3518baee0bb/magicattr-0.1.6-py2.py3-none-any.whl", hash = "sha256:d96b18ee45b5ee83b09c17e15d3459a64de62d538808c2f71182777dd9dbbbdf", size = 4664, upload-time = "2022-01-25T16:56:47.074Z" }, +] + +[[package]] +name = "mako" +version = "1.3.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/38/bd5b78a920a64d708fe6bc8e0a2c075e1389d53bef8413725c63ba041535/mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28", size = 392474, upload-time = "2025-04-10T12:44:31.16Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/87/fb/99f81ac72ae23375f22b7afdb7642aba97c00a713c217124420147681a2f/mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59", size = 78509, upload-time = "2025-04-10T12:50:53.297Z" }, +] + [[package]] name = "markdown-it-py" version = "3.0.0" @@ -4211,6 +4398,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0b/a6/b98d508d189b9c208f5978d0906141747d7e6df7c7cafec03657ed1ed559/opentelemetry_util_http-0.57b0-py3-none-any.whl", hash = "sha256:e54c0df5543951e471c3d694f85474977cd5765a3b7654398c83bab3d2ffb8e9", size = 7643, upload-time = "2025-07-29T15:42:41.744Z" }, ] +[[package]] +name = "optuna" +version = "4.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "alembic" }, + { name = "colorlog" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.3.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "sqlalchemy" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6b/81/08f90f194eed78178064a9383432eca95611e2c5331e7b01e2418ce4b15a/optuna-4.6.0.tar.gz", hash = "sha256:89e38c2447c7f793a726617b8043f01e31f0bad54855040db17eb3b49404a369", size = 477444, upload-time = "2025-11-10T05:14:30.151Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/de/3d8455b08cb6312f8cc46aacdf16c71d4d881a1db4a4140fc5ef31108422/optuna-4.6.0-py3-none-any.whl", hash = "sha256:4c3a9facdef2b2dd7e3e2a8ae3697effa70fae4056fcf3425cfc6f5a40feb069", size = 404708, upload-time = "2025-11-10T05:14:28.6Z" }, +] + [[package]] name = "orderly-set" version = "5.5.0" @@ -6251,6 +6457,51 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/e7/9c/0e6afc12c269578be5c0c1c9f4b49a8d32770a080260c333ac04cc1c832d/soupsieve-2.7-py3-none-any.whl", hash = "sha256:6e60cc5c1ffaf1cebcc12e8188320b72071e922c2e897f737cadce79ad5d30c4", size = 36677, upload-time = "2025-04-20T18:50:07.196Z" }, ] +[[package]] +name = "sqlalchemy" +version = "2.0.44" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "greenlet", marker = "platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f0/f2/840d7b9496825333f532d2e3976b8eadbf52034178aac53630d09fe6e1ef/sqlalchemy-2.0.44.tar.gz", hash = "sha256:0ae7454e1ab1d780aee69fd2aae7d6b8670a581d8847f2d1e0f7ddfbf47e5a22", size = 9819830, upload-time = "2025-10-10T14:39:12.935Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/a7/e9ccfa7eecaf34c6f57d8cb0bb7cbdeeff27017cc0f5d0ca90fdde7a7c0d/sqlalchemy-2.0.44-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c77f3080674fc529b1bd99489378c7f63fcb4ba7f8322b79732e0258f0ea3ce", size = 2137282, upload-time = "2025-10-10T15:36:10.965Z" }, + { url = "https://files.pythonhosted.org/packages/b1/e1/50bc121885bdf10833a4f65ecbe9fe229a3215f4d65a58da8a181734cae3/sqlalchemy-2.0.44-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4c26ef74ba842d61635b0152763d057c8d48215d5be9bb8b7604116a059e9985", size = 2127322, upload-time = "2025-10-10T15:36:12.428Z" }, + { url = "https://files.pythonhosted.org/packages/46/f2/a8573b7230a3ce5ee4b961a2d510d71b43872513647398e595b744344664/sqlalchemy-2.0.44-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4a172b31785e2f00780eccab00bc240ccdbfdb8345f1e6063175b3ff12ad1b0", size = 3214772, upload-time = "2025-10-10T15:34:15.09Z" }, + { url = 
"https://files.pythonhosted.org/packages/4a/d8/c63d8adb6a7edaf8dcb6f75a2b1e9f8577960a1e489606859c4d73e7d32b/sqlalchemy-2.0.44-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9480c0740aabd8cb29c329b422fb65358049840b34aba0adf63162371d2a96e", size = 3214434, upload-time = "2025-10-10T15:47:00.473Z" }, + { url = "https://files.pythonhosted.org/packages/ee/a6/243d277a4b54fae74d4797957a7320a5c210c293487f931cbe036debb697/sqlalchemy-2.0.44-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:17835885016b9e4d0135720160db3095dc78c583e7b902b6be799fb21035e749", size = 3155365, upload-time = "2025-10-10T15:34:17.932Z" }, + { url = "https://files.pythonhosted.org/packages/5f/f8/6a39516ddd75429fd4ee5a0d72e4c80639fab329b2467c75f363c2ed9751/sqlalchemy-2.0.44-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cbe4f85f50c656d753890f39468fcd8190c5f08282caf19219f684225bfd5fd2", size = 3178910, upload-time = "2025-10-10T15:47:02.346Z" }, + { url = "https://files.pythonhosted.org/packages/43/f0/118355d4ad3c39d9a2f5ee4c7304a9665b3571482777357fa9920cd7a6b4/sqlalchemy-2.0.44-cp310-cp310-win32.whl", hash = "sha256:2fcc4901a86ed81dc76703f3b93ff881e08761c63263c46991081fd7f034b165", size = 2105624, upload-time = "2025-10-10T15:38:15.552Z" }, + { url = "https://files.pythonhosted.org/packages/61/83/6ae5f9466f8aa5d0dcebfff8c9c33b98b27ce23292df3b990454b3d434fd/sqlalchemy-2.0.44-cp310-cp310-win_amd64.whl", hash = "sha256:9919e77403a483ab81e3423151e8ffc9dd992c20d2603bf17e4a8161111e55f5", size = 2129240, upload-time = "2025-10-10T15:38:17.175Z" }, + { url = "https://files.pythonhosted.org/packages/e3/81/15d7c161c9ddf0900b076b55345872ed04ff1ed6a0666e5e94ab44b0163c/sqlalchemy-2.0.44-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0fe3917059c7ab2ee3f35e77757062b1bea10a0b6ca633c58391e3f3c6c488dd", size = 2140517, upload-time = "2025-10-10T15:36:15.64Z" }, + { url = 
"https://files.pythonhosted.org/packages/d4/d5/4abd13b245c7d91bdf131d4916fd9e96a584dac74215f8b5bc945206a974/sqlalchemy-2.0.44-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:de4387a354ff230bc979b46b2207af841dc8bf29847b6c7dbe60af186d97aefa", size = 2130738, upload-time = "2025-10-10T15:36:16.91Z" }, + { url = "https://files.pythonhosted.org/packages/cb/3c/8418969879c26522019c1025171cefbb2a8586b6789ea13254ac602986c0/sqlalchemy-2.0.44-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3678a0fb72c8a6a29422b2732fe423db3ce119c34421b5f9955873eb9b62c1e", size = 3304145, upload-time = "2025-10-10T15:34:19.569Z" }, + { url = "https://files.pythonhosted.org/packages/94/2d/fdb9246d9d32518bda5d90f4b65030b9bf403a935cfe4c36a474846517cb/sqlalchemy-2.0.44-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3cf6872a23601672d61a68f390e44703442639a12ee9dd5a88bbce52a695e46e", size = 3304511, upload-time = "2025-10-10T15:47:05.088Z" }, + { url = "https://files.pythonhosted.org/packages/7d/fb/40f2ad1da97d5c83f6c1269664678293d3fe28e90ad17a1093b735420549/sqlalchemy-2.0.44-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:329aa42d1be9929603f406186630135be1e7a42569540577ba2c69952b7cf399", size = 3235161, upload-time = "2025-10-10T15:34:21.193Z" }, + { url = "https://files.pythonhosted.org/packages/95/cb/7cf4078b46752dca917d18cf31910d4eff6076e5b513c2d66100c4293d83/sqlalchemy-2.0.44-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:70e03833faca7166e6a9927fbee7c27e6ecde436774cd0b24bbcc96353bce06b", size = 3261426, upload-time = "2025-10-10T15:47:07.196Z" }, + { url = "https://files.pythonhosted.org/packages/f8/3b/55c09b285cb2d55bdfa711e778bdffdd0dc3ffa052b0af41f1c5d6e582fa/sqlalchemy-2.0.44-cp311-cp311-win32.whl", hash = "sha256:253e2f29843fb303eca6b2fc645aca91fa7aa0aa70b38b6950da92d44ff267f3", size = 2105392, upload-time = "2025-10-10T15:38:20.051Z" }, + { url = 
"https://files.pythonhosted.org/packages/c7/23/907193c2f4d680aedbfbdf7bf24c13925e3c7c292e813326c1b84a0b878e/sqlalchemy-2.0.44-cp311-cp311-win_amd64.whl", hash = "sha256:7a8694107eb4308a13b425ca8c0e67112f8134c846b6e1f722698708741215d5", size = 2130293, upload-time = "2025-10-10T15:38:21.601Z" }, + { url = "https://files.pythonhosted.org/packages/62/c4/59c7c9b068e6813c898b771204aad36683c96318ed12d4233e1b18762164/sqlalchemy-2.0.44-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:72fea91746b5890f9e5e0997f16cbf3d53550580d76355ba2d998311b17b2250", size = 2139675, upload-time = "2025-10-10T16:03:31.064Z" }, + { url = "https://files.pythonhosted.org/packages/d6/ae/eeb0920537a6f9c5a3708e4a5fc55af25900216bdb4847ec29cfddf3bf3a/sqlalchemy-2.0.44-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:585c0c852a891450edbb1eaca8648408a3cc125f18cf433941fa6babcc359e29", size = 2127726, upload-time = "2025-10-10T16:03:35.934Z" }, + { url = "https://files.pythonhosted.org/packages/d8/d5/2ebbabe0379418eda8041c06b0b551f213576bfe4c2f09d77c06c07c8cc5/sqlalchemy-2.0.44-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b94843a102efa9ac68a7a30cd46df3ff1ed9c658100d30a725d10d9c60a2f44", size = 3327603, upload-time = "2025-10-10T15:35:28.322Z" }, + { url = "https://files.pythonhosted.org/packages/45/e5/5aa65852dadc24b7d8ae75b7efb8d19303ed6ac93482e60c44a585930ea5/sqlalchemy-2.0.44-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:119dc41e7a7defcefc57189cfa0e61b1bf9c228211aba432b53fb71ef367fda1", size = 3337842, upload-time = "2025-10-10T15:43:45.431Z" }, + { url = "https://files.pythonhosted.org/packages/41/92/648f1afd3f20b71e880ca797a960f638d39d243e233a7082c93093c22378/sqlalchemy-2.0.44-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0765e318ee9179b3718c4fd7ba35c434f4dd20332fbc6857a5e8df17719c24d7", size = 3264558, upload-time = "2025-10-10T15:35:29.93Z" }, + { url = 
"https://files.pythonhosted.org/packages/40/cf/e27d7ee61a10f74b17740918e23cbc5bc62011b48282170dc4c66da8ec0f/sqlalchemy-2.0.44-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2e7b5b079055e02d06a4308d0481658e4f06bc7ef211567edc8f7d5dce52018d", size = 3301570, upload-time = "2025-10-10T15:43:48.407Z" }, + { url = "https://files.pythonhosted.org/packages/3b/3d/3116a9a7b63e780fb402799b6da227435be878b6846b192f076d2f838654/sqlalchemy-2.0.44-cp312-cp312-win32.whl", hash = "sha256:846541e58b9a81cce7dee8329f352c318de25aa2f2bbe1e31587eb1f057448b4", size = 2103447, upload-time = "2025-10-10T15:03:21.678Z" }, + { url = "https://files.pythonhosted.org/packages/25/83/24690e9dfc241e6ab062df82cc0df7f4231c79ba98b273fa496fb3dd78ed/sqlalchemy-2.0.44-cp312-cp312-win_amd64.whl", hash = "sha256:7cbcb47fd66ab294703e1644f78971f6f2f1126424d2b300678f419aa73c7b6e", size = 2130912, upload-time = "2025-10-10T15:03:24.656Z" }, + { url = "https://files.pythonhosted.org/packages/45/d3/c67077a2249fdb455246e6853166360054c331db4613cda3e31ab1cadbef/sqlalchemy-2.0.44-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ff486e183d151e51b1d694c7aa1695747599bb00b9f5f604092b54b74c64a8e1", size = 2135479, upload-time = "2025-10-10T16:03:37.671Z" }, + { url = "https://files.pythonhosted.org/packages/2b/91/eabd0688330d6fd114f5f12c4f89b0d02929f525e6bf7ff80aa17ca802af/sqlalchemy-2.0.44-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0b1af8392eb27b372ddb783b317dea0f650241cea5bd29199b22235299ca2e45", size = 2123212, upload-time = "2025-10-10T16:03:41.755Z" }, + { url = "https://files.pythonhosted.org/packages/b0/bb/43e246cfe0e81c018076a16036d9b548c4cc649de241fa27d8d9ca6f85ab/sqlalchemy-2.0.44-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b61188657e3a2b9ac4e8f04d6cf8e51046e28175f79464c67f2fd35bceb0976", size = 3255353, upload-time = "2025-10-10T15:35:31.221Z" }, + { url = 
"https://files.pythonhosted.org/packages/b9/96/c6105ed9a880abe346b64d3b6ddef269ddfcab04f7f3d90a0bf3c5a88e82/sqlalchemy-2.0.44-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b87e7b91a5d5973dda5f00cd61ef72ad75a1db73a386b62877d4875a8840959c", size = 3260222, upload-time = "2025-10-10T15:43:50.124Z" }, + { url = "https://files.pythonhosted.org/packages/44/16/1857e35a47155b5ad927272fee81ae49d398959cb749edca6eaa399b582f/sqlalchemy-2.0.44-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:15f3326f7f0b2bfe406ee562e17f43f36e16167af99c4c0df61db668de20002d", size = 3189614, upload-time = "2025-10-10T15:35:32.578Z" }, + { url = "https://files.pythonhosted.org/packages/88/ee/4afb39a8ee4fc786e2d716c20ab87b5b1fb33d4ac4129a1aaa574ae8a585/sqlalchemy-2.0.44-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1e77faf6ff919aa8cd63f1c4e561cac1d9a454a191bb864d5dd5e545935e5a40", size = 3226248, upload-time = "2025-10-10T15:43:51.862Z" }, + { url = "https://files.pythonhosted.org/packages/32/d5/0e66097fc64fa266f29a7963296b40a80d6a997b7ac13806183700676f86/sqlalchemy-2.0.44-cp313-cp313-win32.whl", hash = "sha256:ee51625c2d51f8baadf2829fae817ad0b66b140573939dd69284d2ba3553ae73", size = 2101275, upload-time = "2025-10-10T15:03:26.096Z" }, + { url = "https://files.pythonhosted.org/packages/03/51/665617fe4f8c6450f42a6d8d69243f9420f5677395572c2fe9d21b493b7b/sqlalchemy-2.0.44-cp313-cp313-win_amd64.whl", hash = "sha256:c1c80faaee1a6c3428cecf40d16a2365bcf56c424c92c2b6f0f9ad204b899e9e", size = 2127901, upload-time = "2025-10-10T15:03:27.548Z" }, + { url = "https://files.pythonhosted.org/packages/9c/5e/6a29fa884d9fb7ddadf6b69490a9d45fded3b38541713010dad16b77d015/sqlalchemy-2.0.44-py3-none-any.whl", hash = "sha256:19de7ca1246fbef9f9d1bff8f1ab25641569df226364a0e40457dc5457c54b05", size = 1928718, upload-time = "2025-10-10T15:29:45.32Z" }, +] + [[package]] name = "sse-starlette" version = "2.4.1" From 693274e67139578cb3ff9e70fe5d601bf425fd2d Mon Sep 17 00:00:00 2001 
From: Derek Xu Date: Sat, 6 Dec 2025 00:53:50 -0800 Subject: [PATCH 07/15] attempt at primitive conversion --- eval_protocol/benchmarks/test_aime25.py | 48 +++++++++++++++- eval_protocol/training/gepa_trainer.py | 7 +-- eval_protocol/training/gepa_utils.py | 75 +++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 5 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index debd9fad..c921cef7 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -63,6 +63,44 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]: return None +def _build_feedback_text( + *, + extracted_int: Optional[int], + gt_int: Optional[int], + is_valid: bool, + raw_model_answer: str, + ground_truth: Optional[str], +) -> str: + """ + Build a feedback string similar in spirit to the GEPA `metric_with_feedback`. + + Cases: + - Parse failure (model or gold): explain integer formatting and show correct answer. + - Correct: "Your answer is correct. The correct answer is '...'." + - Incorrect: "Your answer is incorrect. The correct answer is '...'." + """ + correct_answer_display = str(gt_int if gt_int is not None else (ground_truth or "")) + + if not is_valid: + # Could not parse either the model answer or the gold answer as an integer. + feedback_text = ( + "The final answer must be a valid integer and nothing else. " + f"You responded with '{raw_model_answer}', which couldn't be parsed as a python integer. " + "Please ensure your answer is a valid integer without any additional text or formatting." + ) + if correct_answer_display: + feedback_text += f" The correct answer is '{correct_answer_display}'." + return feedback_text + + if extracted_int == gt_int: + return f"Your answer is correct. The correct answer is '{correct_answer_display}'." + else: + return f"Your answer is incorrect. The correct answer is '{correct_answer_display}'." 
+ + # TODO: our dataset does not contain written solutions, so we cannot provide feedback on the solution. maybe need to add it later. + # they're using https://huggingface.co/datasets/AI-MO/aimo-validation-aime + + def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: converted: List[EvaluationRow] = [] for r in rows: @@ -126,9 +164,17 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow: ) } + feedback_text = _build_feedback_text( + extracted_int=extracted_int, + gt_int=gt_int, + is_valid=is_valid, + raw_model_answer=content_str, + ground_truth=str(row.ground_truth), + ) + row.evaluation_result = EvaluateResult( score=score, - reason=("Answer correct" if score == 1.0 else "Answer incorrect"), + reason=feedback_text, is_score_valid=is_valid, metrics=metrics, ) diff --git a/eval_protocol/training/gepa_trainer.py b/eval_protocol/training/gepa_trainer.py index 8c05f824..6fc2a08a 100644 --- a/eval_protocol/training/gepa_trainer.py +++ b/eval_protocol/training/gepa_trainer.py @@ -11,6 +11,7 @@ from eval_protocol.pytest.types import TestFunction from eval_protocol.training.trainer import Trainer from eval_protocol.training.utils import build_ep_parameters_from_test +from eval_protocol.training.gepa_utils import ep_test_to_gepa_metric class GEPATrainer(Trainer): @@ -33,11 +34,9 @@ def __init__(self, test_fn: TestFunction) -> None: super().__init__(test_fn) self.ep_params: EPParameters = build_ep_parameters_from_test(test_fn) - self.metric = test_fn # TODO @derek. need to convert our ep test_fn to a GEPA metric. also need to inject the feedback text. + self.metric = ep_test_to_gepa_metric(test_fn) - self.program = ( - ... - ) # TODO @shreymodi1: converting between a program (dspy.Module) and an @evaluation_test is a bit tricky. + self.program = ... # TODO @shreymodi1: converting between a program (dspy.Module) and rollout processors is a bit tricky. 
maybe start with single turn self.train_set, self.val_set, self.test_set = ( ..., diff --git a/eval_protocol/training/gepa_utils.py b/eval_protocol/training/gepa_utils.py index 520c7de4..ec50245c 100644 --- a/eval_protocol/training/gepa_utils.py +++ b/eval_protocol/training/gepa_utils.py @@ -1,7 +1,15 @@ import os +from typing import Optional import dspy from dspy.clients.lm import LM +from dspy.primitives import Example, Prediction +from dspy.teleprompt.gepa.gepa_utils import DSPyTrace, ScoreWithFeedback +from dspy.teleprompt.gepa.gepa import GEPAFeedbackMetric + +from eval_protocol.pytest.types import TestFunction +from eval_protocol.models import EvaluationRow, Message + REFLECTION_LM_CONFIGS = { "gpt-5": { @@ -30,3 +38,70 @@ def build_reflection_lm(reflection_lm_name: str) -> LM: api_key=reflection_lm_config["api_key"], base_url=reflection_lm_config["base_url"], ) + + +def gold_and_pred_to_row(gold: Example, pred: Prediction) -> EvaluationRow: + """ + Convert a GEPA (gold, pred) pair into an EvaluationRow for an EP `@evaluation_test`. + + Assumptions (aligned with common DSPy usage): + - `gold.answer` holds the ground-truth answer. + - `pred.answer` holds the model's final answer text. + """ + gt = gold.get("answer", None) + ground_truth_str: Optional[str] = str(gt) if gt is not None else None + + content = pred.get("answer", "") + + return EvaluationRow( + messages=[ + Message(role="assistant", content=str(content)) + ], # TODO: for some evals, you might need system / user message too. + ground_truth=ground_truth_str, + ) + + +def row_to_prediction(row: EvaluationRow) -> ScoreWithFeedback: + """ + Convert an EvaluationRow into a GEPA-compatible ScoreWithFeedback + (implemented as a dspy.Prediction subclass in dspy.teleprompt.gepa). 
+ """ + if row.evaluation_result is None: + return dspy.Prediction( + score=0.0, + feedback="No evaluation_result was produced by the evaluation_test.", + ) + + score = float(row.evaluation_result.score or 0.0) + feedback = row.evaluation_result.reason or f"This trajectory got a score of {score}." + return dspy.Prediction(score=score, feedback=feedback) + + +def ep_test_to_gepa_metric( + test_fn: TestFunction, +) -> GEPAFeedbackMetric: + """ + Adapter: convert an EP-style `test_fn(row: EvaluationRow) -> EvaluationRow` into + a GEPAFeedbackMetric-compatible callable. + + The resulting metric: + - Constructs an EvaluationRow from (gold, pred) using a simple heuristic. + - Applies the EP test_fn to populate `row.evaluation_result`. + - Returns a dspy.Prediction(score, feedback) derived from that result. + """ + + def metric( + gold: Example, + pred: Prediction, + trace: Optional[DSPyTrace] = None, + pred_name: Optional[str] = None, + pred_trace: Optional[DSPyTrace] = None, + ) -> ScoreWithFeedback: + row = gold_and_pred_to_row(gold, pred) + + evaluated_row: EvaluationRow = test_fn(row) # pyright: ignore + # TODO: this is problematic. 
for groupwise, we will have to extend this to handle list[EvaluationRow] + + return row_to_prediction(evaluated_row) + + return metric From 35a3267d7e23832921db61acb25710acaaae8a5a Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Tue, 9 Dec 2025 11:19:54 -0800 Subject: [PATCH 08/15] gepa wokring --- eval_protocol/benchmarks/test_aime25.py | 30 +- eval_protocol/training/__init__.py | 20 +- eval_protocol/training/gepa_trainer.py | 505 ++++++++++++++++++++++-- eval_protocol/training/gepa_utils.py | 434 ++++++++++++++++++-- 4 files changed, 925 insertions(+), 64 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index c921cef7..6994a0ca 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -123,15 +123,14 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: completion_params=[ { "max_tokens": 131000, - "extra_body": {"reasoning_effort": "low"}, - "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", + "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1-terminus", } ], rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=0.8, num_runs=8, - max_dataset_rows=2, + max_dataset_rows=None, # Use full dataset max_concurrent_rollouts=4, mode="pointwise", ) @@ -182,14 +181,31 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow: if __name__ == "__main__": - trainer = GEPATrainer(test_aime25_pointwise) - reflection_lm = build_reflection_lm("gpt-5") + import asyncio + + trainer = GEPATrainer( + test_aime25_pointwise, + train_ratio=0.5, # 50% for training (15 problems) + val_ratio=0.3, # 30% for validation (9 problems) + # test_ratio = 20% (6 problems) - calculated automatically + ) + + # Use same Fireworks model for both main and reflection + reflection_lm = build_reflection_lm("fireworks_ai/accounts/fireworks/models/deepseek-v3p1-terminus") optimized_program = trainer.train( 
- num_threads=32, + num_threads=4, # Reduced from 32 to avoid API timeouts track_stats=True, - reflection_minibatch_size=3, + reflection_minibatch_size=5, # Reduced to limit concurrent requests reflection_lm=reflection_lm, ) + # Option 1: Quick DSPy evaluation (doesn't use EP infrastructure) + print("\n=== DSPy Evaluation ===") print(trainer.evaluate(optimized_program)) + + # Option 2: Full EP evaluation (uses LLM proxy, Fireworks tracing, etc.) + # This goes through the normal @evaluation_test pipeline + print("\n=== EP Evaluation (with tracing) ===") + results = trainer.run_ep_evaluation(optimized_program) + print(f"Final EP Score: {results['score']:.3f}") diff --git a/eval_protocol/training/__init__.py b/eval_protocol/training/__init__.py index fcb904c1..122b6a7a 100644 --- a/eval_protocol/training/__init__.py +++ b/eval_protocol/training/__init__.py @@ -1,3 +1,19 @@ -from gepa_trainer import GEPATrainer +from .gepa_trainer import GEPATrainer +from .gepa_utils import ( + DSPyModuleType, + DSPyModuleFactory, + create_single_turn_program, + create_signature, + build_reflection_lm, +) -__all__ = ["GEPATrainer"] +__all__ = [ + "GEPATrainer", + # DSPy module creation utilities + "DSPyModuleType", + "DSPyModuleFactory", + "create_single_turn_program", + "create_signature", + # Reflection LM helpers + "build_reflection_lm", +] diff --git a/eval_protocol/training/gepa_trainer.py b/eval_protocol/training/gepa_trainer.py index 6fc2a08a..08541062 100644 --- a/eval_protocol/training/gepa_trainer.py +++ b/eval_protocol/training/gepa_trainer.py @@ -1,17 +1,27 @@ -from typing import Any, Dict, Literal +import asyncio +from typing import Any, Dict, List, Literal import dspy from dspy.clients.lm import LM -from dspy.primitives import Module +from dspy.primitives import Module, Example from dspy.teleprompt.gepa.gepa import GEPA from gepa.core.adapter import ProposalFn from gepa.proposer.reflective_mutation.base import ReflectionComponentSelector -from eval_protocol.models import 
EPParameters, EvaluationRow -from eval_protocol.pytest.types import TestFunction +from eval_protocol.models import EPParameters, EvaluationRow, Message +from eval_protocol.pytest.types import TestFunction, RolloutProcessorConfig from eval_protocol.training.trainer import Trainer from eval_protocol.training.utils import build_ep_parameters_from_test -from eval_protocol.training.gepa_utils import ep_test_to_gepa_metric +from eval_protocol.training.gepa_utils import ( + ep_test_to_gepa_metric, + create_single_turn_program, + configure_dspy_lm, + extract_system_prompt_from_rows, + evaluation_rows_to_dspy_examples, + train_val_test_split, + DSPyModuleType, + DSPyModuleFactory, +) class GEPATrainer(Trainer): @@ -19,34 +29,207 @@ class GEPATrainer(Trainer): High-level entrypoint for running GEPA-style training against an existing `@evaluation_test`-decorated function. - This class is intentionally minimal for now: - - It captures `EPParameters` from the provided test function via - `build_ep_parameters_from_test`. - - It stores any GEPA-related configuration kwargs for future use. - - The actual GEPA optimization loop is left as a TODO. + This trainer: + 1. Extracts configuration from the @evaluation_test decorator + 2. Creates a DSPy ChainOfThought program (mirrors SingleTurnRolloutProcessor) + 3. Converts the EP dataset to DSPy format + 4. Uses EP's test function as the GEPA metric + 5. Runs GEPA optimization to find the best system prompt + + The optimized system prompt can then be used with EP's rollout processor + for final evaluation. 
""" - def __init__(self, test_fn: TestFunction) -> None: + def __init__( + self, + test_fn: TestFunction, + *, + # Dataset splitting + train_ratio: float = 0.8, + val_ratio: float = 0.1, + seed: int = 42, + # DSPy signature configuration + input_field: str = "problem", + output_field: str = "answer", + input_desc: str | None = None, + output_desc: str | None = None, + # DSPy module configuration + module_type: DSPyModuleType | str = DSPyModuleType.CHAIN_OF_THOUGHT, + module_factory: DSPyModuleFactory | None = None, + # Custom program (overrides automatic creation) + program: Module | None = None, + ) -> None: """ Args: test_fn: The `@evaluation_test`-decorated function defining the eval. + train_ratio: Proportion of data for training (default 0.8) + val_ratio: Proportion of data for validation (default 0.1) + seed: Random seed for dataset splitting + input_field: Name of the input field in DSPy signature (default: "problem") + output_field: Name of the output field in DSPy signature (default: "answer") + input_desc: Optional description for the input field + output_desc: Optional description for the output field + module_type: Which DSPy module to use: + - PREDICT: Simple input → output + - CHAIN_OF_THOUGHT: Adds reasoning (default, good for complex tasks) + - PROGRAM_OF_THOUGHT: Generates code to solve problems + module_factory: Custom factory to create DSPy module. Overrides module_type. + program: Pre-built DSPy module. If provided, skips automatic creation. 
+ + Examples: + # Default: ChainOfThought for math + trainer = GEPATrainer(test_fn) + + # Simple classification + trainer = GEPATrainer( + test_fn, + input_field="text", + output_field="label", + module_type=DSPyModuleType.PREDICT, + ) + + # Custom DSPy module + my_program = dspy.ChainOfThought(MySignature) + trainer = GEPATrainer(test_fn, program=my_program) """ super().__init__(test_fn) self.ep_params: EPParameters = build_ep_parameters_from_test(test_fn) + # Store configuration + self._input_field = input_field + self._output_field = output_field + + # Configure DSPy to use the same LLM as EP + configure_dspy_lm(self.ep_params) + + # Wrap the EP test function as a GEPA metric self.metric = ep_test_to_gepa_metric(test_fn) - self.program = ... # TODO @shreymodi1: converting between a program (dspy.Module) and rollout processors is a bit tricky. maybe start with single turn + # Load and split the dataset + self._rows: List[EvaluationRow] = self._load_dataset() + train_rows, val_rows, test_rows = train_val_test_split( + self._rows, + train_ratio=train_ratio, + val_ratio=val_ratio, + seed=seed, + ) + + # Extract the system prompt from the dataset (this is what GEPA will optimize!) + self._initial_system_prompt = extract_system_prompt_from_rows(self._rows) + + # Debug: Print initial setup info + print("\n" + "=" * 80) + print("GEPA TRAINER INITIALIZATION") + print("=" * 80) + print(f"\n📊 Dataset loaded: {len(self._rows)} total rows") + print(f" - Train: {len(train_rows)} rows") + print(f" - Val: {len(val_rows)} rows") + print(f" - Test: {len(test_rows)} rows") + print("\n📝 Initial System Prompt (what GEPA will optimize):") + print("-" * 40) + print( + self._initial_system_prompt[:500] + "..." 
+ if self._initial_system_prompt and len(self._initial_system_prompt) > 500 + else self._initial_system_prompt + ) + print("-" * 40) + + # Create or use provided DSPy program + if program is not None: + # Use the provided program directly + self.program: Module = program + else: + # Create DSPy program (mirrors SingleTurnRolloutProcessor) + # - system_prompt → signature.instructions (GEPA optimizes this!) + # - user message → input field + # - assistant response → output field + self.program = create_single_turn_program( + system_prompt=self._initial_system_prompt, + input_field=input_field, + output_field=output_field, + module_type=module_type, + input_desc=input_desc, + output_desc=output_desc, + module_factory=module_factory, + ) + + # Convert EP rows to DSPy Examples + self.train_set: List[Example] = evaluation_rows_to_dspy_examples(train_rows, input_field, output_field) + self.val_set: List[Example] = evaluation_rows_to_dspy_examples(val_rows, input_field, output_field) + self.test_set: List[Example] = evaluation_rows_to_dspy_examples(test_rows, input_field, output_field) + + # Debug: Print example info + print("\n📦 DSPy Examples created:") + print(f" Input field: '{input_field}', Output field: '{output_field}'") + if self.train_set: + ex = self.train_set[0] + print("\n Sample train example:") + print(f" - {input_field}: {str(getattr(ex, input_field, ''))[:200]}...") + print(f" - {output_field}: {str(getattr(ex, output_field, ''))}") + print("=" * 80 + "\n") + + def _load_dataset(self) -> List[EvaluationRow]: + """ + Load the dataset from ep_params. 
+ + Supports: + - input_rows: Pre-constructed EvaluationRow objects + - input_dataset: Paths to JSONL files (requires dataset_adapter) + - input_messages: Raw message lists + """ + ep = self.ep_params + + # Case 1: Pre-constructed rows + if ep.input_rows: + return list(ep.input_rows) + + # Case 2: Dataset paths with adapter + if ep.input_dataset and ep.dataset_adapter: + from eval_protocol.common_utils import load_jsonl + + all_data: List[Dict[str, Any]] = [] + dataset_paths = ep.input_dataset if isinstance(ep.input_dataset, list) else [ep.input_dataset] + + for path in dataset_paths: + all_data.extend(load_jsonl(path)) + + # Apply max_dataset_rows limit + if ep.max_dataset_rows: + all_data = all_data[: ep.max_dataset_rows] - self.train_set, self.val_set, self.test_set = ( - ..., - ..., - ..., - ) # TODO @shreymodi1. need to convert our input_dataset to a train set + return ep.dataset_adapter(all_data) + + # Case 3: Input messages (convert to rows) + if ep.input_messages: + from eval_protocol.models import Message + + rows = [] + for messages in ep.input_messages: + rows.append(EvaluationRow(messages=messages)) + return rows + + raise ValueError( + "No dataset found in ep_params. " + "Provide input_rows, input_dataset (with dataset_adapter), or input_messages." + ) + + @property + def initial_system_prompt(self) -> str | None: + """The original system prompt extracted from the dataset.""" + return self._initial_system_prompt + + def get_optimized_system_prompt(self, optimized_program: Module) -> str: + """ + Extract the optimized system prompt from a GEPA-optimized program. + + This can be used with EP's rollout processor via system_prompt_override. 
+ """ + # GEPA stores optimized instructions in the signature + return optimized_program.predict.signature.instructions def train( self, - auto: Literal["light", "medium", "heavy"] | None = None, + auto: Literal["light", "medium", "heavy"] | None = "light", max_full_evals: int | None = None, max_metric_calls: int | None = None, reflection_minibatch_size: int = 3, @@ -68,7 +251,6 @@ def train( wandb_init_kwargs: dict[str, Any] | None = None, track_best_outputs: bool = False, warn_on_score_mismatch: bool = True, - enable_tool_optimization: bool = False, use_mlflow: bool = False, seed: int | None = 0, gepa_kwargs: dict | None = None, @@ -99,12 +281,44 @@ def train( "wandb_init_kwargs": wandb_init_kwargs, "track_best_outputs": track_best_outputs, "warn_on_score_mismatch": warn_on_score_mismatch, - "enable_tool_optimization": enable_tool_optimization, "use_mlflow": use_mlflow, "seed": seed, } gepa_args.update(gepa_kwargs or {}) + print("\n" + "=" * 80) + print("GEPA TRAINING STARTED") + print("=" * 80) + print(f"📋 Program type: {type(self.program).__name__}") + + # Get signature - ChainOfThought stores it in .predict.signature + sig = None + if hasattr(self.program, "signature"): + sig = self.program.signature + elif hasattr(self.program, "predict") and hasattr(self.program.predict, "signature"): + sig = self.program.predict.signature + + if sig: + print(f"📋 Signature: {sig}") + print("📋 Initial Instructions:") + print("-" * 40) + print(sig.instructions if sig.instructions else "None") + print("-" * 40) + else: + print("📋 Signature: N/A") + + print(f"📋 Train set size: {len(self.train_set)}") + print(f"📋 Val set size: {len(self.val_set)}") + print(f"📋 Test set size: {len(self.test_set)}") + print(f"📋 GEPA auto mode: {gepa_args.get('auto', 'N/A')}") + print(f"📋 Reflection minibatch size: {gepa_args.get('reflection_minibatch_size', 3)}") + print("=" * 80 + "\n") + + # Enable verbose logging from DSPy/GEPA + import logging + + 
logging.getLogger("dspy.teleprompt.gepa.gepa").setLevel(logging.INFO) + optimizer = GEPA( metric=self.metric, **gepa_args, @@ -116,22 +330,247 @@ def train( valset=self.val_set, ) + print("\n" + "=" * 80) + print("GEPA TRAINING COMPLETE") + print("=" * 80) + + # Print detailed results if track_stats was enabled + if hasattr(optimized_program, "detailed_results"): + results = optimized_program.detailed_results + print("\n📊 OPTIMIZATION STATS:") + print(f" Total metric calls: {results.total_metric_calls}") + print(f" Full val evals: {results.num_full_val_evals}") + print(f" Best candidate index: {results.best_idx}") + print(f" Best val score: {results.val_aggregate_scores[results.best_idx]:.3f}") + + print("\n📈 ALL CANDIDATE SCORES:") + for i, score in enumerate(results.val_aggregate_scores): + marker = " 🏆" if i == results.best_idx else "" + print(f" Candidate {i}: {score:.3f}{marker}") + + optimized_instructions = self.get_optimized_system_prompt(optimized_program) + print("\n🎯 OPTIMIZED SYSTEM PROMPT:") + print("-" * 60) + print(optimized_instructions) + print("-" * 60) + + # Compare with initial + print("\n📝 COMPARISON:") + print(f" Initial prompt length: {len(self._initial_system_prompt or '')} chars") + print(f" Optimized prompt length: {len(optimized_instructions)} chars") + if self._initial_system_prompt != optimized_instructions: + print(" ✅ Prompt was CHANGED by GEPA") + else: + print(" ⚠️ Prompt was NOT changed (model may already be optimal or no failures to learn from)") + + print("=" * 80 + "\n") + return optimized_program - def evaluate(self, optimized_program: Module) -> list[EvaluationRow]: - # convert back to EP + def evaluate( + self, + optimized_program: Module, + num_threads: int = 32, + display_table: bool = True, + display_progress: bool = True, + ) -> dspy.evaluate.EvaluationResult: + """ + Evaluate the optimized program on the test set using DSPy's Evaluate. - # and then just run our evaluation_test function on the optimized program. 
+ Args: + optimized_program: The GEPA-optimized program + num_threads: Number of parallel threads for evaluation + display_table: Whether to display results table + display_progress: Whether to show progress bar + + Returns: + DSPy EvaluationResult with score and per-example results + """ + evaluator = dspy.Evaluate( + devset=self.test_set, + metric=self.metric, + num_threads=num_threads, + display_table=display_table, + display_progress=display_progress, + ) + + return evaluator(optimized_program) - # OR we can evaluate using dspy.Evaluate + def evaluate_baseline( + self, + num_threads: int = 32, + display_table: bool = True, + display_progress: bool = True, + ) -> dspy.evaluate.EvaluationResult: + """ + Evaluate the unoptimized baseline program on the test set. + + Useful for comparing before/after GEPA optimization. + """ + return self.evaluate( + self.program, + num_threads=num_threads, + display_table=display_table, + display_progress=display_progress, + ) + + def _inject_system_prompt(self, rows: List[EvaluationRow], new_system_prompt: str) -> List[EvaluationRow]: + """ + Create copies of rows with the system prompt replaced. + """ + modified_rows = [] + for row in rows: + new_row = row.model_copy(deep=True) + new_messages = [] + system_found = False + for msg in new_row.messages: + if msg.role == "system" and not system_found: + # Replace the first system message + new_messages.append(Message(role="system", content=new_system_prompt)) + system_found = True + else: + new_messages.append(msg) + # If no system message found, prepend one + if not system_found: + new_messages.insert(0, Message(role="system", content=new_system_prompt)) + new_row.messages = new_messages + modified_rows.append(new_row) + return modified_rows + + async def evaluate_with_ep( + self, + optimized_program: Module, + *, + use_test_set: bool = True, + max_concurrent_rollouts: int = 8, + ) -> Dict[str, Any]: + """ + Run final evaluation through the normal EP infrastructure. 
+ + This uses the same LLM proxy (EP_LLM_API_BASE) and tracing as a normal + @evaluation_test job. + + Args: + optimized_program: The GEPA-optimized program + use_test_set: If True, evaluate on test set. If False, use full dataset. + max_concurrent_rollouts: Maximum concurrent LLM calls + + Returns: + Dict with evaluation results: + - 'rows': List of evaluated EvaluationRow objects + - 'score': Aggregate score + - 'optimized_prompt': The prompt used for evaluation + """ + from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor + from eval_protocol.pytest.execution import execute_pytest + from eval_protocol.logging import default_logger - # evaluate = dspy.Evaluate( - # devset=self.test_set, - # metric=self.metric, - # num_threads=32, - # display_table=True, - # display_progress=True - # ) + # Get optimized system prompt + optimized_prompt = self.get_optimized_system_prompt(optimized_program) - # return evaluate(self.optimized_program) - ... + print("\n" + "=" * 80) + print("RUNNING EP EVALUATION (with LLM proxy & tracing)") + print("=" * 80) + print(f"📋 Using optimized prompt ({len(optimized_prompt)} chars)") + + # Get rows to evaluate + if use_test_set: + # Reconstruct test rows from test_set examples + _, _, test_rows = train_val_test_split( + self._rows, + train_ratio=0.5, # Match the ratio used in training + val_ratio=0.3, + seed=42, + ) + rows_to_eval = test_rows + print(f"📊 Evaluating on TEST SET: {len(rows_to_eval)} rows") + else: + rows_to_eval = self._rows + print(f"📊 Evaluating on FULL DATASET: {len(rows_to_eval)} rows") + + # Inject optimized system prompt into rows + modified_rows = self._inject_system_prompt(rows_to_eval, optimized_prompt) + + # Set up rollout processor config + completion_params = self.ep_params.completion_params + if isinstance(completion_params, list): + completion_params = completion_params[0] if completion_params else {} + completion_params = completion_params or {} + + # Create semaphore 
for concurrency control + semaphore = asyncio.Semaphore(max_concurrent_rollouts) + + config = RolloutProcessorConfig( + completion_params=completion_params, + mcp_config_path="", + server_script_path=None, + steps=30, + logger=default_logger, + semaphore=semaphore, + kwargs={}, + exception_handler_config=None, + ) + + # Run rollouts through EP infrastructure (uses EP_LLM_API_BASE) + rollout_processor = SingleTurnRolloutProcessor() + rollout_processor.setup() + + print("🚀 Running rollouts through EP infrastructure...") + print(f" Model: {completion_params.get('model', 'N/A')}") + + try: + # Execute rollouts + tasks = rollout_processor(modified_rows, config) + rolled_out_rows = await asyncio.gather(*tasks) + + print(f"✅ Rollouts complete: {len(rolled_out_rows)} rows") + + # Run evaluation function on each row + evaluated_rows = [] + scores = [] + + for row in rolled_out_rows: + # Call the original test function for evaluation + evaluated_row = await execute_pytest( + self.test_fn, + processed_row=row, # pyright: ignore[reportArgumentType] + ) + evaluated_rows.append(evaluated_row) + + # Extract score - evaluated_row is EvaluationRow from execute_pytest + if hasattr(evaluated_row, "evaluation_result") and evaluated_row.evaluation_result: # pyright: ignore[reportAttributeAccessIssue] + scores.append(evaluated_row.evaluation_result.score) # pyright: ignore[reportAttributeAccessIssue] + + # Calculate aggregate score + avg_score = sum(scores) / len(scores) if scores else 0.0 + + print("\n📊 EVALUATION RESULTS:") + print(f" Total rows: {len(evaluated_rows)}") + print(f" Aggregate score: {avg_score:.3f}") + print(f" Passing: {sum(1 for s in scores if s >= 0.5)}/{len(scores)}") + print("=" * 80 + "\n") + + return { + "rows": evaluated_rows, + "score": avg_score, + "scores": scores, + "optimized_prompt": optimized_prompt, + } + + finally: + rollout_processor.cleanup() + + def run_ep_evaluation( + self, + optimized_program: Module, + **kwargs, + ) -> Dict[str, Any]: + """ + 
Synchronous wrapper for evaluate_with_ep. + + Example: + trainer = GEPATrainer(test_fn) + optimized = trainer.train() + results = trainer.run_ep_evaluation(optimized) + """ + return asyncio.run(self.evaluate_with_ep(optimized_program, **kwargs)) diff --git a/eval_protocol/training/gepa_utils.py b/eval_protocol/training/gepa_utils.py index ec50245c..7a3b60c3 100644 --- a/eval_protocol/training/gepa_utils.py +++ b/eval_protocol/training/gepa_utils.py @@ -1,5 +1,5 @@ import os -from typing import Optional +from typing import Any, Optional, Tuple import dspy from dspy.clients.lm import LM @@ -8,36 +8,88 @@ from dspy.teleprompt.gepa.gepa import GEPAFeedbackMetric from eval_protocol.pytest.types import TestFunction -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, EPParameters, Message +# ============================================================================= +# Reflection LM configurations for GEPA +# ============================================================================= + +# Reflection LM configs use LiteLLM format: "provider/model_name" +# API keys should be set via environment variables: +# - OPENAI_API_KEY for OpenAI models +# - FIREWORKS_API_KEY for Fireworks models +# - ANTHROPIC_API_KEY for Anthropic models + REFLECTION_LM_CONFIGS = { + # OpenAI models "gpt-5": { - "model": "gpt-5", + "model": "openai/gpt-5", "temperature": 1.0, "max_tokens": 32000, - "api_key": os.getenv("OPENAI_API_KEY"), - "base_url": "https://api.openai.com/v1", }, - "kimi-k2-instruct-0905": { - "model": "accounts/fireworks/models/kimi-k2-instruct-0905", - "temperature": 0.6, # Kimi recommended temperature + "gpt-4o": { + "model": "openai/gpt-4o", + "temperature": 1.0, + "max_tokens": 16000, + }, + # Anthropic models + "claude-sonnet": { + "model": "anthropic/claude-sonnet-4-20250514", + "temperature": 1.0, + "max_tokens": 16000, + }, + # Fireworks models + "kimi-k2": { + "model": 
"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+        "temperature": 0.6,
         "max_tokens": 131000,
-        "api_key": os.getenv("FIREWORKS_API_KEY"),
-        "base_url": "https://api.fireworks.ai/inference/v1",
+    },
+    "llama-4-maverick": {
+        "model": "fireworks_ai/accounts/fireworks/models/llama4-maverick-instruct-basic",
+        "temperature": 1.0,
+        "max_tokens": 65536,
+    },
+    "deepseek-r1": {
+        "model": "fireworks_ai/accounts/fireworks/models/deepseek-r1",
+        "temperature": 1.0,
+        "max_tokens": 65536,
+    },
+    "qwen3-235b": {
+        "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b",
+        "temperature": 1.0,
+        "max_tokens": 65536,
+    },
 }


 def build_reflection_lm(reflection_lm_name: str) -> LM:
-    reflection_lm_config = REFLECTION_LM_CONFIGS[reflection_lm_name]
-    return dspy.LM(
-        model=reflection_lm_config["model"],
-        temperature=reflection_lm_config["temperature"],
-        max_tokens=reflection_lm_config["max_tokens"],
-        api_key=reflection_lm_config["api_key"],
-        base_url=reflection_lm_config["base_url"],
-    )
+    """
+    Build a DSPy LM for GEPA's reflection step.
+
+    Args:
+        reflection_lm_name: One of the predefined configs ("gpt-5", "gpt-4o",
+                            "claude-sonnet", "kimi-k2", "llama-4-maverick", ...)
+                            OR a raw LiteLLM model string (e.g., "openai/gpt-4o")
+
+    Returns:
+        A dspy.LM configured for reflection.
+ + Note: API keys must be set via environment variables: + - OPENAI_API_KEY for OpenAI models + - FIREWORKS_API_KEY for Fireworks models + - ANTHROPIC_API_KEY for Anthropic models + """ + if reflection_lm_name in REFLECTION_LM_CONFIGS: + config = REFLECTION_LM_CONFIGS[reflection_lm_name] + return dspy.LM( + model=config["model"], + temperature=config.get("temperature"), + max_tokens=config.get("max_tokens"), + ) + else: + # Assume it's a raw LiteLLM model string + return dspy.LM(model=reflection_lm_name) def gold_and_pred_to_row(gold: Example, pred: Prediction) -> EvaluationRow: @@ -47,9 +99,14 @@ def gold_and_pred_to_row(gold: Example, pred: Prediction) -> EvaluationRow: Assumptions (aligned with common DSPy usage): - `gold.answer` holds the ground-truth answer. - `pred.answer` holds the model's final answer text. + + Note: ground_truth is preserved in its original type (list, dict, str, etc.) + to support structured comparisons like SQL result matching. """ gt = gold.get("answer", None) - ground_truth_str: Optional[str] = str(gt) if gt is not None else None + # Preserve original type - don't convert to string! + # This is important for SQL evaluators that expect list[dict] results + ground_truth = gt content = pred.get("answer", "") @@ -57,7 +114,7 @@ def gold_and_pred_to_row(gold: Example, pred: Prediction) -> EvaluationRow: messages=[ Message(role="assistant", content=str(content)) ], # TODO: for some evals, you might need system / user message too. - ground_truth=ground_truth_str, + ground_truth=ground_truth, ) @@ -88,7 +145,17 @@ def ep_test_to_gepa_metric( - Constructs an EvaluationRow from (gold, pred) using a simple heuristic. - Applies the EP test_fn to populate `row.evaluation_result`. - Returns a dspy.Prediction(score, feedback) derived from that result. + + Note: The @evaluation_test decorator wraps functions as async, so we need to + handle both sync and async test functions. 
""" + import asyncio + import inspect + + # Counter for debugging + call_count = [0] + DEBUG_METRIC = True # Set to False to disable metric debug output + DEBUG_VERBOSE = True # Set to True to print ALL calls (can be very verbose!) def metric( gold: Example, @@ -97,11 +164,334 @@ def metric( pred_name: Optional[str] = None, pred_trace: Optional[DSPyTrace] = None, ) -> ScoreWithFeedback: + call_count[0] += 1 + + should_print = DEBUG_METRIC and (DEBUG_VERBOSE or call_count[0] <= 3) + + if should_print: + print(f"\n🔍 METRIC CALL #{call_count[0]}") + print("-" * 40) + print(f" Gold (expected): {gold.get('answer', 'N/A')}") + print(f" Pred (model): {str(pred.get('answer', 'N/A'))[:200]}") + if hasattr(pred, "reasoning") and pred.reasoning: + print(f" Reasoning: {str(pred.reasoning)[:300]}...") + row = gold_and_pred_to_row(gold, pred) - evaluated_row: EvaluationRow = test_fn(row) # pyright: ignore + # Call the test function - handle both sync and async + result = test_fn(row) # pyright: ignore + + # If it's a coroutine, run it synchronously + if inspect.iscoroutine(result): + try: + loop = asyncio.get_running_loop() + except RuntimeError: + loop = None + + if loop is not None: + # Already in an async context - create a new loop in a thread + import concurrent.futures + + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit(asyncio.run, result) + evaluated_row: EvaluationRow = future.result() + else: + # No running loop - safe to use asyncio.run + evaluated_row = asyncio.run(result) + else: + evaluated_row = result # type: ignore[reportAssignmentType] + # TODO: this is problematic. 
for groupwise, we will have to extend this to handle list[EvaluationRow] - return row_to_prediction(evaluated_row) + score_result = row_to_prediction(evaluated_row) + + if should_print: + print(f" Score: {score_result.score}") + print(f" Feedback: {str(score_result.feedback)[:200]}") + print("-" * 40) + + return score_result return metric + + +# ============================================================================= +# DSPy Program Creation (maps SingleTurnRolloutProcessor → DSPy Module) +# ============================================================================= + +from typing import Callable, Type +from enum import Enum + + +class DSPyModuleType(Enum): + """Available DSPy module types for single-turn rollouts.""" + + PREDICT = "predict" # Simple input → output + CHAIN_OF_THOUGHT = "chain_of_thought" # Adds reasoning before output (good for math) + PROGRAM_OF_THOUGHT = "program_of_thought" # Generates code to solve problems + + +# Type alias for custom module factory +DSPyModuleFactory = Callable[[dspy.Signature], dspy.Module] + + +def create_signature( + input_field: str = "problem", + output_field: str = "answer", + instructions: str | None = None, + input_desc: str | None = None, + output_desc: str | None = None, +) -> dspy.Signature: + """ + Create a DSPy Signature for single-turn tasks. + + Args: + input_field: Name of the input field (default: "problem") + output_field: Name of the output field (default: "answer") + instructions: System prompt / instructions (what GEPA optimizes!) + input_desc: Description for the input field + output_desc: Description for the output field + + Returns: + A dspy.Signature configured for the task. 
+ """ + # Build signature string + signature_str = f"{input_field} -> {output_field}" + + # Create base signature + if instructions: + sig = dspy.Signature(signature_str, instructions=instructions) + else: + sig = dspy.Signature(signature_str) + + # Add field descriptions if provided + if input_desc: + sig = sig.with_updated_fields(input_field, desc=input_desc) + if output_desc: + sig = sig.with_updated_fields(output_field, desc=output_desc) + + return sig + + +def create_single_turn_program( + system_prompt: str | None = None, + input_field: str = "problem", + output_field: str = "answer", + module_type: DSPyModuleType | str = DSPyModuleType.CHAIN_OF_THOUGHT, + input_desc: str | None = None, + output_desc: str | None = None, + module_factory: DSPyModuleFactory | None = None, +) -> dspy.Module: + """ + Create a DSPy program that mirrors SingleTurnRolloutProcessor. + + This is the general mapping: + - SingleTurnRolloutProcessor: system message + user message → LLM → assistant response + - DSPy Module: instructions + input field → LLM → output field + + GEPA optimizes the `instructions` (system prompt equivalent)! + + Args: + system_prompt: The system prompt (becomes signature.instructions). + input_field: Name of the input field (default: "problem") + output_field: Name of the output field (default: "answer") + module_type: Which DSPy module to use: + - PREDICT: Simple input → output + - CHAIN_OF_THOUGHT: Adds reasoning before output (default, good for complex tasks) + - PROGRAM_OF_THOUGHT: Generates code to solve problems + input_desc: Optional description for the input field + output_desc: Optional description for the output field + module_factory: Custom factory function to create the module. + If provided, overrides module_type. + Signature: (dspy.Signature) -> dspy.Module + + Returns: + A DSPy module ready for GEPA optimization. 
+ + Examples: + # Default: ChainOfThought for math + program = create_single_turn_program(system_prompt="Solve step by step") + + # Simple classification + program = create_single_turn_program( + input_field="text", + output_field="label", + module_type=DSPyModuleType.PREDICT + ) + + # Custom module + program = create_single_turn_program( + system_prompt="...", + module_factory=lambda sig: MyCustomModule(sig) + ) + """ + # Create the signature + sig = create_signature( + input_field=input_field, + output_field=output_field, + instructions=system_prompt, + input_desc=input_desc, + output_desc=output_desc, + ) + + # Use custom factory if provided + if module_factory is not None: + return module_factory(sig) + + # Convert string to enum if needed + if isinstance(module_type, str): + module_type = DSPyModuleType(module_type) + + # Create the appropriate module type + if module_type == DSPyModuleType.PREDICT: + return dspy.Predict(sig) + elif module_type == DSPyModuleType.CHAIN_OF_THOUGHT: + return dspy.ChainOfThought(sig) + elif module_type == DSPyModuleType.PROGRAM_OF_THOUGHT: + return dspy.ProgramOfThought(sig) + else: + raise ValueError(f"Unknown module type: {module_type}") + + +def configure_dspy_lm(ep_params: EPParameters) -> None: + """ + Configure DSPy to use the same LLM as the EP evaluation. + + Extracts model info from ep_params.completion_params and configures dspy. + + DSPy uses LiteLLM under the hood, so: + - Model format: "provider/model_name" (e.g., "openai/gpt-4o", "fireworks_ai/...") + - API keys: Set via environment variables (OPENAI_API_KEY, FIREWORKS_API_KEY, etc.) 
+ """ + raw_params = ep_params.completion_params + + # Handle completion_params being a list (for sweeps) - use the first one + if isinstance(raw_params, list): + completion_params = raw_params[0] if raw_params else {} + else: + completion_params = raw_params or {} + + # Extract model name (expected to already be in LiteLLM format) + model = completion_params.get("model", "openai/gpt-4") + + # Extract optional parameters + temperature = completion_params.get("temperature") # None = use provider default + max_tokens = completion_params.get("max_tokens") # None = use provider default + + # Build kwargs - only include non-None values + lm_kwargs: dict[str, Any] = {"model": model} + if temperature is not None: + lm_kwargs["temperature"] = temperature + if max_tokens is not None: + lm_kwargs["max_tokens"] = max_tokens + + # Pass through any extra kwargs from completion_params that DSPy/LiteLLM supports + passthrough_keys = ["num_retries", "cache"] + for key in passthrough_keys: + if key in completion_params: + lm_kwargs[key] = completion_params[key] + + lm = dspy.LM(**lm_kwargs) + dspy.configure(lm=lm) + + +# ============================================================================= +# Dataset Conversion (EvaluationRow → DSPy Example) +# ============================================================================= + + +def extract_system_prompt_from_rows(rows: list[EvaluationRow]) -> str | None: + """ + Extract the system prompt from a list of EvaluationRows. + + Assumes all rows have the same system prompt (common in benchmarks). + Returns the first system message content found, or None. 
+ """ + for row in rows: + system_msg = row.get_system_message() + if system_msg and system_msg.content: + content = system_msg.content + return str(content) if content else None + return None + + +def extract_user_content(row: EvaluationRow) -> str: + """Extract the user message content from an EvaluationRow.""" + user_msg = row.get_first_user_message() + if user_msg and user_msg.content: + return str(user_msg.content) + return "" + + +def evaluation_row_to_dspy_example( + row: EvaluationRow, + input_field: str = "problem", + output_field: str = "answer", +) -> Example: + """ + Convert an EvaluationRow to a DSPy Example. + + Maps: + - User message content → input_field (e.g., "problem") + - ground_truth → output_field (e.g., "answer") + + Note: ground_truth is preserved in its original type to support + structured comparisons (e.g., SQL result matching with list[dict]). + """ + # Extract user message as input + input_content = extract_user_content(row) + + # Ground truth is the expected output - preserve original type! + # Don't convert to string - this breaks SQL evaluators that expect list[dict] + output_content = row.ground_truth if row.ground_truth is not None else "" + + return dspy.Example( + **{ + input_field: input_content, + output_field: output_content, + } + ).with_inputs(input_field) + + +def evaluation_rows_to_dspy_examples( + rows: list[EvaluationRow], + input_field: str = "problem", + output_field: str = "answer", +) -> list[Example]: + """Convert a list of EvaluationRows to DSPy Examples.""" + return [evaluation_row_to_dspy_example(row, input_field, output_field) for row in rows] + + +def train_val_test_split( + rows: list[EvaluationRow], + train_ratio: float = 0.8, + val_ratio: float = 0.1, + seed: int = 42, +) -> Tuple[list[EvaluationRow], list[EvaluationRow], list[EvaluationRow]]: + """ + Split EvaluationRows into train/val/test sets. 
+ + Args: + rows: List of EvaluationRow objects + train_ratio: Proportion for training (default 0.8) + val_ratio: Proportion for validation (default 0.1) + seed: Random seed for reproducibility + + Returns: + Tuple of (train_rows, val_rows, test_rows) + """ + import random + + # Copy and shuffle + shuffled = list(rows) + random.Random(seed).shuffle(shuffled) + + n = len(shuffled) + train_end = int(n * train_ratio) + val_end = int(n * (train_ratio + val_ratio)) + + train_rows = shuffled[:train_end] + val_rows = shuffled[train_end:val_end] + test_rows = shuffled[val_end:] + + return train_rows, val_rows, test_rows From 2d787bfc69931bfa82a3d40d8089a909aa69748a Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Tue, 9 Dec 2025 11:20:25 -0800 Subject: [PATCH 09/15] gepa work --- eval_protocol/training/gepa_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval_protocol/training/gepa_utils.py b/eval_protocol/training/gepa_utils.py index 7a3b60c3..2c75cbb9 100644 --- a/eval_protocol/training/gepa_utils.py +++ b/eval_protocol/training/gepa_utils.py @@ -199,7 +199,7 @@ def metric( # No running loop - safe to use asyncio.run evaluated_row = asyncio.run(result) else: - evaluated_row = result # type: ignore[reportAssignmentType] + evaluated_row = result # pyright: ignore[reportAssignmentType] # TODO: this is problematic. 
for groupwise, we will have to extend this to handle list[EvaluationRow] From 8a2093b119377bf3de328078d5ffb6380f469c9b Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Tue, 9 Dec 2025 13:19:50 -0800 Subject: [PATCH 10/15] updates --- eval_protocol/training/gepa_trainer.py | 30 +++++++++++++++++++++++ eval_protocol/training/gepa_utils.py | 34 +++++++++++++++++++++++--- 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/eval_protocol/training/gepa_trainer.py b/eval_protocol/training/gepa_trainer.py index 08541062..869ed050 100644 --- a/eval_protocol/training/gepa_trainer.py +++ b/eval_protocol/training/gepa_trainer.py @@ -153,6 +153,22 @@ def __init__( module_factory=module_factory, ) + # Debug: Verify program structure + print("\n🔍 DEBUG [GEPATrainer] PROGRAM STRUCTURE:") + print(f" Program type: {type(self.program).__name__}") + print(f" Has .predict: {hasattr(self.program, 'predict')}") + if hasattr(self.program, "predict"): + print(f" predict type: {type(self.program.predict).__name__}") + print(f" predict.signature: {self.program.predict.signature}") + print( + f" predict.signature.instructions (first 300 chars): {(self.program.predict.signature.instructions or '')[:300]}..." + ) + print(f" Named predictors: {[name for name, _ in self.program.named_predictors()]}") + for name, pred in self.program.named_predictors(): + print( + f" - '{name}': {pred.signature.instructions[:100] if pred.signature.instructions else 'None'}..." 
+ ) + # Convert EP rows to DSPy Examples self.train_set: List[Example] = evaluation_rows_to_dspy_examples(train_rows, input_field, output_field) self.val_set: List[Example] = evaluation_rows_to_dspy_examples(val_rows, input_field, output_field) @@ -348,6 +364,20 @@ def train( marker = " 🏆" if i == results.best_idx else "" print(f" Candidate {i}: {score:.3f}{marker}") + # Show all candidate instructions + print("\n📝 ALL CANDIDATE INSTRUCTIONS:") + if hasattr(results, "candidates") and results.candidates: + for i, cand_prog in enumerate(results.candidates): + marker = " 🏆 BEST" if i == results.best_idx else "" + print(f"\n --- Candidate {i}{marker} (score: {results.val_aggregate_scores[i]:.3f}) ---") + # Get instructions from the candidate program + for name, pred in cand_prog.named_predictors(): + instr = pred.signature.instructions or "" + print(f" Predictor '{name}' instructions (first 500 chars):") + print(f" {instr[:500]}...") + if len(instr) > 500: + print(f" ... ({len(instr)} total chars)") + optimized_instructions = self.get_optimized_system_prompt(optimized_program) print("\n🎯 OPTIMIZED SYSTEM PROMPT:") print("-" * 60) diff --git a/eval_protocol/training/gepa_utils.py b/eval_protocol/training/gepa_utils.py index 2c75cbb9..a4d66d76 100644 --- a/eval_protocol/training/gepa_utils.py +++ b/eval_protocol/training/gepa_utils.py @@ -110,6 +110,14 @@ def gold_and_pred_to_row(gold: Example, pred: Prediction) -> EvaluationRow: content = pred.get("answer", "") + # Debug: print conversion details (only first few) + import os + + if os.environ.get("EP_DEBUG_GEPA"): + print("\n [gold_and_pred_to_row] Converting:") + print(f" gold.answer type: {type(gt)}, value preview: {str(gt)[:100]}...") + print(f" pred.answer type: {type(content)}, value preview: {str(content)[:100]}...") + return EvaluationRow( messages=[ Message(role="assistant", content=str(content)) @@ -325,6 +333,14 @@ def create_single_turn_program( module_factory=lambda sig: MyCustomModule(sig) ) """ + 
print("\n" + "⚙️" * 20) + print("DEBUG [create_single_turn_program] CREATING DSPY MODULE") + print("⚙️" * 20) + print(f" input_field: '{input_field}'") + print(f" output_field: '{output_field}'") + print(f" module_type: {module_type}") + print(f" system_prompt (first 200 chars): {(system_prompt or '')[:200]}...") + # Create the signature sig = create_signature( input_field=input_field, @@ -334,8 +350,12 @@ def create_single_turn_program( output_desc=output_desc, ) + print(f"\n Created signature: {sig}") + print(f" Signature instructions (first 200 chars): {(sig.instructions or '')[:200]}...") + # Use custom factory if provided if module_factory is not None: + print(" Using custom module factory") return module_factory(sig) # Convert string to enum if needed @@ -344,14 +364,22 @@ def create_single_turn_program( # Create the appropriate module type if module_type == DSPyModuleType.PREDICT: - return dspy.Predict(sig) + program = dspy.Predict(sig) elif module_type == DSPyModuleType.CHAIN_OF_THOUGHT: - return dspy.ChainOfThought(sig) + program = dspy.ChainOfThought(sig) elif module_type == DSPyModuleType.PROGRAM_OF_THOUGHT: - return dspy.ProgramOfThought(sig) + program = dspy.ProgramOfThought(sig) else: raise ValueError(f"Unknown module type: {module_type}") + print(f"\n Created module: {type(program).__name__}") + print(f" Named predictors: {[name for name, _ in program.named_predictors()]}") + for name, pred in program.named_predictors(): + print(f" '{name}' signature.instructions (first 200 chars): {(pred.signature.instructions or '')[:200]}...") + print("⚙️" * 20 + "\n") + + return program + def configure_dspy_lm(ep_params: EPParameters) -> None: """ From ce61cadde855e7761df3d0856073275158d39c3d Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Thu, 11 Dec 2025 14:03:53 -0800 Subject: [PATCH 11/15] cleaning up 1 --- eval_protocol/models.py | 2 + eval_protocol/trainable_gepa_design.md | 236 ------------------------- eval_protocol/training/gepa_trainer.py | 155 
+--------------- eval_protocol/training/gepa_utils.py | 50 ------ eval_protocol/training/trainer.py | 9 +- 5 files changed, 17 insertions(+), 435 deletions(-) delete mode 100644 eval_protocol/trainable_gepa_design.md diff --git a/eval_protocol/models.py b/eval_protocol/models.py index 911c13b9..b869e140 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -1195,6 +1195,8 @@ class MCPMultiClientConfiguration(BaseModel): class EPParameters(BaseModel): """The parameters of an `@evaluation_test`. Used for trainable integrations.""" + model_config = ConfigDict(arbitrary_types_allowed=True) + completion_params: Any = None input_messages: Any = None input_dataset: Any = None diff --git a/eval_protocol/trainable_gepa_design.md b/eval_protocol/trainable_gepa_design.md deleted file mode 100644 index b66fb7e0..00000000 --- a/eval_protocol/trainable_gepa_design.md +++ /dev/null @@ -1,236 +0,0 @@ -## GEPA-training Interface Design for Eval Protocol - -### Goals - -- **Tunable prompts for existing benchmarks**: Allow benchmarks like `test_aime25.py` and `test_gpqa.py` to expose parts of their configuration (e.g., system prompts) as training parameters, without changing their core evaluation logic. -- **Tight coupling with `@evaluation_test`**: Reuse the same rollout configuration, datasets, and metrics that are already defined via `evaluation_test`, instead of duplicating that configuration in a separate training API. -- **GEPA as one optimizer backend**: Provide a clean integration point for GEPA (and potentially other optimizers later) without requiring benchmarks to depend on DSPy or GEPA directly. - -### High-Level Architecture - -- **Benchmark file (e.g., `test_aime25.py`)** - - Continues to define: - - Dataset adapter (`aime2025_dataset_adapter`). - - `@evaluation_test(...)`-decorated function (e.g., `test_aime25_pointwise`) that: - - Uses `SingleTurnRolloutProcessor` (or another processor). - - Computes per-row metrics and sets `row.evaluation_result`. 
- - Adds *optional* training wiring at the bottom, under `if __name__ == "__main__":`, that: - - Imports a training/core API from `eval_protocol.training`. - - Specifies what is tunable (e.g., the system prompt) and how to adapt rows using a candidate. - - Invokes a train routine (GEPA-based or otherwise). - -- **Training core** - - Provides a single central abstraction: - - **`EPParameters`**: Encapsulates everything `evaluation_test` knows about the eval in a structured form: - - One field for every parameter that `evaluation_test` accepts (dataset sources, adapters, completion params, rollout processor, aggregation, thresholds, etc.), after parsing/env overrides. - - **Candidate representation**: Start with `dict[str, str]` (e.g., `{"system_prompt": "..."}`), anticipating future extensions (few-shot examples, tool docs, etc.). - - Includes helper utilities to: - - Build an `EPParameters` instance by introspecting an `@evaluation_test`-decorated function. - - Run a single candidate or a batch of candidates through the full rollout + evaluation pipeline, returning aggregate scores (and optionally per-row scores). - -- **GEPA adapter (e.g., `eval_protocol/training/gepa_adapter.py`)** - - Wraps the training core and GEPA’s API: - - Accepts: - - An `EPConfig`. - - A candidate space definition (for now, implicit via `dict[str, str]` keys). - - GEPA configuration (budget, reflection model, seed, component selection strategy, etc.). - - Provides: - - A GEPA-compatible metric interface that: - - Given a candidate, uses `EPConfig` (and benchmark-specific logic such as a custom `dataset_adapter`) to: - - Construct or adapt rows for that candidate. - - Run rollouts (reusing the same processors and params as the test). - - Compute scalar scores (e.g., mean exact-match over a batch). - - A training routine that returns: - - A `best_candidate: dict[str, str]`. - - Optional rich result object (e.g., mapping to `GEPAResult`, additional stats). 
- -### Relationship to `evaluation_test` and `__ep_params__` - -- Existing `evaluation_test` code will attach: - -```python -ep_params: dict[str, Any] = { - "rollout_processor": rollout_processor, - "server_script_path": server_script_path, - "mcp_config_path": mcp_config_path, - "rollout_processor_kwargs": rollout_processor_kwargs, - "mode": mode, -} -setattr(dual_mode_wrapper, "__ep_params__", ep_params) -``` - -- Design direction: - - **Use `__ep_params__` as the single source of truth**. - - **`__ep_params__` should contain all effective `evaluation_test` parameters**, including: - - Parsed `completion_params` (after env overrides). - - Dataset sources (`input_dataset`, `input_rows`, dataloaders, and `dataset_adapter`), after `parse_ep_*` transforms. - - `aggregation_method`, `num_runs`, `max_dataset_rows`, etc. - - Rollout and mode information (processor, kwargs, concurrency limits, mode). - - The training core can then **directly convert `__ep_params__` into an `EPParameters` instance** without maintaining a separate training-only config. - -- Training core will expose: - - A factory like: - - ```python - def build_ep_parameters_from_test( - test_fn: TestFunction, - ) -> EPParameters: - ... - ``` - - - This function: - - Reads `test_fn.__ep_params__`. - - Reconstructs how to: - - Load and preprocess the dataset. - - Configure the rollout processor (`RolloutProcessorConfig`). - - Run rollouts and then apply the row-level metric (by calling the decorated test function in a library mode). - -- Training code (e.g., `python test_aime25.py`) then becomes: - - Import the test function (e.g., `test_aime25_pointwise`). - - Build an `EPParameters` from it. - - Call into a GEPA-based trainer that uses the `EPParameters`. - -### TODO for derek to figure out: how to store the changing system prompts. - -- **Where tuned prompts live (storage format and location)**: - - GEPA already supports a `run_dir` for logging and checkpoints. 
- - We need to decide: - - Whether EP should: - - Treat `run_dir` as the canonical store and optionally add a small `best_candidate.json` there; or - - Provide an additional EP-level artifact format. - - For now, storage is left as an **explicit design TODO** and can be finalized once we have the core/adapter in place. - -### Work Split: Person A vs Person B - -#### Person A – training Core & `evaluation_test` Integration - -- **1. Extend `evaluation_test` metadata (no behavior change)** - - Populate a single `__ep_config__` dict on the decorated test function that includes: - - Dataset specification (paths / input_rows / dataloaders, `dataset_adapter`, `max_dataset_rows`, etc.) after `parse_ep_*`. - - Parsed `completion_params` (after env overrides like `parse_ep_completion_params_overwrite`). - - Rollout settings (`rollout_processor`, `rollout_processor_kwargs`, `mode`, `max_concurrent_rollouts`, `max_concurrent_evaluations`). - - Aggregation and threshold metadata. - - Ensure: - - Backwards compatibility for existing tests. - - Clear typing and docstrings to guide future use. - -- **2. Define core training abstractions in `eval_protocol/training/core.py`** - - Define: - - `EPConfig`: - - A field for every parameter `evaluation_test` accepts (dataset, adapters, completion params, rollout processor, aggregation, thresholds, etc.). - - Can be serialized/inspected for external tooling. - - Candidate type alias (initially `Candidate = dict[str, str]`). - - Implement: - - `build_ep_config_from_test(test_fn: TestFunction) -> EPConfig`. - - Reads `__ep_config__`. - - Reuses the same dataset and rollout logic as pytest, but in a library-friendly way (no pytest invocation). - - Helper(s) to: - - Run a single candidate over the dataset, possibly with: - - A subset of rows (train vs val split initially determined by the benchmark or EPConfig). - - A configurable aggregation method (mean score to start). - -- **3. 
Minimal tests and documentation for the core** - - Add unit/integration tests that: - - Use a tiny fake `@evaluation_test` function. - - Confirm `build_ep_config_from_test` produces a config that can: - - Load mock rows. - - Run a dummy rollout processor. - - Apply a simple metric to produce scores. - - Document (in this design file or a short README) how benchmarks should think about exposing tunable pieces (e.g., via custom dataset adapters or other wiring). - -#### Person B – GEPA Adapter & Benchmark Wiring - -- **4. Implement GEPA integration in `eval_protocol/training/gepa_adapter.py`** - - Define a small adapter API, e.g.: - -```python -class GEPATrainer: - def __init__(self, spec: trainingBenchmarkSpec, inject_fn: InjectFn, ...gepa_config...): - ... - - def train(self) -> tuple[Candidate, Any]: - """Run GEPA and return best candidate plus optional rich result.""" -``` - - - Inside, implement: - - Conversion from `(spec, inject_fn)` into a GEPA metric: - - For each candidate: - - Clone or map the base dataset rows, applying `inject_fn(candidate, row)`. - - Use the spec’s rollout runner + metric runner to compute per-example and aggregate scores. - - Return the aggregate score (and optional textual feedback) to GEPA. - - The call to `gepa.optimize(...)` with: - - `seed_candidate` constructed from the baseline configuration (e.g., default system prompt). - - Budget configuration (max metric calls / auto presets). - - Reflection config (reflection LM or other knobs) passed in via constructor. - - Mapping from `GEPAResult` (or equivalent) back into: - - `best_candidate: Candidate`. - - Optional rich result object (e.g., exposing Pareto-front stats). - -- **5. Wire a first benchmark: AIME 2025** - - In `eval_protocol/benchmarks/test_aime25.py`: - - Factor the row-scoring logic inside `test_aime25_pointwise` into a **reusable metric function** (pure function that sets `row.evaluation_result` given a rolled-out row). 
- - Decide how candidates should influence the evaluation: - - For example, by making the dataset adapter or message-construction logic candidate-aware (e.g., changing the system prompt). - - Add a `if __name__ == "__main__":` block that: - - Imports `test_aime25_pointwise` and builds an `EPConfig` via `build_ep_config_from_test`. - - Instantiates `GEPATrainer` with: - - The `EPConfig`. - - Initial GEPA config (budget, reflection model placeholder, seed). - - Calls `trainer.train()` and prints/logs the resulting `best_candidate` for now. - - Keep storage of tuned prompts as a TODO/extension point to be resolved later. - -- **6. Optional second benchmark: GPQA** - - Repeat step 5 for `test_gpqa.py`: - - Identify what’s tunable (system prompt, possibly chain-of-thought instructions). - - Extract metric logic into a reusable function. - - Add candidate-aware wiring (e.g., via dataset adapters) and an optional `__main__` entrypoint calling the same GEPA trainer. - - This will validate that: - - The abstractions generalize across tasks. - - No DSPy/GEPA-specific imports leak into benchmark files (other than a small, well-defined training API). - -### Coordination Notes - -- **Order of work** - - Person A should go first (or in parallel up to the point where `EPConfig` and `build_ep_config_from_test` are usable). - - Person B can stub against interfaces and adjust once Person A’s core is available. -- **Integration checkpoints** - - After Person A lands the core + tests: - - Person B wires AIME with a very simple “optimizer” (even random search) to smoke-test the path before hooking up real GEPA. - - After GEPA integration works for AIME: - - Decide on the canonical way to treat GEPA’s `run_dir` and/or additional artifacts for tuned prompts. - - Optionally add a small helper that knows how to “run evaluation once with best GEPA candidate” for CI workflows. 
- - -future: - -this is how gepa defines eval: - -def metric( - gold: Example, - pred: Prediction, - trace: Optional[DSPyTrace] = None, - pred_name: Optional[str] = None, - pred_trace: Optional[DSPyTrace] = None, -) -> float | ScoreWithFeedback: - """ - This function is called with the following arguments: - - gold: The gold example. - - pred: The predicted output. - - trace: Optional. The trace of the program's execution. - - pred_name: Optional. The name of the target predictor currently being optimized by GEPA, for which - the feedback is being requested. - - pred_trace: Optional. The trace of the target predictor's execution GEPA is seeking feedback for. - - Note the `pred_name` and `pred_trace` arguments. During optimization, GEPA will call the metric to obtain - feedback for individual predictors being optimized. GEPA provides the name of the predictor in `pred_name` - and the sub-trace (of the trace) corresponding to the predictor in `pred_trace`. - If available at the predictor level, the metric should return {'score': float, 'feedback': str} corresponding - to the predictor. - If not available at the predictor level, the metric can also return a text feedback at the program level - (using just the gold, pred and trace). - If no feedback is returned, GEPA will use a simple text feedback consisting of just the score: - f"This trajectory got a score of {score}." - """ - ... - -ideally generic way to turn evaluation_test into this. diff --git a/eval_protocol/training/gepa_trainer.py b/eval_protocol/training/gepa_trainer.py index 869ed050..1b670780 100644 --- a/eval_protocol/training/gepa_trainer.py +++ b/eval_protocol/training/gepa_trainer.py @@ -117,23 +117,6 @@ def __init__( # Extract the system prompt from the dataset (this is what GEPA will optimize!) 
self._initial_system_prompt = extract_system_prompt_from_rows(self._rows) - # Debug: Print initial setup info - print("\n" + "=" * 80) - print("GEPA TRAINER INITIALIZATION") - print("=" * 80) - print(f"\n📊 Dataset loaded: {len(self._rows)} total rows") - print(f" - Train: {len(train_rows)} rows") - print(f" - Val: {len(val_rows)} rows") - print(f" - Test: {len(test_rows)} rows") - print("\n📝 Initial System Prompt (what GEPA will optimize):") - print("-" * 40) - print( - self._initial_system_prompt[:500] + "..." - if self._initial_system_prompt and len(self._initial_system_prompt) > 500 - else self._initial_system_prompt - ) - print("-" * 40) - # Create or use provided DSPy program if program is not None: # Use the provided program directly @@ -153,37 +136,11 @@ def __init__( module_factory=module_factory, ) - # Debug: Verify program structure - print("\n🔍 DEBUG [GEPATrainer] PROGRAM STRUCTURE:") - print(f" Program type: {type(self.program).__name__}") - print(f" Has .predict: {hasattr(self.program, 'predict')}") - if hasattr(self.program, "predict"): - print(f" predict type: {type(self.program.predict).__name__}") - print(f" predict.signature: {self.program.predict.signature}") - print( - f" predict.signature.instructions (first 300 chars): {(self.program.predict.signature.instructions or '')[:300]}..." - ) - print(f" Named predictors: {[name for name, _ in self.program.named_predictors()]}") - for name, pred in self.program.named_predictors(): - print( - f" - '{name}': {pred.signature.instructions[:100] if pred.signature.instructions else 'None'}..." 
- ) - # Convert EP rows to DSPy Examples self.train_set: List[Example] = evaluation_rows_to_dspy_examples(train_rows, input_field, output_field) self.val_set: List[Example] = evaluation_rows_to_dspy_examples(val_rows, input_field, output_field) self.test_set: List[Example] = evaluation_rows_to_dspy_examples(test_rows, input_field, output_field) - # Debug: Print example info - print("\n📦 DSPy Examples created:") - print(f" Input field: '{input_field}', Output field: '{output_field}'") - if self.train_set: - ex = self.train_set[0] - print("\n Sample train example:") - print(f" - {input_field}: {str(getattr(ex, input_field, ''))[:200]}...") - print(f" - {output_field}: {str(getattr(ex, output_field, ''))}") - print("=" * 80 + "\n") - def _load_dataset(self) -> List[EvaluationRow]: """ Load the dataset from ep_params. @@ -241,7 +198,13 @@ def get_optimized_system_prompt(self, optimized_program: Module) -> str: This can be used with EP's rollout processor via system_prompt_override. """ # GEPA stores optimized instructions in the signature - return optimized_program.predict.signature.instructions + # Handle both PREDICT (has .signature directly) and ChainOfThought (has .predict.signature) + if hasattr(optimized_program, "signature"): + return optimized_program.signature.instructions # pyright: ignore[reportAttributeAccessIssue] + elif hasattr(optimized_program, "predict") and hasattr(optimized_program.predict, "signature"): # pyright: ignore[reportAttributeAccessIssue] + return optimized_program.predict.signature.instructions # pyright: ignore[reportAttributeAccessIssue] + else: + raise ValueError("Could not find signature.instructions on the optimized program") def train( self, @@ -302,39 +265,6 @@ def train( } gepa_args.update(gepa_kwargs or {}) - print("\n" + "=" * 80) - print("GEPA TRAINING STARTED") - print("=" * 80) - print(f"📋 Program type: {type(self.program).__name__}") - - # Get signature - ChainOfThought stores it in .predict.signature - sig = None - if 
hasattr(self.program, "signature"): - sig = self.program.signature - elif hasattr(self.program, "predict") and hasattr(self.program.predict, "signature"): - sig = self.program.predict.signature - - if sig: - print(f"📋 Signature: {sig}") - print("📋 Initial Instructions:") - print("-" * 40) - print(sig.instructions if sig.instructions else "None") - print("-" * 40) - else: - print("📋 Signature: N/A") - - print(f"📋 Train set size: {len(self.train_set)}") - print(f"📋 Val set size: {len(self.val_set)}") - print(f"📋 Test set size: {len(self.test_set)}") - print(f"📋 GEPA auto mode: {gepa_args.get('auto', 'N/A')}") - print(f"📋 Reflection minibatch size: {gepa_args.get('reflection_minibatch_size', 3)}") - print("=" * 80 + "\n") - - # Enable verbose logging from DSPy/GEPA - import logging - - logging.getLogger("dspy.teleprompt.gepa.gepa").setLevel(logging.INFO) - optimizer = GEPA( metric=self.metric, **gepa_args, @@ -346,55 +276,6 @@ def train( valset=self.val_set, ) - print("\n" + "=" * 80) - print("GEPA TRAINING COMPLETE") - print("=" * 80) - - # Print detailed results if track_stats was enabled - if hasattr(optimized_program, "detailed_results"): - results = optimized_program.detailed_results - print("\n📊 OPTIMIZATION STATS:") - print(f" Total metric calls: {results.total_metric_calls}") - print(f" Full val evals: {results.num_full_val_evals}") - print(f" Best candidate index: {results.best_idx}") - print(f" Best val score: {results.val_aggregate_scores[results.best_idx]:.3f}") - - print("\n📈 ALL CANDIDATE SCORES:") - for i, score in enumerate(results.val_aggregate_scores): - marker = " 🏆" if i == results.best_idx else "" - print(f" Candidate {i}: {score:.3f}{marker}") - - # Show all candidate instructions - print("\n📝 ALL CANDIDATE INSTRUCTIONS:") - if hasattr(results, "candidates") and results.candidates: - for i, cand_prog in enumerate(results.candidates): - marker = " 🏆 BEST" if i == results.best_idx else "" - print(f"\n --- Candidate {i}{marker} (score: 
{results.val_aggregate_scores[i]:.3f}) ---") - # Get instructions from the candidate program - for name, pred in cand_prog.named_predictors(): - instr = pred.signature.instructions or "" - print(f" Predictor '{name}' instructions (first 500 chars):") - print(f" {instr[:500]}...") - if len(instr) > 500: - print(f" ... ({len(instr)} total chars)") - - optimized_instructions = self.get_optimized_system_prompt(optimized_program) - print("\n🎯 OPTIMIZED SYSTEM PROMPT:") - print("-" * 60) - print(optimized_instructions) - print("-" * 60) - - # Compare with initial - print("\n📝 COMPARISON:") - print(f" Initial prompt length: {len(self._initial_system_prompt or '')} chars") - print(f" Optimized prompt length: {len(optimized_instructions)} chars") - if self._initial_system_prompt != optimized_instructions: - print(" ✅ Prompt was CHANGED by GEPA") - else: - print(" ⚠️ Prompt was NOT changed (model may already be optimal or no failures to learn from)") - - print("=" * 80 + "\n") - return optimized_program def evaluate( @@ -403,7 +284,7 @@ def evaluate( num_threads: int = 32, display_table: bool = True, display_progress: bool = True, - ) -> dspy.evaluate.EvaluationResult: + ) -> Any: # Returns dspy.evaluate.EvaluationResult """ Evaluate the optimized program on the test set using DSPy's Evaluate. @@ -431,7 +312,7 @@ def evaluate_baseline( num_threads: int = 32, display_table: bool = True, display_progress: bool = True, - ) -> dspy.evaluate.EvaluationResult: + ) -> Any: # Returns dspy.evaluate.EvaluationResult """ Evaluate the unoptimized baseline program on the test set. 
@@ -498,11 +379,6 @@ async def evaluate_with_ep( # Get optimized system prompt optimized_prompt = self.get_optimized_system_prompt(optimized_program) - print("\n" + "=" * 80) - print("RUNNING EP EVALUATION (with LLM proxy & tracing)") - print("=" * 80) - print(f"📋 Using optimized prompt ({len(optimized_prompt)} chars)") - # Get rows to evaluate if use_test_set: # Reconstruct test rows from test_set examples @@ -513,10 +389,8 @@ async def evaluate_with_ep( seed=42, ) rows_to_eval = test_rows - print(f"📊 Evaluating on TEST SET: {len(rows_to_eval)} rows") else: rows_to_eval = self._rows - print(f"📊 Evaluating on FULL DATASET: {len(rows_to_eval)} rows") # Inject optimized system prompt into rows modified_rows = self._inject_system_prompt(rows_to_eval, optimized_prompt) @@ -545,16 +419,11 @@ async def evaluate_with_ep( rollout_processor = SingleTurnRolloutProcessor() rollout_processor.setup() - print("🚀 Running rollouts through EP infrastructure...") - print(f" Model: {completion_params.get('model', 'N/A')}") - try: # Execute rollouts tasks = rollout_processor(modified_rows, config) rolled_out_rows = await asyncio.gather(*tasks) - print(f"✅ Rollouts complete: {len(rolled_out_rows)} rows") - # Run evaluation function on each row evaluated_rows = [] scores = [] @@ -574,12 +443,6 @@ async def evaluate_with_ep( # Calculate aggregate score avg_score = sum(scores) / len(scores) if scores else 0.0 - print("\n📊 EVALUATION RESULTS:") - print(f" Total rows: {len(evaluated_rows)}") - print(f" Aggregate score: {avg_score:.3f}") - print(f" Passing: {sum(1 for s in scores if s >= 0.5)}/{len(scores)}") - print("=" * 80 + "\n") - return { "rows": evaluated_rows, "score": avg_score, diff --git a/eval_protocol/training/gepa_utils.py b/eval_protocol/training/gepa_utils.py index a4d66d76..0e3df571 100644 --- a/eval_protocol/training/gepa_utils.py +++ b/eval_protocol/training/gepa_utils.py @@ -1,4 +1,3 @@ -import os from typing import Any, Optional, Tuple import dspy @@ -110,14 +109,6 @@ 
def gold_and_pred_to_row(gold: Example, pred: Prediction) -> EvaluationRow: content = pred.get("answer", "") - # Debug: print conversion details (only first few) - import os - - if os.environ.get("EP_DEBUG_GEPA"): - print("\n [gold_and_pred_to_row] Converting:") - print(f" gold.answer type: {type(gt)}, value preview: {str(gt)[:100]}...") - print(f" pred.answer type: {type(content)}, value preview: {str(content)[:100]}...") - return EvaluationRow( messages=[ Message(role="assistant", content=str(content)) @@ -160,11 +151,6 @@ def ep_test_to_gepa_metric( import asyncio import inspect - # Counter for debugging - call_count = [0] - DEBUG_METRIC = True # Set to False to disable metric debug output - DEBUG_VERBOSE = True # Set to True to print ALL calls (can be very verbose!) - def metric( gold: Example, pred: Prediction, @@ -172,18 +158,6 @@ def metric( pred_name: Optional[str] = None, pred_trace: Optional[DSPyTrace] = None, ) -> ScoreWithFeedback: - call_count[0] += 1 - - should_print = DEBUG_METRIC and (DEBUG_VERBOSE or call_count[0] <= 3) - - if should_print: - print(f"\n🔍 METRIC CALL #{call_count[0]}") - print("-" * 40) - print(f" Gold (expected): {gold.get('answer', 'N/A')}") - print(f" Pred (model): {str(pred.get('answer', 'N/A'))[:200]}") - if hasattr(pred, "reasoning") and pred.reasoning: - print(f" Reasoning: {str(pred.reasoning)[:300]}...") - row = gold_and_pred_to_row(gold, pred) # Call the test function - handle both sync and async @@ -212,12 +186,6 @@ def metric( # TODO: this is problematic. 
for groupwise, we will have to extend this to handle list[EvaluationRow] score_result = row_to_prediction(evaluated_row) - - if should_print: - print(f" Score: {score_result.score}") - print(f" Feedback: {str(score_result.feedback)[:200]}") - print("-" * 40) - return score_result return metric @@ -333,14 +301,6 @@ def create_single_turn_program( module_factory=lambda sig: MyCustomModule(sig) ) """ - print("\n" + "⚙️" * 20) - print("DEBUG [create_single_turn_program] CREATING DSPY MODULE") - print("⚙️" * 20) - print(f" input_field: '{input_field}'") - print(f" output_field: '{output_field}'") - print(f" module_type: {module_type}") - print(f" system_prompt (first 200 chars): {(system_prompt or '')[:200]}...") - # Create the signature sig = create_signature( input_field=input_field, @@ -350,12 +310,8 @@ def create_single_turn_program( output_desc=output_desc, ) - print(f"\n Created signature: {sig}") - print(f" Signature instructions (first 200 chars): {(sig.instructions or '')[:200]}...") - # Use custom factory if provided if module_factory is not None: - print(" Using custom module factory") return module_factory(sig) # Convert string to enum if needed @@ -372,12 +328,6 @@ def create_single_turn_program( else: raise ValueError(f"Unknown module type: {module_type}") - print(f"\n Created module: {type(program).__name__}") - print(f" Named predictors: {[name for name, _ in program.named_predictors()]}") - for name, pred in program.named_predictors(): - print(f" '{name}' signature.instructions (first 200 chars): {(pred.signature.instructions or '')[:200]}...") - print("⚙️" * 20 + "\n") - return program diff --git a/eval_protocol/training/trainer.py b/eval_protocol/training/trainer.py index 1008bb41..4bcb9bfc 100644 --- a/eval_protocol/training/trainer.py +++ b/eval_protocol/training/trainer.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from typing import Any from eval_protocol.pytest.types import TestFunction @@ -8,9 +9,11 @@ def __init__(self, test_fn: 
TestFunction): self.test_fn = test_fn @abstractmethod - def train(self, *args, **kwargs): ... + def train(self, *args: Any, **kwargs: Any) -> Any: + """Run training and return the optimized model/program.""" + ... @abstractmethod - def evaluate(self, *args, **kwargs): - # evaluation logic possibly can be shared since it's EP. TBD + def evaluate(self, *args: Any, **kwargs: Any) -> Any: + """Evaluate the optimized model/program.""" ... From 7ddfceb5ed826d9e50d0c2d7ffe2d031783f8411 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Fri, 12 Dec 2025 12:35:16 -0800 Subject: [PATCH 12/15] undo --- eval_protocol/benchmarks/test_aime25.py | 86 ++----------------------- 1 file changed, 4 insertions(+), 82 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 6994a0ca..6eb785a7 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -12,8 +12,6 @@ SingleTurnRolloutProcessor, ) from eval_protocol.pytest.evaluation_test import evaluation_test -from eval_protocol.training import GEPATrainer -from eval_protocol.training.gepa_utils import build_reflection_lm SYSTEM_PROMPT = ( "You are a helpful math assistant. Please reason step by step, and put your final answer within \\boxed{...}." @@ -63,44 +61,6 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]: return None -def _build_feedback_text( - *, - extracted_int: Optional[int], - gt_int: Optional[int], - is_valid: bool, - raw_model_answer: str, - ground_truth: Optional[str], -) -> str: - """ - Build a feedback string similar in spirit to the GEPA `metric_with_feedback`. - - Cases: - - Parse failure (model or gold): explain integer formatting and show correct answer. - - Correct: "Your answer is correct. The correct answer is '...'." - - Incorrect: "Your answer is incorrect. The correct answer is '...'." 
- """ - correct_answer_display = str(gt_int if gt_int is not None else (ground_truth or "")) - - if not is_valid: - # Could not parse either the model answer or the gold answer as an integer. - feedback_text = ( - "The final answer must be a valid integer and nothing else. " - f"You responded with '{raw_model_answer}', which couldn't be parsed as a python integer. " - "Please ensure your answer is a valid integer without any additional text or formatting." - ) - if correct_answer_display: - feedback_text += f" The correct answer is '{correct_answer_display}'." - return feedback_text - - if extracted_int == gt_int: - return f"Your answer is correct. The correct answer is '{correct_answer_display}'." - else: - return f"Your answer is incorrect. The correct answer is '{correct_answer_display}'." - - # TODO: our dataset does not contain written solutions, so we cannot provide feedback on the solution. maybe need to add it later. - # they're using https://huggingface.co/datasets/AI-MO/aimo-validation-aime - - def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: converted: List[EvaluationRow] = [] for r in rows: @@ -123,14 +83,15 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: completion_params=[ { "max_tokens": 131000, - "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1-terminus", + "extra_body": {"reasoning_effort": "low"}, + "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", } ], rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=0.8, num_runs=8, - max_dataset_rows=None, # Use full dataset + max_dataset_rows=2, max_concurrent_rollouts=4, mode="pointwise", ) @@ -163,49 +124,10 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow: ) } - feedback_text = _build_feedback_text( - extracted_int=extracted_int, - gt_int=gt_int, - is_valid=is_valid, - raw_model_answer=content_str, - ground_truth=str(row.ground_truth), - ) - 
row.evaluation_result = EvaluateResult( score=score, - reason=feedback_text, + reason=("Answer correct" if score == 1.0 else "Answer incorrect"), is_score_valid=is_valid, metrics=metrics, ) return row - - -if __name__ == "__main__": - import asyncio - - trainer = GEPATrainer( - test_aime25_pointwise, - train_ratio=0.5, # 50% for training (15 problems) - val_ratio=0.3, # 30% for validation (9 problems) - # test_ratio = 20% (6 problems) - calculated automatically - ) - - # Use same Fireworks model for both main and reflection - reflection_lm = build_reflection_lm("fireworks_ai/accounts/fireworks/models/deepseek-v3p1-terminus") - - optimized_program = trainer.train( - num_threads=4, # Reduced from 32 to avoid API timeouts - track_stats=True, - reflection_minibatch_size=5, # Reduced to limit concurrent requests - reflection_lm=reflection_lm, - ) - - # Option 1: Quick DSPy evaluation (doesn't use EP infrastructure) - print("\n=== DSPy Evaluation ===") - print(trainer.evaluate(optimized_program)) - - # Option 2: Full EP evaluation (uses LLM proxy, Fireworks tracing, etc.) 
- # This goes through the normal @evaluation_test pipeline - print("\n=== EP Evaluation (with tracing) ===") - results = trainer.run_ep_evaluation(optimized_program) - print(f"Final EP Score: {results['score']:.3f}") From c04acf8c4a0d00a04a119b542883157610a4cd46 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Mon, 15 Dec 2025 12:02:51 -0600 Subject: [PATCH 13/15] fixes --- .../pytest/integrations/openenv_trl_vllm.py | 11 ++++--- eval_protocol/training/gepa_trainer.py | 28 ++++++++--------- eval_protocol/training/gepa_utils.py | 30 ++++++++++++++----- 3 files changed, 43 insertions(+), 26 deletions(-) diff --git a/eval_protocol/pytest/integrations/openenv_trl_vllm.py b/eval_protocol/pytest/integrations/openenv_trl_vllm.py index 3f204680..9db3ce3d 100644 --- a/eval_protocol/pytest/integrations/openenv_trl_vllm.py +++ b/eval_protocol/pytest/integrations/openenv_trl_vllm.py @@ -121,10 +121,13 @@ def rollout_func(prompts: List[str], trainer) -> Dict[str, List]: eval_func = candidate_tests[0] ep_eval_func = eval_func # used later after rollouts complete - ep_params: Dict[str, Any] = getattr(eval_func, "__ep_params__", {}) - ep_rollout_processor = ep_params.get("rollout_processor") - ep_rollout_processor_kwargs = ep_params.get("rollout_processor_kwargs") or {} - ep_mcp_config_path = ep_params.get("mcp_config_path") or "" + ep_params = getattr(eval_func, "__ep_params__", None) + # ep_params is an EPParameters model (Pydantic), use attribute access + ep_rollout_processor = getattr(ep_params, "rollout_processor", None) if ep_params else None + ep_rollout_processor_kwargs = ( + (getattr(ep_params, "rollout_processor_kwargs", None) or {}) if ep_params else {} + ) + ep_mcp_config_path = (getattr(ep_params, "mcp_config_path", None) or "") if ep_params else "" logger.info( "[OpenEnvVLLM] Loaded eval test '%s' with rollout_processor=%s", getattr(eval_func, "__name__", str(eval_func)), diff --git a/eval_protocol/training/gepa_trainer.py b/eval_protocol/training/gepa_trainer.py 
index 1b670780..d4fa64b3 100644 --- a/eval_protocol/training/gepa_trainer.py +++ b/eval_protocol/training/gepa_trainer.py @@ -10,6 +10,9 @@ from eval_protocol.models import EPParameters, EvaluationRow, Message from eval_protocol.pytest.types import TestFunction, RolloutProcessorConfig +from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor +from eval_protocol.pytest.execution import execute_pytest +from eval_protocol.dataset_logger import default_logger from eval_protocol.training.trainer import Trainer from eval_protocol.training.utils import build_ep_parameters_from_test from eval_protocol.training.gepa_utils import ( @@ -98,12 +101,15 @@ def __init__( # Store configuration self._input_field = input_field self._output_field = output_field + self._train_ratio = train_ratio + self._val_ratio = val_ratio + self._seed = seed # Configure DSPy to use the same LLM as EP configure_dspy_lm(self.ep_params) - # Wrap the EP test function as a GEPA metric - self.metric = ep_test_to_gepa_metric(test_fn) + # Wrap the EP test function as a GEPA metric (with configured field names) + self.metric = ep_test_to_gepa_metric(test_fn, input_field, output_field) # Load and split the dataset self._rows: List[EvaluationRow] = self._load_dataset() @@ -113,6 +119,10 @@ def __init__( val_ratio=val_ratio, seed=seed, ) + # Store original EvaluationRow objects for later use in evaluate_with_ep + self._train_rows: List[EvaluationRow] = train_rows + self._val_rows: List[EvaluationRow] = val_rows + self._test_rows: List[EvaluationRow] = test_rows # Extract the system prompt from the dataset (this is what GEPA will optimize!) 
self._initial_system_prompt = extract_system_prompt_from_rows(self._rows) @@ -372,23 +382,13 @@ async def evaluate_with_ep( - 'score': Aggregate score - 'optimized_prompt': The prompt used for evaluation """ - from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor - from eval_protocol.pytest.execution import execute_pytest - from eval_protocol.logging import default_logger - # Get optimized system prompt optimized_prompt = self.get_optimized_system_prompt(optimized_program) # Get rows to evaluate if use_test_set: - # Reconstruct test rows from test_set examples - _, _, test_rows = train_val_test_split( - self._rows, - train_ratio=0.5, # Match the ratio used in training - val_ratio=0.3, - seed=42, - ) - rows_to_eval = test_rows + # Use stored test rows (same split from __init__) + rows_to_eval = self._test_rows else: rows_to_eval = self._rows diff --git a/eval_protocol/training/gepa_utils.py b/eval_protocol/training/gepa_utils.py index 0e3df571..42ebb13d 100644 --- a/eval_protocol/training/gepa_utils.py +++ b/eval_protocol/training/gepa_utils.py @@ -91,23 +91,30 @@ def build_reflection_lm(reflection_lm_name: str) -> LM: return dspy.LM(model=reflection_lm_name) -def gold_and_pred_to_row(gold: Example, pred: Prediction) -> EvaluationRow: +def gold_and_pred_to_row( + gold: Example, + pred: Prediction, + input_field: str = "problem", + output_field: str = "answer", +) -> EvaluationRow: """ Convert a GEPA (gold, pred) pair into an EvaluationRow for an EP `@evaluation_test`. - Assumptions (aligned with common DSPy usage): - - `gold.answer` holds the ground-truth answer. - - `pred.answer` holds the model's final answer text. + Args: + gold: The ground-truth example + pred: The model's prediction + input_field: Name of the input field in the DSPy signature + output_field: Name of the output field in the DSPy signature Note: ground_truth is preserved in its original type (list, dict, str, etc.) 
to support structured comparisons like SQL result matching. """ - gt = gold.get("answer", None) + gt = gold.get(output_field, None) # Preserve original type - don't convert to string! # This is important for SQL evaluators that expect list[dict] results ground_truth = gt - content = pred.get("answer", "") + content = pred.get(output_field, "") return EvaluationRow( messages=[ @@ -135,13 +142,20 @@ def row_to_prediction(row: EvaluationRow) -> ScoreWithFeedback: def ep_test_to_gepa_metric( test_fn: TestFunction, + input_field: str = "problem", + output_field: str = "answer", ) -> GEPAFeedbackMetric: """ Adapter: convert an EP-style `test_fn(row: EvaluationRow) -> EvaluationRow` into a GEPAFeedbackMetric-compatible callable. + Args: + test_fn: The EP evaluation test function + input_field: Name of the input field in the DSPy signature (default: "problem") + output_field: Name of the output field in the DSPy signature (default: "answer") + The resulting metric: - - Constructs an EvaluationRow from (gold, pred) using a simple heuristic. + - Constructs an EvaluationRow from (gold, pred) using the configured field names. - Applies the EP test_fn to populate `row.evaluation_result`. - Returns a dspy.Prediction(score, feedback) derived from that result. 
@@ -158,7 +172,7 @@ def metric( pred_name: Optional[str] = None, pred_trace: Optional[DSPyTrace] = None, ) -> ScoreWithFeedback: - row = gold_and_pred_to_row(gold, pred) + row = gold_and_pred_to_row(gold, pred, input_field, output_field) # Call the test function - handle both sync and async result = test_fn(row) # pyright: ignore From 3336b908b04e909eacd9f67ca76f400e9c5da967 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Mon, 15 Dec 2025 12:12:47 -0600 Subject: [PATCH 14/15] fix --- eval_protocol/training/gepa_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval_protocol/training/gepa_utils.py b/eval_protocol/training/gepa_utils.py index 42ebb13d..15f30681 100644 --- a/eval_protocol/training/gepa_utils.py +++ b/eval_protocol/training/gepa_utils.py @@ -359,7 +359,7 @@ def configure_dspy_lm(ep_params: EPParameters) -> None: # Handle completion_params being a list (for sweeps) - use the first one if isinstance(raw_params, list): - completion_params = raw_params[0] if raw_params else {} + completion_params = (raw_params[0] if raw_params else None) or {} else: completion_params = raw_params or {} From 7b3c4206495a963a93a5d0c69e03ba1c2e540388 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Mon, 15 Dec 2025 12:39:06 -0600 Subject: [PATCH 15/15] updated --- eval_protocol/models.py | 1 + eval_protocol/pytest/evaluation_test.py | 1 + eval_protocol/training/gepa_trainer.py | 62 +++++++++++++++++++++++-- 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/eval_protocol/models.py b/eval_protocol/models.py index b869e140..a90fe8ac 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -1205,6 +1205,7 @@ class EPParameters(BaseModel): dataset_adapter: Optional[Callable[..., Any]] = None rollout_processor: Any = None rollout_processor_kwargs: Dict[str, Any] | None = None + evaluation_test_kwargs: Any = None aggregation_method: Any = Field(default="mean") passed_threshold: Any = None disable_browser_open: bool = False diff --git 
a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 82065517..4a7ea88a 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -706,6 +706,7 @@ async def _collect_result(config, lst): dataset_adapter=dataset_adapter, rollout_processor=rollout_processor, rollout_processor_kwargs=rollout_processor_kwargs, + evaluation_test_kwargs=evaluation_test_kwargs, aggregation_method=aggregation_method, passed_threshold=passed_threshold, disable_browser_open=disable_browser_open, diff --git a/eval_protocol/training/gepa_trainer.py b/eval_protocol/training/gepa_trainer.py index d4fa64b3..d91efe67 100644 --- a/eval_protocol/training/gepa_trainer.py +++ b/eval_protocol/training/gepa_trainer.py @@ -157,14 +157,29 @@ def _load_dataset(self) -> List[EvaluationRow]: Supports: - input_rows: Pre-constructed EvaluationRow objects + - Can be List[EvaluationRow] (direct usage) + - Or Sequence[list[EvaluationRow]] (parameterized usage) - input_dataset: Paths to JSONL files (requires dataset_adapter) - input_messages: Raw message lists + - data_loaders: EvaluationDataLoader instances """ ep = self.ep_params # Case 1: Pre-constructed rows + # Handle both direct List[EvaluationRow] and parameterized Sequence[list[EvaluationRow]] if ep.input_rows: - return list(ep.input_rows) + rows_input = ep.input_rows + # Check if it's a list of EvaluationRows (direct) or list of lists (parameterized) + if rows_input and isinstance(rows_input[0], EvaluationRow): + # Direct usage: List[EvaluationRow] + return list(rows_input) + else: + # Parameterized usage: Sequence[list[EvaluationRow]] + all_rows: List[EvaluationRow] = [] + for rows_list in rows_input: + if rows_list is not None: + all_rows.extend(rows_list) + return all_rows # Case 2: Dataset paths with adapter if ep.input_dataset and ep.dataset_adapter: @@ -183,17 +198,54 @@ def _load_dataset(self) -> List[EvaluationRow]: return ep.dataset_adapter(all_data) # Case 3: Input 
messages (convert to rows) + # Handle both direct List[List[Message]] and parameterized Sequence[list[list[Message]] | None] if ep.input_messages: - from eval_protocol.models import Message + rows: List[EvaluationRow] = [] + messages_input = ep.input_messages + + # Check if first element is a Message (direct list of conversations) or a list (parameterized) + if messages_input and messages_input[0]: + first_elem = messages_input[0] + # Check if it's List[Message] (a single conversation) or List[List[Message]] + if hasattr(first_elem, "role"): + # It's a Message - so input is a single conversation List[Message] + rows.append(EvaluationRow(messages=list(messages_input))) + elif first_elem and hasattr(first_elem[0], "role"): + # It's List[List[Message]] - direct usage with multiple conversations + for messages in messages_input: + if messages: + rows.append(EvaluationRow(messages=messages)) + else: + # Parameterized usage: Sequence[list[list[Message]] | None] + for messages_list in messages_input: + if messages_list is not None: + for messages in messages_list: + rows.append(EvaluationRow(messages=messages)) + return rows + + # Case 4: Data loaders + if ep.data_loaders: + from eval_protocol.data_loader.models import EvaluationDataLoader rows = [] - for messages in ep.input_messages: - rows.append(EvaluationRow(messages=messages)) + data_loaders = ep.data_loaders + data_loaders_list = ( + [data_loaders] if isinstance(data_loaders, EvaluationDataLoader) else list(data_loaders) + ) + for data_loader in data_loaders_list: + results = data_loader.load() + for result in results: + rows.extend(result.rows) + + # Apply max_dataset_rows limit + if ep.max_dataset_rows: + rows = rows[: ep.max_dataset_rows] + return rows raise ValueError( "No dataset found in ep_params. " - "Provide input_rows, input_dataset (with dataset_adapter), or input_messages." + "Provide input_rows, input_dataset (with dataset_adapter), input_messages, or data_loaders." ) @property