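Fix EPParameters access, GEPA field names, and test-split reuse

- openenv_trl_vllm: read `__ep_params__` via attribute access, since it is
  an EPParameters (Pydantic) model rather than a dict.
- gepa_trainer: move the rollout-processor, execute_pytest, and
  default_logger imports to module level (default_logger now comes from
  eval_protocol.dataset_logger); keep the split ratios, seed, and the
  train/val/test rows on the instance; evaluate_with_ep now uses the stored
  test rows instead of re-splitting with hard-coded ratios and seed.
- gepa_utils: pass input_field/output_field through
  ep_test_to_gepa_metric and gold_and_pred_to_row instead of hard-coding
  the "answer" field.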

shreymodi1 · shreymodi1 · commit c04acf8c4a0d · 2025-12-15T12:02:51.000-06:00
diff --git a/eval_protocol/pytest/integrations/openenv_trl_vllm.py b/eval_protocol/pytest/integrations/openenv_trl_vllm.py
@@ -121,10 +121,13 @@ def rollout_func(prompts: List[str], trainer) -> Dict[str, List]:
 
             eval_func = candidate_tests[0]
             ep_eval_func = eval_func  # used later after rollouts complete
-            ep_params: Dict[str, Any] = getattr(eval_func, "__ep_params__", {})
-            ep_rollout_processor = ep_params.get("rollout_processor")
-            ep_rollout_processor_kwargs = ep_params.get("rollout_processor_kwargs") or {}
-            ep_mcp_config_path = ep_params.get("mcp_config_path") or ""
+            ep_params = getattr(eval_func, "__ep_params__", None)
+            # ep_params is an EPParameters (Pydantic) model, not a dict, so read its fields via attribute access.
+            ep_rollout_processor = getattr(ep_params, "rollout_processor", None) if ep_params else None
+            ep_rollout_processor_kwargs = (
+                (getattr(ep_params, "rollout_processor_kwargs", None) or {}) if ep_params else {}
+            )
+            ep_mcp_config_path = (getattr(ep_params, "mcp_config_path", None) or "") if ep_params else ""
             logger.info(
                 "[OpenEnvVLLM] Loaded eval test '%s' with rollout_processor=%s",
                 getattr(eval_func, "__name__", str(eval_func)),
diff --git a/eval_protocol/training/gepa_trainer.py b/eval_protocol/training/gepa_trainer.py
@@ -10,6 +10,9 @@
 
 from eval_protocol.models import EPParameters, EvaluationRow, Message
 from eval_protocol.pytest.types import TestFunction, RolloutProcessorConfig
+from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
+from eval_protocol.pytest.execution import execute_pytest
+from eval_protocol.dataset_logger import default_logger
 from eval_protocol.training.trainer import Trainer
 from eval_protocol.training.utils import build_ep_parameters_from_test
 from eval_protocol.training.gepa_utils import (
@@ -98,12 +101,15 @@ def __init__(
         # Store configuration
         self._input_field = input_field
         self._output_field = output_field
+        self._train_ratio = train_ratio
+        self._val_ratio = val_ratio
+        self._seed = seed
 
         # Configure DSPy to use the same LLM as EP
         configure_dspy_lm(self.ep_params)
 
-        # Wrap the EP test function as a GEPA metric
-        self.metric = ep_test_to_gepa_metric(test_fn)
+        # Wrap the EP test function as a GEPA metric (with configured field names)
+        self.metric = ep_test_to_gepa_metric(test_fn, input_field, output_field)
 
         # Load and split the dataset
         self._rows: List[EvaluationRow] = self._load_dataset()
@@ -113,6 +119,10 @@ def __init__(
             val_ratio=val_ratio,
             seed=seed,
         )
+        # Store original EvaluationRow objects for later use in evaluate_with_ep
+        self._train_rows: List[EvaluationRow] = train_rows
+        self._val_rows: List[EvaluationRow] = val_rows
+        self._test_rows: List[EvaluationRow] = test_rows
 
         # Extract the system prompt from the dataset (this is what GEPA will optimize!)
         self._initial_system_prompt = extract_system_prompt_from_rows(self._rows)
@@ -372,23 +382,13 @@ async def evaluate_with_ep(
             - 'score': Aggregate score
             - 'optimized_prompt': The prompt used for evaluation
         """
-        from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
-        from eval_protocol.pytest.execution import execute_pytest
-        from eval_protocol.logging import default_logger
-
         # Get optimized system prompt
         optimized_prompt = self.get_optimized_system_prompt(optimized_program)
 
         # Get rows to evaluate
         if use_test_set:
-            # Reconstruct test rows from test_set examples
-            _, _, test_rows = train_val_test_split(
-                self._rows,
-                train_ratio=0.5,  # Match the ratio used in training
-                val_ratio=0.3,
-                seed=42,
-            )
-            rows_to_eval = test_rows
+            # Use stored test rows (same split from __init__)
+            rows_to_eval = self._test_rows
         else:
             rows_to_eval = self._rows
 
diff --git a/eval_protocol/training/gepa_utils.py b/eval_protocol/training/gepa_utils.py
@@ -91,23 +91,30 @@ def build_reflection_lm(reflection_lm_name: str) -> LM:
         return dspy.LM(model=reflection_lm_name)
 
 
-def gold_and_pred_to_row(gold: Example, pred: Prediction) -> EvaluationRow:
+def gold_and_pred_to_row(
+    gold: Example,
+    pred: Prediction,
+    input_field: str = "problem",
+    output_field: str = "answer",
+) -> EvaluationRow:
     """
     Convert a GEPA (gold, pred) pair into an EvaluationRow for an EP `@evaluation_test`.
 
-    Assumptions (aligned with common DSPy usage):
-    - `gold.answer` holds the ground-truth answer.
-    - `pred.answer` holds the model's final answer text.
+    Args:
+        gold: The ground-truth example
+        pred: The model's prediction
+        input_field: Name of the input field in the DSPy signature
+        output_field: Name of the output field in the DSPy signature
 
     Note: ground_truth is preserved in its original type (list, dict, str, etc.)
     to support structured comparisons like SQL result matching.
     """
-    gt = gold.get("answer", None)
+    gt = gold.get(output_field, None)
     # Preserve original type - don't convert to string!
     # This is important for SQL evaluators that expect list[dict] results
     ground_truth = gt
 
-    content = pred.get("answer", "")
+    content = pred.get(output_field, "")
 
     return EvaluationRow(
         messages=[
@@ -135,13 +142,20 @@ def row_to_prediction(row: EvaluationRow) -> ScoreWithFeedback:
 
 def ep_test_to_gepa_metric(
     test_fn: TestFunction,
+    input_field: str = "problem",
+    output_field: str = "answer",
 ) -> GEPAFeedbackMetric:
     """
     Adapter: convert an EP-style `test_fn(row: EvaluationRow) -> EvaluationRow` into
     a GEPAFeedbackMetric-compatible callable.
 
+    Args:
+        test_fn: The EP evaluation test function
+        input_field: Name of the input field in the DSPy signature (default: "problem")
+        output_field: Name of the output field in the DSPy signature (default: "answer")
+
     The resulting metric:
-    - Constructs an EvaluationRow from (gold, pred) using a simple heuristic.
+    - Constructs an EvaluationRow from (gold, pred) using the configured field names.
     - Applies the EP test_fn to populate `row.evaluation_result`.
     - Returns a dspy.Prediction(score, feedback) derived from that result.
 
@@ -158,7 +172,7 @@ def metric(
         pred_name: Optional[str] = None,
         pred_trace: Optional[DSPyTrace] = None,
     ) -> ScoreWithFeedback:
-        row = gold_and_pred_to_row(gold, pred)
+        row = gold_and_pred_to_row(gold, pred, input_field, output_field)
 
         # Call the test function - handle both sync and async
         result = test_fn(row)  # pyright: ignore