Skip to content

Commit 693274e

Browse files
committed
attempt at primitive conversion
1 parent c61de5b commit 693274e

File tree

3 files changed

+125
-5
lines changed

3 files changed

+125
-5
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,44 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
6363
return None
6464

6565

66+
def _build_feedback_text(
67+
*,
68+
extracted_int: Optional[int],
69+
gt_int: Optional[int],
70+
is_valid: bool,
71+
raw_model_answer: str,
72+
ground_truth: Optional[str],
73+
) -> str:
74+
"""
75+
Build a feedback string similar in spirit to the GEPA `metric_with_feedback`.
76+
77+
Cases:
78+
- Parse failure (model or gold): explain integer formatting and show correct answer.
79+
- Correct: "Your answer is correct. The correct answer is '...'."
80+
- Incorrect: "Your answer is incorrect. The correct answer is '...'."
81+
"""
82+
correct_answer_display = str(gt_int if gt_int is not None else (ground_truth or ""))
83+
84+
if not is_valid:
85+
# Could not parse either the model answer or the gold answer as an integer.
86+
feedback_text = (
87+
"The final answer must be a valid integer and nothing else. "
88+
f"You responded with '{raw_model_answer}', which couldn't be parsed as a python integer. "
89+
"Please ensure your answer is a valid integer without any additional text or formatting."
90+
)
91+
if correct_answer_display:
92+
feedback_text += f" The correct answer is '{correct_answer_display}'."
93+
return feedback_text
94+
95+
if extracted_int == gt_int:
96+
return f"Your answer is correct. The correct answer is '{correct_answer_display}'."
97+
else:
98+
return f"Your answer is incorrect. The correct answer is '{correct_answer_display}'."
99+
100+
# TODO: our dataset does not contain written solutions, so we cannot provide feedback on the solution. maybe need to add it later.
101+
# they're using https://huggingface.co/datasets/AI-MO/aimo-validation-aime
102+
103+
66104
def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
67105
converted: List[EvaluationRow] = []
68106
for r in rows:
@@ -126,9 +164,17 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
126164
)
127165
}
128166

167+
feedback_text = _build_feedback_text(
168+
extracted_int=extracted_int,
169+
gt_int=gt_int,
170+
is_valid=is_valid,
171+
raw_model_answer=content_str,
172+
ground_truth=str(row.ground_truth),
173+
)
174+
129175
row.evaluation_result = EvaluateResult(
130176
score=score,
131-
reason=("Answer correct" if score == 1.0 else "Answer incorrect"),
177+
reason=feedback_text,
132178
is_score_valid=is_valid,
133179
metrics=metrics,
134180
)

eval_protocol/training/gepa_trainer.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from eval_protocol.pytest.types import TestFunction
1212
from eval_protocol.training.trainer import Trainer
1313
from eval_protocol.training.utils import build_ep_parameters_from_test
14+
from eval_protocol.training.gepa_utils import ep_test_to_gepa_metric
1415

1516

1617
class GEPATrainer(Trainer):
@@ -33,11 +34,9 @@ def __init__(self, test_fn: TestFunction) -> None:
3334
super().__init__(test_fn)
3435
self.ep_params: EPParameters = build_ep_parameters_from_test(test_fn)
3536

36-
self.metric = test_fn # TODO @derek. need to convert our ep test_fn to a GEPA metric. also need to inject the feedback text.
37+
self.metric = ep_test_to_gepa_metric(test_fn)
3738

38-
self.program = (
39-
...
40-
) # TODO @shreymodi1: converting between a program (dspy.Module) and an @evaluation_test is a bit tricky.
39+
self.program = ... # TODO @shreymodi1: converting between a program (dspy.Module) and rollout processors is a bit tricky. maybe start with single turn
4140

4241
self.train_set, self.val_set, self.test_set = (
4342
...,

eval_protocol/training/gepa_utils.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
import os
2+
from typing import Optional
23

34
import dspy
45
from dspy.clients.lm import LM
6+
from dspy.primitives import Example, Prediction
7+
from dspy.teleprompt.gepa.gepa_utils import DSPyTrace, ScoreWithFeedback
8+
from dspy.teleprompt.gepa.gepa import GEPAFeedbackMetric
9+
10+
from eval_protocol.pytest.types import TestFunction
11+
from eval_protocol.models import EvaluationRow, Message
12+
513

614
REFLECTION_LM_CONFIGS = {
715
"gpt-5": {
@@ -30,3 +38,70 @@ def build_reflection_lm(reflection_lm_name: str) -> LM:
3038
api_key=reflection_lm_config["api_key"],
3139
base_url=reflection_lm_config["base_url"],
3240
)
41+
42+
43+
def gold_and_pred_to_row(gold: Example, pred: Prediction) -> EvaluationRow:
    """
    Convert a GEPA (gold, pred) pair into an EvaluationRow for an EP `@evaluation_test`.

    Assumptions (aligned with common DSPy usage):
    - `gold.answer` holds the ground-truth answer.
    - `pred.answer` holds the model's final answer text.
    """
    raw_gold = gold.get("answer", None)
    raw_pred = pred.get("answer", "")

    # Keep a missing gold answer as None (downstream code distinguishes
    # "no ground truth" from an empty string); otherwise stringify it.
    gold_str: Optional[str] = None if raw_gold is None else str(raw_gold)

    assistant_message = Message(
        role="assistant", content=str(raw_pred)
    )  # TODO: for some evals, you might need system / user message too.
    return EvaluationRow(messages=[assistant_message], ground_truth=gold_str)
62+
63+
64+
def row_to_prediction(row: EvaluationRow) -> ScoreWithFeedback:
    """
    Convert an EvaluationRow into a GEPA-compatible ScoreWithFeedback
    (implemented as a dspy.Prediction subclass in dspy.teleprompt.gepa).
    """
    result = row.evaluation_result
    if result is None:
        # The evaluation_test never populated a result; surface that as feedback
        # with a zero score rather than crashing the optimizer loop.
        return dspy.Prediction(
            score=0.0,
            feedback="No evaluation_result was produced by the evaluation_test.",
        )

    # A None score is treated the same as 0.0.
    numeric_score = float(result.score or 0.0)
    feedback_text = result.reason or f"This trajectory got a score of {numeric_score}."
    # NOTE(review): annotated as ScoreWithFeedback but a plain dspy.Prediction
    # with score/feedback fields is returned — confirm GEPA accepts this.
    return dspy.Prediction(score=numeric_score, feedback=feedback_text)
78+
79+
80+
def ep_test_to_gepa_metric(
    test_fn: TestFunction,
) -> GEPAFeedbackMetric:
    """
    Adapter: convert an EP-style `test_fn(row: EvaluationRow) -> EvaluationRow` into
    a GEPAFeedbackMetric-compatible callable.

    The resulting metric:
    - Constructs an EvaluationRow from (gold, pred) using a simple heuristic.
    - Applies the EP test_fn to populate `row.evaluation_result`.
    - Returns a dspy.Prediction(score, feedback) derived from that result.
    """

    def metric(
        gold: Example,
        pred: Prediction,
        trace: Optional[DSPyTrace] = None,
        pred_name: Optional[str] = None,
        pred_trace: Optional[DSPyTrace] = None,
    ) -> ScoreWithFeedback:
        candidate_row = gold_and_pred_to_row(gold, pred)

        # TODO: this is problematic. for groupwise, we will have to extend this to handle list[EvaluationRow]
        scored_row: EvaluationRow = test_fn(candidate_row)  # pyright: ignore

        return row_to_prediction(scored_row)

    return metric

0 commit comments

Comments
 (0)