|
1 | 1 | import os |
| 2 | +from typing import Optional |
2 | 3 |
|
3 | 4 | import dspy |
4 | 5 | from dspy.clients.lm import LM |
| 6 | +from dspy.primitives import Example, Prediction |
| 7 | +from dspy.teleprompt.gepa.gepa_utils import DSPyTrace, ScoreWithFeedback |
| 8 | +from dspy.teleprompt.gepa.gepa import GEPAFeedbackMetric |
| 9 | + |
| 10 | +from eval_protocol.pytest.types import TestFunction |
| 11 | +from eval_protocol.models import EvaluationRow, Message |
| 12 | + |
5 | 13 |
|
6 | 14 | REFLECTION_LM_CONFIGS = { |
7 | 15 | "gpt-5": { |
@@ -30,3 +38,70 @@ def build_reflection_lm(reflection_lm_name: str) -> LM: |
30 | 38 | api_key=reflection_lm_config["api_key"], |
31 | 39 | base_url=reflection_lm_config["base_url"], |
32 | 40 | ) |
| 41 | + |
| 42 | + |
def gold_and_pred_to_row(gold: Example, pred: Prediction) -> EvaluationRow:
    """
    Build an EvaluationRow for an EP `@evaluation_test` from a GEPA (gold, pred) pair.

    Assumptions (aligned with common DSPy usage):
    - `gold.answer` holds the ground-truth answer.
    - `pred.answer` holds the model's final answer text.
    """
    raw_gt = gold.get("answer", None)
    answer_text = pred.get("answer", "")

    # Stringify the ground truth only when one is present; otherwise keep None.
    ground_truth: Optional[str] = None if raw_gt is None else str(raw_gt)

    # TODO: for some evals, you might need system / user message too.
    conversation = [Message(role="assistant", content=str(answer_text))]

    return EvaluationRow(
        messages=conversation,
        ground_truth=ground_truth,
    )
| 62 | + |
| 63 | + |
def row_to_prediction(row: EvaluationRow) -> ScoreWithFeedback:
    """
    Convert an EvaluationRow into a GEPA-compatible ScoreWithFeedback
    (implemented as a dspy.Prediction subclass in dspy.teleprompt.gepa).
    """
    result = row.evaluation_result

    # Guard: the EP test produced no result for this row — score it zero.
    if result is None:
        return dspy.Prediction(
            score=0.0,
            feedback="No evaluation_result was produced by the evaluation_test.",
        )

    numeric_score = float(result.score or 0.0)
    # Prefer the evaluator's own reason; fall back to a generic score message.
    if result.reason:
        feedback_text = result.reason
    else:
        feedback_text = f"This trajectory got a score of {numeric_score}."
    return dspy.Prediction(score=numeric_score, feedback=feedback_text)
| 78 | + |
| 79 | + |
def ep_test_to_gepa_metric(
    test_fn: TestFunction,
) -> GEPAFeedbackMetric:
    """
    Adapter: convert an EP-style `test_fn(row: EvaluationRow) -> EvaluationRow` into
    a GEPAFeedbackMetric-compatible callable.

    The resulting metric:
    - Constructs an EvaluationRow from (gold, pred) using a simple heuristic.
    - Applies the EP test_fn to populate `row.evaluation_result`.
    - Returns a dspy.Prediction(score, feedback) derived from that result.
    """

    def metric(
        gold: Example,
        pred: Prediction,
        trace: Optional[DSPyTrace] = None,
        pred_name: Optional[str] = None,
        pred_trace: Optional[DSPyTrace] = None,
    ) -> ScoreWithFeedback:
        # Heuristic (gold, pred) -> EP row conversion.
        input_row = gold_and_pred_to_row(gold, pred)

        # Run the EP test to fill in `evaluation_result`.
        # TODO: this is problematic. for groupwise, we will have to extend this to handle list[EvaluationRow]
        scored_row: EvaluationRow = test_fn(input_row)  # pyright: ignore

        return row_to_prediction(scored_row)

    return metric
0 commit comments