@@ -91,23 +91,30 @@ def build_reflection_lm(reflection_lm_name: str) -> LM:
91 91     return dspy.LM(model=reflection_lm_name)
9292
9393
94  -  def gold_and_pred_to_row(gold: Example, pred: Prediction) -> EvaluationRow:
94  +  def gold_and_pred_to_row(
95  +      gold: Example,
96  +      pred: Prediction,
97  +      input_field: str = "problem",
98  +      output_field: str = "answer",
99  +  ) -> EvaluationRow:
95  100    """
96  101    Convert a GEPA (gold, pred) pair into an EvaluationRow for an EP `@evaluation_test`.
97  102
98  -      Assumptions (aligned with common DSPy usage):
99  -      - `gold.answer` holds the ground-truth answer.
100 -      - `pred.answer` holds the model's final answer text.
103 +      Args:
104 +          gold: The ground-truth example
105 +          pred: The model's prediction
106 +          input_field: Name of the input field in the DSPy signature
107 +          output_field: Name of the output field in the DSPy signature
101 108
102 109    Note: ground_truth is preserved in its original type (list, dict, str, etc.)
103 110    to support structured comparisons like SQL result matching.
104 111    """
105 -      gt = gold.get("answer", None)
112 +      gt = gold.get(output_field, None)
106 113    # Preserve original type - don't convert to string!
107 114    # This is important for SQL evaluators that expect list[dict] results
108 115    ground_truth = gt
109 116
110 -      content = pred.get("answer", "")
117 +      content = pred.get(output_field, "")
111 118
112 119    return EvaluationRow(
113 120        messages=[
@@ -135,13 +142,20 @@ def row_to_prediction(row: EvaluationRow) -> ScoreWithFeedback:
135 142
136 143 def ep_test_to_gepa_metric(
137 144     test_fn: TestFunction,
145 +      input_field: str = "problem",
146 +      output_field: str = "answer",
138 147 ) -> GEPAFeedbackMetric:
139 148     """
140 149     Adapter: convert an EP-style `test_fn(row: EvaluationRow) -> EvaluationRow` into
141 150     a GEPAFeedbackMetric-compatible callable.
142 151
152 +      Args:
153 +          test_fn: The EP evaluation test function
154 +          input_field: Name of the input field in the DSPy signature (default: "problem")
155 +          output_field: Name of the output field in the DSPy signature (default: "answer")
156 +
143 157     The resulting metric:
144 -      - Constructs an EvaluationRow from (gold, pred) using a simple heuristic.
158 +      - Constructs an EvaluationRow from (gold, pred) using the configured field names.
145 159     - Applies the EP test_fn to populate `row.evaluation_result`.
146 160     - Returns a dspy.Prediction(score, feedback) derived from that result.
147 161
@@ -158,7 +172,7 @@ def metric(
158 172         pred_name: Optional[str] = None,
159 173         pred_trace: Optional[DSPyTrace] = None,
160 174     ) -> ScoreWithFeedback:
161 -          row = gold_and_pred_to_row(gold, pred)
175 +          row = gold_and_pred_to_row(gold, pred, input_field, output_field)
162 176
163 177         # Call the test function - handle both sync and async
164 178         result = test_fn(row)  # pyright: ignore