11import atexit
22import shutil
33import tempfile
4- from eval_protocol .models import EvaluationRow , Message
4+ from eval_protocol .models import EvaluationRow , Message , EvaluateResult
55from eval_protocol .pytest import evaluation_test
66from eval_protocol .pytest .default_no_op_rollout_processor import NoOpRolloutProcessor
77from eval_protocol .pytest .default_single_turn_rollout_process import SingleTurnRolloutProcessor
@@ -23,6 +23,7 @@ def test_input_messages_in_decorator(row: EvaluationRow) -> EvaluationRow:
2323 """Run math evaluation on sample dataset using pytest interface."""
2424 assert row .messages [0 ].content == "What is the capital of France?"
2525 assert row .execution_metadata .invocation_id == "test-invocation-123"
26+ row .evaluation_result = EvaluateResult (score = 0.0 , reason = "Dummy evaluation result" )
2627 return row
2728
2829
@@ -38,6 +39,7 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow:
3839 """Run math evaluation on sample dataset using pytest interface."""
3940 assert row .messages [0 ].content == "What is 5 * 6?"
4041 assert row .input_metadata .completion_params ["model" ] == "gpt-40"
42+ row .evaluation_result = EvaluateResult (score = 0.0 , reason = "Dummy evaluation result" )
4143 return row
4244
4345
@@ -60,6 +62,7 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow:
6062 )
def test_input_override(row: EvaluationRow) -> EvaluationRow:
    """Check the first message of the row and attach a placeholder result.

    Asserts that the content of ``row.messages[0]`` is the expected prompt,
    then stores a zero-score ``EvaluateResult`` on the row and returns it.
    """
    first_message = row.messages[0]
    assert first_message.content == "What is 10 / 2?"

    # This test only inspects the input, so the stored result is a dummy.
    placeholder = EvaluateResult(score=0.0, reason="Dummy evaluation result")
    row.evaluation_result = placeholder
    return row
6467
6568
@@ -79,6 +82,7 @@ def test_no_op_rollout_processor_override_from_none(row: EvaluationRow) -> Evalu
7982 # Verify that no actual model call was made (NoOpRolloutProcessor doesn't modify messages)
8083 assert len (row .messages ) == 1
8184 assert row .messages [0 ].role == "user"
85+ row .evaluation_result = EvaluateResult (score = 0.0 , reason = "Dummy evaluation result" )
8286 return row
8387
8488 @evaluation_test (
@@ -96,6 +100,7 @@ def test_no_op_rollout_processor_override_from_other(row: EvaluationRow) -> Eval
96100 assert row .messages [0 ].role == "user"
97101 # Verify the original message content is preserved (no assistant response added)
98102 assert row .messages [0 ].content == "Test override"
103+ row .evaluation_result = EvaluateResult (score = 0.0 , reason = "Dummy evaluation result" )
99104 return row
100105
101106 @evaluation_test (
@@ -115,6 +120,7 @@ def test_no_op_rollout_processor_override_multiple_rows(row: EvaluationRow) -> E
115120 # Verify rows pass through unchanged
116121 assert len (row .messages ) == 1
117122 assert row .messages [0 ].role == "user"
123+ row .evaluation_result = EvaluateResult (score = 0.0 , reason = "Dummy evaluation result" )
118124 return row
119125
120126
0 commit comments