@@ -85,11 +85,14 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
8585 assert row .evaluation_result .is_score_valid is False
8686 assert "Error during evaluation: ValueError: Test error in evaluation function" in row .evaluation_result .reason # pyright: ignore[reportOperatorIssue]
8787
88- # Check eval_metadata.status was set to score_invalid (due to is_score_valid=False in postprocess)
88+ # Check eval_metadata.status was set to error and is preserved (not overridden by postprocess)
8989 assert row .eval_metadata is not None
9090 assert row .eval_metadata .status is not None
91- assert row .eval_metadata .status .is_score_invalid ()
92- assert row .eval_metadata .status .message == "Score is invalid"
91+ assert row .eval_metadata .status .is_error ()
92+ assert (
93+ "Error during evaluation: ValueError: Test error in evaluation function"
94+ in row .eval_metadata .status .message
95+ )
9396
9497 async def test_pointwise_evaluation_runtime_error (self ):
9598 """Test that RuntimeError in evaluation function is properly caught and handled."""
@@ -121,10 +124,10 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
121124 # Check error type is included in reason
122125 assert row .evaluation_result is not None
123126 assert "RuntimeError" in row .evaluation_result .reason # pyright: ignore[reportOperatorIssue]
124- # Status will be score_invalid (not error) due to postprocess override
127+ # Status will be error and preserved (not overridden by postprocess)
125128 assert row .eval_metadata is not None
126129 assert row .eval_metadata .status is not None
127- assert row .eval_metadata .status .is_score_invalid ()
130+ assert row .eval_metadata .status .is_error ()
128131
129132 async def test_pointwise_evaluation_multiple_runs_with_errors (self ):
130133 """Test that errors are handled consistently across multiple runs."""
@@ -156,10 +159,10 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
156159 assert row .evaluation_result .score == 0.0
157160 assert row .evaluation_result .is_score_valid is False
158161 assert "ValueError" in row .evaluation_result .reason # pyright: ignore[reportOperatorIssue]
159- # Status will be score_invalid due to postprocess
162+ # Status will be error and preserved
160163 assert row .eval_metadata is not None
161164 assert row .eval_metadata .status is not None
162- assert row .eval_metadata .status .is_score_invalid ()
165+ assert row .eval_metadata .status .is_error ()
163166
164167 async def test_pointwise_evaluation_custom_exception (self ):
165168 """Test handling of custom exception types."""
@@ -196,10 +199,10 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
196199 assert row .evaluation_result is not None
197200 assert "CustomEvaluationError" in row .evaluation_result .reason # pyright: ignore[reportOperatorIssue]
198201 assert "Custom error with details" in row .evaluation_result .reason # pyright: ignore[reportOperatorIssue]
199- # Status will be score_invalid due to postprocess
202+ # Status will be error and preserved
200203 assert row .eval_metadata is not None
201204 assert row .eval_metadata .status is not None
202- assert row .eval_metadata .status .is_score_invalid ()
205+ assert row .eval_metadata .status .is_error ()
203206
204207 async def test_pointwise_evaluation_error_with_multiline_message (self ):
205208 """Test handling of errors with multiline error messages."""
@@ -280,10 +283,10 @@ def eval_fn(rows: list[EvaluationRow]) -> list[EvaluationRow]:
280283 in row .evaluation_result .reason # pyright: ignore[reportOperatorIssue]
281284 )
282285
283- # Status will be score_invalid due to postprocess
286+ # Status will be error and preserved
284287 assert row .eval_metadata is not None
285288 assert row .eval_metadata .status is not None
286- assert row .eval_metadata .status .is_score_invalid ()
289+ assert row .eval_metadata .status .is_error ()
287290
288291 async def test_groupwise_evaluation_runtime_error (self ):
289292 """Test that RuntimeError in groupwise evaluation function is properly caught and handled."""
@@ -321,10 +324,10 @@ def eval_fn(rows: list[EvaluationRow]) -> list[EvaluationRow]:
321324 for row in rollouts .values ():
322325 if row .evaluation_result is not None :
323326 assert "RuntimeError" in row .evaluation_result .reason # pyright: ignore[reportOperatorIssue]
324- # Status will be score_invalid due to postprocess
327+ # Status will be error and preserved
325328 assert row .eval_metadata is not None
326329 assert row .eval_metadata .status is not None
327- assert row .eval_metadata .status .is_score_invalid ()
330+ assert row .eval_metadata .status .is_error ()
328331
329332
330333class TestEvaluatorErrorHandlingEdgeCases :
@@ -467,17 +470,17 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
467470 assert row .evaluation_result .score == 0.0
468471 assert row .evaluation_result .is_score_valid is False
469472 assert "ValueError" in row .evaluation_result .reason # pyright: ignore[reportOperatorIssue]
470- # Status will be score_invalid due to postprocess
473+ # Status will be error and preserved
471474 assert row .eval_metadata is not None
472475 assert row .eval_metadata .status is not None
473- assert row .eval_metadata .status .is_score_invalid ()
476+ assert row .eval_metadata .status .is_error ()
474477
475478
476479class TestEvaluatorErrorHandlingStatusCodes :
477480 """Test that Status codes are correctly set for different error scenarios."""
478481
479- async def test_error_status_uses_score_invalid_code (self ):
480- """Test that error status uses Status.Code.SCORE_INVALID due to postprocess ."""
482+ async def test_error_status_uses_internal_code (self ):
483+ """Test that error status uses Status.Code.INTERNAL and is preserved ."""
481484 from eval_protocol .pytest .evaluation_test import evaluation_test
482485
483486 input_messages = [
@@ -502,10 +505,11 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
502505 assert len (rollouts ) == 1
503506 row = list (rollouts .values ())[0 ]
504507
505- # Verify status code is SCORE_INVALID (102) after postprocess
508+ # Verify status code is INTERNAL (13) and preserved (not overridden by postprocess)
506509 assert row .eval_metadata is not None
507510 assert row .eval_metadata .status is not None
508- assert row .eval_metadata .status .code == Status .Code .SCORE_INVALID
511+ assert row .eval_metadata .status .code == Status .Code .INTERNAL
512+ assert row .eval_metadata .status .is_error ()
509513
510514 async def test_evaluation_result_reason_format (self ):
511515 """Test that evaluation_result.reason contains the error details."""
@@ -541,7 +545,8 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
541545 assert "KeyError" in reason # pyright: ignore[reportOperatorIssue]
542546 assert "missing_key" in reason # pyright: ignore[reportOperatorIssue]
543547
544- # Status will be score_invalid, not containing the error details
548+ # Status will be error and preserved
545549 assert row .eval_metadata is not None
546550 assert row .eval_metadata .status is not None
547- assert row .eval_metadata .status .is_score_invalid ()
551+ assert row .eval_metadata .status .is_error ()
552+ assert "KeyError" in row .eval_metadata .status .message
0 commit comments