Skip to content

Commit 59a1133

Browse files
committed
fix ut
1 parent aaf858b commit 59a1133

File tree

2 files changed

+29 -22 lines changed

2 files changed

+29 -22 lines changed

eval_protocol/pytest/evaluation_test.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -602,7 +602,9 @@ async def _collect_result(config, lst):
602602
r.eval_metadata.status = Status.error(
603603
r.rollout_status.message, r.rollout_status.details
604604
)
605-
elif not (r.eval_metadata.status and r.eval_metadata.status.code != Status.Code.RUNNING):
605+
elif not (
606+
r.eval_metadata.status and r.eval_metadata.status.code != Status.Code.RUNNING
607+
):
606608
# if the eval_metadata status code has not been set to something else, consider it as finished
607609
r.eval_metadata.status = Status.eval_finished()
608610
# Optional debug print for assistant/tool sequence

tests/pytest/test_pytest_evaluator_error_handling.py

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,14 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
8585
assert row.evaluation_result.is_score_valid is False
8686
assert "Error during evaluation: ValueError: Test error in evaluation function" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue]
8787

88-
# Check eval_metadata.status was set to score_invalid (due to is_score_valid=False in postprocess)
88+
# Check eval_metadata.status was set to error and is preserved (not overridden by postprocess)
8989
assert row.eval_metadata is not None
9090
assert row.eval_metadata.status is not None
91-
assert row.eval_metadata.status.is_score_invalid()
92-
assert row.eval_metadata.status.message == "Score is invalid"
91+
assert row.eval_metadata.status.is_error()
92+
assert (
93+
"Error during evaluation: ValueError: Test error in evaluation function"
94+
in row.eval_metadata.status.message
95+
)
9396

9497
async def test_pointwise_evaluation_runtime_error(self):
9598
"""Test that RuntimeError in evaluation function is properly caught and handled."""
@@ -121,10 +124,10 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
121124
# Check error type is included in reason
122125
assert row.evaluation_result is not None
123126
assert "RuntimeError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue]
124-
# Status will be score_invalid (not error) due to postprocess override
127+
# Status will be error and preserved (not overridden by postprocess)
125128
assert row.eval_metadata is not None
126129
assert row.eval_metadata.status is not None
127-
assert row.eval_metadata.status.is_score_invalid()
130+
assert row.eval_metadata.status.is_error()
128131

129132
async def test_pointwise_evaluation_multiple_runs_with_errors(self):
130133
"""Test that errors are handled consistently across multiple runs."""
@@ -156,10 +159,10 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
156159
assert row.evaluation_result.score == 0.0
157160
assert row.evaluation_result.is_score_valid is False
158161
assert "ValueError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue]
159-
# Status will be score_invalid due to postprocess
162+
# Status will be error and preserved
160163
assert row.eval_metadata is not None
161164
assert row.eval_metadata.status is not None
162-
assert row.eval_metadata.status.is_score_invalid()
165+
assert row.eval_metadata.status.is_error()
163166

164167
async def test_pointwise_evaluation_custom_exception(self):
165168
"""Test handling of custom exception types."""
@@ -196,10 +199,10 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
196199
assert row.evaluation_result is not None
197200
assert "CustomEvaluationError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue]
198201
assert "Custom error with details" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue]
199-
# Status will be score_invalid due to postprocess
202+
# Status will be error and preserved
200203
assert row.eval_metadata is not None
201204
assert row.eval_metadata.status is not None
202-
assert row.eval_metadata.status.is_score_invalid()
205+
assert row.eval_metadata.status.is_error()
203206

204207
async def test_pointwise_evaluation_error_with_multiline_message(self):
205208
"""Test handling of errors with multiline error messages."""
@@ -280,10 +283,10 @@ def eval_fn(rows: list[EvaluationRow]) -> list[EvaluationRow]:
280283
in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue]
281284
)
282285

283-
# Status will be score_invalid due to postprocess
286+
# Status will be error and preserved
284287
assert row.eval_metadata is not None
285288
assert row.eval_metadata.status is not None
286-
assert row.eval_metadata.status.is_score_invalid()
289+
assert row.eval_metadata.status.is_error()
287290

288291
async def test_groupwise_evaluation_runtime_error(self):
289292
"""Test that RuntimeError in groupwise evaluation function is properly caught and handled."""
@@ -321,10 +324,10 @@ def eval_fn(rows: list[EvaluationRow]) -> list[EvaluationRow]:
321324
for row in rollouts.values():
322325
if row.evaluation_result is not None:
323326
assert "RuntimeError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue]
324-
# Status will be score_invalid due to postprocess
327+
# Status will be error and preserved
325328
assert row.eval_metadata is not None
326329
assert row.eval_metadata.status is not None
327-
assert row.eval_metadata.status.is_score_invalid()
330+
assert row.eval_metadata.status.is_error()
328331

329332

330333
class TestEvaluatorErrorHandlingEdgeCases:
@@ -467,17 +470,17 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
467470
assert row.evaluation_result.score == 0.0
468471
assert row.evaluation_result.is_score_valid is False
469472
assert "ValueError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue]
470-
# Status will be score_invalid due to postprocess
473+
# Status will be error and preserved
471474
assert row.eval_metadata is not None
472475
assert row.eval_metadata.status is not None
473-
assert row.eval_metadata.status.is_score_invalid()
476+
assert row.eval_metadata.status.is_error()
474477

475478

476479
class TestEvaluatorErrorHandlingStatusCodes:
477480
"""Test that Status codes are correctly set for different error scenarios."""
478481

479-
async def test_error_status_uses_score_invalid_code(self):
480-
"""Test that error status uses Status.Code.SCORE_INVALID due to postprocess."""
482+
async def test_error_status_uses_internal_code(self):
483+
"""Test that error status uses Status.Code.INTERNAL and is preserved."""
481484
from eval_protocol.pytest.evaluation_test import evaluation_test
482485

483486
input_messages = [
@@ -502,10 +505,11 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
502505
assert len(rollouts) == 1
503506
row = list(rollouts.values())[0]
504507

505-
# Verify status code is SCORE_INVALID (102) after postprocess
508+
# Verify status code is INTERNAL (13) and preserved (not overridden by postprocess)
506509
assert row.eval_metadata is not None
507510
assert row.eval_metadata.status is not None
508-
assert row.eval_metadata.status.code == Status.Code.SCORE_INVALID
511+
assert row.eval_metadata.status.code == Status.Code.INTERNAL
512+
assert row.eval_metadata.status.is_error()
509513

510514
async def test_evaluation_result_reason_format(self):
511515
"""Test that evaluation_result.reason contains the error details."""
@@ -541,7 +545,8 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
541545
assert "KeyError" in reason # pyright: ignore[reportOperatorIssue]
542546
assert "missing_key" in reason # pyright: ignore[reportOperatorIssue]
543547

544-
# Status will be score_invalid, not containing the error details
548+
# Status will be error and preserved
545549
assert row.eval_metadata is not None
546550
assert row.eval_metadata.status is not None
547-
assert row.eval_metadata.status.is_score_invalid()
551+
assert row.eval_metadata.status.is_error()
552+
assert "KeyError" in row.eval_metadata.status.message

0 commit comments

Comments (0)