From 1dafdd1e82568c6ebc1f620b4dc6d292d4354604 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Wed, 29 Oct 2025 14:46:41 -0700 Subject: [PATCH 1/9] Revert "Revert "add try catch (#297)" (#301)" This reverts commit 56e00c21578dd182f1c05a07ab92af46c3d21cdb. --- eval_protocol/pytest/evaluation_test.py | 38 ++++++++++++++++++------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 0ec07e67..2afb13b5 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -19,6 +19,7 @@ EvaluationRow, EvaluationThreshold, EvaluationThresholdDict, + EvaluateResult, Status, ) from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper @@ -429,11 +430,19 @@ async def _execute_pointwise_eval_with_semaphore( experiment_id=experiment_id, run_id=run_id, ): - result = await execute_pytest( - test_func, - processed_row=row, - evaluation_test_kwargs=evaluation_test_kwargs, - ) + try: + result = await execute_pytest( + test_func, + processed_row=row, + evaluation_test_kwargs=evaluation_test_kwargs, + ) + except Exception as e: + result = row + result.evaluation_result = EvaluateResult( + score=0.0, + is_score_valid=False, + reason=f"Error during evaluation: {type(e).__name__}: {e}", + ) if not isinstance(result, EvaluationRow): raise ValueError( f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test." @@ -455,11 +464,20 @@ async def _execute_groupwise_eval_with_semaphore( run_id=run_id, rollout_ids=group_rollout_ids or None, ): - results = await execute_pytest( - test_func, - processed_dataset=rows, - evaluation_test_kwargs=evaluation_test_kwargs, - ) + try: + results = await execute_pytest( + test_func, + processed_dataset=rows, + evaluation_test_kwargs=evaluation_test_kwargs, + ) + except Exception as e: + results = rows + for row in results: + row.evaluation_result = EvaluateResult( + score=0.0, + is_score_valid=False, + reason=f"Error during evaluation: {type(e).__name__}: {e}", + ) if not isinstance(results, list): raise ValueError( f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test." From 4311a5c2a5220bd3b6f1996863d7a1083569a588 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Wed, 29 Oct 2025 15:10:45 -0700 Subject: [PATCH 2/9] set the eval metadata status as well --- eval_protocol/pytest/evaluation_test.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 2afb13b5..f84d0392 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -371,7 +371,7 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo row.input_metadata.session_data = {} row.input_metadata.session_data["mode"] = mode # Initialize eval_metadata for each row - row.eval_metadata = eval_metadata + row.eval_metadata = eval_metadata.model_copy(deep=True) row.execution_metadata.experiment_id = experiment_id row.execution_metadata.invocation_id = invocation_id @@ -443,6 +443,10 @@ async def _execute_pointwise_eval_with_semaphore( is_score_valid=False, reason=f"Error during evaluation: {type(e).__name__}: {e}", ) + if result.eval_metadata is not None: + result.eval_metadata.status = Status.error( + f"Error during evaluation: {type(e).__name__}: {e}", + ) if not isinstance(result, EvaluationRow): raise ValueError( f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test." @@ -477,7 +481,11 @@ async def _execute_groupwise_eval_with_semaphore( score=0.0, is_score_valid=False, reason=f"Error during evaluation: {type(e).__name__}: {e}", - ) + ) + if row.eval_metadata is not None: + row.eval_metadata.status = Status.error( + f"Error during evaluation: {type(e).__name__}: {e}", + ) if not isinstance(results, list): raise ValueError( f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test." From 10899d8f2ff6dd58093706564542e277322efa41 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Wed, 29 Oct 2025 15:25:53 -0700 Subject: [PATCH 3/9] add --- eval_protocol/pytest/evaluation_test.py | 6 +- eval_protocol/pytest/exception_config.py | 2 +- .../test_pytest_evaluator_error_handling.py | 544 ++++++++++++++++++ 3 files changed, 548 insertions(+), 4 deletions(-) create mode 100644 tests/pytest/test_pytest_evaluator_error_handling.py diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index f84d0392..564fc263 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -478,9 +478,9 @@ async def _execute_groupwise_eval_with_semaphore( results = rows for row in results: row.evaluation_result = EvaluateResult( - score=0.0, - is_score_valid=False, - reason=f"Error during evaluation: {type(e).__name__}: {e}", + score=0.0, + is_score_valid=False, + reason=f"Error during evaluation: {type(e).__name__}: {e}", ) if row.eval_metadata is not None: row.eval_metadata.status = Status.error( diff --git a/eval_protocol/pytest/exception_config.py b/eval_protocol/pytest/exception_config.py index 9ea23710..a05cc342 100644 --- a/eval_protocol/pytest/exception_config.py +++ b/eval_protocol/pytest/exception_config.py @@ -35,7 +35,7 @@ litellm.exceptions.NotFoundError, litellm.exceptions.BadRequestError, # remove this once we have a long term solution litellm.exceptions.ServiceUnavailableError, - litellm.exceptions.APIError + litellm.exceptions.APIError, } diff --git a/tests/pytest/test_pytest_evaluator_error_handling.py b/tests/pytest/test_pytest_evaluator_error_handling.py new file mode 100644 index 00000000..fa25e11d --- /dev/null +++ b/tests/pytest/test_pytest_evaluator_error_handling.py @@ -0,0 +1,544 @@ +""" +Unit tests for evaluator error handling in evaluation_test.py. + +Tests the error handling behavior added in lines 439-449 (pointwise) and +lines 477-488 (groupwise) that catches exceptions during evaluation and +properly sets eval_metadata.status and evaluation_result fields. + +Key behaviors tested: +1. When an exception occurs during evaluation, the exception is caught +2. evaluation_result is set with: + - score=0.0 + - is_score_valid=False + - reason containing "Error during evaluation: {ExceptionType}: {message}" +3. eval_metadata.status is initially set to error, but then: + - Gets overridden to eval_finished() at lines 601-606 if no rollout error + - Gets overridden to score_invalid() in postprocess (lines 92-94) because is_score_valid=False +4. The final state has status.is_score_invalid() == True, with error details preserved in evaluation_result.reason + +""" + +import pytest +from typing_extensions import override +from eval_protocol.models import EvaluationRow, Message, Status, EvaluateResult +from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor +from eval_protocol.dataset_logger.dataset_logger import DatasetLogger + + +class TrackingLogger(DatasetLogger): + """Custom logger that tracks all logged rows for testing.""" + + def __init__(self, rollouts: dict[str, EvaluationRow]): + self.rollouts: dict[str, EvaluationRow] = rollouts + + @override + def log(self, row: EvaluationRow): + if row.execution_metadata.rollout_id is None: + raise ValueError("Rollout ID is None") + self.rollouts[row.execution_metadata.rollout_id] = row + + @override + def read(self, row_id: str | None = None) -> list[EvaluationRow]: + return [] + + +class TestPointwiseEvaluatorErrorHandling: + """Test error handling in pointwise evaluation mode.""" + + async def test_pointwise_evaluation_value_error(self): + """Test that ValueError in evaluation function is properly caught and handled.""" + from eval_protocol.pytest.evaluation_test import evaluation_test + + input_messages = [ + [ + Message( + role="user", + content="Test message", + ), + ] + ] + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_messages=[input_messages], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + num_runs=1, + logger=logger, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + # Simulate an error during evaluation + raise ValueError("Test error in evaluation function") + + # Execute the test + await eval_fn(input_messages=input_messages) # pyright: ignore[reportCallIssue] + + # Verify error handling + assert len(rollouts) == 1 + row = list(rollouts.values())[0] + + # Check evaluation_result was set with error details + assert row.evaluation_result is not None + assert row.evaluation_result.score == 0.0 + assert row.evaluation_result.is_score_valid is False + assert "Error during evaluation: ValueError: Test error in evaluation function" in row.evaluation_result.reason + + # Check eval_metadata.status was set to score_invalid (due to is_score_valid=False in postprocess) + assert row.eval_metadata is not None + assert row.eval_metadata.status is not None + assert row.eval_metadata.status.is_score_invalid() + assert row.eval_metadata.status.message == "Score is invalid" + + async def test_pointwise_evaluation_runtime_error(self): + """Test that RuntimeError in evaluation function is properly caught and handled.""" + from eval_protocol.pytest.evaluation_test import evaluation_test + + input_messages = [ + [Message(role="user", content="Test message")], + ] + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_messages=[input_messages], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + num_runs=1, + logger=logger, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + raise RuntimeError("Runtime error during evaluation") + + await eval_fn(input_messages=input_messages) # pyright: ignore[reportCallIssue] + + # Verify error handling + assert len(rollouts) == 1 + row = list(rollouts.values())[0] + + # Check error type is included in reason + assert row.evaluation_result is not None + assert "RuntimeError" in row.evaluation_result.reason + # Status will be score_invalid (not error) due to postprocess override + assert row.eval_metadata is not None + assert row.eval_metadata.status is not None + assert row.eval_metadata.status.is_score_invalid() + + async def test_pointwise_evaluation_multiple_runs_with_errors(self): + """Test that errors are handled consistently across multiple runs.""" + from eval_protocol.pytest.evaluation_test import evaluation_test + + input_messages = [ + [Message(role="user", content="Test message")], + ] + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_messages=[input_messages], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + num_runs=3, + logger=logger, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + raise ValueError("Consistent error") + + await eval_fn(input_messages=input_messages) # pyright: ignore[reportCallIssue] + + # Verify all runs have error handling + assert len(rollouts) == 3 + for row in rollouts.values(): + assert row.evaluation_result is not None + assert row.evaluation_result.score == 0.0 + assert row.evaluation_result.is_score_valid is False + assert "ValueError" in row.evaluation_result.reason + # Status will be score_invalid due to postprocess + assert row.eval_metadata is not None + assert row.eval_metadata.status is not None + assert row.eval_metadata.status.is_score_invalid() + + async def test_pointwise_evaluation_custom_exception(self): + """Test handling of custom exception types.""" + from eval_protocol.pytest.evaluation_test import evaluation_test + + class CustomEvaluationError(Exception): + """Custom exception for testing.""" + + pass + + input_messages = [ + [Message(role="user", content="Test message")], + ] + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_messages=[input_messages], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + num_runs=1, + logger=logger, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + raise CustomEvaluationError("Custom error with details") + + await eval_fn(input_messages=input_messages) # pyright: ignore[reportCallIssue] + + # Verify custom exception is properly handled + assert len(rollouts) == 1 + row = list(rollouts.values())[0] + + assert row.evaluation_result is not None + assert "CustomEvaluationError" in row.evaluation_result.reason + assert "Custom error with details" in row.evaluation_result.reason + # Status will be score_invalid due to postprocess + assert row.eval_metadata is not None + assert row.eval_metadata.status is not None + assert row.eval_metadata.status.is_score_invalid() + + async def test_pointwise_evaluation_error_with_multiline_message(self): + """Test handling of errors with multiline error messages.""" + from eval_protocol.pytest.evaluation_test import evaluation_test + + input_messages = [ + [Message(role="user", content="Test message")], + ] + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_messages=[input_messages], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + num_runs=1, + logger=logger, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + raise ValueError("Line 1\nLine 2\nLine 3") + + await eval_fn(input_messages=input_messages) # pyright: ignore[reportCallIssue] + + # Verify multiline error message is captured + assert len(rollouts) == 1 + row = list(rollouts.values())[0] + + assert row.evaluation_result is not None + assert "Line 1\nLine 2\nLine 3" in row.evaluation_result.reason + + +class TestGroupwiseEvaluatorErrorHandling: + """Test error handling in groupwise evaluation mode.""" + + async def test_groupwise_evaluation_value_error(self): + """Test that ValueError in groupwise evaluation function is properly caught and handled.""" + from eval_protocol.pytest.evaluation_test import evaluation_test + + input_messages = [ + [Message(role="user", content="Test message")], + ] + + # Groupwise mode requires at least 2 completion_params + completion_params_list = [ + {"model": "test/model-1"}, + {"model": "test/model-2"}, + ] + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_messages=[input_messages], + completion_params=completion_params_list, + rollout_processor=NoOpRolloutProcessor(), + mode="groupwise", + num_runs=1, + logger=logger, + ) + def eval_fn(rows: list[EvaluationRow]) -> list[EvaluationRow]: + # Simulate an error during groupwise evaluation + raise ValueError("Test error in groupwise evaluation") + + # Execute the test - groupwise mode groups all completion params together + await eval_fn(input_messages=input_messages, completion_params=completion_params_list[0]) # pyright: ignore[reportCallIssue] + + # Verify error handling - groupwise should have rows for all completion params + assert len(rollouts) > 0 + + # Check that all rows have proper error handling + for row in rollouts.values(): + if row.evaluation_result is not None: + assert row.evaluation_result.score == 0.0 + assert row.evaluation_result.is_score_valid is False + assert ( + "Error during evaluation: ValueError: Test error in groupwise evaluation" + in row.evaluation_result.reason + ) + + # Status will be score_invalid due to postprocess + assert row.eval_metadata is not None + assert row.eval_metadata.status is not None + assert row.eval_metadata.status.is_score_invalid() + + async def test_groupwise_evaluation_runtime_error(self): + """Test that RuntimeError in groupwise evaluation function is properly caught and handled.""" + from eval_protocol.pytest.evaluation_test import evaluation_test + + input_messages = [ + [Message(role="user", content="Test message")], + ] + + # Groupwise mode requires at least 2 completion_params + completion_params_list = [ + {"model": "test/model-1"}, + {"model": "test/model-2"}, + ] + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_messages=[input_messages], + completion_params=completion_params_list, + rollout_processor=NoOpRolloutProcessor(), + mode="groupwise", + num_runs=1, + logger=logger, + ) + def eval_fn(rows: list[EvaluationRow]) -> list[EvaluationRow]: + raise RuntimeError("Runtime error during groupwise evaluation") + + await eval_fn(input_messages=input_messages, completion_params=completion_params_list[0]) # pyright: ignore[reportCallIssue] + + # Verify error handling + assert len(rollouts) > 0 + + for row in rollouts.values(): + if row.evaluation_result is not None: + assert "RuntimeError" in row.evaluation_result.reason + # Status will be score_invalid due to postprocess + assert row.eval_metadata is not None + assert row.eval_metadata.status is not None + assert row.eval_metadata.status.is_score_invalid() + + +class TestEvaluatorErrorHandlingEdgeCases: + """Test edge cases for evaluator error handling.""" + + async def test_evaluation_error_with_missing_eval_metadata(self): + """Test error handling when eval_metadata is None (shouldn't happen but defensive).""" + from eval_protocol.pytest.evaluation_test import evaluation_test + + input_messages = [ + [Message(role="user", content="Test message")], + ] + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_messages=[input_messages], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + num_runs=1, + logger=logger, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + # Manually set eval_metadata to None to test defensive handling + row.eval_metadata = None + raise ValueError("Error with missing eval_metadata") + + await eval_fn(input_messages=input_messages) # pyright: ignore[reportCallIssue] + + # Verify error handling doesn't crash even without eval_metadata + assert len(rollouts) == 1 + row = list(rollouts.values())[0] + + # evaluation_result should still be set + assert row.evaluation_result is not None + assert row.evaluation_result.score == 0.0 + assert row.evaluation_result.is_score_valid is False + + async def test_evaluation_error_preserves_row_data(self): + """Test that error handling preserves existing row data.""" + from eval_protocol.pytest.evaluation_test import evaluation_test + + input_messages = [ + [Message(role="user", content="Original message")], + ] + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_messages=[input_messages], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + num_runs=1, + logger=logger, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + # Add some data to the row before error + row.messages.append(Message(role="assistant", content="Response")) + raise ValueError("Error after modifying row") + + await eval_fn(input_messages=input_messages) # pyright: ignore[reportCallIssue] + + # Verify row data is preserved + assert len(rollouts) == 1 + row = list(rollouts.values())[0] + + # Original messages should still be there + assert len(row.messages) >= 1 + assert any(msg.content == "Original message" for msg in row.messages if msg.content) + + async def test_evaluation_error_with_empty_exception_message(self): + """Test handling of exceptions with empty error messages.""" + from eval_protocol.pytest.evaluation_test import evaluation_test + + input_messages = [ + [Message(role="user", content="Test message")], + ] + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_messages=[input_messages], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + num_runs=1, + logger=logger, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + raise ValueError("") # Empty error message + + await eval_fn(input_messages=input_messages) # pyright: ignore[reportCallIssue] + + # Verify error handling works with empty message + assert len(rollouts) == 1 + row = list(rollouts.values())[0] + + assert row.evaluation_result is not None + assert "ValueError" in row.evaluation_result.reason # Should at least have the exception type + + +class TestEvaluatorErrorHandlingWithInputRows: + """Test error handling when using input_rows parameter.""" + + async def test_evaluation_error_with_input_rows(self): + """Test error handling works correctly with input_rows parameter.""" + from eval_protocol.pytest.evaluation_test import evaluation_test + + # Create pre-constructed EvaluationRow + input_row = EvaluationRow( + messages=[ + Message(role="user", content="Test from input_rows"), + ] + ) + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_rows=[[input_row]], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + num_runs=1, + logger=logger, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + raise ValueError("Error with input_rows") + + await eval_fn(input_rows=[input_row]) # pyright: ignore[reportCallIssue] + + # Verify error handling + assert len(rollouts) == 1 + row = list(rollouts.values())[0] + + assert row.evaluation_result is not None + assert row.evaluation_result.score == 0.0 + assert row.evaluation_result.is_score_valid is False + assert "ValueError" in row.evaluation_result.reason + # Status will be score_invalid due to postprocess + assert row.eval_metadata is not None + assert row.eval_metadata.status is not None + assert row.eval_metadata.status.is_score_invalid() + + +class TestEvaluatorErrorHandlingStatusCodes: + """Test that Status codes are correctly set for different error scenarios.""" + + async def test_error_status_uses_score_invalid_code(self): + """Test that error status uses Status.Code.SCORE_INVALID due to postprocess.""" + from eval_protocol.pytest.evaluation_test import evaluation_test + + input_messages = [ + [Message(role="user", content="Test message")], + ] + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_messages=[input_messages], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + num_runs=1, + logger=logger, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + raise ValueError("Test error") + + await eval_fn(input_messages=input_messages) # pyright: ignore[reportCallIssue] + + assert len(rollouts) == 1 + row = list(rollouts.values())[0] + + # Verify status code is SCORE_INVALID (102) after postprocess + assert row.eval_metadata is not None + assert row.eval_metadata.status is not None + assert row.eval_metadata.status.code == Status.Code.SCORE_INVALID + + async def test_evaluation_result_reason_format(self): + """Test that evaluation_result.reason contains the error details.""" + from eval_protocol.pytest.evaluation_test import evaluation_test + + input_messages = [ + [Message(role="user", content="Test message")], + ] + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_messages=[input_messages], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + num_runs=1, + logger=logger, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + raise KeyError("missing_key") + + await eval_fn(input_messages=input_messages) # pyright: ignore[reportCallIssue] + + assert len(rollouts) == 1 + row = list(rollouts.values())[0] + + # Verify reason format in evaluation_result: "Error during evaluation: ExceptionType: message" + assert row.evaluation_result is not None + reason = row.evaluation_result.reason + assert reason.startswith("Error during evaluation: ") + assert "KeyError" in reason + assert "missing_key" in reason + + # Status will be score_invalid, not containing the error details + assert row.eval_metadata is not None + assert row.eval_metadata.status is not None + assert row.eval_metadata.status.is_score_invalid() From 4298470b002f5403cc08a46a57b0903d957bcef0 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Wed, 29 Oct 2025 15:27:54 -0700 Subject: [PATCH 4/9] lint --- .../test_pytest_evaluator_error_handling.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/tests/pytest/test_pytest_evaluator_error_handling.py b/tests/pytest/test_pytest_evaluator_error_handling.py index fa25e11d..9847aa51 100644 --- a/tests/pytest/test_pytest_evaluator_error_handling.py +++ b/tests/pytest/test_pytest_evaluator_error_handling.py @@ -83,7 +83,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: assert row.evaluation_result is not None assert row.evaluation_result.score == 0.0 assert row.evaluation_result.is_score_valid is False - assert "Error during evaluation: ValueError: Test error in evaluation function" in row.evaluation_result.reason + assert "Error during evaluation: ValueError: Test error in evaluation function" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] # Check eval_metadata.status was set to score_invalid (due to is_score_valid=False in postprocess) assert row.eval_metadata is not None @@ -120,7 +120,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: # Check error type is included in reason assert row.evaluation_result is not None - assert "RuntimeError" in row.evaluation_result.reason + assert "RuntimeError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] # Status will be score_invalid (not error) due to postprocess override assert row.eval_metadata is not None assert row.eval_metadata.status is not None @@ -155,7 +155,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: assert row.evaluation_result is not None assert row.evaluation_result.score == 0.0 assert row.evaluation_result.is_score_valid is False - assert "ValueError" in row.evaluation_result.reason + assert "ValueError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] # Status will be score_invalid due to postprocess assert row.eval_metadata is not None assert row.eval_metadata.status is not None @@ -194,8 +194,8 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: row = list(rollouts.values())[0] assert row.evaluation_result is not None - assert "CustomEvaluationError" in row.evaluation_result.reason - assert "Custom error with details" in row.evaluation_result.reason + assert "CustomEvaluationError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] + assert "Custom error with details" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] # Status will be score_invalid due to postprocess assert row.eval_metadata is not None assert row.eval_metadata.status is not None @@ -229,7 +229,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: row = list(rollouts.values())[0] assert row.evaluation_result is not None - assert "Line 1\nLine 2\nLine 3" in row.evaluation_result.reason + assert "Line 1\nLine 2\nLine 3" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] class TestGroupwiseEvaluatorErrorHandling: @@ -277,7 +277,7 @@ def eval_fn(rows: list[EvaluationRow]) -> list[EvaluationRow]: assert row.evaluation_result.is_score_valid is False assert ( "Error during evaluation: ValueError: Test error in groupwise evaluation" - in row.evaluation_result.reason + in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] ) # Status will be score_invalid due to postprocess @@ -320,7 +320,7 @@ def eval_fn(rows: list[EvaluationRow]) -> list[EvaluationRow]: for row in rollouts.values(): if row.evaluation_result is not None: - assert "RuntimeError" in row.evaluation_result.reason + assert "RuntimeError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] # Status will be score_invalid due to postprocess assert row.eval_metadata is not None assert row.eval_metadata.status is not None @@ -425,7 +425,9 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: row = list(rollouts.values())[0] assert row.evaluation_result is not None - assert "ValueError" in row.evaluation_result.reason # Should at least have the exception type + assert ( + "ValueError" in row.evaluation_result.reason + ) # Should at least have the exception type # pyright: ignore[reportOperatorIssue] class TestEvaluatorErrorHandlingWithInputRows: @@ -464,7 +466,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: assert row.evaluation_result is not None assert row.evaluation_result.score == 0.0 assert row.evaluation_result.is_score_valid is False - assert "ValueError" in row.evaluation_result.reason + assert "ValueError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] # Status will be score_invalid due to postprocess assert row.eval_metadata is not None assert row.eval_metadata.status is not None @@ -534,9 +536,10 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: # Verify reason format in evaluation_result: "Error during evaluation: ExceptionType: message" assert row.evaluation_result is not None reason = row.evaluation_result.reason + assert reason is not None assert reason.startswith("Error during evaluation: ") - assert "KeyError" in reason - assert "missing_key" in reason + assert "KeyError" in reason # pyright: ignore[reportOperatorIssue] + assert "missing_key" in reason # pyright: ignore[reportOperatorIssue] # Status will be score_invalid, not containing the error details assert row.eval_metadata is not None From 23cfde52bb7fab181f4b439744d8c78cb0d0c7c6 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Wed, 29 Oct 2025 15:29:53 -0700 Subject: [PATCH 5/9] add --- eval_protocol/pytest/exception_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/eval_protocol/pytest/exception_config.py b/eval_protocol/pytest/exception_config.py index a05cc342..209a178b 100644 --- a/eval_protocol/pytest/exception_config.py +++ b/eval_protocol/pytest/exception_config.py @@ -33,7 +33,6 @@ litellm.exceptions.InternalServerError, litellm.exceptions.Timeout, litellm.exceptions.NotFoundError, - litellm.exceptions.BadRequestError, # remove this once we have a long term solution litellm.exceptions.ServiceUnavailableError, litellm.exceptions.APIError, } From f29823404e596db7628b7ff5fad987e5d898a133 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Wed, 29 Oct 2025 16:13:31 -0700 Subject: [PATCH 6/9] add --- eval_protocol/pytest/evaluation_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 564fc263..3ea1c853 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -602,7 +602,8 @@ async def _collect_result(config, lst): r.eval_metadata.status = Status.error( r.rollout_status.message, r.rollout_status.details ) - else: + elif not (r.eval_metadata.status and r.eval_metadata.status.code != Status.Code.RUNNING): + # if the eval_metadata status code has not been set to something else, consider it as finished r.eval_metadata.status = Status.eval_finished() # Optional debug print for assistant/tool sequence if os.getenv("EP_DEBUG_SERIALIZATION", "0").strip() == "1": From aaf858b430ca7c995ba7245dadb27c3dbef18c57 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Wed, 29 Oct 2025 16:34:06 -0700 Subject: [PATCH 7/9] avoid override --- eval_protocol/pytest/evaluation_test_postprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eval_protocol/pytest/evaluation_test_postprocess.py b/eval_protocol/pytest/evaluation_test_postprocess.py index 7fa7e3fe..a1764da5 100644 --- a/eval_protocol/pytest/evaluation_test_postprocess.py +++ b/eval_protocol/pytest/evaluation_test_postprocess.py @@ -91,7 +91,8 @@ def postprocess( result.evaluation_result.standard_error = standard_error if result.evaluation_result.is_score_valid is False: if result.eval_metadata is not None: - result.eval_metadata.status = Status.score_invalid() + if not result.eval_metadata.status or not result.eval_metadata.status.is_error(): + result.eval_metadata.status = Status.score_invalid() result.execution_metadata.experiment_duration_seconds = experiment_duration_seconds active_logger.log(result) From 59a1133e64af785206e31185308bf87181eaea5d Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Wed, 29 Oct 2025 21:00:05 -0700 Subject: [PATCH 8/9] fix ut --- eval_protocol/pytest/evaluation_test.py | 4 +- .../test_pytest_evaluator_error_handling.py | 47 ++++++++++--------- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 3ea1c853..857765d3 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -602,7 +602,9 @@ async def _collect_result(config, lst): r.eval_metadata.status = Status.error( r.rollout_status.message, r.rollout_status.details ) - elif not (r.eval_metadata.status and r.eval_metadata.status.code != Status.Code.RUNNING): + elif not ( + r.eval_metadata.status and r.eval_metadata.status.code != Status.Code.RUNNING + ): # if the eval_metadata status code has not been set to something else, consider it as finished r.eval_metadata.status = Status.eval_finished() # Optional debug print for assistant/tool sequence diff --git a/tests/pytest/test_pytest_evaluator_error_handling.py b/tests/pytest/test_pytest_evaluator_error_handling.py index 9847aa51..70861679 100644 --- a/tests/pytest/test_pytest_evaluator_error_handling.py +++ b/tests/pytest/test_pytest_evaluator_error_handling.py @@ -85,11 +85,14 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: assert row.evaluation_result.is_score_valid is False assert "Error during evaluation: ValueError: Test error in evaluation function" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] - # Check eval_metadata.status was set to score_invalid (due to is_score_valid=False in postprocess) + # Check eval_metadata.status was set to error and is preserved (not overridden by postprocess) assert row.eval_metadata is not None assert row.eval_metadata.status is not None - assert row.eval_metadata.status.is_score_invalid() - assert row.eval_metadata.status.message == "Score is invalid" + assert row.eval_metadata.status.is_error() + assert ( + "Error during evaluation: ValueError: Test error in evaluation function" + in row.eval_metadata.status.message + ) async def test_pointwise_evaluation_runtime_error(self): """Test that RuntimeError in evaluation function is properly caught and handled.""" @@ -121,10 +124,10 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: # Check error type is included in reason assert row.evaluation_result is not None assert "RuntimeError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] - # Status will be score_invalid (not error) due to postprocess override + # Status will be error and preserved (not overridden by postprocess) assert row.eval_metadata is not None assert row.eval_metadata.status is not None - assert row.eval_metadata.status.is_score_invalid() + assert row.eval_metadata.status.is_error() async def test_pointwise_evaluation_multiple_runs_with_errors(self): """Test that errors are handled consistently across multiple runs.""" @@ -156,10 +159,10 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: assert row.evaluation_result.score == 0.0 assert row.evaluation_result.is_score_valid is False assert "ValueError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] - # Status will be score_invalid due to postprocess + # Status will be error and preserved assert row.eval_metadata is not None assert row.eval_metadata.status is not None - assert row.eval_metadata.status.is_score_invalid() + assert row.eval_metadata.status.is_error() async def test_pointwise_evaluation_custom_exception(self): """Test handling of custom exception types.""" @@ -196,10 +199,10 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: assert row.evaluation_result is not None assert "CustomEvaluationError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] assert "Custom error with details" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] - # Status will be score_invalid due to postprocess + # Status will be error and preserved assert row.eval_metadata is not None assert row.eval_metadata.status is not None - assert row.eval_metadata.status.is_score_invalid() + assert row.eval_metadata.status.is_error() async def test_pointwise_evaluation_error_with_multiline_message(self): """Test handling of errors with multiline error messages.""" @@ -280,10 +283,10 @@ def eval_fn(rows: list[EvaluationRow]) -> list[EvaluationRow]: in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] ) - # Status will be score_invalid due to postprocess + # Status will be error and preserved assert row.eval_metadata is not None assert row.eval_metadata.status is not None - assert row.eval_metadata.status.is_score_invalid() + assert row.eval_metadata.status.is_error() async def test_groupwise_evaluation_runtime_error(self): """Test that RuntimeError in groupwise evaluation function is properly caught and handled.""" @@ -321,10 +324,10 @@ def eval_fn(rows: list[EvaluationRow]) -> list[EvaluationRow]: for row in rollouts.values(): if row.evaluation_result is not None: assert "RuntimeError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] - # Status will be score_invalid due to postprocess + # Status will be error and preserved assert row.eval_metadata is not None assert row.eval_metadata.status is not None - assert row.eval_metadata.status.is_score_invalid() + assert row.eval_metadata.status.is_error() class TestEvaluatorErrorHandlingEdgeCases: @@ -467,17 +470,17 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: assert row.evaluation_result.score == 0.0 assert row.evaluation_result.is_score_valid is False assert "ValueError" in row.evaluation_result.reason # pyright: ignore[reportOperatorIssue] - # Status will be score_invalid due to postprocess + # Status will be error and preserved assert row.eval_metadata is not None assert row.eval_metadata.status is not None - assert row.eval_metadata.status.is_score_invalid() + assert row.eval_metadata.status.is_error() class TestEvaluatorErrorHandlingStatusCodes: """Test that Status codes are correctly set for different error scenarios.""" - async def test_error_status_uses_score_invalid_code(self): - """Test that error status uses Status.Code.SCORE_INVALID due to postprocess.""" + async def test_error_status_uses_internal_code(self): + """Test that error status uses Status.Code.INTERNAL and is preserved.""" from eval_protocol.pytest.evaluation_test import evaluation_test input_messages = [ @@ -502,10 +505,11 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: assert len(rollouts) == 1 row = list(rollouts.values())[0] - # Verify status code is SCORE_INVALID (102) after postprocess + # Verify status code is INTERNAL (13) and preserved (not overridden by postprocess) assert row.eval_metadata is not None assert row.eval_metadata.status is not None - assert row.eval_metadata.status.code == Status.Code.SCORE_INVALID + assert row.eval_metadata.status.code == Status.Code.INTERNAL + assert row.eval_metadata.status.is_error() async def test_evaluation_result_reason_format(self): """Test that evaluation_result.reason contains the error details.""" @@ -541,7 +545,8 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: assert "KeyError" in reason # pyright: ignore[reportOperatorIssue] assert "missing_key" in reason # pyright: ignore[reportOperatorIssue] - # Status will be score_invalid, not containing the error details + # Status will be error and preserved assert row.eval_metadata is not None assert row.eval_metadata.status is not None - assert row.eval_metadata.status.is_score_invalid() + assert row.eval_metadata.status.is_error() + assert "KeyError" in row.eval_metadata.status.message From 138d537a1db9d709fe9d5cf7d7869ad7c156e3c0 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Thu, 30 Oct 2025 00:30:22 -0700 Subject: [PATCH 9/9] add --- eval_protocol/pytest/exception_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/eval_protocol/pytest/exception_config.py b/eval_protocol/pytest/exception_config.py index 209a178b..c8ccaf8e 100644 --- a/eval_protocol/pytest/exception_config.py +++ b/eval_protocol/pytest/exception_config.py @@ -33,6 +33,7 @@ litellm.exceptions.InternalServerError, litellm.exceptions.Timeout, litellm.exceptions.NotFoundError, + litellm.exceptions.BadRequestError, litellm.exceptions.ServiceUnavailableError, litellm.exceptions.APIError, }