diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 2afb13b5..0ec07e67 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -19,7 +19,6 @@ EvaluationRow, EvaluationThreshold, EvaluationThresholdDict, - EvaluateResult, Status, ) from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper @@ -430,19 +429,11 @@ async def _execute_pointwise_eval_with_semaphore( experiment_id=experiment_id, run_id=run_id, ): - try: - result = await execute_pytest( - test_func, - processed_row=row, - evaluation_test_kwargs=evaluation_test_kwargs, - ) - except Exception as e: - result = row - result.evaluation_result = EvaluateResult( - score=0.0, - is_score_valid=False, - reason=f"Error during evaluation: {type(e).__name__}: {e}", - ) + result = await execute_pytest( + test_func, + processed_row=row, + evaluation_test_kwargs=evaluation_test_kwargs, + ) if not isinstance(result, EvaluationRow): raise ValueError( f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test." @@ -464,20 +455,11 @@ async def _execute_groupwise_eval_with_semaphore( run_id=run_id, rollout_ids=group_rollout_ids or None, ): - try: - results = await execute_pytest( - test_func, - processed_dataset=rows, - evaluation_test_kwargs=evaluation_test_kwargs, - ) - except Exception as e: - results = rows - for row in results: - row.evaluation_result = EvaluateResult( - score=0.0, - is_score_valid=False, - reason=f"Error during evaluation: {type(e).__name__}: {e}", - ) + results = await execute_pytest( + test_func, + processed_dataset=rows, + evaluation_test_kwargs=evaluation_test_kwargs, + ) if not isinstance(results, list): raise ValueError( f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."