2424)
2525from eval_protocol .pytest .dual_mode_wrapper import create_dual_mode_wrapper
2626from eval_protocol .pytest .evaluation_test_postprocess import postprocess
27- from eval_protocol .pytest .execution import execute_pytest
27+ from eval_protocol .pytest .execution import execute_pytest , execute_pytest_with_exception_handling
2828from eval_protocol .pytest .generate_parameter_combinations import (
2929 ParameterizedTestKwargs ,
3030 generate_parameter_combinations ,
@@ -434,29 +434,11 @@ async def _execute_pointwise_eval_with_semaphore(
434434 experiment_id = experiment_id ,
435435 run_id = run_id ,
436436 ):
437- try :
438- result = await execute_pytest (
439- test_func ,
440- processed_row = row ,
441- evaluation_test_kwargs = evaluation_test_kwargs ,
442- )
443- except AssertionError :
444- raise
445- except Exception as e :
446- # Default: capture non-assert exceptions unless explicitly disabled
447- if os .getenv ("EP_RAISE_EVAL_EXCEPTIONS" , "false" ).strip () == "false" :
448- result = row
449- result .evaluation_result = EvaluateResult (
450- score = 0.0 ,
451- is_score_valid = False ,
452- reason = f"Error during evaluation: { type (e ).__name__ } : { e } " ,
453- )
454- if result .eval_metadata is not None :
455- result .eval_metadata .status = Status .error (
456- f"Error during evaluation: { type (e ).__name__ } : { e } " ,
457- )
458- else :
459- raise
437+ result = await execute_pytest_with_exception_handling (
438+ test_func = test_func ,
439+ evaluation_test_kwargs = evaluation_test_kwargs ,
440+ processed_row = row ,
441+ )
460442 if not isinstance (result , EvaluationRow ):
461443 raise ValueError (
462444 f"Test function { test_func .__name__ } did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
@@ -478,30 +460,11 @@ async def _execute_groupwise_eval_with_semaphore(
478460 run_id = run_id ,
479461 rollout_ids = group_rollout_ids or None ,
480462 ):
481- try :
482- results = await execute_pytest (
483- test_func ,
484- processed_dataset = rows ,
485- evaluation_test_kwargs = evaluation_test_kwargs ,
486- )
487- except AssertionError :
488- raise
489- except Exception as e :
490- # Default: capture non-assert exceptions unless explicitly disabled
491- if os .getenv ("EP_RAISE_EVAL_EXCEPTIONS" , "false" ).strip () == "false" :
492- results = rows
493- for row in results :
494- row .evaluation_result = EvaluateResult (
495- score = 0.0 ,
496- is_score_valid = False ,
497- reason = f"Error during evaluation: { type (e ).__name__ } : { e } " ,
498- )
499- if row .eval_metadata is not None :
500- row .eval_metadata .status = Status .error (
501- f"Error during evaluation: { type (e ).__name__ } : { e } " ,
502- )
503- else :
504- raise
463+ results = await execute_pytest_with_exception_handling (
464+ test_func = test_func ,
465+ evaluation_test_kwargs = evaluation_test_kwargs ,
466+ processed_dataset = rows ,
467+ )
505468 if not isinstance (results , list ):
506469 raise ValueError (
507470 f"Test function { test_func .__name__ } did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
@@ -592,30 +555,11 @@ async def _collect_result(config, lst):
592555 run_id = run_id ,
593556 rollout_ids = group_rollout_ids or None ,
594557 ):
595- try :
596- results = await execute_pytest (
597- test_func ,
598- processed_dataset = input_dataset ,
599- evaluation_test_kwargs = kwargs .get ("evaluation_test_kwargs" ) or {},
600- )
601- except AssertionError :
602- raise
603- except Exception as e :
604- # Default: capture non-assert exceptions unless explicitly disabled
605- if os .getenv ("EP_RAISE_EVAL_EXCEPTIONS" , "false" ).strip () == "false" :
606- results = input_dataset
607- for row in results :
608- row .evaluation_result = EvaluateResult (
609- score = 0.0 ,
610- is_score_valid = False ,
611- reason = f"Error during evaluation: { type (e ).__name__ } : { e } " ,
612- )
613- if row .eval_metadata is not None :
614- row .eval_metadata .status = Status .error (
615- f"Error during evaluation: { type (e ).__name__ } : { e } " ,
616- )
617- else :
618- raise
558+ results = await execute_pytest_with_exception_handling (
559+ test_func = test_func ,
560+ evaluation_test_kwargs = kwargs .get ("evaluation_test_kwargs" ) or {},
561+ processed_dataset = input_dataset ,
562+ )
619563 if (
620564 results is None
621565 or not isinstance (results , list )
0 commit comments