From f29110db8734f6db2eda3262a39c312605107320 Mon Sep 17 00:00:00 2001
From: Derek Xu
Date: Wed, 5 Nov 2025 01:28:19 -0800
Subject: [PATCH 01/10] export ep params

---
 eval_protocol/pytest/evaluation_test.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 857765d3..980cd58d 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -704,11 +704,22 @@ async def _collect_result(config, lst):
         )
         pytest_wrapper = pytest.mark.asyncio(pytest_wrapper)

+        ep_params: dict[str, Any] = {
+            "rollout_processor": rollout_processor,
+            "server_script_path": server_script_path,
+            "mcp_config_path": mcp_config_path,
+            "rollout_processor_kwargs": rollout_processor_kwargs,
+            "mode": mode,
+        }
+
+        print(f"ep_params: {ep_params}")
+
         # Create the dual mode wrapper
         dual_mode_wrapper = create_dual_mode_wrapper(
             test_func, mode, max_concurrent_rollouts, max_concurrent_evaluations, pytest_wrapper
         )
+        setattr(dual_mode_wrapper, "__ep_params__", ep_params)

         return dual_mode_wrapper  # pyright: ignore[reportReturnType, reportUnknownVariableType]

     return decorator

From 6b9f1331a8ab25d0f057d3426a77849cb1b0a2f0 Mon Sep 17 00:00:00 2001
From: Derek Xu
Date: Wed, 5 Nov 2025 01:57:48 -0800
Subject: [PATCH 02/10] fix server path

---
 eval_protocol/benchmarks/test_frozen_lake.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eval_protocol/benchmarks/test_frozen_lake.py b/eval_protocol/benchmarks/test_frozen_lake.py
index c3b1684f..ac5c998a 100644
--- a/eval_protocol/benchmarks/test_frozen_lake.py
+++ b/eval_protocol/benchmarks/test_frozen_lake.py
@@ -46,7 +46,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
     num_runs=1,
     max_concurrent_rollouts=3,
     mode="pointwise",
-    server_script_path="examples/frozen_lake_mcp/server.py",
+    server_script_path="eval_protocol/mcp_servers/frozen_lake/server.py",
 )
 def test_frozen_lake_evaluation(row: EvaluationRow) -> EvaluationRow:
     """

From bf126c18b0695d11b5a0c7b89bea9336886b67eb Mon Sep 17 00:00:00 2001
From: Derek Xu
Date: Wed, 5 Nov 2025 02:45:58 -0800
Subject: [PATCH 03/10] remove print statement

---
 eval_protocol/pytest/evaluation_test.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 980cd58d..0293cbfc 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -712,8 +712,6 @@ async def _collect_result(config, lst):
             "mode": mode,
         }

-        print(f"ep_params: {ep_params}")
-
         # Create the dual mode wrapper
         dual_mode_wrapper = create_dual_mode_wrapper(
             test_func, mode, max_concurrent_rollouts, max_concurrent_evaluations, pytest_wrapper
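The __ep_params__ attribute exported above (and removed again in patch 07) is meant to be read back off the decorated wrapper. A minimal sketch of that introspection, assuming a module my_evals that exposes an @evaluation_test-decorated function; the module name and import are hypothetical:

    # Hypothetical consumer: inspect the decorator parameters exported on
    # the wrapper. `my_evals` stands in for any module holding a test
    # decorated with @evaluation_test at the patch-03 state of the branch.
    from my_evals import test_frozen_lake_evaluation

    params = getattr(test_frozen_lake_evaluation, "__ep_params__", {})
    print(params.get("mode"))                # e.g. "pointwise"
    print(params.get("server_script_path"))  # e.g. "eval_protocol/mcp_servers/frozen_lake/server.py"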
From d5ea771efbd8a0142c773d2b7bab7d07bae1457e Mon Sep 17 00:00:00 2001
From: Derek Xu
Date: Thu, 6 Nov 2025 22:58:06 -0800
Subject: [PATCH 04/10] raise on assert

---
 eval_protocol/pytest/evaluation_test.py | 50 +++++++++++++++----------
 1 file changed, 31 insertions(+), 19 deletions(-)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 0293cbfc..5153cb29 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -436,17 +436,23 @@ async def _execute_pointwise_eval_with_semaphore(
                     processed_row=row,
                     evaluation_test_kwargs=evaluation_test_kwargs,
                 )
+            except AssertionError:
+                raise
             except Exception as e:
-                result = row
-                result.evaluation_result = EvaluateResult(
-                    score=0.0,
-                    is_score_valid=False,
-                    reason=f"Error during evaluation: {type(e).__name__}: {e}",
-                )
-                if result.eval_metadata is not None:
-                    result.eval_metadata.status = Status.error(
-                        f"Error during evaluation: {type(e).__name__}: {e}",
+                # Default: capture non-assert exceptions unless explicitly disabled
+                if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "1").strip() == "1":
+                    result = row
+                    result.evaluation_result = EvaluateResult(
+                        score=0.0,
+                        is_score_valid=False,
+                        reason=f"Error during evaluation: {type(e).__name__}: {e}",
                     )
+                    if result.eval_metadata is not None:
+                        result.eval_metadata.status = Status.error(
+                            f"Error during evaluation: {type(e).__name__}: {e}",
+                        )
+                else:
+                    raise
         if not isinstance(result, EvaluationRow):
             raise ValueError(
                 f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
@@ -474,18 +480,24 @@ async def _execute_groupwise_eval_with_semaphore(
                     processed_dataset=rows,
                     evaluation_test_kwargs=evaluation_test_kwargs,
                 )
+            except AssertionError:
+                raise
             except Exception as e:
-                results = rows
-                for row in results:
-                    row.evaluation_result = EvaluateResult(
-                        score=0.0,
-                        is_score_valid=False,
-                        reason=f"Error during evaluation: {type(e).__name__}: {e}",
-                    )
-                    if row.eval_metadata is not None:
-                        row.eval_metadata.status = Status.error(
-                            f"Error during evaluation: {type(e).__name__}: {e}",
+                # Default: capture non-assert exceptions unless explicitly disabled
+                if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "1").strip() == "1":
+                    results = rows
+                    for row in results:
+                        row.evaluation_result = EvaluateResult(
+                            score=0.0,
+                            is_score_valid=False,
+                            reason=f"Error during evaluation: {type(e).__name__}: {e}",
                         )
+                        if row.eval_metadata is not None:
+                            row.eval_metadata.status = Status.error(
+                                f"Error during evaluation: {type(e).__name__}: {e}",
+                            )
+                else:
+                    raise
         if not isinstance(results, list):
             raise ValueError(
                 f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
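The guard added here always propagates AssertionError (so pytest assertions still fail the run) while any other exception is converted into a zero-score result, unless EP_CAPTURE_EVAL_EXCEPTIONS is set to something other than "1". A runnable, self-contained sketch of the same control flow; run_guarded and the plain dict row are illustrative stand-ins, not part of eval_protocol:

    import os

    # Sketch of the patch-04 guard in isolation: asserts propagate, other
    # exceptions become a zero-score result when capture mode is on.
    def run_guarded(eval_fn, row):
        try:
            return eval_fn(row)
        except AssertionError:
            raise  # assertion failures always fail the test run
        except Exception as e:
            if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "1").strip() == "1":
                row["score"] = 0.0
                row["reason"] = f"Error during evaluation: {type(e).__name__}: {e}"
                return row
            raise

    row = run_guarded(lambda r: 1 / 0, {"score": None, "reason": None})
    print(row["reason"])  # Error during evaluation: ZeroDivisionError: division by zero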
From aa1ce37d3f965a55785038155bc930d456c4e096 Mon Sep 17 00:00:00 2001
From: Derek Xu
Date: Thu, 6 Nov 2025 23:04:37 -0800
Subject: [PATCH 05/10] use true/false

---
 eval_protocol/pytest/evaluation_test.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 1f242f05..581dcb78 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -444,7 +444,7 @@ async def _execute_pointwise_eval_with_semaphore(
                 raise
             except Exception as e:
                 # Default: capture non-assert exceptions unless explicitly disabled
-                if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "1").strip() == "1":
+                if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "false").strip() == "false":
                     result = row
                     result.evaluation_result = EvaluateResult(
                         score=0.0,
@@ -488,7 +488,7 @@ async def _execute_groupwise_eval_with_semaphore(
                 raise
             except Exception as e:
                 # Default: capture non-assert exceptions unless explicitly disabled
-                if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "1").strip() == "1":
+                if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "false").strip() == "false":
                     results = rows
                     for row in results:
                         row.evaluation_result = EvaluateResult(
                             score=0.0,
@@ -624,7 +624,7 @@ async def _collect_result(config, lst):
             # if the eval_metadata status code has not been set to something else, consider it as finished
             r.eval_metadata.status = Status.eval_finished()
             # Optional debug print for assistant/tool sequence
-            if os.getenv("EP_DEBUG_SERIALIZATION", "0").strip() == "1":
+            if os.getenv("EP_DEBUG_SERIALIZATION", "false").strip() == "false":
                 try:
                     preview = [
                         {
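The string comparison introduced here is asymmetric: the capture branch runs when the variable is unset or (after stripping whitespace) exactly "false"; any other value, including "False" or "0", switches to raising. Note also that the same rewrite applied to EP_DEBUG_SERIALIZATION turns the debug preview on by default, which patch 06 reverts. A small runnable demonstration of the parsing; the helper name capture_enabled is illustrative:

    import os

    def capture_enabled() -> bool:
        # Mirrors the patch-05 check: capture when unset or literally "false".
        return os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "false").strip() == "false"

    os.environ.pop("EP_CAPTURE_EVAL_EXCEPTIONS", None)
    assert capture_enabled()        # unset -> capture
    os.environ["EP_CAPTURE_EVAL_EXCEPTIONS"] = " false "
    assert capture_enabled()        # surrounding whitespace is stripped
    os.environ["EP_CAPTURE_EVAL_EXCEPTIONS"] = "False"
    assert not capture_enabled()    # case-sensitive: any other value raises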
os.getenv("EP_DEBUG_SERIALIZATION", "0").strip() == "1": try: preview = [ { From a0aff8aa73286f4e794593aa68107adb4488b531 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Thu, 6 Nov 2025 23:08:07 -0800 Subject: [PATCH 07/10] undo --- eval_protocol/pytest/evaluation_test.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 71c3ea6a..36042efd 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -739,14 +739,6 @@ async def _collect_result(config, lst): ) pytest_wrapper = pytest.mark.asyncio(pytest_wrapper) - ep_params: dict[str, Any] = { - "rollout_processor": rollout_processor, - "server_script_path": server_script_path, - "mcp_config_path": mcp_config_path, - "rollout_processor_kwargs": rollout_processor_kwargs, - "mode": mode, - } - # Create the dual mode wrapper dual_mode_wrapper = create_dual_mode_wrapper( test_func, mode, max_concurrent_rollouts, max_concurrent_evaluations, pytest_wrapper @@ -757,7 +749,6 @@ async def _collect_result(config, lst): # with @evaluation_test. dual_mode_wrapper.__test__ = True - setattr(dual_mode_wrapper, "__ep_params__", ep_params) return dual_mode_wrapper # pyright: ignore[reportReturnType, reportUnknownVariableType] return decorator From d73e5587943f9df98b089eccbaa92941375f3710 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Thu, 6 Nov 2025 23:12:45 -0800 Subject: [PATCH 08/10] switch out variable --- eval_protocol/pytest/evaluation_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 36042efd..fab15538 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -444,7 +444,7 @@ async def _execute_pointwise_eval_with_semaphore( raise except Exception as e: # Default: capture non-assert exceptions unless explicitly disabled - if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "false").strip() == "false": + if os.getenv("EP_RAISE_EVAL_EXCEPTIONS", "false").strip() == "false": result = row result.evaluation_result = EvaluateResult( score=0.0, @@ -488,7 +488,7 @@ async def _execute_groupwise_eval_with_semaphore( raise except Exception as e: # Default: capture non-assert exceptions unless explicitly disabled - if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "false").strip() == "false": + if os.getenv("EP_RAISE_EVAL_EXCEPTIONS", "false").strip() == "false": results = rows for row in results: row.evaluation_result = EvaluateResult( @@ -602,7 +602,7 @@ async def _collect_result(config, lst): raise except Exception as e: # Default: capture non-assert exceptions unless explicitly disabled - if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "false").strip() == "false": + if os.getenv("EP_RAISE_EVAL_EXCEPTIONS", "false").strip() == "false": results = input_dataset for row in results: row.evaluation_result = EvaluateResult( From 4b31337bdaf9e34bc4d7829d91a97284aef21f77 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Fri, 7 Nov 2025 14:33:28 -0800 Subject: [PATCH 09/10] better abstraction --- eval_protocol/pytest/evaluation_test.py | 88 +++++-------------------- eval_protocol/pytest/execution.py | 72 +++++++++++++++++++- 2 files changed, 86 insertions(+), 74 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index fab15538..763e3081 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -24,7 
From 4b31337bdaf9e34bc4d7829d91a97284aef21f77 Mon Sep 17 00:00:00 2001
From: Derek Xu
Date: Fri, 7 Nov 2025 14:33:28 -0800
Subject: [PATCH 09/10] better abstraction

---
 eval_protocol/pytest/evaluation_test.py | 88 +++++--------------------
 eval_protocol/pytest/execution.py       | 72 +++++++++++++++++++-
 2 files changed, 86 insertions(+), 74 deletions(-)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index fab15538..763e3081 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -24,7 +24,7 @@
 )
 from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper
 from eval_protocol.pytest.evaluation_test_postprocess import postprocess
-from eval_protocol.pytest.execution import execute_pytest
+from eval_protocol.pytest.execution import execute_pytest, execute_pytest_with_exception_handling
 from eval_protocol.pytest.generate_parameter_combinations import (
     ParameterizedTestKwargs,
     generate_parameter_combinations,
@@ -434,29 +434,11 @@ async def _execute_pointwise_eval_with_semaphore(
             experiment_id=experiment_id,
             run_id=run_id,
         ):
-            try:
-                result = await execute_pytest(
-                    test_func,
-                    processed_row=row,
-                    evaluation_test_kwargs=evaluation_test_kwargs,
-                )
-            except AssertionError:
-                raise
-            except Exception as e:
-                # Default: capture non-assert exceptions unless explicitly disabled
-                if os.getenv("EP_RAISE_EVAL_EXCEPTIONS", "false").strip() == "false":
-                    result = row
-                    result.evaluation_result = EvaluateResult(
-                        score=0.0,
-                        is_score_valid=False,
-                        reason=f"Error during evaluation: {type(e).__name__}: {e}",
-                    )
-                    if result.eval_metadata is not None:
-                        result.eval_metadata.status = Status.error(
-                            f"Error during evaluation: {type(e).__name__}: {e}",
-                        )
-                else:
-                    raise
+            result = await execute_pytest_with_exception_handling(
+                test_func=test_func,
+                evaluation_test_kwargs=evaluation_test_kwargs,
+                processed_row=row,
+            )
         if not isinstance(result, EvaluationRow):
             raise ValueError(
                 f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
@@ -478,30 +460,11 @@ async def _execute_groupwise_eval_with_semaphore(
             run_id=run_id,
             rollout_ids=group_rollout_ids or None,
         ):
-            try:
-                results = await execute_pytest(
-                    test_func,
-                    processed_dataset=rows,
-                    evaluation_test_kwargs=evaluation_test_kwargs,
-                )
-            except AssertionError:
-                raise
-            except Exception as e:
-                # Default: capture non-assert exceptions unless explicitly disabled
-                if os.getenv("EP_RAISE_EVAL_EXCEPTIONS", "false").strip() == "false":
-                    results = rows
-                    for row in results:
-                        row.evaluation_result = EvaluateResult(
-                            score=0.0,
-                            is_score_valid=False,
-                            reason=f"Error during evaluation: {type(e).__name__}: {e}",
-                        )
-                        if row.eval_metadata is not None:
-                            row.eval_metadata.status = Status.error(
-                                f"Error during evaluation: {type(e).__name__}: {e}",
-                            )
-                else:
-                    raise
+            results = await execute_pytest_with_exception_handling(
+                test_func=test_func,
+                evaluation_test_kwargs=evaluation_test_kwargs,
+                processed_dataset=rows,
+            )
         if not isinstance(results, list):
             raise ValueError(
                 f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
@@ -592,30 +555,11 @@ async def _collect_result(config, lst):
             run_id=run_id,
             rollout_ids=group_rollout_ids or None,
         ):
-            try:
-                results = await execute_pytest(
-                    test_func,
-                    processed_dataset=input_dataset,
-                    evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
-                )
-            except AssertionError:
-                raise
-            except Exception as e:
-                # Default: capture non-assert exceptions unless explicitly disabled
-                if os.getenv("EP_RAISE_EVAL_EXCEPTIONS", "false").strip() == "false":
-                    results = input_dataset
-                    for row in results:
-                        row.evaluation_result = EvaluateResult(
-                            score=0.0,
-                            is_score_valid=False,
-                            reason=f"Error during evaluation: {type(e).__name__}: {e}",
-                        )
-                        if row.eval_metadata is not None:
-                            row.eval_metadata.status = Status.error(
-                                f"Error during evaluation: {type(e).__name__}: {e}",
-                            )
-                else:
-                    raise
+            results = await execute_pytest_with_exception_handling(
+                test_func=test_func,
+                evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
+                processed_dataset=input_dataset,
+            )
             if (
                 results is None
                 or not isinstance(results, list)
diff --git a/eval_protocol/pytest/execution.py b/eval_protocol/pytest/execution.py
index fa572ee0..dabe08e4 100644
--- a/eval_protocol/pytest/execution.py
+++ b/eval_protocol/pytest/execution.py
@@ -1,7 +1,8 @@
 import asyncio
+import os
 from collections.abc import Awaitable, Callable
-from typing import cast
-from eval_protocol.models import EvaluationRow
+from typing import Any, cast
+from eval_protocol.models import EvaluationRow, EvaluateResult, Status
 from eval_protocol.pytest.types import Dataset, EvaluationInputParam, TestFunction
@@ -41,3 +42,70 @@ async def execute_pytest(
         return test_func(processed_dataset, **evaluation_test_kwargs)
     test_func = cast(Callable[[], EvaluationRow], test_func)
     return test_func(**evaluation_test_kwargs)
+
+
+async def execute_pytest_with_exception_handling(
+    test_func: TestFunction,
+    evaluation_test_kwargs: dict[str, Any],
+    processed_row: EvaluationRow | None = None,
+    processed_dataset: list[EvaluationRow] | None = None,
+) -> EvaluationRow | list[EvaluationRow]:
+    """Helper function to execute pytest with consistent exception handling.
+
+    Args:
+        test_func: The test function to execute
+        evaluation_test_kwargs: Kwargs for the evaluation function
+        processed_row: Single row for pointwise evaluation (mutually exclusive with processed_dataset)
+        processed_dataset: Dataset for groupwise/all evaluation (mutually exclusive with processed_row)
+
+    Returns:
+        The result of execute_pytest, or the input data with error results on exception
+    """
+    try:
+        if processed_row is not None:
+            return await execute_pytest(
+                test_func,
+                processed_row=processed_row,
+                evaluation_test_kwargs=evaluation_test_kwargs,
+            )
+        else:
+            return await execute_pytest(
+                test_func,
+                processed_dataset=processed_dataset,
+                evaluation_test_kwargs=evaluation_test_kwargs,
+            )
+    except Exception as e:
+        if os.getenv("EP_RAISE_EVAL_EXCEPTIONS", "true").strip() == "false":
+            # Handle single row case
+            if processed_row is not None:
+                result = processed_row
+                result.evaluation_result = EvaluateResult(
+                    score=0.0,
+                    is_score_valid=False,
+                    reason=f"Error during evaluation: {type(e).__name__}: {e}",
+                )
+                if result.eval_metadata is not None:
+                    result.eval_metadata.status = Status.error(
+                        f"Error during evaluation: {type(e).__name__}: {e}",
+                    )
+                return result
+            # Handle list of rows case
+            elif processed_dataset is not None:
+                results = processed_dataset
+                for row in results:
+                    row.evaluation_result = EvaluateResult(
+                        score=0.0,
+                        is_score_valid=False,
+                        reason=f"Error during evaluation: {type(e).__name__}: {e}",
+                    )
+                    if row.eval_metadata is not None:
+                        row.eval_metadata.status = Status.error(
+                            f"Error during evaluation: {type(e).__name__}: {e}",
+                        )
+                return results
+            else:
+                # This should never happen since one of processed_row/processed_dataset must be provided
+                raise ValueError("Neither processed_row nor processed_dataset was provided")
+        # Default: raise exceptions unless explicitly disabled
+        else:
+            raise

From 5fce8ab90c0400fcf2fd81d382fdc2de8df75563 Mon Sep 17 00:00:00 2001
From: Derek Xu
Date: Fri, 7 Nov 2025 15:02:00 -0800
Subject: [PATCH 10/10] properly set env var

---
 tests/pytest/test_pytest_evaluator_error_handling.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/pytest/test_pytest_evaluator_error_handling.py b/tests/pytest/test_pytest_evaluator_error_handling.py
index 70861679..5a412c5d 100644
--- a/tests/pytest/test_pytest_evaluator_error_handling.py
+++ b/tests/pytest/test_pytest_evaluator_error_handling.py
@@ -25,6 +25,15 @@
 from eval_protocol.dataset_logger.dataset_logger import DatasetLogger


+@pytest.fixture(autouse=True)
+def _force_catch_eval_exceptions(monkeypatch: pytest.MonkeyPatch):
+    """
+    These tests validate the behavior when evaluation exceptions are caught and converted
+    into evaluation_result/status fields. Ensure the env var is set to disable raising.
+    """
+    monkeypatch.setenv("EP_RAISE_EVAL_EXCEPTIONS", "false")
+
+
 class TrackingLogger(DatasetLogger):
     """Custom logger that tracks all logged rows for testing."""
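With the helper in place the default flips: execution.py raises unless EP_RAISE_EVAL_EXCEPTIONS is explicitly "false", which is exactly what the patch-10 fixture pins for its module. A minimal sketch of reusing the same override in another test module, using pytest's monkeypatch as the patch does; the fixture and test names here are illustrative:

    import pytest

    @pytest.fixture(autouse=True)
    def _capture_eval_exceptions(monkeypatch: pytest.MonkeyPatch):
        # Opt back into capture mode for this module: with the final default
        # ("true"), evaluation exceptions propagate unless set to "false".
        monkeypatch.setenv("EP_RAISE_EVAL_EXCEPTIONS", "false")

    def test_failing_eval_yields_zero_score_rows():
        # Illustrative placeholder: under capture mode a failing evaluation
        # should come back as rows scored 0.0 instead of raising.
        ...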