Skip to content

Commit cf59d13

Browse files
authored
raise errors by default (#320)
* export ep params
* fix server path
* remove print statement
* raise on assert
* use true/false
* same logic for all mode
* undo
* switch out variable
* better abstraction
* properly set env var
1 parent e4ba202 commit cf59d13

File tree

3 files changed

+93
-41
lines changed

3 files changed

+93
-41
lines changed

eval_protocol/pytest/evaluation_test.py

Lines changed: 14 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
)
2525
from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper
2626
from eval_protocol.pytest.evaluation_test_postprocess import postprocess
27-
from eval_protocol.pytest.execution import execute_pytest
27+
from eval_protocol.pytest.execution import execute_pytest, execute_pytest_with_exception_handling
2828
from eval_protocol.pytest.generate_parameter_combinations import (
2929
ParameterizedTestKwargs,
3030
generate_parameter_combinations,
@@ -434,23 +434,11 @@ async def _execute_pointwise_eval_with_semaphore(
434434
experiment_id=experiment_id,
435435
run_id=run_id,
436436
):
437-
try:
438-
result = await execute_pytest(
439-
test_func,
440-
processed_row=row,
441-
evaluation_test_kwargs=evaluation_test_kwargs,
442-
)
443-
except Exception as e:
444-
result = row
445-
result.evaluation_result = EvaluateResult(
446-
score=0.0,
447-
is_score_valid=False,
448-
reason=f"Error during evaluation: {type(e).__name__}: {e}",
449-
)
450-
if result.eval_metadata is not None:
451-
result.eval_metadata.status = Status.error(
452-
f"Error during evaluation: {type(e).__name__}: {e}",
453-
)
437+
result = await execute_pytest_with_exception_handling(
438+
test_func=test_func,
439+
evaluation_test_kwargs=evaluation_test_kwargs,
440+
processed_row=row,
441+
)
454442
if not isinstance(result, EvaluationRow):
455443
raise ValueError(
456444
f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
@@ -472,24 +460,11 @@ async def _execute_groupwise_eval_with_semaphore(
472460
run_id=run_id,
473461
rollout_ids=group_rollout_ids or None,
474462
):
475-
try:
476-
results = await execute_pytest(
477-
test_func,
478-
processed_dataset=rows,
479-
evaluation_test_kwargs=evaluation_test_kwargs,
480-
)
481-
except Exception as e:
482-
results = rows
483-
for row in results:
484-
row.evaluation_result = EvaluateResult(
485-
score=0.0,
486-
is_score_valid=False,
487-
reason=f"Error during evaluation: {type(e).__name__}: {e}",
488-
)
489-
if row.eval_metadata is not None:
490-
row.eval_metadata.status = Status.error(
491-
f"Error during evaluation: {type(e).__name__}: {e}",
492-
)
463+
results = await execute_pytest_with_exception_handling(
464+
test_func=test_func,
465+
evaluation_test_kwargs=evaluation_test_kwargs,
466+
processed_dataset=rows,
467+
)
493468
if not isinstance(results, list):
494469
raise ValueError(
495470
f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
@@ -580,10 +555,10 @@ async def _collect_result(config, lst):
580555
run_id=run_id,
581556
rollout_ids=group_rollout_ids or None,
582557
):
583-
results = await execute_pytest(
584-
test_func,
585-
processed_dataset=input_dataset,
558+
results = await execute_pytest_with_exception_handling(
559+
test_func=test_func,
586560
evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
561+
processed_dataset=input_dataset,
587562
)
588563
if (
589564
results is None

eval_protocol/pytest/execution.py

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import asyncio
2+
import os
23
from collections.abc import Awaitable, Callable
3-
from typing import cast
4-
from eval_protocol.models import EvaluationRow
4+
from typing import Any, cast
5+
from eval_protocol.models import EvaluationRow, EvaluateResult, Status
56
from eval_protocol.pytest.types import Dataset, EvaluationInputParam, TestFunction
67

78

@@ -41,3 +42,70 @@ async def execute_pytest(
4142
return test_func(processed_dataset, **evaluation_test_kwargs)
4243
test_func = cast(Callable[[], EvaluationRow], test_func)
4344
return test_func(**evaluation_test_kwargs)
45+
46+
47+
async def execute_pytest_with_exception_handling(
    test_func: TestFunction,
    evaluation_test_kwargs: dict[str, Any],
    processed_row: EvaluationRow | None = None,
    processed_dataset: list[EvaluationRow] | None = None,
) -> EvaluationRow | list[EvaluationRow]:
    """Helper function to execute pytest with consistent exception handling.

    By default, any exception raised by ``test_func`` propagates to the caller.
    Setting the ``EP_RAISE_EVAL_EXCEPTIONS`` environment variable to ``"false"``
    (case-insensitive, surrounding whitespace ignored) instead converts the
    exception into error-valued evaluation results on the input row(s), which
    are mutated in place and returned.

    Args:
        test_func: The test function to execute
        evaluation_test_kwargs: Kwargs for the evaluation function
        processed_row: Single row for pointwise evaluation (mutually exclusive with processed_dataset)
        processed_dataset: Dataset for groupwise/all evaluation (mutually exclusive with processed_row)

    Returns:
        The result of execute_pytest, or the input data with error results on exception

    Raises:
        Exception: Re-raises whatever ``test_func`` raised, unless raising is
            explicitly disabled via the environment variable.
        ValueError: If an exception occurs with raising disabled but neither
            ``processed_row`` nor ``processed_dataset`` was provided.
    """
    try:
        if processed_row is not None:
            return await execute_pytest(
                test_func,
                processed_row=processed_row,
                evaluation_test_kwargs=evaluation_test_kwargs,
            )
        return await execute_pytest(
            test_func,
            processed_dataset=processed_dataset,
            evaluation_test_kwargs=evaluation_test_kwargs,
        )
    except Exception as e:
        # Default: raise exceptions unless explicitly disabled. Comparison is
        # case-insensitive so "False"/"FALSE" behave the same as "false".
        if os.getenv("EP_RAISE_EVAL_EXCEPTIONS", "true").strip().lower() != "false":
            raise

        # Build the (identical) error message once for all rows.
        reason = f"Error during evaluation: {type(e).__name__}: {e}"

        if processed_row is not None:
            rows_to_mark = [processed_row]
        elif processed_dataset is not None:
            rows_to_mark = processed_dataset
        else:
            # This should never happen since one of processed_row/processed_dataset must be provided
            raise ValueError("Neither processed_row nor processed_dataset was provided") from e

        # Mutate the input row(s) in place with an invalid zero score and, where
        # eval metadata exists, an error status carrying the same message.
        for row in rows_to_mark:
            row.evaluation_result = EvaluateResult(
                score=0.0,
                is_score_valid=False,
                reason=reason,
            )
            if row.eval_metadata is not None:
                row.eval_metadata.status = Status.error(reason)

        # Preserve the original return shape: single row for pointwise, list otherwise.
        return processed_row if processed_row is not None else rows_to_mark

tests/pytest/test_pytest_evaluator_error_handling.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,15 @@
2525
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
2626

2727

28+
@pytest.fixture(autouse=True)
def _force_catch_eval_exceptions(monkeypatch: pytest.MonkeyPatch) -> None:
    """
    These tests validate the behavior when evaluation exceptions are caught and converted
    into evaluation_result/status fields. Ensure the env var is set to disable raising.
    """
    # autouse=True: applies to every test in this module without explicit opt-in.
    # monkeypatch restores the environment automatically after each test.
    monkeypatch.setenv("EP_RAISE_EVAL_EXCEPTIONS", "false")
35+
36+
2837
class TrackingLogger(DatasetLogger):
2938
"""Custom logger that tracks all logged rows for testing."""
3039

0 commit comments

Comments
 (0)