Skip to content
Merged
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 55 additions & 24 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,17 +440,23 @@ async def _execute_pointwise_eval_with_semaphore(
processed_row=row,
evaluation_test_kwargs=evaluation_test_kwargs,
)
except AssertionError:
raise
Comment thread
xzrderek marked this conversation as resolved.
Outdated
except Exception as e:
result = row
result.evaluation_result = EvaluateResult(
score=0.0,
is_score_valid=False,
reason=f"Error during evaluation: {type(e).__name__}: {e}",
)
if result.eval_metadata is not None:
result.eval_metadata.status = Status.error(
f"Error during evaluation: {type(e).__name__}: {e}",
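# Default: capture non-assert exceptions and record them as an error result on the row.
# NOTE: the check is inverted relative to the variable's name: capture happens when EP_CAPTURE_EVAL_EXCEPTIONS is unset or exactly "false" (case-sensitive); any other value re-raises.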
Comment thread
xzrderek marked this conversation as resolved.
Outdated
if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "false").strip() == "false":
result = row
result.evaluation_result = EvaluateResult(
score=0.0,
is_score_valid=False,
reason=f"Error during evaluation: {type(e).__name__}: {e}",
)
if result.eval_metadata is not None:
result.eval_metadata.status = Status.error(
f"Error during evaluation: {type(e).__name__}: {e}",
)
else:
raise
Comment thread
xzrderek marked this conversation as resolved.
Outdated
if not isinstance(result, EvaluationRow):
raise ValueError(
f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
Expand Down Expand Up @@ -478,18 +484,24 @@ async def _execute_groupwise_eval_with_semaphore(
processed_dataset=rows,
evaluation_test_kwargs=evaluation_test_kwargs,
)
except AssertionError:
raise
Comment thread
xzrderek marked this conversation as resolved.
Outdated
except Exception as e:
results = rows
for row in results:
row.evaluation_result = EvaluateResult(
score=0.0,
is_score_valid=False,
reason=f"Error during evaluation: {type(e).__name__}: {e}",
)
if row.eval_metadata is not None:
row.eval_metadata.status = Status.error(
f"Error during evaluation: {type(e).__name__}: {e}",
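# Default: capture non-assert exceptions and record them as an error result on every row.
# NOTE: the check is inverted relative to the variable's name: capture happens when EP_CAPTURE_EVAL_EXCEPTIONS is unset or exactly "false" (case-sensitive); any other value re-raises.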
Comment thread
xzrderek marked this conversation as resolved.
Outdated
if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "false").strip() == "false":
results = rows
for row in results:
row.evaluation_result = EvaluateResult(
score=0.0,
is_score_valid=False,
reason=f"Error during evaluation: {type(e).__name__}: {e}",
)
if row.eval_metadata is not None:
row.eval_metadata.status = Status.error(
f"Error during evaluation: {type(e).__name__}: {e}",
)
else:
raise
if not isinstance(results, list):
raise ValueError(
f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
Expand Down Expand Up @@ -580,11 +592,30 @@ async def _collect_result(config, lst):
run_id=run_id,
rollout_ids=group_rollout_ids or None,
):
results = await execute_pytest(
test_func,
processed_dataset=input_dataset,
evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
)
try:
results = await execute_pytest(
test_func,
processed_dataset=input_dataset,
evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
)
except AssertionError:
raise
except Exception as e:
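# Default: capture non-assert exceptions and record them as an error result on every input row.
# NOTE: the check is inverted relative to the variable's name: capture happens when EP_CAPTURE_EVAL_EXCEPTIONS is unset or exactly "false" (case-sensitive); any other value re-raises.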
if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "false").strip() == "false":
results = input_dataset
for row in results:
row.evaluation_result = EvaluateResult(
score=0.0,
is_score_valid=False,
reason=f"Error during evaluation: {type(e).__name__}: {e}",
)
if row.eval_metadata is not None:
row.eval_metadata.status = Status.error(
f"Error during evaluation: {type(e).__name__}: {e}",
)
else:
raise
if (
results is None
or not isinstance(results, list)
Expand Down
Loading