Skip to content
Merged
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 41 additions & 20 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,17 +440,23 @@ async def _execute_pointwise_eval_with_semaphore(
processed_row=row,
evaluation_test_kwargs=evaluation_test_kwargs,
)
except AssertionError:
raise
Comment thread
xzrderek marked this conversation as resolved.
Outdated
except Exception as e:
result = row
result.evaluation_result = EvaluateResult(
score=0.0,
is_score_valid=False,
reason=f"Error during evaluation: {type(e).__name__}: {e}",
)
if result.eval_metadata is not None:
result.eval_metadata.status = Status.error(
f"Error during evaluation: {type(e).__name__}: {e}",
# Default (EP_CAPTURE_EVAL_EXCEPTIONS unset or exactly "false"): record non-assert
# exceptions as an error result; any other value of the variable re-raises them.
Comment thread
xzrderek marked this conversation as resolved.
Outdated
if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "false").strip() == "false":
result = row
result.evaluation_result = EvaluateResult(
score=0.0,
is_score_valid=False,
reason=f"Error during evaluation: {type(e).__name__}: {e}",
)
if result.eval_metadata is not None:
result.eval_metadata.status = Status.error(
f"Error during evaluation: {type(e).__name__}: {e}",
)
else:
raise
Comment thread
xzrderek marked this conversation as resolved.
Outdated
if not isinstance(result, EvaluationRow):
raise ValueError(
f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
Expand Down Expand Up @@ -478,18 +484,24 @@ async def _execute_groupwise_eval_with_semaphore(
processed_dataset=rows,
evaluation_test_kwargs=evaluation_test_kwargs,
)
except AssertionError:
raise
Comment thread
xzrderek marked this conversation as resolved.
Outdated
except Exception as e:
results = rows
for row in results:
row.evaluation_result = EvaluateResult(
score=0.0,
is_score_valid=False,
reason=f"Error during evaluation: {type(e).__name__}: {e}",
)
if row.eval_metadata is not None:
row.eval_metadata.status = Status.error(
f"Error during evaluation: {type(e).__name__}: {e}",
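# Default (EP_CAPTURE_EVAL_EXCEPTIONS unset or exactly "false"): record non-assert
# exceptions as an error result on every row; any other value of the variable re-raises them.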
Comment thread
xzrderek marked this conversation as resolved.
Outdated
if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "false").strip() == "false":
results = rows
for row in results:
row.evaluation_result = EvaluateResult(
score=0.0,
is_score_valid=False,
reason=f"Error during evaluation: {type(e).__name__}: {e}",
)
if row.eval_metadata is not None:
row.eval_metadata.status = Status.error(
f"Error during evaluation: {type(e).__name__}: {e}",
)
else:
raise
if not isinstance(results, list):
raise ValueError(
f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
Expand Down Expand Up @@ -612,7 +624,7 @@ async def _collect_result(config, lst):
# if the eval_metadata status code has not been set to something else, consider it as finished
r.eval_metadata.status = Status.eval_finished()
# Optional debug print for assistant/tool sequence
if os.getenv("EP_DEBUG_SERIALIZATION", "0").strip() == "1":
if os.getenv("EP_DEBUG_SERIALIZATION", "false").strip() == "false":
Comment thread
xzrderek marked this conversation as resolved.
Outdated
try:
preview = [
{
Expand Down Expand Up @@ -708,6 +720,14 @@ async def _collect_result(config, lst):
)
pytest_wrapper = pytest.mark.asyncio(pytest_wrapper)

ep_params: dict[str, Any] = {
"rollout_processor": rollout_processor,
"server_script_path": server_script_path,
"mcp_config_path": mcp_config_path,
"rollout_processor_kwargs": rollout_processor_kwargs,
"mode": mode,
}

# Create the dual mode wrapper
dual_mode_wrapper = create_dual_mode_wrapper(
test_func, mode, max_concurrent_rollouts, max_concurrent_evaluations, pytest_wrapper
Expand All @@ -718,6 +738,7 @@ async def _collect_result(config, lst):
# with @evaluation_test.
dual_mode_wrapper.__test__ = True

setattr(dual_mode_wrapper, "__ep_params__", ep_params)
return dual_mode_wrapper # pyright: ignore[reportReturnType, reportUnknownVariableType]

return decorator
Loading