Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 41 additions & 12 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
EvaluationRow,
EvaluationThreshold,
EvaluationThresholdDict,
EvaluateResult,
Status,
)
from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper
Expand Down Expand Up @@ -370,7 +371,7 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
row.input_metadata.session_data = {}
row.input_metadata.session_data["mode"] = mode
# Initialize eval_metadata for each row
row.eval_metadata = eval_metadata
row.eval_metadata = eval_metadata.model_copy(deep=True)
row.execution_metadata.experiment_id = experiment_id
row.execution_metadata.invocation_id = invocation_id

Expand Down Expand Up @@ -429,11 +430,23 @@ async def _execute_pointwise_eval_with_semaphore(
experiment_id=experiment_id,
run_id=run_id,
):
result = await execute_pytest(
test_func,
processed_row=row,
evaluation_test_kwargs=evaluation_test_kwargs,
)
try:
result = await execute_pytest(
test_func,
processed_row=row,
evaluation_test_kwargs=evaluation_test_kwargs,
)
except Exception as e:
result = row
result.evaluation_result = EvaluateResult(
score=0.0,
is_score_valid=False,
reason=f"Error during evaluation: {type(e).__name__}: {e}",
)
if result.eval_metadata is not None:
result.eval_metadata.status = Status.error(
f"Error during evaluation: {type(e).__name__}: {e}",
)
if not isinstance(result, EvaluationRow):
raise ValueError(
f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
Expand All @@ -455,11 +468,24 @@ async def _execute_groupwise_eval_with_semaphore(
run_id=run_id,
rollout_ids=group_rollout_ids or None,
):
results = await execute_pytest(
test_func,
processed_dataset=rows,
evaluation_test_kwargs=evaluation_test_kwargs,
)
try:
results = await execute_pytest(
test_func,
processed_dataset=rows,
evaluation_test_kwargs=evaluation_test_kwargs,
)
except Exception as e:
results = rows
for row in results:
row.evaluation_result = EvaluateResult(
score=0.0,
is_score_valid=False,
reason=f"Error during evaluation: {type(e).__name__}: {e}",
)
if row.eval_metadata is not None:
row.eval_metadata.status = Status.error(
f"Error during evaluation: {type(e).__name__}: {e}",
)
if not isinstance(results, list):
raise ValueError(
f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
Expand Down Expand Up @@ -576,7 +602,10 @@ async def _collect_result(config, lst):
r.eval_metadata.status = Status.error(
r.rollout_status.message, r.rollout_status.details
)
else:
elif not (
r.eval_metadata.status and r.eval_metadata.status.code != Status.Code.RUNNING
):
# if the eval_metadata status code has not been set to something else, consider it as finished
r.eval_metadata.status = Status.eval_finished()
# Optional debug print for assistant/tool sequence
if os.getenv("EP_DEBUG_SERIALIZATION", "0").strip() == "1":
Expand Down
3 changes: 2 additions & 1 deletion eval_protocol/pytest/evaluation_test_postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ def postprocess(
result.evaluation_result.standard_error = standard_error
if result.evaluation_result.is_score_valid is False:
if result.eval_metadata is not None:
result.eval_metadata.status = Status.score_invalid()
if not result.eval_metadata.status or not result.eval_metadata.status.is_error():
result.eval_metadata.status = Status.score_invalid()
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Error Status Blocks Score Invalid Flag

The new guard (`not result.eval_metadata.status or not result.eval_metadata.status.is_error()`) skips the `Status.score_invalid()` assignment whenever an error status is already present. As a result, a result whose `is_score_valid` is `False` may keep its error status instead of being marked `score_invalid`, which conflicts with test expectations.

Fix in Cursor Fix in Web

result.execution_metadata.experiment_duration_seconds = experiment_duration_seconds
active_logger.log(result)

Expand Down
4 changes: 2 additions & 2 deletions eval_protocol/pytest/exception_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@
litellm.exceptions.InternalServerError,
litellm.exceptions.Timeout,
litellm.exceptions.NotFoundError,
litellm.exceptions.BadRequestError, # remove this once we have a long term solution
litellm.exceptions.BadRequestError,
litellm.exceptions.ServiceUnavailableError,
litellm.exceptions.APIError
litellm.exceptions.APIError,
}


Expand Down
Loading
Loading