Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions eval_protocol/cli_commands/create_rft.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,13 @@ def _validate_evaluator_locally(
docker_build_extra: str,
docker_run_extra: str,
) -> bool:
"""Run pytest locally for the selected evaluation test to validate the evaluator."""
"""Run pytest locally for the selected evaluation test to validate the evaluator.

The pytest helpers always enforce a small success threshold (0.01) for
evaluation_test-based suites so that an evaluation run where all scores are
0.0 will naturally fail with a non-zero pytest exit code, which we then treat
as a failed validator.
"""
if not selected_test_file or not selected_test_func:
# No local test associated; skip validation but warn the user.
print("Warning: Could not resolve a local evaluation test for this evaluator; skipping local validation.")
Expand Down Expand Up @@ -702,7 +708,7 @@ def _create_rft_job(
print(f"Prepared RFT job for evaluator '{evaluator_id}' using dataset '{dataset_id}'")
if getattr(args, "evaluation_dataset", None):
body["evaluationDataset"] = args.evaluation_dataset

output_model_arg = getattr(args, "output_model", None)
if output_model_arg:
if len(output_model_arg) > 63:
Expand Down
27 changes: 25 additions & 2 deletions eval_protocol/cli_commands/local_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List
def _run_pytest_host(pytest_target: str) -> int:
"""Run pytest against a target on the host and return its exit code."""
print(f"Running locally: pytest {pytest_target} -vs")
proc = subprocess.run([sys.executable, "-m", "pytest", pytest_target, "-vs"])
# Always enforce a small success threshold for evaluation_test-based suites so that runs with all-zero scores fail.
cmd = [sys.executable, "-m", "pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
proc = subprocess.run(cmd)
return proc.returncode


Expand Down Expand Up @@ -69,6 +71,22 @@ def _run_pytest_in_docker(
"-w",
workdir,
]

# If EP_SUMMARY_JSON is set on the host, mirror it into the container so that
# pytest evaluation tests can write summary artifacts that are visible to the
# host. We map paths under the host logs directory (~/.eval_protocol) into the
# mounted container home directory.
host_summary_path = os.environ.get("EP_SUMMARY_JSON")
if host_summary_path:
try:
rel_path = os.path.relpath(host_summary_path, host_logs_dir)
# Only forward the variable when the summary path is inside the logs dir.
if not rel_path.startswith(os.pardir):
container_summary_path = os.path.join("/container_home/.eval_protocol", rel_path)
cmd += ["-e", f"EP_SUMMARY_JSON={container_summary_path}"]
except Exception:
# Best-effort only; do not fail docker execution if we can't map the path.
pass
# Try to match host user to avoid permission problems on mounted volume
try:
uid = os.getuid() # type: ignore[attr-defined]
Expand All @@ -78,7 +96,12 @@ def _run_pytest_in_docker(
pass
if run_extras:
cmd += run_extras
cmd += [image_tag, "pytest", pytest_target, "-vs"]

# Build pytest command, always enforcing the same small success threshold as
# the host runner so that all-zero score runs fail consistently.
pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]

cmd += [image_tag] + pytest_cmd
print("Running in Docker:", " ".join(cmd))
try:
proc = subprocess.run(cmd)
Expand Down
61 changes: 60 additions & 1 deletion tests/test_evaluation_postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,17 @@

from unittest.mock import Mock, patch

from eval_protocol.models import EvaluationRow, EvaluateResult, EvalMetadata, ExecutionMetadata, InputMetadata, Message
import pytest

from eval_protocol.models import (
EvaluationRow,
EvaluateResult,
EvalMetadata,
EvaluationThreshold,
ExecutionMetadata,
InputMetadata,
Message,
)
from eval_protocol.pytest.evaluation_test_postprocess import postprocess
from eval_protocol.stats.confidence_intervals import compute_fixed_set_mu_ci

Expand Down Expand Up @@ -206,6 +216,55 @@ def test_all_invalid_scores(self):
# Should still call logger.log for all rows
assert mock_logger.log.call_count == 2

@patch.dict("os.environ", {"EP_NO_UPLOAD": "1"})  # Disable uploads
def test_threshold_all_zero_scores_fail(self):
    """All-zero scores against threshold.success=0.01 must make postprocess raise."""
    zero_score_batch = [self.create_test_row(0.0), self.create_test_row(0.0)]
    results = [zero_score_batch]

    logger_stub = Mock()
    failing_threshold = EvaluationThreshold(success=0.01, standard_error=None)

    # postprocess is expected to assert when the aggregate score falls short.
    with pytest.raises(AssertionError) as err:
        postprocess(
            all_results=results,
            aggregation_method="mean",
            threshold=failing_threshold,
            active_logger=logger_stub,
            mode="pointwise",
            completion_params={"model": "test-model"},
            test_func_name="test_threshold_all_zero",
            num_runs=1,
            experiment_duration_seconds=10.0,
        )

    # Sanity check on the assertion message
    assert "below threshold" in str(err.value)

@patch.dict("os.environ", {"EP_NO_UPLOAD": "1"})  # Disable uploads
def test_threshold_equal_score_passes(self):
    """A score exactly equal to threshold.success (0.01) must not raise."""
    boundary_results = [[self.create_test_row(0.01)]]

    logger_stub = Mock()
    boundary_threshold = EvaluationThreshold(success=0.01, standard_error=None)

    # Should not raise
    postprocess(
        all_results=boundary_results,
        aggregation_method="mean",
        threshold=boundary_threshold,
        active_logger=logger_stub,
        mode="pointwise",
        completion_params={"model": "test-model"},
        test_func_name="test_threshold_equal_score",
        num_runs=1,
        experiment_duration_seconds=10.0,
    )


class TestBootstrapEquivalence:
def test_bootstrap_equivalence_pandas_vs_pure_python(self):
Expand Down
Loading