From feecc86bb0578ef28be6811e0e74dc57d61f3331 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 9 Dec 2025 22:56:41 -0800 Subject: [PATCH 1/4] don't allow all 0 --- eval_protocol/cli_commands/create_rft.py | 10 +++++++-- eval_protocol/cli_commands/local_test.py | 27 ++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/eval_protocol/cli_commands/create_rft.py b/eval_protocol/cli_commands/create_rft.py index 4f566338..c490f374 100644 --- a/eval_protocol/cli_commands/create_rft.py +++ b/eval_protocol/cli_commands/create_rft.py @@ -279,7 +279,13 @@ def _validate_evaluator_locally( docker_build_extra: str, docker_run_extra: str, ) -> bool: - """Run pytest locally for the selected evaluation test to validate the evaluator.""" + """Run pytest locally for the selected evaluation test to validate the evaluator. + + The pytest helpers always enforce a small success threshold (0.001) for + evaluation_test-based suites so that an evaluation run where all scores are + 0.0 will naturally fail with a non-zero pytest exit code, which we then treat + as a failed validator. + """ if not selected_test_file or not selected_test_func: # No local test associated; skip validation but warn the user.
print("Warning: Could not resolve a local evaluation test for this evaluator; skipping local validation.") @@ -702,7 +708,7 @@ def _create_rft_job( print(f"Prepared RFT job for evaluator '{evaluator_id}' using dataset '{dataset_id}'") if getattr(args, "evaluation_dataset", None): body["evaluationDataset"] = args.evaluation_dataset - + output_model_arg = getattr(args, "output_model", None) if output_model_arg: if len(output_model_arg) > 63: diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py index 704345ac..dfa2ef9d 100644 --- a/eval_protocol/cli_commands/local_test.py +++ b/eval_protocol/cli_commands/local_test.py @@ -38,7 +38,9 @@ def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List def _run_pytest_host(pytest_target: str) -> int: """Run pytest against a target on the host and return its exit code.""" print(f"Running locally: pytest {pytest_target} -vs") - proc = subprocess.run([sys.executable, "-m", "pytest", pytest_target, "-vs"]) + # Always enforce a small success threshold for evaluation_test-based suites so that runs with all-zero scores fail. + cmd = [sys.executable, "-m", "pytest", "--ep-success-threshold", "0.01", pytest_target, "-vs"] + proc = subprocess.run(cmd) return proc.returncode @@ -69,6 +71,22 @@ def _run_pytest_in_docker( "-w", workdir, ] + + # If EP_SUMMARY_JSON is set on the host, mirror it into the container so that + # pytest evaluation tests can write summary artifacts that are visible to the + # host. We map paths under the host logs directory (~/.eval_protocol) into the + # mounted container home directory. + host_summary_path = os.environ.get("EP_SUMMARY_JSON") + if host_summary_path: + try: + rel_path = os.path.relpath(host_summary_path, host_logs_dir) + # Only forward the variable when the summary path is inside the logs dir. 
+ if not rel_path.startswith(os.pardir): + container_summary_path = os.path.join("/container_home/.eval_protocol", rel_path) + cmd += ["-e", f"EP_SUMMARY_JSON={container_summary_path}"] + except Exception: + # Best-effort only; do not fail docker execution if we can't map the path. + pass # Try to match host user to avoid permission problems on mounted volume try: uid = os.getuid() # type: ignore[attr-defined] @@ -78,7 +96,12 @@ def _run_pytest_in_docker( pass if run_extras: cmd += run_extras - cmd += [image_tag, "pytest", pytest_target, "-vs"] + + # Build pytest command, always enforcing the same small success threshold as + # the host runner so that all-zero score runs fail consistently. + pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.01", pytest_target, "-vs"] + + cmd += [image_tag] + pytest_cmd print("Running in Docker:", " ".join(cmd)) try: proc = subprocess.run(cmd) From 034ce388bba0fe1b1bc9b311c81967652c24adc9 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 9 Dec 2025 22:57:36 -0800 Subject: [PATCH 2/4] .001 --- eval_protocol/cli_commands/local_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py index dfa2ef9d..cc9a856a 100644 --- a/eval_protocol/cli_commands/local_test.py +++ b/eval_protocol/cli_commands/local_test.py @@ -39,7 +39,7 @@ def _run_pytest_host(pytest_target: str) -> int: """Run pytest against a target on the host and return its exit code.""" print(f"Running locally: pytest {pytest_target} -vs") # Always enforce a small success threshold for evaluation_test-based suites so that runs with all-zero scores fail. 
- cmd = [sys.executable, "-m", "pytest", "--ep-success-threshold", "0.01", pytest_target, "-vs"] + cmd = [sys.executable, "-m", "pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"] proc = subprocess.run(cmd) return proc.returncode From 66ca97f732f5593cb66b0817ac9a81224ceec10d Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 9 Dec 2025 23:05:22 -0800 Subject: [PATCH 3/4] add test --- tests/test_evaluation_postprocess.py | 61 +++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/tests/test_evaluation_postprocess.py b/tests/test_evaluation_postprocess.py index 05452151..05d2d81e 100644 --- a/tests/test_evaluation_postprocess.py +++ b/tests/test_evaluation_postprocess.py @@ -2,7 +2,17 @@ from unittest.mock import Mock, patch -from eval_protocol.models import EvaluationRow, EvaluateResult, EvalMetadata, ExecutionMetadata, InputMetadata, Message +import pytest + +from eval_protocol.models import ( + EvaluationRow, + EvaluateResult, + EvalMetadata, + EvaluationThreshold, + ExecutionMetadata, + InputMetadata, + Message, +) from eval_protocol.pytest.evaluation_test_postprocess import postprocess from eval_protocol.stats.confidence_intervals import compute_fixed_set_mu_ci @@ -206,6 +216,55 @@ def test_all_invalid_scores(self): # Should still call logger.log for all rows assert mock_logger.log.call_count == 2 + @patch.dict("os.environ", {"EP_NO_UPLOAD": "1"}) # Disable uploads + def test_threshold_all_zero_scores_fail(self): + """When all scores are 0.0 and threshold.success is 0.01, postprocess should fail.""" + all_results = [ + [self.create_test_row(0.0), self.create_test_row(0.0)], + ] + + mock_logger = Mock() + threshold = EvaluationThreshold(success=0.01, standard_error=None) + + with pytest.raises(AssertionError) as excinfo: + postprocess( + all_results=all_results, + aggregation_method="mean", + threshold=threshold, + active_logger=mock_logger, + mode="pointwise", + completion_params={"model": "test-model"}, + 
test_func_name="test_threshold_all_zero", + num_runs=1, + experiment_duration_seconds=10.0, + ) + + # Sanity check on the assertion message + assert "below threshold" in str(excinfo.value) + + @patch.dict("os.environ", {"EP_NO_UPLOAD": "1"}) # Disable uploads + def test_threshold_equal_score_passes(self): + """When agg_score equals threshold.success (0.01), postprocess should pass.""" + all_results = [ + [self.create_test_row(0.01)], + ] + + mock_logger = Mock() + threshold = EvaluationThreshold(success=0.01, standard_error=None) + + # Should not raise + postprocess( + all_results=all_results, + aggregation_method="mean", + threshold=threshold, + active_logger=mock_logger, + mode="pointwise", + completion_params={"model": "test-model"}, + test_func_name="test_threshold_equal_score", + num_runs=1, + experiment_duration_seconds=10.0, + ) + class TestBootstrapEquivalence: def test_bootstrap_equivalence_pandas_vs_pure_python(self): From a8914717e39825c126682e1686e036c0e7aa8960 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 9 Dec 2025 23:27:24 -0800 Subject: [PATCH 4/4] fix --- eval_protocol/cli_commands/local_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py index cc9a856a..7090eb06 100644 --- a/eval_protocol/cli_commands/local_test.py +++ b/eval_protocol/cli_commands/local_test.py @@ -99,7 +99,7 @@ def _run_pytest_in_docker( # Build pytest command, always enforcing the same small success threshold as # the host runner so that all-zero score runs fail consistently. - pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.01", pytest_target, "-vs"] + pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"] cmd += [image_tag] + pytest_cmd print("Running in Docker:", " ".join(cmd))