Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions eval_protocol/cli_commands/create_rft.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,13 @@ def _validate_evaluator_locally(
docker_build_extra: str,
docker_run_extra: str,
) -> bool:
"""Run pytest locally for the selected evaluation test to validate the evaluator."""
"""Run pytest locally for the selected evaluation test to validate the evaluator.

The pytest helpers always enforce a small success threshold (0.01) for
evaluation_test-based suites so that an evaluation run where all scores are
0.0 will naturally fail with a non-zero pytest exit code, which we then treat
as a failed validator.
"""
if not selected_test_file or not selected_test_func:
# No local test associated; skip validation but warn the user.
print("Warning: Could not resolve a local evaluation test for this evaluator; skipping local validation.")
Expand Down Expand Up @@ -702,7 +708,7 @@ def _create_rft_job(
print(f"Prepared RFT job for evaluator '{evaluator_id}' using dataset '{dataset_id}'")
if getattr(args, "evaluation_dataset", None):
body["evaluationDataset"] = args.evaluation_dataset

output_model_arg = getattr(args, "output_model", None)
if output_model_arg:
if len(output_model_arg) > 63:
Expand Down
27 changes: 25 additions & 2 deletions eval_protocol/cli_commands/local_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List
def _run_pytest_host(pytest_target: str) -> int:
"""Run pytest against a target on the host and return its exit code."""
print(f"Running locally: pytest {pytest_target} -vs")
proc = subprocess.run([sys.executable, "-m", "pytest", pytest_target, "-vs"])
# Always enforce a small success threshold for evaluation_test-based suites so that runs with all-zero scores fail.
cmd = [sys.executable, "-m", "pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
proc = subprocess.run(cmd)
return proc.returncode


Expand Down Expand Up @@ -69,6 +71,22 @@ def _run_pytest_in_docker(
"-w",
workdir,
]

# If EP_SUMMARY_JSON is set on the host, mirror it into the container so that
# pytest evaluation tests can write summary artifacts that are visible to the
# host. We map paths under the host logs directory (~/.eval_protocol) into the
# mounted container home directory.
host_summary_path = os.environ.get("EP_SUMMARY_JSON")
if host_summary_path:
try:
rel_path = os.path.relpath(host_summary_path, host_logs_dir)
# Only forward the variable when the summary path is inside the logs dir.
if not rel_path.startswith(os.pardir):
container_summary_path = os.path.join("/container_home/.eval_protocol", rel_path)
cmd += ["-e", f"EP_SUMMARY_JSON={container_summary_path}"]
except Exception:
# Best-effort only; do not fail docker execution if we can't map the path.
pass
# Try to match host user to avoid permission problems on mounted volume
try:
uid = os.getuid() # type: ignore[attr-defined]
Expand All @@ -78,7 +96,12 @@ def _run_pytest_in_docker(
pass
if run_extras:
cmd += run_extras
cmd += [image_tag, "pytest", pytest_target, "-vs"]

# Build pytest command, always enforcing the same small success threshold as
# the host runner so that all-zero score runs fail consistently.
pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]

cmd += [image_tag] + pytest_cmd
print("Running in Docker:", " ".join(cmd))
try:
proc = subprocess.run(cmd)
Expand Down
61 changes: 60 additions & 1 deletion tests/test_evaluation_postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,17 @@

from unittest.mock import Mock, patch

from eval_protocol.models import EvaluationRow, EvaluateResult, EvalMetadata, ExecutionMetadata, InputMetadata, Message
import pytest

from eval_protocol.models import (
EvaluationRow,
EvaluateResult,
EvalMetadata,
EvaluationThreshold,
ExecutionMetadata,
InputMetadata,
Message,
)
from eval_protocol.pytest.evaluation_test_postprocess import postprocess
from eval_protocol.stats.confidence_intervals import compute_fixed_set_mu_ci

Expand Down Expand Up @@ -206,6 +216,55 @@ def test_all_invalid_scores(self):
# Should still call logger.log for all rows
assert mock_logger.log.call_count == 2

@patch.dict("os.environ", {"EP_NO_UPLOAD": "1"})  # Disable uploads
def test_threshold_all_zero_scores_fail(self):
    """All-zero scores against threshold.success=0.01 must make postprocess raise."""
    zero_score_batch = [self.create_test_row(0.0), self.create_test_row(0.0)]
    results = [zero_score_batch]

    logger_stub = Mock()
    failing_threshold = EvaluationThreshold(success=0.01, standard_error=None)

    # postprocess is expected to assert when the aggregate score falls short.
    with pytest.raises(AssertionError) as err:
        postprocess(
            all_results=results,
            aggregation_method="mean",
            threshold=failing_threshold,
            active_logger=logger_stub,
            mode="pointwise",
            completion_params={"model": "test-model"},
            test_func_name="test_threshold_all_zero",
            num_runs=1,
            experiment_duration_seconds=10.0,
        )

    # Sanity check on the assertion message
    assert "below threshold" in str(err.value)

@patch.dict("os.environ", {"EP_NO_UPLOAD": "1"})  # Disable uploads
def test_threshold_equal_score_passes(self):
    """A score exactly equal to threshold.success (0.01) must not raise."""
    boundary_results = [[self.create_test_row(0.01)]]

    logger_stub = Mock()
    boundary_threshold = EvaluationThreshold(success=0.01, standard_error=None)

    # Should not raise
    postprocess(
        all_results=boundary_results,
        aggregation_method="mean",
        threshold=boundary_threshold,
        active_logger=logger_stub,
        mode="pointwise",
        completion_params={"model": "test-model"},
        test_func_name="test_threshold_equal_score",
        num_runs=1,
        experiment_duration_seconds=10.0,
    )


class TestBootstrapEquivalence:
def test_bootstrap_equivalence_pandas_vs_pure_python(self):
Expand Down
Loading