From aa986f32e278a72ba23d1e6730e0c5f5dbd20ec4 Mon Sep 17 00:00:00 2001
From: Kilian Lieret <klieret@meta.com>
Date: Thu, 18 Jun 2026 02:33:59 +0000
Subject: [PATCH] Change(eval): evaluate in :task_cleanroom images

Submissions are now evaluated in the artifact-free cleanroom image by
default instead of the full :task build environment, so a submission
can't rely on build artifacts leaked into :task. The --image-tag option
remains an explicit override (pass --image-tag task to evaluate in the
full build environment).

This avoids drift between the inference image and the evaluation image.

Internal-reference: fdbd6657
---
 src/programbench/cli/main.py        | 9 ++++++++-
 src/programbench/eval/eval.py       | 2 +-
 src/programbench/eval/eval_batch.py | 4 ++--
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/programbench/cli/main.py b/src/programbench/cli/main.py
index 6a36792..1b8d94d 100644
--- a/src/programbench/cli/main.py
+++ b/src/programbench/cli/main.py
@@ -54,7 +54,14 @@ def eval(
     filter_spec: str = typer.Option("", "--filter", help="Filter instance IDs by regex"),
     slice_spec: str = typer.Option("", "--slice", help="Slice specification (e.g. '0:5')"),
     summarize_only: bool = typer.Option(False, "--summarize-only", help="Skip evaluation; just read existing results"),
-    image_tag: str = typer.Option("task", "--image-tag", help="Docker image tag to evaluate"),
+    image_tag: str = typer.Option(
+        "task_cleanroom",
+        "--image-tag",
+        help="Docker image tag to evaluate submissions in. Defaults to the "
+        "artifact-free cleanroom image so submissions can't rely on build "
+        "artifacts leaked into the full :task build environment. Pass "
+        "--image-tag task to use the full build environment instead.",
+    ),
     output: str = typer.Option(
         "",
         "-o",
diff --git a/src/programbench/eval/eval.py b/src/programbench/eval/eval.py
index b423f3d..1d73a91 100644
--- a/src/programbench/eval/eval.py
+++ b/src/programbench/eval/eval.py
@@ -289,7 +289,7 @@ def __init__(
         submission_archive: Path | None = None,
         blob_dir: Path | None = None,
         remove_hashes: list[str] | None = None,
-        image_tag: str = "task",
+        image_tag: str = "task_cleanroom",
         from_existing: EvaluationResult | None = None,
         instance_id: str = "",
         docker_cpus: int = DOCKER_CPUS,
diff --git a/src/programbench/eval/eval_batch.py b/src/programbench/eval/eval_batch.py
index 8faa0a3..0a3ae12 100644
--- a/src/programbench/eval/eval_batch.py
+++ b/src/programbench/eval/eval_batch.py
@@ -229,7 +229,7 @@ def _evaluate_instance(
     source_dir: Path,
     target_dir: Path,
     force: bool,
-    image_tag: str = "task",
+    image_tag: str = "task_cleanroom",
     docker_cpus: int = DOCKER_CPUS,
     branch_workers: int = 1,
     branch_retries: int = 1,
@@ -374,7 +374,7 @@ def run_eval_batch(
     branch_workers: int = 1,
     docker_cpus: int = DOCKER_CPUS,
     summarize_only: bool = False,
-    image_tag: str = "task",
+    image_tag: str = "task_cleanroom",
     output: str | Path = "",
     branch_retries: int = 1,
 ) -> None: