From aa986f32e278a72ba23d1e6730e0c5f5dbd20ec4 Mon Sep 17 00:00:00 2001 From: Kilian Lieret Date: Thu, 18 Jun 2026 02:33:59 +0000 Subject: [PATCH] Change(eval): evaluate in :task_cleanroom images Submissions are now evaluated in the artifact-free cleanroom image by default instead of the full :task build environment, so a submission can't rely on build artifacts leaked into :task. --image-tag stays as an explicit override (pass --image-tag task for the full build env). This avoids drift between the inference image and the evaluation image. Internal-reference: fdbd6657 --- src/programbench/cli/main.py | 9 ++++++++- src/programbench/eval/eval.py | 2 +- src/programbench/eval/eval_batch.py | 4 ++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/programbench/cli/main.py b/src/programbench/cli/main.py index 6a36792..1b8d94d 100644 --- a/src/programbench/cli/main.py +++ b/src/programbench/cli/main.py @@ -54,7 +54,14 @@ def eval( filter_spec: str = typer.Option("", "--filter", help="Filter instance IDs by regex"), slice_spec: str = typer.Option("", "--slice", help="Slice specification (e.g. '0:5')"), summarize_only: bool = typer.Option(False, "--summarize-only", help="Skip evaluation; just read existing results"), - image_tag: str = typer.Option("task", "--image-tag", help="Docker image tag to evaluate"), + image_tag: str = typer.Option( + "task_cleanroom", + "--image-tag", + help="Docker image tag to evaluate submissions in. Defaults to the " + "artifact-free cleanroom image so submissions can't rely on build " + "artifacts leaked into the full :task build environment. 
Pass " + "--image-tag task to use the full build environment instead.", + ), output: str = typer.Option( "", "-o", diff --git a/src/programbench/eval/eval.py b/src/programbench/eval/eval.py index b423f3d..1d73a91 100644 --- a/src/programbench/eval/eval.py +++ b/src/programbench/eval/eval.py @@ -289,7 +289,7 @@ def __init__( submission_archive: Path | None = None, blob_dir: Path | None = None, remove_hashes: list[str] | None = None, - image_tag: str = "task", + image_tag: str = "task_cleanroom", from_existing: EvaluationResult | None = None, instance_id: str = "", docker_cpus: int = DOCKER_CPUS, diff --git a/src/programbench/eval/eval_batch.py b/src/programbench/eval/eval_batch.py index 8faa0a3..0a3ae12 100644 --- a/src/programbench/eval/eval_batch.py +++ b/src/programbench/eval/eval_batch.py @@ -229,7 +229,7 @@ def _evaluate_instance( source_dir: Path, target_dir: Path, force: bool, - image_tag: str = "task", + image_tag: str = "task_cleanroom", docker_cpus: int = DOCKER_CPUS, branch_workers: int = 1, branch_retries: int = 1, @@ -374,7 +374,7 @@ def run_eval_batch( branch_workers: int = 1, docker_cpus: int = DOCKER_CPUS, summarize_only: bool = False, - image_tag: str = "task", + image_tag: str = "task_cleanroom", output: str | Path = "", branch_retries: int = 1, ) -> None: