From 6e92650e7be6b1c47edf3e7c9aeebcfe1aff3b47 Mon Sep 17 00:00:00 2001
From: Kilian Lieret <klieret@meta.com>
Date: Thu, 18 Jun 2026 20:09:00 +0000
Subject: [PATCH] Ref(eval): default to v6 docker images

Make the v6 image set the default everywhere eval resolves a tag: the
`eval` CLI `--image-tag` option, the evaluator constructor (`__init__` in
`src/programbench/eval/eval.py`), and both `eval_batch` functions
(`_evaluate_instance` and `run_eval_batch`) now default to
`task_cleanroom_v6` (was `task_cleanroom`). Update docs/README.md so the
inference instructions reference the `task_cleanroom_v6` tag and the
evaluation example references the `task_v6` tag.

Tags are otherwise unchanged: `--image-tag task_v6` still selects the full
build environment, and explicit overrides are passed through verbatim.

Internal-reference-commit: a92ae6227464d6c1dbff015e6003fb70790760db
Internal-reference-commit: 22e92e67b0a399d7b9dc2f612c5f5eedec1c80cd
Internal-reference-commit: 46518a07432af49a4b86777bc2ee8b50f7548bc7
---
 docs/README.md                      | 6 +++---
 src/programbench/cli/main.py        | 4 ++--
 src/programbench/eval/eval.py       | 2 +-
 src/programbench/eval/eval_batch.py | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index 0b501f4..0f72496 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -7,11 +7,11 @@
 
 ## Inference
 
-Please use the images with tag `task_cleanroom` from `https://hub.docker.com/orgs/programbench/repositories`.
+Please use the images with tag `task_cleanroom_v6` from `https://hub.docker.com/orgs/programbench/repositories`.
 E.g., to solve the task `ffmpeg__ffmpeg.360a402`, use the following image:
 
 ```
-https://hub.docker.com/repository/docker/programbench/ffmpeg_1776_ffmpeg.360a402/tags/task_cleanroom/
+https://hub.docker.com/repository/docker/programbench/ffmpeg_1776_ffmpeg.360a402/tags/task_cleanroom_v6/
 ```
 
 (the `__` is replaced by `_1776_`).
@@ -49,7 +49,7 @@ After following the installation instructions from the [README](../README.md#qui
 uv run programbench eval /path/to/my-amazing-agent-run
 ```
 
-The evaluation will automatically pull all required docker containers (e.g., `ffmpeg_1776_ffmpeg.360a402:task`).
+The evaluation will automatically pull all required docker containers (e.g., `ffmpeg_1776_ffmpeg.360a402:task_v6`).
 
 > [!TIP]
 > Test blobs (per-branch test archives) are downloaded on demand from HuggingFace during evaluation.
diff --git a/src/programbench/cli/main.py b/src/programbench/cli/main.py
index 1b8d94d..9d08efb 100644
--- a/src/programbench/cli/main.py
+++ b/src/programbench/cli/main.py
@@ -55,12 +55,12 @@ def eval(
     slice_spec: str = typer.Option("", "--slice", help="Slice specification (e.g. '0:5')"),
     summarize_only: bool = typer.Option(False, "--summarize-only", help="Skip evaluation; just read existing results"),
     image_tag: str = typer.Option(
-        "task_cleanroom",
+        "task_cleanroom_v6",
         "--image-tag",
         help="Docker image tag to evaluate submissions in. Defaults to the "
         "artifact-free cleanroom image so submissions can't rely on build "
         "artifacts leaked into the full :task build environment. Pass "
-        "--image-tag task to use the full build environment instead.",
+        "--image-tag task_v6 to use the full build environment instead.",
     ),
     output: str = typer.Option(
         "",
diff --git a/src/programbench/eval/eval.py b/src/programbench/eval/eval.py
index 14021fe..ba1f692 100644
--- a/src/programbench/eval/eval.py
+++ b/src/programbench/eval/eval.py
@@ -290,7 +290,7 @@ def __init__(
         submission_archive: Path | None = None,
         blob_dir: Path | None = None,
         remove_hashes: list[str] | None = None,
-        image_tag: str = "task_cleanroom",
+        image_tag: str = "task_cleanroom_v6",
         from_existing: EvaluationResult | None = None,
         instance_id: str = "",
         docker_cpus: int = DOCKER_CPUS,
diff --git a/src/programbench/eval/eval_batch.py b/src/programbench/eval/eval_batch.py
index 0a3ae12..5f412ce 100644
--- a/src/programbench/eval/eval_batch.py
+++ b/src/programbench/eval/eval_batch.py
@@ -229,7 +229,7 @@ def _evaluate_instance(
     source_dir: Path,
     target_dir: Path,
     force: bool,
-    image_tag: str = "task_cleanroom",
+    image_tag: str = "task_cleanroom_v6",
     docker_cpus: int = DOCKER_CPUS,
     branch_workers: int = 1,
     branch_retries: int = 1,
@@ -374,7 +374,7 @@ def run_eval_batch(
     branch_workers: int = 1,
     docker_cpus: int = DOCKER_CPUS,
     summarize_only: bool = False,
-    image_tag: str = "task_cleanroom",
+    image_tag: str = "task_cleanroom_v6",
     output: str | Path = "",
     branch_retries: int = 1,
 ) -> None: