From 6e92650e7be6b1c47edf3e7c9aeebcfe1aff3b47 Mon Sep 17 00:00:00 2001 From: Kilian Lieret Date: Thu, 18 Jun 2026 20:09:00 +0000 Subject: [PATCH] Ref(eval): default to v6 docker images Make the v6 image set the default everywhere eval resolves a tag: the `eval` CLI `--image-tag`, the `Evaluator.evaluate` default, and both `eval_batch` entry points now default to `task_cleanroom_v6` (was `task_cleanroom`). Update docs/README.md to point inference users at the `task_cleanroom_v6` / `task_v6` tags. Tags are otherwise unchanged: `--image-tag task_v6` still selects the full build environment, and explicit overrides are passed through verbatim. Internal-reference-commit: a92ae6227464d6c1dbff015e6003fb70790760db Internal-reference-commit: 22e92e67b0a399d7b9dc2f612c5f5eedec1c80cd Internal-reference-commit: 46518a07432af49a4b86777bc2ee8b50f7548bc7 --- docs/README.md | 6 +++--- src/programbench/cli/main.py | 4 ++-- src/programbench/eval/eval.py | 2 +- src/programbench/eval/eval_batch.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/README.md b/docs/README.md index 0b501f4..0f72496 100644 --- a/docs/README.md +++ b/docs/README.md @@ -7,11 +7,11 @@ ## Inference -Please use the images with tag `task_cleanroom` from `https://hub.docker.com/orgs/programbench/repositories`. +Please use the images with tag `task_cleanroom_v6` from `https://hub.docker.com/orgs/programbench/repositories`. E.g., to solve the task `ffmpeg__ffmpeg.360a402`, use the following image: ``` -https://hub.docker.com/repository/docker/programbench/ffmpeg_1776_ffmpeg.360a402/tags/task_cleanroom/ +https://hub.docker.com/repository/docker/programbench/ffmpeg_1776_ffmpeg.360a402/tags/task_cleanroom_v6/ ``` (the `__` is replaced by `_1776_`). @@ -49,7 +49,7 @@ After following the installation instructions from the [README](../README.md#qui uv run programbench eval /path/to/my-amazing-agent-run ``` -The evaluation will automatically pull all required docker containers (e.g., `ffmpeg_1776_ffmpeg.360a402:task`). +The evaluation will automatically pull all required docker containers (e.g., `ffmpeg_1776_ffmpeg.360a402:task_v6`). > [!TIP] > Test blobs (per-branch test archives) are downloaded on demand from HuggingFace during evaluation. diff --git a/src/programbench/cli/main.py b/src/programbench/cli/main.py index 1b8d94d..9d08efb 100644 --- a/src/programbench/cli/main.py +++ b/src/programbench/cli/main.py @@ -55,12 +55,12 @@ def eval( slice_spec: str = typer.Option("", "--slice", help="Slice specification (e.g. '0:5')"), summarize_only: bool = typer.Option(False, "--summarize-only", help="Skip evaluation; just read existing results"), image_tag: str = typer.Option( - "task_cleanroom", + "task_cleanroom_v6", "--image-tag", help="Docker image tag to evaluate submissions in. Defaults to the " "artifact-free cleanroom image so submissions can't rely on build " "artifacts leaked into the full :task build environment. Pass " - "--image-tag task to use the full build environment instead.", + "--image-tag task_v6 to use the full build environment instead.", ), output: str = typer.Option( "", diff --git a/src/programbench/eval/eval.py b/src/programbench/eval/eval.py index 14021fe..ba1f692 100644 --- a/src/programbench/eval/eval.py +++ b/src/programbench/eval/eval.py @@ -290,7 +290,7 @@ def __init__( submission_archive: Path | None = None, blob_dir: Path | None = None, remove_hashes: list[str] | None = None, - image_tag: str = "task_cleanroom", + image_tag: str = "task_cleanroom_v6", from_existing: EvaluationResult | None = None, instance_id: str = "", docker_cpus: int = DOCKER_CPUS, diff --git a/src/programbench/eval/eval_batch.py b/src/programbench/eval/eval_batch.py index 0a3ae12..5f412ce 100644 --- a/src/programbench/eval/eval_batch.py +++ b/src/programbench/eval/eval_batch.py @@ -229,7 +229,7 @@ def _evaluate_instance( source_dir: Path, target_dir: Path, force: bool, - image_tag: str = "task_cleanroom", + image_tag: str = "task_cleanroom_v6", docker_cpus: int = DOCKER_CPUS, branch_workers: int = 1, branch_retries: int = 1, @@ -374,7 +374,7 @@ def run_eval_batch( branch_workers: int = 1, docker_cpus: int = DOCKER_CPUS, summarize_only: bool = False, - image_tag: str = "task_cleanroom", + image_tag: str = "task_cleanroom_v6", output: str | Path = "", branch_retries: int = 1, ) -> None: