diff --git a/docs/README.md b/docs/README.md index 0b501f4..0f72496 100644 --- a/docs/README.md +++ b/docs/README.md @@ -7,11 +7,11 @@ ## Inference -Please use the images with tag `task_cleanroom` from `https://hub.docker.com/orgs/programbench/repositories`. +Please use the images with tag `task_cleanroom_v6` from `https://hub.docker.com/orgs/programbench/repositories`. E.g., to solve the task `ffmpeg__ffmpeg.360a402`, use the following image: ``` -https://hub.docker.com/repository/docker/programbench/ffmpeg_1776_ffmpeg.360a402/tags/task_cleanroom/ +https://hub.docker.com/repository/docker/programbench/ffmpeg_1776_ffmpeg.360a402/tags/task_cleanroom_v6/ ``` (the `__` is replaced by `_1776_`). @@ -49,7 +49,7 @@ After following the installation instructions from the [README](../README.md#qui uv run programbench eval /path/to/my-amazing-agent-run ``` -The evaluation will automatically pull all required docker containers (e.g., `ffmpeg_1776_ffmpeg.360a402:task`). +The evaluation will automatically pull all required docker containers (e.g., `ffmpeg_1776_ffmpeg.360a402:task_v6`). > [!TIP] > Test blobs (per-branch test archives) are downloaded on demand from HuggingFace during evaluation. diff --git a/src/programbench/cli/main.py b/src/programbench/cli/main.py index 1b8d94d..9d08efb 100644 --- a/src/programbench/cli/main.py +++ b/src/programbench/cli/main.py @@ -55,12 +55,12 @@ def eval( slice_spec: str = typer.Option("", "--slice", help="Slice specification (e.g. '0:5')"), summarize_only: bool = typer.Option(False, "--summarize-only", help="Skip evaluation; just read existing results"), image_tag: str = typer.Option( - "task_cleanroom", + "task_cleanroom_v6", "--image-tag", help="Docker image tag to evaluate submissions in. Defaults to the " "artifact-free cleanroom image so submissions can't rely on build " "artifacts leaked into the full :task build environment. Pass " - "--image-tag task to use the full build environment instead.", + "--image-tag task_v6 to use the full build environment instead.", ), output: str = typer.Option( "", diff --git a/src/programbench/eval/eval.py b/src/programbench/eval/eval.py index 14021fe..ba1f692 100644 --- a/src/programbench/eval/eval.py +++ b/src/programbench/eval/eval.py @@ -290,7 +290,7 @@ def __init__( submission_archive: Path | None = None, blob_dir: Path | None = None, remove_hashes: list[str] | None = None, - image_tag: str = "task_cleanroom", + image_tag: str = "task_cleanroom_v6", from_existing: EvaluationResult | None = None, instance_id: str = "", docker_cpus: int = DOCKER_CPUS, diff --git a/src/programbench/eval/eval_batch.py b/src/programbench/eval/eval_batch.py index 0a3ae12..5f412ce 100644 --- a/src/programbench/eval/eval_batch.py +++ b/src/programbench/eval/eval_batch.py @@ -229,7 +229,7 @@ def _evaluate_instance( source_dir: Path, target_dir: Path, force: bool, - image_tag: str = "task_cleanroom", + image_tag: str = "task_cleanroom_v6", docker_cpus: int = DOCKER_CPUS, branch_workers: int = 1, branch_retries: int = 1, @@ -374,7 +374,7 @@ def run_eval_batch( branch_workers: int = 1, docker_cpus: int = DOCKER_CPUS, summarize_only: bool = False, - image_tag: str = "task_cleanroom", + image_tag: str = "task_cleanroom_v6", output: str | Path = "", branch_retries: int = 1, ) -> None: