Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@

## Inference

Please use the images with tag `task_cleanroom` from `https://hub.docker.com/orgs/programbench/repositories`.
Please use the images with tag `task_cleanroom_v6` from `https://hub.docker.com/orgs/programbench/repositories`.
E.g., to solve the task `ffmpeg__ffmpeg.360a402`, use the following image:

```
https://hub.docker.com/repository/docker/programbench/ffmpeg_1776_ffmpeg.360a402/tags/task_cleanroom/
https://hub.docker.com/repository/docker/programbench/ffmpeg_1776_ffmpeg.360a402/tags/task_cleanroom_v6/
```

(in the image name, the `__` from the task ID is replaced by `_1776_`).
Expand Down Expand Up @@ -49,7 +49,7 @@ After following the installation instructions from the [README](../README.md#qui
uv run programbench eval /path/to/my-amazing-agent-run
```

The evaluation will automatically pull all required docker containers (e.g., `ffmpeg_1776_ffmpeg.360a402:task`).
The evaluation will automatically pull all required docker containers (e.g., `ffmpeg_1776_ffmpeg.360a402:task_v6`).
Comment thread
klieret marked this conversation as resolved.

> [!TIP]
> Test blobs (per-branch test archives) are downloaded on demand from Hugging Face during evaluation.
Expand Down
4 changes: 2 additions & 2 deletions src/programbench/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,12 @@ def eval(
slice_spec: str = typer.Option("", "--slice", help="Slice specification (e.g. '0:5')"),
summarize_only: bool = typer.Option(False, "--summarize-only", help="Skip evaluation; just read existing results"),
image_tag: str = typer.Option(
"task_cleanroom",
"task_cleanroom_v6",
"--image-tag",
help="Docker image tag to evaluate submissions in. Defaults to the "
"artifact-free cleanroom image so submissions can't rely on build "
"artifacts leaked into the full :task build environment. Pass "
"--image-tag task to use the full build environment instead.",
"--image-tag task_v6 to use the full build environment instead.",
Comment thread
klieret marked this conversation as resolved.
),
output: str = typer.Option(
"",
Expand Down
2 changes: 1 addition & 1 deletion src/programbench/eval/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def __init__(
submission_archive: Path | None = None,
blob_dir: Path | None = None,
remove_hashes: list[str] | None = None,
image_tag: str = "task_cleanroom",
image_tag: str = "task_cleanroom_v6",
from_existing: EvaluationResult | None = None,
instance_id: str = "",
docker_cpus: int = DOCKER_CPUS,
Expand Down
4 changes: 2 additions & 2 deletions src/programbench/eval/eval_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def _evaluate_instance(
source_dir: Path,
target_dir: Path,
force: bool,
image_tag: str = "task_cleanroom",
image_tag: str = "task_cleanroom_v6",
docker_cpus: int = DOCKER_CPUS,
branch_workers: int = 1,
branch_retries: int = 1,
Expand Down Expand Up @@ -374,7 +374,7 @@ def run_eval_batch(
branch_workers: int = 1,
docker_cpus: int = DOCKER_CPUS,
summarize_only: bool = False,
image_tag: str = "task_cleanroom",
image_tag: str = "task_cleanroom_v6",
output: str | Path = "",
branch_retries: int = 1,
) -> None:
Expand Down
Loading