Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/programbench/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,14 @@ def eval(
filter_spec: str = typer.Option("", "--filter", help="Filter instance IDs by regex"),
slice_spec: str = typer.Option("", "--slice", help="Slice specification (e.g. '0:5')"),
summarize_only: bool = typer.Option(False, "--summarize-only", help="Skip evaluation; just read existing results"),
- image_tag: str = typer.Option("task", "--image-tag", help="Docker image tag to evaluate"),
+ image_tag: str = typer.Option(
+ "task_cleanroom",
+ "--image-tag",
+ help="Docker image tag to evaluate submissions in. Defaults to the "
+ "artifact-free cleanroom image so submissions can't rely on build "
+ "artifacts leaked into the full :task build environment. Pass "
+ "--image-tag task to use the full build environment instead.",
+ ),
output: str = typer.Option(
"",
"-o",
Expand Down
2 changes: 1 addition & 1 deletion src/programbench/eval/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def __init__(
submission_archive: Path | None = None,
blob_dir: Path | None = None,
remove_hashes: list[str] | None = None,
- image_tag: str = "task",
+ image_tag: str = "task_cleanroom",
from_existing: EvaluationResult | None = None,
instance_id: str = "",
docker_cpus: int = DOCKER_CPUS,
Expand Down
4 changes: 2 additions & 2 deletions src/programbench/eval/eval_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def _evaluate_instance(
source_dir: Path,
target_dir: Path,
force: bool,
- image_tag: str = "task",
+ image_tag: str = "task_cleanroom",
docker_cpus: int = DOCKER_CPUS,
branch_workers: int = 1,
branch_retries: int = 1,
Expand Down Expand Up @@ -374,7 +374,7 @@ def run_eval_batch(
branch_workers: int = 1,
docker_cpus: int = DOCKER_CPUS,
summarize_only: bool = False,
- image_tag: str = "task",
+ image_tag: str = "task_cleanroom",
output: str | Path = "",
branch_retries: int = 1,
) -> None:
Expand Down
Loading