From b1e9e94ed5a205144ca130064a7597ceba6b27d3 Mon Sep 17 00:00:00 2001 From: John Yang Date: Tue, 16 Jun 2026 17:42:25 -0700 Subject: [PATCH 01/11] Add `programbench submit` (package / verify / register / recombine) --- src/programbench/cli/main.py | 2 + src/programbench/cli/submit.py | 208 ++++++++++++++++ src/programbench/data/templates/README.md.j2 | 78 ++++++ .../data/templates/submission.yaml.j2 | 28 +++ src/programbench/package.py | 216 +++++++++++++++++ src/programbench/register.py | 157 ++++++++++++ src/programbench/submission.py | 227 ++++++++++++++++++ src/programbench/verify.py | 100 ++++++++ 8 files changed, 1016 insertions(+) create mode 100644 src/programbench/cli/submit.py create mode 100644 src/programbench/data/templates/README.md.j2 create mode 100644 src/programbench/data/templates/submission.yaml.j2 create mode 100644 src/programbench/package.py create mode 100644 src/programbench/register.py create mode 100644 src/programbench/submission.py create mode 100644 src/programbench/verify.py diff --git a/src/programbench/cli/main.py b/src/programbench/cli/main.py index 6a36792..85f20b9 100644 --- a/src/programbench/cli/main.py +++ b/src/programbench/cli/main.py @@ -9,6 +9,7 @@ import typer from programbench.cli.blob import app as blob_app +from programbench.cli.submit import app as submit_app from programbench.constants import DOCKER_CPUS app = typer.Typer( @@ -18,6 +19,7 @@ context_settings={"help_option_names": ["-h", "--help"]}, ) app.add_typer(blob_app, name="blob") +app.add_typer(submit_app, name="submit") @app.callback() diff --git a/src/programbench/cli/submit.py b/src/programbench/cli/submit.py new file mode 100644 index 0000000..0980952 --- /dev/null +++ b/src/programbench/cli/submit.py @@ -0,0 +1,208 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +"""Submission lifecycle commands: package an eval run, verify a submission, recombine eval.json.""" + +from pathlib import Path + +import typer + +app = typer.Typer(no_args_is_help=True, help="Prepare, check, and reassemble leaderboard submissions.") + + +@app.command() +def package( + run_dir: Path = typer.Argument( + ..., help="A `programbench eval` run directory (//submission.tar.gz)." + ), + upload_to: str = typer.Option( + "", + "--upload-to", + metavar="ORG[/DATASET]", + help="Upload submission.tar.gz and the heavy eval.log.json to a HuggingFace dataset, " + "replacing each with a .url + .sha256. A bare org (e.g. 'programbench') creates a " + "per-submission dataset org/; pass 'org/name' to use an exact dataset.", + ), + overwrite: bool = typer.Option( + False, "--overwrite", help="With --upload-to, re-upload files already present on HF (default: skip them)." + ), +) -> None: + """Turn an evaluated run directory into a leaderboard submission, in place. + + Writes a submission.yaml manifest and _stats/score.json, and splits each large + eval.json into a light eval.json (kept) + a heavy .eval.log.json (raw log + + failure text) so the repo stays git-pushable. With --upload-to, the heavy files and + the submission.tar.gz artifacts are uploaded to HuggingFace. System metadata and + trajectories are left as TODO. + + \b + Examples: + programbench submit package output/my-run + programbench submit package output/my-run --upload-to programbench + """ + from rich.console import Console + + from programbench.package import package_run + + result = package_run(run_dir, upload_to=upload_to or None, overwrite=overwrite) + console = Console() + console.print( + f"Packaged [bold]{len(result.packaged)}[/bold] instance(s) in [bold]{result.run_dir}[/bold] " + f"(skipped {len(result.skipped)} unknown). 
" + f"mean_score={result.headline.mean_score * 100:.1f} resolved={result.headline.resolved_pct:.1f}%" + ) + console.print( + "[dim]Each eval.json was split into eval.json + .eval.log.json (recombine with " + "`programbench submit recombine`). Next: fill in submission.yaml + add traj.json files.[/dim]" + ) + + +@app.command() +def verify( + submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."), + tier1: bool = typer.Option( + False, "--tier1", help="Also re-run `programbench eval` and check artifacts reproduce the results (Docker)." + ), + workers: int = typer.Option(1, "-w", "--workers", help="Instance workers for the Tier-1 re-eval."), + filter_spec: str = typer.Option( + "", "--filter", help="Restrict Tier-1 re-eval to instance IDs matching this regex." + ), +) -> None: + """Verify a submission against its own claimed results. + + Tier 0 (default, no Docker) recomputes the headline from the submission's eval.json + files and checks it matches submission.yaml. Tier 1 (--tier1) additionally resolves + each submission.tar.gz and re-runs evaluation to confirm the artifacts reproduce the + reported scores. 
+ + \b + Examples: + programbench submit verify ./their-submission + programbench submit verify ./their-submission --tier1 -w 4 + """ + from rich.console import Console + from rich.table import Table + + from programbench.verify import verify_tier0, verify_tier1 + + result = ( + verify_tier1(submission_dir, workers=workers, filter_spec=filter_spec) + if tier1 + else verify_tier0(submission_dir) + ) + + table = Table(title=f"Tier-{result.tier} verification", box=None) + table.add_column("Check", style="bold") + table.add_column("Claimed", justify="right") + table.add_column("Computed", justify="right") + table.add_column("", justify="center") + for c in result.checks: + table.add_row(c.name, str(c.claimed), str(c.computed), "✅" if c.ok else "❌") + console = Console() + console.print(table) + if result.ok: + console.print("[bold green]PASS[/bold green] — submission is consistent with its reported results.") + else: + console.print("[bold red]FAIL[/bold red] — discrepancies found above.") + raise typer.Exit(1) + + +@app.command() +def register( + submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."), + registry: str = typer.Option( + "", "--registry", help="Registry repo to PR against (default: ProgramBench/submissions)." + ), + source: str = typer.Option( + "", "--source", help="Public URL of this submission's repo (default: autodetected from its git remote)." + ), + commit: str = typer.Option( + "", "--commit", help="Commit SHA that was scored (default: autodetected from its git HEAD)." + ), + dry_run: bool = typer.Option( + False, "--dry-run", help="Build the registry entry locally and print the plan; touch no network." + ), + verify: bool = typer.Option( + True, "--verify/--no-verify", help="Run a Tier-0 verify gate before registering (default: on)." + ), +) -> None: + """Register a packaged submission on the leaderboard by opening a PR to the registry. 
+ + The PR adds a small submissions// entry: a pointer.yaml (the submission repo URL + + the exact commit scored) plus the submission.yaml and _stats/ copied from this run. The + source URL and commit are read from the run directory's own git remote/HEAD. With `gh` + installed the registry is forked and the PR opened for you; otherwise the entry is left + committed on a branch and the steps to push + open the PR are printed. + + \b + Examples: + programbench submit register ./my-run --dry-run + programbench submit register ./my-run + """ + import tempfile + + from rich.console import Console + + from programbench.register import REGISTRY_DEFAULT, build_plan, register_submission, write_entry + + console = Console() + registry = registry or REGISTRY_DEFAULT + + if verify: + from programbench.verify import verify_tier0 + + if not verify_tier0(submission_dir).ok: + console.print( + "[bold red]FAIL[/bold red] — Tier-0 verification failed; fix the submission (or pass " + "--no-verify) before registering. Run `programbench submit verify .` to see the mismatch." + ) + raise typer.Exit(1) + + plan = build_plan(submission_dir, registry) + if source: + plan.source = source + if commit: + plan.commit = commit + + if dry_run: + with tempfile.TemporaryDirectory() as tmp: + entry = write_entry(plan, submission_dir, Path(tmp)) + files = sorted(str(p.relative_to(entry)) for p in entry.rglob("*") if p.is_file()) + console.print(f"[bold]Would register[/bold] [cyan]{plan.submission_id}[/cyan] to {plan.registry}") + console.print(f" branch: {plan.branch}") + console.print(f" source: {plan.source}\n commit: {plan.commit}") + console.print(" files: " + ", ".join(f"submissions/{plan.submission_id}/{f}" for f in files)) + console.print(f"\n[dim]pointer.yaml:[/dim]\n{plan.pointer.rstrip()}") + console.print(f"\n[dim]PR title:[/dim] {plan.title}\n[dim]PR body:[/dim]\n{plan.body}") + console.print("\n[dim]Dry run — nothing cloned, pushed, or opened. 
Drop --dry-run to register.[/dim]") + return + + result = register_submission(submission_dir, registry) + if result.pr_url: + console.print(f"[bold green]Opened PR[/bold green] for {plan.submission_id}: {result.pr_url}") + else: + console.print(f"[bold]Prepared[/bold] registry entry for {plan.submission_id}.\n{result.next_steps}") + + +@app.command() +def recombine( + run_dir: Path = typer.Argument(..., help="A packaged run/submission directory."), +) -> None: + """Reverse `package`'s eval split: fold each .eval.log.json back into its + eval.json, restoring the original full eval output. + + The heavy file is read locally, or downloaded from its .url if it was uploaded to HF. + + \b + Examples: + programbench submit recombine ./their-submission + """ + from rich.console import Console + + from programbench.submission import recombine_eval_json + + n = sum(recombine_eval_json(d, d.name) for d in sorted(p for p in run_dir.iterdir() if p.is_dir())) + Console().print(f"Recombined [bold]{n}[/bold] eval.json file(s) in {run_dir}") diff --git a/src/programbench/data/templates/README.md.j2 b/src/programbench/data/templates/README.md.j2 new file mode 100644 index 0000000..9e6d1bb --- /dev/null +++ b/src/programbench/data/templates/README.md.j2 @@ -0,0 +1,78 @@ +

+ ProgramBench +

+ +> A submission to the **[ProgramBench](https://programbench.com)** leaderboard — *can language models rebuild programs from scratch?* · [Leaderboard](https://programbench.com) · [How to submit](https://programbench.com/blog/submission-guide) + +# [Submission Name Here] + + + +## System overview + + + +## Reproducing this run + + + +```bash +# 1. install the agent / dependencies +# 2. run inference per task (no internet, per the eval protocol) +# 3. programbench eval +# 4. programbench submit package <run_dir> --upload-to <org> +``` + +## Extra stats (optional) + +The leaderboard can show stats beyond `score` — e.g. cost or model calls. These are +**optional**, and each must be **computed by a script that reads your trajectories**, not +entered by hand: the number has to be recoverable from the run. `programbench` ships no +calculators (it makes no assumptions about your scaffold) — write your own that reads each +`traj.json` and emits a flat `{instance_id: value}` map to `_stats/<stat>.json`, and ship +the script here (e.g. under `_scripts/`) so the numbers are reproducible. 
+ +## Links + + + +## Submission checklist + +- [ ] Ran `programbench eval` → `programbench submit package` to produce this submission +- [ ] Filled in every `submission.yaml` field (no `TODO` left), including `is_os_model` / `is_os_scaffold` +- [ ] Trajectories (`traj.json`) included for every task (agent submissions) +- [ ] Solutions present — inline `submission.tar.gz`, or a hosted `submission.tar.gz.url` + `.sha256` +- [ ] Any extra stats (cost/calls) were produced by a trajectory-reading script shipped here, not hand-written +- [ ] Filled in the System overview and Reproducing sections above +- [ ] `programbench submit verify .` passes +- [ ] Made this fork public +- [ ] Opened a registration PR to the submissions repo + +## Integrity attestations + +- [ ] Solutions were produced **only** from behavioral observation of the binary and its + bundled docs — no source code, repositories, mirrors, or package registries were consulted +- [ ] The model was not given internet access during evaluation +- [ ] The model did not have access to any unit tests during evaluation +- [ ] I consent to re-evaluation, and to flagging or removal if it contradicts the reported results + +## Auditing + +Anyone can independently check this submission with the following instructions: + +```bash +git clone <url-of-this-submission-repo> +cd {{ submission_id }} +uvx programbench submit verify . # Tier-0: recompute the score from this repo's eval.json and check it matches submission.yaml (instant, offline) +uvx programbench submit verify . --tier1 # Tier-1: download each submission.tar.gz from HuggingFace, re-run evaluation, and confirm it reproduces the score (Docker) +``` + +* Tier-0 is self-contained. It reads the per-instance `eval.json` here plus the bundled test +metadata. +* Tier-1 additionally fetches the hosted solutions and the hidden tests and re-runs +them, so the reported `score` is reproduced from scratch. (Cost/calls are self-reported from +the trajectories; only `score` is independently re-verifiable.) 
diff --git a/src/programbench/data/templates/submission.yaml.j2 b/src/programbench/data/templates/submission.yaml.j2 new file mode 100644 index 0000000..061353d --- /dev/null +++ b/src/programbench/data/templates/submission.yaml.j2 @@ -0,0 +1,28 @@ +# Generated by `programbench submit package` from: {{ run_dir }} +# [auto] fields are recomputed on every `package`; all other fields are preserved. +schema_version: 1 + +submission_id: {{ submission_id | tojson }} +submitter: + name: {{ submitter_name | tojson }} + contact: {{ submitter_contact | tojson }} # email or @github + affiliation: {{ affiliation | tojson }} + +system: + agent: {{ agent | tojson }} # scaffold/harness; "none" for a pure human submission + description_url: {{ description_url | tojson }} + is_os_model: {{ is_os_model | tojson }} # true if the model's weights are openly available + is_os_scaffold: {{ is_os_scaffold | tojson }} # true if the agent/scaffold is open source + model: {{ model | tojson }} # display name used on the leaderboard + provider: {{ provider | tojson }} + type: {{ system_type | tojson }} # single-agent | multi-agent | other + +eval: + programbench_version: {{ programbench_version | tojson }} # [auto] + +headline: # [auto] score summary from evaluation; other stats live in _stats/ + mean_score: {{ mean_score }} + resolved_pct: {{ resolved_pct }} + near_resolved_pct: {{ near_resolved_pct }} + n_instances_attempted: {{ n_attempted }} + n_instances_total: {{ n_total }} diff --git a/src/programbench/package.py b/src/programbench/package.py new file mode 100644 index 0000000..a86bcea --- /dev/null +++ b/src/programbench/package.py @@ -0,0 +1,216 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Turn a ``programbench eval`` run directory into a leaderboard submission, in place. + +Packaging is purely eval-derived. 
It writes: + +- ``_stats/score.json`` — per-instance, per-test pass/fail (the one stat from evaluation), +- ``submission.yaml`` — the manifest, with ``[auto]`` score fields recomputed and any + author-entered fields preserved across re-packaging, + +and splits each ``.eval.json`` into a light eval.json + a heavy ``.eval.log.json`` +(the raw log + failure text) so the run repo stays git-pushable; the two recombine to the +original via ``programbench submit recombine``. With ``--upload-to`` the heavy files and the +``submission.tar.gz`` artifacts go to a HuggingFace dataset (replaced by ``.url`` + ``.sha256``). + +Other stats (cost, calls, …) are optional and come from the agent trajectories via scripts +the submitter writes — this command produces none of them, and makes no assumptions about +the scaffold. The run directory stays a valid input to ``programbench eval``. +""" + +import logging +import os +import shutil +import tempfile +from dataclasses import dataclass +from importlib.metadata import version +from pathlib import Path + +import yaml +from jinja2 import Environment, PackageLoader + +from programbench.submission import ( + Headline, + aggregate, + benchmark_instances, + score_from_tests, + sha256_file, + split_eval_json, + test_results_map, + write_stat, +) + +log = logging.getLogger(__name__) + +TODO = "TODO" + +# Author-entered manifest fields preserved across re-packaging: template var -> (path, default). 
+_CARRIED = { + "affiliation": ("submitter.affiliation", ""), + "agent": ("system.agent", TODO), + "description_url": ("system.description_url", "README.md"), + "is_os_model": ("system.is_os_model", False), + "is_os_scaffold": ("system.is_os_scaffold", False), + "model": ("system.model", TODO), + "provider": ("system.provider", TODO), + "submitter_contact": ("submitter.contact", TODO), + "submitter_name": ("submitter.name", TODO), + "system_type": ("system.type", "single-agent"), +} + + +@dataclass +class PackageResult: + run_dir: Path + packaged: list[str] + skipped: list[str] + headline: Headline + + +def _dig(d: dict, dotted: str): + for key in dotted.split("."): + if not isinstance(d, dict): + return None + d = d.get(key) + return d + + +def _carried_values(run_dir: Path) -> dict: + manifest_path = run_dir / "submission.yaml" + existing = yaml.safe_load(manifest_path.read_text()) if manifest_path.exists() else {} + # Use "is None" (not "or") so a real False/empty value is preserved, not clobbered. + return { + var: (default if (val := _dig(existing, path)) is None else val) for var, (path, default) in _CARRIED.items() + } + + +def _upload_artifacts( + api, dataset: str, pending: list[tuple[Path, str, str]], existing: set[str], overwrite: bool +) -> None: + """Upload all pending files to HF, then replace each with a .url + .sha256 and delete it. + + ``pending`` is (instance_dir, instance_id, filename) — submission.tar.gz and the heavy + .eval.log.json. Files already on HF are skipped unless ``overwrite``. Uses + ``upload_large_folder`` (resumable, multi-commit, retrying) since logs can be hundreds + of MB and a single big commit is fragile; files are hard-linked into a staging tree so + nothing is copied. 
+ """ + for instance_dir, iid, fname in pending: + (instance_dir / f"{fname}.sha256").write_text(sha256_file(instance_dir / fname) + "\n") + to_upload = [(d, iid, f) for d, iid, f in pending if overwrite or f"{iid}/{f}" not in existing] + if to_upload: + run_dir = pending[0][0].parent + with tempfile.TemporaryDirectory(dir=run_dir) as tmp: + staging = Path(tmp) + for instance_dir, iid, fname in to_upload: + dst = staging / iid / fname + dst.parent.mkdir(parents=True, exist_ok=True) + try: + os.link(instance_dir / fname, dst) # same-fs hardlink: no copy + except OSError: + shutil.copy2(instance_dir / fname, dst) + log.info("Uploading %d file(s) to %s (resumable)", len(to_upload), dataset) + api.upload_large_folder(repo_id=dataset, folder_path=str(staging), repo_type="dataset") + for instance_dir, iid, fname in pending: + (instance_dir / f"{fname}.url").write_text( + f"https://huggingface.co/datasets/{dataset}/resolve/main/{iid}/{fname}\n" + ) + (instance_dir / fname).unlink() + + +def package_run(run_dir: Path, upload_to: str | None = None, overwrite: bool = False) -> PackageResult: + instances = benchmark_instances() + run_name = run_dir.resolve().name + + api = dataset = None + existing: set[str] = set() + if upload_to: + # Each submission gets its own dataset: bare "org" -> "org/"; + # an explicit "org/name" is used as-is. + dataset = upload_to if "/" in upload_to else f"{upload_to}/{run_name}" + from huggingface_hub import HfApi + + api = HfApi() + api.create_repo(dataset, repo_type="dataset", exist_ok=True) + # Force public so `verify`/`recombine` can fetch the artifacts anonymously + # (orgs may default new datasets to private). 
+ api.update_repo_settings(dataset, repo_type="dataset", private=False) + existing = set(api.list_repo_files(dataset, repo_type="dataset")) + + test_maps: dict[str, dict[str, bool]] = {} + packaged: list[str] = [] + skipped: list[str] = [] + pending: list[tuple[Path, str, str]] = [] + for instance_dir in sorted(d for d in run_dir.iterdir() if d.is_dir()): + iid = instance_dir.name + eval_json = instance_dir / f"{iid}.eval.json" + has_solution = (instance_dir / "submission.tar.gz").exists() or ( + instance_dir / "submission.tar.gz.url" + ).exists() + if not (eval_json.exists() and has_solution): + continue + if iid not in instances: + log.warning("Skipping %s (not a known ProgramBench instance)", iid) + skipped.append(iid) + continue + test_maps[iid] = test_results_map(eval_json, instances[iid]) + # Split the (potentially huge) eval.json into a light eval.json + a heavy + # .eval.log.json (log + failure text); they recombine to the original. + split_eval_json(instance_dir, iid) + if api: + for fname in (f"{iid}.eval.log.json", "submission.tar.gz"): + if (instance_dir / fname).exists(): + pending.append((instance_dir, iid, fname)) + packaged.append(iid) + + if not packaged: + raise ValueError(f"No packageable instances found under {run_dir}") + + # Write the scoring-derived artifacts first; they don't depend on the upload, so a + # failed/throttled upload leaves them correct and the run simply resumable. + # score.json is per-test ({iid: {test: passed}}) so scores can be recomputed later + # while striking out specific tests; the manifest headline is the score with no + # tests struck. 
+ write_stat(run_dir, "score", test_maps) + scores = {iid: score_from_tests(m) for iid, m in test_maps.items()} + headline = aggregate(scores, len(instances)) + + carried = _carried_values(run_dir) + env = Environment(loader=PackageLoader("programbench", "data/templates"), autoescape=False) + (run_dir / "submission.yaml").write_text( + env.get_template("submission.yaml.j2").render( + run_dir=run_dir, + submission_id=run_dir.resolve().name, + programbench_version=version("programbench"), + mean_score=headline.mean_score, + resolved_pct=headline.resolved_pct, + near_resolved_pct=headline.near_resolved_pct, + n_attempted=headline.n_instances_attempted, + n_total=headline.n_instances_total, + **carried, + ) + + "\n" + ) + + # README is created once (a starting point for the author); never overwritten. + readme = run_dir / "README.md" + if not readme.exists(): + readme.write_text( + env.get_template("README.md.j2").render( + submission_id=run_dir.resolve().name, + mean_pct=round(headline.mean_score * 100, 1), + resolved_pct=headline.resolved_pct, + n_attempted=headline.n_instances_attempted, + n_total=headline.n_instances_total, + **carried, + ) + ) + + if api and pending: + _upload_artifacts(api, dataset, pending, existing, overwrite) + + return PackageResult(run_dir, packaged, skipped, headline) diff --git a/src/programbench/register.py b/src/programbench/register.py new file mode 100644 index 0000000..b5a4cb2 --- /dev/null +++ b/src/programbench/register.py @@ -0,0 +1,157 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Register a packaged submission into the leaderboard registry by opening a PR. + +A registry entry is small and self-contained: a pointer to the submission's own public +repo, plus the manifest and stat files copied out of it. 
+ + submissions// + pointer.yaml # source repo URL + the exact commit that was scored + submission.yaml # copied from the submission + _stats/*.json # copied from the submission + +This builds that entry against a clone of the registry (default +github.com/ProgramBench/submissions) and opens the PR. With ``gh`` it forks the registry +and opens the PR for you; without it, it leaves the commit on a branch in a clone and +prints the compare URL so you can open the PR by hand. +""" + +import shutil +import subprocess +import tempfile +from dataclasses import dataclass +from pathlib import Path + +import yaml + +REGISTRY_DEFAULT = "https://github.com/ProgramBench/submissions" + + +def _git(cwd: Path, *args: str) -> str: + return subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True).stdout.strip() + + +def _to_https(url: str) -> str: + """A git remote (``git@host:owner/repo.git`` or ``https://…``) as a browsable https URL.""" + url = url.removesuffix(".git") + if url.startswith("git@"): + host, path = url[4:].split(":", 1) + return f"https://{host}/{path}" + return url + + +def _slug(registry: str) -> str: + """``https://github.com/Owner/Repo`` -> ``Owner/Repo`` (what ``gh`` expects).""" + return _to_https(registry).removeprefix("https://github.com/") + + +@dataclass +class RegisterPlan: + submission_id: str + source: str + commit: str + registry: str + branch: str + pointer: str # rendered pointer.yaml + files: list[str] # entry-relative paths that will be added + title: str + body: str + + +@dataclass +class RegisterResult: + plan: RegisterPlan + pr_url: str | None # set when a PR was opened (gh path) + next_steps: str | None # set when manual steps remain (no-gh path) + + +def build_plan(submission_dir: Path, registry: str) -> RegisterPlan: + sub_id = submission_dir.resolve().name + manifest = yaml.safe_load((submission_dir / "submission.yaml").read_text()) + source = _to_https(_git(submission_dir, "remote", "get-url", "origin")) + commit 
= _git(submission_dir, "rev-parse", "HEAD") + pointer = yaml.safe_dump({"submission_id": sub_id, "source": source, "commit": commit}, sort_keys=False) + files = ["pointer.yaml", "submission.yaml"] + [ + f"_stats/{p.name}" for p in sorted((submission_dir / "_stats").glob("*.json")) + ] + system, head = manifest["system"], manifest["headline"] + body = ( + f"Registers **{system['model']}** ({system['provider']}) + {system['agent']}.\n\n" + f"- mean score: {head['mean_score'] * 100:.1f}\n" + f"- resolved: {head['resolved_pct']:.1f}% / near-resolved: {head['near_resolved_pct']:.1f}%\n" + f"- instances: {head['n_instances_attempted']}/{head['n_instances_total']}\n\n" + f"Source: {source}\nCommit: `{commit}`\n\n" + "Tier-0 verified (`programbench submit verify .`)." + ) + return RegisterPlan( + sub_id, source, commit, registry, f"add-{sub_id}", pointer, files, f"Add submission: {sub_id}", body + ) + + +def write_entry(plan: RegisterPlan, submission_dir: Path, registry_root: Path) -> Path: + """Materialize ``submissions//`` under ``registry_root`` (overwriting any existing entry).""" + entry = registry_root / "submissions" / plan.submission_id + if entry.exists(): + shutil.rmtree(entry) + (entry / "_stats").mkdir(parents=True) + (entry / "pointer.yaml").write_text(plan.pointer) + shutil.copyfile(submission_dir / "submission.yaml", entry / "submission.yaml") + for p in sorted((submission_dir / "_stats").glob("*.json")): + shutil.copyfile(p, entry / "_stats" / p.name) + return entry + + +def register_submission(submission_dir: Path, registry: str) -> RegisterResult: + """Clone the registry, commit the entry on a branch, and open the PR. + + Uses ``gh`` (fork + PR) when available, cleaning up its throwaway clone afterward. + Without ``gh`` it leaves the commit on a branch in a kept clone and returns the manual + push + compare-URL steps in ``next_steps`` (so the clone must outlive this call). 
+ """ + plan = build_plan(submission_dir, registry) + slug = _slug(registry) + clone = Path(tempfile.mkdtemp(prefix="programbench-register-")) / "submissions" + + if shutil.which("gh"): + # Fork the registry under the authed user (no-op if it exists) and clone the fork; + # origin -> fork, upstream -> registry. + subprocess.run( + ["gh", "repo", "fork", slug, "--clone", "--default-branch-only", str(clone)], + check=True, + capture_output=True, + text=True, + ) + _git(clone, "checkout", "-b", plan.branch) + write_entry(plan, submission_dir, clone) + _git(clone, "add", f"submissions/{plan.submission_id}") + _git(clone, "commit", "-m", plan.title) + _git(clone, "push", "-u", "origin", plan.branch) + pr_url = subprocess.run( + ["gh", "pr", "create", "--repo", slug, "--title", plan.title, "--body", plan.body], + cwd=clone, + check=True, + capture_output=True, + text=True, + ).stdout.strip() + shutil.rmtree(clone.parent) + return RegisterResult(plan, pr_url, None) + + # No gh: clone the registry directly, commit the branch, and hand back the steps. + _git(clone.parent, "clone", "--depth", "1", _to_https(registry), str(clone)) + _git(clone, "checkout", "-b", plan.branch) + write_entry(plan, submission_dir, clone) + _git(clone, "add", f"submissions/{plan.submission_id}") + _git(clone, "commit", "-m", plan.title) + steps = ( + "`gh` not found, so the PR was not opened. The entry is committed on branch " + f"`{plan.branch}` in:\n {clone}\n\n" + "To finish, from that clone push the branch to your fork of the registry and open a PR:\n" + " git remote add fork https://github.com//submissions\n" + f" git push -u fork {plan.branch}\n" + f" {_to_https(registry)}/compare/main...:{plan.branch}?expand=1" + ) + return RegisterResult(plan, None, steps) diff --git a/src/programbench/submission.py b/src/programbench/submission.py new file mode 100644 index 0000000..71b8b34 --- /dev/null +++ b/src/programbench/submission.py @@ -0,0 +1,227 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Shared helpers for building (`package`) and checking (`verify`) submissions. + +Both commands must score a run directory the same way, so the scoring and headline +aggregation live here and are imported by each command. +""" + +import hashlib +import json +import logging +import shutil +import subprocess +import tarfile +import tempfile +import urllib.request +from dataclasses import asdict, dataclass +from pathlib import Path + +import yaml + +from programbench.eval.eval import EvaluationResult +from programbench.utils.load_data import get_active_branches, get_ignored_tests, load_all_instances + +log = logging.getLogger(__name__) + +RESOLVED_THRESHOLD = 1.0 +NEAR_RESOLVED_THRESHOLD = 0.95 +FIXTURE_PREFIX = "testorg__" + + +def benchmark_instances() -> dict[str, dict]: + """Real benchmark instances, keyed by id (excludes the bundled test fixture).""" + return {i["instance_id"]: i for i in load_all_instances() if not i["instance_id"].startswith(FIXTURE_PREFIX)} + + +def sha256_file(path: Path) -> str: + h = hashlib.sha256() + with path.open("rb") as f: + for chunk in iter(lambda: f.read(1 << 20), b""): + h.update(chunk) + return h.hexdigest() + + +def test_results_map(eval_json: Path, instance: dict) -> dict[str, bool]: + """Per-test pass/fail for one instance, after the same active-branch / ignored-test + filtering as ``info``. Keyed by ``"/"``, value ``True`` iff passed. + + This is the raw material a score is computed from, so the leaderboard can later + recompute scores while striking out specific tests (see the registry's ignore map). 
+ """ + result = EvaluationResult.model_validate_json(eval_json.read_text()) + result = result.for_branches(get_active_branches(instance)).without_ignored(get_ignored_tests(instance)) + return {t.full_name: t.is_resolved for t in result.test_results} + + +def score_from_tests(tests: dict[str, bool], ignore: set[str] = frozenset()) -> float: + """Fraction passed over the non-ignored tests (0.0 if none remain).""" + kept = [passed for name, passed in tests.items() if name not in ignore] + return sum(kept) / len(kept) if kept else 0.0 + + +def score_instance(eval_json: Path, instance: dict) -> float: + """Per-instance score with ignored-branch/test filtering (same logic as `info`).""" + return score_from_tests(test_results_map(eval_json, instance)) + + +def score_run(run_dir: Path, instances: dict[str, dict]) -> dict[str, float]: + """Map instance_id -> score for every /.eval.json present and known.""" + scores: dict[str, float] = {} + for instance_dir in sorted(d for d in run_dir.iterdir() if d.is_dir()): + iid = instance_dir.name + eval_json = instance_dir / f"{iid}.eval.json" + if eval_json.exists() and iid in instances: + scores[iid] = score_instance(eval_json, instances[iid]) + return scores + + +def write_stat(run_dir: Path, stat: str, by_instance: dict[str, float]) -> None: + """Write a per-instance stat file: ``/_stats/.json`` = ``{iid: value}``.""" + (run_dir / "_stats").mkdir(exist_ok=True) + (run_dir / "_stats" / f"{stat}.json").write_text(json.dumps(by_instance, indent=2, sort_keys=True)) + + +_HEAVY_EXTRA_KEYS = ("message", "text") + + +def _full_name(t: dict) -> str: + return f"{t['branch']}/{t['name']}" if t.get("branch") else t["name"] + + +def split_eval_json(instance_dir: Path, iid: str) -> None: + """Split ``.eval.json`` into a light eval.json + a heavy ``.eval.log.json``. + + The heavy file holds the only bulky parts — the top-level ``log`` and each failing + test's ``message``/``text`` — keyed so the two recombine into the exact original. 
+ Nothing is dropped; the union of the two files is the original eval.json. + """ + p = instance_dir / f"{iid}.eval.json" + data = json.loads(p.read_text()) + # Idempotent: if there's nothing heavy left (already split, or genuinely light), do + # nothing — never clobber an existing eval.log.json. + has_heavy = bool(data.get("log")) or any( + k in (t.get("extra") or {}) for t in data.get("test_results", []) for k in _HEAVY_EXTRA_KEYS + ) + if not has_heavy: + return + heavy: dict = {"log": data.get("log") or [], "failures": {}} + for t in data.get("test_results", []): + extra = t.get("extra") or {} + moved = {k: extra.pop(k) for k in _HEAVY_EXTRA_KEYS if k in extra} + if moved: + heavy["failures"][_full_name(t)] = moved + data["log"] = [] + p.write_text(json.dumps(data, indent=2)) + (instance_dir / f"{iid}.eval.log.json").write_text(json.dumps(heavy)) + + +def recombine_eval_json(instance_dir: Path, iid: str) -> bool: + """Inverse of :func:`split_eval_json`: fold the heavy file back into ``.eval.json`` + (restoring the exact original), then remove the heavy file and its ``.url``/``.sha256``. + + The heavy file is read locally, or downloaded from ``.eval.log.json.url`` if hosted. + Returns True if a recombine happened. 
+ """ + light = instance_dir / f"{iid}.eval.json" + log_file = instance_dir / f"{iid}.eval.log.json" + url_file = instance_dir / f"{iid}.eval.log.json.url" + if not light.exists(): + return False + if log_file.exists(): + heavy = json.loads(log_file.read_text()) + elif url_file.exists(): + with urllib.request.urlopen(url_file.read_text().strip()) as r: # noqa: S310 + heavy = json.loads(r.read()) + else: + return False + data = json.loads(light.read_text()) + data["log"] = heavy.get("log", []) + failures = heavy.get("failures", {}) + for t in data.get("test_results", []): + if (name := _full_name(t)) in failures: + t.setdefault("extra", {}).update(failures[name]) + light.write_text(json.dumps(data, indent=2)) + for f in (log_file, url_file, instance_dir / f"{iid}.eval.log.json.sha256"): + f.unlink(missing_ok=True) + return True + + +@dataclass +class Headline: + mean_score: float + resolved_pct: float + near_resolved_pct: float + n_instances_attempted: int + n_instances_total: int + + def as_dict(self) -> dict: + return asdict(self) + + +def aggregate(scores: dict[str, float], n_total: int) -> Headline: + values = list(scores.values()) + if not values: + raise ValueError("No scored instances found") + n = len(values) + # mean is over attempted instances; resolved/near are over the full benchmark + # (an unattempted task counts as unresolved). + return Headline( + mean_score=round(sum(values) / n, 4), + resolved_pct=round(100 * sum(s >= RESOLVED_THRESHOLD for s in values) / n_total, 1), + near_resolved_pct=round(100 * sum(s >= NEAR_RESOLVED_THRESHOLD for s in values) / n_total, 1), + n_instances_attempted=n, + n_instances_total=n_total, + ) + + +def load_manifest(submission_dir: Path) -> dict: + return yaml.safe_load((submission_dir / "submission.yaml").read_text()) + + +def resolve_submission_tar(instance_dir: Path, dest_tar: Path) -> None: + """Materialize an instance's submission.tar.gz into ``dest_tar``, verifying sha256. 
+ + Supports the artifact forms in SPEC.md: inline file, ``.url`` (downloaded), or + ``submission.ref.yaml`` (git checkout packed). The sha256 sidecar, when present, is + enforced for inline/url; for git it is advisory (packing is not byte-reproducible). + """ + sha_file = instance_dir / "submission.tar.gz.sha256" + expected = sha_file.read_text().split()[0] if sha_file.exists() else None + + inline = instance_dir / "submission.tar.gz" + url_file = instance_dir / "submission.tar.gz.url" + ref_file = instance_dir / "submission.ref.yaml" + if inline.exists(): + shutil.copy2(inline, dest_tar) + elif url_file.exists(): + urllib.request.urlretrieve(url_file.read_text().strip(), dest_tar) # noqa: S310 + elif ref_file.exists(): + _pack_git_ref(yaml.safe_load(ref_file.read_text()), dest_tar) + expected = None # git packing is not byte-reproducible; rely on re-eval instead + else: + raise ValueError(f"{instance_dir.name}: no submission.tar.gz, .url, or .ref.yaml found") + + if expected and (got := sha256_file(dest_tar)) != expected: + raise ValueError(f"{instance_dir.name}: sha256 mismatch (expected {expected[:12]}…, got {got[:12]}…)") + + +def _pack_git_ref(ref: dict, dest_tar: Path) -> None: + with tempfile.TemporaryDirectory() as tmp: + src = Path(tmp) / "src" + subprocess.run( + ["git", "clone", "--depth", "1", "--branch", ref["ref"], ref["repo"], str(src)], + check=True, + capture_output=True, + ) + root = src / ref["subpath"] if ref.get("subpath") else src + with tarfile.open(dest_tar, "w:gz") as tar: + for p in sorted(root.rglob("*")): + rel = p.relative_to(root).as_posix() + if rel.split("/", 1)[0] == ".git": + continue + tar.add(p, arcname=rel, recursive=False) diff --git a/src/programbench/verify.py b/src/programbench/verify.py new file mode 100644 index 0000000..3fdfd28 --- /dev/null +++ b/src/programbench/verify.py @@ -0,0 +1,100 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Verify a packaged submission against its own claimed results. + +Tier 0 (default, no Docker): recompute the headline from the submission's own eval.json +files (with ignored-test filtering) and check it matches submission.yaml. This is the +free consistency check a third party or CI can run with only ``programbench`` installed. + +Tier 1 (--tier1, Docker): resolve each submission.tar.gz, re-run ``programbench eval``, +and confirm the freshly produced scores match the submitted eval.json. This is what +proves the artifacts actually yield the reported results. +""" + +import logging +import tempfile +from dataclasses import dataclass +from pathlib import Path + +from programbench.submission import ( + Headline, + aggregate, + benchmark_instances, + load_manifest, + resolve_submission_tar, + score_run, +) + +log = logging.getLogger(__name__) + +TOLERANCE = 0.011 # headline floats are rounded; allow a hair more than the last digit + + +@dataclass +class Check: + name: str + claimed: object + computed: object + ok: bool + + +@dataclass +class VerifyResult: + tier: int + checks: list[Check] + + @property + def ok(self) -> bool: + return all(c.ok for c in self.checks) + + +def _close(a: object, b: object) -> bool: + if a is None: + return False + return abs(float(a) - float(b)) <= TOLERANCE + + +def _headline_checks(claimed: dict, computed: Headline) -> list[Check]: + return [ + Check(name, claimed.get(name), value, _close(claimed.get(name), value)) + for name, value in computed.as_dict().items() + ] + + +def verify_tier0(submission_dir: Path) -> VerifyResult: + manifest = load_manifest(submission_dir) + instances = benchmark_instances() + computed = aggregate(score_run(submission_dir, instances), len(instances)) + return VerifyResult(0, _headline_checks(manifest.get("headline", {}), computed)) + + +def verify_tier1(submission_dir: Path, *, 
workers: int = 1, filter_spec: str = "") -> VerifyResult: + from programbench.eval.eval_batch import run_eval_batch + + instances = benchmark_instances() + sub_root = submission_dir + submitted = score_run(sub_root, instances) + + with tempfile.TemporaryDirectory() as tmp: + run = Path(tmp) + for iid in submitted: + (run / iid).mkdir(parents=True) + resolve_submission_tar(sub_root / iid, run / iid / "submission.tar.gz") + run_eval_batch(sources=[run], workers=workers, filter_spec=filter_spec, force=True) + fresh = score_run(run, instances) + + checks = [ + Check( + iid, + round(submitted[iid], 4), + round(fresh.get(iid, float("nan")), 4), + _close(submitted[iid], fresh.get(iid)), + ) + for iid in submitted + if not filter_spec or iid in fresh + ] + return VerifyResult(1, checks) From ab60e4227f013cf783e5cd69b4eef7cae871f125 Mon Sep 17 00:00:00 2001 From: John Yang Date: Wed, 17 Jun 2026 09:15:42 -0700 Subject: [PATCH 02/11] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/programbench/data/templates/submission.yaml.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/programbench/data/templates/submission.yaml.j2 b/src/programbench/data/templates/submission.yaml.j2 index 061353d..358dead 100644 --- a/src/programbench/data/templates/submission.yaml.j2 +++ b/src/programbench/data/templates/submission.yaml.j2 @@ -1,5 +1,5 @@ -# Generated by `programbench package` from: {{ run_dir }} -# [auto] fields are recomputed on every `package`; all other fields are preserved. +# Generated by `programbench submit package` from: {{ run_dir }} +# [auto] fields are recomputed on every `programbench submit package`; all other fields are preserved. 
schema_version: 1 submission_id: {{ submission_id | tojson }} From 701818a947ae31a22d433a3f6dcdf6527707faf5 Mon Sep 17 00:00:00 2001 From: John Yang Date: Wed, 17 Jun 2026 09:15:56 -0700 Subject: [PATCH 03/11] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/programbench/submission.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/programbench/submission.py b/src/programbench/submission.py index 71b8b34..6bf3edd 100644 --- a/src/programbench/submission.py +++ b/src/programbench/submission.py @@ -80,7 +80,7 @@ def score_run(run_dir: Path, instances: dict[str, dict]) -> dict[str, float]: return scores -def write_stat(run_dir: Path, stat: str, by_instance: dict[str, float]) -> None: +def write_stat(run_dir: Path, stat: str, by_instance: dict[str, object]) -> None: """Write a per-instance stat file: ``/_stats/.json`` = ``{iid: value}``.""" (run_dir / "_stats").mkdir(exist_ok=True) (run_dir / "_stats" / f"{stat}.json").write_text(json.dumps(by_instance, indent=2, sort_keys=True)) From af7c74e533d8f7e4ae4c9e70a21c5afaf9884924 Mon Sep 17 00:00:00 2001 From: John Yang Date: Wed, 17 Jun 2026 09:17:07 -0700 Subject: [PATCH 04/11] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/programbench/submission.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/programbench/submission.py b/src/programbench/submission.py index 6bf3edd..f778c87 100644 --- a/src/programbench/submission.py +++ b/src/programbench/submission.py @@ -186,8 +186,7 @@ def load_manifest(submission_dir: Path) -> dict: def resolve_submission_tar(instance_dir: Path, dest_tar: Path) -> None: """Materialize an instance's submission.tar.gz into ``dest_tar``, verifying sha256. 
- Supports the artifact forms in SPEC.md: inline file, ``.url`` (downloaded), or - ``submission.ref.yaml`` (git checkout packed). The sha256 sidecar, when present, is + Supports the artifact forms: inline file, ``.url`` (downloaded), or ``submission.ref.yaml`` (git checkout packed). enforced for inline/url; for git it is advisory (packing is not byte-reproducible). """ sha_file = instance_dir / "submission.tar.gz.sha256" From 1106d337bba3e47a6e1557bef29b1a97c326935a Mon Sep 17 00:00:00 2001 From: John Yang Date: Wed, 17 Jun 2026 09:37:06 -0700 Subject: [PATCH 05/11] Address review: make submit register --source/--commit take effect; fix Tier-1 verify - register: thread --source/--commit through build_plan/register_submission so they actually change pointer.yaml + PR body (previously no-ops). - verify: guard _close against None on either side (Tier-1 no longer crashes when a re-eval produces no fresh score); filter Tier-1 checks by the same regex as the re-eval and report missing scores as NaN/fail instead of silently skipping them. - submission: repair resolve_submission_tar docstring left dangling by the SPEC.md edit. 
--- src/programbench/cli/submit.py | 13 ++++--------- src/programbench/register.py | 16 +++++++++++----- src/programbench/submission.py | 3 ++- src/programbench/verify.py | 12 ++++++++---- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/src/programbench/cli/submit.py b/src/programbench/cli/submit.py index 0980952..c3bce27 100644 --- a/src/programbench/cli/submit.py +++ b/src/programbench/cli/submit.py @@ -161,13 +161,8 @@ def register( ) raise typer.Exit(1) - plan = build_plan(submission_dir, registry) - if source: - plan.source = source - if commit: - plan.commit = commit - if dry_run: + plan = build_plan(submission_dir, registry, source or None, commit or None) with tempfile.TemporaryDirectory() as tmp: entry = write_entry(plan, submission_dir, Path(tmp)) files = sorted(str(p.relative_to(entry)) for p in entry.rglob("*") if p.is_file()) @@ -180,11 +175,11 @@ def register( console.print("\n[dim]Dry run — nothing cloned, pushed, or opened. Drop --dry-run to register.[/dim]") return - result = register_submission(submission_dir, registry) + result = register_submission(submission_dir, registry, source or None, commit or None) if result.pr_url: - console.print(f"[bold green]Opened PR[/bold green] for {plan.submission_id}: {result.pr_url}") + console.print(f"[bold green]Opened PR[/bold green] for {result.plan.submission_id}: {result.pr_url}") else: - console.print(f"[bold]Prepared[/bold] registry entry for {plan.submission_id}.\n{result.next_steps}") + console.print(f"[bold]Prepared[/bold] registry entry for {result.plan.submission_id}.\n{result.next_steps}") @app.command() diff --git a/src/programbench/register.py b/src/programbench/register.py index b5a4cb2..da119f8 100644 --- a/src/programbench/register.py +++ b/src/programbench/register.py @@ -69,11 +69,15 @@ class RegisterResult: next_steps: str | None # set when manual steps remain (no-gh path) -def build_plan(submission_dir: Path, registry: str) -> RegisterPlan: +def build_plan( + 
submission_dir: Path, registry: str, source: str | None = None, commit: str | None = None +) -> RegisterPlan: sub_id = submission_dir.resolve().name manifest = yaml.safe_load((submission_dir / "submission.yaml").read_text()) - source = _to_https(_git(submission_dir, "remote", "get-url", "origin")) - commit = _git(submission_dir, "rev-parse", "HEAD") + # Overrides win; otherwise autodetect from the submission's own git remote/HEAD. The + # autodetect calls are skipped (short-circuited) when an override is supplied. + source = source or _to_https(_git(submission_dir, "remote", "get-url", "origin")) + commit = commit or _git(submission_dir, "rev-parse", "HEAD") pointer = yaml.safe_dump({"submission_id": sub_id, "source": source, "commit": commit}, sort_keys=False) files = ["pointer.yaml", "submission.yaml"] + [ f"_stats/{p.name}" for p in sorted((submission_dir / "_stats").glob("*.json")) @@ -105,14 +109,16 @@ def write_entry(plan: RegisterPlan, submission_dir: Path, registry_root: Path) - return entry -def register_submission(submission_dir: Path, registry: str) -> RegisterResult: +def register_submission( + submission_dir: Path, registry: str, source: str | None = None, commit: str | None = None +) -> RegisterResult: """Clone the registry, commit the entry on a branch, and open the PR. Uses ``gh`` (fork + PR) when available, cleaning up its throwaway clone afterward. Without ``gh`` it leaves the commit on a branch in a kept clone and returns the manual push + compare-URL steps in ``next_steps`` (so the clone must outlive this call). 
""" - plan = build_plan(submission_dir, registry) + plan = build_plan(submission_dir, registry, source, commit) slug = _slug(registry) clone = Path(tempfile.mkdtemp(prefix="programbench-register-")) / "submissions" diff --git a/src/programbench/submission.py b/src/programbench/submission.py index f778c87..db6f4b7 100644 --- a/src/programbench/submission.py +++ b/src/programbench/submission.py @@ -186,7 +186,8 @@ def load_manifest(submission_dir: Path) -> dict: def resolve_submission_tar(instance_dir: Path, dest_tar: Path) -> None: """Materialize an instance's submission.tar.gz into ``dest_tar``, verifying sha256. - Supports the artifact forms: inline file, ``.url`` (downloaded), or ``submission.ref.yaml`` (git checkout packed). + Supports three artifact forms: inline file, ``.url`` (downloaded), or + ``submission.ref.yaml`` (git checkout packed). The sha256 sidecar, when present, is enforced for inline/url; for git it is advisory (packing is not byte-reproducible). """ sha_file = instance_dir / "submission.tar.gz.sha256" diff --git a/src/programbench/verify.py b/src/programbench/verify.py index 3fdfd28..aa7bb7a 100644 --- a/src/programbench/verify.py +++ b/src/programbench/verify.py @@ -16,6 +16,7 @@ """ import logging +import re import tempfile from dataclasses import dataclass from pathlib import Path @@ -53,7 +54,7 @@ def ok(self) -> bool: def _close(a: object, b: object) -> bool: - if a is None: + if a is None or b is None: return False return abs(float(a) - float(b)) <= TOLERANCE @@ -87,14 +88,17 @@ def verify_tier1(submission_dir: Path, *, workers: int = 1, filter_spec: str = " run_eval_batch(sources=[run], workers=workers, filter_spec=filter_spec, force=True) fresh = score_run(run, instances) + # Same regex semantics as the re-eval filter (instance_filters.filter_instances), so a + # filtered-in instance that produced no fresh score is reported as a failure (NaN), not + # silently skipped. 
+ targets = [iid for iid in submitted if not filter_spec or re.match(filter_spec, iid)] checks = [ Check( iid, round(submitted[iid], 4), - round(fresh.get(iid, float("nan")), 4), + round(fresh[iid], 4) if iid in fresh else float("nan"), _close(submitted[iid], fresh.get(iid)), ) - for iid in submitted - if not filter_spec or iid in fresh + for iid in targets ] return VerifyResult(1, checks) From 398dc3a60804a6f68df27f8535ad755dae382efc Mon Sep 17 00:00:00 2001 From: John Yang Date: Wed, 17 Jun 2026 09:42:14 -0700 Subject: [PATCH 06/11] Address review (remaining): tighten verify tolerance, check recombine downloads, add submit CLI tests - verify: TOLERANCE 0.011 -> 1e-6 (Tier-0 recomputes deterministically, so this only absorbs float noise; real drift now fails). Verified Tier-0 still passes on a real run. - submission: recombine verifies a downloaded eval.log.json against its .sha256 sidecar; soften split/recombine docstrings (lossless / semantically identical, not byte-for-byte). - tests: add submit --help, submit package --help, submit register --help smoke tests. --- src/programbench/submission.py | 17 ++++++++++++----- src/programbench/verify.py | 5 ++++- tests/test_cli.py | 18 ++++++++++++++++++ 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/programbench/submission.py b/src/programbench/submission.py index db6f4b7..18430b1 100644 --- a/src/programbench/submission.py +++ b/src/programbench/submission.py @@ -97,8 +97,9 @@ def split_eval_json(instance_dir: Path, iid: str) -> None: """Split ``.eval.json`` into a light eval.json + a heavy ``.eval.log.json``. The heavy file holds the only bulky parts — the top-level ``log`` and each failing - test's ``message``/``text`` — keyed so the two recombine into the exact original. - Nothing is dropped; the union of the two files is the original eval.json. + test's ``message``/``text`` — keyed so the two recombine losslessly. 
Nothing is dropped; + the union of the two files holds everything in the original eval.json (the rebuilt file + is semantically identical, though not necessarily byte-for-byte). """ p = instance_dir / f"{iid}.eval.json" data = json.loads(p.read_text()) @@ -122,9 +123,11 @@ def split_eval_json(instance_dir: Path, iid: str) -> None: def recombine_eval_json(instance_dir: Path, iid: str) -> bool: """Inverse of :func:`split_eval_json`: fold the heavy file back into ``.eval.json`` - (restoring the exact original), then remove the heavy file and its ``.url``/``.sha256``. + (restoring the full eval output losslessly), then remove the heavy file and its + ``.url``/``.sha256``. - The heavy file is read locally, or downloaded from ``.eval.log.json.url`` if hosted. + The heavy file is read locally, or downloaded from ``.eval.log.json.url`` if hosted; + a downloaded file is checked against its ``.sha256`` sidecar when one is present. Returns True if a recombine happened. """ light = instance_dir / f"{iid}.eval.json" @@ -136,7 +139,11 @@ def recombine_eval_json(instance_dir: Path, iid: str) -> bool: heavy = json.loads(log_file.read_text()) elif url_file.exists(): with urllib.request.urlopen(url_file.read_text().strip()) as r: # noqa: S310 - heavy = json.loads(r.read()) + raw = r.read() + sha_file = instance_dir / f"{iid}.eval.log.json.sha256" + if sha_file.exists() and (got := hashlib.sha256(raw).hexdigest()) != sha_file.read_text().split()[0]: + raise ValueError(f"{iid}: eval.log.json sha256 mismatch on download (got {got[:12]}…)") + heavy = json.loads(raw) else: return False data = json.loads(light.read_text()) diff --git a/src/programbench/verify.py b/src/programbench/verify.py index aa7bb7a..8cdda87 100644 --- a/src/programbench/verify.py +++ b/src/programbench/verify.py @@ -32,7 +32,10 @@ log = logging.getLogger(__name__) -TOLERANCE = 0.011 # headline floats are rounded; allow a hair more than the last digit +# Tier-0 recomputes the headline from the same eval.json with the 
same deterministic +# rounding `package` used, so a consistent submission matches exactly. The epsilon only +# absorbs float representation noise; any real drift (>= the rounding granularity) fails. +TOLERANCE = 1e-6 @dataclass diff --git a/tests/test_cli.py b/tests/test_cli.py index 594bd9d..904ec59 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -37,3 +37,21 @@ def test_blob_sync_help(): result = runner.invoke(app, ["blob", "sync", "--help"]) assert result.exit_code == 0 assert "instance" in result.output.lower() + + +def test_submit_help(): + result = runner.invoke(app, ["submit", "--help"]) + assert result.exit_code == 0 + assert all(cmd in result.output for cmd in ("package", "verify", "register", "recombine")) + + +def test_submit_package_help(): + result = runner.invoke(app, ["submit", "package", "--help"]) + assert result.exit_code == 0 + assert "upload" in result.output.lower() + + +def test_submit_register_help(): + result = runner.invoke(app, ["submit", "register", "--help"]) + assert result.exit_code == 0 + assert "registry" in result.output.lower() From 623d47eb77eb4db43c8ca2d666e483d2585e0b9d Mon Sep 17 00:00:00 2001 From: John Yang Date: Wed, 17 Jun 2026 10:26:17 -0700 Subject: [PATCH 07/11] Add `programbench submit publish`: create + push a submission's GitHub repo Middle step between package and register. With gh, creates the public repo and pushes in one shot; without gh, pushes to a --remote you pre-created or prints the steps. Repo name defaults to the submission id; register reads the URL back from the git remote, so it is never stored in submission.yaml. Adds a --dry-run and a CLI smoke test. 
--- src/programbench/cli/submit.py | 60 ++++++++++++++++++++ src/programbench/publish.py | 101 +++++++++++++++++++++++++++++++++ tests/test_cli.py | 6 ++ 3 files changed, 167 insertions(+) create mode 100644 src/programbench/publish.py diff --git a/src/programbench/cli/submit.py b/src/programbench/cli/submit.py index c3bce27..5c7a4ab 100644 --- a/src/programbench/cli/submit.py +++ b/src/programbench/cli/submit.py @@ -6,6 +6,7 @@ """Submission lifecycle commands: package an eval run, verify a submission, recombine eval.json.""" +import shutil from pathlib import Path import typer @@ -110,6 +111,65 @@ def verify( raise typer.Exit(1) +@app.command() +def publish( + run_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."), + owner: str = typer.Option( + "", "--owner", help="GitHub org/user to create the repo under (default: your gh account)." + ), + repo: str = typer.Option("", "--repo", help="Repository name (default: the submission directory name)."), + private: bool = typer.Option( + False, "--private", help="Create the repo private (it must be public before you can register it)." + ), + remote: str = typer.Option( + "", "--remote", help="Push to this existing empty repo URL instead of creating one (the no-gh path)." + ), + dry_run: bool = typer.Option( + False, "--dry-run", help="Show what would be created/pushed; touch no network and make no commit." + ), +) -> None: + """Create this submission's public GitHub repo and push it (package -> publish -> register). + + Heavy artifacts already live on HuggingFace (as .url + .sha256 from `package`), so only + light files are committed. With `gh` the repo is created and pushed in one shot; without + it, pass `--remote ` to an empty repo you created, or follow the printed steps. The + repo name defaults to the directory name and the URL is read back by `register`, so it is + never stored in submission.yaml. 
+ + \b + Examples: + programbench submit publish ./my-run --dry-run + programbench submit publish ./my-run --owner my-org + """ + from rich.console import Console + + from programbench.publish import _origin, publish as do_publish + + console = Console() + name = repo or run_dir.resolve().name + + if dry_run: + existing = _origin(run_dir) + if existing: + plan = f"push current commit to existing remote [bold]{existing}[/bold]" + elif remote: + plan = f"add remote [bold]{remote}[/bold] and push" + elif shutil.which("gh"): + plan = f"`gh repo create` [bold]{f'{owner}/{name}' if owner else name}[/bold] ({'private' if private else 'public'}), set origin, and push" + else: + plan = f"commit locally only — no gh and no --remote, so the repo for [bold]{name}[/bold] can't be created" + console.print(f"[bold]Would publish[/bold] {run_dir}:\n {plan}") + console.print("[dim]Dry run — no commit, nothing created or pushed. Drop --dry-run to publish.[/dim]") + return + + result = do_publish(run_dir, owner=owner, repo=repo, private=private, remote=remote) + if result.repo_url: + console.print(f"[bold green]Published[/bold green] {name} -> {result.repo_url}") + console.print("[dim]Next: `programbench submit register .` to register it on the leaderboard.[/dim]") + else: + console.print(f"[bold]Committed[/bold] {name} locally.\n{result.next_steps}") + + @app.command() def register( submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."), diff --git a/src/programbench/publish.py b/src/programbench/publish.py new file mode 100644 index 0000000..035173e --- /dev/null +++ b/src/programbench/publish.py @@ -0,0 +1,101 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Create a submission's public GitHub repo and push it. 
+ +The middle step between ``package`` and ``register``: it turns a packaged run directory +into a public Git repo and pushes it. The heavy artifacts already live on HuggingFace (as +``.url`` + ``.sha256`` written by ``package``), so only light files are committed. With +``gh`` the repo is created and pushed in one shot; without ``gh`` it commits locally and +either pushes to a ``--remote`` you pre-created, or prints the steps to finish by hand. + +The repo URL is never stored in ``submission.yaml`` — it defaults to the submission id and +``register`` reads it back from the git remote this sets, keeping the manifest host-agnostic. +""" + +import shutil +import subprocess +from dataclasses import dataclass +from pathlib import Path + + +def _git(cwd: Path, *args: str) -> str: + return subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True).stdout.strip() + + +def _to_https(url: str) -> str: + """A git remote (``git@host:owner/repo.git`` or ``https://…``) as a browsable https URL.""" + url = url.removesuffix(".git") + if url.startswith("git@"): + host, path = url[4:].split(":", 1) + return f"https://{host}/{path}" + return url + + +def _origin(run_dir: Path) -> str | None: + if not (run_dir / ".git").exists() or "origin" not in _git(run_dir, "remote").split(): + return None + return _git(run_dir, "remote", "get-url", "origin") + + +@dataclass +class PublishResult: + repo_url: str | None # the pushed repo (https), when known + committed: bool # whether a new commit was made + next_steps: str | None # manual steps when we could not finish (no gh, no --remote) + + +def _ensure_committed(run_dir: Path) -> bool: + """Init the repo if needed and commit any pending changes; True if a commit was made.""" + if not (run_dir / ".git").exists(): + _git(run_dir, "init", "-b", "main") + _git(run_dir, "add", "-A") + if not _git(run_dir, "status", "--porcelain"): + return False + _git(run_dir, "commit", "-m", f"ProgramBench submission: 
{run_dir.resolve().name}") + return True + + +def _gh_repo_url(slug: str, private: bool) -> str: + """The repo's URL, creating it (public unless ``private``) if it doesn't exist yet.""" + view = ["gh", "repo", "view", slug, "--json", "url", "-q", ".url"] + if subprocess.run(view, capture_output=True, text=True).returncode != 0: + subprocess.run( + ["gh", "repo", "create", slug, "--private" if private else "--public"], + check=True, + capture_output=True, + text=True, + ) + return subprocess.run(view, check=True, capture_output=True, text=True).stdout.strip() + + +def publish(run_dir: Path, owner: str = "", repo: str = "", private: bool = False, remote: str = "") -> PublishResult: + name = repo or run_dir.resolve().name + committed = _ensure_committed(run_dir) + + # Pick the target repo: an explicit --remote, an already-wired origin, or one created + # via gh. Without any of those we can only commit locally and hand back the steps. + target = remote or _origin(run_dir) + if not target: + if not shutil.which("gh"): + steps = ( + "`gh` is not installed and no --remote was given, so the repo could not be created. " + f"The submission is committed locally in {run_dir}. To finish:\n" + f" 1. Create an empty PUBLIC repo (named '{name}') at https://github.com/new\n" + " 2. From the submission directory, wire it up and push:\n" + " git remote add origin \n" + " git push -u origin HEAD:main\n" + "Then run `programbench submit register .` to register it on the leaderboard." + ) + return PublishResult(None, committed, steps) + target = _gh_repo_url(f"{owner}/{name}" if owner else name, private) + + # Push over HTTPS using gh's credentials: reliable everywhere (an SSH origin needs keys + # set up, and would fail in sandboxes that block port 22). 
+ url = _to_https(target) + _git(run_dir, "remote", "set-url" if _origin(run_dir) else "add", "origin", url) + _git(run_dir, "push", "-u", "origin", "HEAD:main") + return PublishResult(url, committed, None) diff --git a/tests/test_cli.py b/tests/test_cli.py index 904ec59..f23d7de 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -55,3 +55,9 @@ def test_submit_register_help(): result = runner.invoke(app, ["submit", "register", "--help"]) assert result.exit_code == 0 assert "registry" in result.output.lower() + + +def test_submit_publish_help(): + result = runner.invoke(app, ["submit", "publish", "--help"]) + assert result.exit_code == 0 + assert "owner" in result.output.lower() From 146b975bfd9d9a96e42e1bfd0cf797ded18c9ba1 Mon Sep 17 00:00:00 2001 From: John Yang Date: Wed, 17 Jun 2026 10:47:57 -0700 Subject: [PATCH 08/11] Address second review round (submit group) verify: _close treats non-numeric manifest values as a failed check (no crash); Tier-1 only resolves/downloads the --filter-matched subset, not every tarball; drop dead logger. submission: reject non-http(s) URLs (SSRF/file:// guard) and add download timeouts for recombine + resolve_submission_tar; drop dead logger. package: accept submission.ref.yaml as a valid solution form (matches resolve_submission_tar). register: fix `gh repo fork` (takes no dest arg -> run from clone.parent); add % to the PR body mean score; git-identity fallback for commits in fresh containers. publish: git-identity fallback for the commit. docs/tests: correct CLI module docstring + manifest 'stats/'->'_stats/' comment; assert publish in submit --help; add lossless split/recombine round-trip unit tests. 
--- src/programbench/cli/submit.py | 3 +- .../data/templates/submission.yaml.j2 | 2 +- src/programbench/package.py | 7 +-- src/programbench/publish.py | 10 +++- src/programbench/register.py | 21 ++++++-- src/programbench/submission.py | 22 ++++++-- src/programbench/verify.py | 18 +++---- tests/test_cli.py | 2 +- tests/test_submission.py | 53 +++++++++++++++++++ 9 files changed, 111 insertions(+), 27 deletions(-) create mode 100644 tests/test_submission.py diff --git a/src/programbench/cli/submit.py b/src/programbench/cli/submit.py index 5c7a4ab..2378f1c 100644 --- a/src/programbench/cli/submit.py +++ b/src/programbench/cli/submit.py @@ -4,7 +4,8 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. -"""Submission lifecycle commands: package an eval run, verify a submission, recombine eval.json.""" +"""Submission lifecycle commands: package an eval run, publish its repo, verify a submission, +register it on the leaderboard, and recombine a split eval.json.""" import shutil from pathlib import Path diff --git a/src/programbench/data/templates/submission.yaml.j2 b/src/programbench/data/templates/submission.yaml.j2 index 358dead..eda220f 100644 --- a/src/programbench/data/templates/submission.yaml.j2 +++ b/src/programbench/data/templates/submission.yaml.j2 @@ -20,7 +20,7 @@ system: eval: programbench_version: {{ programbench_version | tojson }} # [auto] -headline: # [auto] score summary from evaluation; other stats live in stats/ +headline: # [auto] score summary from evaluation; other stats live in _stats/ mean_score: {{ mean_score }} resolved_pct: {{ resolved_pct }} near_resolved_pct: {{ near_resolved_pct }} diff --git a/src/programbench/package.py b/src/programbench/package.py index a86bcea..c985e11 100644 --- a/src/programbench/package.py +++ b/src/programbench/package.py @@ -148,9 +148,10 @@ def package_run(run_dir: Path, upload_to: str | None = None, overwrite: bool = F for instance_dir in 
sorted(d for d in run_dir.iterdir() if d.is_dir()): iid = instance_dir.name eval_json = instance_dir / f"{iid}.eval.json" - has_solution = (instance_dir / "submission.tar.gz").exists() or ( - instance_dir / "submission.tar.gz.url" - ).exists() + # Any artifact form resolve_submission_tar understands counts as a solution. + has_solution = any( + (instance_dir / f).exists() for f in ("submission.tar.gz", "submission.tar.gz.url", "submission.ref.yaml") + ) if not (eval_json.exists() and has_solution): continue if iid not in instances: diff --git a/src/programbench/publish.py b/src/programbench/publish.py index 035173e..81b37e3 100644 --- a/src/programbench/publish.py +++ b/src/programbench/publish.py @@ -49,13 +49,19 @@ class PublishResult: def _ensure_committed(run_dir: Path) -> bool: - """Init the repo if needed and commit any pending changes; True if a commit was made.""" + """Init the repo if needed and commit any pending changes; True if a commit was made. + + Supplies a fallback git identity when none is configured (common in fresh CI containers, + where ``git commit`` would otherwise error out).""" if not (run_dir / ".git").exists(): _git(run_dir, "init", "-b", "main") _git(run_dir, "add", "-A") if not _git(run_dir, "status", "--porcelain"): return False - _git(run_dir, "commit", "-m", f"ProgramBench submission: {run_dir.resolve().name}") + ident = [] + if subprocess.run(["git", "config", "user.email"], cwd=run_dir, capture_output=True).returncode != 0: + ident = ["-c", "user.name=ProgramBench", "-c", "user.email=submissions@programbench.com"] + _git(run_dir, *ident, "commit", "-m", f"ProgramBench submission: {run_dir.resolve().name}") return True diff --git a/src/programbench/register.py b/src/programbench/register.py index da119f8..0f42525 100644 --- a/src/programbench/register.py +++ b/src/programbench/register.py @@ -35,6 +35,15 @@ def _git(cwd: Path, *args: str) -> str: return subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, 
text=True).stdout.strip() +def _commit(cwd: Path, message: str) -> None: + """Commit staged changes, supplying a fallback identity when git has none configured + (common in fresh CI containers, where ``git commit`` would otherwise error out).""" + ident = [] + if subprocess.run(["git", "config", "user.email"], cwd=cwd, capture_output=True).returncode != 0: + ident = ["-c", "user.name=ProgramBench", "-c", "user.email=submissions@programbench.com"] + _git(cwd, *ident, "commit", "-m", message) + + def _to_https(url: str) -> str: """A git remote (``git@host:owner/repo.git`` or ``https://…``) as a browsable https URL.""" url = url.removesuffix(".git") @@ -85,7 +94,7 @@ def build_plan( system, head = manifest["system"], manifest["headline"] body = ( f"Registers **{system['model']}** ({system['provider']}) + {system['agent']}.\n\n" - f"- mean score: {head['mean_score'] * 100:.1f}\n" + f"- mean score: {head['mean_score'] * 100:.1f}%\n" f"- resolved: {head['resolved_pct']:.1f}% / near-resolved: {head['near_resolved_pct']:.1f}%\n" f"- instances: {head['n_instances_attempted']}/{head['n_instances_total']}\n\n" f"Source: {source}\nCommit: `{commit}`\n\n" @@ -124,9 +133,11 @@ def register_submission( if shutil.which("gh"): # Fork the registry under the authed user (no-op if it exists) and clone the fork; - # origin -> fork, upstream -> registry. + # origin -> fork, upstream -> registry. gh repo fork takes no destination arg, so it + # clones into /; running from clone.parent makes that equal `clone`. 
subprocess.run( - ["gh", "repo", "fork", slug, "--clone", "--default-branch-only", str(clone)], + ["gh", "repo", "fork", slug, "--clone", "--default-branch-only"], + cwd=clone.parent, check=True, capture_output=True, text=True, @@ -134,7 +145,7 @@ def register_submission( _git(clone, "checkout", "-b", plan.branch) write_entry(plan, submission_dir, clone) _git(clone, "add", f"submissions/{plan.submission_id}") - _git(clone, "commit", "-m", plan.title) + _commit(clone, plan.title) _git(clone, "push", "-u", "origin", plan.branch) pr_url = subprocess.run( ["gh", "pr", "create", "--repo", slug, "--title", plan.title, "--body", plan.body], @@ -151,7 +162,7 @@ def register_submission( _git(clone, "checkout", "-b", plan.branch) write_entry(plan, submission_dir, clone) _git(clone, "add", f"submissions/{plan.submission_id}") - _git(clone, "commit", "-m", plan.title) + _commit(clone, plan.title) steps = ( "`gh` not found, so the PR was not opened. The entry is committed on branch " f"`{plan.branch}` in:\n {clone}\n\n" diff --git a/src/programbench/submission.py b/src/programbench/submission.py index 18430b1..396a9ec 100644 --- a/src/programbench/submission.py +++ b/src/programbench/submission.py @@ -12,11 +12,11 @@ import hashlib import json -import logging import shutil import subprocess import tarfile import tempfile +import urllib.parse import urllib.request from dataclasses import asdict, dataclass from pathlib import Path @@ -26,11 +26,19 @@ from programbench.eval.eval import EvaluationResult from programbench.utils.load_data import get_active_branches, get_ignored_tests, load_all_instances -log = logging.getLogger(__name__) - RESOLVED_THRESHOLD = 1.0 NEAR_RESOLVED_THRESHOLD = 0.95 FIXTURE_PREFIX = "testorg__" +DOWNLOAD_TIMEOUT = 60 # seconds; fail fast rather than hang on a stalled connection + + +def _checked_url(raw: str) -> str: + """A submission-supplied URL, rejecting non-http(s) schemes (e.g. 
file://) to avoid SSRF + / local file reads when resolving untrusted third-party submissions.""" + url = raw.strip() + if urllib.parse.urlparse(url).scheme not in ("http", "https"): + raise ValueError(f"refusing to fetch non-http(s) URL: {url!r}") + return url def benchmark_instances() -> dict[str, dict]: @@ -138,7 +146,7 @@ def recombine_eval_json(instance_dir: Path, iid: str) -> bool: if log_file.exists(): heavy = json.loads(log_file.read_text()) elif url_file.exists(): - with urllib.request.urlopen(url_file.read_text().strip()) as r: # noqa: S310 + with urllib.request.urlopen(_checked_url(url_file.read_text()), timeout=DOWNLOAD_TIMEOUT) as r: # noqa: S310 raw = r.read() sha_file = instance_dir / f"{iid}.eval.log.json.sha256" if sha_file.exists() and (got := hashlib.sha256(raw).hexdigest()) != sha_file.read_text().split()[0]: @@ -206,7 +214,11 @@ def resolve_submission_tar(instance_dir: Path, dest_tar: Path) -> None: if inline.exists(): shutil.copy2(inline, dest_tar) elif url_file.exists(): - urllib.request.urlretrieve(url_file.read_text().strip(), dest_tar) # noqa: S310 + with ( + urllib.request.urlopen(_checked_url(url_file.read_text()), timeout=DOWNLOAD_TIMEOUT) as r, # noqa: S310 + dest_tar.open("wb") as out, + ): + shutil.copyfileobj(r, out) elif ref_file.exists(): _pack_git_ref(yaml.safe_load(ref_file.read_text()), dest_tar) expected = None # git packing is not byte-reproducible; rely on re-eval instead diff --git a/src/programbench/verify.py b/src/programbench/verify.py index 8cdda87..bc2ebc5 100644 --- a/src/programbench/verify.py +++ b/src/programbench/verify.py @@ -15,7 +15,6 @@ proves the artifacts actually yield the reported results. """ -import logging import re import tempfile from dataclasses import dataclass @@ -30,8 +29,6 @@ score_run, ) -log = logging.getLogger(__name__) - # Tier-0 recomputes the headline from the same eval.json with the same deterministic # rounding `package` used, so a consistent submission matches exactly. 
The epsilon only # absorbs float representation noise; any real drift (>= the rounding granularity) fails. @@ -57,9 +54,10 @@ def ok(self) -> bool: def _close(a: object, b: object) -> bool: - if a is None or b is None: + # Non-numeric (e.g. a user-edited/invalid manifest value) is a failed check, not a crash. + if not isinstance(a, (int, float)) or not isinstance(b, (int, float)): return False - return abs(float(a) - float(b)) <= TOLERANCE + return abs(a - b) <= TOLERANCE def _headline_checks(claimed: dict, computed: Headline) -> list[Check]: @@ -83,18 +81,20 @@ def verify_tier1(submission_dir: Path, *, workers: int = 1, filter_spec: str = " sub_root = submission_dir submitted = score_run(sub_root, instances) + # Same regex semantics as the re-eval filter (instance_filters.filter_instances): only + # resolve/download and re-eval the targeted instances, not every submitted tarball. + targets = [iid for iid in submitted if not filter_spec or re.match(filter_spec, iid)] + with tempfile.TemporaryDirectory() as tmp: run = Path(tmp) - for iid in submitted: + for iid in targets: (run / iid).mkdir(parents=True) resolve_submission_tar(sub_root / iid, run / iid / "submission.tar.gz") run_eval_batch(sources=[run], workers=workers, filter_spec=filter_spec, force=True) fresh = score_run(run, instances) - # Same regex semantics as the re-eval filter (instance_filters.filter_instances), so a - # filtered-in instance that produced no fresh score is reported as a failure (NaN), not + # A targeted instance that produced no fresh score is reported as a failure (NaN), not # silently skipped. 
- targets = [iid for iid in submitted if not filter_spec or re.match(filter_spec, iid)] checks = [ Check( iid, diff --git a/tests/test_cli.py b/tests/test_cli.py index f23d7de..984085d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -42,7 +42,7 @@ def test_blob_sync_help(): def test_submit_help(): result = runner.invoke(app, ["submit", "--help"]) assert result.exit_code == 0 - assert all(cmd in result.output for cmd in ("package", "verify", "register", "recombine")) + assert all(cmd in result.output for cmd in ("package", "publish", "verify", "register", "recombine")) def test_submit_package_help(): diff --git a/tests/test_submission.py b/tests/test_submission.py new file mode 100644 index 0000000..9d1d5d4 --- /dev/null +++ b/tests/test_submission.py @@ -0,0 +1,53 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Tests for submission helpers that don't need the benchmark data.""" + +import json + +from programbench.submission import recombine_eval_json, split_eval_json + + +def test_split_recombine_roundtrip_is_lossless(tmp_path): + iid = "org__tool.abc1234" + d = tmp_path / iid + d.mkdir() + original = { + "instance_id": iid, + "log": [{"step": 1, "out": "x" * 200}, {"step": 2, "out": "y"}], + "test_results": [ + {"branch": "main", "name": "t_pass", "is_resolved": True, "extra": {"duration": 0.5}}, + { + "branch": "main", + "name": "t_fail", + "is_resolved": False, + "extra": {"message": "assertion failed", "text": "trace " * 50, "duration": 1.2}, + }, + {"branch": "feat", "name": "t_other", "is_resolved": False, "extra": {"text": "boom"}}, + ], + } + eval_json = d / f"{iid}.eval.json" + eval_json.write_text(json.dumps(original, indent=2)) + + split_eval_json(d, iid) + light = json.loads(eval_json.read_text()) + assert light["log"] == [] + assert "message" not in 
light["test_results"][1]["extra"] + assert (d / f"{iid}.eval.log.json").exists() + + assert recombine_eval_json(d, iid) is True + assert json.loads(eval_json.read_text()) == original + assert not (d / f"{iid}.eval.log.json").exists() + + +def test_split_is_idempotent_and_noop_when_light(tmp_path): + iid = "org__tool.def5678" + d = tmp_path / iid + d.mkdir() + light = {"instance_id": iid, "log": [], "test_results": [{"branch": "main", "name": "t", "is_resolved": True}]} + (d / f"{iid}.eval.json").write_text(json.dumps(light)) + split_eval_json(d, iid) + assert not (d / f"{iid}.eval.log.json").exists() # nothing heavy -> no split file written From 21d80e1baee5b4f4afb7a64d1580171ffa8618ae Mon Sep 17 00:00:00 2001 From: John Yang Date: Wed, 17 Jun 2026 22:51:53 -0700 Subject: [PATCH 09/11] register: maintainer branch-PR path, HTTPS push, robust PR creation - Push a branch straight to the registry when the user has push access (forks are often disabled on private/org repos); only fork when they can't push. - Normalize the push remote to HTTPS (gh may wire ssh, which needs keys / is sandbox-blocked). - Open the PR with an explicit --head (gh's inference was unreliable) and resolve the PR URL by querying the branch, raising a real error if creation produced none. --- src/programbench/register.py | 66 ++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/src/programbench/register.py b/src/programbench/register.py index 0f42525..7e384a9 100644 --- a/src/programbench/register.py +++ b/src/programbench/register.py @@ -123,7 +123,8 @@ def register_submission( ) -> RegisterResult: """Clone the registry, commit the entry on a branch, and open the PR. - Uses ``gh`` (fork + PR) when available, cleaning up its throwaway clone afterward. + With ``gh``: maintainers (push access) get a branch + PR straight on the registry; + everyone else forks first (and a fork is only possible if the registry allows it). 
Without ``gh`` it leaves the commit on a branch in a kept clone and returns the manual push + compare-URL steps in ``next_steps`` (so the clone must outlive this call). """ @@ -132,28 +133,71 @@ def register_submission( clone = Path(tempfile.mkdtemp(prefix="programbench-register-")) / "submissions" if shutil.which("gh"): - # Fork the registry under the authed user (no-op if it exists) and clone the fork; - # origin -> fork, upstream -> registry. gh repo fork takes no destination arg, so it - # clones into /; running from clone.parent makes that equal `clone`. - subprocess.run( - ["gh", "repo", "fork", slug, "--clone", "--default-branch-only"], - cwd=clone.parent, - check=True, - capture_output=True, - text=True, + # Maintainers push a branch straight to the registry; others fork (only works if the + # registry permits forks — org/private repos often disable them). + can_push = ( + subprocess.run( + ["gh", "api", f"repos/{slug}", "--jq", ".permissions.push"], capture_output=True, text=True + ).stdout.strip() + == "true" ) + if can_push: + _git(clone.parent, "clone", "--depth", "1", _to_https(registry), str(clone)) + head = plan.branch + else: + # gh repo fork takes no destination arg, so it clones into /; + # running from clone.parent makes that equal `clone`. + subprocess.run( + ["gh", "repo", "fork", slug, "--clone", "--default-branch-only"], + cwd=clone.parent, + check=True, + capture_output=True, + text=True, + ) + login = subprocess.run( + ["gh", "api", "user", "--jq", ".login"], check=True, capture_output=True, text=True + ).stdout.strip() + head = f"{login}:{plan.branch}" + # Push over HTTPS: gh may wire an ssh remote, and ssh needs keys set up (and is blocked + # in some sandboxes), whereas gh's https credentials always work. 
+ _git(clone, "remote", "set-url", "origin", _to_https(_git(clone, "remote", "get-url", "origin"))) _git(clone, "checkout", "-b", plan.branch) write_entry(plan, submission_dir, clone) _git(clone, "add", f"submissions/{plan.submission_id}") _commit(clone, plan.title) _git(clone, "push", "-u", "origin", plan.branch) + # Open the PR (explicit --head; gh's inference is unreliable). The branch lookup is the + # source of truth: gh pr create can exit nonzero yet still create the PR, and a PR for + # the branch may already exist from a prior run. + created = subprocess.run( + ["gh", "pr", "create", "--repo", slug, "--head", head, "--title", plan.title, "--body", plan.body], + cwd=clone, + capture_output=True, + text=True, + ) pr_url = subprocess.run( - ["gh", "pr", "create", "--repo", slug, "--title", plan.title, "--body", plan.body], + [ + "gh", + "pr", + "list", + "--repo", + slug, + "--head", + plan.branch, + "--state", + "open", + "--json", + "url", + "--jq", + ".[0].url", + ], cwd=clone, check=True, capture_output=True, text=True, ).stdout.strip() + if not pr_url: + raise RuntimeError(f"gh pr create did not open a PR:\n{created.stderr or created.stdout}") shutil.rmtree(clone.parent) return RegisterResult(plan, pr_url, None) From dd10df2f7ccb09960da96e9e31409d1a5aa1078d Mon Sep 17 00:00:00 2001 From: John Yang Date: Thu, 18 Jun 2026 13:29:33 -0700 Subject: [PATCH 10/11] Stop storing scores in submission.yaml; verify score.json vs eval.json Leaderboard scores are recomputed from _stats/score.json with the registry's ignore list, so a cached headline in submission.yaml is redundant and goes stale on every ignore-list change. Drop the headline block from the template + package. Re-point Tier-0 verify to recompute per-test pass/fail from each eval.json and check it matches score.json (no headline to compare). Make register re-runnable (force-push its branch) so a PR can be updated. 
--- src/programbench/cli/submit.py | 32 +++++++----- .../data/templates/submission.yaml.j2 | 9 +--- src/programbench/package.py | 5 -- src/programbench/register.py | 3 +- src/programbench/verify.py | 51 +++++++++++-------- 5 files changed, 52 insertions(+), 48 deletions(-) diff --git a/src/programbench/cli/submit.py b/src/programbench/cli/submit.py index 2378f1c..4681a51 100644 --- a/src/programbench/cli/submit.py +++ b/src/programbench/cli/submit.py @@ -73,12 +73,12 @@ def verify( "", "--filter", help="Restrict Tier-1 re-eval to instance IDs matching this regex." ), ) -> None: - """Verify a submission against its own claimed results. + """Verify a submission against its own artifacts. - Tier 0 (default, no Docker) recomputes the headline from the submission's eval.json - files and checks it matches submission.yaml. Tier 1 (--tier1) additionally resolves - each submission.tar.gz and re-runs evaluation to confirm the artifacts reproduce the - reported scores. + Tier 0 (default, no Docker) recomputes each instance's per-test pass/fail from its + eval.json and checks it matches _stats/score.json. Tier 1 (--tier1) additionally + resolves each submission.tar.gz and re-runs evaluation to confirm the artifacts + reproduce the reported scores. 
\b Examples: @@ -96,17 +96,21 @@ def verify( else verify_tier0(submission_dir) ) - table = Table(title=f"Tier-{result.tier} verification", box=None) - table.add_column("Check", style="bold") - table.add_column("Claimed", justify="right") - table.add_column("Computed", justify="right") - table.add_column("", justify="center") - for c in result.checks: - table.add_row(c.name, str(c.claimed), str(c.computed), "✅" if c.ok else "❌") console = Console() - console.print(table) + fails = [c for c in result.checks if not c.ok] + console.print( + f"Tier-{result.tier}: [bold]{len(result.checks) - len(fails)}/{len(result.checks)}[/bold] checks consistent" + ) + if fails: + table = Table(title="Discrepancies", box=None) + table.add_column("Instance", style="bold") + table.add_column("score.json", justify="right") + table.add_column("recomputed", justify="right") + for c in fails: + table.add_row(c.name, str(c.claimed), str(c.computed)) + console.print(table) if result.ok: - console.print("[bold green]PASS[/bold green] — submission is consistent with its reported results.") + console.print("[bold green]PASS[/bold green] — submission is consistent with its artifacts.") else: console.print("[bold red]FAIL[/bold red] — discrepancies found above.") raise typer.Exit(1) diff --git a/src/programbench/data/templates/submission.yaml.j2 b/src/programbench/data/templates/submission.yaml.j2 index eda220f..1539bb6 100644 --- a/src/programbench/data/templates/submission.yaml.j2 +++ b/src/programbench/data/templates/submission.yaml.j2 @@ -19,10 +19,5 @@ system: eval: programbench_version: {{ programbench_version | tojson }} # [auto] - -headline: # [auto] score summary from evaluation; other stats live in _stats/ - mean_score: {{ mean_score }} - resolved_pct: {{ resolved_pct }} - near_resolved_pct: {{ near_resolved_pct }} - n_instances_attempted: {{ n_attempted }} - n_instances_total: {{ n_total }} +# Scores are not stored here: the leaderboard recomputes them from _stats/score.json with the +# 
registry's current ignored-tests list, so any cached numbers would just go stale. diff --git a/src/programbench/package.py b/src/programbench/package.py index c985e11..13dc7bf 100644 --- a/src/programbench/package.py +++ b/src/programbench/package.py @@ -187,11 +187,6 @@ def package_run(run_dir: Path, upload_to: str | None = None, overwrite: bool = F run_dir=run_dir, submission_id=run_dir.resolve().name, programbench_version=version("programbench"), - mean_score=headline.mean_score, - resolved_pct=headline.resolved_pct, - near_resolved_pct=headline.near_resolved_pct, - n_attempted=headline.n_instances_attempted, - n_total=headline.n_instances_total, **carried, ) + "\n" diff --git a/src/programbench/register.py b/src/programbench/register.py index 7e384a9..e77fcc9 100644 --- a/src/programbench/register.py +++ b/src/programbench/register.py @@ -165,7 +165,8 @@ def register_submission( write_entry(plan, submission_dir, clone) _git(clone, "add", f"submissions/{plan.submission_id}") _commit(clone, plan.title) - _git(clone, "push", "-u", "origin", plan.branch) + # Force so re-running register updates an existing PR (the add- branch is ours). + _git(clone, "push", "-u", "--force", "origin", plan.branch) # Open the PR (explicit --head; gh's inference is unreliable). The branch lookup is the # source of truth: gh pr create can exit nonzero yet still create the PR, and a PR for # the branch may already exist from a prior run. diff --git a/src/programbench/verify.py b/src/programbench/verify.py index bc2ebc5..edb1334 100644 --- a/src/programbench/verify.py +++ b/src/programbench/verify.py @@ -4,35 +4,33 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. -"""Verify a packaged submission against its own claimed results. +"""Verify a packaged submission against its own artifacts. 
-Tier 0 (default, no Docker): recompute the headline from the submission's own eval.json -files (with ignored-test filtering) and check it matches submission.yaml. This is the -free consistency check a third party or CI can run with only ``programbench`` installed. +Tier 0 (default, no Docker): recompute each instance's per-test pass/fail from its own +eval.json and check it matches the submitted _stats/score.json — i.e. the reported scores +faithfully reflect the eval output. A free check a third party or CI can run with only +``programbench`` installed. (Leaderboard scores aren't stored in the submission, so there +is no headline to check against.) Tier 1 (--tier1, Docker): resolve each submission.tar.gz, re-run ``programbench eval``, and confirm the freshly produced scores match the submitted eval.json. This is what proves the artifacts actually yield the reported results. """ +import json import re import tempfile from dataclasses import dataclass from pathlib import Path from programbench.submission import ( - Headline, - aggregate, benchmark_instances, - load_manifest, resolve_submission_tar, score_run, + test_results_map, ) -# Tier-0 recomputes the headline from the same eval.json with the same deterministic -# rounding `package` used, so a consistent submission matches exactly. The epsilon only -# absorbs float representation noise; any real drift (>= the rounding granularity) fails. -TOLERANCE = 1e-6 +TOLERANCE = 1e-6 # Tier-1 score floats are rounded; this only absorbs representation noise. 
@dataclass @@ -60,18 +58,29 @@ def _close(a: object, b: object) -> bool: return abs(a - b) <= TOLERANCE -def _headline_checks(claimed: dict, computed: Headline) -> list[Check]: - return [ - Check(name, claimed.get(name), value, _close(claimed.get(name), value)) - for name, value in computed.as_dict().items() - ] - - def verify_tier0(submission_dir: Path) -> VerifyResult: - manifest = load_manifest(submission_dir) + """Per instance, recompute the per-test pass/fail from its eval.json and check it matches + the submitted _stats/score.json (so the stored scores reflect the eval output, untampered).""" instances = benchmark_instances() - computed = aggregate(score_run(submission_dir, instances), len(instances)) - return VerifyResult(0, _headline_checks(manifest.get("headline", {}), computed)) + stored = json.loads((submission_dir / "_stats" / "score.json").read_text()) + checks = [] + for iid, stored_map in sorted(stored.items()): + eval_json = submission_dir / iid / f"{iid}.eval.json" + if iid not in instances: + checks.append(Check(iid, "in score.json", "not a benchmark instance", False)) + elif not eval_json.exists(): + checks.append(Check(iid, f"{sum(stored_map.values())}/{len(stored_map)} pass", "no eval.json", False)) + else: + recomputed = test_results_map(eval_json, instances[iid]) + checks.append( + Check( + iid, + f"{sum(stored_map.values())}/{len(stored_map)} pass", + f"{sum(recomputed.values())}/{len(recomputed)} pass", + recomputed == stored_map, + ) + ) + return VerifyResult(0, checks) def verify_tier1(submission_dir: Path, *, workers: int = 1, filter_spec: str = "") -> VerifyResult: From f3cc0302420a64e5d33e520fad9efc1d236a6c62 Mon Sep 17 00:00:00 2001 From: John Yang Date: Thu, 18 Jun 2026 13:33:37 -0700 Subject: [PATCH 11/11] register: build PR body without the headline block (use score.json count) --- src/programbench/register.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/programbench/register.py 
b/src/programbench/register.py index e77fcc9..7494211 100644 --- a/src/programbench/register.py +++ b/src/programbench/register.py @@ -20,6 +20,7 @@ prints the compare URL so you can open the PR by hand. """ +import json import shutil import subprocess import tempfile @@ -91,14 +92,13 @@ def build_plan( files = ["pointer.yaml", "submission.yaml"] + [ f"_stats/{p.name}" for p in sorted((submission_dir / "_stats").glob("*.json")) ] - system, head = manifest["system"], manifest["headline"] + system = manifest["system"] + n_attempted = len(json.loads((submission_dir / "_stats" / "score.json").read_text())) body = ( f"Registers **{system['model']}** ({system['provider']}) + {system['agent']}.\n\n" - f"- mean score: {head['mean_score'] * 100:.1f}%\n" - f"- resolved: {head['resolved_pct']:.1f}% / near-resolved: {head['near_resolved_pct']:.1f}%\n" - f"- instances: {head['n_instances_attempted']}/{head['n_instances_total']}\n\n" + f"- instances attempted: {n_attempted}\n\n" f"Source: {source}\nCommit: `{commit}`\n\n" - "Tier-0 verified (`programbench submit verify .`)." + "Tier-0 verified (`programbench submit verify .`). Leaderboard scores are recomputed from `_stats/score.json`." ) return RegisterPlan( sub_id, source, commit, registry, f"add-{sub_id}", pointer, files, f"Add submission: {sub_id}", body