From b1e9e94ed5a205144ca130064a7597ceba6b27d3 Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Tue, 16 Jun 2026 17:42:25 -0700
Subject: [PATCH 01/11] Add `programbench submit` (package / verify / register
 / recombine)

---
 src/programbench/cli/main.py                  |   2 +
 src/programbench/cli/submit.py                | 208 ++++++++++++++++
 src/programbench/data/templates/README.md.j2  |  78 ++++++
 .../data/templates/submission.yaml.j2         |  28 +++
 src/programbench/package.py                   | 216 +++++++++++++++++
 src/programbench/register.py                  | 157 ++++++++++++
 src/programbench/submission.py                | 227 ++++++++++++++++++
 src/programbench/verify.py                    | 100 ++++++++
 8 files changed, 1016 insertions(+)
 create mode 100644 src/programbench/cli/submit.py
 create mode 100644 src/programbench/data/templates/README.md.j2
 create mode 100644 src/programbench/data/templates/submission.yaml.j2
 create mode 100644 src/programbench/package.py
 create mode 100644 src/programbench/register.py
 create mode 100644 src/programbench/submission.py
 create mode 100644 src/programbench/verify.py
diff --git a/src/programbench/cli/main.py b/src/programbench/cli/main.py
index 6a36792..85f20b9 100644
--- a/src/programbench/cli/main.py
+++ b/src/programbench/cli/main.py
@@ -9,6 +9,7 @@
 import typer
 
 from programbench.cli.blob import app as blob_app
+from programbench.cli.submit import app as submit_app
 from programbench.constants import DOCKER_CPUS
 
 app = typer.Typer(
@@ -18,6 +19,7 @@
     context_settings={"help_option_names": ["-h", "--help"]},
 )
 app.add_typer(blob_app, name="blob")
+app.add_typer(submit_app, name="submit")
 
 
 @app.callback()
diff --git a/src/programbench/cli/submit.py b/src/programbench/cli/submit.py
new file mode 100644
index 0000000..0980952
--- /dev/null
+++ b/src/programbench/cli/submit.py
@@ -0,0 +1,208 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Submission lifecycle commands: package an eval run, verify a submission, recombine eval.json."""
+
+from pathlib import Path
+
+import typer
+
+app = typer.Typer(no_args_is_help=True, help="Prepare, check, and reassemble leaderboard submissions.")
+
+
+@app.command()
+def package(
+    run_dir: Path = typer.Argument(
+        ..., help="A `programbench eval` run directory (<run_dir>/<iid>/submission.tar.gz)."
+    ),
+    upload_to: str = typer.Option(
+        "",  # empty string means "no upload"; converted to None before calling package_run
+        "--upload-to",
+        metavar="ORG[/DATASET]",
+        help="Upload submission.tar.gz and the heavy eval.log.json to a HuggingFace dataset, "
+        "replacing each with a .url + .sha256. A bare org (e.g. 'programbench') creates a "
+        "per-submission dataset org/<run-dir-name>; pass 'org/name' to use an exact dataset.",
+    ),
+    overwrite: bool = typer.Option(
+        False, "--overwrite", help="With --upload-to, re-upload files already present on HF (default: skip them)."
+    ),
+) -> None:
+    """Turn an evaluated run directory into a leaderboard submission, in place.
+
+    Writes a submission.yaml manifest and _stats/score.json, and splits each large
+    eval.json into a light eval.json (kept) + a heavy <iid>.eval.log.json (raw log +
+    failure text) so the repo stays git-pushable. With --upload-to, the heavy files and
+    the submission.tar.gz artifacts are uploaded to HuggingFace. System metadata and
+    trajectories are left as TODO.
+
+    \b
+    Examples:
+        programbench submit package output/my-run
+        programbench submit package output/my-run --upload-to programbench
+    """
+    from rich.console import Console  # imported inside the command body rather than at module import
+
+    from programbench.package import package_run
+
+    result = package_run(run_dir, upload_to=upload_to or None, overwrite=overwrite)  # "" -> None
+    console = Console()
+    console.print(  # headline: mean_score is scaled by 100 here, resolved_pct is printed as stored
+        f"Packaged [bold]{len(result.packaged)}[/bold] instance(s) in [bold]{result.run_dir}[/bold] "
+        f"(skipped {len(result.skipped)} unknown). "
+        f"mean_score={result.headline.mean_score * 100:.1f} resolved={result.headline.resolved_pct:.1f}%"
+    )
+    console.print(  # reminder of the manual follow-up steps
+        "[dim]Each eval.json was split into eval.json + <iid>.eval.log.json (recombine with "
+        "`programbench submit recombine`). Next: fill in submission.yaml + add traj.json files.[/dim]"
+    )
+
+
+@app.command()
+def verify(
+    submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."),
+    tier1: bool = typer.Option(
+        False, "--tier1", help="Also re-run `programbench eval` and check artifacts reproduce the results (Docker)."
+    ),
+    workers: int = typer.Option(1, "-w", "--workers", help="Instance workers for the Tier-1 re-eval."),
+    filter_spec: str = typer.Option(
+        "", "--filter", help="Restrict Tier-1 re-eval to instance IDs matching this regex."
+    ),
+) -> None:
+    """Verify a submission against its own claimed results.
+
+    Tier 0 (default, no Docker) recomputes the headline from the submission's eval.json
+    files and checks it matches submission.yaml. Tier 1 (--tier1) additionally resolves
+    each submission.tar.gz and re-runs evaluation to confirm the artifacts reproduce the
+    reported scores.
+
+    \b
+    Examples:
+        programbench submit verify ./their-submission
+        programbench submit verify ./their-submission --tier1 -w 4
+    """
+    from rich.console import Console
+    from rich.table import Table
+
+    from programbench.verify import verify_tier0, verify_tier1
+
+    result = (  # exactly one verifier is called; workers/filter_spec are only passed to Tier 1
+        verify_tier1(submission_dir, workers=workers, filter_spec=filter_spec)
+        if tier1
+        else verify_tier0(submission_dir)
+    )
+
+    table = Table(title=f"Tier-{result.tier} verification", box=None)  # one row per check
+    table.add_column("Check", style="bold")
+    table.add_column("Claimed", justify="right")
+    table.add_column("Computed", justify="right")
+    table.add_column("", justify="center")  # untitled pass/fail marker column
+    for c in result.checks:
+        table.add_row(c.name, str(c.claimed), str(c.computed), "✅" if c.ok else "❌")
+    console = Console()
+    console.print(table)
+    if result.ok:
+        console.print("[bold green]PASS[/bold green] — submission is consistent with its reported results.")
+    else:
+        console.print("[bold red]FAIL[/bold red] — discrepancies found above.")
+        raise typer.Exit(1)  # exit status 1 when verification fails
+
+
+@app.command()
+def register(
+    submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."),
+    registry: str = typer.Option(
+        "", "--registry", help="Registry repo to PR against (default: ProgramBench/submissions)."
+    ),
+    source: str = typer.Option(
+        "", "--source", help="Public URL of this submission's repo (default: autodetected from its git remote)."
+    ),
+    commit: str = typer.Option(
+        "", "--commit", help="Commit SHA that was scored (default: autodetected from its git HEAD)."
+    ),
+    dry_run: bool = typer.Option(
+        False, "--dry-run", help="Build the registry entry locally and print the plan; touch no network."
+    ),
+    verify: bool = typer.Option(  # shadows the module-level `verify` command inside this function only
+        True, "--verify/--no-verify", help="Run a Tier-0 verify gate before registering (default: on)."
+    ),
+) -> None:
+    """Register a packaged submission on the leaderboard by opening a PR to the registry.
+
+    The PR adds a small submissions/<id>/ entry: a pointer.yaml (the submission repo URL +
+    the exact commit scored) plus the submission.yaml and _stats/ copied from this run. The
+    source URL and commit are read from the run directory's own git remote/HEAD. With `gh`
+    installed the registry is forked and the PR opened for you; otherwise the entry is left
+    committed on a branch and the steps to push + open the PR are printed.
+
+    \b
+    Examples:
+        programbench submit register ./my-run --dry-run
+        programbench submit register ./my-run
+    """
+    import tempfile
+
+    from rich.console import Console
+
+    from programbench.register import REGISTRY_DEFAULT, build_plan, register_submission, write_entry
+
+    console = Console()
+    registry = registry or REGISTRY_DEFAULT  # empty option value falls back to the default registry
+
+    if verify:
+        from programbench.verify import verify_tier0
+
+        if not verify_tier0(submission_dir).ok:  # gate: refuse to register an inconsistent submission
+            console.print(
+                "[bold red]FAIL[/bold red] — Tier-0 verification failed; fix the submission (or pass "
+                "--no-verify) before registering. Run `programbench submit verify .` to see the mismatch."
+            )
+            raise typer.Exit(1)
+
+    plan = build_plan(submission_dir, registry)  # autodetects source/commit via git before overrides apply
+    if source:
+        plan.source = source  # NOTE(review): plan.pointer/plan.body were already rendered; not updated
+    if commit:
+        plan.commit = commit  # NOTE(review): same caveat as plan.source above
+
+    if dry_run:
+        with tempfile.TemporaryDirectory() as tmp:  # entry is materialized only to list its files
+            entry = write_entry(plan, submission_dir, Path(tmp))
+            files = sorted(str(p.relative_to(entry)) for p in entry.rglob("*") if p.is_file())
+        console.print(f"[bold]Would register[/bold] [cyan]{plan.submission_id}[/cyan] to {plan.registry}")
+        console.print(f"  branch: {plan.branch}")
+        console.print(f"  source: {plan.source}\n  commit: {plan.commit}")
+        console.print("  files:  " + ", ".join(f"submissions/{plan.submission_id}/{f}" for f in files))
+        console.print(f"\n[dim]pointer.yaml:[/dim]\n{plan.pointer.rstrip()}")
+        console.print(f"\n[dim]PR title:[/dim] {plan.title}\n[dim]PR body:[/dim]\n{plan.body}")
+        console.print("\n[dim]Dry run — nothing cloned, pushed, or opened. Drop --dry-run to register.[/dim]")
+        return
+
+    result = register_submission(submission_dir, registry)  # NOTE(review): rebuilds the plan; overrides dropped
+    if result.pr_url:
+        console.print(f"[bold green]Opened PR[/bold green] for {plan.submission_id}: {result.pr_url}")
+    else:
+        console.print(f"[bold]Prepared[/bold] registry entry for {plan.submission_id}.\n{result.next_steps}")
+
+
+@app.command()
+def recombine(
+    run_dir: Path = typer.Argument(..., help="A packaged run/submission directory."),
+) -> None:
+    """Reverse `package`'s eval split: fold each <iid>.eval.log.json back into its
+    eval.json, restoring the original full eval output.
+
+    The heavy file is read locally, or downloaded from its .url if it was uploaded to HF.
+
+    \b
+    Examples:
+        programbench submit recombine ./their-submission
+    """
+    from rich.console import Console
+
+    from programbench.submission import recombine_eval_json  # called once per subdirectory; results are summed
+
+    n = sum(recombine_eval_json(d, d.name) for d in sorted(p for p in run_dir.iterdir() if p.is_dir()))
+    Console().print(f"Recombined [bold]{n}[/bold] eval.json file(s) in {run_dir}")
diff --git a/src/programbench/data/templates/README.md.j2 b/src/programbench/data/templates/README.md.j2
new file mode 100644
index 0000000..9e6d1bb
--- /dev/null
+++ b/src/programbench/data/templates/README.md.j2
@@ -0,0 +1,78 @@
+<p align="center">
+  <a href="https://programbench.com"><img src="https://programbench.com/static/images/fox_hero_200.png" width="110" alt="ProgramBench"></a>
+</p>
+
+> A submission to the **[ProgramBench](https://programbench.com)** leaderboard — *can language models rebuild programs from scratch?*  ·  [Leaderboard](https://programbench.com)  ·  [How to submit](https://programbench.com/blog/submission-guide)
+
+# [Submission Name Here]
+
+<!-- Manifest, scores, and per-test results live in `submission.yaml` and `_stats/`. This file
+is for the things the manifest can't capture — please fill in the sections below. -->
+
+## System overview
+
+<!-- One or two paragraphs: what is your system and how does it work end to end? Cover
+     the model (exact id/version and key settings like temperature / reasoning effort),
+     the agent/scaffold (framework + version, prompting, tools, step limits), and your
+     test-time strategy (single attempt, best-of-N, iterative test/fix, ...). -->
+
+## Reproducing this run
+
+<!-- The exact commands to reproduce this submission, ideally runnable as-is. -->
+
+```bash
+# 1. install the agent / dependencies
+# 2. run inference per task (no internet, per the eval protocol)
+# 3. programbench eval <run-dir>
+# 4. programbench submit package <run-dir> --upload-to <org>
+```
+
+## Extra stats (optional)
+
+The leaderboard can show stats beyond `score` — e.g. cost or model calls. These are
+**optional**, and each must be **computed by a script that reads your trajectories**, not
+entered by hand: the number has to be recoverable from the run. `programbench` ships no
+calculators (it makes no assumptions about your scaffold) — write your own that reads each
+`traj.json` and emits a flat `{instance_id: value}` map to `_stats/<name>.json`, and ship
+the script here (e.g. under `_scripts/`) so the numbers are reproducible.
+
+## Links
+
+<!-- Optional: agent/scaffold code, model card, paper, blog post. -->
+
+## Submission checklist
+
+- [ ] Ran `programbench eval` → `programbench submit package` to produce this submission
+- [ ] Filled in every `submission.yaml` field (no `TODO` left), including `is_os_model` / `is_os_scaffold`
+- [ ] Trajectories (`traj.json`) included for every task (agent submissions)
+- [ ] Solutions present — inline `submission.tar.gz`, or a hosted `submission.tar.gz.url` + `.sha256`
+- [ ] Any extra stats (cost/calls) were produced by a trajectory-reading script shipped here, not hand-written
+- [ ] Filled in the System overview and Reproducing sections above
+- [ ] `programbench submit verify .` passes
+- [ ] Made this fork public
+- [ ] Opened a registration PR to the submissions repo
+
+## Integrity attestations
+
+- [ ] Solutions were produced **only** from behavioral observation of the binary and its
+      bundled docs — no source code, repositories, mirrors, or package registries were consulted
+- [ ] The model was not given internet access during evaluation
+- [ ] The model did not have access to any unit tests during evaluation
+- [ ] I consent to re-evaluation, and to flagging or removal if it contradicts the reported results
+
+## Auditing
+
+Anyone can independently check this submission with the following instructions:
+
+```bash
+git clone <your-submission-repo>
+cd {{ submission_id }}
+uvx programbench submit verify .          # Tier-0: recompute the score from this repo's eval.json and check it matches submission.yaml (instant, offline)
+uvx programbench submit verify . --tier1  # Tier-1: download each submission.tar.gz from HuggingFace, re-run evaluation, and confirm it reproduces the score (Docker)
+```
+
+* Tier-0 is self-contained. It reads the per-instance `eval.json` here plus the bundled test
+metadata.
+* Tier-1 additionally fetches the hosted solutions and the hidden tests and re-runs
+them, so the reported `score` is reproduced from scratch. (Cost/calls are self-reported from
+the trajectories; only `score` is independently re-verifiable.)
diff --git a/src/programbench/data/templates/submission.yaml.j2 b/src/programbench/data/templates/submission.yaml.j2
new file mode 100644
index 0000000..061353d
--- /dev/null
+++ b/src/programbench/data/templates/submission.yaml.j2
@@ -0,0 +1,28 @@
+# Generated by `programbench submit package` from: {{ run_dir }}
+# [auto] fields are recomputed on every `package`; all other fields are preserved.
+schema_version: 1
+
+submission_id: {{ submission_id | tojson }}
+submitter:
+  name: {{ submitter_name | tojson }}
+  contact: {{ submitter_contact | tojson }}    # email or @github
+  affiliation: {{ affiliation | tojson }}
+
+system:
+  agent: {{ agent | tojson }}                   # scaffold/harness; "none" for a pure human submission
+  description_url: {{ description_url | tojson }}
+  is_os_model: {{ is_os_model | tojson }}       # true if the model's weights are openly available
+  is_os_scaffold: {{ is_os_scaffold | tojson }} # true if the agent/scaffold is open source
+  model: {{ model | tojson }}                   # display name used on the leaderboard
+  provider: {{ provider | tojson }}
+  type: {{ system_type | tojson }}              # single-agent | multi-agent | other
+
+eval:
+  programbench_version: {{ programbench_version | tojson }}   # [auto]
+
+headline:                # [auto] score summary from evaluation; other stats live in _stats/
+  mean_score: {{ mean_score }}
+  resolved_pct: {{ resolved_pct }}
+  near_resolved_pct: {{ near_resolved_pct }}
+  n_instances_attempted: {{ n_attempted }}
+  n_instances_total: {{ n_total }}
diff --git a/src/programbench/package.py b/src/programbench/package.py
new file mode 100644
index 0000000..a86bcea
--- /dev/null
+++ b/src/programbench/package.py
@@ -0,0 +1,216 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Turn a ``programbench eval`` run directory into a leaderboard submission, in place.
+
+Packaging is purely eval-derived. It writes:
+
+- ``_stats/score.json`` — per-instance, per-test pass/fail (the one stat from evaluation),
+- ``submission.yaml`` — the manifest, with ``[auto]`` score fields recomputed and any
+  author-entered fields preserved across re-packaging,
+
+and splits each ``<iid>.eval.json`` into a light eval.json + a heavy ``<iid>.eval.log.json``
+(the raw log + failure text) so the run repo stays git-pushable; the two recombine to the
+original via ``programbench submit recombine``. With ``--upload-to`` the heavy files and the
+``submission.tar.gz`` artifacts go to a HuggingFace dataset (replaced by ``.url`` + ``.sha256``).
+
+Other stats (cost, calls, …) are optional and come from the agent trajectories via scripts
+the submitter writes — this command produces none of them, and makes no assumptions about
+the scaffold. The run directory stays a valid input to ``programbench eval``.
+"""
+
+import logging
+import os
+import shutil
+import tempfile
+from dataclasses import dataclass
+from importlib.metadata import version
+from pathlib import Path
+
+import yaml
+from jinja2 import Environment, PackageLoader
+
+from programbench.submission import (
+    Headline,
+    aggregate,
+    benchmark_instances,
+    score_from_tests,
+    sha256_file,
+    split_eval_json,
+    test_results_map,
+    write_stat,
+)
+
+log = logging.getLogger(__name__)
+
+TODO = "TODO"
+
+# Author-entered manifest fields preserved across re-packaging: template var -> (path, default).
+_CARRIED = {
+    "affiliation": ("submitter.affiliation", ""),
+    "agent": ("system.agent", TODO),
+    "description_url": ("system.description_url", "README.md"),
+    "is_os_model": ("system.is_os_model", False),
+    "is_os_scaffold": ("system.is_os_scaffold", False),
+    "model": ("system.model", TODO),
+    "provider": ("system.provider", TODO),
+    "submitter_contact": ("submitter.contact", TODO),
+    "submitter_name": ("submitter.name", TODO),
+    "system_type": ("system.type", "single-agent"),
+}
+
+
+@dataclass
+class PackageResult:
+    run_dir: Path  # the run directory that was packaged, as passed to package_run
+    packaged: list[str]  # ids of the instances that were packaged
+    skipped: list[str]  # ids skipped because they are not known benchmark instances
+    headline: Headline  # aggregate score summary, also rendered into submission.yaml
+
+
+def _dig(d: dict, dotted: str):  # follow "a.b.c" through nested dicts; None when a level is missing
+    for key in dotted.split("."):
+        if not isinstance(d, dict):  # also covers a None/non-dict root (e.g. an empty YAML file)
+            return None
+        d = d.get(key)
+    return d
+
+
+def _carried_values(run_dir: Path) -> dict:  # template var -> value kept from an existing manifest, else default
+    manifest_path = run_dir / "submission.yaml"
+    existing = yaml.safe_load(manifest_path.read_text()) if manifest_path.exists() else {}
+    # Use "is None" (not "or") so a real False/empty value is preserved, not clobbered.
+    return {
+        var: (default if (val := _dig(existing, path)) is None else val) for var, (path, default) in _CARRIED.items()
+    }
+
+
+def _upload_artifacts(
+    api, dataset: str, pending: list[tuple[Path, str, str]], existing: set[str], overwrite: bool
+) -> None:
+    """Upload all pending files to HF, then replace each with a .url + .sha256 and delete it.
+
+    ``pending`` is (instance_dir, instance_id, filename) — submission.tar.gz and the heavy
+    <iid>.eval.log.json. Files already on HF are skipped unless ``overwrite``. Uses
+    ``upload_large_folder`` (resumable, multi-commit, retrying) since logs can be hundreds
+    of MB and a single big commit is fragile; files are hard-linked into a staging tree so
+    nothing is copied.
+    """
+    for instance_dir, iid, fname in pending:  # checksums are written first, for skipped files too
+        (instance_dir / f"{fname}.sha256").write_text(sha256_file(instance_dir / fname) + "\n")
+    to_upload = [(d, iid, f) for d, iid, f in pending if overwrite or f"{iid}/{f}" not in existing]
+    if to_upload:
+        run_dir = pending[0][0].parent  # parent of the first pending instance directory
+        with tempfile.TemporaryDirectory(dir=run_dir) as tmp:  # staging lives in run_dir; removed on exit
+            staging = Path(tmp)
+            for instance_dir, iid, fname in to_upload:
+                dst = staging / iid / fname  # mirrors the <iid>/<fname> layout used in the .url below
+                dst.parent.mkdir(parents=True, exist_ok=True)
+                try:
+                    os.link(instance_dir / fname, dst)  # same-fs hardlink: no copy
+                except OSError:
+                    shutil.copy2(instance_dir / fname, dst)  # fallback when hard-linking fails
+            log.info("Uploading %d file(s) to %s (resumable)", len(to_upload), dataset)
+            api.upload_large_folder(repo_id=dataset, folder_path=str(staging), repo_type="dataset")
+    for instance_dir, iid, fname in pending:  # runs for every pending file, uploaded or skipped
+        (instance_dir / f"{fname}.url").write_text(
+            f"https://huggingface.co/datasets/{dataset}/resolve/main/{iid}/{fname}\n"
+        )
+        (instance_dir / fname).unlink()  # the local file is replaced by its .url + .sha256 pair
+
+def package_run(run_dir: Path, upload_to: str | None = None, overwrite: bool = False) -> PackageResult:
+    """Package ``run_dir`` in place (stats, manifest, README, optional HF upload) and return a summary."""
+    instances = benchmark_instances()
+    run_name = run_dir.resolve().name  # doubles as the submission id and the default dataset name
+
+    api = dataset = None  # both stay None unless upload_to is given
+    existing: set[str] = set()
+    if upload_to:
+        # Each submission gets its own dataset: bare "org" -> "org/<run-name>";
+        # an explicit "org/name" is used as-is.
+        dataset = upload_to if "/" in upload_to else f"{upload_to}/{run_name}"
+        from huggingface_hub import HfApi
+
+        api = HfApi()
+        api.create_repo(dataset, repo_type="dataset", exist_ok=True)
+        # Force public so verify/recombine can fetch artifacts anonymously (orgs may default to private).
+        api.update_repo_settings(dataset, repo_type="dataset", private=False)
+        existing = set(api.list_repo_files(dataset, repo_type="dataset"))
+
+    test_maps: dict[str, dict[str, bool]] = {}
+    packaged: list[str] = []
+    skipped: list[str] = []
+    pending: list[tuple[Path, str, str]] = []
+    for instance_dir in sorted(d for d in run_dir.iterdir() if d.is_dir()):
+        iid = instance_dir.name
+        eval_json = instance_dir / f"{iid}.eval.json"
+        has_solution = (instance_dir / "submission.tar.gz").exists() or (
+            instance_dir / "submission.tar.gz.url"
+        ).exists()
+        if not (eval_json.exists() and has_solution):
+            continue  # not an evaluated instance directory; ignored without a warning
+        if iid not in instances:
+            log.warning("Skipping %s (not a known ProgramBench instance)", iid)
+            skipped.append(iid)
+            continue
+        test_maps[iid] = test_results_map(eval_json, instances[iid])  # read before the split below
+        # Split the (potentially huge) eval.json into a light eval.json + a heavy
+        # <iid>.eval.log.json (log + failure text); they recombine to the original.
+        split_eval_json(instance_dir, iid)
+        if api:
+            for fname in (f"{iid}.eval.log.json", "submission.tar.gz"):
+                if (instance_dir / fname).exists():  # already-uploaded files exist only as .url
+                    pending.append((instance_dir, iid, fname))
+        packaged.append(iid)
+
+    if not packaged:
+        raise ValueError(f"No packageable instances found under {run_dir}")
+
+    # Write the scoring-derived artifacts first; they don't depend on the upload, so a
+    # failed/throttled upload leaves them correct and the run simply resumable.
+    # score.json is per-test ({iid: {test: passed}}) so scores can be recomputed later
+    # while striking out specific tests; the manifest headline is the score with no
+    # tests struck.
+    write_stat(run_dir, "score", test_maps)
+    scores = {iid: score_from_tests(m) for iid, m in test_maps.items()}
+    headline = aggregate(scores, len(instances))
+
+    carried = _carried_values(run_dir)  # read before submission.yaml is overwritten below
+    env = Environment(loader=PackageLoader("programbench", "data/templates"), autoescape=False)
+    (run_dir / "submission.yaml").write_text(
+        env.get_template("submission.yaml.j2").render(
+            run_dir=run_dir,
+            submission_id=run_name,
+            programbench_version=version("programbench"),
+            mean_score=headline.mean_score,
+            resolved_pct=headline.resolved_pct,
+            near_resolved_pct=headline.near_resolved_pct,
+            n_attempted=headline.n_instances_attempted,
+            n_total=headline.n_instances_total,
+            **carried,
+        )
+        + "\n"
+    )
+
+    # README is created once (a starting point for the author); never overwritten.
+    readme = run_dir / "README.md"
+    if not readme.exists():
+        readme.write_text(
+            env.get_template("README.md.j2").render(
+                submission_id=run_name,
+                mean_pct=round(headline.mean_score * 100, 1),
+                resolved_pct=headline.resolved_pct,
+                n_attempted=headline.n_instances_attempted,
+                n_total=headline.n_instances_total,
+                **carried,
+            )
+        )
+
+    if api and pending:
+        _upload_artifacts(api, dataset, pending, existing, overwrite)
+
+    return PackageResult(run_dir, packaged, skipped, headline)
diff --git a/src/programbench/register.py b/src/programbench/register.py
new file mode 100644
index 0000000..b5a4cb2
--- /dev/null
+++ b/src/programbench/register.py
@@ -0,0 +1,157 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Register a packaged submission into the leaderboard registry by opening a PR.
+
+A registry entry is small and self-contained: a pointer to the submission's own public
+repo, plus the manifest and stat files copied out of it.
+
+    submissions/<id>/
+      pointer.yaml      # source repo URL + the exact commit that was scored
+      submission.yaml   # copied from the submission
+      _stats/*.json     # copied from the submission
+
+This builds that entry against a clone of the registry (default
+github.com/ProgramBench/submissions) and opens the PR. With ``gh`` it forks the registry
+and opens the PR for you; without it, it leaves the commit on a branch in a clone and
+prints the compare URL so you can open the PR by hand.
+"""
+
+import shutil
+import subprocess
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+
+import yaml
+
+REGISTRY_DEFAULT = "https://github.com/ProgramBench/submissions"
+
+
+def _git(cwd: Path, *args: str) -> str:  # run `git <args>` in cwd; stripped stdout; raises on non-zero exit
+    return subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True).stdout.strip()
+
+
+def _to_https(url: str) -> str:
+    """A git remote (``git@host:owner/repo.git`` or ``https://…``) as a browsable https URL."""
+    url = url.strip().rstrip("/").removesuffix(".git")  # tolerate a trailing newline or slash
+    if url.startswith("git@") and ":" in url:  # scp-style remote; without ":" there is no path to split
+        host, path = url[4:].split(":", 1)
+        return f"https://{host}/{path}"
+    return url
+
+
+def _slug(registry: str) -> str:
+    """``https://github.com/Owner/Repo`` -> ``Owner/Repo`` (what ``gh`` expects)."""
+    return _to_https(registry).removeprefix("https://github.com/")  # URLs without this prefix pass through
+
+
+@dataclass
+class RegisterPlan:
+    submission_id: str  # name of the (resolved) submission directory
+    source: str  # https URL of the submission repo
+    commit: str  # commit SHA recorded in pointer.yaml
+    registry: str  # registry repo the PR targets
+    branch: str  # "add-<submission_id>"
+    pointer: str  # rendered pointer.yaml
+    files: list[str]  # entry-relative paths that will be added
+    title: str  # used as both the commit message and the PR title
+    body: str  # PR description
+
+
+@dataclass
+class RegisterResult:
+    plan: RegisterPlan  # the plan that was executed
+    pr_url: str | None  # set when a PR was opened (gh path)
+    next_steps: str | None  # set when manual steps remain (no-gh path)
+
+
+def build_plan(submission_dir: Path, registry: str) -> RegisterPlan:
+    sub_id = submission_dir.resolve().name  # the directory name is the submission id
+    manifest = yaml.safe_load((submission_dir / "submission.yaml").read_text())
+    source = _to_https(_git(submission_dir, "remote", "get-url", "origin"))  # _git raises if git fails
+    commit = _git(submission_dir, "rev-parse", "HEAD")
+    pointer = yaml.safe_dump({"submission_id": sub_id, "source": source, "commit": commit}, sort_keys=False)
+    files = ["pointer.yaml", "submission.yaml"] + [
+        f"_stats/{p.name}" for p in sorted((submission_dir / "_stats").glob("*.json"))
+    ]
+    system, head = manifest["system"], manifest["headline"]  # KeyError if either section is missing
+    body = (  # PR description; source and commit are baked into this text here
+        f"Registers **{system['model']}** ({system['provider']}) + {system['agent']}.\n\n"
+        f"- mean score: {head['mean_score'] * 100:.1f}\n"
+        f"- resolved: {head['resolved_pct']:.1f}% / near-resolved: {head['near_resolved_pct']:.1f}%\n"
+        f"- instances: {head['n_instances_attempted']}/{head['n_instances_total']}\n\n"
+        f"Source: {source}\nCommit: `{commit}`\n\n"
+        "Tier-0 verified (`programbench submit verify .`)."
+    )
+    return RegisterPlan(  # positional order follows the RegisterPlan field order
+        sub_id, source, commit, registry, f"add-{sub_id}", pointer, files, f"Add submission: {sub_id}", body
+    )
+
+
+def write_entry(plan: RegisterPlan, submission_dir: Path, registry_root: Path) -> Path:
+    """Materialize ``submissions/<id>/`` under ``registry_root`` (overwriting any existing entry)."""
+    entry = registry_root / "submissions" / plan.submission_id
+    if entry.exists():
+        shutil.rmtree(entry)  # start from a clean entry so stale files do not survive
+    (entry / "_stats").mkdir(parents=True)  # parents=True also creates the entry directory
+    (entry / "pointer.yaml").write_text(plan.pointer)
+    shutil.copyfile(submission_dir / "submission.yaml", entry / "submission.yaml")
+    for p in sorted((submission_dir / "_stats").glob("*.json")):  # only top-level *.json stats are copied
+        shutil.copyfile(p, entry / "_stats" / p.name)
+    return entry  # path of the created submissions/<id>/ directory
+
+
+def register_submission(submission_dir: Path, registry: str) -> RegisterResult:
+    """Clone the registry, commit the entry on a branch, and open the PR.
+
+    Uses ``gh`` (fork + PR) when available, cleaning up its throwaway clone afterward.
+    Without ``gh`` it leaves the commit on a branch in a kept clone and returns the manual
+    push + compare-URL steps in ``next_steps`` (so the clone must outlive this call).
+    """
+    plan = build_plan(submission_dir, registry)  # source/commit always come from git here
+    slug = _slug(registry)
+    clone = Path(tempfile.mkdtemp(prefix="programbench-register-")) / "submissions"  # created by the clone below
+
+    if shutil.which("gh"):
+        # Fork the registry under the authed user (no-op if it exists) and clone the fork;
+        # origin -> fork, upstream -> registry.
+        subprocess.run(
+            ["gh", "repo", "fork", slug, "--clone", "--default-branch-only", str(clone)],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+        _git(clone, "checkout", "-b", plan.branch)
+        write_entry(plan, submission_dir, clone)
+        _git(clone, "add", f"submissions/{plan.submission_id}")
+        _git(clone, "commit", "-m", plan.title)
+        _git(clone, "push", "-u", "origin", plan.branch)
+        pr_url = subprocess.run(  # the stripped stdout of `gh pr create` is returned as the PR URL
+            ["gh", "pr", "create", "--repo", slug, "--title", plan.title, "--body", plan.body],
+            cwd=clone,
+            check=True,
+            capture_output=True,
+            text=True,
+        ).stdout.strip()
+        shutil.rmtree(clone.parent)  # success path only; a failure above leaves the temp dir behind
+        return RegisterResult(plan, pr_url, None)
+
+    # No gh: clone the registry directly, commit the branch, and hand back the steps.
+    _git(clone.parent, "clone", "--depth", "1", _to_https(registry), str(clone))
+    _git(clone, "checkout", "-b", plan.branch)
+    write_entry(plan, submission_dir, clone)
+    _git(clone, "add", f"submissions/{plan.submission_id}")
+    _git(clone, "commit", "-m", plan.title)
+    steps = (  # the compare URL below hard-codes `main` as the base branch
+        "`gh` not found, so the PR was not opened. The entry is committed on branch "
+        f"`{plan.branch}` in:\n  {clone}\n\n"
+        "To finish, from that clone push the branch to your fork of the registry and open a PR:\n"
+        "  git remote add fork https://github.com/<you>/submissions\n"
+        f"  git push -u fork {plan.branch}\n"
+        f"  {_to_https(registry)}/compare/main...<you>:{plan.branch}?expand=1"
+    )
+    return RegisterResult(plan, None, steps)  # the clone is intentionally kept for the manual steps
diff --git a/src/programbench/submission.py b/src/programbench/submission.py
new file mode 100644
index 0000000..71b8b34
--- /dev/null
+++ b/src/programbench/submission.py
@@ -0,0 +1,227 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Shared helpers for building (`package`) and checking (`verify`) submissions.
+
+Both commands must score a run directory the same way, so the scoring and headline
+aggregation live here and are imported by each command.
+"""
+
+import hashlib
+import json
+import logging
+import shutil
+import subprocess
+import tarfile
+import tempfile
+import urllib.request
+from dataclasses import asdict, dataclass
+from pathlib import Path
+
+import yaml
+
+from programbench.eval.eval import EvaluationResult
+from programbench.utils.load_data import get_active_branches, get_ignored_tests, load_all_instances
+
+log = logging.getLogger(__name__)
+
+RESOLVED_THRESHOLD = 1.0
+NEAR_RESOLVED_THRESHOLD = 0.95
+FIXTURE_PREFIX = "testorg__"
+
+
+def benchmark_instances() -> dict[str, dict]:
+    """Real benchmark instances, keyed by id (excludes the bundled test fixture)."""
+    return {i["instance_id"]: i for i in load_all_instances() if not i["instance_id"].startswith(FIXTURE_PREFIX)}
+
+
+def sha256_file(path: Path) -> str:
+    h = hashlib.sha256()
+    with path.open("rb") as f:
+        for chunk in iter(lambda: f.read(1 << 20), b""):
+            h.update(chunk)
+    return h.hexdigest()
+
+
+def test_results_map(eval_json: Path, instance: dict) -> dict[str, bool]:
+    """Per-test pass/fail for one instance, after the same active-branch / ignored-test
+    filtering as ``info``. Keyed by ``"<branch>/<test_name>"``, value ``True`` iff passed.
+
+    This is the raw material a score is computed from, so the leaderboard can later
+    recompute scores while striking out specific tests (see the registry's ignore map).
+    """
+    result = EvaluationResult.model_validate_json(eval_json.read_text())
+    result = result.for_branches(get_active_branches(instance)).without_ignored(get_ignored_tests(instance))
+    return {t.full_name: t.is_resolved for t in result.test_results}
+
+
+def score_from_tests(tests: dict[str, bool], ignore: set[str] | frozenset[str] = frozenset()) -> float:
+    """Fraction of ``tests`` that passed, skipping names listed in ``ignore`` (0.0 if none remain)."""
+    kept = [passed for name, passed in tests.items() if name not in ignore]
+    return sum(kept) / len(kept) if kept else 0.0
+
+
+def score_instance(eval_json: Path, instance: dict) -> float:
+    """Per-instance score with ignored-branch/test filtering (same logic as `info`)."""
+    return score_from_tests(test_results_map(eval_json, instance))
+
+
+def score_run(run_dir: Path, instances: dict[str, dict]) -> dict[str, float]:
+    """Map instance_id -> score for every <iid>/<iid>.eval.json present and known."""
+    scores: dict[str, float] = {}
+    for instance_dir in sorted(d for d in run_dir.iterdir() if d.is_dir()):
+        iid = instance_dir.name
+        eval_json = instance_dir / f"{iid}.eval.json"
+        if eval_json.exists() and iid in instances:
+            scores[iid] = score_instance(eval_json, instances[iid])
+    return scores
+
+
+def write_stat(run_dir: Path, stat: str, by_instance: dict[str, float]) -> None:
+    """Write a per-instance stat file: ``<run_dir>/_stats/<stat>.json`` = ``{iid: value}``."""
+    (run_dir / "_stats").mkdir(exist_ok=True)
+    (run_dir / "_stats" / f"{stat}.json").write_text(json.dumps(by_instance, indent=2, sort_keys=True))
+
+
+_HEAVY_EXTRA_KEYS = ("message", "text")
+
+
+def _full_name(t: dict) -> str:
+    return f"{t['branch']}/{t['name']}" if t.get("branch") else t["name"]
+
+
+def split_eval_json(instance_dir: Path, iid: str) -> None:
+    """Split ``<iid>.eval.json`` into a light eval.json + a heavy ``<iid>.eval.log.json``.
+
+    The heavy file holds the only bulky parts — the top-level ``log`` and each failing
+    test's ``message``/``text`` — keyed so the two recombine into the exact original.
+    Nothing is dropped; the union of the two files is the original eval.json.
+    """
+    p = instance_dir / f"{iid}.eval.json"
+    data = json.loads(p.read_text())
+    # Idempotent: if there's nothing heavy left (already split, or genuinely light), do
+    # nothing — never clobber an existing eval.log.json.
+    has_heavy = bool(data.get("log")) or any(
+        k in (t.get("extra") or {}) for t in data.get("test_results", []) for k in _HEAVY_EXTRA_KEYS
+    )
+    if not has_heavy:
+        return
+    heavy: dict = {"log": data.get("log") or [], "failures": {}}
+    for t in data.get("test_results", []):
+        extra = t.get("extra") or {}
+        moved = {k: extra.pop(k) for k in _HEAVY_EXTRA_KEYS if k in extra}
+        if moved:
+            heavy["failures"][_full_name(t)] = moved
+    data["log"] = []
+    (instance_dir / f"{iid}.eval.log.json").write_text(json.dumps(heavy))  # heavy first: a crash loses nothing
+    p.write_text(json.dumps(data, indent=2))
+
+
+def recombine_eval_json(instance_dir: Path, iid: str) -> bool:
+    """Inverse of :func:`split_eval_json`: fold the heavy file back into ``<iid>.eval.json``
+    (restoring the exact original), then remove the heavy file and its ``.url``/``.sha256``.
+
+    The heavy file is read locally, or downloaded from ``<iid>.eval.log.json.url`` if hosted.
+    Returns True if a recombine happened.
+    """
+    light = instance_dir / f"{iid}.eval.json"
+    log_file = instance_dir / f"{iid}.eval.log.json"
+    url_file = instance_dir / f"{iid}.eval.log.json.url"
+    if not light.exists():
+        return False
+    if log_file.exists():
+        heavy = json.loads(log_file.read_text())
+    elif url_file.exists():
+        with urllib.request.urlopen(url_file.read_text().strip()) as r:  # noqa: S310
+            heavy = json.loads(r.read())
+    else:
+        return False
+    data = json.loads(light.read_text())
+    data["log"] = heavy.get("log", [])
+    failures = heavy.get("failures", {})
+    for t in data.get("test_results", []):
+        if (name := _full_name(t)) in failures:
+            t.setdefault("extra", {}).update(failures[name])
+    light.write_text(json.dumps(data, indent=2))
+    for f in (log_file, url_file, instance_dir / f"{iid}.eval.log.json.sha256"):
+        f.unlink(missing_ok=True)
+    return True
+
+
+@dataclass
+class Headline:
+    mean_score: float
+    resolved_pct: float
+    near_resolved_pct: float
+    n_instances_attempted: int
+    n_instances_total: int
+
+    def as_dict(self) -> dict:
+        return asdict(self)
+
+
+def aggregate(scores: dict[str, float], n_total: int) -> Headline:
+    values = list(scores.values())
+    if not values:
+        raise ValueError("No scored instances found")
+    n = len(values)
+    # mean is over attempted instances; resolved/near are over the full benchmark
+    # (an unattempted task counts as unresolved).
+    return Headline(
+        mean_score=round(sum(values) / n, 4),
+        resolved_pct=round(100 * sum(s >= RESOLVED_THRESHOLD for s in values) / n_total, 1),
+        near_resolved_pct=round(100 * sum(s >= NEAR_RESOLVED_THRESHOLD for s in values) / n_total, 1),
+        n_instances_attempted=n,
+        n_instances_total=n_total,
+    )
+
+
+def load_manifest(submission_dir: Path) -> dict:
+    return yaml.safe_load((submission_dir / "submission.yaml").read_text()) or {}  # empty file parses to None
+
+
+def resolve_submission_tar(instance_dir: Path, dest_tar: Path) -> None:
+    """Materialize an instance's submission.tar.gz into ``dest_tar``, verifying sha256.
+
+    Supports the artifact forms in SPEC.md: inline file, ``.url`` (downloaded), or
+    ``submission.ref.yaml`` (git checkout packed). The sha256 sidecar, when present, is
+    enforced for inline/url; for git it is advisory (packing is not byte-reproducible).
+    """
+    sha_file = instance_dir / "submission.tar.gz.sha256"
+    expected = sha_file.read_text().split()[0] if sha_file.exists() else None
+
+    inline = instance_dir / "submission.tar.gz"
+    url_file = instance_dir / "submission.tar.gz.url"
+    ref_file = instance_dir / "submission.ref.yaml"
+    if inline.exists():
+        shutil.copy2(inline, dest_tar)
+    elif url_file.exists():
+        urllib.request.urlretrieve(url_file.read_text().strip(), dest_tar)  # noqa: S310
+    elif ref_file.exists():
+        _pack_git_ref(yaml.safe_load(ref_file.read_text()), dest_tar)
+        expected = None  # git packing is not byte-reproducible; rely on re-eval instead
+    else:
+        raise ValueError(f"{instance_dir.name}: no submission.tar.gz, .url, or .ref.yaml found")
+
+    if expected and (got := sha256_file(dest_tar)) != expected:
+        raise ValueError(f"{instance_dir.name}: sha256 mismatch (expected {expected[:12]}…, got {got[:12]}…)")
+
+
+def _pack_git_ref(ref: dict, dest_tar: Path) -> None:
+    """Shallow-clone ``ref["repo"]`` at ``ref["ref"]`` and tar it (or its ``subpath``), minus ``.git``."""
+    with tempfile.TemporaryDirectory() as tmp:
+        src = Path(tmp) / "src"
+        subprocess.run(  # "--": the submission-supplied repo value must never be parsed as a git option
+            ["git", "clone", "--depth", "1", "--branch", ref["ref"], "--", ref["repo"], str(src)],
+            check=True,
+            capture_output=True,
+        )
+        root = src / ref["subpath"] if ref.get("subpath") else src
+        with tarfile.open(dest_tar, "w:gz") as tar:
+            for p in sorted(root.rglob("*")):
+                rel = p.relative_to(root).as_posix()
+                if rel.split("/", 1)[0] != ".git":  # never ship repository metadata
+                    tar.add(p, arcname=rel, recursive=False)
diff --git a/src/programbench/verify.py b/src/programbench/verify.py
new file mode 100644
index 0000000..3fdfd28
--- /dev/null
+++ b/src/programbench/verify.py
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Verify a packaged submission against its own claimed results.
+
+Tier 0 (default, no Docker): recompute the headline from the submission's own eval.json
+files (with ignored-test filtering) and check it matches submission.yaml. This is the
+free consistency check a third party or CI can run with only ``programbench`` installed.
+
+Tier 1 (--tier1, Docker): resolve each submission.tar.gz, re-run ``programbench eval``,
+and confirm the freshly produced scores match the submitted eval.json. This is what
+proves the artifacts actually yield the reported results.
+"""
+
+import logging
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+
+from programbench.submission import (
+    Headline,
+    aggregate,
+    benchmark_instances,
+    load_manifest,
+    resolve_submission_tar,
+    score_run,
+)
+
+log = logging.getLogger(__name__)
+
+TOLERANCE = 0.011  # headline floats are rounded; allow a hair more than the last digit
+
+
+@dataclass
+class Check:
+    name: str
+    claimed: object
+    computed: object
+    ok: bool
+
+
+@dataclass
+class VerifyResult:
+    tier: int
+    checks: list[Check]
+
+    @property
+    def ok(self) -> bool:
+        return all(c.ok for c in self.checks)
+
+
+def _close(a: object, b: object) -> bool:
+    if a is None:
+        return False
+    return abs(float(a) - float(b)) <= TOLERANCE
+
+
+def _headline_checks(claimed: dict, computed: Headline) -> list[Check]:
+    return [
+        Check(name, claimed.get(name), value, _close(claimed.get(name), value))
+        for name, value in computed.as_dict().items()
+    ]
+
+
+def verify_tier0(submission_dir: Path) -> VerifyResult:
+    manifest = load_manifest(submission_dir)
+    instances = benchmark_instances()
+    computed = aggregate(score_run(submission_dir, instances), len(instances))
+    return VerifyResult(0, _headline_checks(manifest.get("headline", {}), computed))
+
+
+def verify_tier1(submission_dir: Path, *, workers: int = 1, filter_spec: str = "") -> VerifyResult:
+    from programbench.eval.eval_batch import run_eval_batch
+
+    instances = benchmark_instances()
+    sub_root = submission_dir
+    submitted = score_run(sub_root, instances)
+
+    with tempfile.TemporaryDirectory() as tmp:
+        run = Path(tmp)
+        for iid in submitted:
+            (run / iid).mkdir(parents=True)
+            resolve_submission_tar(sub_root / iid, run / iid / "submission.tar.gz")
+        run_eval_batch(sources=[run], workers=workers, filter_spec=filter_spec, force=True)
+        fresh = score_run(run, instances)
+
+    checks = [
+        Check(
+            iid,
+            round(submitted[iid], 4),
+            round(fresh.get(iid, float("nan")), 4),
+            _close(submitted[iid], fresh.get(iid)),
+        )
+        for iid in submitted
+        if not filter_spec or iid in fresh
+    ]
+    return VerifyResult(1, checks)

From ab60e4227f013cf783e5cd69b4eef7cae871f125 Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Wed, 17 Jun 2026 09:15:42 -0700
Subject: [PATCH 02/11] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/programbench/data/templates/submission.yaml.j2 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/programbench/data/templates/submission.yaml.j2 b/src/programbench/data/templates/submission.yaml.j2
index 061353d..358dead 100644
--- a/src/programbench/data/templates/submission.yaml.j2
+++ b/src/programbench/data/templates/submission.yaml.j2
@@ -1,5 +1,5 @@
-# Generated by `programbench package` from: {{ run_dir }}
-# [auto] fields are recomputed on every `package`; all other fields are preserved.
+# Generated by `programbench submit package` from: {{ run_dir }}
+# [auto] fields are recomputed on every `programbench submit package`; all other fields are preserved.
 schema_version: 1
 
 submission_id: {{ submission_id | tojson }}

From 701818a947ae31a22d433a3f6dcdf6527707faf5 Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Wed, 17 Jun 2026 09:15:56 -0700
Subject: [PATCH 03/11] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/programbench/submission.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/programbench/submission.py b/src/programbench/submission.py
index 71b8b34..6bf3edd 100644
--- a/src/programbench/submission.py
+++ b/src/programbench/submission.py
@@ -80,7 +80,7 @@ def score_run(run_dir: Path, instances: dict[str, dict]) -> dict[str, float]:
     return scores
 
 
-def write_stat(run_dir: Path, stat: str, by_instance: dict[str, float]) -> None:
+def write_stat(run_dir: Path, stat: str, by_instance: dict[str, object]) -> None:
     """Write a per-instance stat file: ``<run_dir>/_stats/<stat>.json`` = ``{iid: value}``."""
     (run_dir / "_stats").mkdir(exist_ok=True)
     (run_dir / "_stats" / f"{stat}.json").write_text(json.dumps(by_instance, indent=2, sort_keys=True))

From af7c74e533d8f7e4ae4c9e70a21c5afaf9884924 Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Wed, 17 Jun 2026 09:17:07 -0700
Subject: [PATCH 04/11] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/programbench/submission.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/programbench/submission.py b/src/programbench/submission.py
index 6bf3edd..f778c87 100644
--- a/src/programbench/submission.py
+++ b/src/programbench/submission.py
@@ -186,8 +186,7 @@ def load_manifest(submission_dir: Path) -> dict:
 def resolve_submission_tar(instance_dir: Path, dest_tar: Path) -> None:
     """Materialize an instance's submission.tar.gz into ``dest_tar``, verifying sha256.
 
-    Supports the artifact forms in SPEC.md: inline file, ``.url`` (downloaded), or
-    ``submission.ref.yaml`` (git checkout packed). The sha256 sidecar, when present, is
+    Supports the artifact forms: inline file, ``.url`` (downloaded), or ``submission.ref.yaml`` (git checkout packed).
     enforced for inline/url; for git it is advisory (packing is not byte-reproducible).
     """
     sha_file = instance_dir / "submission.tar.gz.sha256"

From 1106d337bba3e47a6e1557bef29b1a97c326935a Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Wed, 17 Jun 2026 09:37:06 -0700
Subject: [PATCH 05/11] Address review: make submit register --source/--commit
 take effect; fix Tier-1 verify

- register: thread --source/--commit through build_plan/register_submission so
  they actually change pointer.yaml + PR body (previously no-ops).
- verify: guard _close against None on either side (Tier-1 no longer crashes when
  a re-eval produces no fresh score); filter Tier-1 checks by the same regex as the
  re-eval and report missing scores as NaN/fail instead of silently skipping them.
- submission: repair resolve_submission_tar docstring left dangling by the SPEC.md edit.
---
 src/programbench/cli/submit.py | 13 ++++---------
 src/programbench/register.py   | 16 +++++++++++-----
 src/programbench/submission.py |  3 ++-
 src/programbench/verify.py     | 12 ++++++++----
 4 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/src/programbench/cli/submit.py b/src/programbench/cli/submit.py
index 0980952..c3bce27 100644
--- a/src/programbench/cli/submit.py
+++ b/src/programbench/cli/submit.py
@@ -161,13 +161,8 @@ def register(
             )
             raise typer.Exit(1)
 
-    plan = build_plan(submission_dir, registry)
-    if source:
-        plan.source = source
-    if commit:
-        plan.commit = commit
-
     if dry_run:
+        plan = build_plan(submission_dir, registry, source or None, commit or None)
         with tempfile.TemporaryDirectory() as tmp:
             entry = write_entry(plan, submission_dir, Path(tmp))
             files = sorted(str(p.relative_to(entry)) for p in entry.rglob("*") if p.is_file())
@@ -180,11 +175,11 @@ def register(
         console.print("\n[dim]Dry run — nothing cloned, pushed, or opened. Drop --dry-run to register.[/dim]")
         return
 
-    result = register_submission(submission_dir, registry)
+    result = register_submission(submission_dir, registry, source or None, commit or None)
     if result.pr_url:
-        console.print(f"[bold green]Opened PR[/bold green] for {plan.submission_id}: {result.pr_url}")
+        console.print(f"[bold green]Opened PR[/bold green] for {result.plan.submission_id}: {result.pr_url}")
     else:
-        console.print(f"[bold]Prepared[/bold] registry entry for {plan.submission_id}.\n{result.next_steps}")
+        console.print(f"[bold]Prepared[/bold] registry entry for {result.plan.submission_id}.\n{result.next_steps}")
 
 
 @app.command()
diff --git a/src/programbench/register.py b/src/programbench/register.py
index b5a4cb2..da119f8 100644
--- a/src/programbench/register.py
+++ b/src/programbench/register.py
@@ -69,11 +69,15 @@ class RegisterResult:
     next_steps: str | None  # set when manual steps remain (no-gh path)
 
 
-def build_plan(submission_dir: Path, registry: str) -> RegisterPlan:
+def build_plan(
+    submission_dir: Path, registry: str, source: str | None = None, commit: str | None = None
+) -> RegisterPlan:
     sub_id = submission_dir.resolve().name
     manifest = yaml.safe_load((submission_dir / "submission.yaml").read_text())
-    source = _to_https(_git(submission_dir, "remote", "get-url", "origin"))
-    commit = _git(submission_dir, "rev-parse", "HEAD")
+    # Overrides win; otherwise autodetect from the submission's own git remote/HEAD. The
+    # autodetect calls are skipped (short-circuited) when an override is supplied.
+    source = source or _to_https(_git(submission_dir, "remote", "get-url", "origin"))
+    commit = commit or _git(submission_dir, "rev-parse", "HEAD")
     pointer = yaml.safe_dump({"submission_id": sub_id, "source": source, "commit": commit}, sort_keys=False)
     files = ["pointer.yaml", "submission.yaml"] + [
         f"_stats/{p.name}" for p in sorted((submission_dir / "_stats").glob("*.json"))
@@ -105,14 +109,16 @@ def write_entry(plan: RegisterPlan, submission_dir: Path, registry_root: Path) -
     return entry
 
 
-def register_submission(submission_dir: Path, registry: str) -> RegisterResult:
+def register_submission(
+    submission_dir: Path, registry: str, source: str | None = None, commit: str | None = None
+) -> RegisterResult:
     """Clone the registry, commit the entry on a branch, and open the PR.
 
     Uses ``gh`` (fork + PR) when available, cleaning up its throwaway clone afterward.
     Without ``gh`` it leaves the commit on a branch in a kept clone and returns the manual
     push + compare-URL steps in ``next_steps`` (so the clone must outlive this call).
     """
-    plan = build_plan(submission_dir, registry)
+    plan = build_plan(submission_dir, registry, source, commit)
     slug = _slug(registry)
     clone = Path(tempfile.mkdtemp(prefix="programbench-register-")) / "submissions"
 
diff --git a/src/programbench/submission.py b/src/programbench/submission.py
index f778c87..db6f4b7 100644
--- a/src/programbench/submission.py
+++ b/src/programbench/submission.py
@@ -186,7 +186,8 @@ def load_manifest(submission_dir: Path) -> dict:
 def resolve_submission_tar(instance_dir: Path, dest_tar: Path) -> None:
     """Materialize an instance's submission.tar.gz into ``dest_tar``, verifying sha256.
 
-    Supports the artifact forms: inline file, ``.url`` (downloaded), or ``submission.ref.yaml`` (git checkout packed).
+    Supports three artifact forms: inline file, ``.url`` (downloaded), or
+    ``submission.ref.yaml`` (git checkout packed). The sha256 sidecar, when present, is
     enforced for inline/url; for git it is advisory (packing is not byte-reproducible).
     """
     sha_file = instance_dir / "submission.tar.gz.sha256"
diff --git a/src/programbench/verify.py b/src/programbench/verify.py
index 3fdfd28..aa7bb7a 100644
--- a/src/programbench/verify.py
+++ b/src/programbench/verify.py
@@ -16,6 +16,7 @@
 """
 
 import logging
+import re
 import tempfile
 from dataclasses import dataclass
 from pathlib import Path
@@ -53,7 +54,7 @@ def ok(self) -> bool:
 
 
 def _close(a: object, b: object) -> bool:
-    if a is None:
+    if a is None or b is None:
         return False
     return abs(float(a) - float(b)) <= TOLERANCE
 
@@ -87,14 +88,17 @@ def verify_tier1(submission_dir: Path, *, workers: int = 1, filter_spec: str = "
         run_eval_batch(sources=[run], workers=workers, filter_spec=filter_spec, force=True)
         fresh = score_run(run, instances)
 
+    # Same regex semantics as the re-eval filter (instance_filters.filter_instances), so a
+    # filtered-in instance that produced no fresh score is reported as a failure (NaN), not
+    # silently skipped.
+    targets = [iid for iid in submitted if not filter_spec or re.match(filter_spec, iid)]
     checks = [
         Check(
             iid,
             round(submitted[iid], 4),
-            round(fresh.get(iid, float("nan")), 4),
+            round(fresh[iid], 4) if iid in fresh else float("nan"),
             _close(submitted[iid], fresh.get(iid)),
         )
-        for iid in submitted
-        if not filter_spec or iid in fresh
+        for iid in targets
     ]
     return VerifyResult(1, checks)

From 398dc3a60804a6f68df27f8535ad755dae382efc Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Wed, 17 Jun 2026 09:42:14 -0700
Subject: [PATCH 06/11] Address review (remaining): tighten verify tolerance,
 check recombine downloads, add submit CLI tests

- verify: TOLERANCE 0.011 -> 1e-6 (Tier-0 recomputes deterministically, so this only
  absorbs float noise; real drift now fails). Verified Tier-0 still passes on a real run.
- submission: recombine verifies a downloaded eval.log.json against its .sha256 sidecar;
  soften split/recombine docstrings (lossless / semantically identical, not byte-for-byte).
- tests: add submit --help, submit package --help, submit register --help smoke tests.
---
 src/programbench/submission.py | 17 ++++++++++++-----
 src/programbench/verify.py     |  5 ++++-
 tests/test_cli.py              | 18 ++++++++++++++++++
 3 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/src/programbench/submission.py b/src/programbench/submission.py
index db6f4b7..18430b1 100644
--- a/src/programbench/submission.py
+++ b/src/programbench/submission.py
@@ -97,8 +97,9 @@ def split_eval_json(instance_dir: Path, iid: str) -> None:
     """Split ``<iid>.eval.json`` into a light eval.json + a heavy ``<iid>.eval.log.json``.
 
     The heavy file holds the only bulky parts — the top-level ``log`` and each failing
-    test's ``message``/``text`` — keyed so the two recombine into the exact original.
-    Nothing is dropped; the union of the two files is the original eval.json.
+    test's ``message``/``text`` — keyed so the two recombine losslessly. Nothing is dropped;
+    the union of the two files holds everything in the original eval.json (the rebuilt file
+    is semantically identical, though not necessarily byte-for-byte).
     """
     p = instance_dir / f"{iid}.eval.json"
     data = json.loads(p.read_text())
@@ -122,9 +123,11 @@ def split_eval_json(instance_dir: Path, iid: str) -> None:
 
 def recombine_eval_json(instance_dir: Path, iid: str) -> bool:
     """Inverse of :func:`split_eval_json`: fold the heavy file back into ``<iid>.eval.json``
-    (restoring the exact original), then remove the heavy file and its ``.url``/``.sha256``.
+    (restoring the full eval output losslessly), then remove the heavy file and its
+    ``.url``/``.sha256``.
 
-    The heavy file is read locally, or downloaded from ``<iid>.eval.log.json.url`` if hosted.
+    The heavy file is read locally, or downloaded from ``<iid>.eval.log.json.url`` if hosted;
+    a downloaded file is checked against its ``.sha256`` sidecar when one is present.
     Returns True if a recombine happened.
     """
     light = instance_dir / f"{iid}.eval.json"
@@ -136,7 +139,11 @@ def recombine_eval_json(instance_dir: Path, iid: str) -> bool:
         heavy = json.loads(log_file.read_text())
     elif url_file.exists():
         with urllib.request.urlopen(url_file.read_text().strip()) as r:  # noqa: S310
-            heavy = json.loads(r.read())
+            raw = r.read()
+        sha_file = instance_dir / f"{iid}.eval.log.json.sha256"
+        if sha_file.exists() and (got := hashlib.sha256(raw).hexdigest()) != sha_file.read_text().split()[0]:
+            raise ValueError(f"{iid}: eval.log.json sha256 mismatch on download (got {got[:12]}…)")
+        heavy = json.loads(raw)
     else:
         return False
     data = json.loads(light.read_text())
diff --git a/src/programbench/verify.py b/src/programbench/verify.py
index aa7bb7a..8cdda87 100644
--- a/src/programbench/verify.py
+++ b/src/programbench/verify.py
@@ -32,7 +32,10 @@
 
 log = logging.getLogger(__name__)
 
-TOLERANCE = 0.011  # headline floats are rounded; allow a hair more than the last digit
+# Tier-0 recomputes the headline from the same eval.json with the same deterministic rounding
+# `package` used, so a consistent submission matches exactly. Tier-1 per-instance scores are
+# compared with this tolerance too. It only absorbs float noise; real drift (>= the rounding granularity) fails.
+TOLERANCE = 1e-6
 
 
 @dataclass
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 594bd9d..904ec59 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -37,3 +37,21 @@ def test_blob_sync_help():
     result = runner.invoke(app, ["blob", "sync", "--help"])
     assert result.exit_code == 0
     assert "instance" in result.output.lower()
+
+
+def test_submit_help():
+    result = runner.invoke(app, ["submit", "--help"])
+    assert result.exit_code == 0
+    assert all(cmd in result.output for cmd in ("package", "verify", "register", "recombine"))
+
+
+def test_submit_package_help():
+    result = runner.invoke(app, ["submit", "package", "--help"])
+    assert result.exit_code == 0
+    assert "upload" in result.output.lower()
+
+
+def test_submit_register_help():
+    result = runner.invoke(app, ["submit", "register", "--help"])
+    assert result.exit_code == 0
+    assert "registry" in result.output.lower()

From 623d47eb77eb4db43c8ca2d666e483d2585e0b9d Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Wed, 17 Jun 2026 10:26:17 -0700
Subject: [PATCH 07/11] Add `programbench submit publish`: create + push a
 submission's GitHub repo

Middle step between package and register. With gh, creates the public repo and pushes
in one shot; without gh, pushes to a --remote you pre-created or prints the steps. Repo
name defaults to the submission id; register reads the URL back from the git remote, so
it is never stored in submission.yaml. Adds a --dry-run and a CLI smoke test.
---
 src/programbench/cli/submit.py |  60 ++++++++++++++++++++
 src/programbench/publish.py    | 101 +++++++++++++++++++++++++++++++++
 tests/test_cli.py              |   6 ++
 3 files changed, 167 insertions(+)
 create mode 100644 src/programbench/publish.py

diff --git a/src/programbench/cli/submit.py b/src/programbench/cli/submit.py
index c3bce27..5c7a4ab 100644
--- a/src/programbench/cli/submit.py
+++ b/src/programbench/cli/submit.py
@@ -6,6 +6,7 @@
 
 """Submission lifecycle commands: package an eval run, verify a submission, recombine eval.json."""
 
+import shutil
 from pathlib import Path
 
 import typer
@@ -110,6 +111,65 @@ def verify(
         raise typer.Exit(1)
 
 
+@app.command()
+def publish(
+    run_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."),
+    owner: str = typer.Option(
+        "", "--owner", help="GitHub org/user to create the repo under (default: your gh account)."
+    ),
+    repo: str = typer.Option("", "--repo", help="Repository name (default: the submission directory name)."),
+    private: bool = typer.Option(
+        False, "--private", help="Create the repo private (it must be public before you can register it)."
+    ),
+    remote: str = typer.Option(
+        "", "--remote", help="Push to this existing empty repo URL instead of creating one (the no-gh path)."
+    ),
+    dry_run: bool = typer.Option(
+        False, "--dry-run", help="Show what would be created/pushed; touch no network and make no commit."
+    ),
+) -> None:
+    """Create this submission's public GitHub repo and push it (package -> publish -> register).
+
+    Heavy artifacts already live on HuggingFace (as .url + .sha256 from `package`), so only
+    light files are committed. With `gh` the repo is created and pushed in one shot; without
+    it, pass `--remote <url>` to an empty repo you created, or follow the printed steps. The
+    repo name defaults to the directory name and the URL is read back by `register`, so it is
+    never stored in submission.yaml.
+
+    \b
+    Examples:
+        programbench submit publish ./my-run --dry-run
+        programbench submit publish ./my-run --owner my-org
+    """
+    from rich.console import Console
+
+    from programbench.publish import _origin, publish as do_publish
+
+    console = Console()
+    name = repo or run_dir.resolve().name
+
+    if dry_run:
+        existing = _origin(run_dir)
+        if existing:
+            plan = f"push current commit to existing remote [bold]{existing}[/bold]"
+        elif remote:
+            plan = f"add remote [bold]{remote}[/bold] and push"
+        elif shutil.which("gh"):
+            plan = f"`gh repo create` [bold]{f'{owner}/{name}' if owner else name}[/bold] ({'private' if private else 'public'}), set origin, and push"
+        else:
+            plan = f"commit locally only — no gh and no --remote, so the repo for [bold]{name}[/bold] can't be created"
+        console.print(f"[bold]Would publish[/bold] {run_dir}:\n  {plan}")
+        console.print("[dim]Dry run — no commit, nothing created or pushed. Drop --dry-run to publish.[/dim]")
+        return
+
+    result = do_publish(run_dir, owner=owner, repo=repo, private=private, remote=remote)
+    if result.repo_url:
+        console.print(f"[bold green]Published[/bold green] {name} -> {result.repo_url}")
+        console.print(f"[dim]Next: `programbench submit register {run_dir}` to register it on the leaderboard.[/dim]")
+    else:
+        console.print(f"[bold]Committed[/bold] {name} locally.\n{result.next_steps}")
+
+
 @app.command()
 def register(
     submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."),
diff --git a/src/programbench/publish.py b/src/programbench/publish.py
new file mode 100644
index 0000000..035173e
--- /dev/null
+++ b/src/programbench/publish.py
@@ -0,0 +1,101 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Create a submission's public GitHub repo and push it.
+
+The middle step between ``package`` and ``register``: it turns a packaged run directory
+into a public Git repo and pushes it. The heavy artifacts already live on HuggingFace (as
+``.url`` + ``.sha256`` written by ``package``), so only light files are committed. With
+``gh`` the repo is created and pushed in one shot; without ``gh`` it commits locally and
+either pushes to a ``--remote`` you pre-created, or prints the steps to finish by hand.
+
+The repo name defaults to the submission id. Its URL is never stored in ``submission.yaml``:
+``register`` reads it back from the git remote set here, keeping the manifest host-agnostic.
+"""
+
+import shutil
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+
+
+def _git(cwd: Path, *args: str) -> str:
+    return subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True).stdout.strip()
+
+
+def _to_https(url: str) -> str:
+    """A git remote (``git@host:owner/repo.git`` or ``https://…``) as a browsable https URL."""
+    url = url.removesuffix(".git")
+    if url.startswith("git@"):
+        host, path = url[4:].split(":", 1)
+        return f"https://{host}/{path}"
+    return url
+
+
+def _origin(run_dir: Path) -> str | None:
+    if not (run_dir / ".git").exists() or "origin" not in _git(run_dir, "remote").split():
+        return None
+    return _git(run_dir, "remote", "get-url", "origin")
+
+
+@dataclass
+class PublishResult:
+    repo_url: str | None  # the pushed repo (https), when known
+    committed: bool  # whether a new commit was made
+    next_steps: str | None  # manual steps when we could not finish (no gh, no --remote)
+
+
+def _ensure_committed(run_dir: Path) -> bool:
+    """Init the repo if needed and commit any pending changes; True if a commit was made."""
+    if not (run_dir / ".git").exists():
+        _git(run_dir, "init", "-b", "main")
+    _git(run_dir, "add", "-A")
+    if not _git(run_dir, "status", "--porcelain"):
+        return False
+    _git(run_dir, "commit", "-m", f"ProgramBench submission: {run_dir.resolve().name}")
+    return True
+
+
+def _gh_repo_url(slug: str, private: bool) -> str:
+    """The repo's URL, creating it (public unless ``private``) if it doesn't exist yet."""
+    view = ["gh", "repo", "view", slug, "--json", "url", "-q", ".url"]
+    if subprocess.run(view, capture_output=True, text=True).returncode != 0:
+        subprocess.run(
+            ["gh", "repo", "create", slug, "--private" if private else "--public"],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+    return subprocess.run(view, check=True, capture_output=True, text=True).stdout.strip()
+
+
+def publish(run_dir: Path, owner: str = "", repo: str = "", private: bool = False, remote: str = "") -> PublishResult:
+    name = repo or run_dir.resolve().name
+    committed = _ensure_committed(run_dir)
+
+    # Pick the target repo: an explicit --remote, an already-wired origin, or one created
+    # via gh. Without any of those we can only commit locally and hand back the steps.
+    target = remote or _origin(run_dir)
+    if not target:
+        if not shutil.which("gh"):
+            steps = (
+                "`gh` is not installed and no --remote was given, so the repo could not be created. "
+                f"The submission is committed locally in {run_dir}. To finish:\n"
+                f"  1. Create an empty PUBLIC repo (named '{name}') at https://github.com/new\n"
+                "  2. From the submission directory, wire it up and push:\n"
+                "       git remote add origin <its-url>\n"
+                "       git push -u origin HEAD:main\n"
+                "Then run `programbench submit register .` to register it on the leaderboard."
+            )
+            return PublishResult(None, committed, steps)
+        target = _gh_repo_url(f"{owner}/{name}" if owner else name, private)
+
+    # Push over HTTPS using gh's credentials: reliable everywhere (an SSH origin needs keys
+    # set up, and would fail in sandboxes that block port 22).
+    url = _to_https(target)
+    _git(run_dir, "remote", "set-url" if _origin(run_dir) else "add", "origin", url)
+    _git(run_dir, "push", "-u", "origin", "HEAD:main")
+    return PublishResult(url, committed, None)
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 904ec59..f23d7de 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -55,3 +55,9 @@ def test_submit_register_help():
     result = runner.invoke(app, ["submit", "register", "--help"])
     assert result.exit_code == 0
     assert "registry" in result.output.lower()
+
+
+def test_submit_publish_help():
+    result = runner.invoke(app, ["submit", "publish", "--help"])
+    assert result.exit_code == 0
+    assert "owner" in result.output.lower()

From 146b975bfd9d9a96e42e1bfd0cf797ded18c9ba1 Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Wed, 17 Jun 2026 10:47:57 -0700
Subject: [PATCH 08/11] Address second review round (submit group)

verify: _close treats non-numeric manifest values as a failed check (no crash); Tier-1
only resolves/downloads the --filter-matched subset, not every tarball; drop dead logger.
submission: reject non-http(s) URLs (SSRF/file:// guard) and add download timeouts for
recombine + resolve_submission_tar; drop dead logger.
package: accept submission.ref.yaml as a valid solution form (matches resolve_submission_tar).
register: fix `gh repo fork` (takes no dest arg -> run from clone.parent); add % to the PR
body mean score; git-identity fallback for commits in fresh containers.
publish: git-identity fallback for the commit.
docs/tests: correct CLI module docstring + manifest 'stats/'->'_stats/' comment; assert
publish in submit --help; add lossless split/recombine round-trip unit tests.
---
 src/programbench/cli/submit.py                |  3 +-
 .../data/templates/submission.yaml.j2         |  2 +-
 src/programbench/package.py                   |  7 +--
 src/programbench/publish.py                   | 10 +++-
 src/programbench/register.py                  | 21 ++++++--
 src/programbench/submission.py                | 22 ++++++--
 src/programbench/verify.py                    | 18 +++----
 tests/test_cli.py                             |  2 +-
 tests/test_submission.py                      | 53 +++++++++++++++++++
 9 files changed, 111 insertions(+), 27 deletions(-)
 create mode 100644 tests/test_submission.py

diff --git a/src/programbench/cli/submit.py b/src/programbench/cli/submit.py
index 5c7a4ab..2378f1c 100644
--- a/src/programbench/cli/submit.py
+++ b/src/programbench/cli/submit.py
@@ -4,7 +4,8 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Submission lifecycle commands: package an eval run, verify a submission, recombine eval.json."""
+"""Submission lifecycle commands: package an eval run, publish its repo, verify a submission,
+register it on the leaderboard, and recombine a split eval.json."""
 
 import shutil
 from pathlib import Path
diff --git a/src/programbench/data/templates/submission.yaml.j2 b/src/programbench/data/templates/submission.yaml.j2
index 358dead..eda220f 100644
--- a/src/programbench/data/templates/submission.yaml.j2
+++ b/src/programbench/data/templates/submission.yaml.j2
@@ -20,7 +20,7 @@ system:
 eval:
   programbench_version: {{ programbench_version | tojson }}   # [auto]
 
-headline:                # [auto] score summary from evaluation; other stats live in stats/
+headline:                # [auto] score summary from evaluation; other stats live in _stats/
   mean_score: {{ mean_score }}
   resolved_pct: {{ resolved_pct }}
   near_resolved_pct: {{ near_resolved_pct }}
diff --git a/src/programbench/package.py b/src/programbench/package.py
index a86bcea..c985e11 100644
--- a/src/programbench/package.py
+++ b/src/programbench/package.py
@@ -148,9 +148,10 @@ def package_run(run_dir: Path, upload_to: str | None = None, overwrite: bool = F
     for instance_dir in sorted(d for d in run_dir.iterdir() if d.is_dir()):
         iid = instance_dir.name
         eval_json = instance_dir / f"{iid}.eval.json"
-        has_solution = (instance_dir / "submission.tar.gz").exists() or (
-            instance_dir / "submission.tar.gz.url"
-        ).exists()
+        # Any artifact form resolve_submission_tar understands counts as a solution.
+        has_solution = any(
+            (instance_dir / f).exists() for f in ("submission.tar.gz", "submission.tar.gz.url", "submission.ref.yaml")
+        )
         if not (eval_json.exists() and has_solution):
             continue
         if iid not in instances:
diff --git a/src/programbench/publish.py b/src/programbench/publish.py
index 035173e..81b37e3 100644
--- a/src/programbench/publish.py
+++ b/src/programbench/publish.py
@@ -49,13 +49,19 @@ class PublishResult:
 
 
 def _ensure_committed(run_dir: Path) -> bool:
-    """Init the repo if needed and commit any pending changes; True if a commit was made."""
+    """Init the repo if needed and commit any pending changes; True if a commit was made.
+
+    Supplies a fallback git identity when none is configured (common in fresh CI containers,
+    where ``git commit`` would otherwise error out)."""
     if not (run_dir / ".git").exists():
         _git(run_dir, "init", "-b", "main")
     _git(run_dir, "add", "-A")
     if not _git(run_dir, "status", "--porcelain"):
         return False
-    _git(run_dir, "commit", "-m", f"ProgramBench submission: {run_dir.resolve().name}")
+    ident = []
+    if subprocess.run(["git", "config", "user.email"], cwd=run_dir, capture_output=True).returncode != 0:
+        ident = ["-c", "user.name=ProgramBench", "-c", "user.email=submissions@programbench.com"]
+    _git(run_dir, *ident, "commit", "-m", f"ProgramBench submission: {run_dir.resolve().name}")
     return True
 
 
diff --git a/src/programbench/register.py b/src/programbench/register.py
index da119f8..0f42525 100644
--- a/src/programbench/register.py
+++ b/src/programbench/register.py
@@ -35,6 +35,15 @@ def _git(cwd: Path, *args: str) -> str:
     return subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True).stdout.strip()
 
 
+def _commit(cwd: Path, message: str) -> None:
+    """Commit staged changes, supplying a fallback identity when git has none configured
+    (common in fresh CI containers, where ``git commit`` would otherwise error out)."""
+    ident = []
+    if subprocess.run(["git", "config", "user.email"], cwd=cwd, capture_output=True).returncode != 0:
+        ident = ["-c", "user.name=ProgramBench", "-c", "user.email=submissions@programbench.com"]
+    _git(cwd, *ident, "commit", "-m", message)
+
+
 def _to_https(url: str) -> str:
     """A git remote (``git@host:owner/repo.git`` or ``https://…``) as a browsable https URL."""
     url = url.removesuffix(".git")
@@ -85,7 +94,7 @@ def build_plan(
     system, head = manifest["system"], manifest["headline"]
     body = (
         f"Registers **{system['model']}** ({system['provider']}) + {system['agent']}.\n\n"
-        f"- mean score: {head['mean_score'] * 100:.1f}\n"
+        f"- mean score: {head['mean_score'] * 100:.1f}%\n"
         f"- resolved: {head['resolved_pct']:.1f}% / near-resolved: {head['near_resolved_pct']:.1f}%\n"
         f"- instances: {head['n_instances_attempted']}/{head['n_instances_total']}\n\n"
         f"Source: {source}\nCommit: `{commit}`\n\n"
@@ -124,9 +133,11 @@ def register_submission(
 
     if shutil.which("gh"):
         # Fork the registry under the authed user (no-op if it exists) and clone the fork;
-        # origin -> fork, upstream -> registry.
+        # origin -> fork, upstream -> registry. gh repo fork takes no destination arg, so it
+        # clones into <cwd>/<repo-name>; running from clone.parent makes that equal `clone`.
         subprocess.run(
-            ["gh", "repo", "fork", slug, "--clone", "--default-branch-only", str(clone)],
+            ["gh", "repo", "fork", slug, "--clone", "--default-branch-only"],
+            cwd=clone.parent,
             check=True,
             capture_output=True,
             text=True,
@@ -134,7 +145,7 @@ def register_submission(
         _git(clone, "checkout", "-b", plan.branch)
         write_entry(plan, submission_dir, clone)
         _git(clone, "add", f"submissions/{plan.submission_id}")
-        _git(clone, "commit", "-m", plan.title)
+        _commit(clone, plan.title)
         _git(clone, "push", "-u", "origin", plan.branch)
         pr_url = subprocess.run(
             ["gh", "pr", "create", "--repo", slug, "--title", plan.title, "--body", plan.body],
@@ -151,7 +162,7 @@ def register_submission(
     _git(clone, "checkout", "-b", plan.branch)
     write_entry(plan, submission_dir, clone)
     _git(clone, "add", f"submissions/{plan.submission_id}")
-    _git(clone, "commit", "-m", plan.title)
+    _commit(clone, plan.title)
     steps = (
         "`gh` not found, so the PR was not opened. The entry is committed on branch "
         f"`{plan.branch}` in:\n  {clone}\n\n"
diff --git a/src/programbench/submission.py b/src/programbench/submission.py
index 18430b1..396a9ec 100644
--- a/src/programbench/submission.py
+++ b/src/programbench/submission.py
@@ -12,11 +12,11 @@
 
 import hashlib
 import json
-import logging
 import shutil
 import subprocess
 import tarfile
 import tempfile
+import urllib.parse
 import urllib.request
 from dataclasses import asdict, dataclass
 from pathlib import Path
@@ -26,11 +26,19 @@
 from programbench.eval.eval import EvaluationResult
 from programbench.utils.load_data import get_active_branches, get_ignored_tests, load_all_instances
 
-log = logging.getLogger(__name__)
-
 RESOLVED_THRESHOLD = 1.0
 NEAR_RESOLVED_THRESHOLD = 0.95
 FIXTURE_PREFIX = "testorg__"
+DOWNLOAD_TIMEOUT = 60  # seconds; fail fast rather than hang on a stalled connection
+
+
+def _checked_url(raw: str) -> str:
+    """A submission-supplied URL, rejecting non-http(s) schemes (e.g. file://) to avoid SSRF
+    / local file reads when resolving untrusted third-party submissions."""
+    url = raw.strip()
+    if urllib.parse.urlparse(url).scheme not in ("http", "https"):
+        raise ValueError(f"refusing to fetch non-http(s) URL: {url!r}")
+    return url
 
 
 def benchmark_instances() -> dict[str, dict]:
@@ -138,7 +146,7 @@ def recombine_eval_json(instance_dir: Path, iid: str) -> bool:
     if log_file.exists():
         heavy = json.loads(log_file.read_text())
     elif url_file.exists():
-        with urllib.request.urlopen(url_file.read_text().strip()) as r:  # noqa: S310
+        with urllib.request.urlopen(_checked_url(url_file.read_text()), timeout=DOWNLOAD_TIMEOUT) as r:  # noqa: S310
             raw = r.read()
         sha_file = instance_dir / f"{iid}.eval.log.json.sha256"
         if sha_file.exists() and (got := hashlib.sha256(raw).hexdigest()) != sha_file.read_text().split()[0]:
@@ -206,7 +214,11 @@ def resolve_submission_tar(instance_dir: Path, dest_tar: Path) -> None:
     if inline.exists():
         shutil.copy2(inline, dest_tar)
     elif url_file.exists():
-        urllib.request.urlretrieve(url_file.read_text().strip(), dest_tar)  # noqa: S310
+        with (
+            urllib.request.urlopen(_checked_url(url_file.read_text()), timeout=DOWNLOAD_TIMEOUT) as r,  # noqa: S310
+            dest_tar.open("wb") as out,
+        ):
+            shutil.copyfileobj(r, out)
     elif ref_file.exists():
         _pack_git_ref(yaml.safe_load(ref_file.read_text()), dest_tar)
         expected = None  # git packing is not byte-reproducible; rely on re-eval instead
diff --git a/src/programbench/verify.py b/src/programbench/verify.py
index 8cdda87..bc2ebc5 100644
--- a/src/programbench/verify.py
+++ b/src/programbench/verify.py
@@ -15,7 +15,6 @@
 proves the artifacts actually yield the reported results.
 """
 
-import logging
 import re
 import tempfile
 from dataclasses import dataclass
@@ -30,8 +29,6 @@
     score_run,
 )
 
-log = logging.getLogger(__name__)
-
 # Tier-0 recomputes the headline from the same eval.json with the same deterministic
 # rounding `package` used, so a consistent submission matches exactly. The epsilon only
 # absorbs float representation noise; any real drift (>= the rounding granularity) fails.
@@ -57,9 +54,10 @@ def ok(self) -> bool:
 
 
 def _close(a: object, b: object) -> bool:
-    if a is None or b is None:
+    # Non-numeric (e.g. a user-edited/invalid manifest value) is a failed check, not a crash.
+    if not isinstance(a, (int, float)) or not isinstance(b, (int, float)):
         return False
-    return abs(float(a) - float(b)) <= TOLERANCE
+    return abs(a - b) <= TOLERANCE
 
 
 def _headline_checks(claimed: dict, computed: Headline) -> list[Check]:
@@ -83,18 +81,20 @@ def verify_tier1(submission_dir: Path, *, workers: int = 1, filter_spec: str = "
     sub_root = submission_dir
     submitted = score_run(sub_root, instances)
 
+    # Same regex semantics as the re-eval filter (instance_filters.filter_instances): only
+    # resolve/download and re-eval the targeted instances, not every submitted tarball.
+    targets = [iid for iid in submitted if not filter_spec or re.match(filter_spec, iid)]
+
     with tempfile.TemporaryDirectory() as tmp:
         run = Path(tmp)
-        for iid in submitted:
+        for iid in targets:
             (run / iid).mkdir(parents=True)
             resolve_submission_tar(sub_root / iid, run / iid / "submission.tar.gz")
         run_eval_batch(sources=[run], workers=workers, filter_spec=filter_spec, force=True)
         fresh = score_run(run, instances)
 
-    # Same regex semantics as the re-eval filter (instance_filters.filter_instances), so a
-    # filtered-in instance that produced no fresh score is reported as a failure (NaN), not
+    # A targeted instance that produced no fresh score is reported as a failure (NaN), not
     # silently skipped.
-    targets = [iid for iid in submitted if not filter_spec or re.match(filter_spec, iid)]
     checks = [
         Check(
             iid,
diff --git a/tests/test_cli.py b/tests/test_cli.py
index f23d7de..984085d 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -42,7 +42,7 @@ def test_blob_sync_help():
 def test_submit_help():
     result = runner.invoke(app, ["submit", "--help"])
     assert result.exit_code == 0
-    assert all(cmd in result.output for cmd in ("package", "verify", "register", "recombine"))
+    assert all(cmd in result.output for cmd in ("package", "publish", "verify", "register", "recombine"))
 
 
 def test_submit_package_help():
diff --git a/tests/test_submission.py b/tests/test_submission.py
new file mode 100644
index 0000000..9d1d5d4
--- /dev/null
+++ b/tests/test_submission.py
@@ -0,0 +1,53 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Tests for submission helpers that don't need the benchmark data."""
+
+import json
+
+from programbench.submission import recombine_eval_json, split_eval_json
+
+
+def test_split_recombine_roundtrip_is_lossless(tmp_path):
+    iid = "org__tool.abc1234"
+    d = tmp_path / iid
+    d.mkdir()
+    original = {
+        "instance_id": iid,
+        "log": [{"step": 1, "out": "x" * 200}, {"step": 2, "out": "y"}],
+        "test_results": [
+            {"branch": "main", "name": "t_pass", "is_resolved": True, "extra": {"duration": 0.5}},
+            {
+                "branch": "main",
+                "name": "t_fail",
+                "is_resolved": False,
+                "extra": {"message": "assertion failed", "text": "trace " * 50, "duration": 1.2},
+            },
+            {"branch": "feat", "name": "t_other", "is_resolved": False, "extra": {"text": "boom"}},
+        ],
+    }
+    eval_json = d / f"{iid}.eval.json"
+    eval_json.write_text(json.dumps(original, indent=2))
+
+    split_eval_json(d, iid)
+    light = json.loads(eval_json.read_text())
+    assert light["log"] == []
+    assert "message" not in light["test_results"][1]["extra"]
+    assert (d / f"{iid}.eval.log.json").exists()
+
+    assert recombine_eval_json(d, iid) is True
+    assert json.loads(eval_json.read_text()) == original
+    assert not (d / f"{iid}.eval.log.json").exists()
+
+
+def test_split_is_idempotent_and_noop_when_light(tmp_path):
+    iid = "org__tool.def5678"
+    d = tmp_path / iid
+    d.mkdir()
+    light = {"instance_id": iid, "log": [], "test_results": [{"branch": "main", "name": "t", "is_resolved": True}]}
+    (d / f"{iid}.eval.json").write_text(json.dumps(light))
+    split_eval_json(d, iid)
+    assert not (d / f"{iid}.eval.log.json").exists()  # nothing heavy -> no split file written

From 21d80e1baee5b4f4afb7a64d1580171ffa8618ae Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Wed, 17 Jun 2026 22:51:53 -0700
Subject: [PATCH 09/11] register: maintainer branch-PR path, HTTPS push, robust
 PR creation

- Push a branch straight to the registry when the user has push access (forks are often
  disabled on private/org repos); only fork when they can't push.
- Normalize the push remote to HTTPS (gh may wire ssh, which needs keys / is sandbox-blocked).
- Open the PR with an explicit --head (gh's inference was unreliable) and resolve the PR URL
  by querying the branch, raising a real error if creation produced none.
---
 src/programbench/register.py | 66 ++++++++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 11 deletions(-)

diff --git a/src/programbench/register.py b/src/programbench/register.py
index 0f42525..7e384a9 100644
--- a/src/programbench/register.py
+++ b/src/programbench/register.py
@@ -123,7 +123,8 @@ def register_submission(
 ) -> RegisterResult:
     """Clone the registry, commit the entry on a branch, and open the PR.
 
-    Uses ``gh`` (fork + PR) when available, cleaning up its throwaway clone afterward.
+    With ``gh``: maintainers (push access) get a branch + PR straight on the registry;
+    everyone else forks first (and a fork is only possible if the registry allows it).
     Without ``gh`` it leaves the commit on a branch in a kept clone and returns the manual
     push + compare-URL steps in ``next_steps`` (so the clone must outlive this call).
     """
@@ -132,28 +133,71 @@ def register_submission(
     clone = Path(tempfile.mkdtemp(prefix="programbench-register-")) / "submissions"
 
     if shutil.which("gh"):
-        # Fork the registry under the authed user (no-op if it exists) and clone the fork;
-        # origin -> fork, upstream -> registry. gh repo fork takes no destination arg, so it
-        # clones into <cwd>/<repo-name>; running from clone.parent makes that equal `clone`.
-        subprocess.run(
-            ["gh", "repo", "fork", slug, "--clone", "--default-branch-only"],
-            cwd=clone.parent,
-            check=True,
-            capture_output=True,
-            text=True,
+        # Maintainers push a branch straight to the registry; others fork (only works if the
+        # registry permits forks — org/private repos often disable them).
+        can_push = (
+            subprocess.run(
+                ["gh", "api", f"repos/{slug}", "--jq", ".permissions.push"], capture_output=True, text=True
+            ).stdout.strip()
+            == "true"
         )
+        if can_push:
+            _git(clone.parent, "clone", "--depth", "1", _to_https(registry), str(clone))
+            head = plan.branch
+        else:
+            # gh repo fork takes no destination arg, so it clones into <cwd>/<repo-name>;
+            # running from clone.parent makes that equal `clone`.
+            subprocess.run(
+                ["gh", "repo", "fork", slug, "--clone", "--default-branch-only"],
+                cwd=clone.parent,
+                check=True,
+                capture_output=True,
+                text=True,
+            )
+            login = subprocess.run(
+                ["gh", "api", "user", "--jq", ".login"], check=True, capture_output=True, text=True
+            ).stdout.strip()
+            head = f"{login}:{plan.branch}"
+        # Push over HTTPS: gh may wire an ssh remote, and ssh needs keys set up (and is blocked
+        # in some sandboxes), whereas gh's https credentials always work.
+        _git(clone, "remote", "set-url", "origin", _to_https(_git(clone, "remote", "get-url", "origin")))
         _git(clone, "checkout", "-b", plan.branch)
         write_entry(plan, submission_dir, clone)
         _git(clone, "add", f"submissions/{plan.submission_id}")
         _commit(clone, plan.title)
         _git(clone, "push", "-u", "origin", plan.branch)
+        # Open the PR (explicit --head; gh's inference is unreliable). The branch lookup is the
+        # source of truth: gh pr create can exit nonzero yet still create the PR, and a PR for
+        # the branch may already exist from a prior run.
+        created = subprocess.run(
+            ["gh", "pr", "create", "--repo", slug, "--head", head, "--title", plan.title, "--body", plan.body],
+            cwd=clone,
+            capture_output=True,
+            text=True,
+        )
         pr_url = subprocess.run(
-            ["gh", "pr", "create", "--repo", slug, "--title", plan.title, "--body", plan.body],
+            [
+                "gh",
+                "pr",
+                "list",
+                "--repo",
+                slug,
+                "--head",
+                plan.branch,
+                "--state",
+                "open",
+                "--json",
+                "url",
+                "--jq",
+                ".[0].url",
+            ],
             cwd=clone,
             check=True,
             capture_output=True,
             text=True,
         ).stdout.strip()
+        if not pr_url:
+            raise RuntimeError(f"gh pr create did not open a PR:\n{created.stderr or created.stdout}")
         shutil.rmtree(clone.parent)
         return RegisterResult(plan, pr_url, None)
 

From dd10df2f7ccb09960da96e9e31409d1a5aa1078d Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Thu, 18 Jun 2026 13:29:33 -0700
Subject: [PATCH 10/11] Stop storing scores in submission.yaml; verify
 score.json vs eval.json

Leaderboard scores are recomputed from _stats/score.json with the registry's ignore list,
so a cached headline in submission.yaml is redundant and goes stale on every ignore-list
change. Drop the headline block from the template + package. Re-point Tier-0 verify to
recompute per-test pass/fail from each eval.json and check it matches score.json (no headline
to compare). Make register re-runnable (force-push its branch) so a PR can be updated.
---
 src/programbench/cli/submit.py                | 32 +++++++-----
 .../data/templates/submission.yaml.j2         |  9 +---
 src/programbench/package.py                   |  5 --
 src/programbench/register.py                  |  3 +-
 src/programbench/verify.py                    | 51 +++++++++++--------
 5 files changed, 52 insertions(+), 48 deletions(-)

diff --git a/src/programbench/cli/submit.py b/src/programbench/cli/submit.py
index 2378f1c..4681a51 100644
--- a/src/programbench/cli/submit.py
+++ b/src/programbench/cli/submit.py
@@ -73,12 +73,12 @@ def verify(
         "", "--filter", help="Restrict Tier-1 re-eval to instance IDs matching this regex."
     ),
 ) -> None:
-    """Verify a submission against its own claimed results.
+    """Verify a submission against its own artifacts.
 
-    Tier 0 (default, no Docker) recomputes the headline from the submission's eval.json
-    files and checks it matches submission.yaml. Tier 1 (--tier1) additionally resolves
-    each submission.tar.gz and re-runs evaluation to confirm the artifacts reproduce the
-    reported scores.
+    Tier 0 (default, no Docker) recomputes each instance's per-test pass/fail from its
+    eval.json and checks it matches _stats/score.json. Tier 1 (--tier1) additionally
+    resolves each submission.tar.gz and re-runs evaluation to confirm the artifacts
+    reproduce the reported scores.
 
     \b
     Examples:
@@ -96,17 +96,21 @@ def verify(
         else verify_tier0(submission_dir)
     )
 
-    table = Table(title=f"Tier-{result.tier} verification", box=None)
-    table.add_column("Check", style="bold")
-    table.add_column("Claimed", justify="right")
-    table.add_column("Computed", justify="right")
-    table.add_column("", justify="center")
-    for c in result.checks:
-        table.add_row(c.name, str(c.claimed), str(c.computed), "✅" if c.ok else "❌")
     console = Console()
-    console.print(table)
+    fails = [c for c in result.checks if not c.ok]
+    console.print(
+        f"Tier-{result.tier}: [bold]{len(result.checks) - len(fails)}/{len(result.checks)}[/bold] checks consistent"
+    )
+    if fails:
+        table = Table(title="Discrepancies", box=None)
+        table.add_column("Instance", style="bold")
+        table.add_column("score.json", justify="right")
+        table.add_column("recomputed", justify="right")
+        for c in fails:
+            table.add_row(c.name, str(c.claimed), str(c.computed))
+        console.print(table)
     if result.ok:
-        console.print("[bold green]PASS[/bold green] — submission is consistent with its reported results.")
+        console.print("[bold green]PASS[/bold green] — submission is consistent with its artifacts.")
     else:
         console.print("[bold red]FAIL[/bold red] — discrepancies found above.")
         raise typer.Exit(1)
diff --git a/src/programbench/data/templates/submission.yaml.j2 b/src/programbench/data/templates/submission.yaml.j2
index eda220f..1539bb6 100644
--- a/src/programbench/data/templates/submission.yaml.j2
+++ b/src/programbench/data/templates/submission.yaml.j2
@@ -19,10 +19,5 @@ system:
 
 eval:
   programbench_version: {{ programbench_version | tojson }}   # [auto]
-
-headline:                # [auto] score summary from evaluation; other stats live in _stats/
-  mean_score: {{ mean_score }}
-  resolved_pct: {{ resolved_pct }}
-  near_resolved_pct: {{ near_resolved_pct }}
-  n_instances_attempted: {{ n_attempted }}
-  n_instances_total: {{ n_total }}
+# Scores are not stored here: the leaderboard recomputes them from _stats/score.json with the
+# registry's current ignored-tests list, so any cached numbers would just go stale.
diff --git a/src/programbench/package.py b/src/programbench/package.py
index c985e11..13dc7bf 100644
--- a/src/programbench/package.py
+++ b/src/programbench/package.py
@@ -187,11 +187,6 @@ def package_run(run_dir: Path, upload_to: str | None = None, overwrite: bool = F
             run_dir=run_dir,
             submission_id=run_dir.resolve().name,
             programbench_version=version("programbench"),
-            mean_score=headline.mean_score,
-            resolved_pct=headline.resolved_pct,
-            near_resolved_pct=headline.near_resolved_pct,
-            n_attempted=headline.n_instances_attempted,
-            n_total=headline.n_instances_total,
             **carried,
         )
         + "\n"
diff --git a/src/programbench/register.py b/src/programbench/register.py
index 7e384a9..e77fcc9 100644
--- a/src/programbench/register.py
+++ b/src/programbench/register.py
@@ -165,7 +165,8 @@ def register_submission(
         write_entry(plan, submission_dir, clone)
         _git(clone, "add", f"submissions/{plan.submission_id}")
         _commit(clone, plan.title)
-        _git(clone, "push", "-u", "origin", plan.branch)
+        # Force-push so re-running register updates an existing PR; the add-<id> branch is ours to overwrite.
+        _git(clone, "push", "-u", "--force", "origin", plan.branch)
         # Open the PR (explicit --head; gh's inference is unreliable). The branch lookup is the
         # source of truth: gh pr create can exit nonzero yet still create the PR, and a PR for
         # the branch may already exist from a prior run.
diff --git a/src/programbench/verify.py b/src/programbench/verify.py
index bc2ebc5..edb1334 100644
--- a/src/programbench/verify.py
+++ b/src/programbench/verify.py
@@ -4,35 +4,33 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Verify a packaged submission against its own claimed results.
+"""Verify a packaged submission against its own artifacts.
 
-Tier 0 (default, no Docker): recompute the headline from the submission's own eval.json
-files (with ignored-test filtering) and check it matches submission.yaml. This is the
-free consistency check a third party or CI can run with only ``programbench`` installed.
+Tier 0 (default, no Docker): recompute each instance's per-test pass/fail from its own
+eval.json and check that it matches the submitted _stats/score.json, i.e. that the reported
+scores faithfully reflect the eval output. This is a free check that a third party or CI can
+run with only ``programbench`` installed. (Leaderboard scores aren't stored in the
+submission, so there is no headline to check against.)
 
 Tier 1 (--tier1, Docker): resolve each submission.tar.gz, re-run ``programbench eval``,
 and confirm the freshly produced scores match the submitted eval.json. This is what
 proves the artifacts actually yield the reported results.
 """
 
+import json
 import re
 import tempfile
 from dataclasses import dataclass
 from pathlib import Path
 
 from programbench.submission import (
-    Headline,
-    aggregate,
     benchmark_instances,
-    load_manifest,
     resolve_submission_tar,
     score_run,
+    test_results_map,
 )
 
-# Tier-0 recomputes the headline from the same eval.json with the same deterministic
-# rounding `package` used, so a consistent submission matches exactly. The epsilon only
-# absorbs float representation noise; any real drift (>= the rounding granularity) fails.
-TOLERANCE = 1e-6
+TOLERANCE = 1e-6  # Tier-1 score floats are rounded; this only absorbs representation noise.
 
 
 @dataclass
@@ -60,18 +58,45 @@ def _close(a: object, b: object) -> bool:
     return abs(a - b) <= TOLERANCE
 
 
-def _headline_checks(claimed: dict, computed: Headline) -> list[Check]:
-    return [
-        Check(name, claimed.get(name), value, _close(claimed.get(name), value))
-        for name, value in computed.as_dict().items()
-    ]
-
-
 def verify_tier0(submission_dir: Path) -> VerifyResult:
-    manifest = load_manifest(submission_dir)
+    """Check that the submitted _stats/score.json agrees with the submission's own eval.json.
+
+    For each instance listed in score.json, recompute the per-test pass/fail map from
+    ``<iid>/<iid>.eval.json`` with ``test_results_map`` and compare it with the stored map,
+    so a passing check means the stored results reflect the eval output.
+
+    Args:
+        submission_dir: Root directory of the packaged submission.
+
+    Returns:
+        A tier-0 ``VerifyResult`` with one ``Check`` per instance in score.json. If score.json
+        is missing, unreadable, or not a JSON object, the result instead holds a single
+        failing check, so the problem is reported like any other discrepancy.
+    """
     instances = benchmark_instances()
-    computed = aggregate(score_run(submission_dir, instances), len(instances))
-    return VerifyResult(0, _headline_checks(manifest.get("headline", {}), computed))
+    score_path = submission_dir / "_stats" / "score.json"
+    try:
+        stored = json.loads(score_path.read_text())
+    except (OSError, ValueError) as exc:  # missing/unreadable file, or invalid JSON
+        problem = f"{type(exc).__name__}: {exc}"
+        return VerifyResult(0, [Check("_stats/score.json", "readable JSON", problem, False)])
+    if not isinstance(stored, dict):
+        found = f"JSON {type(stored).__name__}"
+        return VerifyResult(0, [Check("_stats/score.json", "object keyed by instance id", found, False)])
+    checks = []
+    for iid, stored_map in sorted(stored.items()):
+        if iid not in instances:
+            checks.append(Check(iid, "in score.json", "not a benchmark instance", False))
+            continue
+        claimed = f"{sum(stored_map.values())}/{len(stored_map)} pass"
+        eval_json = submission_dir / iid / f"{iid}.eval.json"
+        if not eval_json.exists():
+            checks.append(Check(iid, claimed, "no eval.json", False))
+            continue
+        recomputed = test_results_map(eval_json, instances[iid])
+        actual = f"{sum(recomputed.values())}/{len(recomputed)} pass"
+        checks.append(Check(iid, claimed, actual, recomputed == stored_map))
+    return VerifyResult(0, checks)
 
 
 def verify_tier1(submission_dir: Path, *, workers: int = 1, filter_spec: str = "") -> VerifyResult:

From f3cc0302420a64e5d33e520fad9efc1d236a6c62 Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Thu, 18 Jun 2026 13:33:37 -0700
Subject: [PATCH 11/11] register: build PR body without the headline block (use
 score.json count)

---
 src/programbench/register.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/programbench/register.py b/src/programbench/register.py
index e77fcc9..7494211 100644
--- a/src/programbench/register.py
+++ b/src/programbench/register.py
@@ -20,6 +20,7 @@
 prints the compare URL so you can open the PR by hand.
 """
 
+import json
 import shutil
 import subprocess
 import tempfile
@@ -91,14 +92,13 @@ def build_plan(
     files = ["pointer.yaml", "submission.yaml"] + [
         f"_stats/{p.name}" for p in sorted((submission_dir / "_stats").glob("*.json"))
     ]
-    system, head = manifest["system"], manifest["headline"]
+    system = manifest["system"]
+    n_attempted = len(json.loads((submission_dir / "_stats" / "score.json").read_text()))
     body = (
         f"Registers **{system['model']}** ({system['provider']}) + {system['agent']}.\n\n"
-        f"- mean score: {head['mean_score'] * 100:.1f}%\n"
-        f"- resolved: {head['resolved_pct']:.1f}% / near-resolved: {head['near_resolved_pct']:.1f}%\n"
-        f"- instances: {head['n_instances_attempted']}/{head['n_instances_total']}\n\n"
+        f"- instances attempted: {n_attempted}\n\n"
         f"Source: {source}\nCommit: `{commit}`\n\n"
-        "Tier-0 verified (`programbench submit verify .`)."
+        "Tier-0 verified (`programbench submit verify .`). Leaderboard scores are recomputed from `_stats/score.json`."
     )
     return RegisterPlan(
         sub_id, source, commit, registry, f"add-{sub_id}", pointer, files, f"Add submission: {sub_id}", body