diff --git a/src/programbench/cli/main.py b/src/programbench/cli/main.py index 6a36792..85f20b9 100644 --- a/src/programbench/cli/main.py +++ b/src/programbench/cli/main.py @@ -9,6 +9,7 @@ import typer from programbench.cli.blob import app as blob_app +from programbench.cli.submit import app as submit_app from programbench.constants import DOCKER_CPUS app = typer.Typer( @@ -18,6 +19,7 @@ context_settings={"help_option_names": ["-h", "--help"]}, ) app.add_typer(blob_app, name="blob") +app.add_typer(submit_app, name="submit") @app.callback() diff --git a/src/programbench/cli/submit.py b/src/programbench/cli/submit.py new file mode 100644 index 0000000..4681a51 --- /dev/null +++ b/src/programbench/cli/submit.py @@ -0,0 +1,268 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Submission lifecycle commands: package an eval run, publish its repo, verify a submission, +register it on the leaderboard, and recombine a split eval.json.""" + +import shutil +from pathlib import Path + +import typer + +app = typer.Typer(no_args_is_help=True, help="Prepare, check, and reassemble leaderboard submissions.") + + +@app.command() +def package( + run_dir: Path = typer.Argument( + ..., help="A `programbench eval` run directory (//submission.tar.gz)." + ), + upload_to: str = typer.Option( + "", + "--upload-to", + metavar="ORG[/DATASET]", + help="Upload submission.tar.gz and the heavy eval.log.json to a HuggingFace dataset, " + "replacing each with a .url + .sha256. A bare org (e.g. 'programbench') creates a " + "per-submission dataset org/; pass 'org/name' to use an exact dataset.", + ), + overwrite: bool = typer.Option( + False, "--overwrite", help="With --upload-to, re-upload files already present on HF (default: skip them)." + ), +) -> None: + """Turn an evaluated run directory into a leaderboard submission, in place. 
+ + Writes a submission.yaml manifest and _stats/score.json, and splits each large + eval.json into a light eval.json (kept) + a heavy .eval.log.json (raw log + + failure text) so the repo stays git-pushable. With --upload-to, the heavy files and + the submission.tar.gz artifacts are uploaded to HuggingFace. System metadata and + trajectories are left as TODO. + + \b + Examples: + programbench submit package output/my-run + programbench submit package output/my-run --upload-to programbench + """ + from rich.console import Console + + from programbench.package import package_run + + result = package_run(run_dir, upload_to=upload_to or None, overwrite=overwrite) + console = Console() + console.print( + f"Packaged [bold]{len(result.packaged)}[/bold] instance(s) in [bold]{result.run_dir}[/bold] " + f"(skipped {len(result.skipped)} unknown). " + f"mean_score={result.headline.mean_score * 100:.1f} resolved={result.headline.resolved_pct:.1f}%" + ) + console.print( + "[dim]Each eval.json was split into eval.json + .eval.log.json (recombine with " + "`programbench submit recombine`). Next: fill in submission.yaml + add traj.json files.[/dim]" + ) + + +@app.command() +def verify( + submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."), + tier1: bool = typer.Option( + False, "--tier1", help="Also re-run `programbench eval` and check artifacts reproduce the results (Docker)." + ), + workers: int = typer.Option(1, "-w", "--workers", help="Instance workers for the Tier-1 re-eval."), + filter_spec: str = typer.Option( + "", "--filter", help="Restrict Tier-1 re-eval to instance IDs matching this regex." + ), +) -> None: + """Verify a submission against its own artifacts. + + Tier 0 (default, no Docker) recomputes each instance's per-test pass/fail from its + eval.json and checks it matches _stats/score.json. 
Tier 1 (--tier1) additionally + resolves each submission.tar.gz and re-runs evaluation to confirm the artifacts + reproduce the reported scores. + + \b + Examples: + programbench submit verify ./their-submission + programbench submit verify ./their-submission --tier1 -w 4 + """ + from rich.console import Console + from rich.table import Table + + from programbench.verify import verify_tier0, verify_tier1 + + result = ( + verify_tier1(submission_dir, workers=workers, filter_spec=filter_spec) + if tier1 + else verify_tier0(submission_dir) + ) + + console = Console() + fails = [c for c in result.checks if not c.ok] + console.print( + f"Tier-{result.tier}: [bold]{len(result.checks) - len(fails)}/{len(result.checks)}[/bold] checks consistent" + ) + if fails: + table = Table(title="Discrepancies", box=None) + table.add_column("Instance", style="bold") + table.add_column("score.json", justify="right") + table.add_column("recomputed", justify="right") + for c in fails: + table.add_row(c.name, str(c.claimed), str(c.computed)) + console.print(table) + if result.ok: + console.print("[bold green]PASS[/bold green] — submission is consistent with its artifacts.") + else: + console.print("[bold red]FAIL[/bold red] — discrepancies found above.") + raise typer.Exit(1) + + +@app.command() +def publish( + run_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."), + owner: str = typer.Option( + "", "--owner", help="GitHub org/user to create the repo under (default: your gh account)." + ), + repo: str = typer.Option("", "--repo", help="Repository name (default: the submission directory name)."), + private: bool = typer.Option( + False, "--private", help="Create the repo private (it must be public before you can register it)." + ), + remote: str = typer.Option( + "", "--remote", help="Push to this existing empty repo URL instead of creating one (the no-gh path)." 
+ ), + dry_run: bool = typer.Option( + False, "--dry-run", help="Show what would be created/pushed; touch no network and make no commit." + ), +) -> None: + """Create this submission's public GitHub repo and push it (package -> publish -> register). + + Heavy artifacts already live on HuggingFace (as .url + .sha256 from `package`), so only + light files are committed. With `gh` the repo is created and pushed in one shot; without + it, pass `--remote ` to an empty repo you created, or follow the printed steps. The + repo name defaults to the directory name and the URL is read back by `register`, so it is + never stored in submission.yaml. + + \b + Examples: + programbench submit publish ./my-run --dry-run + programbench submit publish ./my-run --owner my-org + """ + from rich.console import Console + + from programbench.publish import _origin, publish as do_publish + + console = Console() + name = repo or run_dir.resolve().name + + if dry_run: + existing = _origin(run_dir) + if existing: + plan = f"push current commit to existing remote [bold]{existing}[/bold]" + elif remote: + plan = f"add remote [bold]{remote}[/bold] and push" + elif shutil.which("gh"): + plan = f"`gh repo create` [bold]{f'{owner}/{name}' if owner else name}[/bold] ({'private' if private else 'public'}), set origin, and push" + else: + plan = f"commit locally only — no gh and no --remote, so the repo for [bold]{name}[/bold] can't be created" + console.print(f"[bold]Would publish[/bold] {run_dir}:\n {plan}") + console.print("[dim]Dry run — no commit, nothing created or pushed. 
Drop --dry-run to publish.[/dim]") + return + + result = do_publish(run_dir, owner=owner, repo=repo, private=private, remote=remote) + if result.repo_url: + console.print(f"[bold green]Published[/bold green] {name} -> {result.repo_url}") + console.print("[dim]Next: `programbench submit register .` to register it on the leaderboard.[/dim]") + else: + console.print(f"[bold]Committed[/bold] {name} locally.\n{result.next_steps}") + + +@app.command() +def register( + submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."), + registry: str = typer.Option( + "", "--registry", help="Registry repo to PR against (default: ProgramBench/submissions)." + ), + source: str = typer.Option( + "", "--source", help="Public URL of this submission's repo (default: autodetected from its git remote)." + ), + commit: str = typer.Option( + "", "--commit", help="Commit SHA that was scored (default: autodetected from its git HEAD)." + ), + dry_run: bool = typer.Option( + False, "--dry-run", help="Build the registry entry locally and print the plan; touch no network." + ), + verify: bool = typer.Option( + True, "--verify/--no-verify", help="Run a Tier-0 verify gate before registering (default: on)." + ), +) -> None: + """Register a packaged submission on the leaderboard by opening a PR to the registry. + + The PR adds a small submissions// entry: a pointer.yaml (the submission repo URL + + the exact commit scored) plus the submission.yaml and _stats/ copied from this run. The + source URL and commit are read from the run directory's own git remote/HEAD. With `gh` + installed the registry is forked and the PR opened for you; otherwise the entry is left + committed on a branch and the steps to push + open the PR are printed. 
+ + \b + Examples: + programbench submit register ./my-run --dry-run + programbench submit register ./my-run + """ + import tempfile + + from rich.console import Console + + from programbench.register import REGISTRY_DEFAULT, build_plan, register_submission, write_entry + + console = Console() + registry = registry or REGISTRY_DEFAULT + + if verify: + from programbench.verify import verify_tier0 + + if not verify_tier0(submission_dir).ok: + console.print( + "[bold red]FAIL[/bold red] — Tier-0 verification failed; fix the submission (or pass " + "--no-verify) before registering. Run `programbench submit verify .` to see the mismatch." + ) + raise typer.Exit(1) + + if dry_run: + plan = build_plan(submission_dir, registry, source or None, commit or None) + with tempfile.TemporaryDirectory() as tmp: + entry = write_entry(plan, submission_dir, Path(tmp)) + files = sorted(str(p.relative_to(entry)) for p in entry.rglob("*") if p.is_file()) + console.print(f"[bold]Would register[/bold] [cyan]{plan.submission_id}[/cyan] to {plan.registry}") + console.print(f" branch: {plan.branch}") + console.print(f" source: {plan.source}\n commit: {plan.commit}") + console.print(" files: " + ", ".join(f"submissions/{plan.submission_id}/{f}" for f in files)) + console.print(f"\n[dim]pointer.yaml:[/dim]\n{plan.pointer.rstrip()}") + console.print(f"\n[dim]PR title:[/dim] {plan.title}\n[dim]PR body:[/dim]\n{plan.body}") + console.print("\n[dim]Dry run — nothing cloned, pushed, or opened. 
Drop --dry-run to register.[/dim]") + return + + result = register_submission(submission_dir, registry, source or None, commit or None) + if result.pr_url: + console.print(f"[bold green]Opened PR[/bold green] for {result.plan.submission_id}: {result.pr_url}") + else: + console.print(f"[bold]Prepared[/bold] registry entry for {result.plan.submission_id}.\n{result.next_steps}") + + +@app.command() +def recombine( + run_dir: Path = typer.Argument(..., help="A packaged run/submission directory."), +) -> None: + """Reverse `package`'s eval split: fold each .eval.log.json back into its + eval.json, restoring the original full eval output. + + The heavy file is read locally, or downloaded from its .url if it was uploaded to HF. + + \b + Examples: + programbench submit recombine ./their-submission + """ + from rich.console import Console + + from programbench.submission import recombine_eval_json + + n = sum(recombine_eval_json(d, d.name) for d in sorted(p for p in run_dir.iterdir() if p.is_dir())) + Console().print(f"Recombined [bold]{n}[/bold] eval.json file(s) in {run_dir}") diff --git a/src/programbench/data/templates/README.md.j2 b/src/programbench/data/templates/README.md.j2 new file mode 100644 index 0000000..9e6d1bb --- /dev/null +++ b/src/programbench/data/templates/README.md.j2 @@ -0,0 +1,78 @@ +

+ ProgramBench +

+ +> A submission to the **[ProgramBench](https://programbench.com)** leaderboard — *can language models rebuild programs from scratch?* · [Leaderboard](https://programbench.com) · [How to submit](https://programbench.com/blog/submission-guide) + +# [Submission Name Here] + + + +## System overview + + + +## Reproducing this run + + + +```bash +# 1. install the agent / dependencies +# 2. run inference per task (no internet, per the eval protocol) +# 3. programbench eval <run-dir> +# 4. programbench submit package <run-dir> --upload-to <org> +``` + +## Extra stats (optional) + +The leaderboard can show stats beyond `score` — e.g. cost or model calls. These are +**optional**, and each must be **computed by a script that reads your trajectories**, not +entered by hand: the number has to be recoverable from the run. `programbench` ships no +calculators (it makes no assumptions about your scaffold) — write your own that reads each +`traj.json` and emits a flat `{instance_id: value}` map to `_stats/<stat>.json`, and ship +the script here (e.g. under `_scripts/`) so the numbers are reproducible. 
+ +## Links + + + +## Submission checklist + +- [ ] Ran `programbench eval` → `programbench submit package` to produce this submission +- [ ] Filled in every `submission.yaml` field (no `TODO` left), including `is_os_model` / `is_os_scaffold` +- [ ] Trajectories (`traj.json`) included for every task (agent submissions) +- [ ] Solutions present — inline `submission.tar.gz`, or a hosted `submission.tar.gz.url` + `.sha256` +- [ ] Any extra stats (cost/calls) were produced by a trajectory-reading script shipped here, not hand-written +- [ ] Filled in the System overview and Reproducing sections above +- [ ] `programbench submit verify .` passes +- [ ] Made this fork public +- [ ] Opened a registration PR to the submissions repo + +## Integrity attestations + +- [ ] Solutions were produced **only** from behavioral observation of the binary and its + bundled docs — no source code, repositories, mirrors, or package registries were consulted +- [ ] The model was not given internet access during evaluation +- [ ] The model did not have access to any unit tests during evaluation +- [ ] I consent to re-evaluation, and to flagging or removal if it contradicts the reported results + +## Auditing + +Anyone can independently check this submission with the following instructions: + +```bash +git clone <repo-url> +cd {{ submission_id }} +uvx programbench submit verify . # Tier-0: recompute the score from this repo's eval.json and check it matches _stats/score.json (instant, offline) +uvx programbench submit verify . --tier1 # Tier-1: download each submission.tar.gz from HuggingFace, re-run evaluation, and confirm it reproduces the score (Docker) +``` + +* Tier-0 is self-contained. It reads the per-instance `eval.json` here plus the bundled test +metadata. +* Tier-1 additionally fetches the hosted solutions and the hidden tests and re-runs +them, so the reported `score` is reproduced from scratch. (Cost/calls are self-reported from +the trajectories; only `score` is independently re-verifiable.) 
diff --git a/src/programbench/data/templates/submission.yaml.j2 b/src/programbench/data/templates/submission.yaml.j2 new file mode 100644 index 0000000..1539bb6 --- /dev/null +++ b/src/programbench/data/templates/submission.yaml.j2 @@ -0,0 +1,23 @@ +# Generated by `programbench submit package` from: {{ run_dir }} +# [auto] fields are recomputed on every `programbench submit package`; all other fields are preserved. +schema_version: 1 + +submission_id: {{ submission_id | tojson }} +submitter: + name: {{ submitter_name | tojson }} + contact: {{ submitter_contact | tojson }} # email or @github + affiliation: {{ affiliation | tojson }} + +system: + agent: {{ agent | tojson }} # scaffold/harness; "none" for a pure human submission + description_url: {{ description_url | tojson }} + is_os_model: {{ is_os_model | tojson }} # true if the model's weights are openly available + is_os_scaffold: {{ is_os_scaffold | tojson }} # true if the agent/scaffold is open source + model: {{ model | tojson }} # display name used on the leaderboard + provider: {{ provider | tojson }} + type: {{ system_type | tojson }} # single-agent | multi-agent | other + +eval: + programbench_version: {{ programbench_version | tojson }} # [auto] +# Scores are not stored here: the leaderboard recomputes them from _stats/score.json with the +# registry's current ignored-tests list, so any cached numbers would just go stale. diff --git a/src/programbench/package.py b/src/programbench/package.py new file mode 100644 index 0000000..13dc7bf --- /dev/null +++ b/src/programbench/package.py @@ -0,0 +1,212 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Turn a ``programbench eval`` run directory into a leaderboard submission, in place. + +Packaging is purely eval-derived. 
It writes: + +- ``_stats/score.json`` — per-instance, per-test pass/fail (the one stat from evaluation), +- ``submission.yaml`` — the manifest, with ``[auto]`` fields recomputed and any + author-entered fields preserved across re-packaging, + +and splits each ``.eval.json`` into a light eval.json + a heavy ``.eval.log.json`` +(the raw log + failure text) so the run repo stays git-pushable; the two recombine to the +original via ``programbench submit recombine``. With ``--upload-to`` the heavy files and the +``submission.tar.gz`` artifacts go to a HuggingFace dataset (replaced by ``.url`` + ``.sha256``). + +Other stats (cost, calls, …) are optional and come from the agent trajectories via scripts +the submitter writes — this command produces none of them, and makes no assumptions about +the scaffold. The run directory stays a valid input to ``programbench eval``. +""" + +import logging +import os +import shutil +import tempfile +from dataclasses import dataclass +from importlib.metadata import version +from pathlib import Path + +import yaml +from jinja2 import Environment, PackageLoader + +from programbench.submission import ( + Headline, + aggregate, + benchmark_instances, + score_from_tests, + sha256_file, + split_eval_json, + test_results_map, + write_stat, +) + +log = logging.getLogger(__name__) + +TODO = "TODO" + +# Author-entered manifest fields preserved across re-packaging: template var -> (path, default). 
+_CARRIED = { + "affiliation": ("submitter.affiliation", ""), + "agent": ("system.agent", TODO), + "description_url": ("system.description_url", "README.md"), + "is_os_model": ("system.is_os_model", False), + "is_os_scaffold": ("system.is_os_scaffold", False), + "model": ("system.model", TODO), + "provider": ("system.provider", TODO), + "submitter_contact": ("submitter.contact", TODO), + "submitter_name": ("submitter.name", TODO), + "system_type": ("system.type", "single-agent"), +} + + +@dataclass +class PackageResult: + run_dir: Path + packaged: list[str] + skipped: list[str] + headline: Headline + + +def _dig(d: dict, dotted: str): + for key in dotted.split("."): + if not isinstance(d, dict): + return None + d = d.get(key) + return d + + +def _carried_values(run_dir: Path) -> dict: + manifest_path = run_dir / "submission.yaml" + existing = yaml.safe_load(manifest_path.read_text()) if manifest_path.exists() else {} + # Use "is None" (not "or") so a real False/empty value is preserved, not clobbered. + return { + var: (default if (val := _dig(existing, path)) is None else val) for var, (path, default) in _CARRIED.items() + } + + +def _upload_artifacts( + api, dataset: str, pending: list[tuple[Path, str, str]], existing: set[str], overwrite: bool +) -> None: + """Upload all pending files to HF, then replace each with a .url + .sha256 and delete it. + + ``pending`` is (instance_dir, instance_id, filename) — submission.tar.gz and the heavy + .eval.log.json. Files already on HF are skipped unless ``overwrite``. Uses + ``upload_large_folder`` (resumable, multi-commit, retrying) since logs can be hundreds + of MB and a single big commit is fragile; files are hard-linked into a staging tree so + nothing is copied. 
+ """ + for instance_dir, iid, fname in pending: + (instance_dir / f"{fname}.sha256").write_text(sha256_file(instance_dir / fname) + "\n") + to_upload = [(d, iid, f) for d, iid, f in pending if overwrite or f"{iid}/{f}" not in existing] + if to_upload: + run_dir = pending[0][0].parent + with tempfile.TemporaryDirectory(dir=run_dir) as tmp: + staging = Path(tmp) + for instance_dir, iid, fname in to_upload: + dst = staging / iid / fname + dst.parent.mkdir(parents=True, exist_ok=True) + try: + os.link(instance_dir / fname, dst) # same-fs hardlink: no copy + except OSError: + shutil.copy2(instance_dir / fname, dst) + log.info("Uploading %d file(s) to %s (resumable)", len(to_upload), dataset) + api.upload_large_folder(repo_id=dataset, folder_path=str(staging), repo_type="dataset") + for instance_dir, iid, fname in pending: + (instance_dir / f"{fname}.url").write_text( + f"https://huggingface.co/datasets/{dataset}/resolve/main/{iid}/{fname}\n" + ) + (instance_dir / fname).unlink() + + +def package_run(run_dir: Path, upload_to: str | None = None, overwrite: bool = False) -> PackageResult: + instances = benchmark_instances() + run_name = run_dir.resolve().name + + api = dataset = None + existing: set[str] = set() + if upload_to: + # Each submission gets its own dataset: bare "org" -> "org/"; + # an explicit "org/name" is used as-is. + dataset = upload_to if "/" in upload_to else f"{upload_to}/{run_name}" + from huggingface_hub import HfApi + + api = HfApi() + api.create_repo(dataset, repo_type="dataset", exist_ok=True) + # Force public so `verify`/`recombine` can fetch the artifacts anonymously + # (orgs may default new datasets to private). 
+ api.update_repo_settings(dataset, repo_type="dataset", private=False) + existing = set(api.list_repo_files(dataset, repo_type="dataset")) + + test_maps: dict[str, dict[str, bool]] = {} + packaged: list[str] = [] + skipped: list[str] = [] + pending: list[tuple[Path, str, str]] = [] + for instance_dir in sorted(d for d in run_dir.iterdir() if d.is_dir()): + iid = instance_dir.name + eval_json = instance_dir / f"{iid}.eval.json" + # Any artifact form resolve_submission_tar understands counts as a solution. + has_solution = any( + (instance_dir / f).exists() for f in ("submission.tar.gz", "submission.tar.gz.url", "submission.ref.yaml") + ) + if not (eval_json.exists() and has_solution): + continue + if iid not in instances: + log.warning("Skipping %s (not a known ProgramBench instance)", iid) + skipped.append(iid) + continue + test_maps[iid] = test_results_map(eval_json, instances[iid]) + # Split the (potentially huge) eval.json into a light eval.json + a heavy + # .eval.log.json (log + failure text); they recombine to the original. + split_eval_json(instance_dir, iid) + if api: + for fname in (f"{iid}.eval.log.json", "submission.tar.gz"): + if (instance_dir / fname).exists(): + pending.append((instance_dir, iid, fname)) + packaged.append(iid) + + if not packaged: + raise ValueError(f"No packageable instances found under {run_dir}") + + # Write the scoring-derived artifacts first; they don't depend on the upload, so a + # failed/throttled upload leaves them correct and the run simply resumable. + # score.json is per-test ({iid: {test: passed}}) so scores can be recomputed later + # while striking out specific tests; the manifest headline is the score with no + # tests struck. 
+ write_stat(run_dir, "score", test_maps) + scores = {iid: score_from_tests(m) for iid, m in test_maps.items()} + headline = aggregate(scores, len(instances)) + + carried = _carried_values(run_dir) + env = Environment(loader=PackageLoader("programbench", "data/templates"), autoescape=False) + (run_dir / "submission.yaml").write_text( + env.get_template("submission.yaml.j2").render( + run_dir=run_dir, + submission_id=run_dir.resolve().name, + programbench_version=version("programbench"), + **carried, + ) + + "\n" + ) + + # README is created once (a starting point for the author); never overwritten. + readme = run_dir / "README.md" + if not readme.exists(): + readme.write_text( + env.get_template("README.md.j2").render( + submission_id=run_dir.resolve().name, + mean_pct=round(headline.mean_score * 100, 1), + resolved_pct=headline.resolved_pct, + n_attempted=headline.n_instances_attempted, + n_total=headline.n_instances_total, + **carried, + ) + ) + + if api and pending: + _upload_artifacts(api, dataset, pending, existing, overwrite) + + return PackageResult(run_dir, packaged, skipped, headline) diff --git a/src/programbench/publish.py b/src/programbench/publish.py new file mode 100644 index 0000000..81b37e3 --- /dev/null +++ b/src/programbench/publish.py @@ -0,0 +1,107 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Create a submission's public GitHub repo and push it. + +The middle step between ``package`` and ``register``: it turns a packaged run directory +into a public Git repo and pushes it. The heavy artifacts already live on HuggingFace (as +``.url`` + ``.sha256`` written by ``package``), so only light files are committed. 
With +``gh`` the repo is created and pushed in one shot; without ``gh`` it commits locally and +either pushes to a ``--remote`` you pre-created, or prints the steps to finish by hand. + +The repo URL is never stored in ``submission.yaml`` — it defaults to the submission id and +``register`` reads it back from the git remote this sets, keeping the manifest host-agnostic. +""" + +import shutil +import subprocess +from dataclasses import dataclass +from pathlib import Path + + +def _git(cwd: Path, *args: str) -> str: + return subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True).stdout.strip() + + +def _to_https(url: str) -> str: + """A git remote (``git@host:owner/repo.git`` or ``https://…``) as a browsable https URL.""" + url = url.removesuffix(".git") + if url.startswith("git@"): + host, path = url[4:].split(":", 1) + return f"https://{host}/{path}" + return url + + +def _origin(run_dir: Path) -> str | None: + if not (run_dir / ".git").exists() or "origin" not in _git(run_dir, "remote").split(): + return None + return _git(run_dir, "remote", "get-url", "origin") + + +@dataclass +class PublishResult: + repo_url: str | None # the pushed repo (https), when known + committed: bool # whether a new commit was made + next_steps: str | None # manual steps when we could not finish (no gh, no --remote) + + +def _ensure_committed(run_dir: Path) -> bool: + """Init the repo if needed and commit any pending changes; True if a commit was made. 
+ + Supplies a fallback git identity when none is configured (common in fresh CI containers, + where ``git commit`` would otherwise error out).""" + if not (run_dir / ".git").exists(): + _git(run_dir, "init", "-b", "main") + _git(run_dir, "add", "-A") + if not _git(run_dir, "status", "--porcelain"): + return False + ident = [] + if subprocess.run(["git", "config", "user.email"], cwd=run_dir, capture_output=True).returncode != 0: + ident = ["-c", "user.name=ProgramBench", "-c", "user.email=submissions@programbench.com"] + _git(run_dir, *ident, "commit", "-m", f"ProgramBench submission: {run_dir.resolve().name}") + return True + + +def _gh_repo_url(slug: str, private: bool) -> str: + """The repo's URL, creating it (public unless ``private``) if it doesn't exist yet.""" + view = ["gh", "repo", "view", slug, "--json", "url", "-q", ".url"] + if subprocess.run(view, capture_output=True, text=True).returncode != 0: + subprocess.run( + ["gh", "repo", "create", slug, "--private" if private else "--public"], + check=True, + capture_output=True, + text=True, + ) + return subprocess.run(view, check=True, capture_output=True, text=True).stdout.strip() + + +def publish(run_dir: Path, owner: str = "", repo: str = "", private: bool = False, remote: str = "") -> PublishResult: + name = repo or run_dir.resolve().name + committed = _ensure_committed(run_dir) + + # Pick the target repo: an explicit --remote, an already-wired origin, or one created + # via gh. Without any of those we can only commit locally and hand back the steps. + target = remote or _origin(run_dir) + if not target: + if not shutil.which("gh"): + steps = ( + "`gh` is not installed and no --remote was given, so the repo could not be created. " + f"The submission is committed locally in {run_dir}. To finish:\n" + f" 1. Create an empty PUBLIC repo (named '{name}') at https://github.com/new\n" + " 2. 
From the submission directory, wire it up and push:\n" + " git remote add origin \n" + " git push -u origin HEAD:main\n" + "Then run `programbench submit register .` to register it on the leaderboard." + ) + return PublishResult(None, committed, steps) + target = _gh_repo_url(f"{owner}/{name}" if owner else name, private) + + # Push over HTTPS using gh's credentials: reliable everywhere (an SSH origin needs keys + # set up, and would fail in sandboxes that block port 22). + url = _to_https(target) + _git(run_dir, "remote", "set-url" if _origin(run_dir) else "add", "origin", url) + _git(run_dir, "push", "-u", "origin", "HEAD:main") + return PublishResult(url, committed, None) diff --git a/src/programbench/register.py b/src/programbench/register.py new file mode 100644 index 0000000..7494211 --- /dev/null +++ b/src/programbench/register.py @@ -0,0 +1,219 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Register a packaged submission into the leaderboard registry by opening a PR. + +A registry entry is small and self-contained: a pointer to the submission's own public +repo, plus the manifest and stat files copied out of it. + + submissions// + pointer.yaml # source repo URL + the exact commit that was scored + submission.yaml # copied from the submission + _stats/*.json # copied from the submission + +This builds that entry against a clone of the registry (default +github.com/ProgramBench/submissions) and opens the PR. With ``gh`` it forks the registry +and opens the PR for you; without it, it leaves the commit on a branch in a clone and +prints the compare URL so you can open the PR by hand. 
+""" + +import json +import shutil +import subprocess +import tempfile +from dataclasses import dataclass +from pathlib import Path + +import yaml + +REGISTRY_DEFAULT = "https://github.com/ProgramBench/submissions" + + +def _git(cwd: Path, *args: str) -> str: + return subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True).stdout.strip() + + +def _commit(cwd: Path, message: str) -> None: + """Commit staged changes, supplying a fallback identity when git has none configured + (common in fresh CI containers, where ``git commit`` would otherwise error out).""" + ident = [] + if subprocess.run(["git", "config", "user.email"], cwd=cwd, capture_output=True).returncode != 0: + ident = ["-c", "user.name=ProgramBench", "-c", "user.email=submissions@programbench.com"] + _git(cwd, *ident, "commit", "-m", message) + + +def _to_https(url: str) -> str: + """A git remote (``git@host:owner/repo.git`` or ``https://…``) as a browsable https URL.""" + url = url.removesuffix(".git") + if url.startswith("git@"): + host, path = url[4:].split(":", 1) + return f"https://{host}/{path}" + return url + + +def _slug(registry: str) -> str: + """``https://github.com/Owner/Repo`` -> ``Owner/Repo`` (what ``gh`` expects).""" + return _to_https(registry).removeprefix("https://github.com/") + + +@dataclass +class RegisterPlan: + submission_id: str + source: str + commit: str + registry: str + branch: str + pointer: str # rendered pointer.yaml + files: list[str] # entry-relative paths that will be added + title: str + body: str + + +@dataclass +class RegisterResult: + plan: RegisterPlan + pr_url: str | None # set when a PR was opened (gh path) + next_steps: str | None # set when manual steps remain (no-gh path) + + +def build_plan( + submission_dir: Path, registry: str, source: str | None = None, commit: str | None = None +) -> RegisterPlan: + sub_id = submission_dir.resolve().name + manifest = yaml.safe_load((submission_dir / "submission.yaml").read_text()) + # Overrides 
win; otherwise autodetect from the submission's own git remote/HEAD. The + # autodetect calls are skipped (short-circuited) when an override is supplied. + source = source or _to_https(_git(submission_dir, "remote", "get-url", "origin")) + commit = commit or _git(submission_dir, "rev-parse", "HEAD") + pointer = yaml.safe_dump({"submission_id": sub_id, "source": source, "commit": commit}, sort_keys=False) + files = ["pointer.yaml", "submission.yaml"] + [ + f"_stats/{p.name}" for p in sorted((submission_dir / "_stats").glob("*.json")) + ] + system = manifest["system"] + n_attempted = len(json.loads((submission_dir / "_stats" / "score.json").read_text())) + body = ( + f"Registers **{system['model']}** ({system['provider']}) + {system['agent']}.\n\n" + f"- instances attempted: {n_attempted}\n\n" + f"Source: {source}\nCommit: `{commit}`\n\n" + "Tier-0 verified (`programbench submit verify .`). Leaderboard scores are recomputed from `_stats/score.json`." + ) + return RegisterPlan( + sub_id, source, commit, registry, f"add-{sub_id}", pointer, files, f"Add submission: {sub_id}", body + ) + + +def write_entry(plan: RegisterPlan, submission_dir: Path, registry_root: Path) -> Path: + """Materialize ``submissions//`` under ``registry_root`` (overwriting any existing entry).""" + entry = registry_root / "submissions" / plan.submission_id + if entry.exists(): + shutil.rmtree(entry) + (entry / "_stats").mkdir(parents=True) + (entry / "pointer.yaml").write_text(plan.pointer) + shutil.copyfile(submission_dir / "submission.yaml", entry / "submission.yaml") + for p in sorted((submission_dir / "_stats").glob("*.json")): + shutil.copyfile(p, entry / "_stats" / p.name) + return entry + + +def register_submission( + submission_dir: Path, registry: str, source: str | None = None, commit: str | None = None +) -> RegisterResult: + """Clone the registry, commit the entry on a branch, and open the PR. 
+ + With ``gh``: maintainers (push access) get a branch + PR straight on the registry; + everyone else forks first (and a fork is only possible if the registry allows it). + Without ``gh`` it leaves the commit on a branch in a kept clone and returns the manual + push + compare-URL steps in ``next_steps`` (so the clone must outlive this call). + """ + plan = build_plan(submission_dir, registry, source, commit) + slug = _slug(registry) + clone = Path(tempfile.mkdtemp(prefix="programbench-register-")) / "submissions" + + if shutil.which("gh"): + # Maintainers push a branch straight to the registry; others fork (only works if the + # registry permits forks — org/private repos often disable them). + can_push = ( + subprocess.run( + ["gh", "api", f"repos/{slug}", "--jq", ".permissions.push"], capture_output=True, text=True + ).stdout.strip() + == "true" + ) + if can_push: + _git(clone.parent, "clone", "--depth", "1", _to_https(registry), str(clone)) + head = plan.branch + else: + # gh repo fork takes no destination arg, so it clones into /; + # running from clone.parent makes that equal `clone`. + subprocess.run( + ["gh", "repo", "fork", slug, "--clone", "--default-branch-only"], + cwd=clone.parent, + check=True, + capture_output=True, + text=True, + ) + login = subprocess.run( + ["gh", "api", "user", "--jq", ".login"], check=True, capture_output=True, text=True + ).stdout.strip() + head = f"{login}:{plan.branch}" + # Push over HTTPS: gh may wire an ssh remote, and ssh needs keys set up (and is blocked + # in some sandboxes), whereas gh's https credentials always work. + _git(clone, "remote", "set-url", "origin", _to_https(_git(clone, "remote", "get-url", "origin"))) + _git(clone, "checkout", "-b", plan.branch) + write_entry(plan, submission_dir, clone) + _git(clone, "add", f"submissions/{plan.submission_id}") + _commit(clone, plan.title) + # Force so re-running register updates an existing PR (the add- branch is ours). 
+ _git(clone, "push", "-u", "--force", "origin", plan.branch) + # Open the PR (explicit --head; gh's inference is unreliable). The branch lookup is the + # source of truth: gh pr create can exit nonzero yet still create the PR, and a PR for + # the branch may already exist from a prior run. + created = subprocess.run( + ["gh", "pr", "create", "--repo", slug, "--head", head, "--title", plan.title, "--body", plan.body], + cwd=clone, + capture_output=True, + text=True, + ) + pr_url = subprocess.run( + [ + "gh", + "pr", + "list", + "--repo", + slug, + "--head", + plan.branch, + "--state", + "open", + "--json", + "url", + "--jq", + ".[0].url", + ], + cwd=clone, + check=True, + capture_output=True, + text=True, + ).stdout.strip() + if not pr_url: + raise RuntimeError(f"gh pr create did not open a PR:\n{created.stderr or created.stdout}") + shutil.rmtree(clone.parent) + return RegisterResult(plan, pr_url, None) + + # No gh: clone the registry directly, commit the branch, and hand back the steps. + _git(clone.parent, "clone", "--depth", "1", _to_https(registry), str(clone)) + _git(clone, "checkout", "-b", plan.branch) + write_entry(plan, submission_dir, clone) + _git(clone, "add", f"submissions/{plan.submission_id}") + _commit(clone, plan.title) + steps = ( + "`gh` not found, so the PR was not opened. The entry is committed on branch " + f"`{plan.branch}` in:\n {clone}\n\n" + "To finish, from that clone push the branch to your fork of the registry and open a PR:\n" + " git remote add fork https://github.com//submissions\n" + f" git push -u fork {plan.branch}\n" + f" {_to_https(registry)}/compare/main...:{plan.branch}?expand=1" + ) + return RegisterResult(plan, None, steps) diff --git a/src/programbench/submission.py b/src/programbench/submission.py new file mode 100644 index 0000000..396a9ec --- /dev/null +++ b/src/programbench/submission.py @@ -0,0 +1,246 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Shared helpers for building (`package`) and checking (`verify`) submissions.

Both commands must score a run directory the same way, so the scoring and headline
aggregation live here and are imported by each command.
"""

import hashlib
import json
import shutil
import subprocess
import tarfile
import tempfile
import urllib.parse
import urllib.request
from dataclasses import asdict, dataclass
from pathlib import Path

import yaml

from programbench.eval.eval import EvaluationResult
from programbench.utils.load_data import get_active_branches, get_ignored_tests, load_all_instances

RESOLVED_THRESHOLD = 1.0
NEAR_RESOLVED_THRESHOLD = 0.95
FIXTURE_PREFIX = "testorg__"
DOWNLOAD_TIMEOUT = 60  # seconds; fail fast rather than hang on a stalled connection


def _checked_url(raw: str) -> str:
    """A submission-supplied URL, rejecting non-http(s) schemes (e.g. file://) to avoid SSRF
    / local file reads when resolving untrusted third-party submissions.

    Raises:
        ValueError: if the scheme is not http/https or the URL names no host.
    """
    url = raw.strip()
    parsed = urllib.parse.urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError(f"refusing to fetch non-http(s) URL: {url!r}")
    if not parsed.netloc:
        raise ValueError(f"refusing to fetch URL without a host: {url!r}")
    return url


def _expected_sha256(sha_file: Path) -> str | None:
    """First whitespace-separated token of a ``.sha256`` sidecar, lowercased; None if absent.

    Raises:
        ValueError: if the sidecar exists but is empty (nothing to check against).
    """
    if not sha_file.exists():
        return None
    tokens = sha_file.read_text().split()
    if not tokens:
        raise ValueError(f"{sha_file.name}: sha256 sidecar is empty")
    return tokens[0].lower()


def benchmark_instances() -> dict[str, dict]:
    """Real benchmark instances, keyed by id (excludes the bundled test fixture)."""
    return {i["instance_id"]: i for i in load_all_instances() if not i["instance_id"].startswith(FIXTURE_PREFIX)}


def sha256_file(path: Path) -> str:
    """Hex sha256 of a file, read in 1 MiB chunks so large tarballs are not loaded whole."""
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()


def test_results_map(eval_json: Path, instance: dict) -> dict[str, bool]:
    """Per-test pass/fail for one instance, after the same active-branch / ignored-test
    filtering as ``info``. Keyed by each test's ``full_name``, value ``True`` iff passed.

    This is the raw material a score is computed from, so the leaderboard can later
    recompute scores while striking out specific tests (see the registry's ignore map).
    """
    result = EvaluationResult.model_validate_json(eval_json.read_text())
    result = result.for_branches(get_active_branches(instance)).without_ignored(get_ignored_tests(instance))
    return {t.full_name: t.is_resolved for t in result.test_results}


def score_from_tests(tests: dict[str, bool], ignore: set[str] | frozenset[str] = frozenset()) -> float:
    """Fraction passed over the non-ignored tests (0.0 if none remain)."""
    kept = [passed for name, passed in tests.items() if name not in ignore]
    return sum(kept) / len(kept) if kept else 0.0


def score_instance(eval_json: Path, instance: dict) -> float:
    """Per-instance score with ignored-branch/test filtering (same logic as `info`)."""
    return score_from_tests(test_results_map(eval_json, instance))


def score_run(run_dir: Path, instances: dict[str, dict]) -> dict[str, float]:
    """Map instance_id -> score for every ``<iid>/<iid>.eval.json`` present and known.

    Sub-directories without an eval.json, or whose name is not a key of ``instances``,
    are skipped.
    """
    scores: dict[str, float] = {}
    for instance_dir in sorted(d for d in run_dir.iterdir() if d.is_dir()):
        iid = instance_dir.name
        eval_json = instance_dir / f"{iid}.eval.json"
        if eval_json.exists() and iid in instances:
            scores[iid] = score_instance(eval_json, instances[iid])
    return scores


def write_stat(run_dir: Path, stat: str, by_instance: dict[str, object]) -> None:
    """Write a per-instance stat file: ``<run_dir>/_stats/<stat>.json`` = ``{iid: value}``."""
    (run_dir / "_stats").mkdir(exist_ok=True)
    (run_dir / "_stats" / f"{stat}.json").write_text(json.dumps(by_instance, indent=2, sort_keys=True))


# Keys of a test result's ``extra`` dict that are moved into the heavy file.
_HEAVY_EXTRA_KEYS = ("message", "text")


def _full_name(t: dict) -> str:
    """``<branch>/<name>`` for a raw test-result dict, or just ``<name>`` without a branch."""
    return f"{t['branch']}/{t['name']}" if t.get("branch") else t["name"]


def split_eval_json(instance_dir: Path, iid: str) -> None:
    """Split ``<iid>.eval.json`` into a light eval.json + a heavy ``<iid>.eval.log.json``.

    The heavy file holds the only bulky parts — the top-level ``log`` and each failing
    test's ``message``/``text`` — keyed so the two recombine losslessly. Nothing is dropped;
    the union of the two files holds everything in the original eval.json (the rebuilt file
    is semantically identical, though not necessarily byte-for-byte).

    The heavy file is written before the light one replaces the original, so an
    interruption between the two writes cannot lose the log / failure text.
    """
    p = instance_dir / f"{iid}.eval.json"
    data = json.loads(p.read_text())
    # Idempotent: if there's nothing heavy left (already split, or genuinely light), do
    # nothing — never clobber an existing eval.log.json.
    has_heavy = bool(data.get("log")) or any(
        k in (t.get("extra") or {}) for t in data.get("test_results", []) for k in _HEAVY_EXTRA_KEYS
    )
    if not has_heavy:
        return
    heavy: dict = {"log": data.get("log") or [], "failures": {}}
    for t in data.get("test_results", []):
        extra = t.get("extra") or {}
        # A non-empty ``extra`` is the very dict stored in ``data``, so popping here
        # strips the bulky keys from the light copy as well.
        moved = {k: extra.pop(k) for k in _HEAVY_EXTRA_KEYS if k in extra}
        if moved:
            heavy["failures"][_full_name(t)] = moved
    data["log"] = []
    (instance_dir / f"{iid}.eval.log.json").write_text(json.dumps(heavy))
    p.write_text(json.dumps(data, indent=2))


def recombine_eval_json(instance_dir: Path, iid: str) -> bool:
    """Inverse of :func:`split_eval_json`: fold the heavy file back into ``<iid>.eval.json``
    (restoring the full eval output losslessly), then remove the heavy file and its
    ``.url``/``.sha256``.

    The heavy file is read locally, or downloaded from ``<iid>.eval.log.json.url`` if hosted;
    a downloaded file is checked against its ``.sha256`` sidecar when one is present.
    Returns True if a recombine happened.

    Raises:
        ValueError: on a rejected URL, an empty sha256 sidecar, or a sha256 mismatch.
    """
    light = instance_dir / f"{iid}.eval.json"
    log_file = instance_dir / f"{iid}.eval.log.json"
    url_file = instance_dir / f"{iid}.eval.log.json.url"
    sha_file = instance_dir / f"{iid}.eval.log.json.sha256"
    if not light.exists():
        return False
    if log_file.exists():
        heavy = json.loads(log_file.read_text())
    elif url_file.exists():
        with urllib.request.urlopen(_checked_url(url_file.read_text()), timeout=DOWNLOAD_TIMEOUT) as r:  # noqa: S310
            raw = r.read()
        expected = _expected_sha256(sha_file)
        if expected is not None and (got := hashlib.sha256(raw).hexdigest()) != expected:
            raise ValueError(f"{iid}: eval.log.json sha256 mismatch on download (got {got[:12]}…)")
        heavy = json.loads(raw)
    else:
        return False
    data = json.loads(light.read_text())
    data["log"] = heavy.get("log", [])
    failures = heavy.get("failures", {})
    for t in data.get("test_results", []):
        if (name := _full_name(t)) in failures:
            # ``extra`` may be absent or null in the light file; start from an empty dict then.
            extra = t.get("extra") or {}
            extra.update(failures[name])
            t["extra"] = extra
    light.write_text(json.dumps(data, indent=2))
    for f in (log_file, url_file, sha_file):
        f.unlink(missing_ok=True)
    return True


@dataclass
class Headline:
    """Aggregate numbers for a run, as produced by :func:`aggregate`."""

    mean_score: float
    resolved_pct: float
    near_resolved_pct: float
    n_instances_attempted: int
    n_instances_total: int

    def as_dict(self) -> dict:
        return asdict(self)


def aggregate(scores: dict[str, float], n_total: int) -> Headline:
    """Headline numbers for ``scores`` against a benchmark of ``n_total`` instances.

    Raises:
        ValueError: if ``scores`` is empty or ``n_total`` is not positive.
    """
    values = list(scores.values())
    if not values:
        raise ValueError("No scored instances found")
    if n_total <= 0:
        raise ValueError(f"n_total must be positive, got {n_total}")
    n = len(values)
    # mean is over attempted instances; resolved/near are over the full benchmark
    # (an unattempted task counts as unresolved).
    return Headline(
        mean_score=round(sum(values) / n, 4),
        resolved_pct=round(100 * sum(s >= RESOLVED_THRESHOLD for s in values) / n_total, 1),
        near_resolved_pct=round(100 * sum(s >= NEAR_RESOLVED_THRESHOLD for s in values) / n_total, 1),
        n_instances_attempted=n,
        n_instances_total=n_total,
    )


def load_manifest(submission_dir: Path) -> dict:
    """Parse ``<submission_dir>/submission.yaml`` with the safe YAML loader."""
    return yaml.safe_load((submission_dir / "submission.yaml").read_text())


def resolve_submission_tar(instance_dir: Path, dest_tar: Path) -> None:
    """Materialize an instance's submission.tar.gz into ``dest_tar``, verifying sha256.

    Supports three artifact forms: inline file, ``.url`` (downloaded), or
    ``submission.ref.yaml`` (git checkout packed). The sha256 sidecar, when present, is
    enforced for inline/url; for git it is advisory (packing is not byte-reproducible).

    Raises:
        ValueError: if no artifact form is present, the URL or git ref is rejected, the
            sidecar is empty, or the sha256 does not match.
    """
    expected = _expected_sha256(instance_dir / "submission.tar.gz.sha256")

    inline = instance_dir / "submission.tar.gz"
    url_file = instance_dir / "submission.tar.gz.url"
    ref_file = instance_dir / "submission.ref.yaml"
    if inline.exists():
        shutil.copy2(inline, dest_tar)
    elif url_file.exists():
        with (
            urllib.request.urlopen(_checked_url(url_file.read_text()), timeout=DOWNLOAD_TIMEOUT) as r,  # noqa: S310
            dest_tar.open("wb") as out,
        ):
            shutil.copyfileobj(r, out)
    elif ref_file.exists():
        _pack_git_ref(yaml.safe_load(ref_file.read_text()), dest_tar)
        expected = None  # git packing is not byte-reproducible; rely on re-eval instead
    else:
        raise ValueError(f"{instance_dir.name}: no submission.tar.gz, .url, or .ref.yaml found")

    if expected and (got := sha256_file(dest_tar)) != expected:
        raise ValueError(f"{instance_dir.name}: sha256 mismatch (expected {expected[:12]}…, got {got[:12]}…)")


def _pack_git_ref(ref: dict, dest_tar: Path) -> None:
    """Shallow-clone ``ref["repo"]`` at ``ref["ref"]`` and pack it (or ``ref["subpath"]``)
    into ``dest_tar`` as a gzipped tarball, leaving out the top-level ``.git`` directory.

    ``ref`` comes from a submission-supplied ``submission.ref.yaml``, so it is treated as
    untrusted: the repo is passed after ``--`` so it cannot be read as a git option, and
    ``subpath`` must stay inside the checkout.

    Raises:
        ValueError: if ``subpath`` escapes the checkout or is not a directory.
        subprocess.CalledProcessError: if the clone fails.
    """
    with tempfile.TemporaryDirectory() as tmp:
        src = Path(tmp) / "src"
        subprocess.run(
            ["git", "clone", "--depth", "1", f"--branch={ref['ref']}", "--", ref["repo"], str(src)],
            check=True,
            capture_output=True,
        )
        root = src / ref["subpath"] if ref.get("subpath") else src
        # resolve() also follows symlinks committed in the repo, so `..` and link tricks
        # both end up rejected here.
        if not root.resolve().is_relative_to(src.resolve()):
            raise ValueError(f"subpath escapes the checkout: {ref['subpath']!r}")
        if not root.is_dir():
            raise ValueError(f"subpath is not a directory in the checkout: {ref.get('subpath')!r}")
        with tarfile.open(dest_tar, "w:gz") as tar:
            for p in sorted(root.rglob("*")):
                rel = p.relative_to(root).as_posix()
                if rel.split("/", 1)[0] == ".git":
                    continue
                tar.add(p, arcname=rel, recursive=False)


# diff --git a/src/programbench/verify.py b/src/programbench/verify.py
# new file mode 100644
# index 0000000..edb1334
# --- /dev/null
# +++ b/src/programbench/verify.py
# @@ -0,0 +1,116 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Verify a packaged submission against its own artifacts.

Tier 0 (default, no Docker): recompute each instance's per-test pass/fail from its own
eval.json and check it matches the submitted _stats/score.json — i.e. the reported scores
faithfully reflect the eval output. A free check a third party or CI can run with only
``programbench`` installed. (Leaderboard scores aren't stored in the submission, so there
is no headline to check against.)

Tier 1 (--tier1, Docker): resolve each submission.tar.gz, re-run ``programbench eval``,
and confirm the freshly produced scores match the submitted eval.json. This is what
proves the artifacts actually yield the reported results.
"""

import json
import re
import tempfile
from dataclasses import dataclass
from pathlib import Path

from programbench.submission import (
    benchmark_instances,
    resolve_submission_tar,
    score_run,
    test_results_map,
)

TOLERANCE = 1e-6  # Tier-1 score floats are rounded; this only absorbs representation noise.


@dataclass
class Check:
    """One comparison made by a verify tier; ``ok`` is True iff claimed and computed agree."""

    name: str
    claimed: object
    computed: object
    ok: bool


@dataclass
class VerifyResult:
    """All checks made by one verify tier (0 or 1)."""

    tier: int
    checks: list[Check]

    @property
    def ok(self) -> bool:
        # Zero checks means nothing was verified (empty score.json, or a filter that matched
        # nothing); that must not read as a pass.
        return bool(self.checks) and all(c.ok for c in self.checks)


def _close(a: object, b: object) -> bool:
    """True iff both values are numbers within ``TOLERANCE`` of each other."""
    # Non-numeric (e.g. a user-edited/invalid manifest value) is a failed check, not a crash.
    if not isinstance(a, (int, float)) or not isinstance(b, (int, float)):
        return False
    return abs(a - b) <= TOLERANCE


def _tally(results: object) -> str:
    """``"<passed>/<total> pass"`` for a per-test map; a label for anything that isn't one."""
    if not isinstance(results, dict):
        return f"invalid entry ({type(results).__name__})"
    return f"{sum(bool(v) for v in results.values())}/{len(results)} pass"


def verify_tier0(submission_dir: Path) -> VerifyResult:
    """Per instance, recompute the per-test pass/fail from its eval.json and check it matches
    the submitted _stats/score.json (so the stored scores reflect the eval output, untampered).

    A malformed score.json (not a JSON object) or a malformed entry is reported as a failed
    check rather than raised.
    """
    instances = benchmark_instances()
    stored = json.loads((submission_dir / "_stats" / "score.json").read_text())
    if not isinstance(stored, dict):
        return VerifyResult(0, [Check("_stats/score.json", type(stored).__name__, "expected a JSON object", False)])
    checks = []
    for iid, stored_map in sorted(stored.items()):
        eval_json = submission_dir / iid / f"{iid}.eval.json"
        if iid not in instances:
            checks.append(Check(iid, "in score.json", "not a benchmark instance", False))
        elif not eval_json.exists():
            checks.append(Check(iid, _tally(stored_map), "no eval.json", False))
        else:
            recomputed = test_results_map(eval_json, instances[iid])
            checks.append(Check(iid, _tally(stored_map), _tally(recomputed), recomputed == stored_map))
    return VerifyResult(0, checks)


def verify_tier1(submission_dir: Path, *, workers: int = 1, filter_spec: str = "") -> VerifyResult:
    """Re-run the evaluation on each targeted instance's resolved submission.tar.gz and
    compare the fresh score with the score computed from the submitted eval.json.

    Args:
        submission_dir: Packaged submission directory.
        workers: Passed through to ``run_eval_batch``.
        filter_spec: Regex; only instance ids it matches (``re.match``) are re-evaluated.
            Empty means every submitted instance.
    """
    from programbench.eval.eval_batch import run_eval_batch

    instances = benchmark_instances()
    sub_root = submission_dir
    submitted = score_run(sub_root, instances)

    # Same regex semantics as the re-eval filter (instance_filters.filter_instances): only
    # resolve/download and re-eval the targeted instances, not every submitted tarball.
    pattern = re.compile(filter_spec) if filter_spec else None
    targets = [iid for iid in submitted if pattern is None or pattern.match(iid)]
    if not targets:
        # Nothing to re-evaluate; the empty result reports not-ok instead of a vacuous pass.
        return VerifyResult(1, [])

    with tempfile.TemporaryDirectory() as tmp:
        run = Path(tmp)
        for iid in targets:
            (run / iid).mkdir(parents=True)
            resolve_submission_tar(sub_root / iid, run / iid / "submission.tar.gz")
        run_eval_batch(sources=[run], workers=workers, filter_spec=filter_spec, force=True)
        fresh = score_run(run, instances)

    # A targeted instance that produced no fresh score is reported as a failure (NaN), not
    # silently skipped.
    checks = [
        Check(
            iid,
            round(submitted[iid], 4),
            round(fresh[iid], 4) if iid in fresh else float("nan"),
            _close(submitted[iid], fresh.get(iid)),
        )
        for iid in targets
    ]
    return VerifyResult(1, checks)


# diff --git a/tests/test_cli.py b/tests/test_cli.py
# index 594bd9d..984085d 100644
# --- a/tests/test_cli.py
# +++ b/tests/test_cli.py
# @@ -37,3 +37,27 @@ def test_blob_sync_help():
#     result = runner.invoke(app, ["blob", "sync", "--help"])
#     assert result.exit_code == 0
#     assert "instance" in result.output.lower()
# (the three lines above are unchanged hunk context; everything below, up to the next
# patch header, is added to tests/test_cli.py, which defines `runner` and `app`)


def test_submit_help():
    result = runner.invoke(app, ["submit", "--help"])
    assert result.exit_code == 0
    assert all(cmd in result.output for cmd in ("package", "publish", "verify", "register", "recombine"))


def test_submit_package_help():
    result = runner.invoke(app, ["submit", "package", "--help"])
    assert result.exit_code == 0
    assert "upload" in result.output.lower()


def test_submit_register_help():
    result = runner.invoke(app, ["submit", "register", "--help"])
    assert result.exit_code == 0
    assert "registry" in result.output.lower()


def test_submit_publish_help():
    result = runner.invoke(app, ["submit", "publish", "--help"])
    assert result.exit_code == 0
    assert "owner" in result.output.lower()


# diff --git a/tests/test_submission.py b/tests/test_submission.py
# new file mode 100644
# index 0000000..9d1d5d4
# --- /dev/null
# +++ b/tests/test_submission.py
# @@ -0,0 +1,53 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Tests for submission helpers that don't need the benchmark data."""

import json

from programbench.submission import recombine_eval_json, split_eval_json


def test_split_recombine_roundtrip_is_lossless(tmp_path):
    iid = "org__tool.abc1234"
    d = tmp_path / iid
    d.mkdir()
    original = {
        "instance_id": iid,
        "log": [{"step": 1, "out": "x" * 200}, {"step": 2, "out": "y"}],
        "test_results": [
            {"branch": "main", "name": "t_pass", "is_resolved": True, "extra": {"duration": 0.5}},
            {
                "branch": "main",
                "name": "t_fail",
                "is_resolved": False,
                "extra": {"message": "assertion failed", "text": "trace " * 50, "duration": 1.2},
            },
            {"branch": "feat", "name": "t_other", "is_resolved": False, "extra": {"text": "boom"}},
        ],
    }
    eval_json = d / f"{iid}.eval.json"
    eval_json.write_text(json.dumps(original, indent=2))

    split_eval_json(d, iid)
    light = json.loads(eval_json.read_text())
    assert light["log"] == []
    assert "message" not in light["test_results"][1]["extra"]
    assert (d / f"{iid}.eval.log.json").exists()

    # A second split finds nothing heavy left and must leave the heavy file untouched.
    heavy_before = (d / f"{iid}.eval.log.json").read_text()
    split_eval_json(d, iid)
    assert (d / f"{iid}.eval.log.json").read_text() == heavy_before

    assert recombine_eval_json(d, iid) is True
    assert json.loads(eval_json.read_text()) == original
    assert not (d / f"{iid}.eval.log.json").exists()


def test_split_is_idempotent_and_noop_when_light(tmp_path):
    iid = "org__tool.def5678"
    d = tmp_path / iid
    d.mkdir()
    light = {"instance_id": iid, "log": [], "test_results": [{"branch": "main", "name": "t", "is_resolved": True}]}
    (d / f"{iid}.eval.json").write_text(json.dumps(light))
    split_eval_json(d, iid)
    assert not (d / f"{iid}.eval.log.json").exists()  # nothing heavy -> no split file written