Overworldai · SandersAaronD · Jun 3, 2026 · Jun 3, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/.github/benchmark.py b/.github/benchmark.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+"""GPU benchmark + consistency for the world_engine CI (issue #48).
+
+Run on the ephemeral G4 runner by .github/workflows/benchmark.yml. `run` does a
+performance rollout (LFPS) and a deterministic consistency forward pass for one
+(config, ref) and writes a JSON result + a latent .npy. `compare` turns the
+collected results into a markdown table (perf delta + consistency MSE). The
+workflow runs `run` against both the main and PR world_engine installs (only
+the engine code differs); "main" runs first and creates the shared KV-cache
+state that "pr" then loads.
+
+  python .github/benchmark.py run \
+      --config-id wp15-1b-bf16 --ref main \
+      --model-uri Overworld/Waypoint-1.5-1B --quant none \
+      --shared-state results/state_wp15-1b-bf16.pt --out results
+
+  python .github/benchmark.py compare --results-dir results --out results/summary.md
+"""
+import argparse
+import json
+import platform
+import time
+from pathlib import Path
+
+
+# Number of gen_frame() calls run_consistency() makes after engine.reset() before
+# it saves engine.get_state() as the shared state ("fully populated" KV cache).
+CONSISTENCY_PREROLL = 8
+
+
+def env_info():
+    import torch
+    info = {
+        "torch": torch.__version__,
+        "torch_cuda": torch.version.cuda,
+        "os": f"{platform.system()} {platform.release()} ({platform.machine()})",
+        "gpu": None,
+    }
+    if torch.cuda.is_available():
+        p = torch.cuda.get_device_properties(torch.cuda.current_device())
+        info["gpu"] = {
+            "name": p.name,
+            "capability": f"{p.major}.{p.minor}",
+            "memory_gb": round(p.total_memory / 1e9, 1),
+        }
+    return info
+
+
+def build_engine(model_uri, quant, overrides):
+    from world_engine import WorldEngine
+    return WorldEngine(
+        model_uri,
+        quant=quant,
+        model_config_overrides=overrides,
+        device="cuda",
+    )
+
+
+def run_perf(engine, n_frames):
+    """Time an n_frames DiT-only rollout. LFPS = latent frames per second."""
+    import torch
+
+    for _ in range(3):  # warmup
+        engine.gen_frame(return_img=False)
+    engine.reset()
+    engine.gen_frame(return_img=False)  # prime
+    torch.cuda.synchronize()
+
+    t0 = time.perf_counter()
+    for _ in range(n_frames):
+        engine.gen_frame(return_img=False)
+    torch.cuda.synchronize()
+    elapsed = time.perf_counter() - t0
+    return {"status": "ok", "n_frames": n_frames, "elapsed_s": elapsed, "lfps": n_frames / elapsed}
+
+
+def run_consistency(engine, shared_state: Path, latent_path: Path):
+    """Deterministic single forward from a shared, fully-populated KV cache.
+
+    The first ref to run creates the shared state (snapshotting the cache after
+    a fixed pre-roll) and saves it; every other ref loads that exact state, so
+    the only variable is the engine code. Latent is saved as .npy so the
+    aggregator needs numpy only (no torch/GPU).
+    """
+    import numpy as np
+    import torch
+
+    # NB: full determinism also needs CUBLAS_WORKSPACE_CONFIG=:4096:8 in the
+    # environment (set by the workflow) before CUDA initialises.
+    torch.use_deterministic_algorithms(True)
+    torch.manual_seed(0)
+
+    if shared_state.exists():
+        engine.load_state(torch.load(shared_state))
+        produced = False
+    else:
+        engine.reset()
+        for _ in range(CONSISTENCY_PREROLL):
+            engine.gen_frame(return_img=False)
+        shared_state.parent.mkdir(parents=True, exist_ok=True)
+        torch.save(engine.get_state(), shared_state)
+        produced = True
+
+    latent = engine.gen_frame(return_img=False).detach().float().cpu().numpy()
+    latent_path.parent.mkdir(parents=True, exist_ok=True)
+    np.save(latent_path, latent)
+    return {
+        "status": "ok",
+        "produced_shared_state": produced,
+        "latent_shape": list(latent.shape),
+        "latent_file": latent_path.name,
+    }
+
+
+def cmd_run(args):
+    quant = None if (args.quant or "none").lower() in ("none", "null", "") else args.quant
+    overrides = (
+        json.loads(args.overrides)
+        if args.overrides and args.overrides.lower() not in ("none", "null")
+        else None
+    )
+    out = Path(args.out)
+    out.mkdir(parents=True, exist_ok=True)
+
+    result = {
+        "config_id": args.config_id,
+        "ref": args.ref,
+        "model_uri": args.model_uri,
+        "quant": quant,
+        "overrides": overrides,
+        "env": env_info(),
+        "perf": None,
+        "consistency": None,
+    }
+
+    try:
+        engine = build_engine(args.model_uri, quant, overrides)
+    except Exception as e:  # engine init failure -> whole config marked failed
+        result["error"] = f"engine init failed: {e!r}"
+        _write_result(out, args, result)
+        return
+
+    if args.mode in ("perf", "both"):
+        try:
+            result["perf"] = run_perf(engine, args.n_frames)
+        except Exception as e:
+            result["perf"] = {"status": "failed", "error": repr(e)}
+
+    if args.mode in ("consistency", "both"):
+        try:
+            result["consistency"] = run_consistency(
+                engine,
+                Path(args.shared_state),
+                out / f"latent_{args.config_id}_{args.ref}.npy",
+            )
+        except Exception as e:
+            result["consistency"] = {"status": "failed", "error": repr(e)}
+
+    _write_result(out, args, result)
+
+
+def _write_result(out: Path, args, result):
+    path = out / f"result_{args.config_id}_{args.ref}.json"
+    path.write_text(json.dumps(result, indent=2))
+    print(f"wrote {path}")
+
+
+def _lfps(result):
+    perf = (result or {}).get("perf") or {}
+    return perf.get("lfps") if perf.get("status") == "ok" else None
+
+
+def cmd_compare(args):
+    import numpy as np
+
+    rd = Path(args.results_dir)
+    results = {}
+    for f in sorted(rd.glob("result_*.json")):
+        r = json.loads(f.read_text())
+        results[(r["config_id"], r["ref"])] = r
+    config_ids = sorted({c for c, _ in results})
+
+    lines = ["## GPU benchmark (issue #48)", ""]
+    gpu = next((r["env"].get("gpu") for r in results.values() if r.get("env")), None)
+    if gpu:
+        lines += [f"Machine: **{gpu['name']}** ({gpu['memory_gb']} GB, cc {gpu['capability']})", ""]
+
+    lines += ["### Performance — LFPS (256-frame rollout)", "",
+              "| Config | main | PR | Δ% |", "|---|---|---|---|"]
+    for cid in config_ids:
+        ml, pl = _lfps(results.get((cid, "main"))), _lfps(results.get((cid, "pr")))
+        if isinstance(ml, float) and isinstance(pl, float) and ml:
+            delta = f"{(pl - ml) / ml * 100:+.1f}%"
+        else:
+            delta = "—"
+        fmt = lambda v: f"{v:.2f}" if isinstance(v, float) else "**FAILED**"
+        lines.append(f"| `{cid}` | {fmt(ml)} | {fmt(pl)} | {delta} |")
+
+    lines += ["", "### Consistency — MSE(main, PR) latent", "",
+              "| Config | MSE | status |", "|---|---|---|"]
+    for cid in config_ids:
+        try:
+            a = np.load(rd / f"latent_{cid}_main.npy")
+            b = np.load(rd / f"latent_{cid}_pr.npy")
+            lines.append(f"| `{cid}` | {float(np.mean((a - b) ** 2)):.3e} | ok |")
+        except Exception as e:
+            lines.append(f"| `{cid}` | — | **FAILED**: {e} |")
+
+    Path(args.out).write_text("\n".join(lines) + "\n")
+    print("\n".join(lines))
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    sub = ap.add_subparsers(dest="cmd", required=True)
+
+    r = sub.add_parser("run", help="run perf+consistency for one (config, ref)")
+    r.add_argument("--config-id", required=True)
+    r.add_argument("--ref", required=True, help="label for this engine build, e.g. main or pr")
+    r.add_argument("--model-uri", required=True)
+    r.add_argument("--quant", default="none")
+    r.add_argument("--overrides", default=None, help="JSON dict of model_config_overrides")
+    r.add_argument("--mode", choices=["perf", "consistency", "both"], default="both")
+    r.add_argument("--n-frames", type=int, default=256)
+    r.add_argument("--shared-state", required=True, help="path to the shared KV-cache state .pt")
+    r.add_argument("--out", required=True)
+    r.set_defaults(func=cmd_run)
+
+    c = sub.add_parser("compare", help="aggregate results into a markdown table")
+    c.add_argument("--results-dir", required=True)
+    c.add_argument("--out", required=True)
+    c.set_defaults(func=cmd_compare)
+
+    args = ap.parse_args()
+    args.func(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/runner-startup.sh b/.github/runner-startup.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# Startup script for the ephemeral GPU runner VM (issue #48).
+#
+# Passed to `gcloud compute instances create` as metadata, so it runs at boot
+# with no SSH in (which is why the provisioner SA needs only create/delete, not
+# setMetadata). It registers this VM as a one-shot (`--ephemeral`) self-hosted
+# GitHub Actions runner; after the single job it unregisters itself, and the
+# `stop-runner` job (plus the VM's max-run-duration backstop) deletes the VM.
+#
+# Required instance metadata: gh_repo, runner_token, runner_name, runner_labels.
+# The base image (Deep Learning VM, common-cu129-…-nvidia-580) already has the
+# NVIDIA driver; world_engine deps are installed by the workflow's job steps.
+set -euo pipefail
+# Echo every command and announce the failing line. This output lands on the VM
+# serial console, which the workflow's "Wait for runner to register" step dumps
+# into the Actions log if registration doesn't complete in time.
+set -x
+trap 'echo "[runner-startup] FAILED at line ${LINENO} (exit $?)" >&2' ERR
+
+RUNNER_VERSION="2.334.0"
+meta() { curl -s -H "Metadata-Flavor: Google" \
+  "http://metadata.google.internal/computeMetadata/v1/instance/attributes/$1"; }
+
+GH_REPO="$(meta gh_repo)"
+RUNNER_TOKEN="$(meta runner_token)"
+RUNNER_NAME="$(meta runner_name)"
+RUNNER_LABELS="$(meta runner_labels)"
+
+# The Actions runner refuses to run as root; this is a throwaway single-job VM,
+# so allow it rather than provisioning a dedicated user.
+export RUNNER_ALLOW_RUNASROOT=1
+
+mkdir -p /actions-runner && cd /actions-runner
+curl -sL -o runner.tar.gz \
+  "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz"
+tar xzf runner.tar.gz
+
+# Install the runner's OS dependencies (libicu et al.). Missing libs are a
+# common silent cause of config.sh failing on minimal/base images.
+./bin/installdependencies.sh
+
+./config.sh \
+  --unattended \
+  --ephemeral \
+  --url "https://github.com/${GH_REPO}" \
+  --token "${RUNNER_TOKEN}" \
+  --name "${RUNNER_NAME}" \
+  --labels "${RUNNER_LABELS}" \
+  --replace
+
+# Blocks until the single job completes, then the ephemeral runner exits and
+# unregisters itself from GitHub.
+./run.sh