From 2639057c84135410db11df97cc72e2ef2e2e3fc5 Mon Sep 17 00:00:00 2001
From: Aaron Sanders <sanders.aaron.d@gmail.com>
Date: Tue, 2 Jun 2026 17:09:14 -0700
Subject: [PATCH 1/7] Add GPU benchmark + consistency CI workflow (issue #48)

Adds an ephemeral-runner workflow that, on ready-for-review PRs, provisions a
G4 (RTX PRO 6000) VM in overworld-dev as a one-shot self-hosted runner, runs a
256-frame perf rollout (LFPS) and a deterministic consistency forward pass for
each config against both main and the PR HEAD, posts a comparison table + MSE,
then deletes the VM.

- examples/ci.py: provider-agnostic perf+consistency harness and aggregator
- .github/runner-startup.sh: registers the VM as an ephemeral runner (no SSH,
  fits the provisioner SA's create/delete-only permissions)
- .github/workflows/benchmark.yml: start-runner / benchmark / aggregate /
  stop-runner

Draft until the standard RTX PRO 6000 quota request for overworld-dev clears.
---
 .github/runner-startup.sh       |  44 ++++++
 .github/workflows/benchmark.yml | 163 +++++++++++++++++++++
 examples/ci.py                  | 244 ++++++++++++++++++++++++++++++++
 3 files changed, 451 insertions(+)
 create mode 100755 .github/runner-startup.sh
 create mode 100644 .github/workflows/benchmark.yml
 create mode 100755 examples/ci.py

diff --git a/.github/runner-startup.sh b/.github/runner-startup.sh
new file mode 100755
index 0000000..ae4bd11
--- /dev/null
+++ b/.github/runner-startup.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Startup script for the ephemeral GPU runner VM (issue #48).
+#
+# Passed to `gcloud compute instances create` as metadata, so it runs at boot
+# with no SSH in (which is why the provisioner SA needs only create/delete, not
+# setMetadata). It registers this VM as a one-shot (`--ephemeral`) self-hosted
+# GitHub Actions runner; after the single job it unregisters itself, and the
+# `stop-runner` job (plus the VM's max-run-duration backstop) deletes the VM.
+#
+# Required instance metadata: gh_repo, runner_token, runner_name, runner_labels.
+# The base image (Deep Learning VM, common-cu129-…-nvidia-580) already has the
+# NVIDIA driver; world_engine deps are installed by the workflow's job steps.
+set -euo pipefail
+
+RUNNER_VERSION="2.323.0"   # TODO: pin to the current actions/runner release
+# meta KEY: print one instance-metadata attribute; -f makes a missing key fail.
+meta() { curl -fsS -H "Metadata-Flavor: Google" \
+  "http://metadata.google.internal/computeMetadata/v1/instance/attributes/$1"; }
+GH_REPO="$(meta gh_repo)"
+RUNNER_TOKEN="$(meta runner_token)"
+RUNNER_NAME="$(meta runner_name)"
+RUNNER_LABELS="$(meta runner_labels)"
+
+# The Actions runner refuses to run as root; this is a throwaway single-job VM,
+# so allow it rather than provisioning a dedicated user.
+export RUNNER_ALLOW_RUNASROOT=1
+
+mkdir -p /actions-runner && cd /actions-runner
+# -f: fail on an HTTP error instead of saving the error page as the tarball.
+curl -fsSL -o runner.tar.gz \
+  "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz"
+tar xzf runner.tar.gz
+./config.sh \
+  --unattended \
+  --ephemeral \
+  --url "https://github.com/${GH_REPO}" \
+  --token "${RUNNER_TOKEN}" \
+  --name "${RUNNER_NAME}" \
+  --labels "${RUNNER_LABELS}" \
+  --replace
+
+# Blocks until the single job completes, then the ephemeral runner exits and
+# unregisters itself from GitHub.
+./run.sh
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..b49ba31
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,163 @@
+# GPU benchmark + consistency suite for ready-for-review PRs (issue #48).
+#
+# Flow: start-runner provisions an ephemeral G4 (RTX PRO 6000) VM in
+# overworld-dev that registers itself as a one-shot self-hosted runner ->
+# benchmark runs perf + consistency for each config against both `main` and the
+# PR HEAD on that GPU -> aggregate posts a comparison table -> stop-runner
+# deletes the VM (with a max-run-duration backstop in case the job dies).
+#
+# Required repo secrets:
+#   WIF_PROVIDER_DEV     - overworld-dev WIF provider (infra bootstrap output)
+#   WE_CI_PROVISIONER_SA - dev-we-ci-provisioner@overworld-dev.iam.gserviceaccount.com
+#   WE_CI_NODE_SA        - dev-we-ci-node@overworld-dev.iam.gserviceaccount.com
+#   GH_RUNNER_PAT        - PAT (or App token) with repo Administration:write,
+#                          used to mint runner registration tokens
+#   HF_TOKEN             - Hugging Face token for gated Overworld/* model pulls
+#
+# NOTE: requires the standard (non-preemptible) RTX PRO 6000 quota in
+# overworld-dev/us-central1, which is currently 0 (request pending). Until then
+# the create step fails on quota — switch --provisioning-model to SPOT to test
+# against the preemptible quota (subject to capacity / mid-run preemption).
+
+name: GPU Benchmark
+
+on:
+  pull_request:
+    types: [ready_for_review, synchronize, reopened]
+
+permissions:
+  contents: read
+  id-token: write        # Workload Identity Federation
+  pull-requests: write   # post the results comment
+
+env:
+  GCP_PROJECT: overworld-dev
+  ZONE: us-central1-b
+  MACHINE_TYPE: g4-standard-48
+  IMAGE_FAMILY: common-cu129-ubuntu-2404-nvidia-580
+  IMAGE_PROJECT: deeplearning-platform-release
+  RUNNER_NAME: we-ci-${{ github.run_id }}
+  RUNNER_LABEL: gha-${{ github.run_id }}
+  # WorldEngine configs to test (issue #48 starter set). id|model_uri|quant
+  CONFIGS: |
+    wp15-1b-bf16|Overworld/Waypoint-1.5-1B|none
+    wp15-1b-intw8a8|Overworld/Waypoint-1.5-1B|intw8a8
+
+jobs:
+  start-runner:
+    if: github.event.pull_request.draft == false
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - id: auth
+        uses: google-github-actions/auth@v2
+        with:
+          workload_identity_provider: ${{ secrets.WIF_PROVIDER_DEV }}
+          service_account: ${{ secrets.WE_CI_PROVISIONER_SA }}
+      - uses: google-github-actions/setup-gcloud@v2
+      - name: Mint runner registration token
+        id: reg
+        run: |
+          token=$(curl -fsX POST \
+            -H "Authorization: token ${{ secrets.GH_RUNNER_PAT }}" \
+            -H "Accept: application/vnd.github+json" \
+            "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" \
+            | jq -r .token)
+          echo "::add-mask::$token"
+          echo "token=$token" >> "$GITHUB_OUTPUT"
+      - name: Create ephemeral GPU runner VM
+        run: |
+          gcloud compute instances create "$RUNNER_NAME" \
+            --project "$GCP_PROJECT" --zone "$ZONE" \
+            --machine-type "$MACHINE_TYPE" \
+            --maintenance-policy TERMINATE --provisioning-model STANDARD \
+            --image-family "$IMAGE_FAMILY" --image-project "$IMAGE_PROJECT" \
+            --boot-disk-size 200GB --boot-disk-type hyperdisk-balanced \
+            --service-account "${{ secrets.WE_CI_NODE_SA }}" --scopes cloud-platform \
+            --max-run-duration 3h --instance-termination-action DELETE \
+            --metadata-from-file startup-script=.github/runner-startup.sh \
+            --metadata "gh_repo=${{ github.repository }},runner_token=${{ steps.reg.outputs.token }},runner_name=$RUNNER_NAME,runner_labels=$RUNNER_LABEL"
+
+  benchmark:
+    needs: start-runner
+    runs-on: [self-hosted, "gha-${{ github.run_id }}"]
+    env:
+      CUBLAS_WORKSPACE_CONFIG: ":4096:8"   # required for deterministic cuBLAS
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+    steps:
+      - name: Checkout PR HEAD
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+          path: pr
+      - name: Checkout main
+        uses: actions/checkout@v4
+        with:
+          ref: main
+          path: main
+      - name: Install uv
+        run: curl -LsSf https://astral.sh/uv/install.sh | sh
+      - name: Run perf + consistency (main, then PR, per config)
+        run: |
+          set -euo pipefail
+          mkdir -p results
+          # Use the PR's harness against BOTH engine installs so only the
+          # world_engine package differs between refs.
+          cp pr/examples/ci.py main/examples/ci.py
+          while IFS='|' read -r id uri quant; do
+            [ -z "$id" ] && continue
+            for ref in main pr; do   # main first: it creates the shared state
+              ( cd "$ref" && uv sync --extra dev >/dev/null && \
+                uv run python examples/ci.py run \
+                  --config-id "$id" --ref "$ref" \
+                  --model-uri "$uri" --quant "$quant" \
+                  --shared-state "$GITHUB_WORKSPACE/results/state_${id}.pt" \
+                  --out "$GITHUB_WORKSPACE/results" )
+            done
+          done <<< "${CONFIGS}"
+      - uses: actions/upload-artifact@v4
+        with:
+          name: gpu-bench-results
+          path: results/
+
+  aggregate:
+    needs: benchmark
+    if: always() && needs.benchmark.result != 'skipped'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/download-artifact@v4
+        with:
+          name: gpu-bench-results
+          path: results
+      - name: Build summary
+        run: |
+          pip install --quiet numpy
+          python examples/ci.py compare --results-dir results --out results/summary.md
+      - name: Comment on PR
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const body = fs.readFileSync('results/summary.md', 'utf8');
+            await github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body,
+            });
+
+  stop-runner:
+    needs: [start-runner, benchmark]
+    if: always() && needs.start-runner.result != 'skipped'   # draft PR: no VM
+    runs-on: ubuntu-latest
+    steps:
+      - uses: google-github-actions/auth@v2
+        with:
+          workload_identity_provider: ${{ secrets.WIF_PROVIDER_DEV }}
+          service_account: ${{ secrets.WE_CI_PROVISIONER_SA }}
+      - uses: google-github-actions/setup-gcloud@v2
+      - name: Delete runner VM
+        run: |
+          gcloud compute instances delete "$RUNNER_NAME" \
+            --project "$GCP_PROJECT" --zone "$ZONE" --quiet || true
diff --git a/examples/ci.py b/examples/ci.py
new file mode 100755
index 0000000..fbb4f74
--- /dev/null
+++ b/examples/ci.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+"""world_engine CI perf + consistency harness (issue #48).
+
+Provider-agnostic: this script knows nothing about GCP/GitHub Actions. Given a
+WorldEngine config it runs a performance rollout (LFPS) and/or a deterministic
+consistency forward pass, writing JSON results (+ a latent .npy for
+consistency) to --out. A separate `compare` subcommand turns a directory of
+per-ref results into a markdown table (perf delta + consistency MSE).
+
+The CI workflow runs the SAME copy of this script against two world_engine
+installs (main and the PR HEAD) so that only the engine code differs.
+
+Usage:
+  # produce results for one (config, ref); run "main" first so it creates the
+  # shared KV-cache state that "pr" then loads.
+  python examples/ci.py run \
+      --config-id wp15-1b-bf16 --ref main \
+      --model-uri Overworld/Waypoint-1.5-1B --quant none \
+      --shared-state results/state_wp15-1b-bf16.pt --out results
+
+  # aggregate
+  python examples/ci.py compare --results-dir results --out results/summary.md
+"""
+import argparse
+import json
+import platform
+import time
+from pathlib import Path
+
+
+# How many gen_frame() steps to pre-roll before snapshotting the KV cache for
+# the consistency check ("fully populated cache").
+CONSISTENCY_PREROLL = 8
+
+
+def env_info():
+    import torch
+    info = {
+        "torch": torch.__version__,
+        "torch_cuda": torch.version.cuda,
+        "os": f"{platform.system()} {platform.release()} ({platform.machine()})",
+        "gpu": None,
+    }
+    if torch.cuda.is_available():
+        p = torch.cuda.get_device_properties(torch.cuda.current_device())
+        info["gpu"] = {
+            "name": p.name,
+            "capability": f"{p.major}.{p.minor}",
+            "memory_gb": round(p.total_memory / 1e9, 1),
+        }
+    return info
+
+
+def build_engine(model_uri, quant, overrides):
+    from world_engine import WorldEngine
+    return WorldEngine(
+        model_uri,
+        quant=quant,
+        model_config_overrides=overrides,
+        device="cuda",
+    )
+
+
+def run_perf(engine, n_frames):
+    """Time an n_frames DiT-only rollout. LFPS = latent frames per second."""
+    import torch
+
+    for _ in range(3):  # warmup
+        engine.gen_frame(return_img=False)
+    engine.reset()
+    engine.gen_frame(return_img=False)  # prime
+    torch.cuda.synchronize()
+
+    t0 = time.perf_counter()
+    for _ in range(n_frames):
+        engine.gen_frame(return_img=False)
+    torch.cuda.synchronize()
+    elapsed = time.perf_counter() - t0
+    return {"status": "ok", "n_frames": n_frames, "elapsed_s": elapsed, "lfps": n_frames / elapsed}
+
+
+def run_consistency(engine, shared_state: Path, latent_path: Path):
+    """Deterministic single forward from a shared, fully-populated KV cache.
+
+    The first ref to run pre-rolls, snapshots the cache and saves it; every
+    other ref loads that exact state. The global RNG is re-seeded right before
+    the compared frame, since only the creating ref runs the pre-roll (which
+    may advance it). Latent is saved as .npy so the aggregator needs numpy only.
+    """
+    import numpy as np
+    import torch
+
+    # NB: full determinism also needs CUBLAS_WORKSPACE_CONFIG=:4096:8 in the
+    # environment (set by the workflow) before CUDA initialises.
+    torch.use_deterministic_algorithms(True)
+    torch.manual_seed(0)
+    if shared_state.exists():
+        engine.load_state(torch.load(shared_state))
+        produced = False
+    else:
+        engine.reset()
+        for _ in range(CONSISTENCY_PREROLL):
+            engine.gen_frame(return_img=False)
+        shared_state.parent.mkdir(parents=True, exist_ok=True)
+        torch.save(engine.get_state(), shared_state)
+        produced = True
+
+    torch.manual_seed(0)  # realign RNG: pre-roll ran on the creating ref only
+    latent = engine.gen_frame(return_img=False).detach().float().cpu().numpy()
+    latent_path.parent.mkdir(parents=True, exist_ok=True)
+    np.save(latent_path, latent)
+    return {
+        "status": "ok",
+        "produced_shared_state": produced,
+        "latent_shape": list(latent.shape),
+        "latent_file": latent_path.name,
+    }
+
+
+def cmd_run(args):
+    quant = None if (args.quant or "none").lower() in ("none", "null", "") else args.quant
+    overrides = (
+        json.loads(args.overrides)
+        if args.overrides and args.overrides.lower() not in ("none", "null")
+        else None
+    )
+    out = Path(args.out)
+    out.mkdir(parents=True, exist_ok=True)
+
+    result = {
+        "config_id": args.config_id,
+        "ref": args.ref,
+        "model_uri": args.model_uri,
+        "quant": quant,
+        "overrides": overrides,
+        "env": env_info(),
+        "perf": None,
+        "consistency": None,
+    }
+
+    try:
+        engine = build_engine(args.model_uri, quant, overrides)
+    except Exception as e:  # engine init failure -> whole config marked failed
+        result["error"] = f"engine init failed: {e!r}"
+        _write_result(out, args, result)
+        return
+
+    if args.mode in ("perf", "both"):
+        try:
+            result["perf"] = run_perf(engine, args.n_frames)
+        except Exception as e:
+            result["perf"] = {"status": "failed", "error": repr(e)}
+
+    if args.mode in ("consistency", "both"):
+        try:
+            result["consistency"] = run_consistency(
+                engine,
+                Path(args.shared_state),
+                out / f"latent_{args.config_id}_{args.ref}.npy",
+            )
+        except Exception as e:
+            result["consistency"] = {"status": "failed", "error": repr(e)}
+
+    _write_result(out, args, result)
+
+
+def _write_result(out: Path, args, result):
+    path = out / f"result_{args.config_id}_{args.ref}.json"
+    path.write_text(json.dumps(result, indent=2))
+    print(f"wrote {path}")
+
+
+def _lfps(result):
+    perf = (result or {}).get("perf") or {}
+    return perf.get("lfps") if perf.get("status") == "ok" else None
+
+
+def cmd_compare(args):
+    import numpy as np
+
+    rd = Path(args.results_dir)
+    results = {}
+    for f in sorted(rd.glob("result_*.json")):
+        r = json.loads(f.read_text())
+        results[(r["config_id"], r["ref"])] = r
+    config_ids = sorted({c for c, _ in results})
+
+    lines = ["## GPU benchmark (issue #48)", ""]
+    gpu = next((r["env"].get("gpu") for r in results.values() if r.get("env")), None)
+    if gpu:
+        lines += [f"Machine: **{gpu['name']}** ({gpu['memory_gb']} GB, cc {gpu['capability']})", ""]
+
+    lines += ["### Performance — LFPS (256-frame rollout)", "",
+              "| Config | main | PR | Δ% |", "|---|---|---|---|"]
+    for cid in config_ids:
+        ml, pl = _lfps(results.get((cid, "main"))), _lfps(results.get((cid, "pr")))
+        if isinstance(ml, float) and isinstance(pl, float) and ml:
+            delta = f"{(pl - ml) / ml * 100:+.1f}%"
+        else:
+            delta = "—"
+        fmt = lambda v: f"{v:.2f}" if isinstance(v, float) else "**FAILED**"
+        lines.append(f"| `{cid}` | {fmt(ml)} | {fmt(pl)} | {delta} |")
+
+    lines += ["", "### Consistency — MSE(main, PR) latent", "",
+              "| Config | MSE | status |", "|---|---|---|"]
+    for cid in config_ids:
+        try:
+            a = np.load(rd / f"latent_{cid}_main.npy")
+            b = np.load(rd / f"latent_{cid}_pr.npy")
+            lines.append(f"| `{cid}` | {float(np.mean((a - b) ** 2)):.3e} | ok |")
+        except Exception as e:
+            lines.append(f"| `{cid}` | — | **FAILED**: {e} |")
+
+    Path(args.out).write_text("\n".join(lines) + "\n")
+    print("\n".join(lines))
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    sub = ap.add_subparsers(dest="cmd", required=True)
+
+    r = sub.add_parser("run", help="run perf+consistency for one (config, ref)")
+    r.add_argument("--config-id", required=True)
+    r.add_argument("--ref", required=True, help="label for this engine build, e.g. main or pr")
+    r.add_argument("--model-uri", required=True)
+    r.add_argument("--quant", default="none")
+    r.add_argument("--overrides", default=None, help="JSON dict of model_config_overrides")
+    r.add_argument("--mode", choices=["perf", "consistency", "both"], default="both")
+    r.add_argument("--n-frames", type=int, default=256)
+    r.add_argument("--shared-state", required=True, help="path to the shared KV-cache state .pt")
+    r.add_argument("--out", required=True)
+    r.set_defaults(func=cmd_run)
+
+    c = sub.add_parser("compare", help="aggregate results into a markdown table")
+    c.add_argument("--results-dir", required=True)
+    c.add_argument("--out", required=True)
+    c.set_defaults(func=cmd_compare)
+
+    args = ap.parse_args()
+    args.func(args)
+
+
+if __name__ == "__main__":
+    main()

From 8bfda98370961a8755fdd1c03bb539ecfbaf2c5e Mon Sep 17 00:00:00 2001
From: Aaron Sanders <sanders.aaron.d@gmail.com>
Date: Tue, 2 Jun 2026 17:13:02 -0700
Subject: [PATCH 2/7] Move CI benchmark script out of examples/, drop
 provider-agnostic framing

The script is GCP/world_engine-specific CI glue, not a general-purpose
example. Move examples/ci.py -> .github/benchmark.py and reword. Switch the
workflow to invoke it via uv run --project per ref (drops the copy hack).
---
 examples/ci.py => .github/benchmark.py | 25 ++++++++++---------------
 .github/workflows/benchmark.yml        | 19 +++++++++----------
 2 files changed, 19 insertions(+), 25 deletions(-)
 rename examples/ci.py => .github/benchmark.py (90%)

diff --git a/examples/ci.py b/.github/benchmark.py
similarity index 90%
rename from examples/ci.py
rename to .github/benchmark.py
index fbb4f74..0221cd9 100755
--- a/examples/ci.py
+++ b/.github/benchmark.py
@@ -1,25 +1,20 @@
 #!/usr/bin/env python3
-"""world_engine CI perf + consistency harness (issue #48).
+"""GPU benchmark + consistency for the world_engine CI (issue #48).
 
-Provider-agnostic: this script knows nothing about GCP/GitHub Actions. Given a
-WorldEngine config it runs a performance rollout (LFPS) and/or a deterministic
-consistency forward pass, writing JSON results (+ a latent .npy for
-consistency) to --out. A separate `compare` subcommand turns a directory of
-per-ref results into a markdown table (perf delta + consistency MSE).
+Run on the ephemeral G4 runner by .github/workflows/benchmark.yml. `run` does a
+performance rollout (LFPS) and a deterministic consistency forward pass for one
+(config, ref) and writes a JSON result + a latent .npy. `compare` turns the
+collected results into a markdown table (perf delta + consistency MSE). The
+workflow runs `run` against both the main and PR world_engine installs (only
+the engine code differs); "main" runs first and creates the shared KV-cache
+state that "pr" then loads.
 
-The CI workflow runs the SAME copy of this script against two world_engine
-installs (main and the PR HEAD) so that only the engine code differs.
-
-Usage:
-  # produce results for one (config, ref); run "main" first so it creates the
-  # shared KV-cache state that "pr" then loads.
-  python examples/ci.py run \
+  python .github/benchmark.py run \
       --config-id wp15-1b-bf16 --ref main \
       --model-uri Overworld/Waypoint-1.5-1B --quant none \
       --shared-state results/state_wp15-1b-bf16.pt --out results
 
-  # aggregate
-  python examples/ci.py compare --results-dir results --out results/summary.md
+  python .github/benchmark.py compare --results-dir results --out results/summary.md
 """
 import argparse
 import json
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index b49ba31..d28d7f6 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -101,18 +101,17 @@ jobs:
         run: |
           set -euo pipefail
           mkdir -p results
-          # Use the PR's harness against BOTH engine installs so only the
-          # world_engine package differs between refs.
-          cp pr/examples/ci.py main/examples/ci.py
+          # Always invoke the PR's copy of the script, but switch the uv project
+          # per ref so only the world_engine install differs between runs.
+          script="$GITHUB_WORKSPACE/pr/.github/benchmark.py"
           while IFS='|' read -r id uri quant; do
             [ -z "$id" ] && continue
             for ref in main pr; do   # main first: it creates the shared state
-              ( cd "$ref" && uv sync --extra dev >/dev/null && \
-                uv run python examples/ci.py run \
-                  --config-id "$id" --ref "$ref" \
-                  --model-uri "$uri" --quant "$quant" \
-                  --shared-state "$GITHUB_WORKSPACE/results/state_${id}.pt" \
-                  --out "$GITHUB_WORKSPACE/results" )
+              uv run --project "$ref" --group dev python "$script" run \
+                --config-id "$id" --ref "$ref" \
+                --model-uri "$uri" --quant "$quant" \
+                --shared-state "$GITHUB_WORKSPACE/results/state_${id}.pt" \
+                --out "$GITHUB_WORKSPACE/results"
             done
           done <<< "${CONFIGS}"
       - uses: actions/upload-artifact@v4
@@ -133,7 +132,7 @@ jobs:
       - name: Build summary
         run: |
           pip install --quiet numpy
-          python examples/ci.py compare --results-dir results --out results/summary.md
+          python .github/benchmark.py compare --results-dir results --out results/summary.md
       - name: Comment on PR
         uses: actions/github-script@v7
         with:

From 3bea4b7e0f983fbe965e1639e2d0fe3188292653 Mon Sep 17 00:00:00 2001
From: Clydingus <40514241+Clydingus@users.noreply.github.com>
Date: Thu, 4 Jun 2026 11:28:48 -0400
Subject: [PATCH 3/7] Rename RUNNER_NAME env var off reserved Actions prefix

RUNNER_NAME is a built-in GitHub Actions environment variable that the
runner overrides at runtime with its own name (e.g. "GitHub Actions
1000007239"). The workflow's `env: RUNNER_NAME` was therefore ignored,
and start-runner passed the runner's name to `gcloud compute instances
create`, which rejected it as an invalid GCE resource name:

  Invalid value for field 'resource.name': 'GitHub Actions 1000007239'.

The same bug made stop-runner's `instances delete "$RUNNER_NAME"` target
the wrong name (silently, via `|| true`), so a created VM would leak.

Rename to GH_RUNNER_NAME / GH_RUNNER_LABEL: clear of the reserved RUNNER_
and GITHUB_ prefixes, and consistent with the GH_RUNNER_PAT secret.
Updates the create positional arg, delete arg, and metadata. The
benchmark job's runs-on label is a literal expression and is unaffected.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark.yml | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index d28d7f6..f397404 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -36,8 +36,12 @@ env:
   MACHINE_TYPE: g4-standard-48
   IMAGE_FAMILY: common-cu129-ubuntu-2404-nvidia-580
   IMAGE_PROJECT: deeplearning-platform-release
-  RUNNER_NAME: we-ci-${{ github.run_id }}
-  RUNNER_LABEL: gha-${{ github.run_id }}
+  # NB: do NOT name these RUNNER_NAME / RUNNER_* — those are reserved built-in
+  # Actions env vars the runner overrides at runtime (RUNNER_NAME becomes the
+  # hosted runner's own name, e.g. "GitHub Actions 123"), which then gets passed
+  # as the GCE instance name and rejected as an invalid resource name.
+  GH_RUNNER_NAME: we-ci-${{ github.run_id }}
+  GH_RUNNER_LABEL: gha-${{ github.run_id }}
   # WorldEngine configs to test (issue #48 starter set). id|model_uri|quant
   CONFIGS: |
     wp15-1b-bf16|Overworld/Waypoint-1.5-1B|none
@@ -67,7 +71,7 @@ jobs:
           echo "token=$token" >> "$GITHUB_OUTPUT"
       - name: Create ephemeral GPU runner VM
         run: |
-          gcloud compute instances create "$RUNNER_NAME" \
+          gcloud compute instances create "$GH_RUNNER_NAME" \
             --project "$GCP_PROJECT" --zone "$ZONE" \
             --machine-type "$MACHINE_TYPE" \
             --maintenance-policy TERMINATE --provisioning-model STANDARD \
@@ -76,7 +80,7 @@ jobs:
             --service-account "${{ secrets.WE_CI_NODE_SA }}" --scopes cloud-platform \
             --max-run-duration 3h --instance-termination-action DELETE \
             --metadata-from-file startup-script=.github/runner-startup.sh \
-            --metadata "gh_repo=${{ github.repository }},runner_token=${{ steps.reg.outputs.token }},runner_name=$RUNNER_NAME,runner_labels=$RUNNER_LABEL"
+            --metadata "gh_repo=${{ github.repository }},runner_token=${{ steps.reg.outputs.token }},runner_name=$GH_RUNNER_NAME,runner_labels=$GH_RUNNER_LABEL"
 
   benchmark:
     needs: start-runner
@@ -158,5 +162,5 @@ jobs:
       - uses: google-github-actions/setup-gcloud@v2
       - name: Delete runner VM
         run: |
-          gcloud compute instances delete "$RUNNER_NAME" \
+          gcloud compute instances delete "$GH_RUNNER_NAME" \
             --project "$GCP_PROJECT" --zone "$ZONE" --quiet || true

From 6df0952ee08971f36a671692a8d128e3573f77b0 Mon Sep 17 00:00:00 2001
From: Clydingus <40514241+Clydingus@users.noreply.github.com>
Date: Thu, 4 Jun 2026 12:10:32 -0400
Subject: [PATCH 4/7] Harden runner-token mint: surface HTTP status, fail on
 empty token
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The mint step runs under `bash -e` (no pipefail), so `curl -f | jq` hid a
failing PAT call behind jq's exit 0 — yielding an empty registration
token that silently flowed into the VM metadata and only surfaced later
as an opaque create failure.

Capture the HTTP status with -w, print it, and on non-201 emit the API's
.message/.documentation_url (non-sensitive error text, never the token)
and exit 1. Only parse/emit the token on success; keep masking it. Also
switch to `Authorization: Bearer` (the documented scheme for fine-grained
PATs) and pass the PAT via env rather than inline interpolation.

This both fixes the silent-empty-token footgun and tells us the exact
reason the current GH_RUNNER_PAT mint is failing (the token is configured
correctly per the UI, so the failure is something only the HTTP response
will reveal).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark.yml | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index f397404..12b4f7c 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -61,14 +61,29 @@ jobs:
       - uses: google-github-actions/setup-gcloud@v2
       - name: Mint runner registration token
         id: reg
+        env:
+          GH_RUNNER_PAT: ${{ secrets.GH_RUNNER_PAT }}
         run: |
-          token=$(curl -fsX POST \
-            -H "Authorization: token ${{ secrets.GH_RUNNER_PAT }}" \
+          set -euo pipefail
+          # Capture the HTTP status separately from the body so a failed mint
+          # fails loudly with the real reason instead of silently yielding an
+          # empty token (the default shell here has no pipefail, so a plain
+          # `curl -f | jq` masks curl's failure behind jq's exit 0).
+          code=$(curl -sS -o resp.json -w '%{http_code}' -X POST \
+            -H "Authorization: Bearer ${GH_RUNNER_PAT}" \
             -H "Accept: application/vnd.github+json" \
-            "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" \
-            | jq -r .token)
-          echo "::add-mask::$token"
-          echo "token=$token" >> "$GITHUB_OUTPUT"
+            "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token")
+          echo "registration-token HTTP ${code}"
+          if [ "${code}" != "201" ]; then
+            # .message/.documentation_url are non-sensitive error text, never the token
+            echo "::error::failed to mint runner registration token (HTTP ${code})"
+            jq -r '.message // "", .documentation_url // ""' resp.json || true
+            exit 1
+          fi
+          token=$(jq -er .token resp.json)
+          echo "::add-mask::${token}"
+          echo "token=${token}" >> "$GITHUB_OUTPUT"
+          rm -f resp.json
       - name: Create ephemeral GPU runner VM
         run: |
           gcloud compute instances create "$GH_RUNNER_NAME" \

From 21e6c36cc27d10a3165293a5b0cdd8cff9f043a3 Mon Sep 17 00:00:00 2001
From: Clydingus <40514241+Clydingus@users.noreply.github.com>
Date: Thu, 4 Jun 2026 14:24:33 -0400
Subject: [PATCH 5/7] Mint runner token via GitHub App instead of PAT

A fine-grained PAT is capped by its owner's repo role; the token owner
has Maintain (not Admin) on world_engine, and creating a runner
registration token requires Admin, so the PAT 403'd ("Resource not
accessible by personal access token") despite carrying Administration:write.

Switch to a GitHub App installation token (actions/create-github-app-token),
whose permissions come from the App install (Administration:write) and are
not bounded by any human's role. Auto-scoped to this repo, auto-masked,
1h expiry. Replaces GH_RUNNER_PAT with WE_CI_APP_ID / WE_CI_APP_PRIVATE_KEY.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark.yml | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 12b4f7c..0e43be8 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -10,8 +10,8 @@
 #   WIF_PROVIDER_DEV     - overworld-dev WIF provider (infra bootstrap output)
 #   WE_CI_PROVISIONER_SA - dev-we-ci-provisioner@overworld-dev.iam.gserviceaccount.com
 #   WE_CI_NODE_SA        - dev-we-ci-node@overworld-dev.iam.gserviceaccount.com
-#   GH_RUNNER_PAT        - PAT (or App token) with repo Administration:write,
-#                          used to mint runner registration tokens
+#   WE_CI_APP_ID         - GitHub App (Administration:write) id
+#   WE_CI_APP_PRIVATE_KEY - that App's private key; both are used to mint runner registration tokens
 #   HF_TOKEN             - Hugging Face token for gated Overworld/* model pulls
 #
 # NOTE: requires the standard (non-preemptible) RTX PRO 6000 quota in
@@ -59,10 +59,18 @@ jobs:
           workload_identity_provider: ${{ secrets.WIF_PROVIDER_DEV }}
           service_account: ${{ secrets.WE_CI_PROVISIONER_SA }}
       - uses: google-github-actions/setup-gcloud@v2
+      # App installation token: carries the App's Administration:write directly.
+      # Auto-scoped to this repo (the App is installed only here), auto-masked, expires in 1h.
+      - name: Generate GitHub App token
+        id: app-token
+        uses: actions/create-github-app-token@v2
+        with:
+          app-id: ${{ secrets.WE_CI_APP_ID }}
+          private-key: ${{ secrets.WE_CI_APP_PRIVATE_KEY }}
       - name: Mint runner registration token
         id: reg
         env:
-          GH_RUNNER_PAT: ${{ secrets.GH_RUNNER_PAT }}
+          GH_TOKEN: ${{ steps.app-token.outputs.token }}
         run: |
           set -euo pipefail
           # Capture the HTTP status separately from the body so a failed mint
@@ -70,7 +78,7 @@ jobs:
           # empty token (the default shell here has no pipefail, so a plain
           # `curl -f | jq` masks curl's failure behind jq's exit 0).
           code=$(curl -sS -o resp.json -w '%{http_code}' -X POST \
-            -H "Authorization: Bearer ${GH_RUNNER_PAT}" \
+            -H "Authorization: Bearer ${GH_TOKEN}" \
             -H "Accept: application/vnd.github+json" \
             "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token")
           echo "registration-token HTTP ${code}"

From ffbb9b3086c3c28e686bffa5d449b5b8faf4c1ce Mon Sep 17 00:00:00 2001
From: Clydingus <40514241+Clydingus@users.noreply.github.com>
Date: Thu, 4 Jun 2026 14:54:50 -0400
Subject: [PATCH 6/7] Fail fast on runner-registration failure instead of
 hanging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If the VM's startup script fails to register the runner, the benchmark
job (runs-on the ephemeral label) sits queued waiting for a runner that
never comes — up to GitHub's ~24h auto-fail — while the VM self-deletes
at 3h, leaving no signal and no logs (startup output only reaches the VM
serial console).

start-runner now polls the runners API for an online runner with our
label (~10 min bound) and, on timeout, dumps the VM serial console into
the Actions log and fails — so the real cause is visible, benchmark is
skipped (not queued), and stop-runner tears the VM down.

Also harden runner-startup.sh: `set -x` + an ERR trap (so the serial dump
pinpoints the failing line), run ./bin/installdependencies.sh (missing
runner OS deps are a common silent config.sh failure), and bump
RUNNER_VERSION 2.323.0 -> 2.334.0. Add timeout-minutes: 90 to benchmark
as a backstop for a runner that registers then stalls mid-build.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/runner-startup.sh       | 11 ++++++++++-
 .github/workflows/benchmark.yml | 23 +++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/.github/runner-startup.sh b/.github/runner-startup.sh
index ae4bd11..80e7033 100755
--- a/.github/runner-startup.sh
+++ b/.github/runner-startup.sh
@@ -11,8 +11,13 @@
 # The base image (Deep Learning VM, common-cu129-…-nvidia-580) already has the
 # NVIDIA driver; world_engine deps are installed by the workflow's job steps.
 set -euo pipefail
+# Echo every command and announce the failing line. This output lands on the VM
+# serial console, which the workflow's "Wait for runner to register" step dumps
+# into the Actions log if registration doesn't complete in time.
+set -x
+trap 'echo "[runner-startup] FAILED at line ${LINENO} (exit $?)" >&2' ERR
 
-RUNNER_VERSION="2.323.0"   # TODO: pin to the current actions/runner release
+RUNNER_VERSION="2.334.0"
 meta() { curl -s -H "Metadata-Flavor: Google" \
   "http://metadata.google.internal/computeMetadata/v1/instance/attributes/$1"; }
 
@@ -30,6 +35,10 @@ curl -sL -o runner.tar.gz \
   "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz"
 tar xzf runner.tar.gz
 
+# Install the runner's OS dependencies (libicu et al.). Missing libs are a
+# common silent cause of config.sh failing on minimal/base images.
+./bin/installdependencies.sh
+
 ./config.sh \
   --unattended \
   --ephemeral \
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 0e43be8..fe4dcd4 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -105,8 +105,31 @@ jobs:
             --metadata-from-file startup-script=.github/runner-startup.sh \
             --metadata "gh_repo=${{ github.repository }},runner_token=${{ steps.reg.outputs.token }},runner_name=$GH_RUNNER_NAME,runner_labels=$GH_RUNNER_LABEL"
 
+      # The startup script registers the runner on the VM, but its output only
+      # reaches the serial console. Without this gate a failed registration
+      # leaves `benchmark` queued until GitHub's ~24h auto-fail, with no signal.
+      # Poll the runners API for our label; a failed API call counts as "not
+      # online yet" (so `set -e` can't skip the dump). On timeout, dump the serial
+      # console here, then fail fast so stop-runner tears the VM down.
+      - name: Wait for runner to register
+        env:
+          GH_TOKEN: ${{ steps.app-token.outputs.token }}
+        run: |
+          set -euo pipefail
+          for i in $(seq 1 40); do            # ~10 min @ 15s
+            online=$(gh api "repos/${{ github.repository }}/actions/runners" \
+              --jq "[.runners[] | select(any(.labels[]; .name==\"$GH_RUNNER_LABEL\")) | select(.status==\"online\")] | length") || online=0
+            if [ "${online:-0}" -ge 1 ]; then echo "runner online"; exit 0; fi
+            echo "waiting for runner ($i/40)…"; sleep 15
+          done
+          echo "::error::runner '$GH_RUNNER_NAME' never came online — serial console follows:"
+          gcloud compute instances get-serial-port-output "$GH_RUNNER_NAME" \
+            --project "$GCP_PROJECT" --zone "$ZONE" || true
+          exit 1
+
   benchmark:
     needs: start-runner
+    timeout-minutes: 90   # backstop for a runner that registers then stalls mid-build
     runs-on: [self-hosted, "gha-${{ github.run_id }}"]
     env:
       CUBLAS_WORKSPACE_CONFIG: ":4096:8"   # required for deterministic cuBLAS

From 986451aa63b9090eef3f61bc67e1d0c6420e7c29 Mon Sep 17 00:00:00 2001
From: Aaron Sanders <sanders.aaron.d@gmail.com>
Date: Fri, 5 Jun 2026 13:04:19 -0700
Subject: [PATCH 7/7] Move benchmark runner zone to us-south1-a

us-central1 has no RTX PRO 6000 capacity; the us-south1 quota request
is filed. Point the workflow ZONE at us-south1-a.
---
 .github/workflows/benchmark.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index fe4dcd4..5991d5f 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -15,7 +15,7 @@
 #   HF_TOKEN             - Hugging Face token for gated Overworld/* model pulls
 #
 # NOTE: requires the standard (non-preemptible) RTX PRO 6000 quota in
-# overworld-dev/us-central1, which is currently 0 (request pending). Until then
+# overworld-dev/us-south1, which is currently 0 (request pending). Until then
 # the create step fails on quota — switch --provisioning-model to SPOT to test
 # against the preemptible quota (subject to capacity / mid-run preemption).
 
@@ -32,7 +32,7 @@ permissions:
 
 env:
   GCP_PROJECT: overworld-dev
-  ZONE: us-central1-b
+  ZONE: us-south1-a
   MACHINE_TYPE: g4-standard-48
   IMAGE_FAMILY: common-cu129-ubuntu-2404-nvidia-580
   IMAGE_PROJECT: deeplearning-platform-release