From 2639057c84135410db11df97cc72e2ef2e2e3fc5 Mon Sep 17 00:00:00 2001 From: Aaron Sanders Date: Tue, 2 Jun 2026 17:09:14 -0700 Subject: [PATCH 1/7] Add GPU benchmark + consistency CI workflow (issue #48) Adds an ephemeral-runner workflow that, on ready-for-review PRs, provisions a G4 (RTX PRO 6000) VM in overworld-dev as a one-shot self-hosted runner, runs a 256-frame perf rollout (LFPS) and a deterministic consistency forward pass for each config against both main and the PR HEAD, posts a comparison table + MSE, then deletes the VM. - examples/ci.py: provider-agnostic perf+consistency harness and aggregator - .github/runner-startup.sh: registers the VM as an ephemeral runner (no SSH, fits the provisioner SA's create/delete-only permissions) - .github/workflows/benchmark.yml: start-runner / benchmark / aggregate / stop-runner Draft until the standard RTX PRO 6000 quota request for overworld-dev clears. --- .github/runner-startup.sh | 44 ++++++ .github/workflows/benchmark.yml | 163 +++++++++++++++++++++ examples/ci.py | 244 ++++++++++++++++++++++++++++++++ 3 files changed, 451 insertions(+) create mode 100755 .github/runner-startup.sh create mode 100644 .github/workflows/benchmark.yml create mode 100755 examples/ci.py diff --git a/.github/runner-startup.sh b/.github/runner-startup.sh new file mode 100755 index 0000000..ae4bd11 --- /dev/null +++ b/.github/runner-startup.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# Startup script for the ephemeral GPU runner VM (issue #48). +# +# Passed to `gcloud compute instances create` as metadata, so it runs at boot +# with no SSH in (which is why the provisioner SA needs only create/delete, not +# setMetadata). It registers this VM as a one-shot (`--ephemeral`) self-hosted +# GitHub Actions runner; after the single job it unregisters itself, and the +# `stop-runner` job (plus the VM's max-run-duration backstop) deletes the VM. +# +# Required instance metadata: gh_repo, runner_token, runner_name, runner_labels. +# The base image (Deep Learning VM, common-cu129-…-nvidia-580) already has the +# NVIDIA driver; world_engine deps are installed by the workflow's job steps. +set -euo pipefail + +RUNNER_VERSION="2.323.0" # TODO: pin to the current actions/runner release +meta() { curl -s -H "Metadata-Flavor: Google" \ + "http://metadata.google.internal/computeMetadata/v1/instance/attributes/$1"; } + +GH_REPO="$(meta gh_repo)" +RUNNER_TOKEN="$(meta runner_token)" +RUNNER_NAME="$(meta runner_name)" +RUNNER_LABELS="$(meta runner_labels)" + +# The Actions runner refuses to run as root; this is a throwaway single-job VM, +# so allow it rather than provisioning a dedicated user. +export RUNNER_ALLOW_RUNASROOT=1 + +mkdir -p /actions-runner && cd /actions-runner +curl -sL -o runner.tar.gz \ + "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" +tar xzf runner.tar.gz + +./config.sh \ + --unattended \ + --ephemeral \ + --url "https://github.com/${GH_REPO}" \ + --token "${RUNNER_TOKEN}" \ + --name "${RUNNER_NAME}" \ + --labels "${RUNNER_LABELS}" \ + --replace + +# Blocks until the single job completes, then the ephemeral runner exits and +# unregisters itself from GitHub. +./run.sh diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..b49ba31 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,163 @@ +# GPU benchmark + consistency suite for ready-for-review PRs (issue #48). +# +# Flow: start-runner provisions an ephemeral G4 (RTX PRO 6000) VM in +# overworld-dev that registers itself as a one-shot self-hosted runner -> +# benchmark runs perf + consistency for each config against both `main` and the +# PR HEAD on that GPU -> aggregate posts a comparison table -> stop-runner +# deletes the VM (with a max-run-duration backstop in case the job dies). +# +# Required repo secrets: +# WIF_PROVIDER_DEV - overworld-dev WIF provider (infra bootstrap output) +# WE_CI_PROVISIONER_SA - dev-we-ci-provisioner@overworld-dev.iam.gserviceaccount.com +# WE_CI_NODE_SA - dev-we-ci-node@overworld-dev.iam.gserviceaccount.com +# GH_RUNNER_PAT - PAT (or App token) with repo Administration:write, +# used to mint runner registration tokens +# HF_TOKEN - Hugging Face token for gated Overworld/* model pulls +# +# NOTE: requires the standard (non-preemptible) RTX PRO 6000 quota in +# overworld-dev/us-central1, which is currently 0 (request pending). Until then +# the create step fails on quota — switch --provisioning-model to SPOT to test +# against the preemptible quota (subject to capacity / mid-run preemption). + +name: GPU Benchmark + +on: + pull_request: + types: [ready_for_review, synchronize, reopened] + +permissions: + contents: read + id-token: write # Workload Identity Federation + pull-requests: write # post the results comment + +env: + GCP_PROJECT: overworld-dev + ZONE: us-central1-b + MACHINE_TYPE: g4-standard-48 + IMAGE_FAMILY: common-cu129-ubuntu-2404-nvidia-580 + IMAGE_PROJECT: deeplearning-platform-release + RUNNER_NAME: we-ci-${{ github.run_id }} + RUNNER_LABEL: gha-${{ github.run_id }} + # WorldEngine configs to test (issue #48 starter set). id|model_uri|quant + CONFIGS: | + wp15-1b-bf16|Overworld/Waypoint-1.5-1B|none + wp15-1b-intw8a8|Overworld/Waypoint-1.5-1B|intw8a8 + +jobs: + start-runner: + if: github.event.pull_request.draft == false + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - id: auth + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ secrets.WIF_PROVIDER_DEV }} + service_account: ${{ secrets.WE_CI_PROVISIONER_SA }} + - uses: google-github-actions/setup-gcloud@v2 + - name: Mint runner registration token + id: reg + run: | + token=$(curl -fsX POST \ + -H "Authorization: token ${{ secrets.GH_RUNNER_PAT }}" \ + -H "Accept: application/vnd.github+json" \ + "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" \ + | jq -r .token) + echo "::add-mask::$token" + echo "token=$token" >> "$GITHUB_OUTPUT" + - name: Create ephemeral GPU runner VM + run: | + gcloud compute instances create "$RUNNER_NAME" \ + --project "$GCP_PROJECT" --zone "$ZONE" \ + --machine-type "$MACHINE_TYPE" \ + --maintenance-policy TERMINATE --provisioning-model STANDARD \ + --image-family "$IMAGE_FAMILY" --image-project "$IMAGE_PROJECT" \ + --boot-disk-size 200GB --boot-disk-type hyperdisk-balanced \ + --service-account "${{ secrets.WE_CI_NODE_SA }}" --scopes cloud-platform \ + --max-run-duration 3h --instance-termination-action DELETE \ + --metadata-from-file startup-script=.github/runner-startup.sh \ + --metadata "gh_repo=${{ github.repository }},runner_token=${{ steps.reg.outputs.token }},runner_name=$RUNNER_NAME,runner_labels=$RUNNER_LABEL" + + benchmark: + needs: start-runner + runs-on: [self-hosted, "gha-${{ github.run_id }}"] + env: + CUBLAS_WORKSPACE_CONFIG: ":4096:8" # required for deterministic cuBLAS + HF_TOKEN: ${{ secrets.HF_TOKEN }} + steps: + - name: Checkout PR HEAD + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + path: pr + - name: Checkout main + uses: actions/checkout@v4 + with: + ref: main + path: main + - name: Install uv + run: curl -LsSf https://astral.sh/uv/install.sh | sh + - name: Run perf + consistency (main, then PR, per config) + run: | + set -euo pipefail + mkdir -p results + # Use the PR's harness against BOTH engine installs so only the + # world_engine package differs between refs. + cp pr/examples/ci.py main/examples/ci.py + while IFS='|' read -r id uri quant; do + [ -z "$id" ] && continue + for ref in main pr; do # main first: it creates the shared state + ( cd "$ref" && uv sync --extra dev >/dev/null && \ + uv run python examples/ci.py run \ + --config-id "$id" --ref "$ref" \ + --model-uri "$uri" --quant "$quant" \ + --shared-state "$GITHUB_WORKSPACE/results/state_${id}.pt" \ + --out "$GITHUB_WORKSPACE/results" ) + done + done <<< "${CONFIGS}" + - uses: actions/upload-artifact@v4 + with: + name: gpu-bench-results + path: results/ + + aggregate: + needs: benchmark + if: always() && needs.benchmark.result != 'skipped' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/download-artifact@v4 + with: + name: gpu-bench-results + path: results + - name: Build summary + run: | + pip install --quiet numpy + python examples/ci.py compare --results-dir results --out results/summary.md + - name: Comment on PR + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const body = fs.readFileSync('results/summary.md', 'utf8'); + await github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body, + }); + + stop-runner: + needs: [start-runner, benchmark] + if: always() + runs-on: ubuntu-latest + steps: + - uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ secrets.WIF_PROVIDER_DEV }} + service_account: ${{ secrets.WE_CI_PROVISIONER_SA }} + - uses: google-github-actions/setup-gcloud@v2 + - name: Delete runner VM + run: | + gcloud compute instances delete "$RUNNER_NAME" \ + --project "$GCP_PROJECT" --zone "$ZONE" --quiet || true diff --git a/examples/ci.py b/examples/ci.py new file mode 100755 index 0000000..fbb4f74 --- /dev/null +++ b/examples/ci.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 +"""world_engine CI perf + consistency harness (issue #48). + +Provider-agnostic: this script knows nothing about GCP/GitHub Actions. Given a +WorldEngine config it runs a performance rollout (LFPS) and/or a deterministic +consistency forward pass, writing JSON results (+ a latent .npy for +consistency) to --out. A separate `compare` subcommand turns a directory of +per-ref results into a markdown table (perf delta + consistency MSE). + +The CI workflow runs the SAME copy of this script against two world_engine +installs (main and the PR HEAD) so that only the engine code differs. + +Usage: + # produce results for one (config, ref); run "main" first so it creates the + # shared KV-cache state that "pr" then loads. + python examples/ci.py run \ + --config-id wp15-1b-bf16 --ref main \ + --model-uri Overworld/Waypoint-1.5-1B --quant none \ + --shared-state results/state_wp15-1b-bf16.pt --out results + + # aggregate + python examples/ci.py compare --results-dir results --out results/summary.md +""" +import argparse +import json +import platform +import time +from pathlib import Path + + +# How many gen_frame() steps to pre-roll before snapshotting the KV cache for +# the consistency check ("fully populated cache"). +CONSISTENCY_PREROLL = 8 + + +def env_info(): + import torch + info = { + "torch": torch.__version__, + "torch_cuda": torch.version.cuda, + "os": f"{platform.system()} {platform.release()} ({platform.machine()})", + "gpu": None, + } + if torch.cuda.is_available(): + p = torch.cuda.get_device_properties(torch.cuda.current_device()) + info["gpu"] = { + "name": p.name, + "capability": f"{p.major}.{p.minor}", + "memory_gb": round(p.total_memory / 1e9, 1), + } + return info + + +def build_engine(model_uri, quant, overrides): + from world_engine import WorldEngine + return WorldEngine( + model_uri, + quant=quant, + model_config_overrides=overrides, + device="cuda", + ) + + +def run_perf(engine, n_frames): + """Time an n_frames DiT-only rollout. LFPS = latent frames per second.""" + import torch + + for _ in range(3): # warmup + engine.gen_frame(return_img=False) + engine.reset() + engine.gen_frame(return_img=False) # prime + torch.cuda.synchronize() + + t0 = time.perf_counter() + for _ in range(n_frames): + engine.gen_frame(return_img=False) + torch.cuda.synchronize() + elapsed = time.perf_counter() - t0 + return {"status": "ok", "n_frames": n_frames, "elapsed_s": elapsed, "lfps": n_frames / elapsed} + + +def run_consistency(engine, shared_state: Path, latent_path: Path): + """Deterministic single forward from a shared, fully-populated KV cache. + + The first ref to run creates the shared state (snapshotting the cache after + a fixed pre-roll) and saves it; every other ref loads that exact state, so + the only variable is the engine code. Latent is saved as .npy so the + aggregator needs numpy only (no torch/GPU). + """ + import numpy as np + import torch + + # NB: full determinism also needs CUBLAS_WORKSPACE_CONFIG=:4096:8 in the + # environment (set by the workflow) before CUDA initialises. + torch.use_deterministic_algorithms(True) + torch.manual_seed(0) + + if shared_state.exists(): + engine.load_state(torch.load(shared_state)) + produced = False + else: + engine.reset() + for _ in range(CONSISTENCY_PREROLL): + engine.gen_frame(return_img=False) + shared_state.parent.mkdir(parents=True, exist_ok=True) + torch.save(engine.get_state(), shared_state) + produced = True + + latent = engine.gen_frame(return_img=False).detach().float().cpu().numpy() + latent_path.parent.mkdir(parents=True, exist_ok=True) + np.save(latent_path, latent) + return { + "status": "ok", + "produced_shared_state": produced, + "latent_shape": list(latent.shape), + "latent_file": latent_path.name, + } + + +def cmd_run(args): + quant = None if (args.quant or "none").lower() in ("none", "null", "") else args.quant + overrides = ( + json.loads(args.overrides) + if args.overrides and args.overrides.lower() not in ("none", "null") + else None + ) + out = Path(args.out) + out.mkdir(parents=True, exist_ok=True) + + result = { + "config_id": args.config_id, + "ref": args.ref, + "model_uri": args.model_uri, + "quant": quant, + "overrides": overrides, + "env": env_info(), + "perf": None, + "consistency": None, + } + + try: + engine = build_engine(args.model_uri, quant, overrides) + except Exception as e: # engine init failure -> whole config marked failed + result["error"] = f"engine init failed: {e!r}" + _write_result(out, args, result) + return + + if args.mode in ("perf", "both"): + try: + result["perf"] = run_perf(engine, args.n_frames) + except Exception as e: + result["perf"] = {"status": "failed", "error": repr(e)} + + if args.mode in ("consistency", "both"): + try: + result["consistency"] = run_consistency( + engine, + Path(args.shared_state), + out / f"latent_{args.config_id}_{args.ref}.npy", + ) + except Exception as e: + result["consistency"] = {"status": "failed", "error": repr(e)} + + _write_result(out, args, result) + + +def _write_result(out: Path, args, result): + path = out / f"result_{args.config_id}_{args.ref}.json" + path.write_text(json.dumps(result, indent=2)) + print(f"wrote {path}") + + +def _lfps(result): + perf = (result or {}).get("perf") or {} + return perf.get("lfps") if perf.get("status") == "ok" else None + + +def cmd_compare(args): + import numpy as np + + rd = Path(args.results_dir) + results = {} + for f in sorted(rd.glob("result_*.json")): + r = json.loads(f.read_text()) + results[(r["config_id"], r["ref"])] = r + config_ids = sorted({c for c, _ in results}) + + lines = ["## GPU benchmark (issue #48)", ""] + gpu = next((r["env"].get("gpu") for r in results.values() if r.get("env")), None) + if gpu: + lines += [f"Machine: **{gpu['name']}** ({gpu['memory_gb']} GB, cc {gpu['capability']})", ""] + + lines += ["### Performance — LFPS (256-frame rollout)", "", + "| Config | main | PR | Δ% |", "|---|---|---|---|"] + for cid in config_ids: + ml, pl = _lfps(results.get((cid, "main"))), _lfps(results.get((cid, "pr"))) + if isinstance(ml, float) and isinstance(pl, float) and ml: + delta = f"{(pl - ml) / ml * 100:+.1f}%" + else: + delta = "—" + fmt = lambda v: f"{v:.2f}" if isinstance(v, float) else "**FAILED**" + lines.append(f"| `{cid}` | {fmt(ml)} | {fmt(pl)} | {delta} |") + + lines += ["", "### Consistency — MSE(main, PR) latent", "", + "| Config | MSE | status |", "|---|---|---|"] + for cid in config_ids: + try: + a = np.load(rd / f"latent_{cid}_main.npy") + b = np.load(rd / f"latent_{cid}_pr.npy") + lines.append(f"| `{cid}` | {float(np.mean((a - b) ** 2)):.3e} | ok |") + except Exception as e: + lines.append(f"| `{cid}` | — | **FAILED**: {e} |") + + Path(args.out).write_text("\n".join(lines) + "\n") + print("\n".join(lines)) + + +def main(): + ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + sub = ap.add_subparsers(dest="cmd", required=True) + + r = sub.add_parser("run", help="run perf+consistency for one (config, ref)") + r.add_argument("--config-id", required=True) + r.add_argument("--ref", required=True, help="label for this engine build, e.g. main or pr") + r.add_argument("--model-uri", required=True) + r.add_argument("--quant", default="none") + r.add_argument("--overrides", default=None, help="JSON dict of model_config_overrides") + r.add_argument("--mode", choices=["perf", "consistency", "both"], default="both") + r.add_argument("--n-frames", type=int, default=256) + r.add_argument("--shared-state", required=True, help="path to the shared KV-cache state .pt") + r.add_argument("--out", required=True) + r.set_defaults(func=cmd_run) + + c = sub.add_parser("compare", help="aggregate results into a markdown table") + c.add_argument("--results-dir", required=True) + c.add_argument("--out", required=True) + c.set_defaults(func=cmd_compare) + + args = ap.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() From 8bfda98370961a8755fdd1c03bb539ecfbaf2c5e Mon Sep 17 00:00:00 2001 From: Aaron Sanders Date: Tue, 2 Jun 2026 17:13:02 -0700 Subject: [PATCH 2/7] Move CI benchmark script out of examples/, drop provider-agnostic framing The script is GCP/world_engine-specific CI glue, not a general-purpose example. Move examples/ci.py -> .github/benchmark.py and reword. Switch the workflow to invoke it via uv run --project per ref (drops the copy hack). --- examples/ci.py => .github/benchmark.py | 25 ++++++++++--------------- .github/workflows/benchmark.yml | 19 +++++++++---------- 2 files changed, 19 insertions(+), 25 deletions(-) rename examples/ci.py => .github/benchmark.py (90%) diff --git a/examples/ci.py b/.github/benchmark.py similarity index 90% rename from examples/ci.py rename to .github/benchmark.py index fbb4f74..0221cd9 100755 --- a/examples/ci.py +++ b/.github/benchmark.py @@ -1,25 +1,20 @@ #!/usr/bin/env python3 -"""world_engine CI perf + consistency harness (issue #48). +"""GPU benchmark + consistency for the world_engine CI (issue #48). -Provider-agnostic: this script knows nothing about GCP/GitHub Actions. Given a -WorldEngine config it runs a performance rollout (LFPS) and/or a deterministic -consistency forward pass, writing JSON results (+ a latent .npy for -consistency) to --out. A separate `compare` subcommand turns a directory of -per-ref results into a markdown table (perf delta + consistency MSE). +Run on the ephemeral G4 runner by .github/workflows/benchmark.yml. `run` does a +performance rollout (LFPS) and a deterministic consistency forward pass for one +(config, ref) and writes a JSON result + a latent .npy. `compare` turns the +collected results into a markdown table (perf delta + consistency MSE). The +workflow runs `run` against both the main and PR world_engine installs (only +the engine code differs); "main" runs first and creates the shared KV-cache +state that "pr" then loads. -The CI workflow runs the SAME copy of this script against two world_engine -installs (main and the PR HEAD) so that only the engine code differs. - -Usage: - # produce results for one (config, ref); run "main" first so it creates the - # shared KV-cache state that "pr" then loads. - python examples/ci.py run \ + python .github/benchmark.py run \ --config-id wp15-1b-bf16 --ref main \ --model-uri Overworld/Waypoint-1.5-1B --quant none \ --shared-state results/state_wp15-1b-bf16.pt --out results - # aggregate - python examples/ci.py compare --results-dir results --out results/summary.md + python .github/benchmark.py compare --results-dir results --out results/summary.md """ import argparse import json diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index b49ba31..d28d7f6 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -101,18 +101,17 @@ jobs: run: | set -euo pipefail mkdir -p results - # Use the PR's harness against BOTH engine installs so only the - # world_engine package differs between refs. - cp pr/examples/ci.py main/examples/ci.py + # Always invoke the PR's copy of the script, but switch the uv project + # per ref so only the world_engine install differs between runs. + script="$GITHUB_WORKSPACE/pr/.github/benchmark.py" while IFS='|' read -r id uri quant; do [ -z "$id" ] && continue for ref in main pr; do # main first: it creates the shared state - ( cd "$ref" && uv sync --extra dev >/dev/null && \ - uv run python examples/ci.py run \ - --config-id "$id" --ref "$ref" \ - --model-uri "$uri" --quant "$quant" \ - --shared-state "$GITHUB_WORKSPACE/results/state_${id}.pt" \ - --out "$GITHUB_WORKSPACE/results" ) + uv run --project "$ref" --group dev python "$script" run \ + --config-id "$id" --ref "$ref" \ + --model-uri "$uri" --quant "$quant" \ + --shared-state "$GITHUB_WORKSPACE/results/state_${id}.pt" \ + --out "$GITHUB_WORKSPACE/results" done done <<< "${CONFIGS}" - uses: actions/upload-artifact@v4 @@ -133,7 +132,7 @@ jobs: - name: Build summary run: | pip install --quiet numpy - python examples/ci.py compare --results-dir results --out results/summary.md + python .github/benchmark.py compare --results-dir results --out results/summary.md - name: Comment on PR uses: actions/github-script@v7 with: From 3bea4b7e0f983fbe965e1639e2d0fe3188292653 Mon Sep 17 00:00:00 2001 From: Clydingus <40514241+Clydingus@users.noreply.github.com> Date: Thu, 4 Jun 2026 11:28:48 -0400 Subject: [PATCH 3/7] Rename RUNNER_NAME env var off reserved Actions prefix RUNNER_NAME is a built-in GitHub Actions environment variable that the runner overrides at runtime with its own name (e.g. "GitHub Actions 1000007239"). The workflow's `env: RUNNER_NAME` was therefore ignored, and start-runner passed the runner's name to `gcloud compute instances create`, which rejected it as an invalid GCE resource name: Invalid value for field 'resource.name': 'GitHub Actions 1000007239'. The same bug made stop-runner's `instances delete "$RUNNER_NAME"` target the wrong name (silently, via `|| true`), so a created VM would leak. Rename to GH_RUNNER_NAME / GH_RUNNER_LABEL: clear of the reserved RUNNER_ and GITHUB_ prefixes, and consistent with the GH_RUNNER_PAT secret. Updates the create positional arg, delete arg, and metadata. The benchmark job's runs-on label is a literal expression and is unaffected. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/benchmark.yml | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index d28d7f6..f397404 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -36,8 +36,12 @@ env: MACHINE_TYPE: g4-standard-48 IMAGE_FAMILY: common-cu129-ubuntu-2404-nvidia-580 IMAGE_PROJECT: deeplearning-platform-release - RUNNER_NAME: we-ci-${{ github.run_id }} - RUNNER_LABEL: gha-${{ github.run_id }} + # NB: do NOT name these RUNNER_NAME / RUNNER_* — those are reserved built-in + # Actions env vars the runner overrides at runtime (RUNNER_NAME becomes the + # hosted runner's own name, e.g. "GitHub Actions 123"), which then gets passed + # as the GCE instance name and rejected as an invalid resource name. + GH_RUNNER_NAME: we-ci-${{ github.run_id }} + GH_RUNNER_LABEL: gha-${{ github.run_id }} # WorldEngine configs to test (issue #48 starter set). id|model_uri|quant CONFIGS: | wp15-1b-bf16|Overworld/Waypoint-1.5-1B|none @@ -67,7 +71,7 @@ jobs: echo "token=$token" >> "$GITHUB_OUTPUT" - name: Create ephemeral GPU runner VM run: | - gcloud compute instances create "$RUNNER_NAME" \ + gcloud compute instances create "$GH_RUNNER_NAME" \ --project "$GCP_PROJECT" --zone "$ZONE" \ --machine-type "$MACHINE_TYPE" \ --maintenance-policy TERMINATE --provisioning-model STANDARD \ @@ -76,7 +80,7 @@ jobs: --service-account "${{ secrets.WE_CI_NODE_SA }}" --scopes cloud-platform \ --max-run-duration 3h --instance-termination-action DELETE \ --metadata-from-file startup-script=.github/runner-startup.sh \ - --metadata "gh_repo=${{ github.repository }},runner_token=${{ steps.reg.outputs.token }},runner_name=$RUNNER_NAME,runner_labels=$RUNNER_LABEL" + --metadata "gh_repo=${{ github.repository }},runner_token=${{ steps.reg.outputs.token }},runner_name=$GH_RUNNER_NAME,runner_labels=$GH_RUNNER_LABEL" benchmark: needs: start-runner @@ -158,5 +162,5 @@ jobs: - uses: google-github-actions/setup-gcloud@v2 - name: Delete runner VM run: | - gcloud compute instances delete "$RUNNER_NAME" \ + gcloud compute instances delete "$GH_RUNNER_NAME" \ --project "$GCP_PROJECT" --zone "$ZONE" --quiet || true From 6df0952ee08971f36a671692a8d128e3573f77b0 Mon Sep 17 00:00:00 2001 From: Clydingus <40514241+Clydingus@users.noreply.github.com> Date: Thu, 4 Jun 2026 12:10:32 -0400 Subject: [PATCH 4/7] Harden runner-token mint: surface HTTP status, fail on empty token MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mint step runs under `bash -e` (no pipefail), so `curl -f | jq` hid a failing PAT call behind jq's exit 0 — yielding an empty registration token that silently flowed into the VM metadata and only surfaced later as an opaque create failure. Capture the HTTP status with -w, print it, and on non-201 emit the API's .message/.documentation_url (non-sensitive error text, never the token) and exit 1. Only parse/emit the token on success; keep masking it. Also switch to `Authorization: Bearer` (the documented scheme for fine-grained PATs) and pass the PAT via env rather than inline interpolation. This both fixes the silent-empty-token footgun and tells us the exact reason the current GH_RUNNER_PAT mint is failing (the token is configured correctly per the UI, so the failure is something only the HTTP response will reveal). Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/benchmark.yml | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index f397404..12b4f7c 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -61,14 +61,29 @@ jobs: - uses: google-github-actions/setup-gcloud@v2 - name: Mint runner registration token id: reg + env: + GH_RUNNER_PAT: ${{ secrets.GH_RUNNER_PAT }} run: | - token=$(curl -fsX POST \ - -H "Authorization: token ${{ secrets.GH_RUNNER_PAT }}" \ + set -euo pipefail + # Capture the HTTP status separately from the body so a failed mint + # fails loudly with the real reason instead of silently yielding an + # empty token (the default shell here has no pipefail, so a plain + # `curl -f | jq` masks curl's failure behind jq's exit 0). + code=$(curl -sS -o resp.json -w '%{http_code}' -X POST \ + -H "Authorization: Bearer ${GH_RUNNER_PAT}" \ -H "Accept: application/vnd.github+json" \ - "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" \ - | jq -r .token) - echo "::add-mask::$token" - echo "token=$token" >> "$GITHUB_OUTPUT" + "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token") + echo "registration-token HTTP ${code}" + if [ "${code}" != "201" ]; then + # .message/.documentation_url are non-sensitive error text, never the token + echo "::error::failed to mint runner registration token (HTTP ${code})" + jq -r '.message // "", .documentation_url // ""' resp.json || true + exit 1 + fi + token=$(jq -er .token resp.json) + echo "::add-mask::${token}" + echo "token=${token}" >> "$GITHUB_OUTPUT" + rm -f resp.json - name: Create ephemeral GPU runner VM run: | gcloud compute instances create "$GH_RUNNER_NAME" \ From 21e6c36cc27d10a3165293a5b0cdd8cff9f043a3 Mon Sep 17 00:00:00 2001 From: Clydingus <40514241+Clydingus@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:24:33 -0400 Subject: [PATCH 5/7] Mint runner token via GitHub App instead of PAT A fine-grained PAT is capped by its owner's repo role; the token owner has Maintain (not Admin) on world_engine, and creating a runner registration token requires Admin, so the PAT 403'd ("Resource not accessible by personal access token") despite carrying Administration:write. Switch to a GitHub App installation token (actions/create-github-app-token), whose permissions come from the App install (Administration:write) and are not bounded by any human's role. Auto-scoped to this repo, auto-masked, 1h expiry. Replaces GH_RUNNER_PAT with WE_CI_APP_ID / WE_CI_APP_PRIVATE_KEY. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/benchmark.yml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 12b4f7c..0e43be8 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -10,8 +10,8 @@ # WIF_PROVIDER_DEV - overworld-dev WIF provider (infra bootstrap output) # WE_CI_PROVISIONER_SA - dev-we-ci-provisioner@overworld-dev.iam.gserviceaccount.com # WE_CI_NODE_SA - dev-we-ci-node@overworld-dev.iam.gserviceaccount.com -# GH_RUNNER_PAT - PAT (or App token) with repo Administration:write, -# used to mint runner registration tokens +# WE_CI_APP_ID - GitHub App (Administration:write) id, for minting +# runner registration tokens # HF_TOKEN - Hugging Face token for gated Overworld/* model pulls # # NOTE: requires the standard (non-preemptible) RTX PRO 6000 quota in @@ -59,10 +59,18 @@ jobs: workload_identity_provider: ${{ secrets.WIF_PROVIDER_DEV }} service_account: ${{ secrets.WE_CI_PROVISIONER_SA }} - uses: google-github-actions/setup-gcloud@v2 + # App installation token: carries the App's Administration:write directly. + # Auto-scoped to this repo (the App is installed only here), auto-masked, expires in 1h. + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@v2 + with: + app-id: ${{ secrets.WE_CI_APP_ID }} + private-key: ${{ secrets.WE_CI_APP_PRIVATE_KEY }} - name: Mint runner registration token id: reg env: - GH_RUNNER_PAT: ${{ secrets.GH_RUNNER_PAT }} + GH_TOKEN: ${{ steps.app-token.outputs.token }} run: | set -euo pipefail # Capture the HTTP status separately from the body so a failed mint @@ -70,7 +78,7 @@ jobs: # empty token (the default shell here has no pipefail, so a plain # `curl -f | jq` masks curl's failure behind jq's exit 0). code=$(curl -sS -o resp.json -w '%{http_code}' -X POST \ - -H "Authorization: Bearer ${GH_RUNNER_PAT}" \ + -H "Authorization: Bearer ${GH_TOKEN}" \ -H "Accept: application/vnd.github+json" \ "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token") echo "registration-token HTTP ${code}" From ffbb9b3086c3c28e686bffa5d449b5b8faf4c1ce Mon Sep 17 00:00:00 2001 From: Clydingus <40514241+Clydingus@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:54:50 -0400 Subject: [PATCH 6/7] Fail fast on runner-registration failure instead of hanging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the VM's startup script fails to register the runner, the benchmark job (runs-on the ephemeral label) sits queued waiting for a runner that never comes — up to GitHub's ~24h auto-fail — while the VM self-deletes at 3h, leaving no signal and no logs (startup output only reaches the VM serial console). start-runner now polls the runners API for an online runner with our label (~10 min bound) and, on timeout, dumps the VM serial console into the Actions log and fails — so the real cause is visible, benchmark is skipped (not queued), and stop-runner tears the VM down. Also harden runner-startup.sh: `set -x` + an ERR trap (so the serial dump pinpoints the failing line), run ./bin/installdependencies.sh (missing runner OS deps are a common silent config.sh failure), and bump RUNNER_VERSION 2.323.0 -> 2.334.0. Add timeout-minutes: 90 to benchmark as a backstop for a runner that registers then stalls mid-build. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/runner-startup.sh | 11 ++++++++++- .github/workflows/benchmark.yml | 23 +++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/.github/runner-startup.sh b/.github/runner-startup.sh index ae4bd11..80e7033 100755 --- a/.github/runner-startup.sh +++ b/.github/runner-startup.sh @@ -11,8 +11,13 @@ # The base image (Deep Learning VM, common-cu129-…-nvidia-580) already has the # NVIDIA driver; world_engine deps are installed by the workflow's job steps. set -euo pipefail +# Echo every command and announce the failing line. This output lands on the VM +# serial console, which the workflow's "Wait for runner to register" step dumps +# into the Actions log if registration doesn't complete in time. +set -x +trap 'echo "[runner-startup] FAILED at line ${LINENO} (exit $?)" >&2' ERR -RUNNER_VERSION="2.323.0" # TODO: pin to the current actions/runner release +RUNNER_VERSION="2.334.0" meta() { curl -s -H "Metadata-Flavor: Google" \ "http://metadata.google.internal/computeMetadata/v1/instance/attributes/$1"; } @@ -30,6 +35,10 @@ curl -sL -o runner.tar.gz \ "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" tar xzf runner.tar.gz +# Install the runner's OS dependencies (libicu et al.). Missing libs are a +# common silent cause of config.sh failing on minimal/base images. +./bin/installdependencies.sh + ./config.sh \ --unattended \ --ephemeral \ diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 0e43be8..fe4dcd4 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -105,8 +105,31 @@ jobs: --metadata-from-file startup-script=.github/runner-startup.sh \ --metadata "gh_repo=${{ github.repository }},runner_token=${{ steps.reg.outputs.token }},runner_name=$GH_RUNNER_NAME,runner_labels=$GH_RUNNER_LABEL" + # The startup script registers the runner on the VM; its output goes to the + # serial console, invisible to Actions. Without this gate, a failed + # registration leaves the `benchmark` job queued (waiting for a runner that + # never comes) until GitHub's ~24h auto-fail, with no signal. Poll the + # runners API for our label; on timeout, dump the serial console so the + # startup failure is visible here, then fail fast so stop-runner tears down. + - name: Wait for runner to register + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + run: | + set -euo pipefail + for i in $(seq 1 40); do # ~10 min @ 15s + online=$(gh api "repos/${{ github.repository }}/actions/runners" \ + --jq "[.runners[] | select(any(.labels[]; .name==\"$GH_RUNNER_LABEL\")) | select(.status==\"online\")] | length") + if [ "${online:-0}" -ge 1 ]; then echo "runner online"; exit 0; fi + echo "waiting for runner ($i/40)…"; sleep 15 + done + echo "::error::runner '$GH_RUNNER_NAME' never came online — serial console follows:" + gcloud compute instances get-serial-port-output "$GH_RUNNER_NAME" \ + --project "$GCP_PROJECT" --zone "$ZONE" || true + exit 1 + benchmark: needs: start-runner + timeout-minutes: 90 # backstop for a runner that registers then stalls mid-build runs-on: [self-hosted, "gha-${{ github.run_id }}"] env: CUBLAS_WORKSPACE_CONFIG: ":4096:8" # required for deterministic cuBLAS From 986451aa63b9090eef3f61bc67e1d0c6420e7c29 Mon Sep 17 00:00:00 2001 From: Aaron Sanders Date: Fri, 5 Jun 2026 13:04:19 -0700 Subject: [PATCH 7/7] Move benchmark runner zone to us-south1-a us-central1 has no RTX PRO 6000 capacity; the us-south1 quota request is filed. Point the workflow ZONE at us-south1-a. --- .github/workflows/benchmark.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index fe4dcd4..5991d5f 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -15,7 +15,7 @@ # HF_TOKEN - Hugging Face token for gated Overworld/* model pulls # # NOTE: requires the standard (non-preemptible) RTX PRO 6000 quota in -# overworld-dev/us-central1, which is currently 0 (request pending). Until then +# overworld-dev/us-south1, which is currently 0 (request pending). Until then # the create step fails on quota — switch --provisioning-model to SPOT to test # against the preemptible quota (subject to capacity / mid-run preemption). @@ -32,7 +32,7 @@ permissions: env: GCP_PROJECT: overworld-dev - ZONE: us-central1-b + ZONE: us-south1-a MACHINE_TYPE: g4-standard-48 IMAGE_FAMILY: common-cu129-ubuntu-2404-nvidia-580 IMAGE_PROJECT: deeplearning-platform-release