From d6097b0fbe531a0d8b0dbf4ed0b04d2913b0dd92 Mon Sep 17 00:00:00 2001 From: pratikbin <68642400+pratikbin@users.noreply.github.com> Date: Tue, 19 May 2026 13:27:31 +0530 Subject: [PATCH] feat: avail node operations skills --- .gitignore | 4 + README.md | 6 + skills/avail-validator-operate/SKILL.md | 111 ++++++++++++ .../avail-validator-operate/evals/evals.json | 35 ++++ .../references/backup-recovery.md | 80 +++++++++ .../references/chill-unbond.md | 60 +++++++ .../references/monitoring.md | 106 +++++++++++ .../references/upgrade.md | 74 ++++++++ .../scripts/backup-keys.sh | 55 ++++++ .../scripts/safe-upgrade.sh | 64 +++++++ skills/avail-validator-setup/SKILL.md | 168 ++++++++++++++++++ skills/avail-validator-setup/evals/evals.json | 38 ++++ .../references/key-security.md | 71 ++++++++ .../references/networks.md | 65 +++++++ .../references/troubleshooting.md | 52 ++++++ .../scripts/avail-validator.sh | 100 +++++++++++ 16 files changed, 1089 insertions(+) create mode 100644 skills/avail-validator-operate/SKILL.md create mode 100644 skills/avail-validator-operate/evals/evals.json create mode 100644 skills/avail-validator-operate/references/backup-recovery.md create mode 100644 skills/avail-validator-operate/references/chill-unbond.md create mode 100644 skills/avail-validator-operate/references/monitoring.md create mode 100644 skills/avail-validator-operate/references/upgrade.md create mode 100755 skills/avail-validator-operate/scripts/backup-keys.sh create mode 100755 skills/avail-validator-operate/scripts/safe-upgrade.sh create mode 100644 skills/avail-validator-setup/SKILL.md create mode 100644 skills/avail-validator-setup/evals/evals.json create mode 100644 skills/avail-validator-setup/references/key-security.md create mode 100644 skills/avail-validator-setup/references/networks.md create mode 100644 skills/avail-validator-setup/references/troubleshooting.md create mode 100755 skills/avail-validator-setup/scripts/avail-validator.sh diff --git a/.gitignore 
b/.gitignore index d0ccb93..904bb66 100644 --- a/.gitignore +++ b/.gitignore @@ -67,3 +67,7 @@ target/ .claude/ .claude_consciousness.m8 .opencode/ +.firecrawl/ + +# skill-creator eval/optimization artifacts +skills/*-workspace/ diff --git a/README.md b/README.md index 7cd31c6..317f3b4 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,8 @@ AI agent skills for the [NodeOps](https://nodeops.network) ecosystem. Works with | **createos** | Deploy anything to production on CreateOS cloud platform | `npx skills add https://github.com/NodeOps-app/skills --skill createos` | | **vercel-to-createos** | Migrate Next.js, Vite, React, Vue, Svelte apps from Vercel to CreateOS | `npx skills add https://github.com/NodeOps-app/skills --skill vercel-to-createos` | | **claude-code-to-codex** | Migrate Claude Code CLI hooks, MCP servers, plugins, instructions, and sessions to Codex CLI | `npx skills add https://github.com/NodeOps-app/skills --skill claude-code-to-codex` | +| **avail-validator-setup** | Stand up and activate an Avail DA validator (Docker-first) — day-0 provisioning through day-1 staking and going active, on Mainnet or Turing testnet | `npx skills add https://github.com/NodeOps-app/skills --skill avail-validator-setup` | +| **avail-validator-operate** | Day-2 ops for a live Avail DA validator — monitoring, slash-safe upgrades, key backup, chill/unbond, disaster recovery without equivocation | `npx skills add https://github.com/NodeOps-app/skills --skill avail-validator-operate` | ### Migration skills @@ -18,6 +20,10 @@ AI agent skills for the [NodeOps](https://nodeops.network) ecosystem. Works with `claude-code-to-codex` migrates Claude Code CLI setups to Codex CLI, with focused coverage for hooks, Claude Code CLI MCP servers, plugins, and session handoff. 
+### Avail validator skills + +`avail-validator-setup` and `avail-validator-operate` cover the full lifecycle of an [Avail DA](https://docs.availproject.org/docs/da/operate/become-a-validator) validator, Docker-first and network-parameterized (Mainnet / Turing testnet). `avail-validator-setup` handles day-0 provisioning through day-1 session keys, bonding, and going active; `avail-validator-operate` handles day-2 monitoring, slash-safe upgrades, encrypted key backup, chill/unbond, and disaster recovery — every procedure built around avoiding equivocation/double-signing. + ## CreateOS Authentication The `createos` skill can be used in two modes: diff --git a/skills/avail-validator-operate/SKILL.md b/skills/avail-validator-operate/SKILL.md new file mode 100644 index 0000000..6dd274c --- /dev/null +++ b/skills/avail-validator-operate/SKILL.md @@ -0,0 +1,111 @@ +--- +name: avail-validator-operate +description: >- + Run, maintain, and protect an already-active Avail DA validator — day-2 operations. + Use this whenever the user needs to monitor an Avail validator (telemetry, Prometheus, + Grafana, alerting on missed blocks/peers/sync/era points), upgrade the node to a new + availj/avail image or release without getting slashed, back up the keystore/node key, + restore or migrate a validator after a server loss WITHOUT double-signing, chill / + stop validating cleanly (staking.chill), unbond, or handle equivocation/slashing risk + and disaster recovery. Triggers on phrases like "monitor my avail validator", "set up + grafana for avail", "upgrade avail node safely", "avail validator slashed", "back up + avail keystore", "migrate avail validator to new server", "stop validating avail", + "chill my avail validator", "avail node equivocation", "restore avail validator". + For first-time setup, session-key generation, bonding and going active, use the + avail-validator-setup skill instead. 
+--- + +# Avail Validator — Operate (Day 2) + +Keep an active Avail validator healthy and **avoid the one class of mistake that gets +you slashed: equivocation (double-signing)**. Equivocation slashes the validator *and* +its nominators, so every procedure here is shaped around the rule: + +> **The same session keystore must never be active on two running nodes at once.** + +Docker-first. One parameterized path covers Mainnet and Turing. Network-specific URLs +and economics (era ≈ 24 h, 28-day unbond, reward lag) are in +`avail-validator-setup/references/networks.md` — reuse it; don't restate values. + +## What day-2 covers + +| Task | Read | +|---|---| +| Monitoring & alerting | `references/monitoring.md` | +| Node upgrade (safe vs fast) | `references/upgrade.md` | +| Backup of secrets | `references/backup-recovery.md` | +| Disaster recovery / server migration | `references/backup-recovery.md` | +| Chill / unbond / withdraw | `references/chill-unbond.md` | +| Slashing & equivocation model | this file + `references/chill-unbond.md` | + +Always identify the network and the running container first: + +```bash +CID=$(docker ps -lq) +docker exec "$CID" ls /da/node-data/chains # confirms chain dir / network +docker logs --tail 30 "$CID" +``` + +## Monitoring (do this on day 1 of day-2) + +A validator you can't observe is a validator you can't protect. Stand up the metrics +stack and alerts before anything else. Full configs (telemetry flag, `prometheus.yml`, +Grafana install, the official dashboard JSON) and the alert thresholds that actually +matter are in `references/monitoring.md`. + +Alert, at minimum, on: node down / not on telemetry, **finalized height not +advancing**, **peer count low**, sync falling behind tip, **missed blocks / era points +dropping**, and version drift from the latest release. A full session unresponsive → +involuntary chill; >10 % of validators offline together in an epoch → all slashed. 
+ +## Upgrades — the equivocation trap + +`docker pull` + recreate is fine for a **full/RPC node**. For an **active validator** +it risks: (a) DB corruption → prolonged downtime → ejection from the active set, and +(b) — if you "just spin up the new one alongside the old" — **double-signing**. + +Two procedures, in `references/upgrade.md`: + +- **Fast (acceptable downtime, single box):** stop container → recreate on the new + pinned tag with the same volume → verify it resumes authoring. Brief downtime, no + equivocation because the old node is stopped first. +- **Slow & safe (zero downtime, two boxes):** stand up Node B on the new version, + `author_rotateKeys` on **B**, submit the new keys via **Set Session Key**, wait for + block production to move to B (confirm by **logs**, not the UI), *then* and only then + stop Node A. Never have both authoring with the same keys. + +`scripts/safe-upgrade.sh` walks the fast path with the stop-before-start ordering +enforced. Read `references/upgrade.md` before using it. + +## Backups + +`db` is re-syncable and holds no secret — don't fixate on it. The only irreplaceable +on-box material is `keystore/` (session keys) and `network/` (node key). Back them up +**encrypted and off-box**, immediately and after any key rotation. +`scripts/backup-keys.sh` produces an encrypted archive. Procedure + restore in +`references/backup-recovery.md`. + +## Disaster recovery — without slashing yourself + +Losing the server is survivable; **restoring keys onto a new box while the old one +might still be running is not** — that double-signs. The safe recovery paths +(old-node-definitively-dead vs rotate-to-new-keys) are in +`references/backup-recovery.md`. When in doubt, rotate to **new** session keys via +`setKeys` rather than restoring the old keystore — new keys can't equivocate against +the old. 
+ +## Chill / unbond / exit + +Stopping cleanly is `staking.chill` (UI or extrinsic), **signed by the controller**, +effective **next era**; funds stay bonded. Unbond → **28-day** lock → withdraw. +Step-by-step, plus the difference between voluntary and involuntary chill and the +slashing conditions, in `references/chill-unbond.md`. + +## Slashing facts to act on + +- Equivocation (two blocks same slot, or conflicting GRANDPA votes) → slash for + validator **and** nominators. Usually self-inflicted by running duplicate keys. +- Slash shows immediately on the staking UI's slashes page, but the **financial + deduction is delayed days** (governance can reverse it). "Not deducted yet" ≠ "safe". +- Involuntary chill (offline, <10 % of set) → no slash; ≥10 % offline together → + slash. Uptime monitoring is a slashing-prevention control, not a nicety. diff --git a/skills/avail-validator-operate/evals/evals.json b/skills/avail-validator-operate/evals/evals.json new file mode 100644 index 0000000..2a78a7a --- /dev/null +++ b/skills/avail-validator-operate/evals/evals.json @@ -0,0 +1,35 @@ +{ + "skill_name": "avail-validator-operate", + "evals": [ + { + "id": 0, + "name": "safe-upgrade-no-equivocation", + "prompt": "My Avail validator on mainnet is active and producing blocks (availj/avail in Docker). A new release just dropped and I need to upgrade. I'm paranoid about getting slashed for double-signing. 
What's the safe upgrade procedure, and how is it different from just pulling the new image and restarting?", + "expected_output": "Equivocation explanation, fast stop-before-start path, slow two-box rotate-keys path, anti-patterns, high-stake recommendation, tag verification.", + "files": [], + "assertions": [ + "Explicitly explains that double-signing/equivocation slashes the validator and its nominators, and why naive pull+restart is risky", + "Fast single-box path stops and confirms the old container is down BEFORE starting the new one, reusing the same volume and node name", + "Slow zero-downtime path: Node B on new version, author_rotateKeys for new keys, setKeys via controller, migrate confirmed via logs not UI, then stop Node A", + "Names the anti-pattern of running the old and new nodes simultaneously with the same/copied keystore", + "Recommends the slow two-box path for a high-stake active mainnet validator", + "Says to pin/verify the new image tag and not use :latest" + ] + }, + { + "id": 1, + "name": "disaster-recovery-no-double-sign", + "prompt": "Disaster: the server running my active Avail mainnet validator just died (cloud instance gone). I have an encrypted backup of the keystore and network folders. How do I get back to validating WITHOUT equivocating? 
I'm not 100% sure the old instance is truly dead.", + "expected_output": "Equivocation rule, rotate-to-new-keys Path B due to uncertain old node, Path A only if old definitively dead, db re-syncable vs keystore/network secrets, stash/controller from seed.", + "files": [], + "assertions": [ + "States that restoring the keystore while the old node may still be running causes double-signing = slashing", + "Because old-node status is uncertain, prescribes rotating to NEW session keys (Path B) rather than restoring the old keystore", + "Says restoring the old keystore (Path A) is acceptable only if the old instance is definitively destroyed", + "Notes db is re-syncable and only keystore + network are the irreplaceable secrets", + "States stash/controller are wallet keys recovered from seed/hardware, not from the server backup", + "New keys are registered on-chain via setKeys signed by the controller, with activation confirmed via logs" + ] + } + ] +} diff --git a/skills/avail-validator-operate/references/backup-recovery.md b/skills/avail-validator-operate/references/backup-recovery.md new file mode 100644 index 0000000..200ccd3 --- /dev/null +++ b/skills/avail-validator-operate/references/backup-recovery.md @@ -0,0 +1,80 @@ +# Avail validator backup & disaster recovery + +## What to back up (and what not to) + +| Path (under `<base>/chains/<chain>/`) | Back up? | Why | +|---|---|---| +| `keystore/` | **Yes, encrypted** | Session keys — irreplaceable, equivocation-critical | +| `network/` | **Yes** | Node key / libp2p identity | +| `db/` | No | Re-syncable from genesis or snapshot; contains no secret | + +In Docker the base is `/da/node-data`; discover the chain dir +(`docker exec <container> ls /da/node-data/chains`) — its name varies by node version. + +## Backup procedure + +Take a backup right after going active and after **every** key rotation. It must be +**encrypted** and stored **off the validator box**. 
`scripts/backup-keys.sh` does this: +it `tar`s `keystore/` + `network/` and encrypts with `age` (or `gpg` fallback). + +Manual equivalent: + +```bash +CID=$(docker ps -lq) +CHAIN_DIR=$(docker exec "$CID" sh -c 'ls -d /da/node-data/chains/*' | head -1) +docker exec "$CID" tar -C "$CHAIN_DIR" -czf - keystore network \ + | age -r <recipient> > avail-keys-$(date +%F).tar.gz.age +# move the .age file off-box (it is the validator's identity — guard it) +``` + +Never store the archive unencrypted, and never keep the only copy on the validator machine. + +## Re-sync the DB (no secrets involved) + +If only the DB is bad (corruption, disk), you do **not** need keys back — keep the +keystore in place and rebuild state: + +```bash +# stop node, then purge chain data and let it re-sync +avail purge-chain # binary form; in Docker: stop container, delete db/ in the volume, restart +``` + +Or restore from a trusted DB snapshot to skip a long genesis sync (warp sync is not +available). Only use a snapshot from a source you trust. + +## Disaster recovery — the rule that prevents self-slashing + +> Restoring the keystore onto a new node **while the old node is or might still be +> running** double-signs → equivocation → slash (validator **and** nominators). + +Choose the safe path: + +### Path A — old node is definitively dead +Use only when you are *certain* the old machine can never produce blocks again +(destroyed/wiped, disk pulled, account access revoked — not merely "I think it's off"). + +1. Provision a fresh node (setup skill), same `--chain`/`--name`, let it sync. +2. Stop it. Restore `keystore/` + `network/` from the encrypted backup into the + volume's `chains/<chain>/`. +3. Start it. It resumes the **same** validator identity. Confirm authoring via logs. + +### Path B — old node status uncertain (preferred default) +If there is *any* doubt the old node is gone, do **not** restore the old keystore. +Instead rotate to **new** keys — new keys cannot equivocate against the old: + +1. Provision a fresh node, sync it. +2. 
`author_rotateKeys` on the new node (new session keys). +3. **Set Session Key** to the new hex (controller-signed) via the staking UI. +4. Wait for authoring to move to the new node — confirm by **logs**, not the UI. +5. The old node, even if it later comes back, is signing with keys no longer + registered on-chain → it cannot equivocate. Decommission it when reachable. + +Path B trades nothing meaningful (the validator account/stake is unchanged — only the +session keys rotate) for complete equivocation safety. Default to it. + +## Stash / controller recovery + +The stash and controller are **wallet** keys, never on the box — recover them from the +operator's seed/hardware wallet, not from server backups. If the controller seed is +compromised, the stash funds are still safe (separation), but rotate the controller and +re-`setKeys`/`validate` from the new controller promptly. diff --git a/skills/avail-validator-operate/references/chill-unbond.md b/skills/avail-validator-operate/references/chill-unbond.md new file mode 100644 index 0000000..81e15f7 --- /dev/null +++ b/skills/avail-validator-operate/references/chill-unbond.md @@ -0,0 +1,60 @@ +# Chill, unbond, exit — Avail validator + +Stopping validation cleanly is a staking action, not a server action. Killing the +container alone does **not** chill you — you'd be an offline validator (involuntary +chill, possible slash if many are offline together). Always chill on-chain *first*, +then it's safe to stop the node. + +## Chill (stop validating, keep funds bonded) + +`staking.chill` removes you from the active/waiting set without unbonding. + +- **Where:** staking actions UI (network URL in + `avail-validator-setup/references/networks.md`) → your account → **Stop**, or submit + the `staking.chill` extrinsic directly. +- **Signed by:** the **controller** account (not the stash). +- **Effective:** next era (~24 h). Funds remain bonded; you simply stop being + selectable for new/revised nominations. 
+- After chill takes effect (confirm you're out of the active set on the dashboard and + logs no longer show `🎁 Prepared block for proposing`), it is safe to stop/decommission + the node. + +### Voluntary vs involuntary chill +- **Voluntary:** you called `chill`. Clean. No slash. +- **Involuntary:** the network chilled you for being unresponsive a full session. No + slash by itself — but if ≥10% of validators are offline together in an epoch, that + whole group is slashed. So "I'll just turn it off" is risky; chill explicitly. + +## Unbond (start releasing the stake) + +After chilling, to free the bonded funds: + +1. `staking.unbond` the amount (controller-signed). +2. **28-day** unbonding lock — funds are non-transferable during this period. +3. After 28 days, `withdrawUnbonded` to make them transferable. + +You can chill without unbonding (pause validating, keep stake) or unbond a partial +amount and keep validating with the rest (as long as you stay above the waiting-list +floor — see networks.md economics). + +## Full exit checklist + +1. `staking.chill` (controller) → wait one era, confirm out of active set via logs + + dashboard. +2. Stop & decommission the node container. +3. `staking.unbond` the full bonded amount (controller). +4. Wait 28 days. +5. `withdrawUnbonded` (controller). Funds now transferable from the stash. +6. Securely destroy the on-box `keystore/` only after you're certain you won't rejoin + with the same identity (otherwise keep the encrypted backup). + +## Slashing context (why the order matters) + +- Equivocation slashes regardless of chill status — it's about duplicate signing, so + don't run the old node again with live keys after migrating. +- Slash appears immediately on the staking UI slashes page; the **financial deduction + is delayed several days** and governance can reverse it. Don't assume safety from + "balance not changed yet." 
+- Chilling promptly when you know you'll be offline (maintenance, migration) converts a + potential slash scenario into a clean no-penalty exit. Treat chill as the standard + pre-maintenance step for anything that risks a full session of downtime. diff --git a/skills/avail-validator-operate/references/monitoring.md b/skills/avail-validator-operate/references/monitoring.md new file mode 100644 index 0000000..f70ece8 --- /dev/null +++ b/skills/avail-validator-operate/references/monitoring.md @@ -0,0 +1,106 @@ +# Avail validator monitoring + +Three layers: built-in **telemetry** (free, public, low effort), **Prometheus + Grafana** +(your own metrics + alerts), and the **alert rules that actually prevent slashing**. + +## 1. Telemetry (built-in) + +The node auto-streams to Avail's public telemetry; pick the network tab at +`http://telemetry.avail.so/` and find your node by its `--name`. It is configured via +the chain spec — no flag needed for the default. To force a telemetry endpoint +explicitly (binary form): + +``` +./data-avail --validator \ + --port 30333 \ + --base-path `pwd`/data \ + --chain `pwd`/chainspec.raw.json \ + --name AvailNode \ + --telemetry-url 'ws://telemetry.avail.tools:8001/submit/ 0' +``` + +Telemetry is a convenience dashboard, **not** an alerting system. Use it for a quick +"is my node visible and at tip" check; rely on Prometheus for paging. + +## 2. Prometheus + Grafana (own stack) + +The node exposes Prometheus metrics on `:9615` (localhost-bound on a validator — scrape +from the same host, or add `--prometheus-external` only if you firewall it). 
+ +Install Prometheus + node-exporter: + +``` +sudo apt-get install -y prometheus prometheus-node-exporter +``` + +`prometheus.yml` (node metrics on 9615, host metrics on 9100): + +``` +cat > $HOME/prometheus.yml << EOF +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: "prometheus" + scrape_interval: 5s + static_configs: + - targets: ["localhost:9090"] + - job_name: "avail_node" + scrape_interval: 5s + static_configs: + - targets: ["localhost:9615"] + - job_name: node + static_configs: + - targets: ['localhost:9100'] +EOF +sudo mv $HOME/prometheus.yml /etc/prometheus/prometheus.yml +sudo chmod 644 /etc/prometheus/prometheus.yml +sudo systemctl enable prometheus.service prometheus-node-exporter.service +sudo systemctl restart prometheus.service prometheus-node-exporter.service +sudo systemctl status prometheus.service prometheus-node-exporter.service +``` + +Install Grafana: + +``` +wget -q -O - https://packages.grafana.com/gpg.key | sudo apt-key add - +echo "deb https://packages.grafana.com/oss/deb stable main" > grafana.list +sudo mv grafana.list /etc/apt/sources.list.d/grafana.list +sudo apt-get update && sudo apt-get install -y grafana +sudo systemctl enable grafana-server.service +sudo systemctl start grafana-server.service +sudo systemctl status grafana-server.service +sudo ufw allow 3000/tcp +``` + +Grafana UI on `http://<host>:3000` (default login `admin/admin`; Grafana forces a password change on first login). Add a +Prometheus data source pointing at `http://localhost:9090`. Import Avail's official +validator dashboard JSON: + +``` +https://raw.githubusercontent.com/availproject/docs/main/static/validator_metrics.json +``` + +> If the node runs in Docker, scraping works because `9615` is published to `127.0.0.1` +> on the host (the setup skill binds it there). Keep Grafana's `3000` firewalled to +> trusted IPs — the plain `sudo ufw allow 3000/tcp` above opens it to the world; on a validator host restrict it instead, e.g. `sudo ufw allow from <trusted-ip> to any port 3000 proto tcp`. + +## 3. 
Alerts that prevent slashing / ejection + +Page on these, not just CPU/disk: + +| Alert | Why it matters | +|---|---| +| Finalized block height not advancing for N min | Node forked/stalled — losing era points, heading for ejection | +| Best block not advancing / `⚙️ Syncing` for long | Falling behind tip; will miss authoring slots | +| Peer count below threshold (e.g. <3) | p2p/network problem; precedes desync | +| Node process down / absent from telemetry | Offline → involuntary chill; if ≥10% of set offline → **slash** | +| Era points / blocks authored dropping vs peers | Underperforming → lower rewards, election risk | +| Running image tag ≠ latest avail release | Missing consensus-relevant fixes; runtime upgrade may require new client | +| Disk < ~20% free | DB growth; node crash → downtime | + +Uptime is a **slashing-prevention control**. A full session unresponsive → +involuntary chill (no slash, but you stop earning and must rejoin). ≥10% of validators +offline together in an epoch → everyone in that group slashed. Treat "node down" as a +page-now incident, not a morning-review item. diff --git a/skills/avail-validator-operate/references/upgrade.md b/skills/avail-validator-operate/references/upgrade.md new file mode 100644 index 0000000..1b435ab --- /dev/null +++ b/skills/avail-validator-operate/references/upgrade.md @@ -0,0 +1,74 @@ +# Avail validator upgrades + +Goal: move to a newer `availj/avail` release without (a) corrupting the DB into long +downtime or (b) double-signing. Pick the procedure by your downtime tolerance. + +## First: pick and verify the new tag + +Never use `:latest`. Find the new release and verify before pulling: + +```bash +curl -s https://api.github.com/repos/availproject/avail/releases/latest | grep -m1 '"tag_name"' +skopeo inspect docker://docker.io/availj/avail:<tag> | grep -E 'Digest|Created' +``` + +Read the release notes — a node release sometimes pairs with an on-chain runtime +upgrade. 
Runtime upgrades themselves are forkless and applied by governance; the +operator's only job is running a client new enough to follow them. + +## Fast upgrade — single box, brief downtime + +Acceptable when a few minutes of missed authoring is tolerable. Safe against +equivocation **because the old node is stopped before the new one starts** — the same +keystore is never live twice. + +```bash +CID=$(docker ps -lq) +docker stop "$CID" # old node definitively down first +docker rename "$CID" avail-prev-$(date +%s) 2>/dev/null || true +docker run --restart=on-failure -d \ + -v /root/avail/node-data:/da/node-data \ + -p 30333:30333 -p 127.0.0.1:9944:9944 -p 127.0.0.1:9615:9615 \ + docker.io/availj/avail:<tag> \ + --chain <chain> -d /da/node-data --validator --name <name> +docker logs -f $(docker ps -lq) +``` + +Reuse the **same volume and same `--name`**. Confirm recovery: `✨ Imported #N` rising, +then `🎁 Prepared block for proposing` once your next slot comes. If the new version +won't start or the DB is corrupt, roll back: stop the new container, restart the +renamed old one (`docker start avail-prev-…`). Keep the old container until the new one +has authored a block. + +`scripts/safe-upgrade.sh` performs exactly this stop-then-start ordering and refuses to +start the new container until the old one is confirmed stopped. + +## Slow & safe upgrade — two boxes, zero downtime + +Use for mainnet / high-stake where you can't miss slots. The trick: the new machine +gets **brand-new session keys**, so the two machines never share keys and cannot +equivocate against each other. + +1. Provision **Node B** on the new version, same `--chain`/config, fully synced. +2. On **Node B**: `author_rotateKeys` (see setup skill / `avail-validator.sh + rotate-keys`). Save the new hex. +3. On the staking actions UI (network URL in setup skill networks.md), **Set Session + Key** to Node B's new hex, signed by the **controller**. Both old and new keys are + shown for an epoch or two, then only the new. +4. 
Wait for authoring to migrate to B. **Confirm via logs, not the UI** — the UI can + show the switch an epoch before it is real. You want: `🎁 Prepared block for + proposing` appearing on **B** and stopping on **A**. +5. **Only then** stop Node A. Upgrade A; optionally repeat to switch back. + +At no point are both nodes authoring with the same keys — that is the whole point. + +## Anti-patterns + +- "Spin up the upgraded node next to the old one with the copied keystore, then kill + the old one." → both live with identical keys = **equivocation = slash**. Use the + two-box flow with `rotate-keys` instead, or the fast flow that stops first. +- Trusting the staking UI's switchover timing. Trust block-production logs. +- Fast-upgrading a high-stake mainnet validator during its expected authoring window — + prefer slow & safe there. +- Bumping the tag but not reading release notes — you may need the new client for a + scheduled runtime upgrade and not know it. diff --git a/skills/avail-validator-operate/scripts/backup-keys.sh b/skills/avail-validator-operate/scripts/backup-keys.sh new file mode 100755 index 0000000..9ab0661 --- /dev/null +++ b/skills/avail-validator-operate/scripts/backup-keys.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# Encrypted off-box backup of an Avail validator's irreplaceable secrets. +# Backs up ONLY keystore/ (session keys) and network/ (node key) — db/ is +# re-syncable and is intentionally excluded. +# +# Usage: +# backup-keys.sh --recipient [--container ] [--out DIR] +# backup-keys.sh --gpg [--container ] [--out DIR] +# +# --recipient : encrypt with `age` to this recipient (preferred) +# --gpg : fall back to symmetric `gpg -c` (you'll be prompted for a passphrase) +# +# The resulting archive IS the validator identity. Move it off this host and guard it +# like a private key. Never keep the only copy on the validator box. +set -euo pipefail + +CID="" RECIPIENT="" USE_GPG=0 OUT="." 
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --recipient) RECIPIENT="$2"; shift 2;;
+    --gpg) USE_GPG=1; shift;;
+    --container) CID="$2"; shift 2;;
+    --out) OUT="$2"; shift 2;;
+    *) echo "unknown arg: $1" >&2; exit 1;;
+  esac
+done
+
+[ -n "$CID" ] || CID="$(docker ps -lq)"
+[ -n "$CID" ] || { echo "error: no running container; pass --container " >&2; exit 1; }
+
+CHAIN_DIR="$(docker exec "$CID" sh -c 'ls -d /da/node-data/chains/* 2>/dev/null | head -1')"
+[ -n "$CHAIN_DIR" ] || { echo "error: could not locate chains dir in container" >&2; exit 1; }
+echo ">> backing up keystore/ + network/ from $CHAIN_DIR (db/ excluded by design)"
+
+STAMP="$(date +%F-%H%M%S)"
+mkdir -p "$OUT"
+
+if [ -n "$RECIPIENT" ]; then
+  command -v age >/dev/null 2>&1 || { echo "error: age not installed" >&2; exit 1; }
+  DEST="$OUT/avail-keys-$STAMP.tar.gz.age"
+  docker exec "$CID" tar -C "$CHAIN_DIR" -czf - keystore network \
+    | age -r "$RECIPIENT" > "$DEST"
+elif [ "$USE_GPG" = "1" ]; then
+  command -v gpg >/dev/null 2>&1 || { echo "error: gpg not installed" >&2; exit 1; }
+  DEST="$OUT/avail-keys-$STAMP.tar.gz.gpg"
+  docker exec "$CID" tar -C "$CHAIN_DIR" -czf - keystore network \
+    | gpg -c --cipher-algo AES256 -o "$DEST"
+else
+  echo "error: pass --recipient or --gpg (refusing to write plaintext)" >&2
+  exit 1
+fi
+
+echo ">> wrote $DEST"
+echo ">> MOVE this off the validator host now. It is the validator's identity."
+echo ">> verify it restores before you rely on it (see references/backup-recovery.md)."
diff --git a/skills/avail-validator-operate/scripts/safe-upgrade.sh b/skills/avail-validator-operate/scripts/safe-upgrade.sh
new file mode 100755
index 0000000..65ef819
--- /dev/null
+++ b/skills/avail-validator-operate/scripts/safe-upgrade.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+# Fast single-box Avail validator upgrade with equivocation-safe ordering:
+# the old container is STOPPED and confirmed down BEFORE the new one starts, so the
+# same session keystore is never live on two nodes. Same volume + same --name are
+# reused, so the validator identity is preserved (no setKeys needed for this path).
+#
+# For zero-downtime / high-stake mainnet, use the two-box rotate-keys flow in
+# references/upgrade.md instead — NOT this script.
+#
+# Usage:
+#   safe-upgrade.sh --chain <mainnet|turing> --tag <tag> --name <node-name> \
+#     [--container <id|name>] [--data DIR]
+#
+# Steps, in order (a failure aborts before the next step runs):
+#   1. inspect the new image and make sure it is available locally
+#   2. stop the old container and confirm Docker reports it as not running
+#   3. rename the old container so it stays available for rollback
+#   4. start the new container on the same host data dir
+set -euo pipefail
+
+CID="" CHAIN="" TAG="" NAME="" DATA_DIR="/root/avail/node-data"
+IMAGE_REPO="docker.io/availj/avail"
+
+# Print "error: <message>" on stderr and exit 1.
+die() { echo "error: $*" >&2; exit 1; }
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --chain|--tag|--name|--container|--data)
+      # report a missing value here instead of an unbound "$2" under set -u
+      [ $# -ge 2 ] || die "$1 requires a value"
+      case "$1" in
+        --chain) CHAIN="$2";;
+        --tag) TAG="$2";;
+        --name) NAME="$2";;
+        --container) CID="$2";;
+        --data) DATA_DIR="$2";;
+      esac
+      shift 2;;
+    *) echo "unknown arg: $1" >&2; exit 1;;
+  esac
+done
+[ "$CHAIN" = "mainnet" ] || [ "$CHAIN" = "turing" ] || die "--chain mainnet|turing"
+[ -n "$TAG" ] || die "--tag required (never :latest)"
+[ -n "$NAME" ] || die "--name required (reuse the SAME node name)"
+[ -n "$CID" ] || CID="$(docker ps -lq)"
+[ -n "$CID" ] || die "no running container; pass --container <id|name>"
+
+echo ">> verifying $IMAGE_REPO:$TAG before pull"
+if command -v skopeo >/dev/null 2>&1; then
+  skopeo inspect "docker://$IMAGE_REPO:$TAG" | grep -E '"Digest"|"Created"' \
+    || die "cannot inspect $IMAGE_REPO:$TAG"
+else
+  echo "    skopeo not found — skipping digest verification"
+fi
+
+# Fetch the image BEFORE stopping anything: a bad tag or registry outage must fail
+# while the old node is still validating, and the stop->start gap must not include
+# a download. An image already present locally is used as-is.
+if ! docker image inspect "$IMAGE_REPO:$TAG" >/dev/null 2>&1; then
+  echo ">> pulling $IMAGE_REPO:$TAG (old node still running)"
+  docker pull "$IMAGE_REPO:$TAG" >/dev/null \
+    || die "cannot pull $IMAGE_REPO:$TAG — old node left untouched"
+fi
+
+echo ">> stopping old container $CID (must be down before new one starts)"
+docker stop "$CID" >/dev/null
+# hard gate: continue only when Docker reports the old node as not running.
+# `docker container inspect` resolves IDs and names alike, whereas
+# `docker ps --filter id=…` matches IDs only and would print nothing for a name.
+OLD_RUNNING="$(docker container inspect -f '{{.State.Running}}' "$CID" 2>/dev/null || true)"
+[ "$OLD_RUNNING" = "false" ] \
+  || die "old container not confirmed stopped — aborting to avoid double-signing"
+
+# Keep the stopped container under a known name for rollback. The rename stays
+# best-effort; when it fails the original reference is still valid.
+PREV_REF="avail-prev-$(date +%s)"
+docker rename "$CID" "$PREV_REF" 2>/dev/null || PREV_REF="$CID"
+echo ">> old node confirmed stopped — starting new node on tag $TAG"
+
+# `docker run -d` prints the new container's ID, so capture that instead of
+# asking `docker ps -l` afterwards.
+NEW_CID="$(docker run --restart=on-failure -d \
+  -v "$DATA_DIR:/da/node-data" \
+  -p 30333:30333 \
+  -p 127.0.0.1:9944:9944 \
+  -p 127.0.0.1:9615:9615 \
+  "$IMAGE_REPO:$TAG" \
+  --chain "$CHAIN" -d /da/node-data --validator --name "$NAME")" \
+  || die "new container failed to start — rollback: docker start $PREV_REF"
+
+echo ">> new container $NEW_CID started"
+echo ">> watch: docker logs -f $NEW_CID"
+echo ">> healthy = '✨ Imported #N' rising, then '🎁 Prepared block for proposing'"
+echo ">> rollback if DB corrupt: docker stop $NEW_CID; docker start $PREV_REF"
+echo ">> keep the renamed old container until the new one has authored a block"
diff --git a/skills/avail-validator-setup/SKILL.md b/skills/avail-validator-setup/SKILL.md
new file mode 100644
index 0000000..72a795f
--- /dev/null
+++ b/skills/avail-validator-setup/SKILL.md
@@ -0,0 +1,168 @@
+---
+name: avail-validator-setup
+description: >-
+  Stand up and activate an Avail DA validator node (Docker-first) from scratch — day-0 provisioning
+  through day-1 staking and going active. Use this whenever the user wants to run, deploy, set up,
+  bootstrap, spin up, or "become" an Avail validator on Mainnet or Turing testnet; generate/rotate
+  session keys (author_rotateKeys); bond stake, set session keys on-chain (setKeys), register as a
+  validator (staking.validate) with a commission; pick the right chain/RPC/endpoints per network;
+  or securely store validator keys (keystore, node key, stash/controller). Triggers on phrases like
+  "run an Avail validator", "set up avail node", "availj/avail docker", "rotate session keys avail",
+  "stake my avail validator", "join the avail validator set", "avail turing testnet validator",
+  "secure my avail keystore". For ongoing day-2 work (monitoring, upgrades, backups, chill,
+  recovery) use the avail-validator-operate skill instead.
+---
+
+# Avail Validator — Setup (Day 0 + Day 1)
+
+Bring an Avail Data Availability validator from bare machine to active block producer.
+**Docker-first.** One parameterized path covers Mainnet and Turing testnet — only the
+`--chain` value and a handful of endpoints differ.
+
+A validator is a specialized full node that produces blocks (BABE) and finalizes them
+(GRANDPA) under Nominated Proof of Stake. It must be **bonded and registered on-chain**
+and must hold **session keys** in its local keystore, or it stays stuck at block 0.
+
+## The two phases
+
+- **Day 0 — Provision**: run a synced node container, firewalled correctly.
+- **Day 1 — Activate**: generate session keys, create accounts, bond, set keys on-chain,
+  declare validator intent with a commission, wait for the next era.
+
+Do them in order. Day 1 needs a fully synced Day 0 node.
+
+## Network parameters
+
+Pick the network with the user before any command. Everything network-specific is in
+`references/networks.md` — read it now and substitute concretely; never leave
+`<chain>` / `<tag>` placeholders in commands you run.
+
+| Need | Mainnet | Turing testnet |
+|---|---|---|
+| `--chain` value | `mainnet` | `turing` |
+| Test tokens | buy/bridge AVAIL | request from Turing faucet (see networks.md) |
+
+There is **no `--chain testnet`** — the only supported testnet is `turing`. Omitting
+`--chain` errors out: `Please specify which chain you want to run, e.g. --chain mainnet`.
+
+## Day 0 — Provision
+
+### 1. Host
+
+Minimum 8 GB RAM / 4 cores / SSD; recommended 16 GB / 8 cores / 200–300 GB SSD (chain
+grows). Linux server distro. Open **p2p port 30333** inbound. Do **not** expose RPC
+(9944) or metrics (9615) to the public internet on a validator.
+
+### 2. Pick a secure image tag — never `:latest`
+
+The image is `docker.io/availj/avail`.
Always pin an explicit released tag and verify
+it before pulling (a validator must run a known-good binary):
+
+```bash
+# newest releases — cross-check the version announced in Avail's Discord
+curl -s https://api.github.com/repos/availproject/avail/releases/latest | grep -m1 '"tag_name"'
+# verify the tag resolves and inspect its digest before use
+skopeo inspect docker://docker.io/availj/avail:<tag> | grep -E 'Digest|Created'
+```
+
+Pin by digest in production if possible: `docker.io/availj/avail@sha256:<digest>`.
+
+### 3. Run the node
+
+`scripts/avail-validator.sh` wraps the exact upstream `docker run` with secure defaults
+(restart policy, host data directory, localhost-bound RPC/metrics ports, pinned tag). Prefer it:
+
+```bash
+scripts/avail-validator.sh provision --chain <mainnet|turing> --tag <tag> --name <node-name>
+```
+
+It is the canonical upstream command, parameterized:
+
+```bash
+docker run --restart=on-failure -d \
+  -v /root/avail/node-data:/da/node-data \
+  -p 30333:30333 -p 127.0.0.1:9944:9944 -p 127.0.0.1:9615:9615 \
+  docker.io/availj/avail:<tag> \
+  --chain <mainnet|turing> -d /da/node-data --validator --name <node-name>
+```
+
+Note the deliberate hardening vs. the upstream docs: RPC `9944` and metrics `9615` are
+bound to `127.0.0.1` only (upstream publishes 9944 on `0.0.0.0`, which is unsafe for a
+validator). p2p `30333` is the only port that must be world-reachable.
+
+### 4. Wait for full sync
+
+```bash
+docker logs -f $(docker ps -lq)
+```
+
+`⚙️ Syncing …` → still catching up. `💤 Idle (N peers) … best: #X, finalized #Y` with X
+advancing and N>0 → synced. Role line must read `👤 Role: AUTHORITY` (that confirms
+`--validator`). **Do not start Day 1 until fully synced and finalizing.**
+
+## Day 1 — Activate
+
+### 1.
Generate session keys (on the node) + +`author_rotateKeys` creates the four session keys (babe, gran, imon, audi) inside the +container's keystore and returns their concatenated public hex: + +```bash +CID=$(docker ps -lq) +docker exec -i "$CID" curl -sH "Content-Type: application/json" \ + -d '{"id":1,"jsonrpc":"2.0","method":"author_rotateKeys","params":[]}' \ + http://localhost:9944 +docker restart "$CID" +``` + +(`scripts/avail-validator.sh rotate-keys` does exactly this and prints the hex.) Save +the `result` hex — it is submitted on-chain next. The private keys stay on disk in the +keystore and must **never** leave the box. See `references/key-security.md`. + +### 2. Create stash + controller accounts + +On the explorer wallet (URLs in `references/networks.md`) create **two separate** +accounts: a **stash** (holds bonded funds — keep in cold/hardware storage) and a +**controller** (signs `setKeys`, `validate`, `chill` — used routinely). Separation is a +security control: a compromised controller cannot move bonded funds. Fund the stash; +keep a little in the controller for fees. Detail: `references/key-security.md`. + +### 3. Bond, set keys, validate + +Via the staking UI (network-specific URL in `references/networks.md`): + +1. **Bond** from the stash — at least **50,000 AVAIL** to enter the waiting list. Don't + bond everything; leave fee headroom. Unbonding later is locked **28 days**. +2. **Set Session Key** — paste the `author_rotateKeys` hex (the `setKeys` extrinsic, + signed by the controller). The button then changes to **Validate**. +3. **Validate** — set your **commission %** and submit (`staking.validate`). This + declares validator intent. + +### 4. Become active + +You enter the **Waiting** set. The active set is re-elected each **era (~24 h)** by +stake. If your stake is high enough you're elected within an era or two. Confirm by +node logs showing `🎁 Prepared block for proposing` — not by the UI, which can show +the change before it is real. 
No rewards in the era you stake; first payout ~era N+3. + +## Key security — non-negotiable + +Read `references/key-security.md` and apply it during Day 1. The load-bearing rules: + +- The **keystore** (`/chains//keystore`) and **network** (node key) + dirs are the only irreplaceable on-box secrets. `db` is re-syncable. +- **Never run two nodes with the same keystore at once** — double-signing is + equivocation and is slashed (validator *and* nominators). This dictates how upgrades + and recovery are done (see avail-validator-operate). +- Stash in cold/hardware storage; stash ≠ controller. + +## When stuck + +`references/troubleshooting.md` — no peers, sync stalled, stays `FULL` not `AUTHORITY`, +not producing blocks after election, key/account mismatch. + +## Handing off to day-2 + +Once the validator is producing blocks, ongoing monitoring, upgrades, backups, chill, +and disaster recovery are the **avail-validator-operate** skill's job. Point the user +there rather than improvising those here — upgrade/recovery have equivocation traps. diff --git a/skills/avail-validator-setup/evals/evals.json b/skills/avail-validator-setup/evals/evals.json new file mode 100644 index 0000000..3792abc --- /dev/null +++ b/skills/avail-validator-setup/evals/evals.json @@ -0,0 +1,38 @@ +{ + "skill_name": "avail-validator-setup", + "evals": [ + { + "id": 0, + "name": "turing-full-lifecycle-docker", + "prompt": "I want to run an Avail validator on the Turing testnet using Docker on a fresh Ubuntu box. Walk me through everything from getting the node container up and synced to actually becoming an active validator that produces blocks with my stake. 
I keep getting confused about session keys vs my wallet accounts.", + "expected_output": "Ordered day-0 then day-1 runbook with Turing-concrete commands; clear session-keys-vs-wallet-accounts explanation; confirm-via-logs.", + "files": [], + "assertions": [ + "Uses --chain turing for the testnet (does not invent a 'testnet' chain alias or use mainnet)", + "Pins an explicit availj/avail image tag and says not to use :latest, with a verify step before pull", + "docker run includes --validator and binds RPC 9944 / metrics 9615 to localhost while exposing only p2p 30333", + "Tells the user to wait for Role: AUTHORITY and a synced/idle node before starting day-1 staking", + "Generates session keys with author_rotateKeys via docker exec and restarts the node afterward", + "Clearly distinguishes on-box session keys from off-box wallet stash/controller accounts", + "Bonds from the stash (>=50000 AVAIL) and submits setKeys + validate signed by the controller", + "Says to confirm the validator is active via node logs (block proposing), not the staking UI alone" + ] + }, + { + "id": 1, + "name": "mainnet-keys-and-port-hardening", + "prompt": "Setting up an Avail mainnet validator with the availj/avail docker image. Two things I care about: (1) the secure way to generate and register session keys so I don't get slashed, and (2) locking down the box so my RPC port isn't exposed to the internet and the keys are safe. Where do the private keys actually live?", + "expected_output": "Mainnet-concrete secure setup: tag verification, localhost-bound RPC/metrics, key-category model, on-disk key locations, equivocation rule, encrypted off-box backup.", + "files": [], + "assertions": [ + "Uses --chain mainnet", + "Secure tag selection: explicitly avoids :latest and verifies the tag/digest (e.g. 
skopeo) before pulling", + "Binds RPC 9944 and metrics 9615 to 127.0.0.1 and exposes only p2p 30333 publicly", + "States private session keys live in /chains//keystore inside the Docker volume, and node key in network/", + "Explains the three key categories and that stash and controller must be separate accounts kept off the box", + "States never run two nodes with the same keystore (equivocation = slashing of validator and nominators)", + "Recommends an encrypted off-box backup of keystore + network and restrictive file permissions" + ] + } + ] +} diff --git a/skills/avail-validator-setup/references/key-security.md b/skills/avail-validator-setup/references/key-security.md new file mode 100644 index 0000000..fd9c5bc --- /dev/null +++ b/skills/avail-validator-setup/references/key-security.md @@ -0,0 +1,71 @@ +# Avail validator key security + +A validator manages three distinct categories of key material. Confusing them is how +operators get slashed or drained. Treat this as the security contract for the skill. + +## The three categories + +### 1. Session keys — the equivocation-critical secret +- **What:** four keys — `babe` (block production), `gran` (GRANDPA finality, ed25519), + `imon` (ImOnline heartbeat), `audi` (authority discovery). Generated by + `author_rotateKeys`, which returns only the concatenated **public** hex. +- **Where:** encrypted files in `/chains//keystore/`. In Docker + that is `/da/node-data/chains//keystore/` inside the volume. +- **Rule:** the private keystore must **never leave the box and never exist on two + running nodes at the same time.** Two nodes signing with the same session keys = + equivocation = slashing of the validator *and* its nominators. This single rule + dictates the safe upgrade/recovery procedures in avail-validator-operate. +- **Rotation:** `author_rotateKeys` + on-chain `setKeys` is the safe way to move the + validator identity between machines — not copying the keystore. + +### 2. 
Node key (libp2p identity) +- **What:** the node's persistent peer identity (`12D3Koo…`). +- **Where:** `/chains//network/`. +- **Rule:** back it up with the keystore. Not equivocation-critical, but losing it + changes your peer ID and telemetry identity. Never share it. + +### 3. Stash & controller account keys — wallet keys, NOT on the validator box +- **Stash:** holds the bonded funds. Keep in **cold storage / hardware wallet**. Used + rarely (bond, designate controller). +- **Controller:** signs day-to-day staking extrinsics — `setKeys`, `staking.validate`, + `staking.chill`. Used routinely. +- **Rule:** stash and controller **must be different accounts.** A compromised + controller can stop/redirect validation but cannot move the stash's bonded funds. + Neither private key belongs on the validator server — they live in the operator's + wallet/hardware device. + +## On-disk layout + +``` +/ # Docker: /da/node-data (host: /root/avail/node-data) +└── chains + └── / # e.g. avail_turing_network — discover, don't hardcode + ├── db/ # blockchain state — RE-SYNCABLE, contains no secret + ├── keystore/ # session keys — IRREPLACEABLE SECRET (category 1) + └── network/ # node key — back this up (category 2) +``` + +`db` can always be rebuilt from genesis or a snapshot. `keystore` + `network` are the +only things worth backing up, and they must be backed up **encrypted**. + +## Hardening checklist (apply during Day 1) + +- Restrict the data volume on the host: `chmod -R go-rwx /root/avail/node-data` and + `chown` to the container's uid. The keystore should be unreadable by other users. +- Never publish RPC `9944` or metrics `9615` beyond `127.0.0.1`. Only p2p `30333` is + public. (The setup skill's `docker run` already binds 9944/9615 to localhost.) +- Take an **encrypted** backup of `keystore/` + `network/` immediately after Day 1 and + store it off-box (e.g. `tar … | age -e` / `gpg -c`). Recovery without it means + rotating to brand-new session keys. 
+- Stash on a hardware wallet. Record the stash/controller addresses (public) somewhere + durable; never record the seed phrases digitally in plaintext. +- Consider a proxy account for routine staking calls so the controller seed is touched + less often (optional; see Avail "proxies-on-avail" docs). + +## The one mistake that slashes you + +Restoring a keystore backup onto a new node **while the old node is or might still be +running** double-signs and equivocates. Any keystore move requires the previous node to +be **definitively offline first**, or — preferred — use the `author_rotateKeys` + +`setKeys` rotation flow so the two machines never share keys. This is covered +operationally in the avail-validator-operate skill (safe upgrade / disaster recovery). diff --git a/skills/avail-validator-setup/references/networks.md b/skills/avail-validator-setup/references/networks.md new file mode 100644 index 0000000..03607da --- /dev/null +++ b/skills/avail-validator-setup/references/networks.md @@ -0,0 +1,65 @@ +# Avail networks — concrete values + +Substitute these into every command/URL. Avail runs **one** node binary/image; only +these values differ between networks. There is no `testnet` chain alias — the only +supported testnet is **Turing**. 
+ +## Parameter table + +| Param | **Mainnet** | **Turing testnet** | +|---|---|---| +| `--chain` value | `mainnet` | `turing` | +| Chain spec line in logs | `Avail Mainnet` | `Avail Turing Network` | +| Official WS RPC | `wss://mainnet-rpc.avail.so/ws` | `wss://turing-rpc.avail.so/ws` | +| Light-client API | `https://api.lightclient.mainnet.avail.so/v1` | `https://api.lightclient.turing.avail.so/v1` | +| Block explorer (Subscan) | `https://avail.subscan.io/` | `https://avail-turing.subscan.io/` | +| App / extrinsics explorer | `https://explorer.availproject.org/?rpc=wss://mainnet-rpc.avail.so/ws` | `https://explorer.availproject.org/?rpc=wss://turing-rpc.avail.so/ws` | +| Staking dashboard | `https://staking.availproject.org/#/overview` | same UI — select Turing | +| Staking actions (bond/setKeys/validate/chill) | `https://explorer.availproject.org/#/staking/actions` | same UI — select Turing | +| Telemetry | `http://telemetry.avail.so/` | `http://telemetry.avail.so/` (Turing tab) | +| Token symbol | AVAIL | AVAIL | +| Chain-data dir under `/chains/` | e.g. `avail_mainnet_network` | e.g. `avail_turing_network` / `avail_turing_testnet` (varies by node version) | + +> The chain-data subdir name varies across node versions. Never hardcode it — discover +> it: `docker exec ls /da/node-data/chains`. 
+ +## Public RPC endpoints (for queries / explorer, not for the validator itself) + +**Mainnet:** OnFinality `https://avail.api.onfinality.io/public` · +Ankr `https://mainnet.avail-rpc.com/` · AllNodes `https://avail-rpc.publicnode.com/` · +VitWit `https://avail.rpc.vitwit.com/` · GlobalStake `https://rpc-avail.globalstake.io` · +RadiumBlock `https://avail.public.curie.radiumblock.co/http` + +**Turing:** OnFinality `https://avail-turing.api.onfinality.io/public` · +Ankr `https://rpc.ankr.com/avail_turing_testnet` · +AllNodes `https://avail-turing-rpc.publicnode.com` · +RadiumBlock `https://turing.public.curie.radiumblock.co/http` + +WSS variants: replace `https://`→`wss://` and the path per provider (e.g. OnFinality +`wss://avail.api.onfinality.io/public-ws`). + +## Test tokens (Turing only) + +Turing AVAIL is obtained via the Avail faucet / Discord, not a CLI. Direct the user to +the faucet linked from `https://docs.availproject.org/docs/da/build/interact/faucet` +(or the Avail Discord `#faucet`). Mainnet AVAIL must be bought or bridged +(`https://bridge.availproject.org/`). + +## Staking economics (both networks unless noted) + +- **Era** ≈ 24 h. Active validator set re-elected each era boundary by stake. +- **Min self-bond to enter the validator waiting list:** ≥ **50,000 AVAIL**. + (Turing's network "threshold" may currently be 0, but the 50k waiting-list figure is + the documented practical floor — confirm live before advising the user.) +- **Reward lag:** stake in era N → active N+1 → accrues N+2 → first payout ~N+3. +- **Unbonding lock:** **28 days** after `unbond` before funds are withdrawable. +- **Minimum Nominated** is recalculated every era and rises as total stake grows — a + validator above threshold today can fall below later. Not a one-time check. + +## Chain spec source (only if running outside Docker / custom spec) + +The Docker image ships the correct spec; `--chain mainnet|turing` is enough. 
If a raw +spec is ever needed: +- Mainnet: `https://raw.githubusercontent.com/availproject/avail/main/misc/genesis/mainnet.chain.spec.raw.json` +- Turing: the docs link a GitHub `blob/` URL (a docs bug); use the `raw.githubusercontent.com` + equivalent: `https://raw.githubusercontent.com/availproject/avail/main/misc/genesis/testnet.turing.chain.spec.raw.json` diff --git a/skills/avail-validator-setup/references/troubleshooting.md b/skills/avail-validator-setup/references/troubleshooting.md new file mode 100644 index 0000000..8faf445 --- /dev/null +++ b/skills/avail-validator-setup/references/troubleshooting.md @@ -0,0 +1,52 @@ +# Avail validator setup — troubleshooting + +Symptom → cause → fix. Check `docker logs -f $(docker ps -lq)` first; the log markers +below are the diagnostic signal. + +## Node won't start + +- `Error: Input("Please specify which chain you want to run, e.g. --chain mainnet")` + → `--chain` missing. Add `--chain mainnet` or `--chain turing`. There is no + `testnet` alias. +- Container exits immediately, restarts in a loop → check `docker logs`; usually a bad + `-v` mount path or the volume dir not writable by the container uid. Fix host perms, + recreate. + +## Stuck syncing / no peers + +- Log stays `💤 Idle (0 peers)` → p2p port **30333 not reachable**. Open it inbound in + the host/cloud firewall and security group; confirm `-p 30333:30333` is published. +- `⚙️ Syncing` never reaching tip, or `❌ Error while dialing /dns/telemetry…` → the + telemetry dial error is harmless (telemetry only). Real sync stall = peers/port or + disk too slow; check `best:` and `finalized:` are advancing. +- Genesis sync is slow (hours), not the "5–10 min" some docs imply. Warp sync is not + available. A trusted DB snapshot speeds it up (operate skill covers restore). + +## Node runs but never becomes a validator + +- Log shows `👤 Role: FULL` not `AUTHORITY` → `--validator` flag missing from the + `docker run` args. Recreate the container with `--validator`. 
+- `Role: AUTHORITY` but no `🎁 Prepared block for proposing` after election:
+  - Session keys never set on-chain, or set against the wrong account → re-run
+    `author_rotateKeys`, then **Set Session Key** signed by the **controller**.
+  - Not elected yet — you're still in **Waiting**. Election happens at era boundaries
+    (~24 h); insufficient stake keeps you waiting. Verify stake ≥ waiting-list floor
+    and on the staking dashboard you appear under Waiting/Active.
+  - Node not fully synced when keys were set → wait for `💤 Idle` with advancing
+    `finalized:`, rotate keys again, resubmit.
+
+## Key / account problems
+
+- `setKeys` / `validate` extrinsic fails or "controller not bonded" → bond from the
+  **stash** first, then `setKeys`/`validate` from the **controller**. Stash and
+  controller must be distinct accounts.
+- Rotated keys but validator stopped producing → `author_rotateKeys` replaced the
+  on-box keys; you must submit the **new** hex via `setKeys` (the rotate flow is
+  intentional for migrations — see operate skill safe-upgrade).
+
+## Verifying health quickly
+
+- `docker exec ls /da/node-data/chains` → confirms the chain dir / network.
+- Telemetry site (network tab) → node visible by `--name`, block height tracking tip.
+- Logs: `✨ Imported #N` rising = following chain; `🎁 Prepared block for proposing` =
+  actively authoring (you are an active validator).
diff --git a/skills/avail-validator-setup/scripts/avail-validator.sh b/skills/avail-validator-setup/scripts/avail-validator.sh
new file mode 100755
index 0000000..f98999b
--- /dev/null
+++ b/skills/avail-validator-setup/scripts/avail-validator.sh
@@ -0,0 +1,140 @@
+#!/usr/bin/env bash
+# Avail validator setup helper — Docker-first, network-parameterized.
+# Wraps the exact upstream `docker run` with validator-safe hardening:
+#   - RPC 9944 and metrics 9615 bound to 127.0.0.1 only (upstream exposes 9944 publicly)
+#   - p2p 30333 published (must be world-reachable)
+#   - explicit pinned tag or sha256 digest, restart policy, host data directory
+#
+# Usage:
+#   avail-validator.sh provision --chain <mainnet|turing> --tag <tag|sha256:digest> \
+#     --name <node-name> [--data DIR]
+#   avail-validator.sh rotate-keys [--container <id|name>]
+#   avail-validator.sh status [--container <id|name>]
+#
+# This script is deliberately small and explicit. Read it before running it on a
+# machine that will hold real stake.
+set -euo pipefail
+
+DATA_DIR="/root/avail/node-data"
+IMAGE_REPO="docker.io/availj/avail"
+
+# Print "error: <message>" on stderr and exit 1.
+die() { echo "error: $*" >&2; exit 1; }
+
+# Print the image reference for $1: "repo@sha256:…" when $1 is a digest,
+# "repo:<tag>" otherwise.
+_image_ref() {
+  case "$1" in
+    sha256:*) echo "$IMAGE_REPO@$1";;
+    *) echo "$IMAGE_REPO:$1";;
+  esac
+}
+
+# provision: validate the flags, inspect the image with skopeo when available,
+# create the data dir and start the validator container.
+cmd_provision() {
+  local chain="" tag="" name="" image
+  while [ $# -gt 0 ]; do
+    case "$1" in
+      --chain|--tag|--name|--data)
+        # report a missing value here instead of an unbound "$2" under set -u
+        [ $# -ge 2 ] || die "$1 requires a value"
+        case "$1" in
+          --chain) chain="$2";;
+          --tag) tag="$2";;
+          --name) name="$2";;
+          --data) DATA_DIR="$2";;
+        esac
+        shift 2;;
+      *) die "unknown arg: $1";;
+    esac
+  done
+  [ -n "$chain" ] || die "--chain required (mainnet|turing)"
+  [ "$chain" = "mainnet" ] || [ "$chain" = "turing" ] || die "--chain must be mainnet or turing"
+  [ -n "$tag" ] || die "--tag required (pin an explicit release, never :latest)"
+  [ -n "$name" ] || die "--name required"
+  image="$(_image_ref "$tag")"
+
+  echo ">> verifying image $image before pull (validator must run known-good binary)"
+  if command -v skopeo >/dev/null 2>&1; then
+    skopeo inspect "docker://$image" | grep -E '"Digest"|"Created"' \
+      || die "skopeo could not inspect $image — bad tag?"
+  else
+    echo "    skopeo not found — skipping digest verification (install skopeo to harden)"
+  fi
+
+  mkdir -p "$DATA_DIR"
+  echo ">> p2p port 30333 must be reachable inbound; 9944/9615 stay localhost-only"
+  set -x
+  docker run --restart=on-failure -d \
+    -v "$DATA_DIR:/da/node-data" \
+    -p 30333:30333 \
+    -p 127.0.0.1:9944:9944 \
+    -p 127.0.0.1:9615:9615 \
+    "$image" \
+    --chain "$chain" -d /da/node-data --validator --name "$name"
+  set +x
+  echo ">> tail sync with: docker logs -f \$(docker ps -lq)"
+  echo ">> wait for 'Role: AUTHORITY' and steady '💤 Idle (N peers)' before Day 1"
+}
+
+# Print the container to act on: $1 when non-empty, otherwise the output of
+# `docker ps -lq`.
+_pick_container() {
+  local cid="${1:-}"
+  [ -n "$cid" ] && { echo "$cid"; return; }
+  docker ps -lq
+}
+
+# Parse the optional "--container <id|name>" flag shared by rotate-keys and status,
+# reject any other argument, and print the container to use.
+_container_from_args() {
+  local cid=""
+  if [ "${1:-}" = "--container" ]; then
+    [ $# -ge 2 ] || die "--container requires a value"
+    cid="$2"; shift 2
+  fi
+  [ $# -eq 0 ] || die "unknown arg: $1"
+  cid="$(_pick_container "$cid")"
+  [ -n "$cid" ] || die "no running container; pass --container <id|name>"
+  echo "$cid"
+}
+
+# rotate-keys: call author_rotateKeys over the node's local RPC, print the reply,
+# and restart the container only when the reply carries a result.
+cmd_rotate_keys() {
+  local cid resp
+  cid="$(_container_from_args "$@")" || exit 1
+  echo ">> generating session keys in container $cid"
+  resp="$(docker exec -i "$cid" curl -sS -H "Content-Type: application/json" \
+    -d '{"id":1,"jsonrpc":"2.0","method":"author_rotateKeys","params":[]}' \
+    http://localhost:9944)" || die "author_rotateKeys RPC call failed — node not restarted"
+  printf '%s\n' "$resp"
+  # without a "result" member there is no key hex to submit, so stop here
+  case "$resp" in
+    *'"result"'*) ;;
+    *) die "author_rotateKeys returned no result — node not restarted";;
+  esac
+  echo ">> restarting node to load keys"
+  docker restart "$cid" >/dev/null
+  echo ">> submit the 'result' hex above via Set Session Key (signed by CONTROLLER)"
+  echo ">> private keys remain in the keystore on this box — never copy them off"
+}
+
+# status: list the chains directory inside the container and show recent logs.
+cmd_status() {
+  local cid
+  cid="$(_container_from_args "$@")" || exit 1
+  echo ">> chains dir (confirms network):"
+  docker exec "$cid" ls /da/node-data/chains
+  echo ">> last log lines:"
+  docker logs --tail 20 "$cid"
+}
+
+sub="${1:-}"; shift || true
+case "$sub" in
+  provision) cmd_provision "$@";;
+  rotate-keys) cmd_rotate_keys "$@";;
+  status) cmd_status "$@";;
+  *) die "usage: $0 {provision|rotate-keys|status} ...";;
+esac