From d6097b0fbe531a0d8b0dbf4ed0b04d2913b0dd92 Mon Sep 17 00:00:00 2001
From: pratikbin <68642400+pratikbin@users.noreply.github.com>
Date: Tue, 19 May 2026 13:27:31 +0530
Subject: [PATCH] feat: avail node operations skills
---
.gitignore | 4 +
README.md | 6 +
skills/avail-validator-operate/SKILL.md | 111 ++++++++++++
.../avail-validator-operate/evals/evals.json | 35 ++++
.../references/backup-recovery.md | 80 +++++++++
.../references/chill-unbond.md | 60 +++++++
.../references/monitoring.md | 106 +++++++++++
.../references/upgrade.md | 74 ++++++++
.../scripts/backup-keys.sh | 55 ++++++
.../scripts/safe-upgrade.sh | 64 +++++++
skills/avail-validator-setup/SKILL.md | 168 ++++++++++++++++++
skills/avail-validator-setup/evals/evals.json | 38 ++++
.../references/key-security.md | 71 ++++++++
.../references/networks.md | 65 +++++++
.../references/troubleshooting.md | 52 ++++++
.../scripts/avail-validator.sh | 100 +++++++++++
16 files changed, 1089 insertions(+)
create mode 100644 skills/avail-validator-operate/SKILL.md
create mode 100644 skills/avail-validator-operate/evals/evals.json
create mode 100644 skills/avail-validator-operate/references/backup-recovery.md
create mode 100644 skills/avail-validator-operate/references/chill-unbond.md
create mode 100644 skills/avail-validator-operate/references/monitoring.md
create mode 100644 skills/avail-validator-operate/references/upgrade.md
create mode 100755 skills/avail-validator-operate/scripts/backup-keys.sh
create mode 100755 skills/avail-validator-operate/scripts/safe-upgrade.sh
create mode 100644 skills/avail-validator-setup/SKILL.md
create mode 100644 skills/avail-validator-setup/evals/evals.json
create mode 100644 skills/avail-validator-setup/references/key-security.md
create mode 100644 skills/avail-validator-setup/references/networks.md
create mode 100644 skills/avail-validator-setup/references/troubleshooting.md
create mode 100755 skills/avail-validator-setup/scripts/avail-validator.sh
diff --git a/.gitignore b/.gitignore
index d0ccb93..904bb66 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,3 +67,7 @@ target/
.claude/
.claude_consciousness.m8
.opencode/
+.firecrawl/
+
+# skill-creator eval/optimization artifacts
+skills/*-workspace/
diff --git a/README.md b/README.md
index 7cd31c6..317f3b4 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,8 @@ AI agent skills for the [NodeOps](https://nodeops.network) ecosystem. Works with
| **createos** | Deploy anything to production on CreateOS cloud platform | `npx skills add https://github.com/NodeOps-app/skills --skill createos` |
| **vercel-to-createos** | Migrate Next.js, Vite, React, Vue, Svelte apps from Vercel to CreateOS | `npx skills add https://github.com/NodeOps-app/skills --skill vercel-to-createos` |
| **claude-code-to-codex** | Migrate Claude Code CLI hooks, MCP servers, plugins, instructions, and sessions to Codex CLI | `npx skills add https://github.com/NodeOps-app/skills --skill claude-code-to-codex` |
+| **avail-validator-setup** | Stand up and activate an Avail DA validator (Docker-first) — day-0 provisioning through day-1 staking and going active, on Mainnet or Turing testnet | `npx skills add https://github.com/NodeOps-app/skills --skill avail-validator-setup` |
+| **avail-validator-operate** | Day-2 ops for a live Avail DA validator — monitoring, slash-safe upgrades, key backup, chill/unbond, disaster recovery without equivocation | `npx skills add https://github.com/NodeOps-app/skills --skill avail-validator-operate` |
### Migration skills
@@ -18,6 +20,10 @@ AI agent skills for the [NodeOps](https://nodeops.network) ecosystem. Works with
`claude-code-to-codex` migrates Claude Code CLI setups to Codex CLI, with focused coverage for hooks, Claude Code CLI MCP servers, plugins, and session handoff.
+### Avail validator skills
+
+`avail-validator-setup` and `avail-validator-operate` cover the full lifecycle of an [Avail DA](https://docs.availproject.org/docs/da/operate/become-a-validator) validator, Docker-first and network-parameterized (Mainnet / Turing testnet). `avail-validator-setup` handles day-0 provisioning through day-1 session keys, bonding, and going active; `avail-validator-operate` handles day-2 monitoring, slash-safe upgrades, encrypted key backup, chill/unbond, and disaster recovery — every procedure built around avoiding equivocation/double-signing.
+
## CreateOS Authentication
The `createos` skill can be used in two modes:
diff --git a/skills/avail-validator-operate/SKILL.md b/skills/avail-validator-operate/SKILL.md
new file mode 100644
index 0000000..6dd274c
--- /dev/null
+++ b/skills/avail-validator-operate/SKILL.md
@@ -0,0 +1,111 @@
+---
+name: avail-validator-operate
+description: >-
+ Run, maintain, and protect an already-active Avail DA validator — day-2 operations.
+ Use this whenever the user needs to monitor an Avail validator (telemetry, Prometheus,
+ Grafana, alerting on missed blocks/peers/sync/era points), upgrade the node to a new
+ availj/avail image or release without getting slashed, back up the keystore/node key,
+ restore or migrate a validator after a server loss WITHOUT double-signing, chill /
+ stop validating cleanly (staking.chill), unbond, or handle equivocation/slashing risk
+ and disaster recovery. Triggers on phrases like "monitor my avail validator", "set up
+ grafana for avail", "upgrade avail node safely", "avail validator slashed", "back up
+ avail keystore", "migrate avail validator to new server", "stop validating avail",
+ "chill my avail validator", "avail node equivocation", "restore avail validator".
+ For first-time setup, session-key generation, bonding and going active, use the
+ avail-validator-setup skill instead.
+---
+
+# Avail Validator — Operate (Day 2)
+
+Keep an active Avail validator healthy and **avoid the one class of mistake that gets
+you slashed: equivocation (double-signing)**. Equivocation slashes the validator *and*
+its nominators, so every procedure here is shaped around the rule:
+
+> **The same session keystore must never be active on two running nodes at once.**
+
+Docker-first. One parameterized path covers Mainnet and Turing. Network-specific URLs
+and economics (era ≈ 24 h, 28-day unbond, reward lag) are in
+`avail-validator-setup/references/networks.md` — reuse it; don't restate values.
+
+## What day-2 covers
+
+| Task | Read |
+|---|---|
+| Monitoring & alerting | `references/monitoring.md` |
+| Node upgrade (safe vs fast) | `references/upgrade.md` |
+| Backup of secrets | `references/backup-recovery.md` |
+| Disaster recovery / server migration | `references/backup-recovery.md` |
+| Chill / unbond / withdraw | `references/chill-unbond.md` |
+| Slashing & equivocation model | this file + `references/chill-unbond.md` |
+
+Always identify the network and the running container first:
+
+```bash
+CID=$(docker ps -lq)
+docker exec "$CID" ls /da/node-data/chains # confirms chain dir / network
+docker logs --tail 30 "$CID"
+```
+
+## Monitoring (do this on day 1 of day-2)
+
+A validator you can't observe is a validator you can't protect. Stand up the metrics
+stack and alerts before anything else. Full configs (telemetry flag, `prometheus.yml`,
+Grafana install, the official dashboard JSON) and the alert thresholds that actually
+matter are in `references/monitoring.md`.
+
+Alert, at minimum, on: node down / not on telemetry, **finalized height not
+advancing**, **peer count low**, sync falling behind tip, **missed blocks / era points
+dropping**, and version drift from the latest release. A full session unresponsive →
+involuntary chill; ≥10 % of validators offline together in an epoch → all slashed.
+
+## Upgrades — the equivocation trap
+
+`docker pull` + recreate is fine for a **full/RPC node**. For an **active validator**
+it risks: (a) DB corruption → prolonged downtime → ejection from the active set, and
+(b) — if you "just spin up the new one alongside the old" — **double-signing**.
+
+Two procedures, in `references/upgrade.md`:
+
+- **Fast (acceptable downtime, single box):** stop container → recreate on the new
+ pinned tag with the same volume → verify it resumes authoring. Brief downtime, no
+ equivocation because the old node is stopped first.
+- **Slow & safe (zero downtime, two boxes):** stand up Node B on the new version,
+ `author_rotateKeys` on **B**, submit the new keys via **Set Session Key**, wait for
+ block production to move to B (confirm by **logs**, not the UI), *then* and only then
+ stop Node A. Never have both authoring with the same keys.
+
+`scripts/safe-upgrade.sh` walks the fast path with the stop-before-start ordering
+enforced. Read `references/upgrade.md` before using it.
+
+## Backups
+
+`db` is re-syncable and holds no secret — don't fixate on it. The only irreplaceable
+on-box material is `keystore/` (session keys) and `network/` (node key). Back them up
+**encrypted and off-box**, immediately and after any key rotation.
+`scripts/backup-keys.sh` produces an encrypted archive. Procedure + restore in
+`references/backup-recovery.md`.
+
+## Disaster recovery — without slashing yourself
+
+Losing the server is survivable; **restoring keys onto a new box while the old one
+might still be running is not** — that double-signs. The safe recovery paths
+(old-node-definitively-dead vs rotate-to-new-keys) are in
+`references/backup-recovery.md`. When in doubt, rotate to **new** session keys via
+`setKeys` rather than restoring the old keystore — new keys can't equivocate against
+the old.
+
+## Chill / unbond / exit
+
+Stopping cleanly is `staking.chill` (UI or extrinsic), **signed by the controller**,
+effective **next era**; funds stay bonded. Unbond → **28-day** lock → withdraw.
+Step-by-step, plus the difference between voluntary and involuntary chill and the
+slashing conditions, in `references/chill-unbond.md`.
+
+## Slashing facts to act on
+
+- Equivocation (two blocks same slot, or conflicting GRANDPA votes) → slash for
+ validator **and** nominators. Usually self-inflicted by running duplicate keys.
+- Slash shows immediately on the staking UI's slashes page, but the **financial
+ deduction is delayed days** (governance can reverse it). "Not deducted yet" ≠ "safe".
+- Involuntary chill (offline, <10 % of set) → no slash; ≥10 % offline together →
+ slash. Uptime monitoring is a slashing-prevention control, not a nicety.
diff --git a/skills/avail-validator-operate/evals/evals.json b/skills/avail-validator-operate/evals/evals.json
new file mode 100644
index 0000000..2a78a7a
--- /dev/null
+++ b/skills/avail-validator-operate/evals/evals.json
@@ -0,0 +1,35 @@
+{
+ "skill_name": "avail-validator-operate",
+ "evals": [
+ {
+ "id": 0,
+ "name": "safe-upgrade-no-equivocation",
+ "prompt": "My Avail validator on mainnet is active and producing blocks (availj/avail in Docker). A new release just dropped and I need to upgrade. I'm paranoid about getting slashed for double-signing. What's the safe upgrade procedure, and how is it different from just pulling the new image and restarting?",
+ "expected_output": "Equivocation explanation, fast stop-before-start path, slow two-box rotate-keys path, anti-patterns, high-stake recommendation, tag verification.",
+ "files": [],
+ "assertions": [
+ "Explicitly explains that double-signing/equivocation slashes the validator and its nominators, and why naive pull+restart is risky",
+ "Fast single-box path stops and confirms the old container is down BEFORE starting the new one, reusing the same volume and node name",
+ "Slow zero-downtime path: Node B on new version, author_rotateKeys for new keys, setKeys via controller, migrate confirmed via logs not UI, then stop Node A",
+ "Names the anti-pattern of running the old and new nodes simultaneously with the same/copied keystore",
+ "Recommends the slow two-box path for a high-stake active mainnet validator",
+ "Says to pin/verify the new image tag and not use :latest"
+ ]
+ },
+ {
+ "id": 1,
+ "name": "disaster-recovery-no-double-sign",
+ "prompt": "Disaster: the server running my active Avail mainnet validator just died (cloud instance gone). I have an encrypted backup of the keystore and network folders. How do I get back to validating WITHOUT equivocating? I'm not 100% sure the old instance is truly dead.",
+ "expected_output": "Equivocation rule, rotate-to-new-keys Path B due to uncertain old node, Path A only if old definitively dead, db re-syncable vs keystore/network secrets, stash/controller from seed.",
+ "files": [],
+ "assertions": [
+ "States that restoring the keystore while the old node may still be running causes double-signing = slashing",
+ "Because old-node status is uncertain, prescribes rotating to NEW session keys (Path B) rather than restoring the old keystore",
+ "Says restoring the old keystore (Path A) is acceptable only if the old instance is definitively destroyed",
+ "Notes db is re-syncable and only keystore + network are the irreplaceable secrets",
+ "States stash/controller are wallet keys recovered from seed/hardware, not from the server backup",
+ "New keys are registered on-chain via setKeys signed by the controller, with activation confirmed via logs"
+ ]
+ }
+ ]
+}
diff --git a/skills/avail-validator-operate/references/backup-recovery.md b/skills/avail-validator-operate/references/backup-recovery.md
new file mode 100644
index 0000000..200ccd3
--- /dev/null
+++ b/skills/avail-validator-operate/references/backup-recovery.md
@@ -0,0 +1,80 @@
+# Avail validator backup & disaster recovery
+
+## What to back up (and what not to)
+
+| Path (under `<base-path>/chains/<chain>/`) | Back up? | Why |
+|---|---|---|
+| `keystore/` | **Yes, encrypted** | Session keys — irreplaceable, equivocation-critical |
+| `network/` | **Yes** | Node key / libp2p identity |
+| `db/` | No | Re-syncable from genesis or snapshot; contains no secret |
+
+In Docker the base is `/da/node-data`; discover the chain dir
+(`docker exec <container> ls /da/node-data/chains`) — its name varies by node version.
+
+## Backup procedure
+
+Take a backup right after going active and after **every** key rotation. It must be
+**encrypted** and stored **off the validator box**. `scripts/backup-keys.sh` does this:
+it `tar`s `keystore/` + `network/` and encrypts with `age` (or `gpg` fallback).
+
+Manual equivalent:
+
+```bash
+CID=$(docker ps -lq)
+CHAIN_DIR=$(docker exec "$CID" sh -c 'ls -d /da/node-data/chains/*' | head -1)
+docker exec "$CID" tar -C "$CHAIN_DIR" -czf - keystore network \
+ | age -r <age-recipient> > avail-keys-$(date +%F).tar.gz.age
+# move the .age file off-box (it is the validator's identity — guard it)
+```
+
+Never store the archive unencrypted, and never store it on the same machine only.
+
+## Re-sync the DB (no secrets involved)
+
+If only the DB is bad (corruption, disk), you do **not** need keys back — keep the
+keystore in place and rebuild state:
+
+```bash
+# stop node, then purge chain data and let it re-sync
+avail purge-chain # binary form; in Docker: stop container, delete db/ in the volume, restart
+```
+
+Or restore from a trusted DB snapshot to skip a long genesis sync (warp sync is not
+available). Trust the snapshot source.
+
+## Disaster recovery — the rule that prevents self-slashing
+
+> Restoring the keystore onto a new node **while the old node is or might still be
+> running** double-signs → equivocation → slash (validator **and** nominators).
+
+Choose the safe path:
+
+### Path A — old node is definitively dead
+Use only when you are *certain* the old machine can never produce blocks again
+(destroyed/wiped, disk pulled, account access revoked — not merely "I think it's off").
+
+1. Provision a fresh node (setup skill), same `--chain`/`--name`, let it sync.
+2. Stop it. Restore `keystore/` + `network/` from the encrypted backup into the
+ volume's `chains/<chain>/`.
+3. Start it. It resumes the **same** validator identity. Confirm authoring via logs.
+
+### Path B — old node status uncertain (preferred default)
+If there is *any* doubt the old node is gone, do **not** restore the old keystore.
+Instead rotate to **new** keys — new keys cannot equivocate against the old:
+
+1. Provision a fresh node, sync it.
+2. `author_rotateKeys` on the new node (new session keys).
+3. **Set Session Key** to the new hex (controller-signed) via the staking UI.
+4. Wait for authoring to move to the new node — confirm by **logs**, not the UI.
+5. The old node, even if it later comes back, is signing with keys no longer
+ registered on-chain → it cannot equivocate. Decommission it when reachable.
+
+Path B trades nothing meaningful (the validator account/stake is unchanged — only the
+session keys rotate) for complete equivocation safety. Default to it.
+
+## Stash / controller recovery
+
+The stash and controller are **wallet** keys, never on the box — recover them from the
+operator's seed/hardware wallet, not from server backups. If the controller seed is
+compromised, the stash funds are still safe (separation), but rotate the controller and
+re-`setKeys`/`validate` from the new controller promptly.
diff --git a/skills/avail-validator-operate/references/chill-unbond.md b/skills/avail-validator-operate/references/chill-unbond.md
new file mode 100644
index 0000000..81e15f7
--- /dev/null
+++ b/skills/avail-validator-operate/references/chill-unbond.md
@@ -0,0 +1,60 @@
+# Chill, unbond, exit — Avail validator
+
+Stopping validation cleanly is a staking action, not a server action. Killing the
+container alone does **not** chill you — you'd be an offline validator (involuntary
+chill, possible slash if many are offline together). Always chill on-chain *first*,
+then it's safe to stop the node.
+
+## Chill (stop validating, keep funds bonded)
+
+`staking.chill` removes you from the active/waiting set without unbonding.
+
+- **Where:** staking actions UI (network URL in
+ `avail-validator-setup/references/networks.md`) → your account → **Stop**, or submit
+ the `staking.chill` extrinsic directly.
+- **Signed by:** the **controller** account (not the stash).
+- **Effective:** next era (~24 h). Funds remain bonded; you simply stop being
+ selectable for new/revised nominations.
+- After chill takes effect (confirm you're out of the active set on the dashboard and
+ logs no longer show `🎁 Prepared block for proposing`), it is safe to stop/decommission
+ the node.
+
+### Voluntary vs involuntary chill
+- **Voluntary:** you called `chill`. Clean. No slash.
+- **Involuntary:** the network chilled you for being unresponsive a full session. No
+ slash by itself — but if ≥10% of validators are offline together in an epoch, that
+ whole group is slashed. So "I'll just turn it off" is risky; chill explicitly.
+
+## Unbond (start releasing the stake)
+
+After chilling, to free the bonded funds:
+
+1. `staking.unbond` the amount (controller-signed).
+2. **28-day** unbonding lock — funds are non-transferable during this period.
+3. After 28 days, `withdrawUnbonded` to make them transferable.
+
+You can chill without unbonding (pause validating, keep stake) or unbond a partial
+amount and keep validating with the rest (as long as you stay above the waiting-list
+floor — see networks.md economics).
+
+## Full exit checklist
+
+1. `staking.chill` (controller) → wait one era, confirm out of active set via logs +
+ dashboard.
+2. Stop & decommission the node container.
+3. `staking.unbond` the full bonded amount (controller).
+4. Wait 28 days.
+5. `withdrawUnbonded` (controller). Funds now transferable from the stash.
+6. Securely destroy the on-box `keystore/` only after you're certain you won't rejoin
+ with the same identity (otherwise keep the encrypted backup).
+
+## Slashing context (why the order matters)
+
+- Equivocation slashes regardless of chill status — it's about duplicate signing, so
+ don't run the old node again with live keys after migrating.
+- Slash appears immediately on the staking UI slashes page; the **financial deduction
+ is delayed several days** and governance can reverse it. Don't assume safety from
+ "balance not changed yet."
+- Chilling promptly when you know you'll be offline (maintenance, migration) converts a
+ potential slash scenario into a clean no-penalty exit. Treat chill as the standard
+ pre-maintenance step for anything that risks a full session of downtime.
diff --git a/skills/avail-validator-operate/references/monitoring.md b/skills/avail-validator-operate/references/monitoring.md
new file mode 100644
index 0000000..f70ece8
--- /dev/null
+++ b/skills/avail-validator-operate/references/monitoring.md
@@ -0,0 +1,106 @@
+# Avail validator monitoring
+
+Three layers: built-in **telemetry** (free, public, low effort), **Prometheus + Grafana**
+(your own metrics + alerts), and the **alert rules that actually prevent slashing**.
+
+## 1. Telemetry (built-in)
+
+The node auto-streams to Avail's public telemetry; pick the network tab at
+`http://telemetry.avail.so/` and find your node by its `--name`. It is configured via
+the chain spec — no flag needed for the default. To force a telemetry endpoint
+explicitly (binary form):
+
+```
+./data-avail --validator \
+ --port 30333 \
+ --base-path `pwd`/data \
+ --chain `pwd`/chainspec.raw.json \
+ --name AvailNode \
+ --telemetry-url 'ws://telemetry.avail.tools:8001/submit/ 0'
+```
+
+Telemetry is a convenience dashboard, **not** an alerting system. Use it for a quick
+"is my node visible and at tip" check; rely on Prometheus for paging.
+
+## 2. Prometheus + Grafana (own stack)
+
+The node exposes Prometheus metrics on `:9615` (localhost-bound on a validator — scrape
+from the same host, or add `--prometheus-external` only if you firewall it).
+
+Install Prometheus + node-exporter:
+
+```
+sudo apt-get install -y prometheus prometheus-node-exporter
+```
+
+`prometheus.yml` (node metrics on 9615, host metrics on 9100):
+
+```
+cat > $HOME/prometheus.yml << EOF
+global:
+ scrape_interval: 15s
+ evaluation_interval: 15s
+
+scrape_configs:
+ - job_name: "prometheus"
+ scrape_interval: 5s
+ static_configs:
+ - targets: ["localhost:9090"]
+ - job_name: "avail_node"
+ scrape_interval: 5s
+ static_configs:
+ - targets: ["localhost:9615"]
+ - job_name: node
+ static_configs:
+ - targets: ['localhost:9100']
+EOF
+sudo mv $HOME/prometheus.yml /etc/prometheus/prometheus.yml
+sudo chmod 644 /etc/prometheus/prometheus.yml
+sudo systemctl enable prometheus.service prometheus-node-exporter.service
+sudo systemctl restart prometheus.service prometheus-node-exporter.service
+sudo systemctl status prometheus.service prometheus-node-exporter.service
+```
+
+Install Grafana:
+
+```
+sudo mkdir -p /etc/apt/keyrings  # apt-key is deprecated: keep the repo key in a dedicated signed-by keyring
+wget -q -O - https://apt.grafana.com/gpg.key | gpg --dearmor | sudo tee /etc/apt/keyrings/grafana.gpg > /dev/null
+echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com stable main" | sudo tee /etc/apt/sources.list.d/grafana.list
+sudo apt-get update && sudo apt-get install -y grafana
+sudo systemctl enable grafana-server.service
+sudo systemctl start grafana-server.service
+sudo systemctl status grafana-server.service
+sudo ufw allow from <trusted-ip> to any port 3000 proto tcp  # never open Grafana to the world on a validator host
+```
+
+Grafana UI on `http://<host>:3000` (default `admin/admin`, forced reset). Add a
+Prometheus data source pointing at `http://localhost:9090`. Import Avail's official
+validator dashboard JSON:
+
+```
+https://raw.githubusercontent.com/availproject/docs/main/static/validator_metrics.json
+```
+
+> If the node runs in Docker, scrape works because `9615` is published to `127.0.0.1`
+> on the host (the setup skill binds it there). Keep Grafana's `3000` firewalled to
+> trusted IPs — don't `ufw allow` it open to the world on a validator host.
+
+## 3. Alerts that prevent slashing / ejection
+
+Page on these, not just CPU/disk:
+
+| Alert | Why it matters |
+|---|---|
+| Finalized block height not advancing for N min | Node forked/stalled — losing era points, heading for ejection |
+| Best block not advancing / `⚙️ Syncing` for long | Falling behind tip; will miss authoring slots |
+| Peer count below threshold (e.g. <3) | p2p/network problem; precedes desync |
+| Node process down / absent from telemetry | Offline → involuntary chill; if ≥10% of set offline → **slash** |
+| Era points / blocks authored dropping vs peers | Underperforming → lower rewards, election risk |
+| Running image tag ≠ latest avail release | Missing consensus-relevant fixes; runtime upgrade may require new client |
+| Disk < ~20% free | DB growth; node crash → downtime |
+
+Uptime is a **slashing-prevention control**. A full session unresponsive →
+involuntary chill (no slash, but you stop earning and must rejoin). ≥10% of validators
+offline together in an epoch → everyone in that group slashed. Treat "node down" as a
+page-now incident, not a morning-review item.
diff --git a/skills/avail-validator-operate/references/upgrade.md b/skills/avail-validator-operate/references/upgrade.md
new file mode 100644
index 0000000..1b435ab
--- /dev/null
+++ b/skills/avail-validator-operate/references/upgrade.md
@@ -0,0 +1,74 @@
+# Avail validator upgrades
+
+Goal: move to a newer `availj/avail` release without (a) corrupting the DB into long
+downtime or (b) double-signing. Pick the procedure by your downtime tolerance.
+
+## First: pick and verify the new tag
+
+Never `:latest`. Find the new release and verify before pulling:
+
+```bash
+curl -s https://api.github.com/repos/availproject/avail/releases/latest | grep -m1 '"tag_name"'
+skopeo inspect docker://docker.io/availj/avail:<tag> | grep -E 'Digest|Created'
+```
+
+Read the release notes — a node release sometimes pairs with an on-chain runtime
+upgrade. Runtime upgrades themselves are forkless and applied by governance; the
+operator's only job is running a client new enough to follow them.
+
+## Fast upgrade — single box, brief downtime
+
+Acceptable when a few minutes of missed authoring is tolerable. Safe against
+equivocation **because the old node is stopped before the new one starts** — same
+keystore is never live twice.
+
+```bash
+CID=$(docker ps -lq)
+docker stop "$CID" # old node definitively down first
+docker rename "$CID" avail-prev-$(date +%s) 2>/dev/null || true
+docker run --restart=on-failure -d \
+ -v /root/avail/node-data:/da/node-data \
+ -p 30333:30333 -p 127.0.0.1:9944:9944 -p 127.0.0.1:9615:9615 \
+ docker.io/availj/avail:<tag> \
+ --chain <mainnet|turing> -d /da/node-data --validator --name <node-name>
+docker logs -f $(docker ps -lq)
+```
+
+Reuse the **same volume and same `--name`**. Confirm recovery: `✨ Imported #N` rising,
+then `🎁 Prepared block for proposing` once your next slot comes. If the new version
+won't start or the DB is corrupt, roll back: stop the new container, restart the
+renamed old one (`docker start avail-prev-…`). Keep the old container until the new one
+has authored a block.
+
+`scripts/safe-upgrade.sh` performs exactly this stop-then-start ordering and refuses to
+start the new container until the old one is confirmed stopped.
+
+## Slow & safe upgrade — two boxes, zero downtime
+
+Use for mainnet / high-stake where you can't miss slots. The trick: the new machine
+gets **brand-new session keys**, so the two machines never share keys and cannot
+equivocate against each other.
+
+1. Provision **Node B** on the new version, same `--chain`/config, fully synced.
+2. On **Node B**: `author_rotateKeys` (see setup skill / `avail-validator.sh
+ rotate-keys`). Save the new hex.
+3. On the staking actions UI (network URL in setup skill networks.md), **Set Session
+ Key** to Node B's new hex, signed by the **controller**. Both old and new keys are
+ shown for an epoch or two, then only the new.
+4. Wait for authoring to migrate to B. **Confirm via logs, not the UI** — the UI can
+ show the switch an epoch before it is real. You want: `🎁 Prepared block for
+ proposing` appearing on **B** and stopping on **A**.
+5. **Only then** stop Node A. Upgrade A; optionally repeat to switch back.
+
+At no point are both nodes authoring with the same keys — that is the whole point.
+
+## Anti-patterns
+
+- "Spin up the upgraded node next to the old one with the copied keystore, then kill
+ the old one." → both live with identical keys = **equivocation = slash**. Use the
+ two-box flow with `rotate-keys` instead, or the fast flow that stops first.
+- Trusting the staking UI's switchover timing. Trust block-production logs.
+- Fast-upgrading a high-stake mainnet validator during its expected authoring window —
+ prefer slow & safe there.
+- Bumping the tag but not reading release notes — you may need the new client for a
+ scheduled runtime upgrade and not know it.
diff --git a/skills/avail-validator-operate/scripts/backup-keys.sh b/skills/avail-validator-operate/scripts/backup-keys.sh
new file mode 100755
index 0000000..9ab0661
--- /dev/null
+++ b/skills/avail-validator-operate/scripts/backup-keys.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# Encrypted off-box backup of an Avail validator's irreplaceable secrets.
+# Backs up ONLY keystore/ (session keys) and network/ (node key) — db/ is
+# re-syncable and is intentionally excluded.
+#
+# Usage:
+#   backup-keys.sh --recipient <age-recipient> [--container <id>] [--out DIR]
+#   backup-keys.sh --gpg [--container <id>] [--out DIR]
+#
+#   --recipient : encrypt with `age` to this recipient (preferred; wins if --gpg is also given)
+#   --gpg       : fall back to symmetric `gpg -c` (you'll be prompted for a passphrase)
+#
+# The resulting archive IS the validator identity. Move it off this host and guard it
+# like a private key. Never keep the only copy on the validator box.
+set -euo pipefail
+umask 077 # archive (and --out, if created here) is accessible to the invoking user only
+
+CID="" RECIPIENT="" USE_GPG=0 OUT="." DEST="" DONE=0
+trap '[ "$DONE" = 1 ] || [ -z "$DEST" ] || rm -f -- "$DEST"' EXIT # never leave a truncated "backup"
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --recipient) RECIPIENT="${2?--recipient needs a value}"; shift 2;;
+    --gpg) USE_GPG=1; shift;;
+    --container) CID="${2?--container needs a value}"; shift 2;;
+    --out) OUT="${2?--out needs a value}"; shift 2;;
+    *) echo "unknown arg: $1" >&2; exit 1;;
+  esac
+done
+[ -n "$CID" ] || CID="$(docker ps -lq)"
+[ -n "$CID" ] || { echo "error: no running container; pass --container <id>" >&2; exit 1; }
+
+CHAIN_DIR="$(docker exec "$CID" sh -c 'ls -d /da/node-data/chains/* 2>/dev/null | head -1')"
+[ -n "$CHAIN_DIR" ] || { echo "error: could not locate chains dir in container" >&2; exit 1; }
+echo ">> backing up keystore/ + network/ from $CHAIN_DIR (db/ excluded by design)"
+STAMP="$(date +%F-%H%M%S)"
+mkdir -p "$OUT"
+
+if [ -n "$RECIPIENT" ]; then
+  command -v age >/dev/null 2>&1 || { echo "error: age not installed" >&2; exit 1; }
+  DEST="$OUT/avail-keys-$STAMP.tar.gz.age"
+  docker exec "$CID" tar -C "$CHAIN_DIR" -czf - keystore network \
+    | age -r "$RECIPIENT" > "$DEST"
+elif [ "$USE_GPG" = "1" ]; then
+  command -v gpg >/dev/null 2>&1 || { echo "error: gpg not installed" >&2; exit 1; }
+  DEST="$OUT/avail-keys-$STAMP.tar.gz.gpg"
+  docker exec "$CID" tar -C "$CHAIN_DIR" -czf - keystore network \
+    | gpg -c --cipher-algo AES256 -o "$DEST"
+else
+  echo "error: pass --recipient or --gpg (refusing to write plaintext)" >&2
+  exit 1
+fi
+DONE=1 # pipeline succeeded under pipefail: the EXIT trap must now keep the archive
+echo ">> wrote $DEST"
+echo ">> MOVE this off the validator host now. It is the validator's identity."
+echo ">> verify it restores before you rely on it (see references/backup-recovery.md)."
diff --git a/skills/avail-validator-operate/scripts/safe-upgrade.sh b/skills/avail-validator-operate/scripts/safe-upgrade.sh
new file mode 100755
index 0000000..65ef819
--- /dev/null
+++ b/skills/avail-validator-operate/scripts/safe-upgrade.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+# Fast single-box Avail validator upgrade with equivocation-safe ordering:
+# the old container is STOPPED and confirmed down BEFORE the new one starts, so the
+# same session keystore is never live on two nodes. Same volume + same --name are
+# reused, so the validator identity is preserved (no setKeys needed for this path).
+#
+# For zero-downtime / high-stake mainnet, use the two-box rotate-keys flow in
+# references/upgrade.md instead — NOT this script.
+#
+# Usage:
+#   safe-upgrade.sh --chain <mainnet|turing> --tag <tag> --name <node-name> \
+#                   [--container <id-or-name>] [--data DIR]
+#
+# Options:
+#   --chain      mainnet or turing (required)
+#   --tag        image tag to run; empty or "latest" is rejected (required)
+#   --name       node name handed to the node's own --name flag (required)
+#   --container  container to replace (default: whatever `docker ps -lq` prints)
+#   --data       host dir mounted at /da/node-data (default: /root/avail/node-data)
+set -euo pipefail
+
+CID="" CHAIN="" TAG="" NAME="" DATA_DIR="/root/avail/node-data"
+IMAGE_REPO="docker.io/availj/avail"
+while [ $# -gt 0 ]; do
+  case "$1" in
+    # ${2?...} stops with a readable message when a flag is passed without a value.
+    --chain)     CHAIN="${2?--chain needs a value}"; shift 2;;
+    --tag)       TAG="${2?--tag needs a value}"; shift 2;;
+    --name)      NAME="${2?--name needs a value}"; shift 2;;
+    --container) CID="${2?--container needs a value}"; shift 2;;
+    --data)      DATA_DIR="${2?--data needs a value}"; shift 2;;
+    *) echo "unknown arg: $1" >&2; exit 1;;
+  esac
+done
+[ "$CHAIN" = "mainnet" ] || [ "$CHAIN" = "turing" ] || { echo "error: --chain mainnet|turing" >&2; exit 1; }
+# The message says "never :latest", so refuse that tag instead of only mentioning it.
+case "$TAG" in
+  ""|latest) echo "error: --tag required (never :latest)" >&2; exit 1;;
+esac
+[ -n "$NAME" ] || { echo "error: --name required (reuse the SAME node name)" >&2; exit 1; }
+[ -n "$CID" ] || CID="$(docker ps -lq)"
+[ -n "$CID" ] || { echo "error: no running container; pass --container <id>" >&2; exit 1; }
+
+echo ">> verifying $IMAGE_REPO:$TAG before pull"
+if command -v skopeo >/dev/null 2>&1; then
+  skopeo inspect "docker://$IMAGE_REPO:$TAG" | grep -E '"Digest"|"Created"' \
+    || { echo "error: cannot inspect $IMAGE_REPO:$TAG" >&2; exit 1; }
+else
+  echo " skopeo not found — skipping digest verification"
+fi
+
+# Pull while the old node is still up: a bad tag or an unreachable registry must
+# fail here, not after the validator has already been stopped.
+echo ">> pulling $IMAGE_REPO:$TAG"
+docker pull "$IMAGE_REPO:$TAG" >/dev/null
+
+echo ">> stopping old container $CID (must be down before new one starts)"
+docker stop "$CID" >/dev/null
+# hard gate: refuse to continue unless the old node is actually not running.
+# `docker inspect` resolves names as well as IDs; a `docker ps --filter id=` probe
+# matches IDs only and would report "nothing running" whenever --container was
+# given a name. Anything other than an explicit "false" aborts.
+OLD_RUNNING="$(docker inspect -f '{{.State.Running}}' "$CID" 2>/dev/null || echo unknown)"
+if [ "$OLD_RUNNING" != "false" ]; then
+  echo "error: old container still running — aborting to avoid double-signing" >&2
+  exit 1
+fi
+# Park the old container under a known name so the rollback hints below are exact;
+# if the rename fails, keep referring to it by the reference we already have.
+PREV_NAME="avail-prev-$(date +%s)"
+if ! docker rename "$CID" "$PREV_NAME" 2>/dev/null; then
+  PREV_NAME="$CID"
+fi
+echo ">> old node confirmed stopped — starting new node on tag $TAG"
+
+# `docker run -d` prints the ID of the container it created, so capture that
+# instead of asking `docker ps -l` afterwards. The old node is already down at this
+# point, so a failed start must tell the operator how to bring it back.
+NEW_CID="$(docker run --restart=on-failure -d \
+  -v "$DATA_DIR:/da/node-data" \
+  -p 30333:30333 \
+  -p 127.0.0.1:9944:9944 \
+  -p 127.0.0.1:9615:9615 \
+  "$IMAGE_REPO:$TAG" \
+  --chain "$CHAIN" -d /da/node-data --validator --name "$NAME")" || {
+  echo "error: new container failed to start; the old node is stopped" >&2
+  echo ">> rollback: docker start $PREV_NAME" >&2
+  exit 1
+}
+
+echo ">> new container $NEW_CID started"
+echo ">> watch: docker logs -f $NEW_CID"
+echo ">> healthy = '✨ Imported #N' rising, then '🎁 Prepared block for proposing'"
+# `&&`, not `;`: the old node may only start once the new one has really stopped.
+echo ">> rollback if DB corrupt: docker stop $NEW_CID && docker start $PREV_NAME"
+echo ">> keep the renamed old container until the new one has authored a block"
diff --git a/skills/avail-validator-setup/SKILL.md b/skills/avail-validator-setup/SKILL.md
new file mode 100644
index 0000000..72a795f
--- /dev/null
+++ b/skills/avail-validator-setup/SKILL.md
@@ -0,0 +1,168 @@
+---
+name: avail-validator-setup
+description: >-
+ Stand up and activate an Avail DA validator node (Docker-first) from scratch — day-0 provisioning
+ through day-1 staking and going active. Use this whenever the user wants to run, deploy, set up,
+ bootstrap, spin up, or "become" an Avail validator on Mainnet or Turing testnet; generate/rotate
+ session keys (author_rotateKeys); bond stake, set session keys on-chain (setKeys), register as a
+ validator (staking.validate) with a commission; pick the right chain/RPC/endpoints per network;
+ or securely store validator keys (keystore, node key, stash/controller). Triggers on phrases like
+ "run an Avail validator", "set up avail node", "availj/avail docker", "rotate session keys avail",
+ "stake my avail validator", "join the avail validator set", "avail turing testnet validator",
+ "secure my avail keystore". For ongoing day-2 work (monitoring, upgrades, backups, chill,
+ recovery) use the avail-validator-operate skill instead.
+---
+
+# Avail Validator — Setup (Day 0 + Day 1)
+
+Bring an Avail Data Availability validator from bare machine to active block producer.
+**Docker-first.** One parameterized path covers Mainnet and Turing testnet — only the
+`--chain` value and a handful of endpoints differ.
+
+A validator is a specialized full node that produces blocks (BABE) and finalizes them
+(GRANDPA) under Nominated Proof of Stake. It must be **bonded and registered on-chain**
+and must hold **session keys** in its local keystore, or it stays stuck at block 0.
+
+## The two phases
+
+- **Day 0 — Provision**: run a synced node container, firewalled correctly.
+- **Day 1 — Activate**: generate session keys, create accounts, bond, set keys on-chain,
+ declare validator intent with a commission, wait for the next era.
+
+Do them in order. Day 1 needs a fully synced Day 0 node.
+
+## Network parameters
+
+Pick the network with the user before any command. Everything network-specific is in
+`references/networks.md` — read it now and substitute concretely; never leave
+`<chain>` / `<tag>` placeholders in commands you run.
+
+| Need | Mainnet | Turing testnet |
+|---|---|---|
+| `--chain` value | `mainnet` | `turing` |
+| Test tokens | buy/bridge AVAIL | request from Turing faucet (see networks.md) |
+
+There is **no `--chain testnet`** — the only supported testnet is `turing`. Omitting
+`--chain` errors out: `Please specify which chain you want to run, e.g. --chain mainnet`.
+
+## Day 0 — Provision
+
+### 1. Host
+
+Minimum 8 GB RAM / 4 cores / SSD; recommended 16 GB / 8 cores / 200–300 GB SSD (chain
+grows). Linux server distro. Open **p2p port 30333** inbound. Do **not** expose RPC
+(9944) or metrics (9615) to the public internet on a validator.
+
+### 2. Pick a secure image tag — never `:latest`
+
+The image is `docker.io/availj/avail`. Always pin an explicit released tag and verify
+it before pulling (a validator must run a known-good binary):
+
+```bash
+# newest releases — cross-check the version announced in Avail's Discord
+curl -s https://api.github.com/repos/availproject/avail/releases/latest | grep -m1 '"tag_name"'
+# verify the tag resolves and inspect its digest before use
+skopeo inspect docker://docker.io/availj/avail:<tag> | grep -E 'Digest|Created'
+```
+
+Pin by digest in production if possible: `docker.io/availj/avail@sha256:<digest>`.
+
+### 3. Run the node
+
+`scripts/avail-validator.sh` wraps the exact upstream `docker run` with secure defaults
+(restart policy, bind-mounted host data dir, localhost-only RPC/metrics, skopeo tag check). Prefer it:
+
+```bash
+scripts/avail-validator.sh provision --chain <mainnet|turing> --tag <tag> --name <node-name>
+```
+
+It is the canonical upstream command, parameterized:
+
+```bash
+docker run --restart=on-failure -d \
+ -v /root/avail/node-data:/da/node-data \
+ -p 30333:30333 -p 127.0.0.1:9944:9944 -p 127.0.0.1:9615:9615 \
+ docker.io/availj/avail:<tag> \
+ --chain <mainnet|turing> -d /da/node-data --validator --name <node-name>
+```
+
+Note the deliberate hardening vs. the upstream docs: RPC `9944` and metrics `9615` are
+bound to `127.0.0.1` only (upstream publishes 9944 on `0.0.0.0`, which is unsafe for a
+validator). p2p `30333` is the only port that must be world-reachable.
+
+### 4. Wait for full sync
+
+```bash
+docker logs -f $(docker ps -lq)
+```
+
+`⚙️ Syncing …` → still catching up. `💤 Idle (N peers) … best: #X, finalized #Y` with X
+advancing and N>0 → synced. Role line must read `👤 Role: AUTHORITY` (that confirms
+`--validator`). **Do not start Day 1 until fully synced and finalizing.**
+
+## Day 1 — Activate
+
+### 1. Generate session keys (on the node)
+
+`author_rotateKeys` creates the four session keys (babe, gran, imon, audi) inside the
+container's keystore and returns their concatenated public hex:
+
+```bash
+CID=$(docker ps -lq)
+docker exec -i "$CID" curl -sH "Content-Type: application/json" \
+ -d '{"id":1,"jsonrpc":"2.0","method":"author_rotateKeys","params":[]}' \
+ http://localhost:9944
+docker restart "$CID"
+```
+
+(`scripts/avail-validator.sh rotate-keys` does exactly this and prints the hex.) Save
+the `result` hex — it is submitted on-chain next. The private keys stay on disk in the
+keystore and must **never** leave the box. See `references/key-security.md`.
+
+### 2. Create stash + controller accounts
+
+On the explorer wallet (URLs in `references/networks.md`) create **two separate**
+accounts: a **stash** (holds bonded funds — keep in cold/hardware storage) and a
+**controller** (signs `setKeys`, `validate`, `chill` — used routinely). Separation is a
+security control: a compromised controller cannot move bonded funds. Fund the stash;
+keep a little in the controller for fees. Detail: `references/key-security.md`.
+
+### 3. Bond, set keys, validate
+
+Via the staking UI (network-specific URL in `references/networks.md`):
+
+1. **Bond** from the stash — at least **50,000 AVAIL** to enter the waiting list. Don't
+ bond everything; leave fee headroom. Unbonding later is locked **28 days**.
+2. **Set Session Key** — paste the `author_rotateKeys` hex (the `setKeys` extrinsic,
+ signed by the controller). The button then changes to **Validate**.
+3. **Validate** — set your **commission %** and submit (`staking.validate`). This
+ declares validator intent.
+
+### 4. Become active
+
+You enter the **Waiting** set. The active set is re-elected each **era (~24 h)** by
+stake. If your stake is high enough you're elected within an era or two. Confirm by
+node logs showing `🎁 Prepared block for proposing` — not by the UI, which can show
+the change before it is real. No rewards in the era you stake; first payout ~era N+3.
+
+## Key security — non-negotiable
+
+Read `references/key-security.md` and apply it during Day 1. The load-bearing rules:
+
+- The **keystore** (`<base-path>/chains/<chain-id>/keystore`) and **network** (node key)
+ dirs are the only irreplaceable on-box secrets. `db` is re-syncable.
+- **Never run two nodes with the same keystore at once** — double-signing is
+ equivocation and is slashed (validator *and* nominators). This dictates how upgrades
+ and recovery are done (see avail-validator-operate).
+- Stash in cold/hardware storage; stash ≠ controller.
+
+## When stuck
+
+`references/troubleshooting.md` — no peers, sync stalled, stays `FULL` not `AUTHORITY`,
+not producing blocks after election, key/account mismatch.
+
+## Handing off to day-2
+
+Once the validator is producing blocks, ongoing monitoring, upgrades, backups, chill,
+and disaster recovery are the **avail-validator-operate** skill's job. Point the user
+there rather than improvising those here — upgrade/recovery have equivocation traps.
diff --git a/skills/avail-validator-setup/evals/evals.json b/skills/avail-validator-setup/evals/evals.json
new file mode 100644
index 0000000..3792abc
--- /dev/null
+++ b/skills/avail-validator-setup/evals/evals.json
@@ -0,0 +1,38 @@
+{
+ "skill_name": "avail-validator-setup",
+ "evals": [
+ {
+ "id": 0,
+ "name": "turing-full-lifecycle-docker",
+ "prompt": "I want to run an Avail validator on the Turing testnet using Docker on a fresh Ubuntu box. Walk me through everything from getting the node container up and synced to actually becoming an active validator that produces blocks with my stake. I keep getting confused about session keys vs my wallet accounts.",
+ "expected_output": "Ordered day-0 then day-1 runbook with Turing-concrete commands; clear session-keys-vs-wallet-accounts explanation; confirm-via-logs.",
+ "files": [],
+ "assertions": [
+ "Uses --chain turing for the testnet (does not invent a 'testnet' chain alias or use mainnet)",
+ "Pins an explicit availj/avail image tag and says not to use :latest, with a verify step before pull",
+ "docker run includes --validator and binds RPC 9944 / metrics 9615 to localhost while exposing only p2p 30333",
+ "Tells the user to wait for Role: AUTHORITY and a synced/idle node before starting day-1 staking",
+ "Generates session keys with author_rotateKeys via docker exec and restarts the node afterward",
+ "Clearly distinguishes on-box session keys from off-box wallet stash/controller accounts",
+ "Bonds from the stash (>=50000 AVAIL) and submits setKeys + validate signed by the controller",
+ "Says to confirm the validator is active via node logs (block proposing), not the staking UI alone"
+ ]
+ },
+ {
+ "id": 1,
+ "name": "mainnet-keys-and-port-hardening",
+ "prompt": "Setting up an Avail mainnet validator with the availj/avail docker image. Two things I care about: (1) the secure way to generate and register session keys so I don't get slashed, and (2) locking down the box so my RPC port isn't exposed to the internet and the keys are safe. Where do the private keys actually live?",
+ "expected_output": "Mainnet-concrete secure setup: tag verification, localhost-bound RPC/metrics, key-category model, on-disk key locations, equivocation rule, encrypted off-box backup.",
+ "files": [],
+ "assertions": [
+ "Uses --chain mainnet",
+ "Secure tag selection: explicitly avoids :latest and verifies the tag/digest (e.g. skopeo) before pulling",
+ "Binds RPC 9944 and metrics 9615 to 127.0.0.1 and exposes only p2p 30333 publicly",
+ "States private session keys live in <base-path>/chains/<chain-id>/keystore inside the Docker volume, and node key in network/",
+ "Explains the three key categories and that stash and controller must be separate accounts kept off the box",
+ "States never run two nodes with the same keystore (equivocation = slashing of validator and nominators)",
+ "Recommends an encrypted off-box backup of keystore + network and restrictive file permissions"
+ ]
+ }
+ ]
+}
diff --git a/skills/avail-validator-setup/references/key-security.md b/skills/avail-validator-setup/references/key-security.md
new file mode 100644
index 0000000..fd9c5bc
--- /dev/null
+++ b/skills/avail-validator-setup/references/key-security.md
@@ -0,0 +1,71 @@
+# Avail validator key security
+
+A validator manages three distinct categories of key material. Confusing them is how
+operators get slashed or drained. Treat this as the security contract for the skill.
+
+## The three categories
+
+### 1. Session keys — the equivocation-critical secret
+- **What:** four keys — `babe` (block production), `gran` (GRANDPA finality, ed25519),
+ `imon` (ImOnline heartbeat), `audi` (authority discovery). Generated by
+ `author_rotateKeys`, which returns only the concatenated **public** hex.
+- **Where:** encrypted files in `<base-path>/chains/<chain-id>/keystore/`. In Docker
+ that is `/da/node-data/chains/<chain-id>/keystore/` inside the volume.
+- **Rule:** the private keystore must **never leave the box and never exist on two
+ running nodes at the same time.** Two nodes signing with the same session keys =
+ equivocation = slashing of the validator *and* its nominators. This single rule
+ dictates the safe upgrade/recovery procedures in avail-validator-operate.
+- **Rotation:** `author_rotateKeys` + on-chain `setKeys` is the safe way to move the
+ validator identity between machines — not copying the keystore.
+
+### 2. Node key (libp2p identity)
+- **What:** the node's persistent peer identity (`12D3Koo…`).
+- **Where:** `<base-path>/chains/<chain-id>/network/`.
+- **Rule:** back it up with the keystore. Not equivocation-critical, but losing it
+ changes your peer ID and telemetry identity. Never share it.
+
+### 3. Stash & controller account keys — wallet keys, NOT on the validator box
+- **Stash:** holds the bonded funds. Keep in **cold storage / hardware wallet**. Used
+ rarely (bond, designate controller).
+- **Controller:** signs day-to-day staking extrinsics — `setKeys`, `staking.validate`,
+ `staking.chill`. Used routinely.
+- **Rule:** stash and controller **must be different accounts.** A compromised
+ controller can stop/redirect validation but cannot move the stash's bonded funds.
+ Neither private key belongs on the validator server — they live in the operator's
+ wallet/hardware device.
+
+## On-disk layout
+
+```
+<base-path>/ # Docker: /da/node-data (host: /root/avail/node-data)
+└── chains
+ └── <chain-id>/ # e.g. avail_turing_network — discover, don't hardcode
+ ├── db/ # blockchain state — RE-SYNCABLE, contains no secret
+ ├── keystore/ # session keys — IRREPLACEABLE SECRET (category 1)
+ └── network/ # node key — back this up (category 2)
+```
+
+`db` can always be rebuilt from genesis or a snapshot. `keystore` + `network` are the
+only things worth backing up, and they must be backed up **encrypted**.
+
+## Hardening checklist (apply during Day 1)
+
+- Restrict the data volume on the host: `chmod -R go-rwx /root/avail/node-data` and
+ `chown` to the container's uid. The keystore should be unreadable by other users.
+- Never publish RPC `9944` or metrics `9615` beyond `127.0.0.1`. Only p2p `30333` is
+ public. (The setup skill's `docker run` already binds 9944/9615 to localhost.)
+- Take an **encrypted** backup of `keystore/` + `network/` immediately after Day 1 and
+ store it off-box (e.g. `tar … | age -e` / `gpg -c`). Recovery without it means
+ rotating to brand-new session keys.
+- Stash on a hardware wallet. Record the stash/controller addresses (public) somewhere
+ durable; never record the seed phrases digitally in plaintext.
+- Consider a proxy account for routine staking calls so the controller seed is touched
+ less often (optional; see Avail "proxies-on-avail" docs).
+
+## The one mistake that slashes you
+
+Restoring a keystore backup onto a new node **while the old node is or might still be
+running** double-signs and equivocates. Any keystore move requires the previous node to
+be **definitively offline first**, or — preferred — use the `author_rotateKeys` +
+`setKeys` rotation flow so the two machines never share keys. This is covered
+operationally in the avail-validator-operate skill (safe upgrade / disaster recovery).
diff --git a/skills/avail-validator-setup/references/networks.md b/skills/avail-validator-setup/references/networks.md
new file mode 100644
index 0000000..03607da
--- /dev/null
+++ b/skills/avail-validator-setup/references/networks.md
@@ -0,0 +1,65 @@
+# Avail networks — concrete values
+
+Substitute these into every command/URL. Avail runs **one** node binary/image; only
+these values differ between networks. There is no `testnet` chain alias — the only
+supported testnet is **Turing**.
+
+## Parameter table
+
+| Param | **Mainnet** | **Turing testnet** |
+|---|---|---|
+| `--chain` value | `mainnet` | `turing` |
+| Chain spec line in logs | `Avail Mainnet` | `Avail Turing Network` |
+| Official WS RPC | `wss://mainnet-rpc.avail.so/ws` | `wss://turing-rpc.avail.so/ws` |
+| Light-client API | `https://api.lightclient.mainnet.avail.so/v1` | `https://api.lightclient.turing.avail.so/v1` |
+| Block explorer (Subscan) | `https://avail.subscan.io/` | `https://avail-turing.subscan.io/` |
+| App / extrinsics explorer | `https://explorer.availproject.org/?rpc=wss://mainnet-rpc.avail.so/ws` | `https://explorer.availproject.org/?rpc=wss://turing-rpc.avail.so/ws` |
+| Staking dashboard | `https://staking.availproject.org/#/overview` | same UI — select Turing |
+| Staking actions (bond/setKeys/validate/chill) | `https://explorer.availproject.org/#/staking/actions` | same UI — select Turing |
+| Telemetry | `http://telemetry.avail.so/` | `http://telemetry.avail.so/` (Turing tab) |
+| Token symbol | AVAIL | AVAIL |
+| Chain-data dir under `<base-path>/chains/` | e.g. `avail_mainnet_network` | e.g. `avail_turing_network` / `avail_turing_testnet` (varies by node version) |
+
+> The chain-data subdir name varies across node versions. Never hardcode it — discover
+> it: `docker exec <container> ls /da/node-data/chains`.
+
+## Public RPC endpoints (for queries / explorer, not for the validator itself)
+
+**Mainnet:** OnFinality `https://avail.api.onfinality.io/public` ·
+Ankr `https://mainnet.avail-rpc.com/` · AllNodes `https://avail-rpc.publicnode.com/` ·
+VitWit `https://avail.rpc.vitwit.com/` · GlobalStake `https://rpc-avail.globalstake.io` ·
+RadiumBlock `https://avail.public.curie.radiumblock.co/http`
+
+**Turing:** OnFinality `https://avail-turing.api.onfinality.io/public` ·
+Ankr `https://rpc.ankr.com/avail_turing_testnet` ·
+AllNodes `https://avail-turing-rpc.publicnode.com` ·
+RadiumBlock `https://turing.public.curie.radiumblock.co/http`
+
+WSS variants: replace `https://`→`wss://` and the path per provider (e.g. OnFinality
+`wss://avail.api.onfinality.io/public-ws`).
+
+## Test tokens (Turing only)
+
+Turing AVAIL is obtained via the Avail faucet / Discord, not a CLI. Direct the user to
+the faucet linked from `https://docs.availproject.org/docs/da/build/interact/faucet`
+(or the Avail Discord `#faucet`). Mainnet AVAIL must be bought or bridged
+(`https://bridge.availproject.org/`).
+
+## Staking economics (both networks unless noted)
+
+- **Era** ≈ 24 h. Active validator set re-elected each era boundary by stake.
+- **Min self-bond to enter the validator waiting list:** ≥ **50,000 AVAIL**.
+ (Turing's network "threshold" may currently be 0, but the 50k waiting-list figure is
+ the documented practical floor — confirm live before advising the user.)
+- **Reward lag:** stake in era N → active N+1 → accrues N+2 → first payout ~N+3.
+- **Unbonding lock:** **28 days** after `unbond` before funds are withdrawable.
+- **Minimum Nominated** is recalculated every era and rises as total stake grows — a
+ validator above threshold today can fall below later. Not a one-time check.
+
+## Chain spec source (only if running outside Docker / custom spec)
+
+The Docker image ships the correct spec; `--chain mainnet|turing` is enough. If a raw
+spec is ever needed:
+- Mainnet: `https://raw.githubusercontent.com/availproject/avail/main/misc/genesis/mainnet.chain.spec.raw.json`
+- Turing: the docs link a GitHub `blob/` URL (a docs bug); use the `raw.githubusercontent.com`
+ equivalent: `https://raw.githubusercontent.com/availproject/avail/main/misc/genesis/testnet.turing.chain.spec.raw.json`
diff --git a/skills/avail-validator-setup/references/troubleshooting.md b/skills/avail-validator-setup/references/troubleshooting.md
new file mode 100644
index 0000000..8faf445
--- /dev/null
+++ b/skills/avail-validator-setup/references/troubleshooting.md
@@ -0,0 +1,52 @@
+# Avail validator setup — troubleshooting
+
+Symptom → cause → fix. Check `docker logs -f $(docker ps -lq)` first; the log markers
+below are the diagnostic signal.
+
+## Node won't start
+
+- `Error: Input("Please specify which chain you want to run, e.g. --chain mainnet")`
+ → `--chain` missing. Add `--chain mainnet` or `--chain turing`. There is no
+ `testnet` alias.
+- Container exits immediately, restarts in a loop → check `docker logs`; usually a bad
+ `-v` mount path or the volume dir not writable by the container uid. Fix host perms,
+ recreate.
+
+## Stuck syncing / no peers
+
+- Log stays `💤 Idle (0 peers)` → p2p port **30333 not reachable**. Open it inbound in
+ the host/cloud firewall and security group; confirm `-p 30333:30333` is published.
+- `⚙️ Syncing` never reaching tip, or `❌ Error while dialing /dns/telemetry…` → the
+ telemetry dial error is harmless (telemetry only). Real sync stall = peers/port or
+ disk too slow; check `best:` and `finalized:` are advancing.
+- Genesis sync is slow (hours), not the "5–10 min" some docs imply. Warp sync is not
+ available. A trusted DB snapshot speeds it up (operate skill covers restore).
+
+## Node runs but never becomes a validator
+
+- Log shows `👤 Role: FULL` not `AUTHORITY` → `--validator` flag missing from the
+ `docker run` args. Recreate the container with `--validator`.
+- `Role: AUTHORITY` but no `🎁 Prepared block for proposing` after election:
+ - Session keys never set on-chain, or set against the wrong account → re-run
+ `author_rotateKeys`, then **Set Session Key** signed by the **controller**.
+ - Not elected yet — you're still in **Waiting**. Election happens at era boundaries
+ (~24 h); insufficient stake keeps you waiting. Verify stake ≥ waiting-list floor
+ and on the staking dashboard you appear under Waiting/Active.
+ - Node not fully synced when keys were set → wait for `💤 Idle` with advancing
+ `finalized:`, rotate keys again, resubmit.
+
+## Key / account problems
+
+- `setKeys` / `validate` extrinsic fails or "controller not bonded" → bond from the
+ **stash** first, then `setKeys`/`validate` from the **controller**. Stash and
+ controller must be distinct accounts.
+- Rotated keys but validator stopped producing → `author_rotateKeys` replaced the
+ on-box keys; you must submit the **new** hex via `setKeys` (the rotate flow is
+ intentional for migrations — see operate skill safe-upgrade).
+
+## Verifying health quickly
+
+- `docker exec <container> ls /da/node-data/chains` → confirms the chain dir / network.
+- Telemetry site (network tab) → node visible by `--name`, block height tracking tip.
+- Logs: `✨ Imported #N` rising = following chain; `🎁 Prepared block for proposing` =
+ actively authoring (you are an active validator).
diff --git a/skills/avail-validator-setup/scripts/avail-validator.sh b/skills/avail-validator-setup/scripts/avail-validator.sh
new file mode 100755
index 0000000..f98999b
--- /dev/null
+++ b/skills/avail-validator-setup/scripts/avail-validator.sh
@@ -0,0 +1,139 @@
+#!/usr/bin/env bash
+# Avail validator setup helper — Docker-first, network-parameterized.
+# Wraps the exact upstream `docker run` with validator-safe hardening:
+# - RPC 9944 and metrics 9615 bound to 127.0.0.1 only (upstream exposes 9944 publicly)
+# - p2p 30333 published (must be world-reachable)
+# - explicit pinned tag ("latest" is refused), tag inspected with skopeo when it is
+#   installed, restart policy, host directory bind-mounted as the data volume
+#
+# Usage:
+#   avail-validator.sh provision --chain <mainnet|turing> --tag <tag> --name <node-name> [--data DIR]
+#   avail-validator.sh rotate-keys [--container <id-or-name>]
+#   avail-validator.sh status [--container <id-or-name>]
+#
+# This script is deliberately small and explicit. Read it before running it on a
+# machine that will hold real stake.
+set -euo pipefail
+
+DATA_DIR="/root/avail/node-data"
+IMAGE_REPO="docker.io/availj/avail"
+
+# Print "error: <message>" on stderr and exit with status 1.
+die() { echo "error: $*" >&2; exit 1; }
+
+# Abort unless a flag is followed by a value.
+#   $1 - the flag being parsed, $2 - number of arguments left including the flag
+_need_value() { [ "$2" -ge 2 ] || die "$1 needs a value"; }
+
+# provision: validate the flags, check the image tag, start the validator container.
+#   --chain mainnet|turing  required
+#   --tag TAG               required; empty or "latest" is refused
+#   --name NAME             required; passed to the node as --name
+#   --data DIR              optional; replaces DATA_DIR as the host data directory
+cmd_provision() {
+  local chain="" tag="" name=""
+  while [ $# -gt 0 ]; do
+    case "$1" in
+      --chain) _need_value "$1" "$#"; chain="$2"; shift 2;;
+      --tag)   _need_value "$1" "$#"; tag="$2"; shift 2;;
+      --name)  _need_value "$1" "$#"; name="$2"; shift 2;;
+      --data)  _need_value "$1" "$#"; DATA_DIR="$2"; shift 2;;
+      *) die "unknown arg: $1";;
+    esac
+  done
+  [ -n "$chain" ] || die "--chain required (mainnet|turing)"
+  [ "$chain" = "mainnet" ] || [ "$chain" = "turing" ] || die "--chain must be mainnet or turing"
+  [ -n "$tag" ] || die "--tag required (pin an explicit release, never :latest)"
+  # The rule in the message above is enforced here, not just stated.
+  [ "$tag" != "latest" ] || die "--tag required (pin an explicit release, never :latest)"
+  [ -n "$name" ] || die "--name required"
+
+  echo ">> verifying image $IMAGE_REPO:$tag before pull (validator must run known-good binary)"
+  if command -v skopeo >/dev/null 2>&1; then
+    skopeo inspect "docker://$IMAGE_REPO:$tag" | grep -E '"Digest"|"Created"' \
+      || die "skopeo could not inspect $IMAGE_REPO:$tag — bad tag?"
+  else
+    echo " skopeo not found — skipping digest verification (install skopeo to harden)"
+  fi
+
+  mkdir -p "$DATA_DIR"
+  echo ">> p2p port 30333 must be reachable inbound; 9944/9615 stay localhost-only"
+  # Trace the docker run: set -x echoes the exact command being issued.
+  set -x
+  docker run --restart=on-failure -d \
+    -v "$DATA_DIR:/da/node-data" \
+    -p 30333:30333 \
+    -p 127.0.0.1:9944:9944 \
+    -p 127.0.0.1:9615:9615 \
+    "$IMAGE_REPO:$tag" \
+    --chain "$chain" -d /da/node-data --validator --name "$name"
+  set +x
+  echo ">> tail sync with: docker logs -f \$(docker ps -lq)"
+  echo ">> wait for 'Role: AUTHORITY' and steady '💤 Idle (N peers)' before Day 1"
+}
+
+# Echo the container to act on: the argument when non-empty, otherwise whatever
+# `docker ps -lq` prints.
+_pick_container() {
+  local cid="${1:-}"
+  [ -n "$cid" ] && { echo "$cid"; return; }
+  docker ps -lq
+}
+
+# rotate-keys: call author_rotateKeys on the node's RPC from inside the container,
+# print the JSON response, and restart the container once a result came back.
+#   --container ID  optional; defaults to _pick_container's choice
+cmd_rotate_keys() {
+  local cid="" resp
+  if [ "${1:-}" = "--container" ]; then
+    _need_value "$1" "$#"
+    cid="$2"; shift 2
+  fi
+  # A misspelled flag must not silently fall through to the default container.
+  [ $# -eq 0 ] || die "unknown arg: $1"
+  cid="$(_pick_container "$cid")"
+  [ -n "$cid" ] || die "no running container; pass --container <id>"
+  echo ">> generating session keys in container $cid"
+  resp="$(docker exec -i "$cid" curl -sH "Content-Type: application/json" \
+    -d '{"id":1,"jsonrpc":"2.0","method":"author_rotateKeys","params":[]}' \
+    http://localhost:9944)"
+  printf '%s\n' "$resp"
+  # No "result" field means there is no key hex to submit; restarting the node
+  # would only cost uptime, so stop and leave the response for the operator.
+  case "$resp" in
+    *'"result"'*) ;;
+    *) die "author_rotateKeys returned no result — node not restarted";;
+  esac
+  echo ">> restarting node to load keys"
+  docker restart "$cid" >/dev/null
+  echo ">> submit the 'result' hex above via Set Session Key (signed by CONTROLLER)"
+  echo ">> private keys remain in the keystore on this box — never copy them off"
+}
+
+# status: list the chains directory inside the container and show its last 20
+# log lines.
+#   --container ID  optional; defaults to _pick_container's choice
+cmd_status() {
+  local cid=""
+  if [ "${1:-}" = "--container" ]; then
+    _need_value "$1" "$#"
+    cid="$2"; shift 2
+  fi
+  [ $# -eq 0 ] || die "unknown arg: $1"
+  cid="$(_pick_container "$cid")"
+  [ -n "$cid" ] || die "no running container; pass --container <id>"
+  echo ">> chains dir (confirms network):"
+  docker exec "$cid" ls /da/node-data/chains
+  echo ">> last log lines:"
+  docker logs --tail 20 "$cid"
+}
+
+# Dispatch on the first argument; `shift || true` because shift fails (and -e would
+# exit) when the script is run with no arguments at all.
+sub="${1:-}"; shift || true
+case "$sub" in
+  provision) cmd_provision "$@";;
+  rotate-keys) cmd_rotate_keys "$@";;
+  status) cmd_status "$@";;
+  *) die "usage: $0 {provision|rotate-keys|status} ...";;
+esac