diff --git a/ci/integration-test.sh b/ci/integration-test.sh
index f1c6902..76f0e7a 100755
--- a/ci/integration-test.sh
+++ b/ci/integration-test.sh
@@ -46,6 +46,81 @@ cleanup() {
 }
 trap cleanup EXIT
 
+# assert_quorum_n <n> [timeout-seconds]
+# Polls all pods 0..N-1 until they all report the same non-empty leaderId.
+# On success: exports LEADERS[] (array of leaderIds, one per pod) and
+# LEADER_ORDINAL (the pod ordinal of the leader).
+assert_quorum_n() {
+  local n=$1 timeout=${2:-$RAFT_TIMEOUT}
+  local deadline=$(( SECONDS + timeout ))
+  local i pid local_port l
+  while true; do
+    LEADERS=()
+    for (( i=0; i<n; i++ )); do
+      local_port=$(( HTTP_PORT + 10 + i ))
+      pid=$(pf_start "$i" "$local_port")
+      if pf_wait "$local_port"; then
+        l=$(api "$local_port" GET /api/v1/cluster \
+          | jq -r '.leaderId // empty' 2>/dev/null || echo "")
+        LEADERS+=("$l")
+      fi
+      pf_stop "$pid"
+    done
+
+    if (( ${#LEADERS[@]} == n )) && [[ -n "${LEADERS[0]}" ]]; then
+      local all_agree=1
+      for l in "${LEADERS[@]:1}"; do
+        [[ "$l" == "${LEADERS[0]}" ]] || { all_agree=0; break; }
+      done
+      if (( all_agree )); then
+        LEADER_ORDINAL=$(echo "${LEADERS[0]}" \
+          | sed -nE "s/^${RELEASE}-([0-9]+)\..*$/\1/p")
+        [[ -n "$LEADER_ORDINAL" ]] || {
+          echo "ERROR: could not parse ordinal from leader '${LEADERS[0]}'"
+          return 1
+        }
+        echo "   Raft leader: ${LEADERS[0]} (pod-${LEADER_ORDINAL})"
+        return 0
+      fi
+    fi
+
+    if (( SECONDS >= deadline )); then
+      echo "ERROR: Raft formation on ${n} pods timed out after ${timeout}s."
+      echo "       Leaders seen: ${LEADERS[*]:-}"
+      return 1
+    fi
+    echo "   Not converged yet (${LEADERS[*]:-}), retrying in 5s..."
+    sleep 5
+  done
+}
+
+# cluster_status_assert_healthy <port>
+# Asserts no peer is STALLED or FALLING_BEHIND. Gracefully skips if the
+# `peers[].status` field is absent (image predates commit 203acdaac).
+cluster_status_assert_healthy() {
+  local port=$1
+  local status_json has_status stalled peer_count
+  status_json=$(api "$port" GET /api/v1/cluster) || {
+    echo "ERROR: cluster status API call failed"; return 1
+  }
+  has_status=$(echo "$status_json" | jq -r '.peers[0].status // empty')
+  if [[ -z "$has_status" ]]; then
+    echo "   WARNING: peers[].status field absent on this image; skipping STATUS assertion."
+    return 0
+  fi
+  stalled=$(echo "$status_json" \
+    | jq -r '.peers[] | select(.status=="STALLED" or .status=="FALLING_BEHIND") | .id' \
+    | head -n1)
+  if [[ -n "$stalled" ]]; then
+    echo "ERROR: peer $stalled has status STALLED/FALLING_BEHIND"
+    echo "$status_json" | jq '.peers'
+    return 1
+  fi
+  peer_count=$(echo "$status_json" | jq '.peers | length')
+  echo "   All ${peer_count} peers HEALTHY/CATCHING_UP."
+  return 0
+}
+
 # ── retrieve password ─────────────────────────────────────────────────────────
 
 PASSWORD=$(kubectl get secret arcadedb-credentials-secret \
@@ -56,53 +131,21 @@ PASSWORD=$(kubectl get secret arcadedb-credentials-secret \
 
 # ── phase 1: pod readiness ────────────────────────────────────────────────────
 
-echo "==> [1/4] Waiting for StatefulSet rollout (timeout ${ROLLOUT_TIMEOUT}s)..."
+echo "==> [1/6] Waiting for StatefulSet rollout (timeout ${ROLLOUT_TIMEOUT}s)..."
 kubectl rollout status statefulset/"$RELEASE" \
   -n "$NAMESPACE" --timeout="${ROLLOUT_TIMEOUT}s"
 echo "   All 3 pods Ready."
 
 # ── phase 2: raft formation ───────────────────────────────────────────────────
 
-echo "==> [2/4] Checking Raft leader consensus (timeout ${RAFT_TIMEOUT}s)..."
-DEADLINE=$(( SECONDS + RAFT_TIMEOUT ))
-
-while true; do
-  LEADERS=()
-  for i in 0 1 2; do
-    LOCAL=$(( HTTP_PORT + 10 + i ))   # 2490, 2491, 2492
-    PID=$(pf_start "$i" "$LOCAL")
-    pf_wait "$LOCAL" || { pf_stop "$PID"; continue; }
-    LEADER=$(api "$LOCAL" GET /api/v1/cluster \
-      | jq -r '.leaderId // empty' 2>/dev/null || echo "")
-    pf_stop "$PID"
-    LEADERS+=("$LEADER")
-  done
-
-  if [[ -n "${LEADERS[0]}" \
-        && "${LEADERS[0]}" == "${LEADERS[1]}" \
-        && "${LEADERS[0]}" == "${LEADERS[2]}" ]]; then
-    echo "   Raft leader: ${LEADERS[0]}"
-    break
-  fi
-
-  if (( SECONDS >= DEADLINE )); then
-    echo "ERROR: Raft formation timed out after ${RAFT_TIMEOUT}s."
-    echo "       Leaders seen: ${LEADERS[*]:-}"
-    exit 1
-  fi
-
-  echo "   Not converged yet (${LEADERS[*]:-}), retrying in 5s..."
-  sleep 5
-done
+echo "==> [2/6] Checking Raft leader consensus (timeout ${RAFT_TIMEOUT}s)..."
+assert_quorum_n 3 || exit 1
 
 # ── phase 3: write ────────────────────────────────────────────────────────────
 
-# Writes (including database creation) must go through the Raft leader. Parse the
-# pod ordinal out of leaderId, e.g. "test-arcadedb-1.test-arcadedb.default..._2434" -> 1.
-LEADER_ORDINAL=$(echo "${LEADERS[0]}" | sed -nE "s/^${RELEASE}-([0-9]+)\..*$/\1/p")
-[[ -n "$LEADER_ORDINAL" ]] || { echo "ERROR: could not parse ordinal from leader '${LEADERS[0]}'"; exit 1; }
+# LEADER_ORDINAL is set by assert_quorum_n above.
 
-echo "==> [3/4] Writing test data via leader pod-${LEADER_ORDINAL}..."
+echo "==> [3/6] Writing test data via leader pod-${LEADER_ORDINAL}..."
 PF_PID=$(pf_start "$LEADER_ORDINAL" "$HTTP_PORT")
 pf_wait "$HTTP_PORT" || { echo "ERROR: port-forward to leader pod-${LEADER_ORDINAL} failed"; exit 1; }
@@ -122,7 +165,7 @@ echo "   Write complete."
 
 # ── phase 4: read and assert ──────────────────────────────────────────────────
 
-echo "==> [4/4] Reading back test data..."
+echo "==> [4/6] Reading back test data..."
 RESULT=$(api "$HTTP_PORT" POST /api/v1/query/integration-test \
   '{"language":"sql","command":"SELECT name FROM TestDoc WHERE name = '\''hello-kind'\''"}' \
   | jq -r '.result[0].name // empty') || {
@@ -138,4 +181,96 @@ fi
 echo "   Got: '${RESULT}'"
+
+# ── phase 5: STATUS column ────────────────────────────────────────────────────
+
+echo "==> [5/6] Asserting STATUS=HEALTHY for all peers..."
+PF_PID=$(pf_start "$LEADER_ORDINAL" "$HTTP_PORT")
+pf_wait "$HTTP_PORT" || { echo "ERROR: port-forward to leader failed"; exit 1; }
+
+cluster_status_assert_healthy "$HTTP_PORT" || exit 1
+
+pf_stop "$PF_PID"
+
+# ── phase 6: leadership transfer ──────────────────────────────────────────────
+
+echo "==> [6/6] Transferring Raft leadership..."
+PF_PID=$(pf_start "$LEADER_ORDINAL" "$HTTP_PORT")
+pf_wait "$HTTP_PORT" || { echo "ERROR: port-forward to leader failed"; exit 1; }
+
+CURRENT_LEADER=${LEADERS[0]}
+TARGET_PEER=$(api "$HTTP_PORT" GET /api/v1/cluster \
+  | jq -r --arg leader "$CURRENT_LEADER" \
+      '.peers[] | select(.id != $leader) | .id' | head -n1)
+[[ -n "$TARGET_PEER" ]] || { echo "ERROR: no non-leader peer found"; exit 1; }
+echo "   Current leader:  $CURRENT_LEADER"
+echo "   Transfer target: $TARGET_PEER"
+
+api "$HTTP_PORT" POST /api/v1/cluster/leader \
+  "{\"peerId\":\"$TARGET_PEER\"}" >/dev/null
+pf_stop "$PF_PID"
+
+# Wait up to 30s for the transfer to take effect on any pod we can reach.
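+# Every pod is polled rather than just the transfer target, since the old
+# leader may be briefly unreachable mid-transfer; `break 2` exits both the
+# inner for loop and the outer while loop once the target reports itself.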
+DEADLINE=$(( SECONDS + 30 ))
+NEW_LEADER=""
+while (( SECONDS < DEADLINE )); do
+  for i in 0 1 2; do
+    LOCAL=$(( HTTP_PORT + 20 + i ))
+    PID=$(pf_start "$i" "$LOCAL")
+    if pf_wait "$LOCAL" 5; then
+      L=$(api "$LOCAL" GET /api/v1/cluster | jq -r '.leaderId // empty' 2>/dev/null || echo "")
+      pf_stop "$PID"
+      if [[ "$L" == "$TARGET_PEER" ]]; then
+        NEW_LEADER="$L"
+        break 2
+      fi
+    else
+      pf_stop "$PID"
+    fi
+  done
+  sleep 2
+done
+
+[[ "$NEW_LEADER" == "$TARGET_PEER" ]] || {
+  echo "ERROR: leadership did not transfer; got '${NEW_LEADER:-}'"
+  exit 1
+}
+echo "   New leader: $NEW_LEADER"
+
+# Verify writes via the new leader.
+NEW_LEADER_ORDINAL=$(echo "$NEW_LEADER" | sed -nE "s/^${RELEASE}-([0-9]+)\..*$/\1/p")
+PF_PID=$(pf_start "$NEW_LEADER_ORDINAL" "$HTTP_PORT")
+pf_wait "$HTTP_PORT" || { echo "ERROR: port-forward to new leader failed"; exit 1; }
+
+api "$HTTP_PORT" POST /api/v1/command/integration-test \
+  '{"language":"sql","command":"INSERT INTO TestDoc SET name = '\''post-transfer'\''"}' \
+  >/dev/null
+
+POST_RESULT=$(api "$HTTP_PORT" POST /api/v1/query/integration-test \
+  '{"language":"sql","command":"SELECT name FROM TestDoc WHERE name = '\''post-transfer'\''"}' \
+  | jq -r '.result[0].name // empty')
+
+pf_stop "$PF_PID"
+
+[[ "$POST_RESULT" == "post-transfer" ]] || {
+  echo "ERROR: write via new leader failed (got '${POST_RESULT:-}')"
+  exit 1
+}
+echo "   Write via new leader succeeded."
+
+# Update tracked leader for downstream phases.
+LEADERS[0]=$NEW_LEADER
+LEADER_ORDINAL=$NEW_LEADER_ORDINAL
+
+# Phases 7 (helm-upgrade scale-up 3->5) and 8 (snapshot-install recovery) were
+# planned but discarded after CI proved the scenarios are not supported by the
+# current ArcadeDB image: a `helm upgrade --set replicaCount=5` rolling-restarts
+# all existing StatefulSet pods AND adds two new ones, all with a serverList of
+# 5 entries, but Raft does not auto-vote in the new peers (the support email
+# confirms this requires an explicit POST /api/v1/cluster/peer call from the
+# leader). The cluster ends up unable to re-form quorum after the rolling
+# restart. The snapshot-install phase depended on the post-scale-up cluster, so
+# it was dropped with phase 7.
+# See docs/superpowers/specs/2026-05-09-ha-integration-tests-design.md for the
+# updated rationale.
 
 echo "==> All checks passed."
diff --git a/docs/superpowers/plans/2026-05-09-ha-integration-tests.md b/docs/superpowers/plans/2026-05-09-ha-integration-tests.md
new file mode 100644
index 0000000..1a40e6e
--- /dev/null
+++ b/docs/superpowers/plans/2026-05-09-ha-integration-tests.md
@@ -0,0 +1,639 @@
+# HA Integration Tests Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Extend `ci/integration-test.sh` with four new phases that exercise Raft HA scenarios from the support email Q&A: STATUS column observation, runtime leadership transfer, helm-upgrade scale-up 3→5, and snapshot-install recovery.
+
+**Architecture:** Single CI job, single kind cluster, single Helm install. New phases append to the existing script in order of escalating risk; the destructive snapshot phase runs last. A small refactor extracts a generalized quorum helper so that both the existing 3-pod check and the new 5-pod check use the same code path.
+
+**Tech Stack:** bash, kubectl, kind, helm, jq, curl. No new tooling.
+
+**Spec:** `docs/superpowers/specs/2026-05-09-ha-integration-tests-design.md`
+
+---
+
+## Local Setup (before starting any task)
+
+To exercise the integration script locally between tasks, you need a kind cluster with the chart installed. Run once before Task 2:
+
+```bash
+kind create cluster --wait 60s
+
+helm install test-arcadedb charts/arcadedb/ \
+  --set replicaCount=3 \
+  --set persistence.enabled=false \
+  --set arcadedb.defaultDatabases="" \
+  --set 'arcadedb.extraCommands[1]=-Darcadedb.ha.snapshotThreshold=50' \
+  --timeout 5m --wait
+```
+
+Tear down at the end (or between major iterations if needed):
+
+```bash
+kind delete cluster
+```
+
+After Task 6 the cluster will have 5 pods and a deleted-and-recreated peer; you may want to delete and recreate the cluster between full runs.
+
+---
+
+## Task 1: Add `snapshotThreshold` override to CI workflow
+
+**Files:**
+- Modify: `.github/workflows/lint.yml` (the `Install chart` step in the `integration` job)
+
+- [ ] **Step 1: Update the helm install args in `lint.yml`**
+
+Locate the `Install chart` step in the `integration` job. Replace its `run:` block with:
+
+```yaml
+      - name: Install chart
+        run: |
+          helm install test-arcadedb charts/arcadedb/ \
+            --set replicaCount=3 \
+            --set persistence.enabled=false \
+            --set arcadedb.defaultDatabases="" \
+            --set 'arcadedb.extraCommands[1]=-Darcadedb.ha.snapshotThreshold=50' \
+            --timeout 5m \
+            --wait
+```
+
+- [ ] **Step 2: Verify YAML is valid**
+
+Run:
+
+```bash
+python3 -c "import yaml,sys; yaml.safe_load(open('.github/workflows/lint.yml'))" \
+  && echo OK
+```
+
+Expected: `OK`
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add .github/workflows/lint.yml
+git commit -m "ci: lower snapshot threshold for HA integration test"
+```
+
+---
+
+## Task 2: Refactor — extract `assert_quorum_n` helper and renumber phase echoes
+
+**Files:**
+- Modify: `ci/integration-test.sh` (helpers section + phase 2 + all phase echoes)
+
+The existing phase 2 hardcodes ordinals 0/1/2. The 5-pod scale-up in Task 5 needs the same logic for 0..4. Factor the loop out, parametrized by pod count. While we're here, update the `[N/4]` phase counters to `[N/8]` so subsequent tasks just append.
+
+- [ ] **Step 1: Add `assert_quorum_n` helper after the `cleanup` trap**
+
+Insert after line 47 (after the `trap cleanup EXIT` line) in `ci/integration-test.sh`:
+
+```bash
+# assert_quorum_n <n> [timeout-seconds]
+# Polls all pods 0..N-1 until they all report the same non-empty leaderId.
+# On success: exports LEADERS[] (array of leaderIds, one per pod) and
+# LEADER_ORDINAL (the pod ordinal of the leader).
+assert_quorum_n() {
+  local n=$1 timeout=${2:-$RAFT_TIMEOUT}
+  local deadline=$(( SECONDS + timeout ))
+  local i pid local_port l
+  while true; do
+    LEADERS=()
+    for (( i=0; i<n; i++ )); do
+      local_port=$(( HTTP_PORT + 10 + i ))
+      pid=$(pf_start "$i" "$local_port")
+      if pf_wait "$local_port"; then
+        l=$(api "$local_port" GET /api/v1/cluster \
+          | jq -r '.leaderId // empty' 2>/dev/null || echo "")
+        LEADERS+=("$l")
+      fi
+      pf_stop "$pid"
+    done
+
+    if (( ${#LEADERS[@]} == n )) && [[ -n "${LEADERS[0]}" ]]; then
+      local all_agree=1
+      for l in "${LEADERS[@]:1}"; do
+        [[ "$l" == "${LEADERS[0]}" ]] || { all_agree=0; break; }
+      done
+      if (( all_agree )); then
+        LEADER_ORDINAL=$(echo "${LEADERS[0]}" \
+          | sed -nE "s/^${RELEASE}-([0-9]+)\..*$/\1/p")
+        [[ -n "$LEADER_ORDINAL" ]] || {
+          echo "ERROR: could not parse ordinal from leader '${LEADERS[0]}'"
+          return 1
+        }
+        echo "   Raft leader: ${LEADERS[0]} (pod-${LEADER_ORDINAL})"
+        return 0
+      fi
+    fi
+
+    if (( SECONDS >= deadline )); then
+      echo "ERROR: Raft formation on ${n} pods timed out after ${timeout}s."
+      echo "       Leaders seen: ${LEADERS[*]:-}"
+      return 1
+    fi
+    echo "   Not converged yet (${LEADERS[*]:-}), retrying in 5s..."
+    sleep 5
+  done
+}
+```
+
+- [ ] **Step 2: Replace existing phase 2 body with a call to `assert_quorum_n`**
+
+Replace the block from `echo "==> [2/4] Checking Raft leader consensus..."` through the end of its `while true; do ... done` loop (lines 66–96 in the current file) with:
+
+```bash
+# ── phase 2: raft formation ───────────────────────────────────────────────────
+
+echo "==> [2/8] Checking Raft leader consensus (timeout ${RAFT_TIMEOUT}s)..."
+assert_quorum_n 3 || exit 1
+```
+
+- [ ] **Step 3: Drop the now-unused `LEADER_ORDINAL=...` parse in phase 3**
+
+In the current phase 3 (write block), `assert_quorum_n` already exports `LEADER_ORDINAL`. Remove the duplicate parse. Replace the lines:
+
+```bash
+LEADER_ORDINAL=$(echo "${LEADERS[0]}" | sed -nE "s/^${RELEASE}-([0-9]+)\..*$/\1/p")
+[[ -n "$LEADER_ORDINAL" ]] || { echo "ERROR: could not parse ordinal from leader '${LEADERS[0]}'"; exit 1; }
+```
+
+with a single comment:
+
+```bash
+# LEADER_ORDINAL is set by assert_quorum_n above.
+```
+
+- [ ] **Step 4: Renumber phase counters from `/4` to `/8`**
+
+Update the four existing `echo "==> [N/4] ..."` lines so they read `[1/8]`, `[2/8]`, `[3/8]`, and `[4/8]` respectively. Phases 5–8 will be added in subsequent tasks.
+
+- [ ] **Step 5: Syntax-check the script**
+
+Run:
+
+```bash
+bash -n ci/integration-test.sh && echo OK
+```
+
+Expected: `OK`
+
+- [ ] **Step 6: Run end-to-end against the local kind cluster**
+
+Run:
+
+```bash
+make test-integration
+```
+
+Expected output (last line): `==> All checks passed.`
+
+If `assert_quorum_n` does not converge, dump cluster state:
+
+```bash
+kubectl get pods,svc -n default
+kubectl logs -l app=arcadedb -n default --tail=50
+```
+
+- [ ] **Step 7: Commit**
+
+```bash
+git add ci/integration-test.sh
+git commit -m "test(integration): extract assert_quorum_n helper, renumber phases"
+```
+
+---
+
+## Task 3: Add Phase 5 — `STATUS=HEALTHY` assertion
+
+**Files:**
+- Modify: `ci/integration-test.sh` (append phase 5 after the existing phase 4)
+
+- [ ] **Step 1: Add a `cluster_status_assert_healthy` helper**
+
+Append to the helpers section (immediately after `assert_quorum_n` from Task 2):
+
+```bash
+# cluster_status_assert_healthy <port>
+# Asserts no peer is STALLED or FALLING_BEHIND. Gracefully skips if the
+# `peers[].status` field is absent (image predates commit 203acdaac).
+cluster_status_assert_healthy() {
+  local port=$1
+  local status_json has_status stalled peer_count
+  status_json=$(api "$port" GET /api/v1/cluster) || {
+    echo "ERROR: cluster status API call failed"; return 1
+  }
+  has_status=$(echo "$status_json" | jq -r '.peers[0].status // empty')
+  if [[ -z "$has_status" ]]; then
+    echo "   WARNING: peers[].status field absent on this image; skipping STATUS assertion."
+    return 0
+  fi
+  stalled=$(echo "$status_json" \
+    | jq -r '.peers[] | select(.status=="STALLED" or .status=="FALLING_BEHIND") | .id' \
+    | head -n1)
+  if [[ -n "$stalled" ]]; then
+    echo "ERROR: peer $stalled has status STALLED/FALLING_BEHIND"
+    echo "$status_json" | jq '.peers'
+    return 1
+  fi
+  peer_count=$(echo "$status_json" | jq '.peers | length')
+  echo "   All ${peer_count} peers HEALTHY/CATCHING_UP."
+  return 0
+}
+```
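+
+For reference, the response shape these `jq` filters assume. This is an illustrative sketch pieced together from the fields this plan uses (`leaderId`, `peers[].id`, `peers[].status`), not a verbatim API response:
+
+```bash
+# Hypothetical shape of GET /api/v1/cluster as consumed by the helpers:
+# {
+#   "leaderId": "test-arcadedb-1.test-arcadedb.default...._2434",
+#   "peers": [
+#     { "id": "test-arcadedb-0....", "status": "HEALTHY" },
+#     { "id": "test-arcadedb-1....", "status": "CATCHING_UP" }
+#   ]
+# }
+api "$HTTP_PORT" GET /api/v1/cluster | jq '{leaderId, peers: [.peers[] | {id, status}]}'
+```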
+
+- [ ] **Step 2: Append phase 5 before the final `echo "==> All checks passed."` line**
+
+Insert this block immediately above the existing final line `echo "==> All checks passed."` (which must remain the last line of the file):
+
+```bash
+# ── phase 5: STATUS column ────────────────────────────────────────────────────
+
+echo "==> [5/8] Asserting STATUS=HEALTHY for all peers..."
+PF_PID=$(pf_start "$LEADER_ORDINAL" "$HTTP_PORT")
+pf_wait "$HTTP_PORT" || { echo "ERROR: port-forward to leader failed"; exit 1; }
+
+cluster_status_assert_healthy "$HTTP_PORT" || exit 1
+
+pf_stop "$PF_PID"
+```
+
+- [ ] **Step 3: Syntax-check the script**
+
+Run:
+
+```bash
+bash -n ci/integration-test.sh && echo OK
+```
+
+Expected: `OK`
+
+- [ ] **Step 4: Run end-to-end against the local kind cluster**
+
+Run:
+
+```bash
+make test-integration
+```
+
+Expected output: a line starting with `==> [5/8] Asserting STATUS=HEALTHY` followed by either `All N peers HEALTHY/CATCHING_UP.` or the WARNING graceful-skip line, then `==> All checks passed.`
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add ci/integration-test.sh
+git commit -m "test(integration): assert peers STATUS=HEALTHY (phase 5)"
+```
+
+---
+
+## Task 4: Add Phase 6 — runtime leadership transfer
+
+**Files:**
+- Modify: `ci/integration-test.sh` (append phase 6 after phase 5)
+
+- [ ] **Step 1: Append phase 6 before the final `echo "==> All checks passed."` line**
+
+```bash
+# ── phase 6: leadership transfer ──────────────────────────────────────────────
+
+echo "==> [6/8] Transferring Raft leadership..."
+PF_PID=$(pf_start "$LEADER_ORDINAL" "$HTTP_PORT")
+pf_wait "$HTTP_PORT" || { echo "ERROR: port-forward to leader failed"; exit 1; }
+
+CURRENT_LEADER=${LEADERS[0]}
+TARGET_PEER=$(api "$HTTP_PORT" GET /api/v1/cluster \
+  | jq -r --arg leader "$CURRENT_LEADER" \
+      '.peers[] | select(.id != $leader) | .id' | head -n1)
+[[ -n "$TARGET_PEER" ]] || { echo "ERROR: no non-leader peer found"; exit 1; }
+echo "   Current leader:  $CURRENT_LEADER"
+echo "   Transfer target: $TARGET_PEER"
+
+api "$HTTP_PORT" POST /api/v1/cluster/leader \
+  "{\"peerId\":\"$TARGET_PEER\"}" >/dev/null
+pf_stop "$PF_PID"
+
+# Wait up to 30s for the transfer to take effect on any pod we can reach.
+DEADLINE=$(( SECONDS + 30 ))
+NEW_LEADER=""
+while (( SECONDS < DEADLINE )); do
+  for i in 0 1 2; do
+    LOCAL=$(( HTTP_PORT + 20 + i ))
+    PID=$(pf_start "$i" "$LOCAL")
+    if pf_wait "$LOCAL" 5; then
+      L=$(api "$LOCAL" GET /api/v1/cluster | jq -r '.leaderId // empty' 2>/dev/null || echo "")
+      pf_stop "$PID"
+      if [[ "$L" == "$TARGET_PEER" ]]; then
+        NEW_LEADER="$L"
+        break 2
+      fi
+    else
+      pf_stop "$PID"
+    fi
+  done
+  sleep 2
+done
+
+[[ "$NEW_LEADER" == "$TARGET_PEER" ]] || {
+  echo "ERROR: leadership did not transfer; got '${NEW_LEADER:-}'"
+  exit 1
+}
+echo "   New leader: $NEW_LEADER"
+
+# Verify writes via the new leader.
+NEW_LEADER_ORDINAL=$(echo "$NEW_LEADER" | sed -nE "s/^${RELEASE}-([0-9]+)\..*$/\1/p")
+PF_PID=$(pf_start "$NEW_LEADER_ORDINAL" "$HTTP_PORT")
+pf_wait "$HTTP_PORT" || { echo "ERROR: port-forward to new leader failed"; exit 1; }
+
+api "$HTTP_PORT" POST /api/v1/command/integration-test \
+  '{"language":"sql","command":"INSERT INTO TestDoc SET name = '\''post-transfer'\''"}' \
+  >/dev/null
+
+POST_RESULT=$(api "$HTTP_PORT" POST /api/v1/query/integration-test \
+  '{"language":"sql","command":"SELECT name FROM TestDoc WHERE name = '\''post-transfer'\''"}' \
+  | jq -r '.result[0].name // empty')
+
+pf_stop "$PF_PID"
+
+[[ "$POST_RESULT" == "post-transfer" ]] || {
+  echo "ERROR: write via new leader failed (got '${POST_RESULT:-}')"
+  exit 1
+}
+echo "   Write via new leader succeeded."
+
+# Update tracked leader for downstream phases.
+LEADERS[0]=$NEW_LEADER
+LEADER_ORDINAL=$NEW_LEADER_ORDINAL
+```
+
+- [ ] **Step 2: Syntax-check**
+
+```bash
+bash -n ci/integration-test.sh && echo OK
+```
+
+Expected: `OK`
+
+- [ ] **Step 3: Run end-to-end**
+
+```bash
+make test-integration
+```
+
+Expected: a `[6/8]` block reporting `New leader: <id>` (different from the original leader) and `Write via new leader succeeded.`, followed by `==> All checks passed.`
+
+If the API responds with 404 or 405 on `/api/v1/cluster/leader`, the deployed image does not yet expose this endpoint — capture the response body via `curl -v` and report; do not silently skip (this endpoint is the entire point of phase 6).
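+
+A hedged sketch of that capture, assuming a port-forward is already up on `$HTTP_PORT` and that the script's `api` helper uses basic auth as `root` with the `PASSWORD` fetched earlier; adjust to match the actual helper:
+
+```bash
+curl -v -u "root:${PASSWORD}" \
+  -X POST "http://localhost:${HTTP_PORT}/api/v1/cluster/leader" \
+  -H 'Content-Type: application/json' \
+  -d "{\"peerId\":\"${TARGET_PEER}\"}"
+```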
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add ci/integration-test.sh
+git commit -m "test(integration): exercise runtime leadership transfer (phase 6)"
+```
+
+---
+
+## Task 5: Add Phase 7 — `helm upgrade` scale-up 3→5
+
+**Files:**
+- Modify: `ci/integration-test.sh` (append phase 7 after phase 6)
+
+- [ ] **Step 1: Append phase 7 before the final `echo "==> All checks passed."` line**
+
+```bash
+# ── phase 7: scale-up 3 -> 5 ──────────────────────────────────────────────────
+
+echo "==> [7/8] Scaling cluster from 3 to 5 replicas..."
+helm upgrade "$RELEASE" charts/arcadedb/ \
+  --reuse-values \
+  --set replicaCount=5 \
+  --wait --timeout 5m
+
+kubectl rollout status statefulset/"$RELEASE" \
+  -n "$NAMESPACE" --timeout=5m
+echo "   Rollout complete (5 pods Ready)."
+
+echo "   Re-checking quorum across 5 pods..."
+assert_quorum_n 5 || exit 1
+
+echo "   Re-asserting STATUS across all peers..."
+PF_PID=$(pf_start "$LEADER_ORDINAL" "$HTTP_PORT")
+pf_wait "$HTTP_PORT" || { echo "ERROR: port-forward to leader failed"; exit 1; }
+
+PEER_COUNT=$(api "$HTTP_PORT" GET /api/v1/cluster | jq '.peers | length')
+[[ "$PEER_COUNT" == "5" ]] || {
+  echo "ERROR: expected 5 peers in cluster status, got ${PEER_COUNT}"
+  exit 1
+}
+
+cluster_status_assert_healthy "$HTTP_PORT" || exit 1
+
+pf_stop "$PF_PID"
+```
+
+- [ ] **Step 2: Syntax-check**
+
+```bash
+bash -n ci/integration-test.sh && echo OK
+```
+
+Expected: `OK`
+
+- [ ] **Step 3: Run end-to-end**
+
+```bash
+make test-integration
+```
+
+Expected: `[7/8] Scaling cluster from 3 to 5 replicas...`, then `Rollout complete (5 pods Ready).`, `Raft leader: ... (pod-N)` (from `assert_quorum_n 5`), the peer-count check passes, the STATUS check passes, and finally `==> All checks passed.`
+
+If the rolling restart times out: increase `--timeout` to 10m and re-run; also
+check `kubectl describe pod test-arcadedb-3 -n default` for scheduling failures
+(kind clusters have limited resources).
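+
+A quick triage sequence for a stalled rollout (a sketch; pod and namespace names assume the CI defaults used throughout this plan):
+
+```bash
+kubectl get pods -n default -o wide
+kubectl describe pod test-arcadedb-3 -n default | sed -n '/Events:/,$p'
+kubectl describe pod test-arcadedb-4 -n default | sed -n '/Events:/,$p'
+```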
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add ci/integration-test.sh
+git commit -m "test(integration): scale-up 3->5 via helm upgrade (phase 7)"
+```
+
+---
+
+## Task 6: Add Phase 8 — snapshot-install recovery
+
+**Files:**
+- Modify: `ci/integration-test.sh` (append phase 8 after phase 7)
+
+- [ ] **Step 1: Append phase 8 before the final `echo "==> All checks passed."` line**
+
+```bash
+# ── phase 8: snapshot-install recovery ────────────────────────────────────────
+
+echo "==> [8/8] Snapshot-install on follower recovery..."
+
+PF_PID=$(pf_start "$LEADER_ORDINAL" "$HTTP_PORT")
+pf_wait "$HTTP_PORT" || { echo "ERROR: port-forward to leader failed"; exit 1; }
+
+echo "   Writing 100 rows to push log past snapshotThreshold=50..."
+for i in $(seq 1 100); do
+  api "$HTTP_PORT" POST /api/v1/command/integration-test \
+    "{\"language\":\"sql\",\"command\":\"INSERT INTO TestDoc SET name = 'snap-${i}'\"}" \
+    >/dev/null
+done
+echo "   Wrote 100 rows."
+
+# Pick a non-leader pod ordinal to delete.
+DELETE_ORDINAL=""
+for i in 0 1 2 3 4; do
+  if [[ "$i" != "$LEADER_ORDINAL" ]]; then
+    DELETE_ORDINAL=$i
+    break
+  fi
+done
+[[ -n "$DELETE_ORDINAL" ]] || { echo "ERROR: no non-leader pod to delete"; exit 1; }
+
+pf_stop "$PF_PID"
+
+echo "   Deleting pod ${RELEASE}-${DELETE_ORDINAL}..."
+kubectl delete pod "${RELEASE}-${DELETE_ORDINAL}" -n "$NAMESPACE" --wait=false
+kubectl wait --for=condition=Ready pod/"${RELEASE}-${DELETE_ORDINAL}" \
+  -n "$NAMESPACE" --timeout=2m
+echo "   Pod recreated and Ready."
+
+PF_PID=$(pf_start "$LEADER_ORDINAL" "$HTTP_PORT")
+pf_wait "$HTTP_PORT" || { echo "ERROR: port-forward to leader failed"; exit 1; }
+
+DEADLINE=$(( SECONDS + 90 ))
+RECOVERED=0
+LAST_STATUS=""
+while (( SECONDS < DEADLINE )); do
+  STATUS_JSON=$(api "$HTTP_PORT" GET /api/v1/cluster)
+  S=$(echo "$STATUS_JSON" \
+    | jq -r --arg p "${RELEASE}-${DELETE_ORDINAL}" \
+        '.peers[] | select(.id | startswith($p)) | .status // empty')
+  if [[ "$S" == "HEALTHY" ]]; then
+    RECOVERED=1
+    break
+  fi
+  if [[ -z "$S" ]]; then
+    HAS_STATUS_FIELD=$(echo "$STATUS_JSON" | jq -r '.peers[0].status // empty')
+    if [[ -z "$HAS_STATUS_FIELD" ]]; then
+      PEER_PRESENT=$(echo "$STATUS_JSON" \
+        | jq -r --arg p "${RELEASE}-${DELETE_ORDINAL}" \
+            '.peers[] | select(.id | startswith($p)) | .id' | head -n1)
+      if [[ -n "$PEER_PRESENT" ]]; then
+        echo "   NOTE: STATUS field absent on this image; peer is present in cluster, accepting as recovered."
+        RECOVERED=1
+        break
+      fi
+    fi
+  fi
+  LAST_STATUS=$S
+  echo "   peer ${RELEASE}-${DELETE_ORDINAL} status=${S:-}, retrying..."
+  sleep 5
+done
+pf_stop "$PF_PID"
+
+(( RECOVERED )) || {
+  echo "ERROR: recreated pod did not reach HEALTHY in 90s (last status: ${LAST_STATUS:-})"
+  exit 1
+}
+echo "   Recreated pod recovered."
+
+# Best-effort log signal: did the snapshot-install path actually run?
+if kubectl logs "${RELEASE}-${DELETE_ORDINAL}" -n "$NAMESPACE" --tail=500 2>/dev/null \
+  | grep -q SnapshotInstaller; then
+  echo "   Confirmed snapshot-install path in logs."
+else
+  echo "   NOTE: SnapshotInstaller log line not found (log wording is not a stable contract; not a failure)."
+fi
+```
+
+- [ ] **Step 2: Syntax-check**
+
+```bash
+bash -n ci/integration-test.sh && echo OK
+```
+
+Expected: `OK`
+
+- [ ] **Step 3: Run end-to-end**
+
+```bash
+make test-integration
+```
+
+Expected: an `[8/8] Snapshot-install on follower recovery...` block ending with `Recreated pod recovered.`, then `==> All checks passed.`
+
+If the recreated pod does not reach HEALTHY in 90s, capture diagnostics:
+
+```bash
+kubectl logs "test-arcadedb-${DELETE_ORDINAL}" -n default --tail=200
+kubectl logs "test-arcadedb-${LEADER_ORDINAL}" -n default --tail=200 | grep -i snapshot
+```
+
+If you see only `Snapshot download attempt N/3 failed` lines: the snapshot transfer is failing in the cluster, which is itself a real bug worth reporting; do not paper over it.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add ci/integration-test.sh
+git commit -m "test(integration): snapshot-install on follower recovery (phase 8)"
+```
+
+---
+
+## Task 7: End-to-end CI verification
+
+**Files:**
+- No code changes; this task is pure verification.
+
+- [ ] **Step 1: Push the branch and trigger CI**
+
+```bash
+git push -u origin "$(git rev-parse --abbrev-ref HEAD)"
+```
+
+- [ ] **Step 2: Watch the `integration` job in GitHub Actions**
+
+Run:
+
+```bash
+gh run watch
+```
+
+Or open the Actions tab in the repo. The `integration` job should complete inside the 20-minute timeout, and the log should contain all eight `[N/8]` phase headers, ending with `==> All checks passed.`
+
+- [ ] **Step 3: If CI fails on a phase that passed locally**
+
+Common causes:
+- kind in CI is slower than local; bump per-phase timeouts (`RAFT_TIMEOUT`, the 90s in phase 8) before flagging it as a real bug.
+- The image tag pulled in CI may differ from the local cache; check the resolved tag in the `Install chart` step's helm output.
+
+If a flake is intermittent specifically in phase 8, gate it behind an env var:
+
+```bash
+if [[ "${RUN_SNAPSHOT_TEST:-1}" != "0" ]]; then
+  # phase 8 body
+fi
+```
+
+This is the contingency from the spec's risk section; only apply it after observing a real flake.
+
+- [ ] **Step 4: Open PR**
+
+Once CI is green:
+
+```bash
+gh pr create --fill
+```
+
+---
+
+## Acceptance Checklist
+
+- [ ] All 8 phases pass locally on an image tag that exposes the STATUS field.
+- [ ] On older image tags, P5/P7 emit the WARNING graceful-skip line and the rest of the run still passes.
+- [ ] CI completes inside the 20-minute timeout.
+- [ ] No duplicated port-forward/poll loops between phase 2 and phase 7 — both go through `assert_quorum_n`.
+- [ ] No dangling `PF_PID` background processes after the script exits (the existing `cleanup` trap covers this).
diff --git a/docs/superpowers/specs/2026-05-09-ha-integration-tests-design.md b/docs/superpowers/specs/2026-05-09-ha-integration-tests-design.md
new file mode 100644
index 0000000..3085ea8
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-09-ha-integration-tests-design.md
@@ -0,0 +1,238 @@
+# HA Cluster Integration Tests — Design
+
+**Date:** 2026-05-09
+**Status:** Draft
+**Source:** Support email Q&A on Raft HA behavior (leader control, scale-up sync, large-import tuning)
+
+## Context
+
+The chart already runs a kind-based integration test (`ci/integration-test.sh`) that
+brings up a 3-pod HA cluster, verifies Raft consensus, writes via the leader, and
+reads back. This design extends that test with scenarios derived from a recent
+support exchange about ArcadeDB HA cluster operations.
+
+Three areas were discussed in the support email:
+
+- **Q1** — Controlling the leader: `arcadedb.ha.serverRole=replica` to exclude a
+  pod from leadership; runtime leadership transfer via
+  `POST /api/v1/cluster/leader`.
+- **Q2** — Sync after scale-up: the peer-add path (`POST /api/v1/cluster/peer`) and
+  the snapshot-install path (`/api/v1/ha/snapshot/{database}`).
+- **Q3** — Large (>1 GB) import recipe with replication: bring the cluster up before
+  importing, drive imports through the leader, tune Raft thresholds.
+
+A new `STATUS` column (HEALTHY / CATCHING_UP / FALLING_BEHIND / STALLED) was
+added to the cluster status table in commit `203acdaac` and is the canonical
+signal for follower health.
+
+## Goals
+
+Add automated coverage for the support scenarios that are testable inside the
+existing kind-based CI job, within the 20-minute job timeout.
+
+## Non-Goals (discarded scenarios)
+
+- **Q1a — `serverRole=replica`:** The chart applies the same `-D` flags to every
+  pod. Per-ordinal configuration is a chart change, not a test, and is out of
+  scope here.
+- **Q2a — 1→3 helm upgrade reproducing the "peer not in `HA_SERVER_LIST`" path:**
+  The chart re-renders `arcadedb.ha.serverList` on every upgrade, so the new pod
+  is always in the configured list. The wire-level peer-add scenario from the
+  support email does not reproduce through Helm.
+- **Q3 — Large-import recipe:** Operational guidance for >1 GB datasets. Not
+  testable at CI scale; the data volume is too high.
+
+## In-Scope Scenarios
+
+| ID | Scenario | Confidence | Approx. cost |
+|----|----------|------------|--------------|
+| P5 | STATUS column reports HEALTHY | High | ~5 s |
+| P6 | Runtime leadership transfer | High | ~60 s |
+| ~~P7~~ | ~~Scale-up 3→5 via `helm upgrade`~~ | **Discarded — see "Post-implementation discovery" below** | — |
+| ~~P8~~ | ~~Snapshot-install on follower recovery~~ | **Discarded — depended on P7** | — |
+
+## Post-implementation discovery (2026-05-09)
+
+P7 and P8 were implemented and exercised in CI before being discarded. The CI
+run revealed two related limitations of the deployed ArcadeDB image:
+
+1. `helm upgrade --set replicaCount=5` triggers a rolling restart of the
+   StatefulSet because the `arcadedb.ha.serverList` env var grows from 3 to 5
+   entries. With `persistence.enabled=false` (the CI default) every pod loses its
+   in-memory state during the restart cycle.
+2. ArcadeDB does not auto-vote new peers into the Raft configuration when a
+   pod with a wider serverList shows up — the support email itself notes that
+   `POST /api/v1/cluster/peer` must be called from the leader for each new
+   peer. The chart does not (and should not) issue that call.
+
+The combined effect: after `helm upgrade --set replicaCount=5`, the cluster
+falls below quorum during the rolling restart and never re-converges. P7's
+`assert_quorum_n 5` times out; P8 cannot run because it depends on a 5-pod
+cluster with the `integration-test` database.
+
+P7 and P8 are therefore removed from the in-scope list. The chart's
+serverList-rendering correctness is still covered by `helm-unittest` template
+unit tests, which assert the correct value without needing a live cluster.
+
+Future work: a "snapshot-and-restore" workflow (mentioned at the end of the
+support email) would let us cover the snapshot path on a single fresh cluster
+without the scale-up dependency. Out of scope here.
+
+## Architecture
+
+Single CI job, single kind cluster, single Helm install. Extend
+`ci/integration-test.sh` with new phases. Existing phases stay (rollout →
+quorum → write → read). New phases append on the same cluster, with one
+`helm upgrade` step in the middle. The order is chosen so that destructive
+scenarios (pod delete) run last and cannot mask earlier signals.
+
+Why one cluster instead of one per scenario: each kind cluster create costs
+~60 s. Sequencing keeps total CI time well under the existing 20-minute job
+timeout.
+
+### Install-time changes
+
+In the `Install chart` step of `.github/workflows/lint.yml`, append a low
+snapshot threshold to `arcadedb.extraCommands`:
+
+```
+--set 'arcadedb.extraCommands[1]=-Darcadedb.ha.snapshotThreshold=50'
+```
+
+Index 1 because index 0 holds the existing `-Darcadedb.server.mode=production`.
+
+A low threshold makes the snapshot-install path reachable without writing
+100k rows. It does not affect the other scenarios — they each generate fewer
+than 50 entries.
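+
+To confirm the override landed on a live release (a sketch; release name as used in CI):
+
+```bash
+helm get values test-arcadedb --all | grep -A3 extraCommands
+```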
+
+### Helpers (additions to `ci/integration-test.sh`)
+
+The existing script already has `pf_start`, `pf_stop`, `pf_wait`, `api`. Add the
+following; a sketch of the two status helpers appears after the list:
+
+- `cluster_status <port>` — fetches `GET /api/v1/cluster` and returns the
+  parsed JSON via stdout. Callers extract `.leaderId`, `.peers[]`, etc.
+- `peer_status <port> <peer-id>` — extracts a single peer's `.status`
+  field from cluster status. Returns an empty string if the field is absent.
+- `wait_status_healthy <port> <peer-id> <timeout-seconds>` — polls
+  `peer_status` until it returns `HEALTHY`. Treats `CATCHING_UP` as transient.
+  Fails on `STALLED` or `FALLING_BEHIND` only after the timeout.
+- `assert_quorum_n <n> [timeout-seconds]` — generalizes the existing
+  hardcoded 0/1/2 loop. Iterates ordinals 0..N-1, port-forwards each, reads
+  `leaderId` from each, and asserts they all agree.
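+
+A minimal sketch of the two status helpers, assuming the cluster-status JSON exposes `peers[].id` and `peers[].status` as described above; the final script may inline this logic instead:
+
+```bash
+# peer_status <port> <peer-id>: print one peer's status ("" if absent).
+peer_status() {
+  api "$1" GET /api/v1/cluster \
+    | jq -r --arg p "$2" '.peers[] | select(.id == $p) | .status // empty'
+}
+
+# wait_status_healthy <port> <peer-id> <timeout-seconds>: poll until HEALTHY.
+wait_status_healthy() {
+  local deadline=$(( SECONDS + $3 )) s
+  while (( SECONDS < deadline )); do
+    s=$(peer_status "$1" "$2")
+    [[ "$s" == "HEALTHY" ]] && return 0
+    sleep 5   # non-HEALTHY states are treated as transient until the timeout
+  done
+  echo "ERROR: peer $2 not HEALTHY after $3s (last: ${s:-})"
+  return 1
+}
+```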
+
+## Phase Detail
+
+### P5 — STATUS=HEALTHY assertion
+
+After the existing read assertion (phase 4), the script port-forwards to the
+leader and calls `GET /api/v1/cluster`. For each `peers[]` entry, assert that
+`status` is `HEALTHY` (or absent — see the graceful skip below).
+
+**Graceful skip on missing field:** If `.peers[0].status` is null/missing on
+the deployed image (older than `203acdaac`), log a warning and skip the
+assertion. Do not fail. This keeps the test compatible with image tags that
+predate the STATUS column.
+
+### P6 — Runtime leadership transfer
+
+1. From cluster status, pick a non-leader peer ID.
+2. `POST /api/v1/cluster/leader` with body `{"peerId":"<peer-id>"}`.
+3. Poll `GET /api/v1/cluster` from any pod for up to 30 s; assert `leaderId`
+   matches the chosen peer.
+4. Re-run the existing write+read sequence (insert a marker row, read it back)
+   via the new leader to confirm the cluster is still functional after the
+   transfer.
+
+Choosing a specific target peer (rather than sending an empty body) makes the
+assertion deterministic; Ratis would otherwise be free to re-elect the same
+pod and the test would have to retry.
+
+### P7 — Scale-up 3→5 via `helm upgrade`
+
+1. `helm upgrade test-arcadedb charts/arcadedb/ --set replicaCount=5
+   --reuse-values --wait --timeout 5m`.
+2. `kubectl rollout status statefulset/test-arcadedb --timeout 5m` to cover
+   the rolling restart of the original 3 pods plus the scheduling of pods 3 and 4.
+3. Run `assert_quorum_n 5`.
+4. Run the STATUS=HEALTHY assertion across all 5 peers (with the same
+   graceful-skip behavior as P5).
+
+**No data-persistence assertion.** The CI install runs with
+`persistence.enabled=false`, so the rolling restart of pods 0–2 wipes existing
+data. The assertion here is purely about cluster topology: the chart's
+`arcadedb.nodenames` helper must re-render the serverList correctly so that
+all 5 pods agree on a single leader and report HEALTHY.
+
+### P8 — Snapshot install on follower recovery
+
+1. From the post-scale-up leader, write 100 small rows in a loop. With
+   `snapshotThreshold=50` (set at install time), the leader will have produced
+   a Raft snapshot.
+2. Pick a non-leader pod (e.g. ordinal 4). `kubectl delete pod test-arcadedb-4`.
+3. Wait for the StatefulSet to recreate the pod and for it to reach `Ready`
+   (`kubectl wait --for=condition=Ready pod/test-arcadedb-4 --timeout=2m`).
+4. Poll cluster status for up to 90 s; assert the recreated peer reaches
+   `STATUS=HEALTHY`.
+5. **Secondary signal (best-effort):** `kubectl logs test-arcadedb-4` and grep
+   for `SnapshotInstaller`. Log the result but do not fail on a miss — log-line
+   wording is not a stable contract.
+
+Without persistence enabled, deleting the pod wipes its state, so the
+recreated pod's Raft log restarts at index 0. With the leader past 50 entries,
+the follower's gap exceeds the snapshot threshold, so the leader will install
+a snapshot rather than ship individual log entries.
+
+## Phase Ordering
+
+```
+1. Existing: rollout
+2. Existing: Raft consensus (3 pods)
+3. Existing: write via leader
+4. Existing: read back
+5. P5: STATUS=HEALTHY (3 pods)
+6. P6: leadership transfer + verify writes
+7. P7: helm upgrade to replicaCount=5, re-verify quorum + STATUS
+8. P8: delete pod, verify snapshot-install recovery
+```
+
+## CI Budget
+
+| Phase | Estimate |
+|-------|---------:|
+| kind create | ~60 s |
+| helm install + rollout | ~2 min |
+| Existing phases 1–4 | ~2 min |
+| P5 STATUS | ~5 s |
+| P6 leadership transfer | ~60 s |
+| P7 scale-up to 5 | ~3–5 min |
+| P8 snapshot recovery | ~2 min |
+| **Total** | **~11–13 min** |
+
+Comfortably under the 20-minute job timeout.
+
+## Risks and Mitigations
+
+- **STATUS field absent on older image tags.** The chart's `image.tag` defaults
+  to `appVersion`; if a release predates commit `203acdaac`, the STATUS field
+  is missing. Mitigation: graceful skip with a warning, not a hard failure.
+- **Leadership-transfer flake (Ratis re-elects the same pod).** Mitigation:
+  send an explicit `peerId` instead of an empty body.
+- **Scale-up rolling restart loses data.** Mitigation: do not assert data
+  survival in P7; only assert cluster topology.
+- **P8 is the most flake-prone phase.** It runs last so a P8 failure cannot
+  mask earlier signals. If P8 proves flaky in practice, gate it behind a
+  `RUN_SNAPSHOT_TEST=1` env var rather than disabling the rest of the file.
+
+## Acceptance
+
+The work is complete when:
+
+1. `make test-integration` against a kind cluster passes all 8 phases on a
+   chart pinned to an image tag that includes the STATUS column.
+2. The same script run against an image tag that predates the STATUS column
+   skips P5/P7's STATUS assertions with a warning and still passes the rest.
+3. CI (`.github/workflows/lint.yml`) installs the chart with the
+   `snapshotThreshold=50` override and runs the extended script in under
+   the existing 20-minute job timeout.
+4. The new helpers in `ci/integration-test.sh` are factored out and reused
+   across phases (no duplicated port-forward/poll loops).