diff --git a/spartan/aztec-chaos-scenarios/templates/validator-failure.yaml b/spartan/aztec-chaos-scenarios/templates/validator-failure.yaml
new file mode 100644
index 000000000000..95a6e5921465
--- /dev/null
+++ b/spartan/aztec-chaos-scenarios/templates/validator-failure.yaml
@@ -0,0 +1,28 @@
+{{- if (get (get .Values "validatorFailure") "enabled") }}
+---
+apiVersion: chaos-mesh.org/v1alpha1
+kind: PodChaos
+metadata:
+  name: {{ .Release.Name }}-{{ .Values.global.targetNamespace }}-validator-failure
+  namespace: {{ default .Values.global.chaosMeshNamespace .Values.global.chaosResourceNamespace }}
+  labels:
+    {{- include "aztec-chaos-scenarios.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/resource-policy": keep
+spec:
+  action: pod-failure
+  {{- if .Values.validatorFailure.percent }}
+  mode: fixed-percent
+  value: {{ .Values.validatorFailure.percent | quote }}
+  {{- else }}
+  mode: all
+  {{- end }}
+  selector:
+    namespaces:
+      - {{ .Values.global.targetNamespace }}
+    labelSelectors:
+      app.kubernetes.io/name: validator
+  duration: {{ .Values.validatorFailure.duration }}
+{{- end }}
+
+
diff --git a/spartan/aztec-chaos-scenarios/templates/validator-kill.yaml b/spartan/aztec-chaos-scenarios/templates/validator-kill.yaml
index 9db3b72be563..b8a1d5604aab 100644
--- a/spartan/aztec-chaos-scenarios/templates/validator-kill.yaml
+++ b/spartan/aztec-chaos-scenarios/templates/validator-kill.yaml
@@ -17,5 +17,5 @@ spec:
     namespaces:
       - {{ .Values.global.targetNamespace }}
     labelSelectors:
-      app.kubernetes.io/component: validator
+      app.kubernetes.io/component: sequencer-node
 {{- end }}
diff --git a/spartan/aztec-chaos-scenarios/values.yaml b/spartan/aztec-chaos-scenarios/values.yaml
index 5da0496981b8..94432f6d0625 100644
--- a/spartan/aztec-chaos-scenarios/values.yaml
+++ b/spartan/aztec-chaos-scenarios/values.yaml
@@ -94,6 +94,11 @@ validatorKill:
   enabled: false
   percent: 30
 
+validatorFailure:
+  enabled: false
+  duration: 60s
+  percent: "100"
+
 bootNodeFailure:
   enabled: false
   duration: 60m
diff --git a/spartan/aztec-chaos-scenarios/values/validator-failure.yaml b/spartan/aztec-chaos-scenarios/values/validator-failure.yaml
new file mode 100644
index 000000000000..a8141e770371
--- /dev/null
+++ b/spartan/aztec-chaos-scenarios/values/validator-failure.yaml
@@ -0,0 +1,6 @@
+global:
+  namespace: "smoke"
+
+validatorFailure:
+  enabled: true
+  duration: "60s"
diff --git a/spartan/aztec-node/templates/_pod-template.yaml b/spartan/aztec-node/templates/_pod-template.yaml
index 56593cb39aba..ec3b94f821e8 100644
--- a/spartan/aztec-node/templates/_pod-template.yaml
+++ b/spartan/aztec-node/templates/_pod-template.yaml
@@ -94,6 +94,8 @@ spec:
             path: /status
             port: {{ .Values.service.rpc.port }}
           periodSeconds: {{ .Values.node.startupProbe.periodSeconds }}
+          timeoutSeconds: {{ .Values.node.startupProbe.timeoutSeconds }}
+          initialDelaySeconds: {{ .Values.node.startupProbe.initialDelaySeconds }}
           failureThreshold: {{ .Values.node.startupProbe.failureThreshold }}
         volumeMounts:
           - name: shared
diff --git a/spartan/aztec-node/values.yaml b/spartan/aztec-node/values.yaml
index 8172d02143e6..ba0af0d16b2a 100644
--- a/spartan/aztec-node/values.yaml
+++ b/spartan/aztec-node/values.yaml
@@ -116,9 +116,13 @@ node:
   startupProbe:
     # -- Period seconds
     periodSeconds: 30
+    # -- Timeout seconds for each probe request
+    timeoutSeconds: 1
+    # -- Initial delay before starting probes
+    initialDelaySeconds: 0
     # -- Failure threshold
-    # 10 minutes default but this might not be enough if the node has to download a lot of blocks.
-    failureThreshold: 20
+    # 20 minutes default but this might not be enough if the node has to download a lot of blocks.
+    failureThreshold: 40
 
   resources: {}
 
diff --git a/spartan/bootstrap.sh b/spartan/bootstrap.sh
index cdf9a404e7e0..1906832e1e46 100755
--- a/spartan/bootstrap.sh
+++ b/spartan/bootstrap.sh
@@ -86,6 +86,7 @@ function network_test_cmds {
   echo $prefix $run_test_script simple src/spartan/mempool_limit.test.ts
   echo $prefix $run_test_script simple src/spartan/upgrade_governance_proposer.test.ts
   # echo $prefix $run_test_script simple src/spartan/reorg.test.ts #takes too long >~5 epochs
+  echo $prefix $run_test_script simple src/spartan/validator_nuke_and_suppression.test.ts
 }
 
 function single_test {
@@ -214,7 +215,7 @@ case "$cmd" in
     docker update --restart=no kind-control-plane >/dev/null || true
     ;;
   "chaos-mesh")
-    chaos-mesh/install.sh
+    scripts/deploy_chaos_mesh.sh
     ;;
   "metrics-kind")
     metrics/install-kind.sh
diff --git a/spartan/terraform/deploy-aztec-infra/values/validator.yaml b/spartan/terraform/deploy-aztec-infra/values/validator.yaml
index 1ea495115ba2..dbcd1482a775 100644
--- a/spartan/terraform/deploy-aztec-infra/values/validator.yaml
+++ b/spartan/terraform/deploy-aztec-infra/values/validator.yaml
@@ -1,6 +1,13 @@
 validator:
   node:
     logLevel: "debug; info: aztec:simulator, json-rpc"
+    # Validators can take a long time to come up (CRS download, heavy init, catch-up).
+    # Keep startupProbe generous to avoid kubelet killing the container mid-boot.
+    startupProbe:
+      periodSeconds: 30
+      timeoutSeconds: 5
+      initialDelaySeconds: 0
+      failureThreshold: 40
     preStartScript: |
       source /scripts/setup-attester-keystore.sh
 
diff --git a/yarn-project/end-to-end/src/spartan/utils.ts b/yarn-project/end-to-end/src/spartan/utils.ts
index c40dd9efb19e..69dfa7512657 100644
--- a/yarn-project/end-to-end/src/spartan/utils.ts
+++ b/yarn-project/end-to-end/src/spartan/utils.ts
@@ -246,7 +246,10 @@ export async function deleteResourceByLabel({
   force?: boolean;
 }) {
   try {
-    await execAsync(`kubectl api-resources --no-headers -o name | grep -Eq "^${resource}(\\\\..+)?$"`);
+    // Match both plain and group-qualified names (e.g., "podchaos" or "podchaos.chaos-mesh.org")
+    const escaped = resource.replace(/[-/\\^$*+?.()|[\]{}]/g, '\\$&');
+    const regex = `(^|\\.)${escaped}(\\.|$)`;
+    await execAsync(`kubectl api-resources --no-headers -o name | grep -Eq '${regex}'`);
   } catch (error) {
     logger.warn(`Resource type '${resource}' not found in cluster, skipping deletion ${error}`);
     return '';
@@ -279,6 +282,58 @@
   return stdout;
 }
 
+export async function waitForResourceByName({
+  resource,
+  name,
+  namespace,
+  condition = 'Ready',
+  timeout = '10m',
+}: {
+  resource: string;
+  name: string;
+  namespace: string;
+  condition?: string;
+  timeout?: string;
+}) {
+  const command = `kubectl wait ${resource}/${name} --for=condition=${condition} -n ${namespace} --timeout=${timeout}`;
+  logger.info(`command: ${command}`);
+  const { stdout } = await execAsync(command);
+  return stdout;
+}
+
+export async function waitForResourcesByName({
+  resource,
+  names,
+  namespace,
+  condition = 'Ready',
+  timeout = '10m',
+}: {
+  resource: string;
+  names: string[];
+  namespace: string;
+  condition?: string;
+  timeout?: string;
+}) {
+  if (!names.length) {
+    throw new Error(`No ${resource} names provided to waitForResourcesByName`);
+  }
+
+  // Wait all in parallel; if any fails, surface which one.
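+  // Note: Promise.all rejects as soon as the first wait fails; the remaining kubectl waits keep
+  // running in the background, but their results are discarded once the wrapped error below is thrown.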
+  await Promise.all(
+    names.map(async name => {
+      try {
+        await waitForResourceByName({ resource, name, namespace, condition, timeout });
+      } catch (err) {
+        throw new Error(
+          `Failed waiting for ${resource}/${name} condition=${condition} timeout=${timeout} namespace=${namespace}: ${String(
+            err,
+          )}`,
+        );
+      }
+    }),
+  );
+}
+
 export function getChartDir(spartanDir: string, chartName: string) {
   return path.join(spartanDir.trim(), chartName);
 }
@@ -458,6 +513,29 @@ export function applyProverFailure({
   });
 }
 
+export function applyValidatorFailure({
+  namespace,
+  spartanDir,
+  logger,
+  values,
+  instanceName,
+}: {
+  namespace: string;
+  spartanDir: string;
+  logger: Logger;
+  values?: Record<string, string | number>;
+  instanceName?: string;
+}) {
+  return installChaosMeshChart({
+    instanceName: instanceName ?? 'validator-failure',
+    targetNamespace: namespace,
+    valuesFile: 'validator-failure.yaml',
+    helmChartDir: getChartDir(spartanDir, 'aztec-chaos-scenarios'),
+    values,
+    logger,
+  });
+}
+
 export function applyProverKill({
   namespace,
   spartanDir,
@@ -537,18 +615,21 @@ export function applyValidatorKill({
   spartanDir,
   logger,
   values,
+  clean = true,
 }: {
   instanceName?: string;
   namespace: string;
   spartanDir: string;
   logger: Logger;
   values?: Record<string, string | number>;
+  clean?: boolean;
 }) {
   return installChaosMeshChart({
-    instanceName,
+    instanceName: instanceName ?? 'validator-kill',
     targetNamespace: namespace,
     valuesFile: 'validator-kill.yaml',
     helmChartDir: getChartDir(spartanDir, 'aztec-chaos-scenarios'),
+    clean,
     logger,
     values,
   });
diff --git a/yarn-project/end-to-end/src/spartan/validator_nuke_and_suppression.test.ts b/yarn-project/end-to-end/src/spartan/validator_nuke_and_suppression.test.ts
new file mode 100644
index 000000000000..5280db3ba1c5
--- /dev/null
+++ b/yarn-project/end-to-end/src/spartan/validator_nuke_and_suppression.test.ts
@@ -0,0 +1,274 @@
+import { EthAddress } from '@aztec/aztec.js/addresses';
+import { createAztecNodeClient } from '@aztec/aztec.js/node';
+import { RollupContract } from '@aztec/ethereum/contracts';
+import { ChainMonitor } from '@aztec/ethereum/test';
+import type { ViemPublicClient } from '@aztec/ethereum/types';
+import { EpochNumber, SlotNumber } from '@aztec/foundation/branded-types';
+import { createLogger } from '@aztec/foundation/log';
+import { retryUntil } from '@aztec/foundation/retry';
+import { sleep } from '@aztec/foundation/sleep';
+import { type L1RollupConstants, getSlotRangeForEpoch, getStartTimestampForEpoch } from '@aztec/stdlib/epoch-helpers';
+
+import { expect, jest } from '@jest/globals';
+import type { ChildProcess } from 'child_process';
+import { createPublicClient, fallback, http } from 'viem';
+
+import {
+  applyValidatorFailure,
+  applyValidatorKill,
+  getExternalIP,
+  getGitProjectRoot,
+  getPublicViemClient,
+  getSequencers,
+  setupEnvironment,
+  startPortForwardForRPC,
+  uninstallChaosMesh,
+  updateSequencersConfig,
+  waitForResourcesByName,
+} from './utils.js';
+
+describe('validator suppression and nuke with slashing assertions', () => {
+  jest.setTimeout(60 * 60 * 1000); // 60 minutes
+
+  const logger = createLogger('e2e:spartan:suppress-nuke-slash');
+  const config = setupEnvironment(process.env);
+  const forwardProcesses: ChildProcess[] = [];
+
+  let client: ViemPublicClient;
+  let rollup: RollupContract;
+  let constants: Omit<L1RollupConstants, 'l1StartBlock'>;
+  let monitor: ChainMonitor;
+  let nodeRpcUrl: string;
+  let spartanDir: string;
+  const killReleases: string[] = [];
+
+  beforeAll(async () => {
+    const chaosReleases = ['validator-failure'];
+    await Promise.all(
+      chaosReleases.map(name =>
+        uninstallChaosMesh(name, config.NAMESPACE, logger).catch(() =>
+          logger.verbose(`Not Found/Failed to pre-clean chaos release ${name}`),
+        ),
+      ),
+    );
+
+    // Prefer direct access to the Aztec RPC if it is exposed, otherwise fall back to port-forward.
+    try {
+      const rpcIP = await getExternalIP(config.NAMESPACE, 'rpc-aztec-node');
+      if (!rpcIP) {
+        throw new Error('No external IP for rpc-aztec-node service');
+      }
+      nodeRpcUrl = `http://${rpcIP}:8080`;
+      logger.info(`Found external Aztec RPC url`);
+    } catch (err) {
+      logger.warn(`Failed to use external Aztec RPC, falling back to port-forward`, err as Error);
+      const { process: rpcProc, port } = await startPortForwardForRPC(config.NAMESPACE);
+      forwardProcesses.push(rpcProc);
+      nodeRpcUrl = `http://127.0.0.1:${port}`;
+    }
+
+    // Use the Aztec RPC endpoint to fetch the L1 deployment addresses
+    const deployAddresses = await createAztecNodeClient(nodeRpcUrl)
+      .getNodeInfo()
+      .then(i => i.l1ContractAddresses);
+
+    try {
+      const ethExecutionIp = await getExternalIP(config.NAMESPACE, 'eth-execution');
+      if (!ethExecutionIp) {
+        throw new Error('No external IP for eth-execution service');
+      }
+      const url = `http://${ethExecutionIp}:8545`;
+      client = createPublicClient({ transport: fallback([http(url, { batch: false, timeout: 60_000 })]) });
+      // Ensure the endpoint is actually responsive; otherwise fall back to port-forward.
+      await client.getBlockNumber();
+      logger.info(`Found external L1 RPC url`);
+    } catch (err) {
+      logger.warn(`Failed to use external L1 RPC, falling back to port-forward`, err as Error);
+      const viem = await getPublicViemClient(config, forwardProcesses);
+      client = viem.client;
+    }
+
+    rollup = new RollupContract(client, deployAddresses.rollupAddress);
+    monitor = new ChainMonitor(rollup, undefined, logger.createChild('chain-monitor'), 500).start();
+
+    constants = await rollup.getRollupConstants();
+    spartanDir = `${getGitProjectRoot()}/spartan`;
+
+    await monitor.run();
+  });
+
+  afterAll(async () => {
+    // Ensure we don't leave validators disabled
+    await updateSequencersConfig(config, { disabledValidators: [] }).catch(() => undefined);
+    const chaosReleases = ['validator-failure', ...killReleases];
+    await Promise.all(
+      chaosReleases.map(name =>
+        uninstallChaosMesh(name, config.NAMESPACE, logger).catch(() =>
+          logger.warn(`Not Found/Failed to post-clean chaos release ${name}`),
+        ),
+      ),
+    );
+
+    monitor?.removeAllListeners();
+    await monitor?.stop();
+    forwardProcesses.forEach(p => p.kill());
+  });
+
+  // Wait for validator/sequencer pods by name (discovered via `getSequencers`) to satisfy a condition.
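+  // The `condition` value is passed straight through to `kubectl wait --for=condition=...`, so strings
+  // such as 'Ready', 'Ready=false' or 'PodReadyToStartContainers' are all accepted here.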
+  const waitValidators = async (condition: string, timeout = '5m') => {
+    const pods = await getSequencers(config.NAMESPACE);
+    if (!pods.length) {
+      throw new Error(`No validator/sequencer pods found in namespace ${config.NAMESPACE}`);
+    }
+    await waitForResourcesByName({
+      resource: 'pod',
+      names: pods,
+      namespace: config.NAMESPACE,
+      condition,
+      timeout,
+    });
+  };
+
+  it('suppresses next-epoch committee, nukes repeatedly, then resumes quickly with no missed slots and no slashing', async () => {
+    // Count slots via l2-slot deltas, and blocks via pending tips delta within [startSlot, endSlot)
+    const countSlotsAndBlocks = async (startSlot: SlotNumber, endSlot: SlotNumber) => {
+      // Bound the window by observed slots
+      await monitor.waitUntilL2Slot(startSlot);
+      const startObserved = Number(monitor.l2SlotNumber);
+      const tipsStart = await rollup.getTips();
+
+      await monitor.waitUntilL2Slot(endSlot);
+      const endObserved = Number(monitor.l2SlotNumber);
+      const tipsEnd = await rollup.getTips();
+
+      const slotTotal = endObserved - startObserved;
+      const blockCount = tipsEnd.pending - tipsStart.pending;
+      return { slotTotal, blockCount };
+    };
+
+    // Next epoch committee discovery (to keep track of slashing)
+    const getNextEpochCommittee = async () => {
+      const startEpoch = await rollup.getCurrentEpoch();
+      logger.warn(`Retrieving committee for next epoch (current epoch is ${startEpoch})`);
+      return await retryUntil(
+        async () => {
+          const nextEpoch = EpochNumber((await rollup.getCurrentEpoch()) + 1);
+          const nextEpochStartTimestamp = getStartTimestampForEpoch(nextEpoch, constants);
+          const committee = (await rollup.getCommitteeAt(nextEpochStartTimestamp)) as string[];
+          if (committee && committee.length > 0) {
+            logger.warn(`Retrieved committee for epoch ${nextEpoch}`, { committee });
+            return { committee, epoch: nextEpoch };
+          }
+        },
+        'committee',
+        constants.epochDuration * constants.slotDuration * 4, // up to 4 epochs
+        1,
+      );
+    };
+
+    // Keep track of slashing events from the committee
+    const { committee, epoch } = await getNextEpochCommittee();
+    const committeeEthAddresses = committee.map((a: string) => EthAddress.fromString(a));
+    // Subscribe early to slash events for this committee to avoid missing early executions
+    const committeeSet = new Set(committeeEthAddresses.map((a: EthAddress) => a.toString()));
+    const observedSlashes = new Map();
+    const unsubscribeGlobalSlash = rollup.listenToSlash((data: { amount: bigint; attester: EthAddress }) => {
+      const key = data.attester.toString();
+      if (committeeSet.has(key) && !observedSlashes.has(key)) {
+        observedSlashes.set(key, { amount: data.amount, attester: data.attester });
+        logger.warn(`(early) observed slash for ${key} amount=${data.amount}`);
+      }
+    });
+
+    // Wait until the slot just before the suppressed epoch, then start suppression
+    const slotRange = getSlotRangeForEpoch(epoch, constants);
+    const slotBeforeSuppressedEpoch = SlotNumber(slotRange[0] - 1);
+    const slotBeginningSuppressedEpoch = SlotNumber(slotRange[0]);
+    const slotEndSuppressedEpoch = SlotNumber(slotRange[1]);
+    logger.info(
+      `Waiting until slot ${slotBeforeSuppressedEpoch} to start suppression (current ${monitor.l2SlotNumber})`,
+    );
+    await monitor.waitUntilL2Slot(slotBeforeSuppressedEpoch);
+    // Start the failure a bit before the suppressed epoch's first slot (sleep takes milliseconds)
+    const percentOfSlotToBuffer = 0.7;
+    const delaySeconds = Number(constants.slotDuration) * percentOfSlotToBuffer;
+    const remainingSeconds = Number(constants.slotDuration) - delaySeconds;
+    await sleep(delaySeconds * 1000);
+
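+    // The ~30% of the slot left after the sleep above is folded into the chaos duration below, so the
+    // pod-failure window still covers the entire suppressed epoch.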
+    // Suppress validators for the next epoch
+    const durationSeconds = Math.ceil(Number(constants.epochDuration * constants.slotDuration + remainingSeconds));
+    await applyValidatorFailure({
+      namespace: config.NAMESPACE,
+      spartanDir,
+      logger,
+      values: {
+        'validatorFailure.duration': `${durationSeconds}s`,
+        'global.chaosResourceNamespace': config.NAMESPACE,
+      },
+    });
+    // Ensure validators are NotReady before entering suppression window
+    try {
+      await waitValidators('Ready=false', '3m');
+    } catch {
+      logger.warn('Validators did not reach NotReady state before suppression window');
+    }
+
+    const { slotTotal: suppressedSlots, blockCount: suppressedBlocks } = await countSlotsAndBlocks(
+      slotBeginningSuppressedEpoch,
+      slotEndSuppressedEpoch,
+    );
+
+    logger.info(`suppression window slots=${suppressedSlots} blocks=${suppressedBlocks}`);
+    // Assertions can be flaky due to variable chart deployment relative to absolute slot times
+    // expect(suppressedSlots).toBe(constants.epochDuration); // Slots should increment
+    expect(suppressedBlocks).toBe(0); // No blocks should be produced
+
+    // Gate on PodReadyToStartContainers instead of Ready to avoid block building/slashing when only partial validators are up
+    await waitValidators('PodReadyToStartContainers', '15m');
+    logger.info(`Validators recovered after suppression epoch`);
+
+    // Perform the nuke cycles and ensure no slashing occurs
+    const rounds = 4; // 3–5
+    for (let i = 1; i < rounds; i++) {
+      logger.info(`nuke round ${i + 1}/${rounds}`);
+
+      const releaseName = `validator-kill-${i + 1}`;
+      killReleases.push(releaseName);
+      await applyValidatorKill({
+        namespace: config.NAMESPACE,
+        spartanDir,
+        logger,
+        values: {
+          'validatorKill.percent': 100,
+          // Ensure chaos resources are created in the scenario namespace (mirrors prover kill tests)
+          'global.chaosResourceNamespace': config.NAMESPACE,
+        },
+        clean: false,
+        instanceName: releaseName,
+      });
+      await sleep(3000);
+    }
+    await waitValidators('Ready', '15m');
+
+    // Check that we have started producing blocks again, then check that a clean (no missed slots) epoch follows
+    const pendingTipsBefore = (await rollup.getTips()).pending;
+    const afterNukesEpoch = EpochNumber((await rollup.getCurrentEpoch()) + 1);
+    const afterNukesStart = getSlotRangeForEpoch(afterNukesEpoch, constants)[0];
+    const afterNukesEnd = getSlotRangeForEpoch(EpochNumber(afterNukesEpoch + 1), constants)[0];
+    const { slotTotal: postNukeSlots, blockCount: postNukeBlocks } = await countSlotsAndBlocks(
+      afterNukesStart,
+      afterNukesEnd,
+    );
+    const missedAfterNukes = postNukeSlots - postNukeBlocks;
+    logger.info(
+      `post-nukes epoch from=${afterNukesStart} to=${afterNukesEnd} slots=${postNukeSlots} blocks=${postNukeBlocks} missed=${missedAfterNukes}`,
+    );
+    const pendingTipsAfter = (await rollup.getTips()).pending;
+    expect(pendingTipsAfter).toBeGreaterThan(pendingTipsBefore);
+    expect(missedAfterNukes).toBe(0);
+
+    // Additionally assert that no slashing occurred during the test window
+    expect(observedSlashes.size).toBe(0);
+    unsubscribeGlobalSlash();
+  });
+});