From 610b1ef1a888b9a49474db2fd5b706916faed759 Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Mon, 22 Jun 2026 21:00:29 +0000 Subject: [PATCH] Change liveness check for eventually_conservation The previous version of eventually_conservation relied on polling /metrics to infer whether or not the data topology in the scenarios was active. This inference is a race: /metrics might report while the source/sink are still coming online, so in a small number of cases scenarios failed on this race. The approach taken here is to bump the "is it back online" check to 180s and poll directly on the topology, not on /metrics. I've taken the opportunity to slim vector_e2e a little further also, removing vestigial restart config update material. This rare failure will be fixed in the disk buffer scenario as well. --- .../src/bin/eventually_conservation.rs | 9 ++++- .../scenarios/vector_e2e/Dockerfile | 7 +--- .../antithesis/scenarios/vector_e2e/README.md | 4 +-- .../scenarios/vector_e2e/anytime_reload.sh | 34 ------------------ .../scenarios/vector_e2e/docker-compose.yaml | 4 --- .../scenarios/vector_e2e/vector.b.yaml | 36 ------------------- 6 files changed, 10 insertions(+), 84 deletions(-) delete mode 100755 tests/antithesis/scenarios/vector_e2e/anytime_reload.sh delete mode 100644 tests/antithesis/scenarios/vector_e2e/vector.b.yaml diff --git a/tests/antithesis/harness/src/bin/eventually_conservation.rs b/tests/antithesis/harness/src/bin/eventually_conservation.rs index 947d6d9372d2a..452a1ef898458 100644 --- a/tests/antithesis/harness/src/bin/eventually_conservation.rs +++ b/tests/antithesis/harness/src/bin/eventually_conservation.rs @@ -226,7 +226,14 @@ async fn main() { // post retry until one sticks, since a node can briefly refuse a write while it is // still recovering. A wedged node never delivers it and fails here. Runs // unconditionally. 
- let deadline = time::Instant::now() + time::Duration::from_secs(45); + // + // The recovery gate above only proves the metrics endpoint answers. That is a + // separate listener from the source's data path, so the source and sink can still + // be unready while metrics already serve, and a just-restarted node needs time to + // bring them up. The round-trip is therefore the real readiness signal and gets the + // same budget as recovery rather than a tight window that expires before the data + // path is serving. + let deadline = time::Instant::now() + time::Duration::from_secs(180); let mut probe = None; let mut progressed = false; while !progressed && time::Instant::now() < deadline { diff --git a/tests/antithesis/scenarios/vector_e2e/Dockerfile b/tests/antithesis/scenarios/vector_e2e/Dockerfile index 350fc48a9e165..7f2d0e21b170f 100644 --- a/tests/antithesis/scenarios/vector_e2e/Dockerfile +++ b/tests/antithesis/scenarios/vector_e2e/Dockerfile @@ -76,13 +76,8 @@ FROM debian:stable-slim AS vector RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates \ && rm -rf /var/lib/apt/lists/* COPY --from=vector-build /usr/local/bin/vector /usr/bin/vector -# Bake the node config plus its benign alternate, which the reload fault swaps in -# to force a sink rebuild. +# Bake the node config; compose selects it via --config. COPY tests/antithesis/scenarios/vector_e2e/vector.yaml /etc/vector/vector.yaml -COPY tests/antithesis/scenarios/vector_e2e/vector.b.yaml /etc/vector/vector.b.yaml -# The reload fault is an anytime_ test command that runs IN the node container. -# The node stays running because its entrypoint is Vector, not a test command. 
-COPY --chmod=755 tests/antithesis/scenarios/vector_e2e/anytime_reload.sh /opt/antithesis/test/v1/ve2e/anytime_reload RUN mkdir -p /symbols && ln -s /usr/bin/vector /symbols/vector ENV NO_COLOR=1 EXPOSE 8080 9598 diff --git a/tests/antithesis/scenarios/vector_e2e/README.md b/tests/antithesis/scenarios/vector_e2e/README.md index 4448e918b7e94..4aabdc33a01b3 100644 --- a/tests/antithesis/scenarios/vector_e2e/README.md +++ b/tests/antithesis/scenarios/vector_e2e/README.md @@ -26,9 +26,7 @@ One Vector node and one oracle container. - **vector** takes an `http_server` source (`:8080`) and delivers over `http` to the oracle through an in-memory buffer with `when_full: block` and e2e acks. It - also exposes Prometheus metrics (`:9598`) for the health gate, and runs the - reload fault: an `anytime_` command swaps `vector.yaml`/`vector.b.yaml` and sends - `SIGHUP`, forcing the sink to rebuild mid-run. + also exposes Prometheus metrics (`:9598`) for the health gate. - **oracle** (`:8686`) is one container that injects unique event ids at the node and runs the HTTP endpoint the node's sink delivers back to. diff --git a/tests/antithesis/scenarios/vector_e2e/anytime_reload.sh b/tests/antithesis/scenarios/vector_e2e/anytime_reload.sh deleted file mode 100755 index 71416aaf9324b..0000000000000 --- a/tests/antithesis/scenarios/vector_e2e/anytime_reload.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail -[ -n "${VECTOR_CONFIG_ALT:-}" ] || exit 0 -cfg="${VECTOR_CONFIG:?}" -alt="${VECTOR_CONFIG_ALT:?}" - -# Vector only ever reads $cfg, so reload alternates $cfg between two immutable -# sources rather than swapping two live files. The alternate $alt is never -# written, and the baseline (the original $cfg) is snapshotted once, so the only -# mutable file is $cfg and the only writes to it are a single rename of a fully -# written temp. 
The node-termination fault can therefore interrupt this script at -# any point and leave $cfg as one complete config or the other, never half-written -# and never collapsed so both sources hold the same content. Alternation always -# resumes on the next invocation. -base="$cfg.orig" -if [ ! -f "$base" ]; then - cp "$cfg" "$base.tmp" - mv "$base.tmp" "$base" -fi - -# Pick whichever source is not currently live. cksum reads from stdin so its -# output is the checksum alone, with no filename to differ on. -if [ "$(cksum <"$cfg")" = "$(cksum <"$alt")" ]; then - next="$base" -else - next="$alt" -fi -cp "$next" "$cfg.tmp" -mv "$cfg.tmp" "$cfg" - -# Vector is PID 1 in the node container. SIGHUP triggers reload-from-disk. -kill -HUP 1 -sleep 5 diff --git a/tests/antithesis/scenarios/vector_e2e/docker-compose.yaml b/tests/antithesis/scenarios/vector_e2e/docker-compose.yaml index abbc3a136b36d..f18a794ad251a 100644 --- a/tests/antithesis/scenarios/vector_e2e/docker-compose.yaml +++ b/tests/antithesis/scenarios/vector_e2e/docker-compose.yaml @@ -21,12 +21,8 @@ services: build: *vector-build image: ve2e-vector:${ANTITHESIS_IMAGE_TAG:-dev} entrypoint: ["/usr/bin/vector", "--config", "/etc/vector/vector.yaml"] - # vector runs the reload fault: VECTOR_CONFIG_ALT lets anytime_reload swap - # configs and SIGHUP, forcing the sink to rebuild. No disk buffer, so no volume. 
environment: NO_COLOR: "1" - VECTOR_CONFIG: "/etc/vector/vector.yaml" - VECTOR_CONFIG_ALT: "/etc/vector/vector.b.yaml" healthcheck: *node-health oracle: diff --git a/tests/antithesis/scenarios/vector_e2e/vector.b.yaml b/tests/antithesis/scenarios/vector_e2e/vector.b.yaml deleted file mode 100644 index d426c7413699a..0000000000000 --- a/tests/antithesis/scenarios/vector_e2e/vector.b.yaml +++ /dev/null @@ -1,36 +0,0 @@ -sources: - in: - type: http_server - address: 0.0.0.0:8080 - decoding: - codec: json - acknowledgements: - enabled: true - - metrics: - type: internal_metrics - scrape_interval_secs: 1 - -sinks: - out: - type: http - inputs: [in] - uri: http://oracle:8686/ingest - method: post - encoding: - codec: json - # Benign alternate the reload fault swaps in. It differs from vector.yaml only - # by an explicit request timeout, enough to make the reload rebuild the sink. - request: - timeout_secs: 45 - buffer: - type: memory - max_events: 500 - when_full: block - acknowledgements: - enabled: true - - prom: - type: prometheus_exporter - inputs: [metrics] - address: 0.0.0.0:9598