diff --git a/tests/antithesis/harness/src/bin/eventually_conservation.rs b/tests/antithesis/harness/src/bin/eventually_conservation.rs index 947d6d9372d2a..452a1ef898458 100644 --- a/tests/antithesis/harness/src/bin/eventually_conservation.rs +++ b/tests/antithesis/harness/src/bin/eventually_conservation.rs @@ -226,7 +226,14 @@ async fn main() { // post retry until one sticks, since a node can briefly refuse a write while it is // still recovering. A wedged node never delivers it and fails here. Runs // unconditionally. - let deadline = time::Instant::now() + time::Duration::from_secs(45); + // + // The recovery gate above only proves the metrics endpoint answers. That is a + // separate listener from the source's data path, so the source and sink can still + // be unready while metrics already serve, and a just-restarted node needs time to + // bring them up. The round-trip is therefore the real readiness signal and gets the + // same budget as recovery rather than a tight window that expires before the data + // path is serving. + let deadline = time::Instant::now() + time::Duration::from_secs(180); let mut probe = None; let mut progressed = false; while !progressed && time::Instant::now() < deadline { diff --git a/tests/antithesis/scenarios/vector_e2e/Dockerfile b/tests/antithesis/scenarios/vector_e2e/Dockerfile index 350fc48a9e165..7f2d0e21b170f 100644 --- a/tests/antithesis/scenarios/vector_e2e/Dockerfile +++ b/tests/antithesis/scenarios/vector_e2e/Dockerfile @@ -76,13 +76,8 @@ FROM debian:stable-slim AS vector RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates \ && rm -rf /var/lib/apt/lists/* COPY --from=vector-build /usr/local/bin/vector /usr/bin/vector -# Bake the node config plus its benign alternate, which the reload fault swaps in -# to force a sink rebuild. +# Bake the node config; compose selects it via --config. COPY tests/antithesis/scenarios/vector_e2e/vector.yaml /etc/vector/vector.yaml -COPY tests/antithesis/scenarios/vector_e2e/vector.b.yaml /etc/vector/vector.b.yaml -# The reload fault is an anytime_ test command that runs IN the node container. -# The node stays running because its entrypoint is Vector, not a test command. -COPY --chmod=755 tests/antithesis/scenarios/vector_e2e/anytime_reload.sh /opt/antithesis/test/v1/ve2e/anytime_reload RUN mkdir -p /symbols && ln -s /usr/bin/vector /symbols/vector ENV NO_COLOR=1 EXPOSE 8080 9598 diff --git a/tests/antithesis/scenarios/vector_e2e/README.md b/tests/antithesis/scenarios/vector_e2e/README.md index 4448e918b7e94..4aabdc33a01b3 100644 --- a/tests/antithesis/scenarios/vector_e2e/README.md +++ b/tests/antithesis/scenarios/vector_e2e/README.md @@ -26,9 +26,7 @@ One Vector node and one oracle container. - **vector** takes an `http_server` source (`:8080`) and delivers over `http` to the oracle through an in-memory buffer with `when_full: block` and e2e acks. It - also exposes Prometheus metrics (`:9598`) for the health gate, and runs the - reload fault: an `anytime_` command swaps `vector.yaml`/`vector.b.yaml` and sends - `SIGHUP`, forcing the sink to rebuild mid-run. + also exposes Prometheus metrics (`:9598`) for the health gate. - **oracle** (`:8686`) is one container that injects unique event ids at the node and runs the HTTP endpoint the node's sink delivers back to. diff --git a/tests/antithesis/scenarios/vector_e2e/anytime_reload.sh b/tests/antithesis/scenarios/vector_e2e/anytime_reload.sh deleted file mode 100755 index 71416aaf9324b..0000000000000 --- a/tests/antithesis/scenarios/vector_e2e/anytime_reload.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail -[ -n "${VECTOR_CONFIG_ALT:-}" ] || exit 0 -cfg="${VECTOR_CONFIG:?}" -alt="${VECTOR_CONFIG_ALT:?}" - -# Vector only ever reads $cfg, so reload alternates $cfg between two immutable -# sources rather than swapping two live files. The alternate $alt is never -# written, and the baseline (the original $cfg) is snapshotted once, so the only -# mutable file is $cfg and the only writes to it are a single rename of a fully -# written temp. The node-termination fault can therefore interrupt this script at -# any point and leave $cfg as one complete config or the other, never half-written -# and never collapsed so both sources hold the same content. Alternation always -# resumes on the next invocation. -base="$cfg.orig" -if [ ! -f "$base" ]; then - cp "$cfg" "$base.tmp" - mv "$base.tmp" "$base" -fi - -# Pick whichever source is not currently live. cksum reads from stdin so its -# output is the checksum alone, with no filename to differ on. -if [ "$(cksum <"$cfg")" = "$(cksum <"$alt")" ]; then - next="$base" -else - next="$alt" -fi -cp "$next" "$cfg.tmp" -mv "$cfg.tmp" "$cfg" - -# Vector is PID 1 in the node container. SIGHUP triggers reload-from-disk. -kill -HUP 1 -sleep 5 diff --git a/tests/antithesis/scenarios/vector_e2e/docker-compose.yaml b/tests/antithesis/scenarios/vector_e2e/docker-compose.yaml index abbc3a136b36d..f18a794ad251a 100644 --- a/tests/antithesis/scenarios/vector_e2e/docker-compose.yaml +++ b/tests/antithesis/scenarios/vector_e2e/docker-compose.yaml @@ -21,12 +21,8 @@ services: build: *vector-build image: ve2e-vector:${ANTITHESIS_IMAGE_TAG:-dev} entrypoint: ["/usr/bin/vector", "--config", "/etc/vector/vector.yaml"] - # vector runs the reload fault: VECTOR_CONFIG_ALT lets anytime_reload swap - # configs and SIGHUP, forcing the sink to rebuild. No disk buffer, so no volume. environment: NO_COLOR: "1" - VECTOR_CONFIG: "/etc/vector/vector.yaml" - VECTOR_CONFIG_ALT: "/etc/vector/vector.b.yaml" healthcheck: *node-health oracle: diff --git a/tests/antithesis/scenarios/vector_e2e/vector.b.yaml b/tests/antithesis/scenarios/vector_e2e/vector.b.yaml deleted file mode 100644 index d426c7413699a..0000000000000 --- a/tests/antithesis/scenarios/vector_e2e/vector.b.yaml +++ /dev/null @@ -1,36 +0,0 @@ -sources: - in: - type: http_server - address: 0.0.0.0:8080 - decoding: - codec: json - acknowledgements: - enabled: true - - metrics: - type: internal_metrics - scrape_interval_secs: 1 - -sinks: - out: - type: http - inputs: [in] - uri: http://oracle:8686/ingest - method: post - encoding: - codec: json - # Benign alternate the reload fault swaps in. It differs from vector.yaml only - # by an explicit request timeout, enough to make the reload rebuild the sink. - request: - timeout_secs: 45 - buffer: - type: memory - max_events: 500 - when_full: block - acknowledgements: - enabled: true - - prom: - type: prometheus_exporter - inputs: [metrics] - address: 0.0.0.0:9598