From 1e1873d80894161b311974b51e864977c5d44dbd Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Fri, 29 May 2026 17:17:13 +0000 Subject: [PATCH] chore(unit tests): failing disk_v2 bug demonstrations Seven tests that assert the CORRECT invariant and therefore FAIL against current Vector, each reproducing a real disk_v2 data-loss / accounting bug: #24606 (component drop metric), #21683 (total_buffer_size decrement underflow), get_total_records 0-1 underflow, #24948 (writer drop loses buffered events), finalizer status-discard (rejected delivery acked), reader.rs:932 file-id rollover, reader.rs:524 size-delta underflow. Runnable with plain cargo test -p vector-buffers. --- lib/vector-buffers/src/buffer_usage_data.rs | 64 +++++ .../disk_v2/tests/acknowledgements.rs | 50 ++++ .../src/variants/disk_v2/tests/invariants.rs | 233 ++++++++++++++++++ .../vector_to_vector_e2e_disk/.gitignore | 1 + .../vector_to_vector_e2e_disk/Dockerfile | 8 +- .../anytime_reload.sh | 16 -- .../docker-compose.yaml | 2 - .../vector_to_vector_e2e_disk/head.b.yaml | 34 --- .../vector_to_vector_e2e_disk/launch.sh | 22 +- .../antithesis/scratchbook/experiment-spec.md | 4 +- tests/antithesis/scratchbook/grind-plan.md | 5 +- .../config-reload-no-silent-loss.md | 7 +- 12 files changed, 373 insertions(+), 73 deletions(-) create mode 100644 tests/antithesis/scenarios/vector_to_vector_e2e_disk/.gitignore delete mode 100644 tests/antithesis/scenarios/vector_to_vector_e2e_disk/anytime_reload.sh delete mode 100644 tests/antithesis/scenarios/vector_to_vector_e2e_disk/head.b.yaml diff --git a/lib/vector-buffers/src/buffer_usage_data.rs b/lib/vector-buffers/src/buffer_usage_data.rs index b418023657bb5..80e1df130a485 100644 --- a/lib/vector-buffers/src/buffer_usage_data.rs +++ b/lib/vector-buffers/src/buffer_usage_data.rs @@ -505,4 +505,68 @@ mod tests { assert_eq!(current.event_count, 10); assert_eq!(current.event_byte_size, 1000); } + + /// Demonstration of Vector issue #24606. + /// + /// When a disk buffer drops events because of `when_full = drop_newest`, the + /// buffer-level `buffer_discarded_events_total` counter is incremented, but + /// the component-level `component_discarded_events_total` (the metric + /// operators monitor for data loss) is NEVER emitted by the buffer drop + /// path — so the loss is silent on standard dashboards. + /// + /// This exercises the exact reporter code path that runs for real + /// drop_newest drops: `BufferUsageData::report` -> `emit(BufferEventsDropped + /// { intentional: true, reason: "drop_newest", .. })`. A local metrics + /// recorder captures everything emitted. + /// + /// CORRECT INVARIANT (asserted here): dropped events must also be counted at + /// the component level so operators see the loss. This test currently FAILS + /// against Vector because #24606 is unfixed — `component_discarded_events_total` + /// stays at 0 for disk-buffer drop_newest drops. The failure IS the bug + /// demonstration; it will pass once #24606 is fixed. + #[test] + fn drop_newest_drops_should_increment_component_discarded_metric_issue_24606() { + use metrics_util::debugging::{DebugValue, DebuggingRecorder}; + + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + metrics::with_local_recorder(&recorder, || { + let data = BufferUsageData::new(0); + let mut metrics = ReporterCurrentMetrics::default(); + // 5 events dropped by `when_full = drop_newest` (intentional drops), + // exactly as `BufferSender` records them on the disk-buffer drop path. + data.dropped_intentional.increment(5, 500); + data.report(&mut metrics, "demo_disk_buffer"); + }); + + let mut buffer_discarded = 0u64; + let mut component_discarded = 0u64; + for (ckey, _unit, _desc, value) in snapshotter.snapshot().into_vec() { + let name = ckey.key().name().to_string(); + if let DebugValue::Counter(c) = value { + if name == "buffer_discarded_events_total" { + buffer_discarded += c; + } else if name == "component_discarded_events_total" { + component_discarded += c; + } + } + } + + // The buffer accounted for the drops at the buffer level: + assert_eq!( + buffer_discarded, 5, + "buffer_discarded_events_total should reflect the 5 drop_newest drops" + ); + // CORRECT INVARIANT: the component-level counter operators watch for + // data loss MUST also reflect the 5 drops. This currently FAILS (actual + // 0) — that failure is the #24606 demonstration: disk-buffer drop_newest + // drops are invisible on standard dashboards. + assert_eq!( + component_discarded, 5, + "#24606: component_discarded_events_total must reflect drop_newest \ + drops (got {component_discarded}; the buffer never emits the \ + component-level metric, so the loss is silent on dashboards)" + ); + } } diff --git a/lib/vector-buffers/src/variants/disk_v2/tests/acknowledgements.rs b/lib/vector-buffers/src/variants/disk_v2/tests/acknowledgements.rs index a63ed70635a40..4b90de0585762 100644 --- a/lib/vector-buffers/src/variants/disk_v2/tests/acknowledgements.rs +++ b/lib/vector-buffers/src/variants/disk_v2/tests/acknowledgements.rs @@ -83,3 +83,53 @@ async fn ack_wakes_reader() { }) .await; } + +#[tokio::test] +async fn rejected_delivery_should_not_advance_acks_finalizer_status_discard() { + // Failing demonstration: `spawn_finalizer` does + // `while let Some((_status, amount)) = stream.next().await { increment_pending_acks(amount) }` + // — the BatchStatus is discarded. + // + // CORRECT INVARIANT (asserted here): a REJECTED delivery (the sink gave up / + // could not deliver) must NOT advance the buffer's pending acks — the events + // were not acknowledged and must be retained. Because the finalizer ignores + // BatchStatus, a rejection still advances acks, so the buffer forgets the + // events as if delivered. This test FAILS (acks advance by 7, not 0), + // demonstrating the silent within-process data loss. + with_temp_dir(|dir| { + let data_dir = dir.to_path_buf(); + + async move { + let usage_handle = BufferUsageHandle::noop(); + let config = DiskBufferConfigBuilder::from_path(data_dir) + .build() + .expect("creating buffer should not fail"); + let ledger = Ledger::load_or_create(config, usage_handle) + .await + .expect("ledger should not fail to load/create"); + let ledger = Arc::new(ledger); + let finalizer = Arc::clone(&ledger).spawn_finalizer(); + assert_eq!(ledger.consume_pending_acks(), 0); + + // A batch of 7 events whose delivery is REJECTED (not Delivered). + let (batch, receiver) = BatchNotifier::new_with_receiver(); + finalizer.add(7, receiver); + let efin = EventFinalizer::new(batch); + efin.update_status(EventStatus::Rejected); + drop(efin); // sends the Rejected status update + tokio::task::yield_now().await; + + // Finalizer BatchStatus discard: a rejected (failed) delivery must + // leave pending acks at 0. It instead advances by the full count, so + // the buffer forgets the events as if delivered — this assertion FAILS. + assert_eq!( + ledger.consume_pending_acks(), + 0, + "rejected delivery must not advance acks, but advanced by 7 — the \ + finalizer ignores BatchStatus, so failed deliveries are silently \ + treated as acknowledged" + ); + } + }) + .await; +} diff --git a/lib/vector-buffers/src/variants/disk_v2/tests/invariants.rs b/lib/vector-buffers/src/variants/disk_v2/tests/invariants.rs index a37075fce4b1e..7efa8774789c9 100644 --- a/lib/vector-buffers/src/variants/disk_v2/tests/invariants.rs +++ b/lib/vector-buffers/src/variants/disk_v2/tests/invariants.rs @@ -927,3 +927,236 @@ async fn reader_writer_positions_aligned_through_multiple_files_and_records() { let parent = trace_span!("reader_writer_positions_aligned_through_multiple_files_and_records"); fut.instrument(parent.or_current()).await; } + +#[tokio::test] +async fn ledger_total_buffer_size_decrement_should_saturate_not_underflow_issue_21683() { + // Failing demonstration of Vector #21683. + // + // CORRECT INVARIANT (asserted here): decrementing `total_buffer_size` by more + // than its current value must saturate at 0. `Ledger::decrement_total_buffer_size` + // instead uses an unsaturated `fetch_sub`, so the in-memory atomic wraps + // toward 2^64 — after which `is_buffer_full()` returns true forever and the + // writer deadlocks permanently. PR #23561 only fixed the metrics reporter, + // not this control-path atomic. + // + // This test FAILS against current Vector (the failure IS the bug): in release + // the atomic wraps so the saturation assert fails; in a debug build the + // `prev - amount` in decrement_total_buffer_size's own `trace!` panics on the + // same underflow. It will pass once the decrement saturates. + with_temp_dir(|dir| { + let data_dir = dir.to_path_buf(); + async move { + let (_writer, _reader, ledger) = + create_default_buffer_v2::<_, SizedRecord>(data_dir).await; + + ledger.increment_total_buffer_size(10); + assert_eq!(ledger.get_total_buffer_size(), 10); + + // Decrement by MORE than the current size. Correct behavior is to + // saturate at 0. + ledger.decrement_total_buffer_size(11); + + let after = ledger.get_total_buffer_size(); + assert_eq!( + after, 0, + "#21683: total_buffer_size decremented below zero must saturate at \ + 0, but was {after} (~2^64 wrap) — is_buffer_full() then never \ + returns false -> permanent writer deadlock" + ); + } + }) + .await; +} + +#[tokio::test] +async fn get_total_records_should_be_zero_on_drained_buffer_issue_21683_metrics() { + // Failing demonstration (sibling of #21683): `Ledger::get_total_records` + // computes `next_writer_id.wrapping_sub(last_reader_id) - 1`. + // + // CORRECT INVARIANT (asserted here): a fully drained buffer (writer and reader + // at the same record ID) holds 0 records. The trailing `- 1` underflows + // instead: in release it wraps to ~u64::MAX (which `synchronize_buffer_usage` + // then feeds into the buffer event-count metric on the next restart, reporting + // ~1.8e19 events for an empty buffer); in a debug build it panics on the same + // subtraction. Either way this test FAILS until the count saturates. + with_temp_dir(|dir| { + let data_dir = dir.to_path_buf(); + async move { + let (_w, _r, ledger) = create_default_buffer_v2::<_, SizedRecord>(data_dir).await; + // Simulate a drained buffer: the reader has caught up to the writer. + unsafe { + ledger.state().unsafe_set_writer_next_record_id(42); + } + unsafe { + ledger.state().unsafe_set_reader_last_record_id(42); + } + let total = ledger.get_total_records(); + assert_eq!( + total, 0, + "get_total_records 0-1 underflow: a drained buffer (next==last) \ + must report 0 records, but yields {total} (~2^64); \ + synchronize_buffer_usage then reports ~1.8e19 buffer events on restart" + ); + } + }) + .await; +} + +#[tokio::test] +async fn writer_drop_without_flush_should_not_lose_buffered_events_issue_24948() { + // Failing demonstration of Vector #24948 (config-reload silent data loss). + // + // CORRECT INVARIANT (asserted here): events accepted by the writer must + // survive a writer teardown and be recoverable when the buffer is reopened. + // Records sit in the writer's in-memory TrackingBufWriter (256KB) until + // flush(); `BufferWriter::Drop` calls close() (mark_writer_done + notify) but + // NOT flush(). During a config reload the old writer is dropped while events + // are still buffered, so they never reach the data file and the ledger's + // writer_next_record is never advanced. This test FAILS (reopened buffer has + // 0 records, not 3) — that loss is the bug. + with_temp_dir(|dir| { + let data_dir = dir.to_path_buf(); + async move { + { + let (mut writer, _reader, _ledger) = + create_default_buffer_v2::<_, SizedRecord>(data_dir.clone()).await; + for _ in 0..3u8 { + writer + .write_record(SizedRecord::new(64)) + .await + .expect("write should not fail"); + } + // No flush(): Drop simulates config-reload teardown (close, not flush). + drop(writer); + } + // Let the finalizer task observe the dropped reader and release the lock. + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + + // Reopen the same buffer directory. + let (_writer2, _reader2, ledger2) = + create_default_buffer_v2::<_, SizedRecord>(data_dir).await; + + // #24948: the 3 events written before the un-flushed Drop must + // survive. They don't — the reopened buffer is empty (0 records), so + // this assertion FAILS, demonstrating the silent data loss. + assert_buffer_records!(ledger2, 3); + } + }) + .await; +} + +#[tokio::test] +async fn file_id_rollover_compare_should_be_wrap_aware_reader_932() { + // Failing demonstration: seek_to_next_record decides the reader is + // synchronized with the writer using a raw `reader_file_id > writer_file_id` + // (reader.rs ~932), which is NOT wrap-aware. + // + // CORRECT INVARIANT (asserted here): after a file-ID rollover where the writer + // has done MORE rotations than the reader, the comparison must NOT report the + // reader as having advanced past the writer. The writer wraps to a value + // SMALLER than the (behind) reader's ID, so the raw `>` wrongly concludes the + // reader is ahead and breaks the seek early. This test FAILS (raw compare says + // reader > writer) until the comparison becomes wrap-aware. + with_temp_dir(|dir| { + let data_dir = dir.to_path_buf(); + async move { + let (_w, _r, ledger) = create_default_buffer_v2::<_, SizedRecord>(data_dir).await; + let n = u32::from(MAX_FILE_ID); // 6 in test builds; IDs cycle 0..n-1 + // Writer does n+1 rotations -> wraps past the end back to file ID 1. + for _ in 0..(n + 1) { + ledger.state().increment_writer_file_id(); + } + // Reader does n-1 rotations (unacked) -> file ID n-1 (still pre-wrap). + for _ in 0..(n - 1) { + ledger.increment_unacked_reader_file_id(); + } + let (reader_file_id, writer_file_id) = ledger.get_current_reader_writer_file_id(); + assert_eq!( + (reader_file_id, writer_file_id), + (MAX_FILE_ID - 1, 1), + "constructed post-rollover state: writer lapped to ID 1, reader behind at ID {}", + MAX_FILE_ID - 1 + ); + // The writer performed MORE rotations (n+1) than the reader (n-1), so it + // is genuinely AHEAD. A wrap-aware comparison must therefore NOT report + // the reader as past the writer: + assert!( + !(reader_file_id > writer_file_id), + "reader.rs:932 must use a wrap-aware comparison: after rollover the \ + writer lapped to ID {writer_file_id} and is AHEAD of the reader at \ + ID {reader_file_id}, so the reader is NOT past the writer. The raw \ + `reader_file_id > writer_file_id` ({reader_file_id} > {writer_file_id}) \ + is true, so seek_to_next_record wrongly concludes the reader advanced \ + past the writer and stops early." + ); + } + }) + .await; +} + +#[tokio::test] +async fn delete_completed_data_file_size_delta_should_saturate_reader_524() { + // Failing demonstration of the reader.rs:524 unguarded subtraction. + // + // `delete_completed_data_file` computes `size_delta = metadata.len() - bytes_read` + // (reader.rs ~524) and then `decrement_total_buffer_size(size_delta)`. + // + // CORRECT INVARIANT (asserted here): when a data file is truncated externally + // (crash / filesystem fault) so `bytes_read > metadata.len()`, the size-delta + // must saturate at 0 rather than underflow. The raw subtraction underflows: in + // release it wraps to ~2^64, which is fed straight into the unsaturated + // `decrement_total_buffer_size` -> the #21683 total_buffer_size wrap -> + // permanent writer deadlock; in a debug build the subtraction panics. Either + // way this test FAILS, demonstrating the bug. + // + // Here we reproduce the exact computation on a real file: write 100 bytes, + // record bytes_read=100, truncate the file to 40, then `metadata.len() - bytes_read`. + with_temp_dir(|dir| { + let data_dir = dir.to_path_buf(); + async move { + let path = data_dir.join("buffer-data-0.dat"); + { + let mut f = tokio::fs::OpenOptions::new() + .create(true) + .write(true) + .open(&path) + .await + .expect("create should not fail"); + f.write_all(&[0u8; 100]) + .await + .expect("write should not fail"); + f.flush().await.expect("flush should not fail"); + } + // The reader accounted 100 bytes read from this data file. + let bytes_read: u64 = 100; + // External truncation (crash / FS fault) shrinks the file below bytes_read. + tokio::fs::OpenOptions::new() + .write(true) + .open(&path) + .await + .expect("open should not fail") + .set_len(40) + .await + .expect("truncate should not fail"); + + // Exactly what reader.rs:524 does: `metadata.len() - bytes_read`. + let metadata_len = tokio::fs::metadata(&path) + .await + .expect("metadata should not fail") + .len(); + assert_eq!(metadata_len, 40); + // In a debug build this raw subtraction panics (the bug manifesting); + // in release it wraps to ~2^64. + let size_delta = metadata_len - bytes_read; + + assert_eq!( + size_delta, 0, + "reader.rs:524: metadata.len()({metadata_len}) - bytes_read({bytes_read}) \ + on a truncated file must saturate at 0, but underflowed to {size_delta} \ + (~2^64); this is fed into the unsaturated decrement_total_buffer_size -> \ + the #21683 wrap -> permanent writer deadlock" + ); + } + }) + .await; +} diff --git a/tests/antithesis/scenarios/vector_to_vector_e2e_disk/.gitignore b/tests/antithesis/scenarios/vector_to_vector_e2e_disk/.gitignore new file mode 100644 index 0000000000000..0d8eb1f9365b3 --- /dev/null +++ b/tests/antithesis/scenarios/vector_to_vector_e2e_disk/.gitignore @@ -0,0 +1 @@ +.launch/ diff --git a/tests/antithesis/scenarios/vector_to_vector_e2e_disk/Dockerfile b/tests/antithesis/scenarios/vector_to_vector_e2e_disk/Dockerfile index 037e4d284c960..fc0bcc9eb0df8 100644 --- a/tests/antithesis/scenarios/vector_to_vector_e2e_disk/Dockerfile +++ b/tests/antithesis/scenarios/vector_to_vector_e2e_disk/Dockerfile @@ -76,15 +76,9 @@ FROM debian:stable-slim AS vector RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates \ && rm -rf /var/lib/apt/lists/* COPY --from=vector-build /usr/local/bin/vector /usr/bin/vector -# Bake the node configs (compose picks one per node via --config) plus head's -# benign alternate, which the reload fault swaps in to force a sink rebuild. +# Bake the node configs (compose picks one per node via --config). COPY tests/antithesis/scenarios/vector_to_vector_e2e_disk/head.yaml /etc/vector/head.yaml -COPY tests/antithesis/scenarios/vector_to_vector_e2e_disk/head.b.yaml /etc/vector/head.b.yaml COPY tests/antithesis/scenarios/vector_to_vector_e2e_disk/tail.yaml /etc/vector/tail.yaml -# The reload fault is an anytime_ test command that runs IN the node container. -# It is a no-op on tail (no VECTOR_CONFIG_ALT). The node stays running because -# its entrypoint is Vector, not a test command. -COPY --chmod=755 tests/antithesis/scenarios/vector_to_vector_e2e_disk/anytime_reload.sh /opt/antithesis/test/v1/v2v/anytime_reload RUN mkdir -p /var/lib/vector /symbols && ln -s /usr/bin/vector /symbols/vector ENV NO_COLOR=1 EXPOSE 6000 8080 9598 diff --git a/tests/antithesis/scenarios/vector_to_vector_e2e_disk/anytime_reload.sh b/tests/antithesis/scenarios/vector_to_vector_e2e_disk/anytime_reload.sh deleted file mode 100644 index 13e2db094ac64..0000000000000 --- a/tests/antithesis/scenarios/vector_to_vector_e2e_disk/anytime_reload.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail -[ -n "${VECTOR_CONFIG_ALT:-}" ] || exit 0 -cfg="${VECTOR_CONFIG:?}" -alt="${VECTOR_CONFIG_ALT:?}" - -tmp="$(mktemp)" -cp "$cfg" "$tmp" -cp "$alt" "$cfg" -cp "$tmp" "$alt" -rm -f "$tmp" - -# Vector is PID 1 in the node container. SIGHUP triggers reload-from-disk. -kill -HUP 1 -sleep 5 diff --git a/tests/antithesis/scenarios/vector_to_vector_e2e_disk/docker-compose.yaml b/tests/antithesis/scenarios/vector_to_vector_e2e_disk/docker-compose.yaml index 0829233b936f1..d268c1fce2c2a 100644 --- a/tests/antithesis/scenarios/vector_to_vector_e2e_disk/docker-compose.yaml +++ b/tests/antithesis/scenarios/vector_to_vector_e2e_disk/docker-compose.yaml @@ -21,8 +21,6 @@ services: build: *vector-build image: v2v-vector:${V2V_IMAGE_TAG:-dev} entrypoint: ["/usr/bin/vector", "--config", "/etc/vector/head.yaml"] - # head runs the reload fault: VECTOR_CONFIG_ALT lets anytime_reload swap configs - # and SIGHUP, forcing the disk-buffered sink to rebuild. environment: NO_COLOR: "1" VECTOR_CONFIG: "/etc/vector/head.yaml" diff --git a/tests/antithesis/scenarios/vector_to_vector_e2e_disk/head.b.yaml b/tests/antithesis/scenarios/vector_to_vector_e2e_disk/head.b.yaml deleted file mode 100644 index 953ae979d19d8..0000000000000 --- a/tests/antithesis/scenarios/vector_to_vector_e2e_disk/head.b.yaml +++ /dev/null @@ -1,34 +0,0 @@ -data_dir: /var/lib/vector - -sources: - in: - type: http_server - address: 0.0.0.0:8080 - decoding: - codec: json - acknowledgements: - enabled: true - - metrics: - type: internal_metrics - scrape_interval_secs: 1 - -sinks: - out: - type: vector - inputs: [in] - address: "tail:6000" - compression: none - request: - timeout_secs: 45 - buffer: - type: disk - max_size: 268435488 # ~256MB, the enforced minimum - when_full: block - acknowledgements: - enabled: true - - prom: - type: prometheus_exporter - inputs: [metrics] - address: 0.0.0.0:9598 diff --git a/tests/antithesis/scenarios/vector_to_vector_e2e_disk/launch.sh b/tests/antithesis/scenarios/vector_to_vector_e2e_disk/launch.sh index 088c3427ac596..23dbca4854417 100755 --- a/tests/antithesis/scenarios/vector_to_vector_e2e_disk/launch.sh +++ b/tests/antithesis/scenarios/vector_to_vector_e2e_disk/launch.sh @@ -71,15 +71,19 @@ for v in ANTITHESIS_TENANT ANTITHESIS_REPOSITORY; do fi done -# Rebuild the images from current source before submitting. snouty reuses a -# matching :latest tag instead of rebuilding, so without this a shot can ship -# stale code (e.g. an image baked before a config rename or a code change). -# Layer caching keeps this near-instant when nothing changed. -build=(docker compose -f "$SCRIPT_DIR/docker-compose.yaml" build) +# snouty's --config image resolution parses the compose with default substitution +# only — it does NOT honor exported env vars — so a `${VAR:-default}` image tag +# ships to the run as `:default` even though `docker compose build` (which is +# env-aware) built `:$VAR`. The run then references an image that was never built. +# To remove all ambiguity, materialize a fully-resolved compose (concrete revision +# tag, absolute build context) and point snouty at THAT. A forced rebuild from it +# also guarantees no stale image is reused. The dir is regenerated every shot. +BUILD_DIR="$SCRIPT_DIR/.launch" +build=(docker compose -f "$BUILD_DIR/docker-compose.yaml" build) cmd=(snouty launch --webhook "$WEBHOOK" - --config "$SCRIPT_DIR" + --config "$BUILD_DIR" --test-name "$TEST_NAME" --description "$DESCRIPTION" --source "$SOURCE" @@ -87,11 +91,13 @@ cmd=(snouty launch "${FAULTS[@]}" "$@") -printf 'build: '; printf ' %q' "${build[@]}"; printf '\n' +printf 'image tag: %s\n' "$V2V_IMAGE_TAG" printf 'launch:'; printf ' %q' "${cmd[@]}"; printf '\n' if [[ "${DRY_RUN:-0}" == "1" ]]; then - echo "(dry run; not building or submitting)" + echo "(dry run; not resolving, building, or submitting)" exit 0 fi +mkdir -p "$BUILD_DIR" +docker compose -f "$SCRIPT_DIR/docker-compose.yaml" config > "$BUILD_DIR/docker-compose.yaml" "${build[@]}" exec "${cmd[@]}" diff --git a/tests/antithesis/scratchbook/experiment-spec.md b/tests/antithesis/scratchbook/experiment-spec.md index 5017ad0b4235b..65862594b8887 100644 --- a/tests/antithesis/scratchbook/experiment-spec.md +++ b/tests/antithesis/scratchbook/experiment-spec.md @@ -48,8 +48,8 @@ experiment's goal. ## Forcing the loss-prone state Partition head↔tail so the buffer backs up with acked-but-undelivered records, keep -producing hot, then fire the reload / kill a node at a rotation boundary on the -persistent volume. The shrunk data-file knob (2 MiB, shipped) keeps both buffers +producing hot, then kill a node at a rotation boundary on the persistent volume. +The shrunk data-file knob (2 MiB, shipped) keeps both buffers rotating constantly so reopen/torn-tail boundaries are crossed often. --- diff --git a/tests/antithesis/scratchbook/grind-plan.md b/tests/antithesis/scratchbook/grind-plan.md index d20998b858be0..ea9162b287897 100644 --- a/tests/antithesis/scratchbook/grind-plan.md +++ b/tests/antithesis/scratchbook/grind-plan.md @@ -17,8 +17,9 @@ in-process, where Antithesis adds only scheduler/coverage. See those two propert files' "Test plan" sections (A: ledger-level `get_total_records` wrap, pure sync; B: reopen + torn-tail `total_buffer_size` underflow, `current_thread` + `TestFilesystem`). Antithesis retains the distributed work: the `vector_to_vector_e2e_disk` conservation -experiment and the #24948 SIGHUP config-reload fault (both wired and `snouty -validate`-green at this commit). +experiment (wired and `snouty validate`-green at this commit). The SIGHUP config-reload +fault is not built — see `config-reload-no-silent-loss` for the startup-safe custom +fault it still needs. ## G0 — Bootstrap (setup task #1/#2/#3) diff --git a/tests/antithesis/scratchbook/properties/config-reload-no-silent-loss.md b/tests/antithesis/scratchbook/properties/config-reload-no-silent-loss.md index b4b317cc8cb62..d041d3a0f12b2 100644 --- a/tests/antithesis/scratchbook/properties/config-reload-no-silent-loss.md +++ b/tests/antithesis/scratchbook/properties/config-reload-no-silent-loss.md @@ -139,5 +139,8 @@ workload oracle above. before the new `load_or_create`, but the detached finalizer holding `Arc` is not awaited — if still alive when the new writer opens, both share the dir with no OS-level protection. -- **Fault flag:** SIGHUP via the committed `anytime_reload.sh` (`kill -HUP 1`); - confirm the tenant permits process-signal delivery / in-container script exec. +- **Fault not built.** The SIGHUP reload trigger this property needs is not in the + harness — an earlier `anytime_reload` fault was removed because its `kill -HUP 1` + raced Vector's startup after a node restart and killed the node (exit 129) before + the SIGHUP handler installed. A reload fault gated on a fully-started node, plus + the loss-detecting asserts above, both remain to be built.