From 46c05efe65f23bfafdb615d7023e82816c63b953 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 18 Mar 2026 17:06:40 -0700 Subject: [PATCH 1/5] Make sure we still do writeback on test failure --- test-utils/src/dev/db.rs | 207 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 204 insertions(+), 3 deletions(-) diff --git a/test-utils/src/dev/db.rs b/test-utils/src/dev/db.rs index 1e71888637f..14064c14f13 100644 --- a/test-utils/src/dev/db.rs +++ b/test-utils/src/dev/db.rs @@ -747,10 +747,75 @@ impl Drop for CockroachInstance { temporary directory leaked)" ); - // Still, make a best effort. - #[allow(unused_must_use)] + // Send SIGTERM so CockroachDB can flush its WAL to disk, + // making the leaked temp directory useful for post-mortem + // debugging. + // + // Background: test CRDB instances set + // `kv.raft_log.disable_synchronization_unsafe = true` + // which skips per-commit fdatasync for performance. Data + // is still written to the WAL but may only exist in + // Pebble's user-space write buffer. SIGTERM triggers + // Pebble's DB.Close() which flushes and fsyncs the WAL + // before exiting. + // See: https://github.com/oxidecomputer/omicron/issues/10085 if let Some(child_process) = self.child_process.as_mut() { - child_process.start_kill(); + let pid = self.pid as libc::pid_t; + unsafe { libc::kill(pid, libc::SIGTERM) }; + + // Wait for the child to exit. We use + // waitpid(WNOHANG) in a poll loop rather than: + // + // - `process_running` (kill -0): fails because the + // child becomes a zombie after exit and kill -0 + // still succeeds on zombies, so we'd never detect + // the exit. + // + // - `waitpid` without WNOHANG: would block this + // thread indefinitely if CRDB doesn't exit, with + // no way to fall back to SIGKILL. + // + // - `child_process.wait().await`: not available + // because we're in a synchronous Drop impl with + // no async executor. + // + // WNOHANG makes waitpid return immediately: it + // returns the pid if the child has exited (reaping + // the zombie), 0 if still running, or -1 on error + // (e.g. ECHILD if tokio's SIGCHLD handler already + // reaped it). Both the pid and -1 cases mean the + // process has exited. This lets us poll with a + // timeout and fall back to SIGKILL. + let deadline = std::time::Instant::now() + + std::time::Duration::from_secs(5); + let mut exited = false; + while std::time::Instant::now() < deadline { + let mut status: libc::c_int = 0; + let wr = unsafe { + libc::waitpid(pid, &mut status, libc::WNOHANG) + }; + if wr == pid || wr == -1 { + // wr == pid: child exited, we reaped it. + // wr == -1: ECHILD — already reaped (e.g. + // by tokio's SIGCHLD handler). + exited = true; + break; + } + std::thread::sleep(std::time::Duration::from_millis(50)); + } + + // If SIGTERM didn't work within the timeout, fall back + // to SIGKILL. Warn that the on-disk data may be stale. + if !exited { + eprintln!( + "WARN: CockroachDB did not exit after SIGTERM; \ + sending SIGKILL. On-disk data may be stale." + ); + #[allow(unused_must_use)] + { + child_process.start_kill(); + } + } } #[allow(unused_must_use)] if let Some(temp_dir) = self.temp_dir.take() { @@ -1903,6 +1968,142 @@ mod test { assert_eq!(addr.port(), 12345); } + // Test that data written to CockroachDB survives when the instance is + // dropped without calling cleanup(). This exercises the Drop impl path + // that fires during test failures (when cleanup is unreachable due to + // a panic) or when cleanup is accidentally omitted. + // + // With `disable_synchronization_unsafe = true`, committed data may + // only exist in Pebble's user-space write buffer (not yet write()-ed + // to the WAL file on disk). A graceful shutdown (SIGTERM) triggers + // Pebble's DB.Close() which flushes and fsyncs the WAL, making data + // durable. SIGKILL destroys the process immediately, losing any + // buffered WAL data. + // + // See: https://github.com/oxidecomputer/omicron/issues/10085 + #[tokio::test] + async fn test_data_survives_drop_without_cleanup() { + let store_dir = tempdir().expect("failed to create temp dir"); + let data_dir = store_dir.path().join("data"); + + // Start CRDB with a persistent store directory + let starter = new_builder() + .store_dir(&data_dir) + .build() + .expect("failed to build starter"); + let database = starter.start().await.expect("failed to start CRDB"); + + // Enable the unsafe sync setting (as setup_database does) + database + .disable_synchronization() + .await + .expect("failed to disable sync"); + + // Phase 1: Write enough data to overflow Pebble's internal WAL + // write buffer, ensuring it gets write()-ed to the kernel and + // is durable on disk. + let client = database.connect().await.expect("failed to connect"); + client + .batch_execute( + "CREATE DATABASE test_db; \ + USE test_db; \ + CREATE TABLE test_data (id INT PRIMARY KEY, payload STRING); \ + INSERT INTO test_data (id, payload) \ + SELECT i, repeat('x', 1024) \ + FROM generate_series(1, 5000) AS g(i);", + ) + .await + .expect("insert failed"); + + // Phase 2: Perform many small mutations — each one is a tiny + // WAL entry that may sit in Pebble's user-space write buffer. + // With disable_synchronization_unsafe, Sync() is a no-op, so + // the buffer is only flushed when full. We do many UPDATEs so + // the last few are likely still buffered when we kill. + // + // We update each row individually to maximize the number of + // separate WAL entries, increasing the chance that the final + // entries haven't been flushed. + for batch_start in (1..=5000).step_by(100) { + let batch_end = std::cmp::min(batch_start + 99, 5000); + client + .batch_execute(&format!( + "UPDATE test_db.test_data \ + SET payload = 'updated' \ + WHERE id BETWEEN {batch_start} AND {batch_end};", + )) + .await + .expect("update failed"); + } + + // One final write that we'll check for specifically — this + // is the most likely to be lost since it's the last WAL entry. + client + .batch_execute( + "INSERT INTO test_db.test_data (id, payload) \ + VALUES (99999, 'canary');", + ) + .await + .expect("canary insert failed"); + + // Drop without cleanup — this triggers the Drop impl. + // Save the temp dir path so we can clean it up after + // verification (the Drop impl intentionally leaks it). + let leaked_temp_dir = database.temp_dir().to_owned(); + drop(client); + drop(database); + + // Restart CRDB on the same store directory. + let starter2 = new_builder() + .store_dir(&data_dir) + .build() + .expect("failed to build second starter"); + let mut database2 = + starter2.start().await.expect("failed to restart CRDB"); + + let client2 = database2.connect().await.expect("failed to reconnect"); + + // Check that the canary row survived. + let rows = client2 + .query( + "SELECT payload FROM test_db.test_data WHERE id = 99999", + &[], + ) + .await + .expect("canary query failed"); + assert_eq!( + rows.len(), + 1, + "canary row (id=99999) did not survive shutdown" + ); + let payload: String = rows[0].get(0); + assert_eq!(payload, "canary", "canary row has wrong payload"); + + // Check that updates survived — sample a few rows. + let rows = client2 + .query( + "SELECT count(*) FROM test_db.test_data \ + WHERE payload = 'updated'", + &[], + ) + .await + .expect("update check query failed"); + let updated_count: i64 = rows[0].get(0); + assert_eq!( + updated_count, 5000, + "updates did not survive shutdown: \ + expected 5000 updated rows, got {updated_count}" + ); + + client2.cleanup().await.expect("client2 cleanup failed"); + database2.cleanup().await.expect("database2 cleanup failed"); + + // Clean up the temp dir leaked by the first instance's Drop. + fs::remove_dir_all(&leaked_temp_dir) + .await + .expect("failed to clean up leaked temp dir"); + } + // Tests the way `process_exited()` checks and interprets the exit status // for abnormal process termination (by a signal). #[tokio::test] From 88502d1a9b8116d9e335df5a83dc4175ed929d5f Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 19 Mar 2026 10:12:24 -0700 Subject: [PATCH 2/5] simplify waiting --- test-utils/src/dev/db.rs | 97 +++++++++++++--------------------------- 1 file changed, 31 insertions(+), 66 deletions(-) diff --git a/test-utils/src/dev/db.rs b/test-utils/src/dev/db.rs index 14064c14f13..efc1387dc40 100644 --- a/test-utils/src/dev/db.rs +++ b/test-utils/src/dev/db.rs @@ -733,13 +733,19 @@ impl CockroachInstance { impl Drop for CockroachInstance { fn drop(&mut self) { - // TODO-cleanup Ideally at this point we would run self.cleanup() to - // kill the child process, wait for it to exit, and then clean up the - // temporary directory. However, we don't have an executor here with - // which to run async/await code. We could create one here, but it's - // not clear how safe or sketchy that would be. Instead, we expect that - // the caller has done the cleanup already. This won't always happen, - // particularly for ungraceful failures. + // NOTE: Ideally, we'd share this logic with self.cleanup() to tear down + // the child process. However, there are a couple distinctions from that + // pathway we must consider here: + // + // 1. drop is not asynchronous, so we don't have the benefits of using + // our tokio executor to run async/await code. + // 2. If self.cleanup() has not been invoked, we want to gracefully + // preserve the database for post-mortem debugging. This is often + // the case for e.g. a test which has failed before invoking cleanup, + // and which is bailing out. We can also take this pathway when the + // calling code has forgotten to call "cleanup", and is leaking the + // database, but that's a bug - and choosing to flush and terminate + // via SIGTERM wouldn't be wrong in that case either. if self.child_process.is_some() || self.temp_dir.is_some() { eprintln!( "WARN: dropped CockroachInstance without cleaning it up first \ @@ -747,75 +753,34 @@ impl Drop for CockroachInstance { temporary directory leaked)" ); - // Send SIGTERM so CockroachDB can flush its WAL to disk, - // making the leaked temp directory useful for post-mortem - // debugging. + // Send SIGTERM so CockroachDB can flush its WAL to disk, making the + // leaked temp directory useful for post-mortem debugging. // - // Background: test CRDB instances set + // Background: test CRDB instances might set // `kv.raft_log.disable_synchronization_unsafe = true` - // which skips per-commit fdatasync for performance. Data - // is still written to the WAL but may only exist in - // Pebble's user-space write buffer. SIGTERM triggers - // Pebble's DB.Close() which flushes and fsyncs the WAL - // before exiting. + // which skips per-commit fdatasync for performance. Data is still + // written to the WAL but may only exist in Pebble's user-space + // write buffer. SIGTERM triggers Pebble's DB.Close() which flushes + // and fsyncs the WAL before exiting. // See: https://github.com/oxidecomputer/omicron/issues/10085 if let Some(child_process) = self.child_process.as_mut() { let pid = self.pid as libc::pid_t; unsafe { libc::kill(pid, libc::SIGTERM) }; - // Wait for the child to exit. We use - // waitpid(WNOHANG) in a poll loop rather than: - // - // - `process_running` (kill -0): fails because the - // child becomes a zombie after exit and kill -0 - // still succeeds on zombies, so we'd never detect - // the exit. - // - // - `waitpid` without WNOHANG: would block this - // thread indefinitely if CRDB doesn't exit, with - // no way to fall back to SIGKILL. - // - // - `child_process.wait().await`: not available - // because we're in a synchronous Drop impl with - // no async executor. - // - // WNOHANG makes waitpid return immediately: it - // returns the pid if the child has exited (reaping - // the zombie), 0 if still running, or -1 on error - // (e.g. ECHILD if tokio's SIGCHLD handler already - // reaped it). Both the pid and -1 cases mean the - // process has exited. This lets us poll with a - // timeout and fall back to SIGKILL. - let deadline = std::time::Instant::now() - + std::time::Duration::from_secs(5); - let mut exited = false; - while std::time::Instant::now() < deadline { - let mut status: libc::c_int = 0; - let wr = unsafe { - libc::waitpid(pid, &mut status, libc::WNOHANG) - }; - if wr == pid || wr == -1 { - // wr == pid: child exited, we reaped it. - // wr == -1: ECHILD — already reaped (e.g. - // by tokio's SIGCHLD handler). - exited = true; - break; + // Poll the child_process until it exits. + loop { + match child_process.try_wait() { + Ok(Some(_status)) => { + break; + } + Ok(None) => {} // still running + Err(_) => { + // Already reaped (e.g. ECHILD). + break; + } } std::thread::sleep(std::time::Duration::from_millis(50)); } - - // If SIGTERM didn't work within the timeout, fall back - // to SIGKILL. Warn that the on-disk data may be stale. - if !exited { - eprintln!( - "WARN: CockroachDB did not exit after SIGTERM; \ - sending SIGKILL. On-disk data may be stale." - ); - #[allow(unused_must_use)] - { - child_process.start_kill(); - } - } } #[allow(unused_must_use)] if let Some(temp_dir) = self.temp_dir.take() { From 7efbc95424d0285974085e1da1f6cb939c770d6c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 19 Mar 2026 13:08:28 -0700 Subject: [PATCH 3/5] update comment --- test-utils/src/dev/db.rs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/test-utils/src/dev/db.rs b/test-utils/src/dev/db.rs index efc1387dc40..6ba6b66a088 100644 --- a/test-utils/src/dev/db.rs +++ b/test-utils/src/dev/db.rs @@ -1980,15 +1980,12 @@ mod test { .await .expect("insert failed"); - // Phase 2: Perform many small mutations — each one is a tiny - // WAL entry that may sit in Pebble's user-space write buffer. - // With disable_synchronization_unsafe, Sync() is a no-op, so - // the buffer is only flushed when full. We do many UPDATEs so - // the last few are likely still buffered when we kill. - // - // We update each row individually to maximize the number of - // separate WAL entries, increasing the chance that the final - // entries haven't been flushed. + // Phase 2: Perform many small mutations that may sit in + // Pebble's user-space write buffer. With + // disable_synchronization_unsafe, Sync() is a no-op, so the + // buffer is only flushed when full. We issue many batch UPDATEs + // (each updating 100 rows) so the last few WAL entries are + // likely still buffered when we shut down. for batch_start in (1..=5000).step_by(100) { let batch_end = std::cmp::min(batch_start + 99, 5000); client From 4f19262f1c72143453d4156033de6ec61a9e1c1c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 19 Mar 2026 14:23:12 -0700 Subject: [PATCH 4/5] nextest configs --- .config/nextest.toml | 6 +++++- test-utils/src/dev/db.rs | 16 +++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/.config/nextest.toml b/.config/nextest.toml index 39416445b0c..6ef0aecd627 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -67,7 +67,11 @@ filter = 'binary_id(omicron-nexus::test_all) and test(::schema::)' threads-required = 4 [[profile.ci.overrides]] -filter = 'binary_id(omicron-nexus::test_all)' +# omicron-nexus depends on omicron-test-utils, so this is a bit redundant, +# but it's future-proofing. omicron-test-utils has conditions where it uses +# SIGTERM to await CockroachDB graceful shutdown - we use a nextest timeout +# so it can wait "as long as the test runner decides is reasonable". +filter = 'binary_id(omicron-nexus::test_all) + rdeps(omicron-test-utils)' # As of 2023-01-08, the slowest test in test_all takes 196s on a Ryzen 7950X. # 900s is a good upper limit that adds a comfortable buffer. slow-timeout = { period = '60s', terminate-after = 15 } diff --git a/test-utils/src/dev/db.rs b/test-utils/src/dev/db.rs index 6ba6b66a088..74ef19c8c13 100644 --- a/test-utils/src/dev/db.rs +++ b/test-utils/src/dev/db.rs @@ -768,14 +768,20 @@ impl Drop for CockroachInstance { unsafe { libc::kill(pid, libc::SIGTERM) }; // Poll the child_process until it exits. + // + // Note that nextest has a timeout for tests + // with the filter 'rdeps(omicron-test-utils)'. + // + // As a consequence, we can loop "as long as the test runner + // lets us" waiting for CockroachDB to gracefully terminate. In + // reality, this is typically on the order of ~one second after + // a test failure. loop { match child_process.try_wait() { - Ok(Some(_status)) => { - break; - } Ok(None) => {} // still running - Err(_) => { - // Already reaped (e.g. ECHILD). + Ok(Some(_)) | Err(_) => { + // Terminated successfully or already reaped (e.g. + // ECHILD). break; } } From c9489c0017760cde4de72070ad6927cbc81df75e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 19 Mar 2026 14:58:02 -0700 Subject: [PATCH 5/5] clippy --- test-utils/src/dev/db.rs | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/test-utils/src/dev/db.rs b/test-utils/src/dev/db.rs index 74ef19c8c13..c7d728b14f9 100644 --- a/test-utils/src/dev/db.rs +++ b/test-utils/src/dev/db.rs @@ -776,15 +776,7 @@ impl Drop for CockroachInstance { // lets us" waiting for CockroachDB to gracefully terminate. In // reality, this is typically on the order of ~one second after // a test failure. - loop { - match child_process.try_wait() { - Ok(None) => {} // still running - Ok(Some(_)) | Err(_) => { - // Terminated successfully or already reaped (e.g. - // ECHILD). - break; - } - } + while let Ok(None) = child_process.try_wait() { std::thread::sleep(std::time::Duration::from_millis(50)); } }