From 46c05efe65f23bfafdb615d7023e82816c63b953 Mon Sep 17 00:00:00 2001
From: Sean Klein <sean@oxidecomputer.com>
Date: Wed, 18 Mar 2026 17:06:40 -0700
Subject: [PATCH 1/5] Make sure we still do writeback on test failure

---
 test-utils/src/dev/db.rs | 207 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 204 insertions(+), 3 deletions(-)

diff --git a/test-utils/src/dev/db.rs b/test-utils/src/dev/db.rs
index 1e71888637f..14064c14f13 100644
--- a/test-utils/src/dev/db.rs
+++ b/test-utils/src/dev/db.rs
@@ -747,10 +747,75 @@ impl Drop for CockroachInstance {
                 temporary directory leaked)"
             );
 
-            // Still, make a best effort.
-            #[allow(unused_must_use)]
+            // Send SIGTERM so CockroachDB can flush its WAL to disk,
+            // making the leaked temp directory useful for post-mortem
+            // debugging.
+            //
+            // Background: test CRDB instances set
+            // `kv.raft_log.disable_synchronization_unsafe = true`
+            // which skips per-commit fdatasync for performance. Data
+            // is still written to the WAL but may only exist in
+            // Pebble's user-space write buffer. SIGTERM triggers
+            // Pebble's DB.Close() which flushes and fsyncs the WAL
+            // before exiting.
+            // See: https://github.com/oxidecomputer/omicron/issues/10085
             if let Some(child_process) = self.child_process.as_mut() {
-                child_process.start_kill();
+                let pid = self.pid as libc::pid_t;
+                unsafe { libc::kill(pid, libc::SIGTERM) };
+
+                // Wait for the child to exit. We use
+                // waitpid(WNOHANG) in a poll loop rather than:
+                //
+                // - `process_running` (kill -0): fails because the
+                //   child becomes a zombie after exit and kill -0
+                //   still succeeds on zombies, so we'd never detect
+                //   the exit.
+                //
+                // - `waitpid` without WNOHANG: would block this
+                //   thread indefinitely if CRDB doesn't exit, with
+                //   no way to fall back to SIGKILL.
+                //
+                // - `child_process.wait().await`: not available
+                //   because we're in a synchronous Drop impl with
+                //   no async executor.
+                //
+                // WNOHANG makes waitpid return immediately: it
+                // returns the pid if the child has exited (reaping
+                // the zombie), 0 if still running, or -1 on error
+                // (e.g. ECHILD if tokio's SIGCHLD handler already
+                // reaped it). Both the pid and -1 cases mean the
+                // process has exited. This lets us poll with a
+                // timeout and fall back to SIGKILL.
+                let deadline = std::time::Instant::now()
+                    + std::time::Duration::from_secs(5);
+                let mut exited = false;
+                while std::time::Instant::now() < deadline {
+                    let mut status: libc::c_int = 0;
+                    let wr = unsafe {
+                        libc::waitpid(pid, &mut status, libc::WNOHANG)
+                    };
+                    if wr == pid || wr == -1 {
+                        // wr == pid: child exited, we reaped it.
+                        // wr == -1: ECHILD — already reaped (e.g.
+                        //   by tokio's SIGCHLD handler).
+                        exited = true;
+                        break;
+                    }
+                    std::thread::sleep(std::time::Duration::from_millis(50));
+                }
+
+                // If SIGTERM didn't work within the timeout, fall back
+                // to SIGKILL. Warn that the on-disk data may be stale.
+                if !exited {
+                    eprintln!(
+                        "WARN: CockroachDB did not exit after SIGTERM; \
+                         sending SIGKILL. On-disk data may be stale."
+                    );
+                    #[allow(unused_must_use)]
+                    {
+                        child_process.start_kill();
+                    }
+                }
             }
             #[allow(unused_must_use)]
             if let Some(temp_dir) = self.temp_dir.take() {
@@ -1903,6 +1968,142 @@ mod test {
         assert_eq!(addr.port(), 12345);
     }
 
+    // Test that data written to CockroachDB survives when the instance is
+    // dropped without calling cleanup(). This exercises the Drop impl path
+    // that fires during test failures (when cleanup is unreachable due to
+    // a panic) or when cleanup is accidentally omitted.
+    //
+    // With `disable_synchronization_unsafe = true`, committed data may
+    // only exist in Pebble's user-space write buffer (not yet write()-ed
+    // to the WAL file on disk). A graceful shutdown (SIGTERM) triggers
+    // Pebble's DB.Close() which flushes and fsyncs the WAL, making data
+    // durable. SIGKILL destroys the process immediately, losing any
+    // buffered WAL data.
+    //
+    // See: https://github.com/oxidecomputer/omicron/issues/10085
+    #[tokio::test]
+    async fn test_data_survives_drop_without_cleanup() {
+        let store_dir = tempdir().expect("failed to create temp dir");
+        let data_dir = store_dir.path().join("data");
+
+        // Start CRDB with a persistent store directory
+        let starter = new_builder()
+            .store_dir(&data_dir)
+            .build()
+            .expect("failed to build starter");
+        let database = starter.start().await.expect("failed to start CRDB");
+
+        // Enable the unsafe sync setting (as setup_database does)
+        database
+            .disable_synchronization()
+            .await
+            .expect("failed to disable sync");
+
+        // Phase 1: Write enough data to overflow Pebble's internal WAL
+        // write buffer, ensuring it gets write()-ed to the kernel and
+        // is durable on disk.
+        let client = database.connect().await.expect("failed to connect");
+        client
+            .batch_execute(
+                "CREATE DATABASE test_db; \
+                 USE test_db; \
+                 CREATE TABLE test_data (id INT PRIMARY KEY, payload STRING); \
+                 INSERT INTO test_data (id, payload) \
+                 SELECT i, repeat('x', 1024) \
+                 FROM generate_series(1, 5000) AS g(i);",
+            )
+            .await
+            .expect("insert failed");
+
+        // Phase 2: Perform many small mutations — each one is a tiny
+        // WAL entry that may sit in Pebble's user-space write buffer.
+        // With disable_synchronization_unsafe, Sync() is a no-op, so
+        // the buffer is only flushed when full. We do many UPDATEs so
+        // the last few are likely still buffered when we kill.
+        //
+        // We update each row individually to maximize the number of
+        // separate WAL entries, increasing the chance that the final
+        // entries haven't been flushed.
+        for batch_start in (1..=5000).step_by(100) {
+            let batch_end = std::cmp::min(batch_start + 99, 5000);
+            client
+                .batch_execute(&format!(
+                    "UPDATE test_db.test_data \
+                     SET payload = 'updated' \
+                     WHERE id BETWEEN {batch_start} AND {batch_end};",
+                ))
+                .await
+                .expect("update failed");
+        }
+
+        // One final write that we'll check for specifically — this
+        // is the most likely to be lost since it's the last WAL entry.
+        client
+            .batch_execute(
+                "INSERT INTO test_db.test_data (id, payload) \
+                 VALUES (99999, 'canary');",
+            )
+            .await
+            .expect("canary insert failed");
+
+        // Drop without cleanup — this triggers the Drop impl.
+        // Save the temp dir path so we can clean it up after
+        // verification (the Drop impl intentionally leaks it).
+        let leaked_temp_dir = database.temp_dir().to_owned();
+        drop(client);
+        drop(database);
+
+        // Restart CRDB on the same store directory.
+        let starter2 = new_builder()
+            .store_dir(&data_dir)
+            .build()
+            .expect("failed to build second starter");
+        let mut database2 =
+            starter2.start().await.expect("failed to restart CRDB");
+
+        let client2 = database2.connect().await.expect("failed to reconnect");
+
+        // Check that the canary row survived.
+        let rows = client2
+            .query(
+                "SELECT payload FROM test_db.test_data WHERE id = 99999",
+                &[],
+            )
+            .await
+            .expect("canary query failed");
+        assert_eq!(
+            rows.len(),
+            1,
+            "canary row (id=99999) did not survive shutdown"
+        );
+        let payload: String = rows[0].get(0);
+        assert_eq!(payload, "canary", "canary row has wrong payload");
+
+        // Check that updates survived — sample a few rows.
+        let rows = client2
+            .query(
+                "SELECT count(*) FROM test_db.test_data \
+                 WHERE payload = 'updated'",
+                &[],
+            )
+            .await
+            .expect("update check query failed");
+        let updated_count: i64 = rows[0].get(0);
+        assert_eq!(
+            updated_count, 5000,
+            "updates did not survive shutdown: \
+             expected 5000 updated rows, got {updated_count}"
+        );
+
+        client2.cleanup().await.expect("client2 cleanup failed");
+        database2.cleanup().await.expect("database2 cleanup failed");
+
+        // Clean up the temp dir leaked by the first instance's Drop.
+        fs::remove_dir_all(&leaked_temp_dir)
+            .await
+            .expect("failed to clean up leaked temp dir");
+    }
+
     // Tests the way `process_exited()` checks and interprets the exit status
     // for abnormal process termination (by a signal).
     #[tokio::test]

From 88502d1a9b8116d9e335df5a83dc4175ed929d5f Mon Sep 17 00:00:00 2001
From: Sean Klein <sean@oxidecomputer.com>
Date: Thu, 19 Mar 2026 10:12:24 -0700
Subject: [PATCH 2/5] simplify waiting

---
 test-utils/src/dev/db.rs | 97 +++++++++++++---------------------------
 1 file changed, 31 insertions(+), 66 deletions(-)

diff --git a/test-utils/src/dev/db.rs b/test-utils/src/dev/db.rs
index 14064c14f13..efc1387dc40 100644
--- a/test-utils/src/dev/db.rs
+++ b/test-utils/src/dev/db.rs
@@ -733,13 +733,19 @@ impl CockroachInstance {
 
 impl Drop for CockroachInstance {
     fn drop(&mut self) {
-        // TODO-cleanup Ideally at this point we would run self.cleanup() to
-        // kill the child process, wait for it to exit, and then clean up the
-        // temporary directory.  However, we don't have an executor here with
-        // which to run async/await code.  We could create one here, but it's
-        // not clear how safe or sketchy that would be.  Instead, we expect that
-        // the caller has done the cleanup already.  This won't always happen,
-        // particularly for ungraceful failures.
+        // NOTE: Ideally, we'd share this logic with self.cleanup() to tear down
+        // the child process. However, there are a couple distinctions from that
+        // pathway we must consider here:
+        //
+        // 1. drop is not asynchronous, so we don't have the benefits of using
+        //    our tokio executor to run async/await code.
+        // 2. If self.cleanup() has not been invoked, we want to gracefully
+        //    preserve the database for post-mortem debugging. This is often
+        //    the case for e.g. a test which has failed before invoking cleanup,
+        //    and which is bailing out. We can also take this pathway when the
+        //    calling code has forgotten to call "cleanup", and is leaking the
+        //    database, but that's a bug - and choosing to flush and terminate
+        //    via SIGTERM wouldn't be wrong in that case either.
         if self.child_process.is_some() || self.temp_dir.is_some() {
             eprintln!(
                 "WARN: dropped CockroachInstance without cleaning it up first \
@@ -747,75 +753,34 @@ impl Drop for CockroachInstance {
                 temporary directory leaked)"
             );
 
-            // Send SIGTERM so CockroachDB can flush its WAL to disk,
-            // making the leaked temp directory useful for post-mortem
-            // debugging.
+            // Send SIGTERM so CockroachDB can flush its WAL to disk, making the
+            // leaked temp directory useful for post-mortem debugging.
             //
-            // Background: test CRDB instances set
+            // Background: test CRDB instances might set
             // `kv.raft_log.disable_synchronization_unsafe = true`
-            // which skips per-commit fdatasync for performance. Data
-            // is still written to the WAL but may only exist in
-            // Pebble's user-space write buffer. SIGTERM triggers
-            // Pebble's DB.Close() which flushes and fsyncs the WAL
-            // before exiting.
+            // which skips per-commit fdatasync for performance. Data is still
+            // written to the WAL but may only exist in Pebble's user-space
+            // write buffer. SIGTERM triggers Pebble's DB.Close() which flushes
+            // and fsyncs the WAL before exiting.
             // See: https://github.com/oxidecomputer/omicron/issues/10085
             if let Some(child_process) = self.child_process.as_mut() {
                 let pid = self.pid as libc::pid_t;
                 unsafe { libc::kill(pid, libc::SIGTERM) };
 
-                // Wait for the child to exit. We use
-                // waitpid(WNOHANG) in a poll loop rather than:
-                //
-                // - `process_running` (kill -0): fails because the
-                //   child becomes a zombie after exit and kill -0
-                //   still succeeds on zombies, so we'd never detect
-                //   the exit.
-                //
-                // - `waitpid` without WNOHANG: would block this
-                //   thread indefinitely if CRDB doesn't exit, with
-                //   no way to fall back to SIGKILL.
-                //
-                // - `child_process.wait().await`: not available
-                //   because we're in a synchronous Drop impl with
-                //   no async executor.
-                //
-                // WNOHANG makes waitpid return immediately: it
-                // returns the pid if the child has exited (reaping
-                // the zombie), 0 if still running, or -1 on error
-                // (e.g. ECHILD if tokio's SIGCHLD handler already
-                // reaped it). Both the pid and -1 cases mean the
-                // process has exited. This lets us poll with a
-                // timeout and fall back to SIGKILL.
-                let deadline = std::time::Instant::now()
-                    + std::time::Duration::from_secs(5);
-                let mut exited = false;
-                while std::time::Instant::now() < deadline {
-                    let mut status: libc::c_int = 0;
-                    let wr = unsafe {
-                        libc::waitpid(pid, &mut status, libc::WNOHANG)
-                    };
-                    if wr == pid || wr == -1 {
-                        // wr == pid: child exited, we reaped it.
-                        // wr == -1: ECHILD — already reaped (e.g.
-                        //   by tokio's SIGCHLD handler).
-                        exited = true;
-                        break;
+                // Poll the child_process until it exits.
+                loop {
+                    match child_process.try_wait() {
+                        Ok(Some(_status)) => {
+                            break;
+                        }
+                        Ok(None) => {} // still running
+                        Err(_) => {
+                            // Already reaped (e.g. ECHILD).
+                            break;
+                        }
                     }
                     std::thread::sleep(std::time::Duration::from_millis(50));
                 }
-
-                // If SIGTERM didn't work within the timeout, fall back
-                // to SIGKILL. Warn that the on-disk data may be stale.
-                if !exited {
-                    eprintln!(
-                        "WARN: CockroachDB did not exit after SIGTERM; \
-                         sending SIGKILL. On-disk data may be stale."
-                    );
-                    #[allow(unused_must_use)]
-                    {
-                        child_process.start_kill();
-                    }
-                }
             }
             #[allow(unused_must_use)]
             if let Some(temp_dir) = self.temp_dir.take() {

From 7efbc95424d0285974085e1da1f6cb939c770d6c Mon Sep 17 00:00:00 2001
From: Sean Klein <sean@oxidecomputer.com>
Date: Thu, 19 Mar 2026 13:08:28 -0700
Subject: [PATCH 3/5] update comment

---
 test-utils/src/dev/db.rs | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/test-utils/src/dev/db.rs b/test-utils/src/dev/db.rs
index efc1387dc40..6ba6b66a088 100644
--- a/test-utils/src/dev/db.rs
+++ b/test-utils/src/dev/db.rs
@@ -1980,15 +1980,12 @@ mod test {
             .await
             .expect("insert failed");
 
-        // Phase 2: Perform many small mutations — each one is a tiny
-        // WAL entry that may sit in Pebble's user-space write buffer.
-        // With disable_synchronization_unsafe, Sync() is a no-op, so
-        // the buffer is only flushed when full. We do many UPDATEs so
-        // the last few are likely still buffered when we kill.
-        //
-        // We update each row individually to maximize the number of
-        // separate WAL entries, increasing the chance that the final
-        // entries haven't been flushed.
+        // Phase 2: Perform many small mutations that may sit in
+        // Pebble's user-space write buffer. With
+        // disable_synchronization_unsafe, Sync() is a no-op, so the
+        // buffer is only flushed when full. We issue many batch UPDATEs
+        // (each updating 100 rows) so the last few WAL entries are
+        // likely still buffered when we shut down.
         for batch_start in (1..=5000).step_by(100) {
             let batch_end = std::cmp::min(batch_start + 99, 5000);
             client

From 4f19262f1c72143453d4156033de6ec61a9e1c1c Mon Sep 17 00:00:00 2001
From: Sean Klein <sean@oxidecomputer.com>
Date: Thu, 19 Mar 2026 14:23:12 -0700
Subject: [PATCH 4/5] nextest configs

---
 .config/nextest.toml     |  6 +++++-
 test-utils/src/dev/db.rs | 16 +++++++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/.config/nextest.toml b/.config/nextest.toml
index 39416445b0c..6ef0aecd627 100644
--- a/.config/nextest.toml
+++ b/.config/nextest.toml
@@ -67,7 +67,11 @@ filter = 'binary_id(omicron-nexus::test_all) and test(::schema::)'
 threads-required = 4
 
 [[profile.ci.overrides]]
-filter = 'binary_id(omicron-nexus::test_all)'
+# omicron-nexus depends on omicron-test-utils, so this is a bit redundant,
+# but it's future-proofing. omicron-test-utils has conditions where it uses
+# SIGTERM to await CockroachDB graceful shutdown - we use a nextest timeout
+# so it can wait "as long as the test runner decides is reasonable".
+filter = 'binary_id(omicron-nexus::test_all) + rdeps(omicron-test-utils)'
 # As of 2023-01-08, the slowest test in test_all takes 196s on a Ryzen 7950X.
 # 900s is a good upper limit that adds a comfortable buffer.
 slow-timeout = { period = '60s', terminate-after = 15 }
diff --git a/test-utils/src/dev/db.rs b/test-utils/src/dev/db.rs
index 6ba6b66a088..74ef19c8c13 100644
--- a/test-utils/src/dev/db.rs
+++ b/test-utils/src/dev/db.rs
@@ -768,14 +768,20 @@ impl Drop for CockroachInstance {
                 unsafe { libc::kill(pid, libc::SIGTERM) };
 
                 // Poll the child_process until it exits.
+                //
+                // Note that nextest has a timeout for tests
+                // with the filter 'rdeps(omicron-test-utils)'.
+                //
+                // As a consequence, we can loop "as long as the test runner
+                // lets us" waiting for CockroachDB to gracefully terminate. In
+                // reality, this is typically on the order of ~one second after
+                // a test failure.
                 loop {
                     match child_process.try_wait() {
-                        Ok(Some(_status)) => {
-                            break;
-                        }
                         Ok(None) => {} // still running
-                        Err(_) => {
-                            // Already reaped (e.g. ECHILD).
+                        Ok(Some(_)) | Err(_) => {
+                            // Terminated successfully or already reaped (e.g.
+                            // ECHILD).
                             break;
                         }
                     }

From c9489c0017760cde4de72070ad6927cbc81df75e Mon Sep 17 00:00:00 2001
From: Sean Klein <sean@oxidecomputer.com>
Date: Thu, 19 Mar 2026 14:58:02 -0700
Subject: [PATCH 5/5] clippy

---
 test-utils/src/dev/db.rs | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/test-utils/src/dev/db.rs b/test-utils/src/dev/db.rs
index 74ef19c8c13..c7d728b14f9 100644
--- a/test-utils/src/dev/db.rs
+++ b/test-utils/src/dev/db.rs
@@ -776,15 +776,7 @@ impl Drop for CockroachInstance {
                 // lets us" waiting for CockroachDB to gracefully terminate. In
                 // reality, this is typically on the order of ~one second after
                 // a test failure.
-                loop {
-                    match child_process.try_wait() {
-                        Ok(None) => {} // still running
-                        Ok(Some(_)) | Err(_) => {
-                            // Terminated successfully or already reaped (e.g.
-                            // ECHILD).
-                            break;
-                        }
-                    }
+                while let Ok(None) = child_process.try_wait() {
                     std::thread::sleep(std::time::Duration::from_millis(50));
                 }
             }