From b6ee77e22f8d5f25439ca214d6f8a9403888d0b6 Mon Sep 17 00:00:00 2001 From: Dan Rosen Date: Mon, 16 Mar 2026 16:56:13 -0400 Subject: [PATCH 1/2] Add review context comments --- nexus/db-model/src/fm.rs | 8 ++++++++ nexus/db-queries/src/db/datastore/fm.rs | 18 ++++++++++++++++++ nexus/db-queries/src/lib.rs | 4 ++++ .../src/app/background/tasks/fm_rendezvous.rs | 18 ++++++++++++++++-- 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/nexus/db-model/src/fm.rs b/nexus/db-model/src/fm.rs index 353a2174aac..d0b008222b4 100644 --- a/nexus/db-model/src/fm.rs +++ b/nexus/db-model/src/fm.rs @@ -12,6 +12,14 @@ //! These types are used when inserting and reading sitreps in CRDB; when in //! use, the sitrep is represented as a [`nexus_types::fm::Sitrep`]. See the //! documentation in [`nexus_types::fm`] for more information. +//! +//! # Conventions +//! +//! Fields in `Queryable`/`Insertable` structs should be ordered to match the +//! column order in `dbinit.sql`. Diesel maps `Queryable` fields positionally, +//! so matching the schema column order prevents subtle bugs if a query ever +//! omits `Selectable`, and keeps the Rust types easy to cross-reference with +//! the schema definition. use crate::SqlU32; use crate::typed_uuid::DbTypedUuid; diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 81072530b5c..1224bf88911 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -7,6 +7,17 @@ //! //! See [RFD 603](https://rfd.shared.oxide.computer/rfd/0603) for details on the //! fault management sitrep. +//! +//! # Conventions +//! +//! Methods with an `_on_conn` suffix take an explicit database connection +//! parameter. In other parts of the codebase this suffix sometimes implies the +//! method is called within a transaction, but that isn't always the case here. +//! In the FM datastore, `_on_conn` methods exist primarily so that multiple +//! queries can share the same connection for consistency (e.g. loading a sitrep +//! reads child records first and metadata last on the same connection, to +//! detect concurrent deletes without requiring a transaction, see +//! [`DataStore::fm_sitrep_read_on_conn`] and issue #9594). use super::DataStore; use crate::authz; @@ -503,6 +514,13 @@ impl DataStore { // rather than doing smaller ones for each case in the sitrep. This uses // more memory in Nexus but reduces the number of small db queries we // perform. + // + // The ordering of inserts among case child records (ereports, alert + // requests) and case metadata doesn't matter: there are no foreign key + // constraints between these tables, and garbage collection is keyed on + // sitrep_id (which is inserted first above). If we crash partway + // through, orphaned child records will be cleaned up when the orphaned + // sitrep is garbage collected. let mut cases = Vec::with_capacity(sitrep.cases.len()); let mut alerts_requested = Vec::new(); let mut case_ereports = Vec::new(); diff --git a/nexus/db-queries/src/lib.rs b/nexus/db-queries/src/lib.rs index c02c781c8e2..17db1568071 100644 --- a/nexus/db-queries/src/lib.rs +++ b/nexus/db-queries/src/lib.rs @@ -2,6 +2,10 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +// We only use rustdoc for internal documentation, including private items, so +// it's expected that we'll have links to private items in the docs. +#![allow(rustdoc::private_intra_doc_links)] + //! Facilities for working with the Omicron database pub use nexus_auth::authn; diff --git a/nexus/src/app/background/tasks/fm_rendezvous.rs b/nexus/src/app/background/tasks/fm_rendezvous.rs index 4d15465b84c..78eddc35785 100644 --- a/nexus/src/app/background/tasks/fm_rendezvous.rs +++ b/nexus/src/app/background/tasks/fm_rendezvous.rs @@ -82,7 +82,19 @@ impl FmRendezvous { // XXX(eliza): is it better to allocate all of these into a big array // and do a single `INSERT INTO` query, or iterate over them one by one - // (not allocating) but insert one at a time? + // (not allocating) but insert one at a time? Note that a batched insert + // would need to use `ON CONFLICT DO NOTHING` rather than checking for + // `Conflict` errors from individual inserts, since multiple Nexus + // instances may run this task concurrently. + // + // Currently, these `alert_create` calls have no guard against a stale + // Nexus inserting alerts from an outdated sitrep. This is fine for now + // because alert requests are carried forward into newer sitreps, so a + // stale insert is redundant rather than incorrect. However, if alerts + // are ever hard-deleted (e.g. when a case is closed), a lagging Nexus + // could re-create "zombie" alert records after deletion. At that point, + // the INSERT should be guarded by a CTE that checks the sitrep + // generation matches the current one. for (case_id, req) in sitrep.alerts_requested() { let &AlertRequest { id, class, requested_sitrep_id, .. } = req; status.total_alerts_requested += 1; @@ -97,7 +109,9 @@ impl FmRendezvous { ) .await { - // Alert already exists, that's fine. + // Alert already exists --- this is expected, since multiple + // Nexus instances may run this task concurrently for the same + // sitrep, or a previous activation may have partially completed. Err(Error::Conflict { .. }) => {} Err(e) => { slog::warn!( From b65d715f16c3585a4d1901693255c32293dc246a Mon Sep 17 00:00:00 2001 From: Dan Rosen Date: Tue, 17 Mar 2026 19:40:37 -0400 Subject: [PATCH 2/2] Move doc comments to crate/module level per review feedback Move DB model conventions (field ordering, Diesel derives) from nexus/db-model/src/fm.rs to the nexus-db-model crate-level docs in lib.rs, with a vmm example showing schema-to-model type mapping. Move _on_conn/_in_txn naming conventions from the FM datastore module to nexus/db-queries/src/db/datastore/mod.rs. Fix the framing: connection sharing is primarily about pool efficiency, not consistency. Keep the FM-specific correctness detail (concurrent delete detection, #9594) inline with a pointer to the general docs. --- nexus/db-model/src/fm.rs | 11 +-- nexus/db-model/src/lib.rs | 77 ++++++++++++++++++- nexus/db-queries/src/db/datastore/fm.rs | 14 +--- nexus/db-queries/src/db/datastore/mod.rs | 13 ++++ .../src/app/background/tasks/fm_rendezvous.rs | 16 ++-- 5 files changed, 101 insertions(+), 30 deletions(-) diff --git a/nexus/db-model/src/fm.rs b/nexus/db-model/src/fm.rs index d0b008222b4..b2c4f5fd592 100644 --- a/nexus/db-model/src/fm.rs +++ b/nexus/db-model/src/fm.rs @@ -11,15 +11,8 @@ //! //! These types are used when inserting and reading sitreps in CRDB; when in //! use, the sitrep is represented as a [`nexus_types::fm::Sitrep`]. See the -//! documentation in [`nexus_types::fm`] for more information. -//! -//! # Conventions -//! -//! Fields in `Queryable`/`Insertable` structs should be ordered to match the -//! column order in `dbinit.sql`. Diesel maps `Queryable` fields positionally, -//! so matching the schema column order prevents subtle bugs if a query ever -//! omits `Selectable`, and keeps the Rust types easy to cross-reference with -//! the schema definition. +//! documentation in [`nexus_types::fm`] for more information, and the +//! [crate-level documentation](crate) for general conventions. use crate::SqlU32; use crate::typed_uuid::DbTypedUuid; diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 49c45acfa01..fbb650f25d4 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -2,7 +2,82 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Structures stored to the database. +// We only use rustdoc for internal documentation, including private items, so +// it's expected that we'll have links to private items in the docs. +#![allow(rustdoc::private_intra_doc_links)] + +//! Rust types that represent rows in the Omicron database (CockroachDB). +//! +//! Each struct in this crate maps to a database table. The table's columns and +//! SQL types are declared in [`nexus_db_schema`] via Diesel's +//! [`table!`](diesel::table) macro; the struct here provides the corresponding +//! Rust types. `nexus_db_queries` then uses both to build and execute +//! queries in its `DataStore` methods. +//! +//! # How the mapping works +//! +//! Consider the `vmm` table. The schema declares columns and SQL types: +//! +//! ```text +//! // in nexus-db-schema +//! table! { +//! vmm (id) { +//! id -> Uuid, +//! sled_id -> Uuid, +//! state_generation -> Int8, +//! state -> VmmStateEnum, +//! ... +//! } +//! } +//! ``` +//! +//! The model struct provides the Rust representation of a row: +//! +//! ```text +//! // in nexus-db-model +//! #[derive(Queryable, Insertable, Selectable)] +//! #[diesel(table_name = vmm)] +//! pub struct Vmm { +//! pub id: Uuid, +//! pub sled_id: DbTypedUuid, +//! #[diesel(column_name = state_generation)] +//! pub generation: Generation, +//! pub state: VmmState, +//! ... +//! } +//! ``` +//! +//! A few things to note: +//! +//! - **Type translation.** Each field's Rust type must implement Diesel's +//! [`FromSql`](diesel::deserialize::FromSql) and +//! [`ToSql`](diesel::serialize::ToSql) traits for the corresponding column's +//! SQL type. Diesel provides these impls for common types (`Uuid`, +//! `DateTime`, `String`, etc.); this crate defines wrapper types like +//! [`DbTypedUuid`] for domain-specific conversions +//! (e.g. distinguishing a sled UUID from any other UUID). +//! - **Column renaming.** Field names must match column names by default, but +//! `#[diesel(column_name = ...)]` allows the Rust name to differ (e.g. +//! `generation` for the `state_generation` column). +//! - **Positional mapping.** [`diesel::Queryable`] maps SQL result columns to +//! struct fields by position, not by name. If the field order doesn't match +//! the column order in the query, fields will silently receive wrong values. +//! Deriving [`diesel::Selectable`] mitigates this by generating an explicit +//! column list, so the result columns are always in the order `Queryable` +//! expects. **Always derive both.** +//! +//! # Field ordering convention +//! +//! Fields in `Queryable`/`Insertable` structs should be ordered to match the +//! column order in `dbinit.sql`. This keeps the Rust types easy to +//! cross-reference with the schema definition and prevents subtle bugs if a +//! query ever omits `Selectable`. +//! +//! # Representing enums +//! +//! For types that map to database enum columns, see the [`impl_enum_type!`] +//! and [`impl_enum_wrapper!`] macros defined below. For string-backed enums, +//! see the [`DatabaseString`] trait. #[macro_use] extern crate diesel; diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 1224bf88911..66d15f5d6f7 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -6,18 +6,8 @@ //! reports (sitreps). //! //! See [RFD 603](https://rfd.shared.oxide.computer/rfd/0603) for details on the -//! fault management sitrep. -//! -//! # Conventions -//! -//! Methods with an `_on_conn` suffix take an explicit database connection -//! parameter. In other parts of the codebase this suffix sometimes implies the -//! method is called within a transaction, but that isn't always the case here. -//! In the FM datastore, `_on_conn` methods exist primarily so that multiple -//! queries can share the same connection for consistency (e.g. loading a sitrep -//! reads child records first and metadata last on the same connection, to -//! detect concurrent deletes without requiring a transaction, see -//! [`DataStore::fm_sitrep_read_on_conn`] and issue #9594). +//! fault management sitrep, and the [datastore module documentation](super) for +//! general conventions. use super::DataStore; use crate::authz; diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 37525b020b8..4dea932147a 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -3,6 +3,19 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. //! Primary control plane interface for database read and write operations +//! +//! # Method naming conventions +//! +//! Many methods in this module have a public entry point that acquires a +//! database connection from the pool, paired with a private `_on_conn` (or +//! `_on_connection`) helper that takes an explicit connection parameter. The +//! helper exists so that multiple queries can share the same connection without +//! re-acquiring one from the pool for each step of a multi-query operation. +//! +//! A related convention is `_in_txn` (or `_in_transaction`), for helpers that +//! run inside an explicit database transaction. These typically take an +//! [`OptionalError`](nexus_db_errors::OptionalError) so they can bail +//! out of the entire transaction on application-level errors. // TODO-scalability review all queries for use of indexes (may need // "time_deleted IS NOT NULL" conditions) Figure out how to automate this. diff --git a/nexus/src/app/background/tasks/fm_rendezvous.rs b/nexus/src/app/background/tasks/fm_rendezvous.rs index 78eddc35785..d5d72fe793c 100644 --- a/nexus/src/app/background/tasks/fm_rendezvous.rs +++ b/nexus/src/app/background/tasks/fm_rendezvous.rs @@ -87,14 +87,14 @@ impl FmRendezvous { // `Conflict` errors from individual inserts, since multiple Nexus // instances may run this task concurrently. // - // Currently, these `alert_create` calls have no guard against a stale - // Nexus inserting alerts from an outdated sitrep. This is fine for now - // because alert requests are carried forward into newer sitreps, so a - // stale insert is redundant rather than incorrect. However, if alerts - // are ever hard-deleted (e.g. when a case is closed), a lagging Nexus - // could re-create "zombie" alert records after deletion. At that point, - // the INSERT should be guarded by a CTE that checks the sitrep - // generation matches the current one. + // TODO(#9592) Currently, these `alert_create` calls have no guard + // against a stale Nexus inserting alerts from an outdated sitrep. This + // is fine for now because alert requests are carried forward into newer + // sitreps, so a stale insert is redundant rather than incorrect. + // However, if alerts are ever hard-deleted (e.g. when a case is + // closed), a lagging Nexus could re-create "zombie" alert records after + // deletion. At that point, the INSERT should be guarded by a CTE that + // checks the sitrep generation matches the current one. for (case_id, req) in sitrep.alerts_requested() { let &AlertRequest { id, class, requested_sitrep_id, .. } = req; status.total_alerts_requested += 1;