diff --git a/nexus/db-model/src/fm.rs b/nexus/db-model/src/fm.rs index 353a2174aac..b2c4f5fd592 100644 --- a/nexus/db-model/src/fm.rs +++ b/nexus/db-model/src/fm.rs @@ -11,7 +11,8 @@ //! //! These types are used when inserting and reading sitreps in CRDB; when in //! use, the sitrep is represented as a [`nexus_types::fm::Sitrep`]. See the -//! documentation in [`nexus_types::fm`] for more information. +//! documentation in [`nexus_types::fm`] for more information, and the +//! [crate-level documentation](crate) for general conventions. use crate::SqlU32; use crate::typed_uuid::DbTypedUuid; diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 49c45acfa01..fbb650f25d4 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -2,7 +2,82 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Structures stored to the database. +// We only use rustdoc for internal documentation, including private items, so +// it's expected that we'll have links to private items in the docs. +#![allow(rustdoc::private_intra_doc_links)] + +//! Rust types that represent rows in the Omicron database (CockroachDB). +//! +//! Each struct in this crate maps to a database table. The table's columns and +//! SQL types are declared in [`nexus_db_schema`] via Diesel's +//! [`table!`](diesel::table) macro; the struct here provides the corresponding +//! Rust types. `nexus_db_queries` then uses both to build and execute +//! queries in its `DataStore` methods. +//! +//! # How the mapping works +//! +//! Consider the `vmm` table. The schema declares columns and SQL types: +//! +//! ```text +//! // in nexus-db-schema +//! table! { +//! vmm (id) { +//! id -> Uuid, +//! sled_id -> Uuid, +//! state_generation -> Int8, +//! state -> VmmStateEnum, +//! ... +//! } +//! } +//! ``` +//! +//! The model struct provides the Rust representation of a row: +//! +//! ```text +//! // in nexus-db-model +//! #[derive(Queryable, Insertable, Selectable)] +//! #[diesel(table_name = vmm)] +//! pub struct Vmm { +//! pub id: Uuid, +//! pub sled_id: DbTypedUuid, +//! #[diesel(column_name = state_generation)] +//! pub generation: Generation, +//! pub state: VmmState, +//! ... +//! } +//! ``` +//! +//! A few things to note: +//! +//! - **Type translation.** Each field's Rust type must implement Diesel's +//! [`FromSql`](diesel::deserialize::FromSql) and +//! [`ToSql`](diesel::serialize::ToSql) traits for the corresponding column's +//! SQL type. Diesel provides these impls for common types (`Uuid`, +//! `DateTime`, `String`, etc.); this crate defines wrapper types like +//! [`DbTypedUuid`] for domain-specific conversions +//! (e.g. distinguishing a sled UUID from any other UUID). +//! - **Column renaming.** Field names must match column names by default, but +//! `#[diesel(column_name = ...)]` allows the Rust name to differ (e.g. +//! `generation` for the `state_generation` column). +//! - **Positional mapping.** [`diesel::Queryable`] maps SQL result columns to +//! struct fields by position, not by name. If the field order doesn't match +//! the column order in the query, fields will silently receive wrong values. +//! Deriving [`diesel::Selectable`] mitigates this by generating an explicit +//! column list, so the result columns are always in the order `Queryable` +//! expects. **Always derive both.** +//! +//! # Field ordering convention +//! +//! Fields in `Queryable`/`Insertable` structs should be ordered to match the +//! column order in `dbinit.sql`. This keeps the Rust types easy to +//! cross-reference with the schema definition and prevents subtle bugs if a +//! query ever omits `Selectable`. +//! +//! # Representing enums +//! +//! For types that map to database enum columns, see the [`impl_enum_type!`] +//! and [`impl_enum_wrapper!`] macros defined below. For string-backed enums, +//! see the [`DatabaseString`] trait. #[macro_use] extern crate diesel; diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 81072530b5c..66d15f5d6f7 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -6,7 +6,8 @@ //! reports (sitreps). //! //! See [RFD 603](https://rfd.shared.oxide.computer/rfd/0603) for details on the -//! fault management sitrep. +//! fault management sitrep, and the [datastore module documentation](super) for +//! general conventions. use super::DataStore; use crate::authz; @@ -503,6 +504,13 @@ impl DataStore { // rather than doing smaller ones for each case in the sitrep. This uses // more memory in Nexus but reduces the number of small db queries we // perform. + // + // The ordering of inserts among case child records (ereports, alert + // requests) and case metadata doesn't matter: there are no foreign key + // constraints between these tables, and garbage collection is keyed on + // sitrep_id (which is inserted first above). If we crash partway + // through, orphaned child records will be cleaned up when the orphaned + // sitrep is garbage collected. let mut cases = Vec::with_capacity(sitrep.cases.len()); let mut alerts_requested = Vec::new(); let mut case_ereports = Vec::new(); diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 37525b020b8..4dea932147a 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -3,6 +3,19 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. //! Primary control plane interface for database read and write operations +//! +//! # Method naming conventions +//! +//! Many methods in this module have a public entry point that acquires a +//! database connection from the pool, paired with a private `_on_conn` (or +//! `_on_connection`) helper that takes an explicit connection parameter. The +//! helper exists so that multiple queries can share the same connection without +//! re-acquiring one from the pool for each step of a multi-query operation. +//! +//! A related convention is `_in_txn` (or `_in_transaction`), for helpers that +//! run inside an explicit database transaction. These typically take an +//! [`OptionalError`](nexus_db_errors::OptionalError) so they can bail +//! out of the entire transaction on application-level errors. // TODO-scalability review all queries for use of indexes (may need // "time_deleted IS NOT NULL" conditions) Figure out how to automate this. diff --git a/nexus/db-queries/src/lib.rs b/nexus/db-queries/src/lib.rs index c02c781c8e2..17db1568071 100644 --- a/nexus/db-queries/src/lib.rs +++ b/nexus/db-queries/src/lib.rs @@ -2,6 +2,10 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +// We only use rustdoc for internal documentation, including private items, so +// it's expected that we'll have links to private items in the docs. +#![allow(rustdoc::private_intra_doc_links)] + //! Facilities for working with the Omicron database pub use nexus_auth::authn; diff --git a/nexus/src/app/background/tasks/fm_rendezvous.rs b/nexus/src/app/background/tasks/fm_rendezvous.rs index 4d15465b84c..d5d72fe793c 100644 --- a/nexus/src/app/background/tasks/fm_rendezvous.rs +++ b/nexus/src/app/background/tasks/fm_rendezvous.rs @@ -82,7 +82,19 @@ impl FmRendezvous { // XXX(eliza): is it better to allocate all of these into a big array // and do a single `INSERT INTO` query, or iterate over them one by one - // (not allocating) but insert one at a time? + // (not allocating) but insert one at a time? Note that a batched insert + // would need to use `ON CONFLICT DO NOTHING` rather than checking for + // `Conflict` errors from individual inserts, since multiple Nexus + // instances may run this task concurrently. + // + // TODO(#9592) Currently, these `alert_create` calls have no guard + // against a stale Nexus inserting alerts from an outdated sitrep. This + // is fine for now because alert requests are carried forward into newer + // sitreps, so a stale insert is redundant rather than incorrect. + // However, if alerts are ever hard-deleted (e.g. when a case is + // closed), a lagging Nexus could re-create "zombie" alert records after + // deletion. At that point, the INSERT should be guarded by a CTE that + // checks the sitrep generation matches the current one. for (case_id, req) in sitrep.alerts_requested() { let &AlertRequest { id, class, requested_sitrep_id, .. } = req; status.total_alerts_requested += 1; @@ -97,7 +109,9 @@ impl FmRendezvous { ) .await { - // Alert already exists, that's fine. + // Alert already exists --- this is expected, since multiple + // Nexus instances may run this task concurrently for the same + // sitrep, or a previous activation may have partially completed. Err(Error::Conflict { .. }) => {} Err(e) => { slog::warn!(