From e7fbdeef56f79e54d66698ab18082d626a21ccb1 Mon Sep 17 00:00:00 2001 From: Josh Carp Date: Tue, 3 Mar 2026 10:47:58 -0500 Subject: [PATCH] Add per-zone cpu metrics. Add a new oximeter instrument for tracking per-zone cpu statistics with kstat, and use it in sled-agent. We add metrics for cpu_nsec_{user,sys,waitrq}, for a total cardinality of triple the number of internal zones. --- oximeter/instruments/Cargo.toml | 3 +- oximeter/instruments/src/kstat/mod.rs | 14 ++ oximeter/instruments/src/kstat/zone.rs | 304 +++++++++++++++++++++++++ oximeter/oximeter/schema/zone.toml | 55 +++++ sled-agent/src/metrics.rs | 118 ++++++++-- sled-agent/src/sled_agent.rs | 6 +- 6 files changed, 480 insertions(+), 20 deletions(-) create mode 100644 oximeter/instruments/src/kstat/zone.rs create mode 100644 oximeter/oximeter/schema/zone.toml diff --git a/oximeter/instruments/Cargo.toml b/oximeter/instruments/Cargo.toml index 48ed5c1b06f..15cfcbf2c37 100644 --- a/oximeter/instruments/Cargo.toml +++ b/oximeter/instruments/Cargo.toml @@ -27,7 +27,7 @@ uuid = { workspace = true, optional = true } omicron-workspace-hack.workspace = true [features] -default = ["http-instruments", "cpu", "datalink"] +default = ["http-instruments", "cpu", "datalink", "zone"] http-instruments = [ "dep:chrono", "dep:dropshot", @@ -55,6 +55,7 @@ kstat = [ ] cpu = ["kstat"] datalink = ["kstat"] +zone = ["kstat"] [dev-dependencies] rand.workspace = true diff --git a/oximeter/instruments/src/kstat/mod.rs b/oximeter/instruments/src/kstat/mod.rs index 84634deffc6..acf8b308046 100644 --- a/oximeter/instruments/src/kstat/mod.rs +++ b/oximeter/instruments/src/kstat/mod.rs @@ -92,6 +92,8 @@ pub mod cpu; #[cfg(any(feature = "datalink", test))] pub mod link; mod sampler; +#[cfg(any(feature = "zone", test))] +pub mod zone; pub use sampler::CollectionDetails; pub use sampler::ExpirationBehavior; @@ -236,6 +238,7 @@ pub trait ConvertNamedData { fn as_u32(&self) -> Result<u32, Error>; fn as_i64(&self) -> Result<i64, Error>; fn as_u64(&self) -> Result<u64, Error>;
+ fn as_str(&self) -> Result<&str, Error>; } impl ConvertNamedData for NamedData<'_> { @@ -282,6 +285,17 @@ impl ConvertNamedData for NamedData<'_> { }) } } + + fn as_str(&self) -> Result<&str, Error> { + if let NamedData::String(x) = self { + Ok(*x) + } else { + Err(Error::UnexpectedDataType { + expected: NamedType::String, + found: self.data_type(), + }) + } + } } /// Return a high-resolution monotonic timestamp, in nanoseconds since an diff --git a/oximeter/instruments/src/kstat/zone.rs b/oximeter/instruments/src/kstat/zone.rs new file mode 100644 index 00000000000..f96ad559261 --- /dev/null +++ b/oximeter/instruments/src/kstat/zone.rs @@ -0,0 +1,304 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Report metrics about zones on the host system + +use crate::kstat::ConvertNamedData; +use crate::kstat::Error; +use crate::kstat::KstatList; +use crate::kstat::KstatTarget; +use crate::kstat::hrtime_to_utc; +use kstat_rs::Data; +use kstat_rs::Kstat; +use kstat_rs::Named; +use oximeter::FieldType; +use oximeter::FieldValue; +use oximeter::Sample; +use oximeter::Target; +use oximeter::types::Cumulative; +use uuid::Uuid; + +/// The prefix for zone CPU kstat fields. +const CPU_NSEC_PREFIX: &str = "nsec_"; + +/// The CPU states we track from zone kstats. +const CPU_STATES: &[&str] = &["user", "sys", "waitrq"]; + +/// The prefix used for Omicron zone names. +const ZONE_PREFIX: &str = "oxz_"; + +/// Parsed zone metadata from a zone name formatted as "oxz_TYPE_UUID". +struct ZoneMetadata { + zone_type: String, + zone_id: Uuid, +} + +/// Parse a zone name into its service type and UUID. +/// +/// Returns `None` if the zone name isn't formatted as +/// "oxz_TYPE_UUID". +/// +/// TODO: Consider passing typed zone metadata from sled-agent instead of +/// parsing zone names. 
As of this writing, zone names are easy to parse, +/// and we can avoid the complexity of per-zone tracking or maintaining a +/// shared mapping of zone metadata. +fn parse_zone_name(zone_name: &str) -> Option<ZoneMetadata> { + let rest = zone_name.strip_prefix(ZONE_PREFIX)?; + let (zone_type, uuid_str) = rest.rsplit_once('_')?; + let zone_id = uuid_str.parse().ok()?; + Some(ZoneMetadata { zone_type: zone_type.to_string(), zone_id }) +} + +oximeter::use_timeseries!("zone.toml"); +pub use self::zone::Zone as ZoneTarget; + +/// CPU metrics for all zones on a sled. +#[derive(Clone, Debug)] +pub struct Zone { + /// The oximeter target for zone metrics on this sled. + pub target: ZoneTarget, + /// Flag indicating whether the sled is synced with NTP. + pub time_synced: bool, +} + +impl Zone { + /// Create a new `Zone` with the given target and synchronization flag. + pub fn new(target: ZoneTarget, time_synced: bool) -> Self { + Self { target, time_synced } + } + + /// Return the sled ID. + pub fn sled_id(&self) -> Uuid { + self.target.sled_id + } +} + +impl KstatTarget for Zone { + fn interested(&self, kstat: &Kstat<'_>) -> bool { + self.time_synced && kstat.ks_module == "zones" + } + + fn to_samples( + &self, + kstats: KstatList<'_, '_>, + ) -> Result<Vec<Sample>, Error> { + let mut samples = Vec::new(); + + for (creation_time, kstat, data) in kstats.iter() { + let snapshot_time = hrtime_to_utc(kstat.ks_snaptime)?; + + let Data::Named(named) = data else { + return Err(Error::ExpectedNamedKstat); + }; + + /* Parse zone kstats into cpu samples. + + Stats for the zone module look like this (stats we don't use elided): + + ...
+ zones:26:oxz_cockroachdb_8bbea076-ff60-:nsec_sys 112675830670973 + zones:26:oxz_cockroachdb_8bbea076-ff60-:nsec_user 550830053620923 + zones:26:oxz_cockroachdb_8bbea076-ff60-:nsec_waitrq 9211749392692 + zones:26:oxz_cockroachdb_8bbea076-ff60-:zonename oxz_cockroachdb_8bbea076-ff60-4330-8302-383e18140ef3 + + The zone name in the identifier is truncated, so use the + zonename statistic instead. Then parse cpu-related + statistics into a cpu_nsec metric labeled by state. + */ + + // Must have exactly one statistic called "zonename". + let zone_name = named + .iter() + .find(|n| n.name == "zonename") + .ok_or(Error::NoSuchKstat) + .and_then(|n| n.value.as_str())? + .to_string(); + let (zone_type, zone_id) = match parse_zone_name(&zone_name) { + Some(m) => (m.zone_type, m.zone_id), + None => (String::new(), Uuid::nil()), + }; + + for named_data in named.iter() { + let Named { name, value } = named_data; + + let Some(state) = name.strip_prefix(CPU_NSEC_PREFIX) else { + continue; + }; + if !CPU_STATES.contains(&state) { + continue; + } + + let datum = value.as_u64()?; + let metric = zone::CpuNsec { + zone_name: zone_name.clone().into(), + zone_type: zone_type.clone().into(), + zone_id, + state: state.to_string().into(), + datum: Cumulative::with_start_time(*creation_time, datum), + }; + let sample = Sample::new_with_timestamp( + snapshot_time, + &self.target, + &metric, + ) + .map_err(Error::Sample)?; + samples.push(sample); + } + } + + Ok(samples) + } +} + +// NOTE: Delegate to the inner target type for this implementation. 
+impl Target for Zone { + fn name(&self) -> &'static str { + self.target.name() + } + + fn field_names(&self) -> &'static [&'static str] { + self.target.field_names() + } + + fn field_types(&self) -> Vec<FieldType> { + self.target.field_types() + } + + fn field_values(&self) -> Vec<FieldValue> { + self.target.field_values() + } +} + +#[cfg(test)] +mod parse_tests { + use super::*; + + #[test] + fn test_parse_zone_name_omicron_zone() { + let metadata = parse_zone_name( + "oxz_cockroachdb_2be512e2-e127-40f0-95a4-67763ac02185", + ) + .unwrap(); + assert_eq!(metadata.zone_type, "cockroachdb"); + assert_eq!( + metadata.zone_id, + "2be512e2-e127-40f0-95a4-67763ac02185".parse::<Uuid>().unwrap() + ); + } + + #[test] + fn test_parse_zone_name_no_prefix() { + assert!(parse_zone_name("global").is_none()); + } + + #[test] + fn test_parse_zone_name_no_uuid() { + assert!(parse_zone_name("oxz_switch").is_none()); + } + + #[test] + fn test_parse_zone_name_invalid_uuid() { + assert!(parse_zone_name("oxz_foo_bar").is_none()); + } +} + +#[cfg(all(test, target_os = "illumos"))] +mod tests { + use super::*; + use kstat_rs::Ctl; + use uuid::Uuid; + use uuid::uuid; + + /// The metric name we expect to produce for each zone. + const ZONE_METRIC: &str = "cpu_nsec"; + + const RACK_ID: Uuid = uuid!("de784702-cafb-41a9-b3e5-93af189def29"); + const SLED_ID: Uuid = uuid!("88240343-5262-45f4-86f1-3c82fe383f2a"); + const SLED_MODEL: &str = "fake-gimlet"; + const SLED_REVISION: u32 = 1; + const SLED_SERIAL: &str = "fake-serial"; + + fn test_target() -> ZoneTarget { + ZoneTarget { + rack_id: RACK_ID, + sled_id: SLED_ID, + sled_model: SLED_MODEL.into(), + sled_revision: SLED_REVISION, + sled_serial: SLED_SERIAL.into(), + } + } + + #[test] + fn test_kstat_interested() { + let mut zone = Zone::new(test_target(), false); + + let ctl = Ctl::new().unwrap(); + let ctl = ctl.update().unwrap(); + + // There should be at least the global zone kstat.
+ let kstat = ctl + .filter(Some("zones"), None, None) + .next() + .expect("should have at least one zones kstat"); + + // Not interested when not time synced. + assert!(!zone.interested(&kstat)); + + // Interested when time synced. + zone.time_synced = true; + assert!(zone.interested(&kstat)); + + // Not interested in non-zone kstats. + if let Some(cpu_kstat) = + ctl.filter(Some("cpu"), Some(0), Some("sys")).next() + { + assert!(!zone.interested(&cpu_kstat)); + } + } + + #[test] + fn test_zone_samples() { + let zone = Zone::new(test_target(), true); + let ctl = Ctl::new().unwrap(); + let ctl = ctl.update().unwrap(); + + // Collect kstats for the first reported zone. + let mut kstat = ctl + .filter(Some("zones"), None, None) + .next() + .expect("should have at least one zones kstat"); + let creation_time = hrtime_to_utc(kstat.ks_crtime).unwrap(); + let data = ctl.read(&mut kstat).unwrap(); + let samples = zone.to_samples(&[(creation_time, kstat, data)]).unwrap(); + + // Assert that all metrics have the expected timeseries name. + assert!( + samples + .iter() + .all(|s| s.timeseries_name == format!("zone:{ZONE_METRIC}")) + ); + + // Extract the state from each sample. + let mut states: Vec<_> = samples + .iter() + .filter_map(|s| { + s.sorted_metric_fields().get("state").and_then(|f| { + match &f.value { + oximeter::FieldValue::String(s) => { + Some(s.as_ref().to_string()) + } + _ => None, + } + }) + }) + .collect(); + states.sort(); + + // Assert that we found all expected cpu states. 
+ let mut expected: Vec<_> = + CPU_STATES.iter().map(|s| s.to_string()).collect(); + expected.sort(); + assert_eq!(states, expected); + } +} diff --git a/oximeter/oximeter/schema/zone.toml b/oximeter/oximeter/schema/zone.toml new file mode 100644 index 00000000000..860a3da0fe4 --- /dev/null +++ b/oximeter/oximeter/schema/zone.toml @@ -0,0 +1,55 @@ +format_version = 1 + +[target] +name = "zone" +description = "Metrics for a host zone" +authz_scope = "fleet" + +versions = [ + { version = 1, fields = [ "rack_id", "sled_id", "sled_model", "sled_revision", "sled_serial" ] }, +] + +[fields.rack_id] +type = "uuid" +description = "ID for the rack" + +[fields.sled_id] +type = "uuid" +description = "ID for the sled" + +[fields.sled_model] +type = "string" +description = "Model number of the sled" + +[fields.sled_revision] +type = "u32" +description = "Revision number of the sled" + +[fields.sled_serial] +type = "string" +description = "Serial number of the sled" + +[fields.zone_name] +type = "string" +description = "Name of the host zone" + +[fields.zone_type] +type = "string" +description = "Service type of the zone (e.g. 
cockroachdb, nexus)" + +[fields.zone_id] +type = "uuid" +description = "UUID of the zone" + +[fields.state] +type = "string" +description = "The CPU state (user, sys, waitrq)" + +[[metrics]] +name = "cpu_nsec" +description = "Cumulative CPU nanoseconds across all threads in a zone, by scheduling state" +units = "nanoseconds" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "zone_name", "zone_type", "zone_id", "state" ] } +] diff --git a/sled-agent/src/metrics.rs b/sled-agent/src/metrics.rs index 8e1ead85486..3f9aa194ab5 100644 --- a/sled-agent/src/metrics.rs +++ b/sled-agent/src/metrics.rs @@ -19,6 +19,8 @@ use oximeter_instruments::kstat::cpu::SledCpu; use oximeter_instruments::kstat::cpu::SledCpuTarget; use oximeter_instruments::kstat::link::SledDataLink; use oximeter_instruments::kstat::link::SledDataLinkTarget; +use oximeter_instruments::kstat::zone::Zone; +use oximeter_instruments::kstat::zone::ZoneTarget; use oximeter_producer::LogConfig; use oximeter_producer::Server as ProducerServer; use slog::Logger; @@ -46,6 +48,10 @@ const METRIC_COLLECTION_INTERVAL: Duration = Duration::from_secs(30); // now. const LINK_SAMPLE_INTERVAL: Duration = Duration::from_secs(10); +const CPU_SAMPLE_INTERVAL: Duration = Duration::from_secs(10); + +const ZONE_SAMPLE_INTERVAL: Duration = Duration::from_secs(10); + /// The interval after which we expire kstat-based collection of transient /// links. /// @@ -88,10 +94,10 @@ pub enum Message { /// /// This is only used on startup, to track the underlays. TrackPhysical { zone_name: String, name: String }, - /// Start tracking CPU metrics for this sled. + /// Start tracking sled-level stats (CPU, zones). /// /// This is only used on startup. - TrackCpu, + TrackSledStats, /// Track the named VNIC. TrackVnic { zone_name: String, name: String }, /// Stop tracking the named VNIC. 
@@ -143,7 +149,8 @@ async fn metrics_task( mut rx: mpsc::Receiver<Message>, ) { let mut tracked_links: TrackedLinks = HashMap::new(); - let mut tracked_cpu: Option<SledCpu> = None; + let mut tracked_zone: Option<Zone> = None; + let mut tracked_sled_cpu: Option<SledCpu> = None; let mut sled_time_synced: bool = false; // Main polling loop, waiting for messages from other pieces of the code to @@ -171,11 +178,19 @@ async fn metrics_task( add_datalink(&log, &mut tracked_links, &kstat_sampler, link) .await; } - Message::TrackCpu => { + Message::TrackSledStats => { + add_zone( + &log, + &sled_identifiers, + &mut tracked_zone, + &kstat_sampler, + sled_time_synced, + ) + .await; add_sled_cpu( &log, &sled_identifiers, - &mut tracked_cpu, + &mut tracked_sled_cpu, &kstat_sampler, sled_time_synced, ) @@ -232,7 +247,9 @@ async fn metrics_task( &kstat_sampler, ) .await; - sync_sled_cpu(&log, &mut tracked_cpu, &kstat_sampler).await; + sync_zone(&log, &mut tracked_zone, &kstat_sampler).await; + sync_sled_cpu(&log, &mut tracked_sled_cpu, &kstat_sampler) + .await; } } } @@ -360,7 +377,73 @@ fn is_transient_link(kind: &str) -> bool { kind == LinkKind::VNIC || kind == LinkKind::OPTE } -/// Start tracking CPU metrics for the sled. +/// Start tracking zone metrics for the sled. +async fn add_zone( + log: &Logger, + sled_identifiers: &SledIdentifiers, + tracked_zone: &mut Option<Zone>, + kstat_sampler: &KstatSampler, + time_synced: bool, +) { + if tracked_zone.is_some() { + debug!(log, "zone metrics already being tracked"); + return; + } + + let target = ZoneTarget { + rack_id: sled_identifiers.rack_id, + sled_id: sled_identifiers.sled_id, + sled_model: sled_identifiers.model.clone().into(), + sled_revision: sled_identifiers.revision, + sled_serial: sled_identifiers.serial.clone().into(), + }; + let zone = Zone::new(target, time_synced); + + // We have one target per sled that samples all zones, so there's no + // need to expire it.
+ let details = CollectionDetails::never(ZONE_SAMPLE_INTERVAL); + match kstat_sampler.add_target(zone.clone(), details).await { + Ok(_id) => { + debug!(log, "added zone metrics to kstat sampler"); + *tracked_zone = Some(zone); + } + Err(err) => { + error!( + log, + "failed to add zone metrics to kstat sampler"; + "error" => ?err, + ); + } + } +} + +/// Update zone tracking when the sled is synced with NTP. +async fn sync_zone( + log: &Logger, + tracked_zone: &mut Option<Zone>, + kstat_sampler: &KstatSampler, +) { + let Some(zone) = tracked_zone.as_mut() else { + return; + }; + + zone.time_synced = true; + let details = CollectionDetails::never(ZONE_SAMPLE_INTERVAL); + match kstat_sampler.update_target(zone.clone(), details).await { + Ok(_) => { + debug!(log, "updated zone metrics after time sync"); + } + Err(err) => { + error!( + log, + "failed to update zone metrics after time sync"; + "error" => ?err, + ); + } + } +} + +/// Start tracking sled CPU metrics for the sled. async fn add_sled_cpu( log: &Logger, sled_identifiers: &SledIdentifiers, @@ -382,8 +465,9 @@ async fn add_sled_cpu( }; let cpu = SledCpu::new(target, time_synced); - // CPUs are permanent, so we never expire them. - let details = CollectionDetails::never(LINK_SAMPLE_INTERVAL); + // We have one target per sled that samples all CPUs, so there's no + // need to expire it. + let details = CollectionDetails::never(CPU_SAMPLE_INTERVAL); match kstat_sampler.add_target(cpu.clone(), details).await { Ok(_id) => { debug!(log, "added CPU metrics to kstat sampler"); @@ -399,7 +483,7 @@ async fn add_sled_cpu( } } -/// Update CPU tracking when the sled is synced with NTP. +/// Update sled CPU tracking when the sled is synced with NTP.
async fn sync_sled_cpu( log: &Logger, tracked_cpu: &mut Option<SledCpu>, @@ -410,15 +494,15 @@ async fn sync_sled_cpu( }; cpu.time_synced = true; - let details = CollectionDetails::never(LINK_SAMPLE_INTERVAL); + let details = CollectionDetails::never(CPU_SAMPLE_INTERVAL); match kstat_sampler.update_target(cpu.clone(), details).await { Ok(_) => { - debug!(log, "updated CPU metrics after time sync"); + debug!(log, "updated sled CPU metrics after time sync"); } Err(err) => { error!( log, - "failed to update CPU metrics after time sync"; + "failed to update sled CPU metrics after time sync"; "error" => ?err, ); } @@ -528,12 +612,14 @@ impl MetricsRequestQueue { .map_err(|e| Error::SendFailed(e)) } - /// Ask the task to start tracking CPU metrics for this sled. + /// Ask the task to start tracking sled-level stats (CPU, zones). /// /// This is non-blocking, and returns an error if the task is currently /// unavailable. - pub fn track_cpu(&self) -> Result<(), Error> { - self.0.try_send(Message::TrackCpu).map_err(|e| Error::SendFailed(e)) + pub fn track_sled_stats(&self) -> Result<(), Error> { + self.0 + .try_send(Message::TrackSledStats) + .map_err(|e| Error::SendFailed(e)) } /// Ask the task to start tracking the named VNIC. diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index f3e426611dd..7f2626ddfe3 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -548,10 +548,10 @@ impl SledAgent { } } - // Start tracking CPU metrics. - match metrics_manager.request_queue().track_cpu() { + // Start tracking sled-level stats (CPU, zones). + match metrics_manager.request_queue().track_sled_stats() { Ok(_) => { - debug!(log, "started tracking CPU metrics") + debug!(log, "started tracking sled stats") } Err(e) => error!( log,