|
| 1 | +// This Source Code Form is subject to the terms of the Mozilla Public |
| 2 | +// License, v. 2.0. If a copy of the MPL was not distributed with this |
| 3 | +// file, You can obtain one at https://mozilla.org/MPL/2.0/. |
| 4 | + |
| 5 | +//! Report metrics about zones on the host system |
| 6 | +
|
| 7 | +use crate::kstat::ConvertNamedData; |
| 8 | +use crate::kstat::Error; |
| 9 | +use crate::kstat::KstatList; |
| 10 | +use crate::kstat::KstatTarget; |
| 11 | +use crate::kstat::hrtime_to_utc; |
| 12 | +use kstat_rs::Data; |
| 13 | +use kstat_rs::Kstat; |
| 14 | +use kstat_rs::Named; |
| 15 | +use oximeter::FieldType; |
| 16 | +use oximeter::FieldValue; |
| 17 | +use oximeter::Sample; |
| 18 | +use oximeter::Target; |
| 19 | +use oximeter::types::Cumulative; |
| 20 | +use uuid::Uuid; |
| 21 | + |
/// Leading portion of the names of the per-zone CPU statistics we read.
///
/// The remainder of the statistic name (after this prefix) is the CPU state.
const CPU_NSEC_PREFIX: &str = "nsec_";

/// CPU states reported as samples; any other `nsec_*` statistic is skipped.
const CPU_STATES: &[&str] = &["user", "sys", "waitrq"];

/// Leading portion of zone names that follow the "oxz_TYPE_UUID" format.
const ZONE_PREFIX: &str = "oxz_";
| 30 | + |
| 31 | +/// Parsed zone metadata from a zone name formatted as "oxz_TYPE_UUID". |
| 32 | +struct ZoneMetadata { |
| 33 | + zone_type: String, |
| 34 | + zone_id: Uuid, |
| 35 | +} |
| 36 | + |
| 37 | +/// Parse a zone name into its service type and UUID. |
| 38 | +/// |
| 39 | +/// Returns `None` if the zone name isn't formatted as |
| 40 | +/// "oxz_TYPE_UUID". |
| 41 | +/// |
| 42 | +/// TODO: Consider passing typed zone metadata from sled-agent instead of |
| 43 | +/// parsing zone names. As of this writing, zone names are easy to parse, |
| 44 | +/// and we can avoid the complexity of per-zone tracking or maintaining a |
| 45 | +/// shared mapping of zone metadata. |
| 46 | +fn parse_zone_name(zone_name: &str) -> Option<ZoneMetadata> { |
| 47 | + let rest = zone_name.strip_prefix(ZONE_PREFIX)?; |
| 48 | + let (zone_type, uuid_str) = rest.rsplit_once('_')?; |
| 49 | + let zone_id = uuid_str.parse().ok()?; |
| 50 | + Some(ZoneMetadata { zone_type: zone_type.to_string(), zone_id }) |
| 51 | +} |
| 52 | + |
| 53 | +oximeter::use_timeseries!("zone.toml"); |
| 54 | +pub use self::zone::Zone as ZoneTarget; |
| 55 | + |
| 56 | +/// CPU metrics for all zones on a sled. |
| 57 | +#[derive(Clone, Debug)] |
| 58 | +pub struct Zone { |
| 59 | + /// The target for this sled's CPUs. |
| 60 | + pub target: ZoneTarget, |
| 61 | + /// Flag indicating whether the sled is synced with NTP. |
| 62 | + pub time_synced: bool, |
| 63 | +} |
| 64 | + |
| 65 | +impl Zone { |
| 66 | + /// Create a new `Zone` with the given target and synchronization flag. |
| 67 | + pub fn new(target: ZoneTarget, time_synced: bool) -> Self { |
| 68 | + Self { target, time_synced } |
| 69 | + } |
| 70 | + |
| 71 | + /// Return the sled ID. |
| 72 | + pub fn sled_id(&self) -> Uuid { |
| 73 | + self.target.sled_id |
| 74 | + } |
| 75 | +} |
| 76 | + |
| 77 | +impl KstatTarget for Zone { |
| 78 | + fn interested(&self, kstat: &Kstat<'_>) -> bool { |
| 79 | + self.time_synced && kstat.ks_module == "zones" |
| 80 | + } |
| 81 | + |
| 82 | + fn to_samples( |
| 83 | + &self, |
| 84 | + kstats: KstatList<'_, '_>, |
| 85 | + ) -> Result<Vec<Sample>, Error> { |
| 86 | + let mut samples = Vec::new(); |
| 87 | + |
| 88 | + for (creation_time, kstat, data) in kstats.iter() { |
| 89 | + let snapshot_time = hrtime_to_utc(kstat.ks_snaptime)?; |
| 90 | + |
| 91 | + let Data::Named(named) = data else { |
| 92 | + return Err(Error::ExpectedNamedKstat); |
| 93 | + }; |
| 94 | + |
| 95 | + /* Parse zone kstats into cpu samples. |
| 96 | +
|
| 97 | + States for the zone module look like this (stats we don't use elided): |
| 98 | +
|
| 99 | + ... |
| 100 | + zones:26:oxz_cockroachdb_8bbea076-ff60-:nsec_sys 112675830670973 |
| 101 | + zones:26:oxz_cockroachdb_8bbea076-ff60-:nsec_user 550830053620923 |
| 102 | + zones:26:oxz_cockroachdb_8bbea076-ff60-:nsec_waitrq 9211749392692 |
| 103 | + zones:26:oxz_cockroachdb_8bbea076-ff60-:zonename oxz_cockroachdb_8bbea076-ff60-4330-8302-383e18140ef3 |
| 104 | +
|
| 105 | + The zone name in the identifier is truncated, so use the |
| 106 | + zonename statistic instead. Then parse cpu-related |
| 107 | + statistics into a cpu_nsec metric labeled by state. |
| 108 | + */ |
| 109 | + |
| 110 | + // Must have exactly one statistic called "zonename". |
| 111 | + let zone_name = named |
| 112 | + .iter() |
| 113 | + .find(|n| n.name == "zonename") |
| 114 | + .ok_or(Error::NoSuchKstat) |
| 115 | + .and_then(|n| n.value.as_str())? |
| 116 | + .to_string(); |
| 117 | + let (zone_type, zone_id) = match parse_zone_name(&zone_name) { |
| 118 | + Some(m) => (m.zone_type, m.zone_id), |
| 119 | + None => (String::new(), Uuid::nil()), |
| 120 | + }; |
| 121 | + |
| 122 | + for named_data in named.iter() { |
| 123 | + let Named { name, value } = named_data; |
| 124 | + |
| 125 | + let Some(state) = name.strip_prefix(CPU_NSEC_PREFIX) else { |
| 126 | + continue; |
| 127 | + }; |
| 128 | + if !CPU_STATES.contains(&state) { |
| 129 | + continue; |
| 130 | + } |
| 131 | + |
| 132 | + let datum = value.as_u64()?; |
| 133 | + let metric = zone::CpuNsec { |
| 134 | + zone_name: zone_name.clone().into(), |
| 135 | + zone_type: zone_type.clone().into(), |
| 136 | + zone_id, |
| 137 | + state: state.to_string().into(), |
| 138 | + datum: Cumulative::with_start_time(*creation_time, datum), |
| 139 | + }; |
| 140 | + let sample = Sample::new_with_timestamp( |
| 141 | + snapshot_time, |
| 142 | + &self.target, |
| 143 | + &metric, |
| 144 | + ) |
| 145 | + .map_err(Error::Sample)?; |
| 146 | + samples.push(sample); |
| 147 | + } |
| 148 | + } |
| 149 | + |
| 150 | + Ok(samples) |
| 151 | + } |
| 152 | +} |
| 153 | + |
| 154 | +// NOTE: Delegate to the inner target type for this implementation. |
| 155 | +impl Target for Zone { |
| 156 | + fn name(&self) -> &'static str { |
| 157 | + self.target.name() |
| 158 | + } |
| 159 | + |
| 160 | + fn field_names(&self) -> &'static [&'static str] { |
| 161 | + self.target.field_names() |
| 162 | + } |
| 163 | + |
| 164 | + fn field_types(&self) -> Vec<FieldType> { |
| 165 | + self.target.field_types() |
| 166 | + } |
| 167 | + |
| 168 | + fn field_values(&self) -> Vec<FieldValue> { |
| 169 | + self.target.field_values() |
| 170 | + } |
| 171 | +} |
| 172 | + |
#[cfg(test)]
mod parse_tests {
    use super::*;

    /// A well-formed name yields both the type and the UUID.
    #[test]
    fn test_parse_zone_name_omicron_zone() {
        let ZoneMetadata { zone_type, zone_id } = parse_zone_name(
            "oxz_cockroachdb_2be512e2-e127-40f0-95a4-67763ac02185",
        )
        .unwrap();
        let expected_id: Uuid =
            "2be512e2-e127-40f0-95a4-67763ac02185".parse().unwrap();
        assert_eq!(zone_type, "cockroachdb");
        assert_eq!(zone_id, expected_id);
    }

    /// A name without the "oxz_" prefix is rejected.
    #[test]
    fn test_parse_zone_name_no_prefix() {
        assert!(matches!(parse_zone_name("global"), None));
    }

    /// A prefixed name with no further underscore is rejected.
    #[test]
    fn test_parse_zone_name_no_uuid() {
        assert!(matches!(parse_zone_name("oxz_switch"), None));
    }

    /// A trailing segment that is not a UUID is rejected.
    #[test]
    fn test_parse_zone_name_invalid_uuid() {
        assert!(matches!(parse_zone_name("oxz_foo_bar"), None));
    }
}
| 205 | + |
#[cfg(all(test, target_os = "illumos"))]
mod tests {
    use super::*;
    use kstat_rs::Ctl;
    use uuid::Uuid;
    use uuid::uuid;

    /// Name of the metric every produced sample should carry.
    const ZONE_METRIC: &str = "cpu_nsec";

    // Fixed identifiers for the fake sled used as the sample target.
    const RACK_ID: Uuid = uuid!("de784702-cafb-41a9-b3e5-93af189def29");
    const SLED_ID: Uuid = uuid!("88240343-5262-45f4-86f1-3c82fe383f2a");
    const SLED_MODEL: &str = "fake-gimlet";
    const SLED_REVISION: u32 = 1;
    const SLED_SERIAL: &str = "fake-serial";

    /// Build the target describing the fake sled.
    fn test_target() -> ZoneTarget {
        ZoneTarget {
            rack_id: RACK_ID,
            sled_id: SLED_ID,
            sled_model: SLED_MODEL.into(),
            sled_revision: SLED_REVISION,
            sled_serial: SLED_SERIAL.into(),
        }
    }

    #[test]
    fn test_kstat_interested() {
        let ctl = Ctl::new().unwrap().update().unwrap();

        // There should be at least the global zone kstat.
        let zones_kstat = ctl
            .filter(Some("zones"), None, None)
            .next()
            .expect("should have at least one zones kstat");

        // Without time sync, nothing is interesting.
        let unsynced = Zone::new(test_target(), false);
        assert!(!unsynced.interested(&zones_kstat));

        // With time sync, zones kstats are interesting.
        let synced = Zone::new(test_target(), true);
        assert!(synced.interested(&zones_kstat));

        // Kstats from other modules are ignored even when time is synced.
        if let Some(cpu_kstat) =
            ctl.filter(Some("cpu"), Some(0), Some("sys")).next()
        {
            assert!(!synced.interested(&cpu_kstat));
        }
    }

    #[test]
    fn test_zone_samples() {
        let zone = Zone::new(test_target(), true);
        let ctl = Ctl::new().unwrap().update().unwrap();

        // Read the first zones kstat reported by the system.
        let mut kstat = ctl
            .filter(Some("zones"), None, None)
            .next()
            .expect("should have at least one zones kstat");
        let creation_time = hrtime_to_utc(kstat.ks_crtime).unwrap();
        let data = ctl.read(&mut kstat).unwrap();
        let samples = zone.to_samples(&[(creation_time, kstat, data)]).unwrap();

        // Every sample must belong to the expected timeseries.
        let expected_name = format!("zone:{ZONE_METRIC}");
        for sample in samples.iter() {
            assert!(sample.timeseries_name == expected_name);
        }

        // Gather the string-valued "state" field of each sample.
        let mut states: Vec<String> = samples
            .iter()
            .filter_map(|sample| {
                match sample
                    .sorted_metric_fields()
                    .get("state")
                    .map(|field| &field.value)
                {
                    Some(oximeter::FieldValue::String(state)) => {
                        Some(state.as_ref().to_string())
                    }
                    _ => None,
                }
            })
            .collect();
        states.sort();

        // Exactly the tracked CPU states should be present.
        let mut expected: Vec<String> =
            CPU_STATES.iter().map(|state| state.to_string()).collect();
        expected.sort();
        assert_eq!(states, expected);
    }
}
0 commit comments