From 57fd364f99b778e22e3a05c24c2793ce0b28398f Mon Sep 17 00:00:00 2001 From: Emily Albini Date: Mon, 2 Mar 2026 13:55:18 +0100 Subject: [PATCH 1/5] fix flaky test test_sled_sp_inventory_matching --- .../integration_tests/inventory_matching.rs | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/nexus/tests/integration_tests/inventory_matching.rs b/nexus/tests/integration_tests/inventory_matching.rs index 3faab873551..96362963807 100644 --- a/nexus/tests/integration_tests/inventory_matching.rs +++ b/nexus/tests/integration_tests/inventory_matching.rs @@ -7,6 +7,8 @@ use nexus_db_queries::context::OpContext; use nexus_test_utils_macros::nexus_test; use nexus_types::identity::Asset; +use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition}; +use std::time::Duration; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -20,12 +22,21 @@ async fn test_sled_sp_inventory_matching(cptestctx: &ControlPlaneTestContext) { let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Get the latest inventory collection - let inventory = datastore - .inventory_get_latest_collection(&opctx) - .await - .expect("failed to get inventory collection") - .expect("no inventory collection available"); + // Get the latest inventory collection. When running tests with high + // concurrency the inventory might not be ready yet, so we retry. + let inventory = wait_for_condition( + || async { + datastore + .inventory_get_latest_collection(&opctx) + .await + .expect("failed to get inventory collection") + .ok_or(CondCheckError::<()>::NotYet) + }, + &Duration::from_millis(100), + &Duration::from_secs(30), + ) + .await + .expect("no inventory collection available"); // Get all sleds let sleds = datastore From 351f79dd7f1bc6edaa8609279ca8c7d0e00f1eb9 Mon Sep 17 00:00:00 2001 From: Emily Albini Date: Mon, 9 Mar 2026 13:10:37 +0100 Subject: [PATCH 2/5] fix flaky test test_update_status --- nexus/tests/integration_tests/updates.rs | 28 ++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/nexus/tests/integration_tests/updates.rs b/nexus/tests/integration_tests/updates.rs index 2238a0d5951..953e4b25e03 100644 --- a/nexus/tests/integration_tests/updates.rs +++ b/nexus/tests/integration_tests/updates.rs @@ -39,6 +39,8 @@ use tufaceous_lib::assemble::{ArtifactManifest, OmicronRepoAssembler}; use tufaceous_lib::assemble::{DeserializedManifest, ManifestTweak}; use crate::integration_tests::target_release::set_target_release_for_mupdate_recovery; +use omicron_test_utils::dev::poll::CondCheckError; +use omicron_test_utils::dev::poll::wait_for_condition; const TRUST_ROOTS_URL: &str = "/v1/system/update/trust-roots"; @@ -173,6 +175,25 @@ impl TestRepo { } } +async fn wait_for_inventory(cptestctx: &ControlPlaneTestContext) { + let log = cptestctx.logctx.log.new(o!()); + let datastore = cptestctx.server.server_context().nexus.datastore(); + let opctx = OpContext::for_tests(log, datastore.clone()); + wait_for_condition( + || async { + datastore + .inventory_get_latest_collection(&opctx) + .await + .expect("failed to get inventory collection") + .ok_or(CondCheckError::<()>::NotYet) + }, + &std::time::Duration::from_millis(100), + &std::time::Duration::from_secs(30), + ) + .await + .expect("no inventory collection available"); +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_repo_upload_unconfigured() -> Result<()> { let cptestctx = nexus_test_utils::ControlPlaneBuilder::new( @@ -721,6 +742,13 @@ async fn test_update_status() -> Result<()> { let client = &cptestctx.external_client; let logctx = &cptestctx.logctx; + // During high contention the inventory might not be ready yet, which will + // cause the call to /v1/system/update/status to 500. We thus query the + // database to make sure we have an inventory before proceeding. + // + // Flaky test issue: https://github.com/oxidecomputer/omicron/issues/9316 + wait_for_inventory(&cptestctx).await; + // initial status let status: update::UpdateStatus = object_get(client, "/v1/system/update/status").await; From 5ff0724fa29f2d456afe3ee81f7ddd2ee35201d4 Mon Sep 17 00:00:00 2001 From: Emily Albini Date: Mon, 9 Mar 2026 18:24:01 +0100 Subject: [PATCH 3/5] fix flaky test test_request_without_api_version --- nexus/tests/integration_tests/updates.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nexus/tests/integration_tests/updates.rs b/nexus/tests/integration_tests/updates.rs index 953e4b25e03..f1a65c8b383 100644 --- a/nexus/tests/integration_tests/updates.rs +++ b/nexus/tests/integration_tests/updates.rs @@ -1083,6 +1083,12 @@ async fn test_request_without_api_version(cptestctx: &ControlPlaneTestContext) { let server_addr = cptestctx.server.get_http_server_external_address(); let test_cx = ClientTestContext::new(server_addr, cptestctx.logctx.log.clone()); + + // During high contention the inventory might not be ready yet, which will + // cause the call to /v1/system/update/status to 500. We thus query the + // database to make sure we have an inventory before proceeding. + wait_for_inventory(cptestctx).await; + let req_builder = RequestBuilder::new( &test_cx, http::Method::GET, From 7db4959b8e58331d189cbc69c63771f1c850536a Mon Sep 17 00:00:00 2001 From: Emily Albini Date: Mon, 9 Mar 2026 18:27:09 +0100 Subject: [PATCH 4/5] fix flaky test get_set_target_release --- .../tests/integration_tests/target_release.rs | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/nexus/tests/integration_tests/target_release.rs b/nexus/tests/integration_tests/target_release.rs index 8944377cddb..33167204832 100644 --- a/nexus/tests/integration_tests/target_release.rs +++ b/nexus/tests/integration_tests/target_release.rs @@ -26,9 +26,12 @@ use nexus_types::deployment::BlueprintHostPhase2DesiredContents; use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::external_api::update; use nexus_types::external_api::update::SetTargetReleaseParams; +use omicron_test_utils::dev::poll::CondCheckError; +use omicron_test_utils::dev::poll::wait_for_condition; use omicron_uuid_kinds::{BlueprintUuid, GenericUuid}; use semver::Version; use std::sync::Arc; +use std::time::Duration; use tufaceous_artifact::ArtifactKind; use tufaceous_artifact::{ArtifactVersion, KnownArtifactKind}; use tufaceous_lib::assemble::ManifestTweak; @@ -44,6 +47,28 @@ async fn get_set_target_release() -> Result<()> { let client = &ctx.external_client; let logctx = &ctx.logctx; + // During high contention the inventory might not be ready yet, which will + // cause the call to /v1/system/update/status to 500. We thus query the + // database to make sure we have an inventory before proceeding. + { + let datastore = ctx.server.server_context().nexus.datastore(); + let opctx = + OpContext::for_tests(logctx.log.new(o!()), datastore.clone()); + wait_for_condition( + || async { + datastore + .inventory_get_latest_collection(&opctx) + .await + .expect("failed to get inventory collection") + .ok_or(CondCheckError::<()>::NotYet) + }, + &Duration::from_millis(100), + &Duration::from_secs(30), + ) + .await + .expect("no inventory collection available"); + } + // There is no target release before one has ever been specified let status: update::UpdateStatus = object_get(client, "/v1/system/update/status").await; From d992d4672201786d7a577ef2290fe588b7443e72 Mon Sep 17 00:00:00 2001 From: Emily Albini Date: Mon, 2 Mar 2026 13:55:18 +0100 Subject: [PATCH 5/5] monitor resource consumption on linux --- .github/buildomat/build-and-test.sh | 3 + .../buildomat/jobs/build-and-test-helios.sh | 3 +- .../buildomat/jobs/build-and-test-linux.sh | 1 + .github/buildomat/jobs/check-features.sh | 4 + .github/buildomat/jobs/clippy.sh | 7 +- .github/buildomat/jobs/omicron-common.sh | 7 +- .github/buildomat/jobs/package.sh | 6 +- .github/buildomat/jobs/tuf-repo.sh | 6 +- Cargo.lock | 8 + Cargo.toml | 1 + dev-tools/ci-resource-usage/Cargo.toml | 13 ++ dev-tools/ci-resource-usage/src/illumos.rs | 163 ++++++++++++++++++ dev-tools/ci-resource-usage/src/linux.rs | 139 +++++++++++++++ dev-tools/ci-resource-usage/src/main.rs | 151 ++++++++++++++++ 14 files changed, 507 insertions(+), 5 deletions(-) create mode 100644 dev-tools/ci-resource-usage/Cargo.toml create mode 100644 dev-tools/ci-resource-usage/src/illumos.rs create mode 100644 dev-tools/ci-resource-usage/src/linux.rs create mode 100644 dev-tools/ci-resource-usage/src/main.rs diff --git a/.github/buildomat/build-and-test.sh b/.github/buildomat/build-and-test.sh index f3f8e2b8113..759bfe25c7a 100755 --- a/.github/buildomat/build-and-test.sh +++ b/.github/buildomat/build-and-test.sh @@ -24,6 +24,9 @@ cargo --version rustc --version curl -sSfL --retry 10 https://get.nexte.st/"$NEXTEST_VERSION"/"$1" | gunzip | tar -xvf - -C ~/.cargo/bin +cargo build --release -p ci-resource-usage +bmat process start ci-resource-usage target/release/ci-resource-usage /var/tmp/ci-resource-usage.csv + # # Set up a custom temporary directory within whatever one we were given so that # we can check later whether we left detritus around. diff --git a/.github/buildomat/jobs/build-and-test-helios.sh b/.github/buildomat/jobs/build-and-test-helios.sh index 2bf5608200b..e902eef1267 100755 --- a/.github/buildomat/jobs/build-and-test-helios.sh +++ b/.github/buildomat/jobs/build-and-test-helios.sh @@ -2,7 +2,7 @@ #: #: name = "build-and-test (helios)" #: variety = "basic" -#: target = "helios-2.0" +#: target = "helios-2.0-32c256gb" #: rust_toolchain = true #: output_rules = [ #: "%/work/*", @@ -12,6 +12,7 @@ #: "%/var/tmp/omicron_tmp/**/*", #: "!/var/tmp/omicron_tmp/crdb-base*", #: "!/var/tmp/omicron_tmp/rustc*", +#: "%/var/tmp/ci-resource-usage.csv", #: ] #: access_repos = [ #: "oxidecomputer/dendrite" diff --git a/.github/buildomat/jobs/build-and-test-linux.sh b/.github/buildomat/jobs/build-and-test-linux.sh index e9d0bcefadc..3dbec6831f2 100755 --- a/.github/buildomat/jobs/build-and-test-linux.sh +++ b/.github/buildomat/jobs/build-and-test-linux.sh @@ -11,6 +11,7 @@ #: "%/var/tmp/omicron_tmp/**/*", #: "!/var/tmp/omicron_tmp/crdb-base*", #: "!/var/tmp/omicron_tmp/rustc*", +#: "%/var/tmp/ci-resource-usage.csv", #: ] #: access_repos = [ #: "oxidecomputer/dendrite", diff --git a/.github/buildomat/jobs/check-features.sh b/.github/buildomat/jobs/check-features.sh index 03dbee4cfa2..f1e1afb6023 100644 --- a/.github/buildomat/jobs/check-features.sh +++ b/.github/buildomat/jobs/check-features.sh @@ -6,6 +6,7 @@ #: rust_toolchain = true #: output_rules = [ #: "/out/*", +#: "%/var/tmp/ci-resource-usage.csv", #: ] # Run the check-features `xtask` on illumos, testing compilation of feature combinations. @@ -20,6 +21,9 @@ source .github/buildomat/ci-env.sh cargo --version rustc --version +cargo build --release -p ci-resource-usage +bmat process start ci-resource-usage target/release/ci-resource-usage /var/tmp/ci-resource-usage.csv + # # Set up our PATH for use with this workspace. # diff --git a/.github/buildomat/jobs/clippy.sh b/.github/buildomat/jobs/clippy.sh index 66f5208088d..68d47a0314d 100755 --- a/.github/buildomat/jobs/clippy.sh +++ b/.github/buildomat/jobs/clippy.sh @@ -4,7 +4,9 @@ #: variety = "basic" #: target = "helios-2.0" #: rust_toolchain = true -#: output_rules = [] +#: output_rules = [ +#: "%/var/tmp/ci-resource-usage.csv", +#: ] # Run clippy on illumos (not just other systems) because a bunch of our code # (that we want to check) is conditionally-compiled on illumos only. @@ -28,6 +30,9 @@ source .github/buildomat/ci-env.sh cargo --version rustc --version +cargo build --release -p ci-resource-usage +bmat process start ci-resource-usage target/release/ci-resource-usage /var/tmp/ci-resource-usage.csv + banner prerequisites ptime -m bash ./tools/install_builder_prerequisites.sh -y diff --git a/.github/buildomat/jobs/omicron-common.sh b/.github/buildomat/jobs/omicron-common.sh index 676c18f52ad..7b0d7b235c2 100755 --- a/.github/buildomat/jobs/omicron-common.sh +++ b/.github/buildomat/jobs/omicron-common.sh @@ -4,7 +4,9 @@ #: variety = "basic" #: target = "helios-2.0" #: rust_toolchain = true -#: output_rules = [] +#: output_rules = [ +#: "%/var/tmp/ci-resource-usage.csv", +#: ] # Verify that omicron-common builds successfully when used as a dependency # in an external project. It must not leak anything that requires an external @@ -20,6 +22,9 @@ source .github/buildomat/ci-env.sh cargo --version rustc --version +cargo build --release -p ci-resource-usage +bmat process start ci-resource-usage target/release/ci-resource-usage /var/tmp/ci-resource-usage.csv + cd /tmp cargo new --lib test-project cd test-project diff --git a/.github/buildomat/jobs/package.sh b/.github/buildomat/jobs/package.sh index 53b2e960da9..e5cf9c6ec1e 100755 --- a/.github/buildomat/jobs/package.sh +++ b/.github/buildomat/jobs/package.sh @@ -2,10 +2,11 @@ #: #: name = "helios / package" #: variety = "basic" -#: target = "helios-2.0" +#: target = "helios-2.0-16c64gb" #: rust_toolchain = true #: output_rules = [ #: "=/work/package.tar.gz", +#: "%/var/tmp/ci-resource-usage.csv", #: ] #: @@ -19,6 +20,9 @@ source .github/buildomat/ci-env.sh cargo --version rustc --version +cargo build --release -p ci-resource-usage +bmat process start ci-resource-usage target/release/ci-resource-usage /var/tmp/ci-resource-usage.csv + WORK=/work pfexec mkdir -p $WORK && pfexec chown $USER $WORK diff --git a/.github/buildomat/jobs/tuf-repo.sh b/.github/buildomat/jobs/tuf-repo.sh index 9e7c7326bcb..8262b42df2b 100755 --- a/.github/buildomat/jobs/tuf-repo.sh +++ b/.github/buildomat/jobs/tuf-repo.sh @@ -2,7 +2,7 @@ #: #: name = "helios / build TUF repo" #: variety = "basic" -#: target = "helios-2.0" +#: target = "helios-2.0-16c128gb" #: rust_toolchain = true #: output_rules = [ #: "=/work/manifest.toml", @@ -12,6 +12,7 @@ #: "=/work/incorporation.p5m", #: "=/work/incorporation.p5p", #: "%/work/*.log", +#: "%/var/tmp/ci-resource-usage.csv", #: ] #: access_repos = [ #: "oxidecomputer/amd-firmware", @@ -61,6 +62,9 @@ source .github/buildomat/ci-env.sh cargo --version rustc --version +cargo build --release -p ci-resource-usage +bmat process start ci-resource-usage target/release/ci-resource-usage /var/tmp/ci-resource-usage.csv + # Before we do _anything_, quickly check that Cargo.lock is properly locked. # Most of our tools (including releng!) eventually call `cargo xtask`, which # runs without `--locked` and will update the lockfile. diff --git a/Cargo.lock b/Cargo.lock index 31e144f33ec..eb072a11051 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1330,6 +1330,14 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "ci-resource-usage" +version = "0.1.0" +dependencies = [ + "anyhow", + "kstat-rs", +] + [[package]] name = "ciborium" version = "0.2.2" diff --git a/Cargo.toml b/Cargo.toml index 17e9dd6acf4..28c0b216a61 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,7 @@ members = [ "cockroach-metrics", "common", "dev-tools/cert-dev", + "dev-tools/ci-resource-usage", "dev-tools/clickana", "dev-tools/clickhouse-cluster-dev", "dev-tools/ch-dev", diff --git a/dev-tools/ci-resource-usage/Cargo.toml b/dev-tools/ci-resource-usage/Cargo.toml new file mode 100644 index 00000000000..9033c80fbdd --- /dev/null +++ b/dev-tools/ci-resource-usage/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "ci-resource-usage" +version = "0.1.0" +edition.workspace = true + +[dependencies] +anyhow.workspace = true + +[target.'cfg(target_os = "illumos")'.dependencies] +kstat-rs.workspace = true + +[lints] +workspace = true diff --git a/dev-tools/ci-resource-usage/src/illumos.rs b/dev-tools/ci-resource-usage/src/illumos.rs new file mode 100644 index 00000000000..126eec9f2c7 --- /dev/null +++ b/dev-tools/ci-resource-usage/src/illumos.rs @@ -0,0 +1,163 @@ +use crate::{CpuCore, Io, Memory, Swap}; +use anyhow::{Context as _, Error, anyhow, bail}; +use kstat_rs::{Ctl, Data, Kstat, NamedData}; + +pub(crate) fn gather_cpu_cores() -> Result, Error> { + let mut cores = Vec::new(); + let ctl = Ctl::new()?; + for mut entry in ctl.filter(Some("cpu"), None, Some("sys")) { + let parsed = KstatCpuCore::read(&ctl, &mut entry)?; + + let used_ticks = parsed.cpu_ticks_user + parsed.cpu_ticks_kernel; + let wait_ticks = parsed.cpu_ticks_wait; + cores.push(CpuCore { + total_ticks: used_ticks + wait_ticks + parsed.cpu_ticks_idle, + used_ticks, + wait_ticks, + }); + } + Ok(cores) +} + +pub(crate) fn gather_io() -> Result { + let mut io = Io { read_ops: 0, write_ops: 0 }; + let ctl = Ctl::new()?; + for mut entry in ctl.filter(Some("blkdev"), None, None) { + let Data::Io(data) = ctl.read(&mut entry)? else { + bail!("expected io for {}", entry.ks_name); + }; + io.read_ops += u64::from(data.reads); + io.write_ops += u64::from(data.writes); + } + Ok(io) +} + +pub(crate) fn gather_load_1min() -> Result { + let data: KstatSystemMisc = get("unix", 0, "system_misc")?; + Ok(data.avenrun_1min as f64 / 256.0) +} + +pub(crate) fn gather_memory() -> Result { + let data: KstatSystemPages = get("unix", 0, "system_pages")?; + Ok(Memory { total: data.physmem * 4096, available: data.freemem * 4096 }) +} + +pub(crate) fn gather_swap() -> Result { + let mut swap = Swap { read: 0, write: 0 }; + let ctl = Ctl::new()?; + for mut entry in ctl.filter(Some("cpu"), None, Some("vm")) { + let parsed = KstatCpuVm::read(&ctl, &mut entry)?; + swap.read += parsed.anonpgin; + swap.write += parsed.anonpgout; + } + Ok(swap) +} + +fn get(module: &str, inst: i32, name: &str) -> Result { + let ctl = Ctl::new()?; + let mut kstat = ctl + .filter(Some(module), Some(inst), Some(name)) + .next() + .ok_or_else(|| anyhow!("missing {module}:{inst}:{name} kstat"))?; + T::read(&ctl, &mut kstat) +} + +macro_rules! kstat_named { + (struct $ident:ident { $($field:ident: $ty:ty,)* }) => { + struct $ident { + $($field: $ty,)* + } + + impl ReadNamed for $ident { + fn read(ctl: &Ctl, entry: &mut Kstat<'_>) -> Result { + let id = format!( + "{}:{}:{}", + entry.ks_module, + entry.ks_instance, + entry.ks_name, + ); + + let Data::Named(vec_named) = ctl.read(entry)? else { + bail!("expected named data for {id}"); + }; + + $(let mut $field = None;)* + for named in vec_named { + match named.name { + $(stringify!($field) => { + $field = Some( + <$ty>::from_named_data(named.value) + .with_context(|| format!( + "failed to parse field {} of {id}", + named.name, + ))? + ); + })* + _ => {} + } + } + + Ok(Self { + $( + $field: $field.ok_or_else(|| anyhow!( + "missing kstat field named {} in {id}", + stringify!($field), + ))?, + )* + }) + } + } + } +} + +kstat_named! { + struct KstatCpuCore { + cpu_ticks_idle: u64, + cpu_ticks_kernel: u64, + cpu_ticks_user: u64, + cpu_ticks_wait: u64, + } +} + +kstat_named! { + struct KstatCpuVm { + anonpgin: u64, + anonpgout: u64, + } +} + +kstat_named! { + struct KstatSystemMisc { + avenrun_1min: u64, + } +} + +kstat_named! { + struct KstatSystemPages { + physmem: u64, + freemem: u64, + } +} + +trait ReadNamed: Sized { + fn read(ctl: &Ctl, kstat: &mut Kstat<'_>) -> Result; +} + +trait FromNamedData: Sized { + fn from_named_data(named_data: NamedData<'_>) -> Result; +} + +impl FromNamedData for u64 { + fn from_named_data(named_data: NamedData<'_>) -> Result { + match named_data { + NamedData::Char(_) => bail!("expected unsigned int, found Char"), + NamedData::Int32(_) => bail!("expected unsigned int, found Int32"), + NamedData::UInt32(inner) => Ok(inner.into()), + NamedData::Int64(_) => bail!("expetced unsigned int, found Int64"), + NamedData::UInt64(inner) => Ok(inner), + NamedData::String(_) => { + bail!("expected unsigned int, found String"); + } + } + } +} diff --git a/dev-tools/ci-resource-usage/src/linux.rs b/dev-tools/ci-resource-usage/src/linux.rs new file mode 100644 index 00000000000..d7084dfb8dd --- /dev/null +++ b/dev-tools/ci-resource-usage/src/linux.rs @@ -0,0 +1,139 @@ +use crate::{CpuCore, Io, Memory, Swap}; +use anyhow::{Context, Error, anyhow, bail}; + +pub(crate) fn gather_cpu_cores() -> Result, Error> { + fn field(iter: &mut dyn Iterator) -> Result { + let value = iter + .next() + .ok_or_else(|| anyhow!("there are less fields than expected"))?; + Ok(value.parse()?) + } + + // man 5 proc_stat + let stat = std::fs::read_to_string("/proc/stat") + .context("failed to read /proc/stat")?; + + let mut per_core = Vec::new(); + for line in stat.lines() { + let mut parser = line.split(' ').filter(|w| !w.is_empty()); + let Some(key) = parser.next() else { continue }; + if !key.starts_with("cpu") || key == "cpu" { + continue; + } + + let user = field(&mut parser)?; + let nice = field(&mut parser)?; + let system = field(&mut parser)?; + let idle = field(&mut parser)?; + let iowait = field(&mut parser)?; + let irq = field(&mut parser)?; + let softirq = field(&mut parser)?; + let steal = field(&mut parser)?; + let guest = field(&mut parser)?; + let guest_nice = field(&mut parser)?; + + let used_ticks = + user + nice + system + irq + softirq + steal + guest + guest_nice; + per_core.push(CpuCore { + total_ticks: used_ticks + idle + iowait, + wait_ticks: iowait, + used_ticks, + }); + } + + Ok(per_core) +} + +pub(crate) fn gather_io() -> Result { + // https://www.kernel.org/doc/html/latest/admin-guide/iostats.html + let diskstats = std::fs::read_to_string("/proc/diskstats") + .context("failed to read /proc/diskstats")?; + + let mut io = Io { read_ops: 0, write_ops: 0 }; + for line in diskstats.lines() { + let mut parser = line.split(' ').filter(|w| !w.is_empty()); + let Some(device_minor) = parser.nth(1) else { continue }; + let _name = parser.next(); + let reads_completed: u64 = + parser.nth(1).ok_or_else(|| anyhow!("too few fields"))?.parse()?; + let writes_completed: u64 = + parser.nth(3).ok_or_else(|| anyhow!("too few fields"))?.parse()?; + + // /proc/diskstats contains data for both disks as a whole and + // individual partitions. The whole disk is always partition zero, so + // ignore everything that is a partition (device minor number > 0). + if device_minor != "0" { + continue; + } + + io.read_ops += reads_completed; + io.write_ops += writes_completed; + } + + Ok(io) +} + +pub(crate) fn gather_load_1min() -> Result { + let loadavg = std::fs::read_to_string("/proc/loadavg") + .context("failed to read /proc/loadavg")?; + let Some((load_1min, _rest)) = loadavg.split_once(' ') else { + bail!("invalid contents for /proc/loadavg"); + }; + Ok(load_1min.parse()?) +} + +pub(crate) fn gather_memory() -> Result { + fn parse_value(raw: &str) -> Result { + let Some((value, unit)) = raw.trim().split_once(' ') else { + bail!("missing space in value"); + }; + if unit != "kB" { + bail!("unexpected unit: {unit}"); + } + Ok(value.parse::()? * 1000) + } + + // man 5 proc_meminfo + let meminfo = std::fs::read_to_string("/proc/meminfo") + .context("failed to read /proc/meminfo")?; + + let mut total = None; + let mut available = None; + for line in meminfo.lines() { + let Some((key, value)) = line.split_once(':') else { continue }; + match key { + "MemTotal" => total = Some(parse_value(value)?), + "MemAvailable" => available = Some(parse_value(value)?), + _ => {} + } + } + + Ok(Memory { + total: total + .ok_or_else(|| anyhow!("missing MemTotal in /proc/meminfo"))?, + available: available + .ok_or_else(|| anyhow!("missing MemAvailable in /proc/meminfo"))?, + }) +} + +pub(crate) fn gather_swap() -> Result { + let vmstat = std::fs::read_to_string("/proc/vmstat") + .context("failed to read /proc/vmstat")?; + + let mut read = None; + let mut write = None; + for line in vmstat.lines() { + let Some((key, value)) = line.split_once(' ') else { continue }; + match key { + "pswpin" => read = Some(value.parse()?), + "pswpout" => write = Some(value.parse()?), + _ => {} + } + } + + Ok(Swap { + read: read.ok_or_else(|| anyhow!("missing pswpin in /proc/vmstat"))?, + write: write + .ok_or_else(|| anyhow!("missing pswpout in /proc/vmstat"))?, + }) +} diff --git a/dev-tools/ci-resource-usage/src/main.rs b/dev-tools/ci-resource-usage/src/main.rs new file mode 100644 index 00000000000..c86fa23a3bb --- /dev/null +++ b/dev-tools/ci-resource-usage/src/main.rs @@ -0,0 +1,151 @@ +#[cfg(target_os = "illumos")] +mod illumos; +#[cfg(target_os = "linux")] +mod linux; + +#[cfg(target_os = "illumos")] +use crate::illumos as sys; +#[cfg(target_os = "linux")] +use crate::linux as sys; + +use anyhow::{Context, Error, bail}; +use std::fs::File; +use std::io::Write; +use std::iter::{Sum, zip}; +use std::ops::Sub; +use std::time::{Duration, SystemTime}; + +fn main() -> Result<(), Error> { + let mut args = std::env::args_os().skip(1); + let mut dest: Box = match args.next() { + Some(path) => Box::new(File::create_new(&path).with_context(|| { + format!("failed to write to {}", path.display()) + })?), + None => Box::new(std::io::stdout().lock()), + }; + if args.next().is_some() { + bail!("usage: ci-resource-usage [output-path]"); + } + + let mut prev_cores = sys::gather_cpu_cores()?; + let mut prev_io = sys::gather_io()?; + let mut prev_swap = sys::gather_swap()?; + writeln!( + dest, + "timestamp,memory_used,load_1min,swap_read,swap_write,iops_read,\ + iops_write,cpu_wait,cpu_used{}", + (0..prev_cores.len()) + .map(|idx| format!(",cpu{idx}_used")) + .collect::() + )?; + loop { + std::thread::sleep(Duration::from_secs(1)); + + let mem = sys::gather_memory()?; + let curr_cores = sys::gather_cpu_cores()?; + let curr_io = sys::gather_io()?; + let io = curr_io - prev_io; + let curr_swap = sys::gather_swap()?; + let swap = curr_swap - prev_swap; + + let curr_cpu: CpuCore = curr_cores.iter().copied().sum(); + let prev_cpu: CpuCore = prev_cores.iter().copied().sum(); + let cpu = curr_cpu - prev_cpu; + + writeln!( + dest, + "{},{:.3},{:.2},{},{},{},{},{:.3},{:.3}{}", + SystemTime::now().duration_since(SystemTime::UNIX_EPOCH)?.as_secs(), + (mem.total - mem.available) as f64 / mem.total as f64, + sys::gather_load_1min()?, + swap.read, + swap.write, + io.read_ops, + io.write_ops, + cpu.wait_ticks as f64 / cpu.total_ticks as f64, + cpu.used_ticks as f64 / cpu.total_ticks as f64, + zip(&curr_cores, &prev_cores) + .map(|(curr, prev)| *curr - *prev) + .map(|core| core.used_ticks as f64 / core.total_ticks as f64) + .map(|percentage| format!(",{percentage:.3}")) + .collect::() + )?; + dest.flush()?; + + prev_cores = curr_cores; + prev_io = curr_io; + prev_swap = curr_swap; + } +} + +#[derive(Debug, Clone, Copy)] +struct Memory { + total: u64, + available: u64, +} + +#[derive(Debug, Clone, Copy)] +struct CpuCore { + total_ticks: u64, + used_ticks: u64, + wait_ticks: u64, +} + +impl Sub for CpuCore { + type Output = CpuCore; + + fn sub(self, rhs: CpuCore) -> Self::Output { + CpuCore { + total_ticks: self.total_ticks - rhs.total_ticks, + used_ticks: self.used_ticks - rhs.used_ticks, + wait_ticks: self.wait_ticks - rhs.wait_ticks, + } + } +} + +impl Sum for CpuCore { + fn sum>(iter: I) -> Self { + iter.fold( + CpuCore { total_ticks: 0, used_ticks: 0, wait_ticks: 0 }, + |acc, entry| CpuCore { + total_ticks: acc.total_ticks + entry.total_ticks, + used_ticks: acc.used_ticks + entry.used_ticks, + wait_ticks: acc.wait_ticks + entry.wait_ticks, + }, + ) + } +} + +#[derive(Debug, Clone, Copy)] +struct Swap { + read: u64, + write: u64, +} + +impl Sub for Swap { + type Output = Swap; + + fn sub(self, rhs: Swap) -> Self::Output { + Swap { + read: self.read - rhs.read, + write: self.write - rhs.write, + } + } +} + +#[derive(Debug, Clone, Copy)] +struct Io { + read_ops: u64, + write_ops: u64, +} + +impl Sub for Io { + type Output = Io; + + fn sub(self, rhs: Io) -> Self::Output { + Io { + read_ops: self.read_ops - rhs.read_ops, + write_ops: self.write_ops - rhs.write_ops, + } + } +}