From cb97c8d5630c739fb0c0a4a7add85ea3f0d7f711 Mon Sep 17 00:00:00 2001 From: Will Chandler Date: Wed, 11 Mar 2026 13:43:39 -0400 Subject: [PATCH] sled-diagnostics: Capture nvmeadm health logpage During recent customer installs we have found that the `health` logpage exposed by `nvmeadm(8)` was useful in identifying failing drives. Add this output to support bundles. --- .../src/test_util/host_phase_2_test_state.rs | 2 +- .../sled-agent-27.0.0-42911d.json.gitstub | 1 + ...11d.json => sled-agent-28.0.0-137ef2.json} | 8 +++++-- openapi/sled-agent/sled-agent-latest.json | 2 +- sled-agent/api/src/lib.rs | 24 ++++++++++++++++++- sled-agent/src/http_entrypoints.rs | 12 +++++++--- sled-agent/src/sim/http_entrypoints.rs | 3 ++- sled-agent/src/sled_agent.rs | 2 +- sled-diagnostics/src/lib.rs | 24 +++++++++++++++++-- sled-diagnostics/src/queries.rs | 11 +++++++++ 10 files changed, 77 insertions(+), 12 deletions(-) create mode 100644 openapi/sled-agent/sled-agent-27.0.0-42911d.json.gitstub rename openapi/sled-agent/{sled-agent-27.0.0-42911d.json => sled-agent-28.0.0-137ef2.json} (99%) diff --git a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs index 820588798c5..dccff702df0 100644 --- a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs +++ b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs @@ -861,7 +861,7 @@ mod api_impl { async fn support_nvmeadm_info( _request_context: RequestContext, - ) -> Result, HttpError> + ) -> Result>, HttpError> { unimplemented!() } diff --git a/openapi/sled-agent/sled-agent-27.0.0-42911d.json.gitstub b/openapi/sled-agent/sled-agent-27.0.0-42911d.json.gitstub new file mode 100644 index 00000000000..09fb077b22c --- /dev/null +++ b/openapi/sled-agent/sled-agent-27.0.0-42911d.json.gitstub @@ -0,0 +1 @@ +df724fae54459e6ffa609f932547c643fb52e3f7:openapi/sled-agent/sled-agent-27.0.0-42911d.json diff --git a/openapi/sled-agent/sled-agent-27.0.0-42911d.json b/openapi/sled-agent/sled-agent-28.0.0-137ef2.json similarity index 99% rename from openapi/sled-agent/sled-agent-27.0.0-42911d.json rename to openapi/sled-agent/sled-agent-28.0.0-137ef2.json index dd26d73d836..89fd4293105 100644 --- a/openapi/sled-agent/sled-agent-27.0.0-42911d.json +++ b/openapi/sled-agent/sled-agent-28.0.0-137ef2.json @@ -7,7 +7,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "27.0.0" + "version": "28.0.0" }, "paths": { "/artifacts": { @@ -766,7 +766,11 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SledDiagnosticsQueryOutput" + "title": "Array_of_SledDiagnosticsQueryOutput", + "type": "array", + "items": { + "$ref": "#/components/schemas/SledDiagnosticsQueryOutput" + } } } } diff --git a/openapi/sled-agent/sled-agent-latest.json b/openapi/sled-agent/sled-agent-latest.json index 6caf751e245..ca55148a1d3 120000 --- a/openapi/sled-agent/sled-agent-latest.json +++ b/openapi/sled-agent/sled-agent-latest.json @@ -1 +1 @@ -sled-agent-27.0.0-42911d.json \ No newline at end of file +sled-agent-28.0.0-137ef2.json \ No newline at end of file diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index f7627c31319..ee3bcfedb99 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -37,6 +37,7 @@ api_versions!([ // | example for the next person. // v // (next_int, IDENT), + (28, MORE_NVMEADM_OUTPUT), (27, RENAME_SWITCH_LOCATION_TO_SWITCH_SLOT), (26, RACK_NETWORK_CONFIG_NOT_OPTIONAL), (25, BOOTSTORE_VERSIONING), @@ -1103,10 +1104,31 @@ pub trait SledAgentApi { #[endpoint { method = GET, path = "/support/nvmeadm-info", + versions = VERSION_MORE_NVMEADM_OUTPUT.., }] async fn support_nvmeadm_info( request_context: RequestContext, - ) -> Result, HttpError>; + ) -> Result>, HttpError>; + + #[endpoint { + operation_id = "support_nvmeadm_info", + method = GET, + path = "/support/nvmeadm-info", + versions = ..VERSION_MORE_NVMEADM_OUTPUT, + }] + async fn support_nvmeadm_info_v27( + request_context: RequestContext, + ) -> Result, HttpError> { + Self::support_nvmeadm_info(request_context).await.map( + |HttpResponseOk(items)| { + HttpResponseOk(items.into_iter().next().unwrap_or( + SledDiagnosticsQueryOutput::Failure { + error: String::from("no nvmeadm output available"), + }, + )) + }, + ) + } #[endpoint { method = GET, diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index d7c55c537e5..96f81435656 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -1252,12 +1252,18 @@ impl SledAgentApi for SledAgentImpl { async fn support_nvmeadm_info( request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { let sa = request_context.context(); sa.latencies() .instrument_dropshot_handler(&request_context, async { - let res = sa.support_nvmeadm_info().await; - Ok(HttpResponseOk(res.get_output())) + let res = sa + .support_nvmeadm_info() + .await + .into_iter() + .map(|cmd| cmd.get_output()) + .collect::>(); + Ok(HttpResponseOk(res)) }) .await } diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 2866f23d58b..1a1038fb224 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -919,7 +919,8 @@ impl SledAgentApi for SledAgentSimImpl { async fn support_nvmeadm_info( _request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { method_unimplemented() } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 4f9b682920e..ec4ebac9a83 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -1255,7 +1255,7 @@ impl SledAgent { pub(crate) async fn support_nvmeadm_info( &self, - ) -> Result { + ) -> Vec> { sled_diagnostics::nvmeadm_info().await } diff --git a/sled-diagnostics/src/lib.rs b/sled-diagnostics/src/lib.rs index 659468051a5..1f08543c45b 100644 --- a/sled-diagnostics/src/lib.rs +++ b/sled-diagnostics/src/lib.rs @@ -34,6 +34,9 @@ use queries::*; /// Max number of ptool commands to run in parallel const MAX_PTOOL_PARALLELISM: usize = 50; +/// Number of NVMe drives on a sled. +const NUM_NVME_DRIVES: u32 = 12; + /// List all zones on a sled. pub async fn zoneadm_info() -> Result { @@ -83,8 +86,25 @@ pub async fn dladm_info() } pub async fn nvmeadm_info() --> Result { - execute_command_with_timeout(nvmeadm_list(), DEFAULT_TIMEOUT).await +-> Vec> { + let mut results = Vec::new(); + + // Run these serially so that the disk log pages are listed in order in the + // output. + let res = + execute_command_with_timeout(nvmeadm_list(), DEFAULT_TIMEOUT).await; + results.push(res); + + for disk_num in 0..NUM_NVME_DRIVES { + let res = execute_command_with_timeout( + nvmeadm_logpage_health(disk_num), + DEFAULT_TIMEOUT, + ) + .await; + results.push(res); + } + + results } pub async fn pargs_oxide_processes( diff --git a/sled-diagnostics/src/queries.rs b/sled-diagnostics/src/queries.rs index c1452abbc4d..44b81ed76a5 100644 --- a/sled-diagnostics/src/queries.rs +++ b/sled-diagnostics/src/queries.rs @@ -248,6 +248,17 @@ pub fn nvmeadm_list() -> Command { cmd } +pub fn nvmeadm_logpage_health(nvme_num: u32) -> Command { + let mut cmd = std::process::Command::new(PFEXEC); + cmd.env_clear() + .arg(NVMEADM) + .arg("-v") + .arg("get-logpage") + .arg(&format!("nvme{nvme_num}")) + .arg("health"); + cmd +} + pub fn pargs_process(pid: i32) -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(PARGS).arg("-ae").arg(pid.to_string());