Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -861,7 +861,7 @@ mod api_impl {

async fn support_nvmeadm_info(
_request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError>
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>
{
unimplemented!()
}
Expand Down
1 change: 1 addition & 0 deletions openapi/sled-agent/sled-agent-27.0.0-42911d.json.gitstub
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
df724fae54459e6ffa609f932547c643fb52e3f7:openapi/sled-agent/sled-agent-27.0.0-42911d.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"url": "https://oxide.computer",
"email": "api@oxide.computer"
},
"version": "27.0.0"
"version": "28.0.0"
},
"paths": {
"/artifacts": {
Expand Down Expand Up @@ -766,7 +766,11 @@
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SledDiagnosticsQueryOutput"
"title": "Array_of_SledDiagnosticsQueryOutput",
"type": "array",
"items": {
"$ref": "#/components/schemas/SledDiagnosticsQueryOutput"
}
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion openapi/sled-agent/sled-agent-latest.json
24 changes: 23 additions & 1 deletion sled-agent/api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ api_versions!([
// | example for the next person.
// v
// (next_int, IDENT),
(28, MORE_NVMEADM_OUTPUT),
(27, RENAME_SWITCH_LOCATION_TO_SWITCH_SLOT),
(26, RACK_NETWORK_CONFIG_NOT_OPTIONAL),
(25, BOOTSTORE_VERSIONING),
Expand Down Expand Up @@ -1103,10 +1104,31 @@ pub trait SledAgentApi {
#[endpoint {
method = GET,
path = "/support/nvmeadm-info",
versions = VERSION_MORE_NVMEADM_OUTPUT..,
}]
async fn support_nvmeadm_info(
request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError>;
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>;

#[endpoint {
operation_id = "support_nvmeadm_info",
method = GET,
path = "/support/nvmeadm-info",
versions = ..VERSION_MORE_NVMEADM_OUTPUT,
}]
async fn support_nvmeadm_info_v27(
request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError> {
Self::support_nvmeadm_info(request_context).await.map(
|HttpResponseOk(items)| {
HttpResponseOk(items.into_iter().next().unwrap_or(
SledDiagnosticsQueryOutput::Failure {
error: String::from("no nvmeadm output available"),
},
))
},
)
}

#[endpoint {
method = GET,
Expand Down
12 changes: 9 additions & 3 deletions sled-agent/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1252,12 +1252,18 @@ impl SledAgentApi for SledAgentImpl {

async fn support_nvmeadm_info(
request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError> {
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>
{
let sa = request_context.context();
sa.latencies()
.instrument_dropshot_handler(&request_context, async {
let res = sa.support_nvmeadm_info().await;
Ok(HttpResponseOk(res.get_output()))
let res = sa
.support_nvmeadm_info()
.await
.into_iter()
.map(|cmd| cmd.get_output())
.collect::<Vec<_>>();
Ok(HttpResponseOk(res))
})
.await
}
Expand Down
3 changes: 2 additions & 1 deletion sled-agent/src/sim/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -919,7 +919,8 @@ impl SledAgentApi for SledAgentSimImpl {

async fn support_nvmeadm_info(
_request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError> {
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>
{
method_unimplemented()
}

Expand Down
2 changes: 1 addition & 1 deletion sled-agent/src/sled_agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1255,7 +1255,7 @@ impl SledAgent {

pub(crate) async fn support_nvmeadm_info(
&self,
) -> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
) -> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
sled_diagnostics::nvmeadm_info().await
}

Expand Down
24 changes: 22 additions & 2 deletions sled-diagnostics/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ use queries::*;
/// Max number of ptool commands to run in parallel
const MAX_PTOOL_PARALLELISM: usize = 50;

/// Number of NVMe drives on a sled.
const NUM_NVME_DRIVES: u32 = 12;

/// List all zones on a sled.
pub async fn zoneadm_info()
-> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
Expand Down Expand Up @@ -83,8 +86,25 @@ pub async fn dladm_info()
}

pub async fn nvmeadm_info()
-> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
execute_command_with_timeout(nvmeadm_list(), DEFAULT_TIMEOUT).await
-> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
let mut results = Vec::new();

// Run these serially so that the disk log pages are listed in order in the
// output.
let res =
execute_command_with_timeout(nvmeadm_list(), DEFAULT_TIMEOUT).await;
results.push(res);

for disk_num in 0..NUM_NVME_DRIVES {
let res = execute_command_with_timeout(
nvmeadm_logpage_health(disk_num),
DEFAULT_TIMEOUT,
)
.await;
results.push(res);
}

results
}

pub async fn pargs_oxide_processes(
Expand Down
11 changes: 11 additions & 0 deletions sled-diagnostics/src/queries.rs
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,17 @@ pub fn nvmeadm_list() -> Command {
cmd
}

pub fn nvmeadm_logpage_health(nvme_num: u32) -> Command {
let mut cmd = std::process::Command::new(PFEXEC);
cmd.env_clear()
.arg(NVMEADM)
.arg("-v")
.arg("get-logpage")
.arg(&format!("nvme{nvme_num}"))
.arg("health");
cmd
}
Comment on lines +251 to +260

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're going to invoke this, please just use the -O option to get-logpage to send this entirely to a binary file that can be interpreted more efficiently with tools.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rmustacc I think this is a "yes and" situation. When we we're specifically looking for a problem with a disk, the binary files are superior. In scenarios where we're just performing a quick health check against the bundle, it's more convenient to have text output.

Text files can be analyzed on non-illumos hosts, and are trivial to read without extracting the files, e.g., bundle-cat bundle.zip --path '*logpage*' | 'grep -A 4 "Critical Warnings"'.

Happy to make a follow-on PR for the binary health log page, and any others you want.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The text output format is not a stable interface and is going to change. So I think it's critical if we're going to build tooling on top of this that we're doing something that is going to continue to work and not silently break.

Copy link
Contributor Author

@wfchandler wfchandler Mar 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like the print-logpage CL you have in flight for illumos will cover both of our needs, or maybe get-logpage -p.

Perhaps I should just close this PR and wait for those command to be available.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same features for print-logpage work for get-logpage. However, if I were doing the support bundle, I would again just gather the thing we want once and then do whatever we want after the fact. Note, the changes going in there don't touch the extent logs today, but will in the future.


pub fn pargs_process(pid: i32) -> Command {
let mut cmd = std::process::Command::new(PFEXEC);
cmd.env_clear().arg(PARGS).arg("-ae").arg(pid.to_string());
Expand Down
Loading