Skip to content

Commit cb2ed22

Browse files
drewlinuxdevel
authored andcommitted
fix(bootstrap): surface diagnostics for K8s namespace not ready failures (NVIDIA#466)
1 parent f18f195 commit cb2ed22

File tree

4 files changed

+266
-16
lines changed

4 files changed

+266
-16
lines changed

crates/openshell-bootstrap/src/errors.rs

Lines changed: 200 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -447,16 +447,20 @@ pub fn generic_failure_diagnosis(gateway_name: &str) -> GatewayFailureDiagnosis
447447
summary: "Gateway failed to start".to_string(),
448448
explanation: "The gateway encountered an unexpected error during startup.".to_string(),
449449
recovery_steps: vec![
450+
RecoveryStep::with_command(
451+
"Check container logs for details",
452+
format!("openshell doctor logs --name {gateway_name}"),
453+
),
454+
RecoveryStep::with_command(
455+
"Run diagnostics",
456+
format!("openshell doctor check --name {gateway_name}"),
457+
),
450458
RecoveryStep::with_command(
451459
"Try destroying and recreating the gateway",
452460
format!(
453461
"openshell gateway destroy --name {gateway_name} && openshell gateway start"
454462
),
455463
),
456-
RecoveryStep::with_command(
457-
"Check container logs for details",
458-
format!("docker logs openshell-cluster-{gateway_name}"),
459-
),
460464
RecoveryStep::new(
461465
"If the issue persists, report it at https://github.com/nvidia/openshell/issues",
462466
),
@@ -729,4 +733,196 @@ mod tests {
729733
);
730734
assert!(d.retryable);
731735
}
736+
737+
// -- generic_failure_diagnosis tests --
738+
739+
#[test]
740+
fn generic_diagnosis_suggests_doctor_logs() {
741+
let d = generic_failure_diagnosis("my-gw");
742+
let commands: Vec<String> = d
743+
.recovery_steps
744+
.iter()
745+
.filter_map(|s| s.command.clone())
746+
.collect();
747+
assert!(
748+
commands.iter().any(|c| c.contains("openshell doctor logs")),
749+
"expected 'openshell doctor logs' in recovery commands, got: {commands:?}"
750+
);
751+
}
752+
753+
#[test]
754+
fn generic_diagnosis_suggests_doctor_check() {
755+
let d = generic_failure_diagnosis("my-gw");
756+
let commands: Vec<String> = d
757+
.recovery_steps
758+
.iter()
759+
.filter_map(|s| s.command.clone())
760+
.collect();
761+
assert!(
762+
commands
763+
.iter()
764+
.any(|c| c.contains("openshell doctor check")),
765+
"expected 'openshell doctor check' in recovery commands, got: {commands:?}"
766+
);
767+
}
768+
769+
#[test]
770+
fn generic_diagnosis_includes_gateway_name() {
771+
let d = generic_failure_diagnosis("custom-name");
772+
let all_text: String = d
773+
.recovery_steps
774+
.iter()
775+
.filter_map(|s| s.command.clone())
776+
.collect::<Vec<_>>()
777+
.join(" ");
778+
assert!(
779+
all_text.contains("custom-name"),
780+
"expected gateway name in recovery commands, got: {all_text}"
781+
);
782+
}
783+
784+
// -- fallback behavior tests --
785+
786+
#[test]
787+
fn namespace_timeout_without_logs_returns_none() {
788+
// This is the most common user-facing error: a plain timeout with only
789+
// kubectl output. It must NOT match any specific pattern so the caller
790+
// can fall back to generic_failure_diagnosis.
791+
let diagnosis = diagnose_failure(
792+
"test",
793+
"K8s namespace not ready\n\nCaused by:\n \
794+
timed out waiting for namespace 'openshell' to exist: \
795+
error: the server doesn't have a resource type \"namespace\"",
796+
None,
797+
);
798+
assert!(
799+
diagnosis.is_none(),
800+
"plain namespace timeout should not match any specific pattern, got: {:?}",
801+
diagnosis.map(|d| d.summary)
802+
);
803+
}
804+
805+
#[test]
806+
fn namespace_timeout_with_pressure_logs_matches() {
807+
// When container logs reveal node pressure, the diagnosis engine
808+
// should detect it even though the error message itself is generic.
809+
let diagnosis = diagnose_failure(
810+
"test",
811+
"K8s namespace not ready\n\nCaused by:\n \
812+
timed out waiting for namespace 'openshell' to exist: <kubectl output>",
813+
Some("HEALTHCHECK_NODE_PRESSURE: DiskPressure"),
814+
);
815+
assert!(diagnosis.is_some(), "expected node pressure diagnosis");
816+
let d = diagnosis.unwrap();
817+
assert!(
818+
d.summary.contains("pressure"),
819+
"expected pressure in summary, got: {}",
820+
d.summary
821+
);
822+
}
823+
824+
#[test]
825+
fn namespace_timeout_with_corrupted_state_logs_matches() {
826+
// Container logs revealing RBAC corruption should be caught.
827+
let diagnosis = diagnose_failure(
828+
"test",
829+
"K8s namespace not ready\n\nCaused by:\n \
830+
timed out waiting for namespace 'openshell' to exist: <output>",
831+
Some(
832+
"configmaps \"extension-apiserver-authentication\" is forbidden: \
833+
User cannot get resource",
834+
),
835+
);
836+
assert!(diagnosis.is_some(), "expected corrupted state diagnosis");
837+
let d = diagnosis.unwrap();
838+
assert!(
839+
d.summary.contains("Corrupted"),
840+
"expected Corrupted in summary, got: {}",
841+
d.summary
842+
);
843+
}
844+
845+
#[test]
846+
fn namespace_timeout_with_no_route_logs_matches() {
847+
let diagnosis = diagnose_failure(
848+
"test",
849+
"K8s namespace not ready",
850+
Some("Error: no default route present before starting k3s"),
851+
);
852+
assert!(diagnosis.is_some(), "expected networking diagnosis");
853+
let d = diagnosis.unwrap();
854+
assert!(
855+
d.summary.contains("networking"),
856+
"expected networking in summary, got: {}",
857+
d.summary
858+
);
859+
}
860+
861+
#[test]
862+
fn diagnose_failure_with_logs_uses_combined_text() {
863+
// Verify that diagnose_failure combines error_message + container_logs
864+
// for pattern matching. The pattern "connection refused" is in logs,
865+
// not in the error message.
866+
let diagnosis = diagnose_failure(
867+
"test",
868+
"K8s namespace not ready",
869+
Some("dial tcp 127.0.0.1:6443: connect: connection refused"),
870+
);
871+
assert!(
872+
diagnosis.is_some(),
873+
"expected diagnosis from container logs pattern"
874+
);
875+
let d = diagnosis.unwrap();
876+
assert!(
877+
d.summary.contains("Network") || d.summary.contains("connectivity"),
878+
"expected network diagnosis, got: {}",
879+
d.summary
880+
);
881+
}
882+
883+
// -- end-to-end fallback pattern (mirrors CLI code) --
884+
885+
#[test]
886+
fn fallback_to_generic_produces_actionable_diagnosis() {
887+
// This mirrors the actual CLI pattern:
888+
// diagnose_failure(...).unwrap_or_else(|| generic_failure_diagnosis(name))
889+
// For a plain namespace timeout with no useful container logs, the
890+
// specific matcher returns None and we must fall back to the generic
891+
// diagnosis that suggests doctor commands.
892+
let err_str = "K8s namespace not ready\n\nCaused by:\n \
893+
timed out waiting for namespace 'openshell' to exist: \
894+
error: the server doesn't have a resource type \"namespace\"";
895+
let container_logs = Some("k3s is starting\nwaiting for kube-apiserver");
896+
897+
let diagnosis = diagnose_failure("my-gw", err_str, container_logs)
898+
.unwrap_or_else(|| generic_failure_diagnosis("my-gw"));
899+
900+
// Should have gotten the generic diagnosis (no specific pattern matched)
901+
assert_eq!(diagnosis.summary, "Gateway failed to start");
902+
// Must contain actionable recovery steps
903+
assert!(
904+
!diagnosis.recovery_steps.is_empty(),
905+
"generic diagnosis should have recovery steps"
906+
);
907+
// Must mention doctor commands
908+
let all_commands: String = diagnosis
909+
.recovery_steps
910+
.iter()
911+
.filter_map(|s| s.command.as_ref())
912+
.cloned()
913+
.collect::<Vec<_>>()
914+
.join("\n");
915+
assert!(
916+
all_commands.contains("doctor logs"),
917+
"should suggest 'doctor logs', got: {all_commands}"
918+
);
919+
assert!(
920+
all_commands.contains("doctor check"),
921+
"should suggest 'doctor check', got: {all_commands}"
922+
);
923+
assert!(
924+
all_commands.contains("my-gw"),
925+
"commands should include gateway name, got: {all_commands}"
926+
);
927+
}
732928
}

crates/openshell-bootstrap/src/lib.rs

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -662,6 +662,21 @@ pub async fn gateway_container_logs<W: std::io::Write>(
662662
Ok(())
663663
}
664664

665+
/// Fetch the last `n` lines of container logs for a local gateway as a
666+
/// `String`. This is a convenience wrapper for diagnostic call sites (e.g.
667+
/// failure diagnosis in the CLI) that do not hold a Docker client handle.
668+
///
669+
/// Returns an empty string on any Docker/connection error so callers don't
670+
/// need to worry about error handling.
671+
pub async fn fetch_gateway_logs(name: &str, n: usize) -> String {
672+
let docker = match Docker::connect_with_local_defaults() {
673+
Ok(d) => d,
674+
Err(_) => return String::new(),
675+
};
676+
let container = container_name(name);
677+
fetch_recent_logs(&docker, &container, n).await
678+
}
679+
665680
fn default_gateway_image_ref() -> String {
666681
if let Ok(image) = std::env::var("OPENSHELL_CLUSTER_IMAGE")
667682
&& !image.trim().is_empty()
@@ -1008,7 +1023,11 @@ async fn wait_for_namespace(
10081023
}
10091024

10101025
if attempt + 1 == attempts {
1011-
return Err(err).wrap_err("K8s namespace not ready");
1026+
let logs = fetch_recent_logs(docker, container_name, 40).await;
1027+
return Err(miette::miette!(
1028+
"exec failed on final attempt while waiting for namespace '{namespace}': {err}\n{logs}"
1029+
))
1030+
.wrap_err("K8s namespace not ready");
10121031
}
10131032
tokio::time::sleep(backoff).await;
10141033
backoff = std::cmp::min(backoff.saturating_mul(2), max_backoff);
@@ -1021,8 +1040,9 @@ async fn wait_for_namespace(
10211040
}
10221041

10231042
if attempt + 1 == attempts {
1043+
let logs = fetch_recent_logs(docker, container_name, 40).await;
10241044
return Err(miette::miette!(
1025-
"timed out waiting for namespace '{namespace}' to exist: {output}"
1045+
"timed out waiting for namespace '{namespace}' to exist: {output}\n{logs}"
10261046
))
10271047
.wrap_err("K8s namespace not ready");
10281048
}

crates/openshell-cli/src/bootstrap.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ pub async fn run_bootstrap(
139139
eprintln!();
140140
eprintln!(
141141
" Manage it later with: {} or {}",
142-
"openshell gateway status".bold(),
142+
"openshell status".bold(),
143143
"openshell gateway stop".bold(),
144144
);
145145
eprintln!();

crates/openshell-cli/src/run.rs

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,19 +1248,27 @@ pub(crate) async fn deploy_gateway_with_panel(
12481248
"x".red().bold(),
12491249
"Gateway failed:".red().bold(),
12501250
);
1251+
// Fetch container logs for pattern-based diagnosis
1252+
let container_logs = openshell_bootstrap::fetch_gateway_logs(name, 80).await;
1253+
let logs_opt = if container_logs.is_empty() {
1254+
None
1255+
} else {
1256+
Some(container_logs.as_str())
1257+
};
12511258
// Try to diagnose the failure and provide guidance
12521259
let err_str = format!("{err:?}");
1253-
if let Some(diagnosis) =
1254-
openshell_bootstrap::errors::diagnose_failure(name, &err_str, None)
1255-
{
1256-
print_failure_diagnosis(&diagnosis);
1257-
}
1260+
let diagnosis =
1261+
openshell_bootstrap::errors::diagnose_failure(name, &err_str, logs_opt)
1262+
.unwrap_or_else(|| {
1263+
openshell_bootstrap::errors::generic_failure_diagnosis(name)
1264+
});
1265+
print_failure_diagnosis(&diagnosis);
12581266
Err(err)
12591267
}
12601268
}
12611269
} else {
12621270
eprintln!("Deploying {location} gateway {name}...");
1263-
let handle = openshell_bootstrap::deploy_gateway_with_logs(options, |line| {
1271+
let result = openshell_bootstrap::deploy_gateway_with_logs(options, |line| {
12641272
if let Some(status) = line.strip_prefix("[status] ") {
12651273
eprintln!(" {status}");
12661274
} else if line.strip_prefix("[progress] ").is_some() {
@@ -1269,9 +1277,35 @@ pub(crate) async fn deploy_gateway_with_panel(
12691277
eprintln!(" {line}");
12701278
}
12711279
})
1272-
.await?;
1273-
eprintln!("Gateway {name} ready.");
1274-
Ok(handle)
1280+
.await;
1281+
match result {
1282+
Ok(handle) => {
1283+
eprintln!("Gateway {name} ready.");
1284+
Ok(handle)
1285+
}
1286+
Err(err) => {
1287+
eprintln!(
1288+
"{} {} {name}",
1289+
"x".red().bold(),
1290+
"Gateway failed:".red().bold(),
1291+
);
1292+
// Fetch container logs for pattern-based diagnosis
1293+
let container_logs = openshell_bootstrap::fetch_gateway_logs(name, 80).await;
1294+
let logs_opt = if container_logs.is_empty() {
1295+
None
1296+
} else {
1297+
Some(container_logs.as_str())
1298+
};
1299+
let err_str = format!("{err:?}");
1300+
let diagnosis =
1301+
openshell_bootstrap::errors::diagnose_failure(name, &err_str, logs_opt)
1302+
.unwrap_or_else(|| {
1303+
openshell_bootstrap::errors::generic_failure_diagnosis(name)
1304+
});
1305+
print_failure_diagnosis(&diagnosis);
1306+
Err(err)
1307+
}
1308+
}
12751309
}
12761310
}
12771311

0 commit comments

Comments
 (0)