@@ -447,16 +447,20 @@ pub fn generic_failure_diagnosis(gateway_name: &str) -> GatewayFailureDiagnosis
447447 summary : "Gateway failed to start" . to_string ( ) ,
448448 explanation : "The gateway encountered an unexpected error during startup." . to_string ( ) ,
449449 recovery_steps : vec ! [
450+ RecoveryStep :: with_command(
451+ "Check container logs for details" ,
452+ format!( "openshell doctor logs --name {gateway_name}" ) ,
453+ ) ,
454+ RecoveryStep :: with_command(
455+ "Run diagnostics" ,
456+ format!( "openshell doctor check --name {gateway_name}" ) ,
457+ ) ,
450458 RecoveryStep :: with_command(
451459 "Try destroying and recreating the gateway" ,
452460 format!(
453461 "openshell gateway destroy --name {gateway_name} && openshell gateway start"
454462 ) ,
455463 ) ,
456- RecoveryStep :: with_command(
457- "Check container logs for details" ,
458- format!( "docker logs openshell-cluster-{gateway_name}" ) ,
459- ) ,
460464 RecoveryStep :: new(
461465 "If the issue persists, report it at https://github.com/nvidia/openshell/issues" ,
462466 ) ,
@@ -729,4 +733,196 @@ mod tests {
729733 ) ;
730734 assert ! ( d. retryable) ;
731735 }
736+
737+ // -- generic_failure_diagnosis tests --
738+
#[test]
fn generic_diagnosis_suggests_doctor_logs() {
    // The generic diagnosis should carry at least one recovery command that
    // contains `openshell doctor logs`.
    let diagnosis = generic_failure_diagnosis("my-gw");

    // Gather only the steps that actually have a command attached.
    let mut commands: Vec<String> = Vec::new();
    for step in &diagnosis.recovery_steps {
        if let Some(cmd) = &step.command {
            commands.push(cmd.clone());
        }
    }

    let mentions_doctor_logs = commands
        .iter()
        .any(|cmd| cmd.contains("openshell doctor logs"));
    assert!(
        mentions_doctor_logs,
        "expected 'openshell doctor logs' in recovery commands, got: {commands:?}"
    );
}
752+
#[test]
fn generic_diagnosis_suggests_doctor_check() {
    // The generic diagnosis should carry at least one recovery command that
    // contains `openshell doctor check`.
    let diagnosis = generic_failure_diagnosis("my-gw");

    // Steps without a command contribute nothing to the list.
    let commands: Vec<String> = diagnosis
        .recovery_steps
        .iter()
        .flat_map(|step| step.command.iter().cloned())
        .collect();

    let mut found = false;
    for cmd in &commands {
        if cmd.contains("openshell doctor check") {
            found = true;
            break;
        }
    }
    assert!(
        found,
        "expected 'openshell doctor check' in recovery commands, got: {commands:?}"
    );
}
768+
#[test]
fn generic_diagnosis_includes_gateway_name() {
    // The gateway name passed in must be interpolated into the recovery
    // commands, and in particular into each `openshell doctor` command.
    let d = generic_failure_diagnosis("custom-name");
    let commands: Vec<&str> = d
        .recovery_steps
        .iter()
        .filter_map(|s| s.command.as_deref())
        .collect();

    // Original coarse check: the name appears somewhere in the commands.
    let all_text = commands.join(" ");
    assert!(
        all_text.contains("custom-name"),
        "expected gateway name in recovery commands, got: {all_text}"
    );

    // The coarse check alone is satisfied by the `gateway destroy` command,
    // so it would not notice a doctor command that dropped or hard-coded the
    // name. Check every doctor command individually.
    let doctor_commands: Vec<&str> = commands
        .iter()
        .copied()
        .filter(|c| c.contains("openshell doctor"))
        .collect();
    assert!(
        !doctor_commands.is_empty(),
        "expected at least one 'openshell doctor' command, got: {commands:?}"
    );
    for cmd in &doctor_commands {
        assert!(
            cmd.contains("--name custom-name"),
            "expected doctor command to target the gateway by name, got: {cmd}"
        );
    }
}
783+
784+ // -- fallback behavior tests --
785+
#[test]
fn namespace_timeout_without_logs_returns_none() {
    // A bare namespace timeout that carries only kubectl output, with no
    // container logs supplied, is expected to match no specific pattern.
    // The caller can then fall back to generic_failure_diagnosis.
    let error_message = "K8s namespace not ready\n \n Caused by:\n \
        timed out waiting for namespace 'openshell' to exist: \
        error: the server doesn't have a resource type \" namespace\" ";

    let matched_summary = diagnose_failure("test", error_message, None).map(|d| d.summary);

    assert!(
        matched_summary.is_none(),
        "plain namespace timeout should not match any specific pattern, got: {:?}",
        matched_summary
    );
}
804+
#[test]
fn namespace_timeout_with_pressure_logs_matches() {
    // The error text here is generic; the HEALTHCHECK_NODE_PRESSURE marker is
    // supplied only through the container-logs argument. A diagnosis whose
    // summary mentions "pressure" is expected.
    let error_message = "K8s namespace not ready\n \n Caused by:\n \
        timed out waiting for namespace 'openshell' to exist: <kubectl output>";
    let container_logs = Some("HEALTHCHECK_NODE_PRESSURE: DiskPressure");

    let d = diagnose_failure("test", error_message, container_logs)
        .expect("expected node pressure diagnosis");

    let summary = &d.summary;
    assert!(
        summary.contains("pressure"),
        "expected pressure in summary, got: {}",
        summary
    );
}
823+
#[test]
fn namespace_timeout_with_corrupted_state_logs_matches() {
    // The "is forbidden" RBAC message appears only in the container logs;
    // a diagnosis whose summary contains "Corrupted" is expected.
    let error_message = "K8s namespace not ready\n \n Caused by:\n \
        timed out waiting for namespace 'openshell' to exist: <output>";
    let container_logs = "configmaps \" extension-apiserver-authentication\" is forbidden: \
        User cannot get resource";

    let d = diagnose_failure("test", error_message, Some(container_logs))
        .expect("expected corrupted state diagnosis");

    let summary = &d.summary;
    assert!(
        summary.contains("Corrupted"),
        "expected Corrupted in summary, got: {}",
        summary
    );
}
844+
#[test]
fn namespace_timeout_with_no_route_logs_matches() {
    // A "no default route" line in the container logs is expected to yield a
    // diagnosis whose summary mentions "networking".
    let container_logs = Some("Error: no default route present before starting k3s");

    let d = diagnose_failure("test", "K8s namespace not ready", container_logs)
        .expect("expected networking diagnosis");

    let summary = &d.summary;
    assert!(
        summary.contains("networking"),
        "expected networking in summary, got: {}",
        summary
    );
}
860+
#[test]
fn diagnose_failure_with_logs_uses_combined_text() {
    // "connection refused" occurs only in the container logs, not in the
    // error message, so a match here shows that the logs take part in
    // pattern matching alongside the error message.
    let container_logs = Some("dial tcp 127.0.0.1:6443: connect: connection refused");

    let d = diagnose_failure("test", "K8s namespace not ready", container_logs)
        .expect("expected diagnosis from container logs pattern");

    // Either wording in the summary counts as a network diagnosis.
    let looks_like_network = ["Network", "connectivity"]
        .iter()
        .any(|needle| d.summary.contains(*needle));
    assert!(
        looks_like_network,
        "expected network diagnosis, got: {}",
        d.summary
    );
}
882+
883+ // -- end-to-end fallback pattern (mirrors CLI code) --
884+
#[test]
fn fallback_to_generic_produces_actionable_diagnosis() {
    // Mirrors the CLI pattern
    //   diagnose_failure(...).unwrap_or_else(|| generic_failure_diagnosis(name))
    // for a plain namespace timeout whose container logs are expected to
    // match no specific pattern. The result should be the generic diagnosis,
    // with recovery steps that suggest the doctor commands.
    let gateway = "my-gw";
    let err_str = "K8s namespace not ready\n \n Caused by:\n \
        timed out waiting for namespace 'openshell' to exist: \
        error: the server doesn't have a resource type \" namespace\" ";
    let container_logs = Some("k3s is starting\n waiting for kube-apiserver");

    let diagnosis = match diagnose_failure(gateway, err_str, container_logs) {
        Some(specific) => specific,
        None => generic_failure_diagnosis(gateway),
    };

    // The generic summary shows that no specific pattern matched.
    assert_eq!(diagnosis.summary, "Gateway failed to start");

    // There must be something for the user to act on.
    let has_steps = !diagnosis.recovery_steps.is_empty();
    assert!(has_steps, "generic diagnosis should have recovery steps");

    // Flatten every attached command into one blob for substring checks.
    let command_lines: Vec<&str> = diagnosis
        .recovery_steps
        .iter()
        .filter_map(|step| step.command.as_deref())
        .collect();
    let all_commands = command_lines.join("\n ");

    // Each entry pairs a required substring with the lead-in of its failure
    // message.
    let expectations = [
        ("doctor logs", "should suggest 'doctor logs'"),
        ("doctor check", "should suggest 'doctor check'"),
        ("my-gw", "commands should include gateway name"),
    ];
    for (needle, complaint) in expectations {
        assert!(
            all_commands.contains(needle),
            "{complaint}, got: {all_commands}"
        );
    }
}
732928}
0 commit comments