@@ -362,72 +362,160 @@ pub async fn fetch_recent_logs(docker: &Docker, container_name: &str, n: usize)
362362 rendered
363363}
364364
365- /// Remove stale k3s nodes from a cluster with a reused persistent volume .
365+ /// Remove stale k3s nodes and their orphaned pods from a resumed cluster .
366366///
367367/// When a cluster container is recreated but the volume is reused, k3s registers
368368/// a new node (using the container ID as the hostname) while old node entries
369369/// persist in etcd. Pods scheduled on those stale `NotReady` nodes will never run,
370370/// causing health checks to fail.
371371///
372- /// This function identifies all `NotReady` nodes and deletes them so k3s can
373- /// reschedule workloads onto the current (Ready) node.
372+ /// This function retries at a fixed interval until `kubectl` responds (k3s may
373+ /// still be initialising), then:
374+ /// 1. Deletes every node whose name differs from the container's hostname (regardless of its Ready condition) so k3s stops tracking them.
375+ /// 2. Force-deletes any pods stuck in `Terminating` so `StatefulSets` and
376+ /// Deployments can reschedule replacements on the current (Ready) node.
374377///
375378/// Returns the number of stale nodes removed.
376379pub async fn clean_stale_nodes ( docker : & Docker , name : & str ) -> Result < usize > {
380+ // Retry until kubectl is responsive. k3s can take 10-20 s to start the
381+ // API server after a container restart, so we allow up to ~45 s.
382+ const MAX_ATTEMPTS : u32 = 15 ;
383+ const RETRY_DELAY : Duration = Duration :: from_secs ( 3 ) ;
384+
377385 let container_name = container_name ( name) ;
386+ let mut stale_nodes: Vec < String > = Vec :: new ( ) ;
387+
388+ for attempt in 1 ..=MAX_ATTEMPTS {
389+ // List ALL node names and the container's own hostname. Any node that
390+ // is not the current container is stale — we cannot rely on the Ready
391+ // condition because k3s may not have marked the old node NotReady yet
392+ // when this runs shortly after container start.
393+ let ( output, exit_code) = exec_capture_with_exit (
394+ docker,
395+ & container_name,
396+ vec ! [
397+ "sh" . to_string( ) ,
398+ "-c" . to_string( ) ,
399+ format!(
400+ "KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \
401+ --no-headers -o custom-columns=NAME:.metadata.name \
402+ 2>/dev/null"
403+ ) ,
404+ ] ,
405+ )
406+ . await ?;
407+
408+ if exit_code == 0 {
409+ // Determine the current node name (container hostname).
410+ let ( hostname_out, _) =
411+ exec_capture_with_exit ( docker, & container_name, vec ! [ "hostname" . to_string( ) ] )
412+ . await ?;
413+ let current_hostname = hostname_out. trim ( ) . to_string ( ) ;
414+
415+ stale_nodes = output
416+ . lines ( )
417+ . map ( str:: trim)
418+ . filter ( |l| !l. is_empty ( ) && * l != current_hostname)
419+ . map ( ToString :: to_string)
420+ . collect ( ) ;
421+ break ;
422+ }
423+
424+ if attempt < MAX_ATTEMPTS {
425+ tracing:: debug!(
426+ "kubectl not ready yet (attempt {attempt}/{MAX_ATTEMPTS}), retrying in {}s" ,
427+ RETRY_DELAY . as_secs( )
428+ ) ;
429+ tokio:: time:: sleep ( RETRY_DELAY ) . await ;
430+ }
431+ }
432+
433+ if stale_nodes. is_empty ( ) {
434+ return Ok ( 0 ) ;
435+ }
436+
437+ let node_list = stale_nodes. join ( " " ) ;
438+ let count = stale_nodes. len ( ) ;
439+ tracing:: info!( "removing {} stale node(s): {}" , count, node_list) ;
378440
379- // Get the list of NotReady nodes.
380- // The last condition on a node is always type=Ready; we need to check its
381- // **status** (True/False/Unknown), not its type. Nodes where the Ready
382- // condition status is not "True" are stale and should be removed.
383- let ( output, exit_code) = exec_capture_with_exit (
441+ // Step 1: delete the stale node objects.
442+ let ( _output, exit_code) = exec_capture_with_exit (
384443 docker,
385444 & container_name,
386445 vec ! [
387446 "sh" . to_string( ) ,
388447 "-c" . to_string( ) ,
389448 format!(
390- "KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \
391- --no-headers -o custom-columns=NAME:.metadata.name,STATUS:.status.conditions[-1].status \
392- 2>/dev/null | grep -v '\\ bTrue$' | awk '{{print $1}}'"
449+ "KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found"
393450 ) ,
394451 ] ,
395452 )
396453 . await ?;
397454
398455 if exit_code != 0 {
399- // kubectl not ready yet or no nodes — nothing to clean
400- return Ok ( 0 ) ;
456+ tracing:: warn!( "failed to delete stale nodes (exit code {exit_code})" ) ;
401457 }
402458
403- let stale_nodes: Vec < & str > = output
404- . lines ( )
405- . map ( str:: trim)
406- . filter ( |l| !l. is_empty ( ) )
407- . collect ( ) ;
408- if stale_nodes. is_empty ( ) {
409- return Ok ( 0 ) ;
410- }
459+ // Step 2: force-delete pods stuck in Terminating. After the stale node is
460+ // removed, pods that were scheduled on it transition to Terminating but
461+ // will never complete graceful shutdown (the node is gone). StatefulSets
462+ // will not create a replacement until the old pod is fully deleted.
463+ let ( _output, exit_code) = exec_capture_with_exit (
464+ docker,
465+ & container_name,
466+ vec ! [
467+ "sh" . to_string( ) ,
468+ "-c" . to_string( ) ,
469+ format!(
470+ "KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \
471+ --field-selector=status.phase=Running -o name 2>/dev/null; \
472+ for pod_line in $(KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \
473+ --no-headers 2>/dev/null | awk '$4 == \" Terminating\" {{print $1\" /\" $2}}'); do \
474+ ns=${{pod_line%%/*}}; pod=${{pod_line#*/}}; \
475+ KUBECONFIG={KUBECONFIG_PATH} kubectl delete pod \" $pod\" -n \" $ns\" \
476+ --force --grace-period=0 --ignore-not-found 2>/dev/null; \
477+ done"
478+ ) ,
479+ ] ,
480+ )
481+ . await ?;
411482
412- let node_list = stale_nodes. join ( " " ) ;
413- let count = stale_nodes. len ( ) ;
414- tracing:: info!( "removing {} stale node(s): {}" , count, node_list) ;
483+ if exit_code != 0 {
484+ tracing:: debug!(
485+ "force-delete of terminating pods returned exit code {exit_code} (non-fatal)"
486+ ) ;
487+ }
415488
489+ // Step 3: delete PersistentVolumeClaims in the openshell namespace whose
490+ // backing PV has node affinity for a stale node. local-path-provisioner
491+ // creates PVs tied to the original node; when the node changes, the PV is
492+ // unschedulable and the `StatefulSet` pod stays Pending. Deleting the PVC
493+ // (and its PV) lets the provisioner create a fresh one on the current node.
416494 let ( _output, exit_code) = exec_capture_with_exit (
417495 docker,
418496 & container_name,
419497 vec ! [
420498 "sh" . to_string( ) ,
421499 "-c" . to_string( ) ,
422500 format!(
423- "KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found"
501+ r#"KUBECONFIG={KUBECONFIG_PATH}; export KUBECONFIG; \
502+ CURRENT_NODE=$(kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name 2>/dev/null | head -1); \
503+ [ -z "$CURRENT_NODE" ] && exit 0; \
504+ for pv in $(kubectl get pv -o jsonpath='{{.items[*].metadata.name}}' 2>/dev/null); do \
505+ NODE=$(kubectl get pv "$pv" -o jsonpath='{{.spec.nodeAffinity.required.nodeSelectorTerms[0].matchExpressions[0].values[0]}}' 2>/dev/null); \
506+ [ "$NODE" = "$CURRENT_NODE" ] && continue; \
507+ NS=$(kubectl get pv "$pv" -o jsonpath='{{.spec.claimRef.namespace}}' 2>/dev/null); \
508+ PVC=$(kubectl get pv "$pv" -o jsonpath='{{.spec.claimRef.name}}' 2>/dev/null); \
509+ [ -n "$PVC" ] && kubectl delete pvc "$PVC" -n "$NS" --ignore-not-found 2>/dev/null; \
510+ kubectl delete pv "$pv" --ignore-not-found 2>/dev/null; \
511+ done"#
424512 ) ,
425513 ] ,
426514 )
427515 . await ?;
428516
429517 if exit_code != 0 {
430- tracing:: warn !( "failed to delete stale nodes ( exit code {exit_code})" ) ;
518+ tracing:: debug !( "PV/PVC cleanup returned exit code {exit_code} (non-fatal )" ) ;
431519 }
432520
433521 Ok ( count)
0 commit comments