Skip to content

Commit da5a68f

Browse files
committed
fix(bootstrap): robust stale node cleanup with retries and pod force-deletion
Reverts the hostname-preservation approach, which caused k3s node-password validation failures. Instead, makes clean_stale_nodes() reliable by:

1. Retrying with a 3-second backoff (up to ~45 s) until kubectl becomes available after a container restart, instead of firing once and silently giving up.
2. Force-deleting pods stuck in Terminating on removed stale nodes, so StatefulSets can immediately reschedule replacements.

This fixes gateway resume failures after a stop/start when the container image has changed (common in development): the new container gets a different k3s node identity, and pods on the old node never reschedule.
1 parent a6e1d22 commit da5a68f

File tree

3 files changed

+81
-54
lines changed

3 files changed

+81
-54
lines changed

crates/openshell-bootstrap/src/docker.rs

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -482,12 +482,6 @@ pub async fn ensure_container(
482482
) -> Result<()> {
483483
let container_name = container_name(name);
484484

485-
// When an existing container is recreated due to an image change, we
486-
// preserve its hostname so the new container registers with the same k3s
487-
// node identity. Without this, k3s sees a brand-new node while pods on
488-
// the old (now-dead) node remain stuck in Terminating.
489-
let mut preserved_hostname: Option<String> = None;
490-
491485
// Check if the container already exists
492486
match docker
493487
.inspect_container(&container_name, None::<InspectContainerOptions>)
@@ -527,20 +521,11 @@ pub async fn ensure_container(
527521
}
528522

529523
// Image changed — remove the stale container so we can recreate it.
530-
// Capture the hostname before removal so the replacement container
531-
// keeps the same k3s node identity.
532-
preserved_hostname = info
533-
.config
534-
.as_ref()
535-
.and_then(|c| c.hostname.clone())
536-
.filter(|h| !h.is_empty());
537-
538524
tracing::info!(
539-
"Container {} exists but uses a different image (container={}, desired={}), recreating (preserving hostname {:?})",
525+
"Container {} exists but uses a different image (container={}, desired={}), recreating",
540526
container_name,
541527
container_image_id.as_deref().map_or("unknown", truncate_id),
542528
desired_id.as_deref().map_or("unknown", truncate_id),
543-
preserved_hostname,
544529
);
545530

546531
let _ = docker.stop_container(&container_name, None).await;
@@ -747,14 +732,7 @@ pub async fn ensure_container(
747732

748733
let env = Some(env_vars);
749734

750-
// Use the preserved hostname from a previous container (image-change
751-
// recreation) so k3s keeps the same node identity. For fresh containers
752-
// fall back to the Docker container name, giving a stable hostname that
753-
// survives future image-change recreations.
754-
let hostname = preserved_hostname.unwrap_or_else(|| container_name.clone());
755-
756735
let config = ContainerCreateBody {
757-
hostname: Some(hostname),
758736
image: Some(image_ref.to_string()),
759737
cmd: Some(cmd),
760738
env,

crates/openshell-bootstrap/src/lib.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -446,13 +446,18 @@ where
446446
start_container(&target_docker, &name).await?;
447447

448448
// Clean up stale k3s nodes left over from previous container instances that
449-
// used the same persistent volume. Without this, pods remain scheduled on
449+
// used the same persistent volume. Without this, pods remain scheduled on
450450
// NotReady ghost nodes and the health check will time out.
451+
//
452+
// The function retries internally until kubectl becomes available (k3s may
453+
// still be initialising after the container start). It also force-deletes
454+
// pods stuck in Terminating on the removed nodes so that StatefulSets can
455+
// reschedule replacements immediately.
451456
match clean_stale_nodes(&target_docker, &name).await {
452457
Ok(0) => {}
453-
Ok(n) => tracing::debug!("removed {n} stale node(s)"),
458+
Ok(n) => tracing::info!("removed {n} stale node(s) and their orphaned pods"),
454459
Err(err) => {
455-
tracing::debug!("stale node cleanup failed (non-fatal): {err}");
460+
tracing::warn!("stale node cleanup failed (non-fatal): {err}");
456461
}
457462
}
458463

crates/openshell-bootstrap/src/runtime.rs

Lines changed: 72 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -362,72 +362,116 @@ pub async fn fetch_recent_logs(docker: &Docker, container_name: &str, n: usize)
362362
rendered
363363
}
364364

365-
/// Remove stale k3s nodes from a cluster with a reused persistent volume.
365+
/// Remove stale k3s nodes and their orphaned pods from a resumed cluster.
366366
///
367367
/// When a cluster container is recreated but the volume is reused, k3s registers
368368
/// a new node (using the container ID as the hostname) while old node entries
369369
/// persist in etcd. Pods scheduled on those stale `NotReady` nodes will never run,
370370
/// causing health checks to fail.
371371
///
372-
/// This function identifies all `NotReady` nodes and deletes them so k3s can
373-
/// reschedule workloads onto the current (Ready) node.
372+
/// This function retries with backoff until `kubectl` becomes available (k3s may
373+
/// still be initialising), then:
374+
/// 1. Deletes all `NotReady` nodes so k3s stops tracking them.
375+
/// 2. Force-deletes any pods stuck in `Terminating` so `StatefulSets` and
376+
/// Deployments can reschedule replacements on the current (Ready) node.
374377
///
375378
/// Returns the number of stale nodes removed.
376379
pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result<usize> {
380+
// Retry until kubectl is responsive. k3s can take 10-20 s to start the
381+
// API server after a container restart, so we allow up to ~45 s.
382+
const MAX_ATTEMPTS: u32 = 15;
383+
const RETRY_DELAY: Duration = Duration::from_secs(3);
384+
377385
let container_name = container_name(name);
386+
let mut stale_nodes: Vec<String> = Vec::new();
387+
388+
for attempt in 1..=MAX_ATTEMPTS {
389+
let (output, exit_code) = exec_capture_with_exit(
390+
docker,
391+
&container_name,
392+
vec![
393+
"sh".to_string(),
394+
"-c".to_string(),
395+
format!(
396+
"KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \
397+
--no-headers -o custom-columns=NAME:.metadata.name,STATUS:.status.conditions[-1].status \
398+
2>/dev/null | grep -v '\\bTrue$' | awk '{{print $1}}'"
399+
),
400+
],
401+
)
402+
.await?;
403+
404+
if exit_code == 0 {
405+
stale_nodes = output
406+
.lines()
407+
.map(str::trim)
408+
.filter(|l| !l.is_empty())
409+
.map(ToString::to_string)
410+
.collect();
411+
break;
412+
}
413+
414+
if attempt < MAX_ATTEMPTS {
415+
tracing::debug!(
416+
"kubectl not ready yet (attempt {attempt}/{MAX_ATTEMPTS}), retrying in {}s",
417+
RETRY_DELAY.as_secs()
418+
);
419+
tokio::time::sleep(RETRY_DELAY).await;
420+
}
421+
}
422+
423+
if stale_nodes.is_empty() {
424+
return Ok(0);
425+
}
426+
427+
let node_list = stale_nodes.join(" ");
428+
let count = stale_nodes.len();
429+
tracing::info!("removing {} stale node(s): {}", count, node_list);
378430

379-
// Get the list of NotReady nodes.
380-
// The last condition on a node is always type=Ready; we need to check its
381-
// **status** (True/False/Unknown), not its type. Nodes where the Ready
382-
// condition status is not "True" are stale and should be removed.
383-
let (output, exit_code) = exec_capture_with_exit(
431+
// Step 1: delete the stale node objects.
432+
let (_output, exit_code) = exec_capture_with_exit(
384433
docker,
385434
&container_name,
386435
vec![
387436
"sh".to_string(),
388437
"-c".to_string(),
389438
format!(
390-
"KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \
391-
--no-headers -o custom-columns=NAME:.metadata.name,STATUS:.status.conditions[-1].status \
392-
2>/dev/null | grep -v '\\bTrue$' | awk '{{print $1}}'"
439+
"KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found"
393440
),
394441
],
395442
)
396443
.await?;
397444

398445
if exit_code != 0 {
399-
// kubectl not ready yet or no nodes — nothing to clean
400-
return Ok(0);
401-
}
402-
403-
let stale_nodes: Vec<&str> = output
404-
.lines()
405-
.map(str::trim)
406-
.filter(|l| !l.is_empty())
407-
.collect();
408-
if stale_nodes.is_empty() {
409-
return Ok(0);
446+
tracing::warn!("failed to delete stale nodes (exit code {exit_code})");
410447
}
411448

412-
let node_list = stale_nodes.join(" ");
413-
let count = stale_nodes.len();
414-
tracing::info!("removing {} stale node(s): {}", count, node_list);
415-
449+
// Step 2: force-delete pods stuck in Terminating. After the stale node is
450+
// removed, pods that were scheduled on it transition to Terminating but
451+
// will never complete graceful shutdown (the node is gone). StatefulSets
452+
// will not create a replacement until the old pod is fully deleted.
416453
let (_output, exit_code) = exec_capture_with_exit(
417454
docker,
418455
&container_name,
419456
vec![
420457
"sh".to_string(),
421458
"-c".to_string(),
422459
format!(
423-
"KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found"
460+
"KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \
461+
--field-selector=status.phase=Running -o name 2>/dev/null; \
462+
for pod_line in $(KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \
463+
--no-headers 2>/dev/null | awk '$4 == \"Terminating\" {{print $1\"/\"$2}}'); do \
464+
ns=${{pod_line%%/*}}; pod=${{pod_line#*/}}; \
465+
KUBECONFIG={KUBECONFIG_PATH} kubectl delete pod \"$pod\" -n \"$ns\" \
466+
--force --grace-period=0 --ignore-not-found 2>/dev/null; \
467+
done"
424468
),
425469
],
426470
)
427471
.await?;
428472

429473
if exit_code != 0 {
430-
tracing::warn!("failed to delete stale nodes (exit code {exit_code})");
474+
tracing::debug!("force-delete of terminating pods returned exit code {exit_code} (non-fatal)");
431475
}
432476

433477
Ok(count)

0 commit comments

Comments (0)