Skip to content

Commit da5a68f

Browse files
committed
fix(bootstrap): robust stale node cleanup with retries and pod force-deletion
Reverts the hostname-preservation approach, which caused k3s node-password validation failures. Instead, makes clean_stale_nodes() reliable by:

1. Retrying with a 3-second backoff (up to ~45 s) until kubectl becomes available after a container restart, instead of firing once and silently giving up.
2. Force-deleting pods stuck in Terminating on removed stale nodes, so StatefulSets can immediately reschedule replacements.

This fixes gateway resume failures after a stop/start when the container image has changed (common in development): the new container gets a different k3s node identity, and pods on the old node never reschedule.
1 parent a6e1d22 commit da5a68f

File tree

3 files changed

+81
-54
lines changed

3 files changed

+81
-54
lines changed

crates/openshell-bootstrap/src/docker.rs

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -482,12 +482,6 @@ pub async fn ensure_container(
482482
) -> Result<()> {
483483
let container_name = container_name(name);
484484

485-
// When an existing container is recreated due to an image change, we
486-
// preserve its hostname so the new container registers with the same k3s
487-
// node identity. Without this, k3s sees a brand-new node while pods on
488-
// the old (now-dead) node remain stuck in Terminating.
489-
let mut preserved_hostname: Option<String> = None;
490-
491485
// Check if the container already exists
492486
match docker
493487
.inspect_container(&container_name, None::<InspectContainerOptions>)
@@ -527,20 +521,11 @@ pub async fn ensure_container(
527521
}
528522

529523
// Image changed — remove the stale container so we can recreate it.
530-
// Capture the hostname before removal so the replacement container
531-
// keeps the same k3s node identity.
532-
preserved_hostname = info
533-
.config
534-
.as_ref()
535-
.and_then(|c| c.hostname.clone())
536-
.filter(|h| !h.is_empty());
537-
538524
tracing::info!(
539-
"Container {} exists but uses a different image (container={}, desired={}), recreating (preserving hostname {:?})",
525+
"Container {} exists but uses a different image (container={}, desired={}), recreating",
540526
container_name,
541527
container_image_id.as_deref().map_or("unknown", truncate_id),
542528
desired_id.as_deref().map_or("unknown", truncate_id),
543-
preserved_hostname,
544529
);
545530

546531
let _ = docker.stop_container(&container_name, None).await;
@@ -747,14 +732,7 @@ pub async fn ensure_container(
747732

748733
let env = Some(env_vars);
749734

750-
// Use the preserved hostname from a previous container (image-change
751-
// recreation) so k3s keeps the same node identity. For fresh containers
752-
// fall back to the Docker container name, giving a stable hostname that
753-
// survives future image-change recreations.
754-
let hostname = preserved_hostname.unwrap_or_else(|| container_name.clone());
755-
756735
let config = ContainerCreateBody {
757-
hostname: Some(hostname),
758736
image: Some(image_ref.to_string()),
759737
cmd: Some(cmd),
760738
env,

crates/openshell-bootstrap/src/lib.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -446,13 +446,18 @@ where
446446
start_container(&target_docker, &name).await?;
447447

448448
// Clean up stale k3s nodes left over from previous container instances that
449-
// used the same persistent volume. Without this, pods remain scheduled on
449+
// used the same persistent volume. Without this, pods remain scheduled on
450450
// NotReady ghost nodes and the health check will time out.
451+
//
452+
// The function retries internally until kubectl becomes available (k3s may
453+
// still be initialising after the container start). It also force-deletes
454+
// pods stuck in Terminating on the removed nodes so that StatefulSets can
455+
// reschedule replacements immediately.
451456
match clean_stale_nodes(&target_docker, &name).await {
452457
Ok(0) => {}
453-
Ok(n) => tracing::debug!("removed {n} stale node(s)"),
458+
Ok(n) => tracing::info!("removed {n} stale node(s) and their orphaned pods"),
454459
Err(err) => {
455-
tracing::debug!("stale node cleanup failed (non-fatal): {err}");
460+
tracing::warn!("stale node cleanup failed (non-fatal): {err}");
456461
}
457462
}
458463

crates/openshell-bootstrap/src/runtime.rs

Lines changed: 72 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -362,72 +362,116 @@ pub async fn fetch_recent_logs(docker: &Docker, container_name: &str, n: usize)
362362
rendered
363363
}
364364

365-
/// Remove stale k3s nodes from a cluster with a reused persistent volume.
365+
/// Remove stale k3s nodes and their orphaned pods from a resumed cluster.
366366
///
367367
/// When a cluster container is recreated but the volume is reused, k3s registers
368368
/// a new node (using the container ID as the hostname) while old node entries
369369
/// persist in etcd. Pods scheduled on those stale `NotReady` nodes will never run,
370370
/// causing health checks to fail.
371371
///
372-
/// This function identifies all `NotReady` nodes and deletes them so k3s can
373-
/// reschedule workloads onto the current (Ready) node.
372+
/// This function retries with backoff until `kubectl` becomes available (k3s may
373+
/// still be initialising), then:
374+
/// 1. Deletes all `NotReady` nodes so k3s stops tracking them.
375+
/// 2. Force-deletes any pods stuck in `Terminating` so `StatefulSets` and
376+
/// Deployments can reschedule replacements on the current (Ready) node.
374377
///
375378
/// Returns the number of stale nodes removed.
376379
pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result<usize> {
380+
// Retry until kubectl is responsive. k3s can take 10-20 s to start the
381+
// API server after a container restart, so we allow up to ~45 s.
382+
const MAX_ATTEMPTS: u32 = 15;
383+
const RETRY_DELAY: Duration = Duration::from_secs(3);
384+
377385
let container_name = container_name(name);
386+
let mut stale_nodes: Vec<String> = Vec::new();
387+
388+
for attempt in 1..=MAX_ATTEMPTS {
389+
let (output, exit_code) = exec_capture_with_exit(
390+
docker,
391+
&container_name,
392+
vec![
393+
"sh".to_string(),
394+
"-c".to_string(),
395+
format!(
396+
"KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \
397+
--no-headers -o custom-columns=NAME:.metadata.name,STATUS:.status.conditions[-1].status \
398+
2>/dev/null | grep -v '\\bTrue$' | awk '{{print $1}}'"
399+
),
400+
],
401+
)
402+
.await?;
403+
404+
if exit_code == 0 {
405+
stale_nodes = output
406+
.lines()
407+
.map(str::trim)
408+
.filter(|l| !l.is_empty())
409+
.map(ToString::to_string)
410+
.collect();
411+
break;
412+
}
413+
414+
if attempt < MAX_ATTEMPTS {
415+
tracing::debug!(
416+
"kubectl not ready yet (attempt {attempt}/{MAX_ATTEMPTS}), retrying in {}s",
417+
RETRY_DELAY.as_secs()
418+
);
419+
tokio::time::sleep(RETRY_DELAY).await;
420+
}
421+
}
422+
423+
if stale_nodes.is_empty() {
424+
return Ok(0);
425+
}
426+
427+
let node_list = stale_nodes.join(" ");
428+
let count = stale_nodes.len();
429+
tracing::info!("removing {} stale node(s): {}", count, node_list);
378430

379-
// Get the list of NotReady nodes.
380-
// The last condition on a node is always type=Ready; we need to check its
381-
// **status** (True/False/Unknown), not its type. Nodes where the Ready
382-
// condition status is not "True" are stale and should be removed.
383-
let (output, exit_code) = exec_capture_with_exit(
431+
// Step 1: delete the stale node objects.
432+
let (_output, exit_code) = exec_capture_with_exit(
384433
docker,
385434
&container_name,
386435
vec![
387436
"sh".to_string(),
388437
"-c".to_string(),
389438
format!(
390-
"KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \
391-
--no-headers -o custom-columns=NAME:.metadata.name,STATUS:.status.conditions[-1].status \
392-
2>/dev/null | grep -v '\\bTrue$' | awk '{{print $1}}'"
439+
"KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found"
393440
),
394441
],
395442
)
396443
.await?;
397444

398445
if exit_code != 0 {
399-
// kubectl not ready yet or no nodes — nothing to clean
400-
return Ok(0);
401-
}
402-
403-
let stale_nodes: Vec<&str> = output
404-
.lines()
405-
.map(str::trim)
406-
.filter(|l| !l.is_empty())
407-
.collect();
408-
if stale_nodes.is_empty() {
409-
return Ok(0);
446+
tracing::warn!("failed to delete stale nodes (exit code {exit_code})");
410447
}
411448

412-
let node_list = stale_nodes.join(" ");
413-
let count = stale_nodes.len();
414-
tracing::info!("removing {} stale node(s): {}", count, node_list);
415-
449+
// Step 2: force-delete pods stuck in Terminating. After the stale node is
450+
// removed, pods that were scheduled on it transition to Terminating but
451+
// will never complete graceful shutdown (the node is gone). StatefulSets
452+
// will not create a replacement until the old pod is fully deleted.
416453
let (_output, exit_code) = exec_capture_with_exit(
417454
docker,
418455
&container_name,
419456
vec![
420457
"sh".to_string(),
421458
"-c".to_string(),
422459
format!(
423-
"KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found"
460+
"KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \
461+
--field-selector=status.phase=Running -o name 2>/dev/null; \
462+
for pod_line in $(KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \
463+
--no-headers 2>/dev/null | awk '$4 == \"Terminating\" {{print $1\"/\"$2}}'); do \
464+
ns=${{pod_line%%/*}}; pod=${{pod_line#*/}}; \
465+
KUBECONFIG={KUBECONFIG_PATH} kubectl delete pod \"$pod\" -n \"$ns\" \
466+
--force --grace-period=0 --ignore-not-found 2>/dev/null; \
467+
done"
424468
),
425469
],
426470
)
427471
.await?;
428472

429473
if exit_code != 0 {
430-
tracing::warn!("failed to delete stale nodes (exit code {exit_code})");
474+
tracing::debug!("force-delete of terminating pods returned exit code {exit_code} (non-fatal)");
431475
}
432476

433477
Ok(count)

0 commit comments

Comments (0)