Skip to content

Commit 1b881d8

Browse files
committed
fix(bootstrap): robust stale node cleanup with retries and pod force-deletion
Reverts the hostname preservation approach, which caused k3s node password validation failures. Instead, makes clean_stale_nodes() reliable by: 1. Retrying with 3s backoff (up to ~45s) until kubectl becomes available after a container restart, instead of firing once and silently giving up. 2. Force-deleting pods stuck in Terminating on removed stale nodes so StatefulSets can immediately reschedule replacements. 3. Deleting PVCs and local-path PVs whose node affinity points at a stale node, so the provisioner can create fresh volumes on the current node. This fixes gateway resume failures after stop/start when the container image has changed (common in development), where the new container gets a different k3s node identity and pods on the old node never reschedule.
1 parent a6e1d22 commit 1b881d8

File tree

10 files changed

+414
-323
lines changed

10 files changed

+414
-323
lines changed

.github/workflows/e2e-test.yml

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,25 @@ permissions:
1919

2020
jobs:
2121
e2e:
22-
name: E2E
22+
name: "E2E (${{ matrix.suite }})"
2323
runs-on: ${{ inputs.runner }}
2424
timeout-minutes: 30
25+
strategy:
26+
fail-fast: false
27+
matrix:
28+
include:
29+
- suite: python
30+
cluster: e2e-python
31+
port: "8080"
32+
cmd: "mise run --no-prepare --skip-deps e2e:python"
33+
- suite: rust
34+
cluster: e2e-rust
35+
port: "8081"
36+
cmd: "mise run --no-prepare --skip-deps e2e:rust"
37+
- suite: gateway-resume
38+
cluster: e2e-resume
39+
port: "8082"
40+
cmd: "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test gateway_resume"
2541
container:
2642
image: ghcr.io/nvidia/openshell/ci:latest
2743
credentials:
@@ -38,6 +54,7 @@ jobs:
3854
OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell
3955
OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }}
4056
OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
57+
OPENSHELL_GATEWAY: ${{ matrix.cluster }}
4158
steps:
4259
- uses: actions/checkout@v4
4360

@@ -48,21 +65,26 @@ jobs:
4865
run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
4966

5067
- name: Install Python dependencies and generate protobuf stubs
68+
if: matrix.suite == 'python'
5169
run: uv sync --frozen && mise run --no-prepare python:proto
5270

53-
- name: Bootstrap and deploy cluster
71+
- name: Build Rust CLI
72+
if: matrix.suite != 'python'
73+
run: cargo build -p openshell-cli --features openshell-core/dev-settings
74+
75+
- name: Install SSH client
76+
if: matrix.suite != 'python'
77+
run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/*
78+
79+
- name: Bootstrap cluster
5480
env:
5581
GATEWAY_HOST: host.docker.internal
56-
GATEWAY_PORT: "8080"
82+
GATEWAY_PORT: ${{ matrix.port }}
83+
CLUSTER_NAME: ${{ matrix.cluster }}
5784
SKIP_IMAGE_PUSH: "1"
5885
SKIP_CLUSTER_IMAGE_BUILD: "1"
5986
OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
6087
run: mise run --no-prepare --skip-deps cluster
6188

62-
- name: Install SSH client for Rust CLI e2e tests
63-
run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/*
64-
65-
- name: Run E2E tests
66-
run: |
67-
mise run --no-prepare --skip-deps e2e:python
68-
mise run --no-prepare --skip-deps e2e:rust
89+
- name: Run tests
90+
run: ${{ matrix.cmd }}

crates/openshell-bootstrap/src/docker.rs

Lines changed: 23 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,9 @@ pub async fn ensure_image(
467467
Ok(())
468468
}
469469

470+
/// Returns the actual host port the container is using. When an existing
471+
/// container is reused (same image), this may differ from `gateway_port`
472+
/// because the container was originally created with a different port.
470473
pub async fn ensure_container(
471474
docker: &Docker,
472475
name: &str,
@@ -479,15 +482,9 @@ pub async fn ensure_container(
479482
registry_username: Option<&str>,
480483
registry_token: Option<&str>,
481484
device_ids: &[String],
482-
) -> Result<()> {
485+
) -> Result<u16> {
483486
let container_name = container_name(name);
484487

485-
// When an existing container is recreated due to an image change, we
486-
// preserve its hostname so the new container registers with the same k3s
487-
// node identity. Without this, k3s sees a brand-new node while pods on
488-
// the old (now-dead) node remain stuck in Terminating.
489-
let mut preserved_hostname: Option<String> = None;
490-
491488
// Check if the container already exists
492489
match docker
493490
.inspect_container(&container_name, None::<InspectContainerOptions>)
@@ -523,24 +520,31 @@ pub async fn ensure_container(
523520
// the current (just-created) network before returning.
524521
let expected_net = network_name(name);
525522
reconcile_container_network(docker, &container_name, &expected_net).await?;
526-
return Ok(());
523+
524+
// Read the actual host port from the container's port bindings
525+
// as a cross-check. The caller should already pass the correct
526+
// port (from stored metadata), but this catches mismatches if
527+
// the container was recreated with a different port externally.
528+
let actual_port = info
529+
.host_config
530+
.as_ref()
531+
.and_then(|hc| hc.port_bindings.as_ref())
532+
.and_then(|pb| pb.get("30051/tcp"))
533+
.and_then(|bindings| bindings.as_ref())
534+
.and_then(|bindings| bindings.first())
535+
.and_then(|b| b.host_port.as_ref())
536+
.and_then(|p| p.parse::<u16>().ok())
537+
.unwrap_or(gateway_port);
538+
539+
return Ok(actual_port);
527540
}
528541

529542
// Image changed — remove the stale container so we can recreate it.
530-
// Capture the hostname before removal so the replacement container
531-
// keeps the same k3s node identity.
532-
preserved_hostname = info
533-
.config
534-
.as_ref()
535-
.and_then(|c| c.hostname.clone())
536-
.filter(|h| !h.is_empty());
537-
538543
tracing::info!(
539-
"Container {} exists but uses a different image (container={}, desired={}), recreating (preserving hostname {:?})",
544+
"Container {} exists but uses a different image (container={}, desired={}), recreating",
540545
container_name,
541546
container_image_id.as_deref().map_or("unknown", truncate_id),
542547
desired_id.as_deref().map_or("unknown", truncate_id),
543-
preserved_hostname,
544548
);
545549

546550
let _ = docker.stop_container(&container_name, None).await;
@@ -747,14 +751,7 @@ pub async fn ensure_container(
747751

748752
let env = Some(env_vars);
749753

750-
// Use the preserved hostname from a previous container (image-change
751-
// recreation) so k3s keeps the same node identity. For fresh containers
752-
// fall back to the Docker container name, giving a stable hostname that
753-
// survives future image-change recreations.
754-
let hostname = preserved_hostname.unwrap_or_else(|| container_name.clone());
755-
756754
let config = ContainerCreateBody {
757-
hostname: Some(hostname),
758755
image: Some(image_ref.to_string()),
759756
cmd: Some(cmd),
760757
env,
@@ -774,7 +771,7 @@ pub async fn ensure_container(
774771
.await
775772
.into_diagnostic()
776773
.wrap_err("failed to create gateway container")?;
777-
Ok(())
774+
Ok(gateway_port)
778775
}
779776

780777
/// Information about a container that is holding a port we need.

crates/openshell-bootstrap/src/lib.rs

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,10 @@ where
429429
// See: https://github.com/NVIDIA/OpenShell/issues/463
430430
let deploy_result: Result<GatewayMetadata> = async {
431431
let device_ids = resolve_gpu_device_ids(&gpu, cdi_supported);
432-
ensure_container(
432+
// ensure_container returns the actual host port — which may differ from
433+
// the requested `port` when reusing an existing container that was
434+
// originally created with a different port.
435+
let actual_port = ensure_container(
433436
&target_docker,
434437
&name,
435438
&image_ref,
@@ -443,16 +446,22 @@ where
443446
&device_ids,
444447
)
445448
.await?;
449+
let port = actual_port;
446450
start_container(&target_docker, &name).await?;
447451

448452
// Clean up stale k3s nodes left over from previous container instances that
449-
// used the same persistent volume. Without this, pods remain scheduled on
453+
// used the same persistent volume. Without this, pods remain scheduled on
450454
// NotReady ghost nodes and the health check will time out.
455+
//
456+
// The function retries internally until kubectl becomes available (k3s may
457+
// still be initialising after the container start). It also force-deletes
458+
// pods stuck in Terminating on the removed nodes so that StatefulSets can
459+
// reschedule replacements immediately.
451460
match clean_stale_nodes(&target_docker, &name).await {
452461
Ok(0) => {}
453-
Ok(n) => tracing::debug!("removed {n} stale node(s)"),
462+
Ok(n) => tracing::info!("removed {n} stale node(s) and their orphaned pods"),
454463
Err(err) => {
455-
tracing::debug!("stale node cleanup failed (non-fatal): {err}");
464+
tracing::warn!("stale node cleanup failed (non-fatal): {err}");
456465
}
457466
}
458467

crates/openshell-bootstrap/src/runtime.rs

Lines changed: 114 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -362,72 +362,160 @@ pub async fn fetch_recent_logs(docker: &Docker, container_name: &str, n: usize)
362362
rendered
363363
}
364364

365-
/// Remove stale k3s nodes from a cluster with a reused persistent volume.
365+
/// Remove stale k3s nodes and their orphaned pods from a resumed cluster.
366366
///
367367
/// When a cluster container is recreated but the volume is reused, k3s registers
368368
/// a new node (using the container ID as the hostname) while old node entries
369369
/// persist in etcd. Pods scheduled on those stale `NotReady` nodes will never run,
370370
/// causing health checks to fail.
371371
///
372-
/// This function identifies all `NotReady` nodes and deletes them so k3s can
373-
/// reschedule workloads onto the current (Ready) node.
372+
/// This function retries with backoff until `kubectl` becomes available (k3s may
373+
/// still be initialising), then:
374+
/// 1. Deletes all stale nodes (every node other than the current container's) so k3s stops tracking them.
375+
/// 2. Force-deletes any pods stuck in `Terminating` so `StatefulSets` and
376+
/// Deployments can reschedule replacements on the current (Ready) node.
374377
///
375378
/// Returns the number of stale nodes removed.
376379
pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result<usize> {
380+
// Retry until kubectl is responsive. k3s can take 10-20 s to start the
381+
// API server after a container restart, so we allow up to ~45 s.
382+
const MAX_ATTEMPTS: u32 = 15;
383+
const RETRY_DELAY: Duration = Duration::from_secs(3);
384+
377385
let container_name = container_name(name);
386+
let mut stale_nodes: Vec<String> = Vec::new();
387+
388+
for attempt in 1..=MAX_ATTEMPTS {
389+
// List ALL node names and the container's own hostname. Any node that
390+
// is not the current container is stale — we cannot rely on the Ready
391+
// condition because k3s may not have marked the old node NotReady yet
392+
// when this runs shortly after container start.
393+
let (output, exit_code) = exec_capture_with_exit(
394+
docker,
395+
&container_name,
396+
vec![
397+
"sh".to_string(),
398+
"-c".to_string(),
399+
format!(
400+
"KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \
401+
--no-headers -o custom-columns=NAME:.metadata.name \
402+
2>/dev/null"
403+
),
404+
],
405+
)
406+
.await?;
407+
408+
if exit_code == 0 {
409+
// Determine the current node name (container hostname).
410+
let (hostname_out, _) =
411+
exec_capture_with_exit(docker, &container_name, vec!["hostname".to_string()])
412+
.await?;
413+
let current_hostname = hostname_out.trim().to_string();
414+
415+
stale_nodes = output
416+
.lines()
417+
.map(str::trim)
418+
.filter(|l| !l.is_empty() && *l != current_hostname)
419+
.map(ToString::to_string)
420+
.collect();
421+
break;
422+
}
423+
424+
if attempt < MAX_ATTEMPTS {
425+
tracing::debug!(
426+
"kubectl not ready yet (attempt {attempt}/{MAX_ATTEMPTS}), retrying in {}s",
427+
RETRY_DELAY.as_secs()
428+
);
429+
tokio::time::sleep(RETRY_DELAY).await;
430+
}
431+
}
432+
433+
if stale_nodes.is_empty() {
434+
return Ok(0);
435+
}
436+
437+
let node_list = stale_nodes.join(" ");
438+
let count = stale_nodes.len();
439+
tracing::info!("removing {} stale node(s): {}", count, node_list);
378440

379-
// Get the list of NotReady nodes.
380-
// The last condition on a node is always type=Ready; we need to check its
381-
// **status** (True/False/Unknown), not its type. Nodes where the Ready
382-
// condition status is not "True" are stale and should be removed.
383-
let (output, exit_code) = exec_capture_with_exit(
441+
// Step 1: delete the stale node objects.
442+
let (_output, exit_code) = exec_capture_with_exit(
384443
docker,
385444
&container_name,
386445
vec![
387446
"sh".to_string(),
388447
"-c".to_string(),
389448
format!(
390-
"KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \
391-
--no-headers -o custom-columns=NAME:.metadata.name,STATUS:.status.conditions[-1].status \
392-
2>/dev/null | grep -v '\\bTrue$' | awk '{{print $1}}'"
449+
"KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found"
393450
),
394451
],
395452
)
396453
.await?;
397454

398455
if exit_code != 0 {
399-
// kubectl not ready yet or no nodes — nothing to clean
400-
return Ok(0);
456+
tracing::warn!("failed to delete stale nodes (exit code {exit_code})");
401457
}
402458

403-
let stale_nodes: Vec<&str> = output
404-
.lines()
405-
.map(str::trim)
406-
.filter(|l| !l.is_empty())
407-
.collect();
408-
if stale_nodes.is_empty() {
409-
return Ok(0);
410-
}
459+
// Step 2: force-delete pods stuck in Terminating. After the stale node is
460+
// removed, pods that were scheduled on it transition to Terminating but
461+
// will never complete graceful shutdown (the node is gone). StatefulSets
462+
// will not create a replacement until the old pod is fully deleted.
463+
let (_output, exit_code) = exec_capture_with_exit(
464+
docker,
465+
&container_name,
466+
vec![
467+
"sh".to_string(),
468+
"-c".to_string(),
469+
format!(
470+
"KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \
471+
--field-selector=status.phase=Running -o name 2>/dev/null; \
472+
for pod_line in $(KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \
473+
--no-headers 2>/dev/null | awk '$4 == \"Terminating\" {{print $1\"/\"$2}}'); do \
474+
ns=${{pod_line%%/*}}; pod=${{pod_line#*/}}; \
475+
KUBECONFIG={KUBECONFIG_PATH} kubectl delete pod \"$pod\" -n \"$ns\" \
476+
--force --grace-period=0 --ignore-not-found 2>/dev/null; \
477+
done"
478+
),
479+
],
480+
)
481+
.await?;
411482

412-
let node_list = stale_nodes.join(" ");
413-
let count = stale_nodes.len();
414-
tracing::info!("removing {} stale node(s): {}", count, node_list);
483+
if exit_code != 0 {
484+
tracing::debug!(
485+
"force-delete of terminating pods returned exit code {exit_code} (non-fatal)"
486+
);
487+
}
415488

489+
// Step 3: delete PersistentVolumeClaims in the openshell namespace whose
490+
// backing PV has node affinity for a stale node. local-path-provisioner
491+
// creates PVs tied to the original node; when the node changes, the PV is
492+
// unschedulable and the `StatefulSet` pod stays Pending. Deleting the PVC
493+
// (and its PV) lets the provisioner create a fresh one on the current node.
416494
let (_output, exit_code) = exec_capture_with_exit(
417495
docker,
418496
&container_name,
419497
vec![
420498
"sh".to_string(),
421499
"-c".to_string(),
422500
format!(
423-
"KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found"
501+
r#"KUBECONFIG={KUBECONFIG_PATH}; export KUBECONFIG; \
502+
CURRENT_NODE=$(kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name 2>/dev/null | head -1); \
503+
[ -z "$CURRENT_NODE" ] && exit 0; \
504+
for pv in $(kubectl get pv -o jsonpath='{{.items[*].metadata.name}}' 2>/dev/null); do \
505+
NODE=$(kubectl get pv "$pv" -o jsonpath='{{.spec.nodeAffinity.required.nodeSelectorTerms[0].matchExpressions[0].values[0]}}' 2>/dev/null); \
506+
[ "$NODE" = "$CURRENT_NODE" ] && continue; \
507+
NS=$(kubectl get pv "$pv" -o jsonpath='{{.spec.claimRef.namespace}}' 2>/dev/null); \
508+
PVC=$(kubectl get pv "$pv" -o jsonpath='{{.spec.claimRef.name}}' 2>/dev/null); \
509+
[ -n "$PVC" ] && kubectl delete pvc "$PVC" -n "$NS" --ignore-not-found 2>/dev/null; \
510+
kubectl delete pv "$pv" --ignore-not-found 2>/dev/null; \
511+
done"#
424512
),
425513
],
426514
)
427515
.await?;
428516

429517
if exit_code != 0 {
430-
tracing::warn!("failed to delete stale nodes (exit code {exit_code})");
518+
tracing::debug!("PV/PVC cleanup returned exit code {exit_code} (non-fatal)");
431519
}
432520

433521
Ok(count)

0 commit comments

Comments
 (0)