Skip to content

Commit 1b881d8

Browse files
committed
fix(bootstrap): robust stale node cleanup with retries and pod force-deletion
Reverts the hostname preservation approach, which caused k3s node password validation failures. Instead, makes clean_stale_nodes() reliable by: 1. Retrying with 3s backoff (up to ~45s) until kubectl becomes available after a container restart, instead of firing once and silently giving up. 2. Force-deleting pods stuck in Terminating on removed stale nodes so StatefulSets can immediately reschedule replacements. 3. Deleting PVCs and local-path PVs whose node affinity points at a stale node, so the provisioner can create fresh volumes on the current node. This fixes gateway resume failures after stop/start when the container image has changed (common in development), where the new container gets a different k3s node identity and pods on the old node never reschedule.
1 parent a6e1d22 commit 1b881d8

File tree

10 files changed

+414
-323
lines changed

10 files changed

+414
-323
lines changed

.github/workflows/e2e-test.yml

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,25 @@ permissions:
1919

2020
jobs:
2121
e2e:
22-
name: E2E
22+
name: "E2E (${{ matrix.suite }})"
2323
runs-on: ${{ inputs.runner }}
2424
timeout-minutes: 30
25+
strategy:
26+
fail-fast: false
27+
matrix:
28+
include:
29+
- suite: python
30+
cluster: e2e-python
31+
port: "8080"
32+
cmd: "mise run --no-prepare --skip-deps e2e:python"
33+
- suite: rust
34+
cluster: e2e-rust
35+
port: "8081"
36+
cmd: "mise run --no-prepare --skip-deps e2e:rust"
37+
- suite: gateway-resume
38+
cluster: e2e-resume
39+
port: "8082"
40+
cmd: "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test gateway_resume"
2541
container:
2642
image: ghcr.io/nvidia/openshell/ci:latest
2743
credentials:
@@ -38,6 +54,7 @@ jobs:
3854
OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell
3955
OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }}
4056
OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
57+
OPENSHELL_GATEWAY: ${{ matrix.cluster }}
4158
steps:
4259
- uses: actions/checkout@v4
4360

@@ -48,21 +65,26 @@ jobs:
4865
run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
4966

5067
- name: Install Python dependencies and generate protobuf stubs
68+
if: matrix.suite == 'python'
5169
run: uv sync --frozen && mise run --no-prepare python:proto
5270

53-
- name: Bootstrap and deploy cluster
71+
- name: Build Rust CLI
72+
if: matrix.suite != 'python'
73+
run: cargo build -p openshell-cli --features openshell-core/dev-settings
74+
75+
- name: Install SSH client
76+
if: matrix.suite != 'python'
77+
run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/*
78+
79+
- name: Bootstrap cluster
5480
env:
5581
GATEWAY_HOST: host.docker.internal
56-
GATEWAY_PORT: "8080"
82+
GATEWAY_PORT: ${{ matrix.port }}
83+
CLUSTER_NAME: ${{ matrix.cluster }}
5784
SKIP_IMAGE_PUSH: "1"
5885
SKIP_CLUSTER_IMAGE_BUILD: "1"
5986
OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
6087
run: mise run --no-prepare --skip-deps cluster
6188

62-
- name: Install SSH client for Rust CLI e2e tests
63-
run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/*
64-
65-
- name: Run E2E tests
66-
run: |
67-
mise run --no-prepare --skip-deps e2e:python
68-
mise run --no-prepare --skip-deps e2e:rust
89+
- name: Run tests
90+
run: ${{ matrix.cmd }}

crates/openshell-bootstrap/src/docker.rs

Lines changed: 23 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,9 @@ pub async fn ensure_image(
467467
Ok(())
468468
}
469469

470+
/// Returns the actual host port the container is using. When an existing
471+
/// container is reused (same image), this may differ from `gateway_port`
472+
/// because the container was originally created with a different port.
470473
pub async fn ensure_container(
471474
docker: &Docker,
472475
name: &str,
@@ -479,15 +482,9 @@ pub async fn ensure_container(
479482
registry_username: Option<&str>,
480483
registry_token: Option<&str>,
481484
device_ids: &[String],
482-
) -> Result<()> {
485+
) -> Result<u16> {
483486
let container_name = container_name(name);
484487

485-
// When an existing container is recreated due to an image change, we
486-
// preserve its hostname so the new container registers with the same k3s
487-
// node identity. Without this, k3s sees a brand-new node while pods on
488-
// the old (now-dead) node remain stuck in Terminating.
489-
let mut preserved_hostname: Option<String> = None;
490-
491488
// Check if the container already exists
492489
match docker
493490
.inspect_container(&container_name, None::<InspectContainerOptions>)
@@ -523,24 +520,31 @@ pub async fn ensure_container(
523520
// the current (just-created) network before returning.
524521
let expected_net = network_name(name);
525522
reconcile_container_network(docker, &container_name, &expected_net).await?;
526-
return Ok(());
523+
524+
// Read the actual host port from the container's port bindings
525+
// as a cross-check. The caller should already pass the correct
526+
// port (from stored metadata), but this catches mismatches if
527+
// the container was recreated with a different port externally.
528+
let actual_port = info
529+
.host_config
530+
.as_ref()
531+
.and_then(|hc| hc.port_bindings.as_ref())
532+
.and_then(|pb| pb.get("30051/tcp"))
533+
.and_then(|bindings| bindings.as_ref())
534+
.and_then(|bindings| bindings.first())
535+
.and_then(|b| b.host_port.as_ref())
536+
.and_then(|p| p.parse::<u16>().ok())
537+
.unwrap_or(gateway_port);
538+
539+
return Ok(actual_port);
527540
}
528541

529542
// Image changed — remove the stale container so we can recreate it.
530-
// Capture the hostname before removal so the replacement container
531-
// keeps the same k3s node identity.
532-
preserved_hostname = info
533-
.config
534-
.as_ref()
535-
.and_then(|c| c.hostname.clone())
536-
.filter(|h| !h.is_empty());
537-
538543
tracing::info!(
539-
"Container {} exists but uses a different image (container={}, desired={}), recreating (preserving hostname {:?})",
544+
"Container {} exists but uses a different image (container={}, desired={}), recreating",
540545
container_name,
541546
container_image_id.as_deref().map_or("unknown", truncate_id),
542547
desired_id.as_deref().map_or("unknown", truncate_id),
543-
preserved_hostname,
544548
);
545549

546550
let _ = docker.stop_container(&container_name, None).await;
@@ -747,14 +751,7 @@ pub async fn ensure_container(
747751

748752
let env = Some(env_vars);
749753

750-
// Use the preserved hostname from a previous container (image-change
751-
// recreation) so k3s keeps the same node identity. For fresh containers
752-
// fall back to the Docker container name, giving a stable hostname that
753-
// survives future image-change recreations.
754-
let hostname = preserved_hostname.unwrap_or_else(|| container_name.clone());
755-
756754
let config = ContainerCreateBody {
757-
hostname: Some(hostname),
758755
image: Some(image_ref.to_string()),
759756
cmd: Some(cmd),
760757
env,
@@ -774,7 +771,7 @@ pub async fn ensure_container(
774771
.await
775772
.into_diagnostic()
776773
.wrap_err("failed to create gateway container")?;
777-
Ok(())
774+
Ok(gateway_port)
778775
}
779776

780777
/// Information about a container that is holding a port we need.

crates/openshell-bootstrap/src/lib.rs

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,10 @@ where
429429
// See: https://github.com/NVIDIA/OpenShell/issues/463
430430
let deploy_result: Result<GatewayMetadata> = async {
431431
let device_ids = resolve_gpu_device_ids(&gpu, cdi_supported);
432-
ensure_container(
432+
// ensure_container returns the actual host port — which may differ from
433+
// the requested `port` when reusing an existing container that was
434+
// originally created with a different port.
435+
let actual_port = ensure_container(
433436
&target_docker,
434437
&name,
435438
&image_ref,
@@ -443,16 +446,22 @@ where
443446
&device_ids,
444447
)
445448
.await?;
449+
let port = actual_port;
446450
start_container(&target_docker, &name).await?;
447451

448452
// Clean up stale k3s nodes left over from previous container instances that
449-
// used the same persistent volume. Without this, pods remain scheduled on
453+
// used the same persistent volume. Without this, pods remain scheduled on
450454
// NotReady ghost nodes and the health check will time out.
455+
//
456+
// The function retries internally until kubectl becomes available (k3s may
457+
// still be initialising after the container start). It also force-deletes
458+
// pods stuck in Terminating on the removed nodes so that StatefulSets can
459+
// reschedule replacements immediately.
451460
match clean_stale_nodes(&target_docker, &name).await {
452461
Ok(0) => {}
453-
Ok(n) => tracing::debug!("removed {n} stale node(s)"),
462+
Ok(n) => tracing::info!("removed {n} stale node(s) and their orphaned pods"),
454463
Err(err) => {
455-
tracing::debug!("stale node cleanup failed (non-fatal): {err}");
464+
tracing::warn!("stale node cleanup failed (non-fatal): {err}");
456465
}
457466
}
458467

crates/openshell-bootstrap/src/runtime.rs

Lines changed: 114 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -362,72 +362,160 @@ pub async fn fetch_recent_logs(docker: &Docker, container_name: &str, n: usize)
362362
rendered
363363
}
364364

365-
/// Remove stale k3s nodes from a cluster with a reused persistent volume.
365+
/// Remove stale k3s nodes and their orphaned pods from a resumed cluster.
366366
///
367367
/// When a cluster container is recreated but the volume is reused, k3s registers
368368
/// a new node (using the container ID as the hostname) while old node entries
369369
/// persist in etcd. Pods scheduled on those stale `NotReady` nodes will never run,
370370
/// causing health checks to fail.
371371
///
372-
/// This function identifies all `NotReady` nodes and deletes them so k3s can
373-
/// reschedule workloads onto the current (Ready) node.
372+
/// This function retries with backoff until `kubectl` becomes available (k3s may
373+
/// still be initialising), then:
374+
/// 1. Deletes all stale nodes (every node other than the current container's) so k3s stops tracking them.
375+
/// 2. Force-deletes any pods stuck in `Terminating` so `StatefulSets` and
376+
/// Deployments can reschedule replacements on the current (Ready) node.
374377
///
375378
/// Returns the number of stale nodes removed.
376379
pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result<usize> {
380+
// Retry until kubectl is responsive. k3s can take 10-20 s to start the
381+
// API server after a container restart, so we allow up to ~45 s.
382+
const MAX_ATTEMPTS: u32 = 15;
383+
const RETRY_DELAY: Duration = Duration::from_secs(3);
384+
377385
let container_name = container_name(name);
386+
let mut stale_nodes: Vec<String> = Vec::new();
387+
388+
for attempt in 1..=MAX_ATTEMPTS {
389+
// List ALL node names and the container's own hostname. Any node that
390+
// is not the current container is stale — we cannot rely on the Ready
391+
// condition because k3s may not have marked the old node NotReady yet
392+
// when this runs shortly after container start.
393+
let (output, exit_code) = exec_capture_with_exit(
394+
docker,
395+
&container_name,
396+
vec![
397+
"sh".to_string(),
398+
"-c".to_string(),
399+
format!(
400+
"KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \
401+
--no-headers -o custom-columns=NAME:.metadata.name \
402+
2>/dev/null"
403+
),
404+
],
405+
)
406+
.await?;
407+
408+
if exit_code == 0 {
409+
// Determine the current node name (container hostname).
410+
let (hostname_out, _) =
411+
exec_capture_with_exit(docker, &container_name, vec!["hostname".to_string()])
412+
.await?;
413+
let current_hostname = hostname_out.trim().to_string();
414+
415+
stale_nodes = output
416+
.lines()
417+
.map(str::trim)
418+
.filter(|l| !l.is_empty() && *l != current_hostname)
419+
.map(ToString::to_string)
420+
.collect();
421+
break;
422+
}
423+
424+
if attempt < MAX_ATTEMPTS {
425+
tracing::debug!(
426+
"kubectl not ready yet (attempt {attempt}/{MAX_ATTEMPTS}), retrying in {}s",
427+
RETRY_DELAY.as_secs()
428+
);
429+
tokio::time::sleep(RETRY_DELAY).await;
430+
}
431+
}
432+
433+
if stale_nodes.is_empty() {
434+
return Ok(0);
435+
}
436+
437+
let node_list = stale_nodes.join(" ");
438+
let count = stale_nodes.len();
439+
tracing::info!("removing {} stale node(s): {}", count, node_list);
378440

379-
// Get the list of NotReady nodes.
380-
// The last condition on a node is always type=Ready; we need to check its
381-
// **status** (True/False/Unknown), not its type. Nodes where the Ready
382-
// condition status is not "True" are stale and should be removed.
383-
let (output, exit_code) = exec_capture_with_exit(
441+
// Step 1: delete the stale node objects.
442+
let (_output, exit_code) = exec_capture_with_exit(
384443
docker,
385444
&container_name,
386445
vec![
387446
"sh".to_string(),
388447
"-c".to_string(),
389448
format!(
390-
"KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \
391-
--no-headers -o custom-columns=NAME:.metadata.name,STATUS:.status.conditions[-1].status \
392-
2>/dev/null | grep -v '\\bTrue$' | awk '{{print $1}}'"
449+
"KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found"
393450
),
394451
],
395452
)
396453
.await?;
397454

398455
if exit_code != 0 {
399-
// kubectl not ready yet or no nodes — nothing to clean
400-
return Ok(0);
456+
tracing::warn!("failed to delete stale nodes (exit code {exit_code})");
401457
}
402458

403-
let stale_nodes: Vec<&str> = output
404-
.lines()
405-
.map(str::trim)
406-
.filter(|l| !l.is_empty())
407-
.collect();
408-
if stale_nodes.is_empty() {
409-
return Ok(0);
410-
}
459+
// Step 2: force-delete pods stuck in Terminating. After the stale node is
460+
// removed, pods that were scheduled on it transition to Terminating but
461+
// will never complete graceful shutdown (the node is gone). StatefulSets
462+
// will not create a replacement until the old pod is fully deleted.
463+
let (_output, exit_code) = exec_capture_with_exit(
464+
docker,
465+
&container_name,
466+
vec![
467+
"sh".to_string(),
468+
"-c".to_string(),
469+
format!(
470+
"KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \
471+
--field-selector=status.phase=Running -o name 2>/dev/null; \
472+
for pod_line in $(KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \
473+
--no-headers 2>/dev/null | awk '$4 == \"Terminating\" {{print $1\"/\"$2}}'); do \
474+
ns=${{pod_line%%/*}}; pod=${{pod_line#*/}}; \
475+
KUBECONFIG={KUBECONFIG_PATH} kubectl delete pod \"$pod\" -n \"$ns\" \
476+
--force --grace-period=0 --ignore-not-found 2>/dev/null; \
477+
done"
478+
),
479+
],
480+
)
481+
.await?;
411482

412-
let node_list = stale_nodes.join(" ");
413-
let count = stale_nodes.len();
414-
tracing::info!("removing {} stale node(s): {}", count, node_list);
483+
if exit_code != 0 {
484+
tracing::debug!(
485+
"force-delete of terminating pods returned exit code {exit_code} (non-fatal)"
486+
);
487+
}
415488

489+
// Step 3: delete PersistentVolumeClaims in the openshell namespace whose
490+
// backing PV has node affinity for a stale node. local-path-provisioner
491+
// creates PVs tied to the original node; when the node changes, the PV is
492+
// unschedulable and the `StatefulSet` pod stays Pending. Deleting the PVC
493+
// (and its PV) lets the provisioner create a fresh one on the current node.
416494
let (_output, exit_code) = exec_capture_with_exit(
417495
docker,
418496
&container_name,
419497
vec![
420498
"sh".to_string(),
421499
"-c".to_string(),
422500
format!(
423-
"KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found"
501+
r#"KUBECONFIG={KUBECONFIG_PATH}; export KUBECONFIG; \
502+
CURRENT_NODE=$(kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name 2>/dev/null | head -1); \
503+
[ -z "$CURRENT_NODE" ] && exit 0; \
504+
for pv in $(kubectl get pv -o jsonpath='{{.items[*].metadata.name}}' 2>/dev/null); do \
505+
NODE=$(kubectl get pv "$pv" -o jsonpath='{{.spec.nodeAffinity.required.nodeSelectorTerms[0].matchExpressions[0].values[0]}}' 2>/dev/null); \
506+
[ "$NODE" = "$CURRENT_NODE" ] && continue; \
507+
NS=$(kubectl get pv "$pv" -o jsonpath='{{.spec.claimRef.namespace}}' 2>/dev/null); \
508+
PVC=$(kubectl get pv "$pv" -o jsonpath='{{.spec.claimRef.name}}' 2>/dev/null); \
509+
[ -n "$PVC" ] && kubectl delete pvc "$PVC" -n "$NS" --ignore-not-found 2>/dev/null; \
510+
kubectl delete pv "$pv" --ignore-not-found 2>/dev/null; \
511+
done"#
424512
),
425513
],
426514
)
427515
.await?;
428516

429517
if exit_code != 0 {
430-
tracing::warn!("failed to delete stale nodes (exit code {exit_code})");
518+
tracing::debug!("PV/PVC cleanup returned exit code {exit_code} (non-fatal)");
431519
}
432520

433521
Ok(count)

0 commit comments

Comments
 (0)