From 38892c393604a3ab960fc4f18686e1f80692b7ab Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 2 Apr 2026 11:50:53 -0700 Subject: [PATCH 1/3] fix(bootstrap,server): persist sandbox state across gateway stop/start cycles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes to preserve sandbox state across gateway restarts: 1. Deterministic k3s node identity: Set the Docker container hostname to a deterministic name derived from the gateway name (openshell-{name}). Pass OPENSHELL_NODE_NAME env var and --node-name flag to k3s via the cluster entrypoint as belt-and-suspenders. Update clean_stale_nodes() to prefer the deterministic name with a fallback to the container hostname for backward compatibility with older cluster images. This prevents clean_stale_nodes() from deleting PVCs (including the server's SQLite database) when the container is recreated after an image upgrade. 2. Default workspace persistence: Inject a 2Gi PVC and init container into every sandbox pod so the /sandbox directory survives pod rescheduling. The init container uses the same sandbox image, mounts the PVC at a temporary path, and copies the image's /sandbox contents (Python venv, dotfiles, skills) into the PVC on first use — guarded by a sentinel file so subsequent restarts are instant. The agent container then mounts the populated PVC at /sandbox. Users who supply custom volumeClaimTemplates are unaffected — the default workspace is skipped. Fixes #738 --- architecture/gateway-single-node.md | 2 +- architecture/gateway.md | 2 +- crates/openshell-bootstrap/src/constants.rs | 85 +++++ crates/openshell-bootstrap/src/docker.rs | 15 +- crates/openshell-bootstrap/src/runtime.rs | 45 ++- crates/openshell-server/src/sandbox/mod.rs | 324 ++++++++++++++++++++ deploy/docker/cluster-entrypoint.sh | 20 +- 7 files changed, 476 insertions(+), 17 deletions(-) diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index f46a50d2d..6389c728e 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -185,7 +185,7 @@ For the target daemon (local or remote): After the container starts: -1. **Clean stale nodes**: `clean_stale_nodes()` finds `NotReady` nodes via `kubectl get nodes` and deletes them. This is needed when a container is recreated but reuses the persistent volume -- k3s registers a new node (using the container ID as hostname) while old node entries persist in etcd. Non-fatal on error; returns the count of removed nodes. +1. **Clean stale nodes**: `clean_stale_nodes()` finds nodes whose name does not match the deterministic k3s `--node-name` and deletes them. That node name is derived from the gateway name but normalized to a Kubernetes-safe lowercase form so existing gateway names that contain `_`, `.`, or uppercase characters still produce a valid node identity. This cleanup is needed when a container is recreated but reuses the persistent volume -- old node entries can persist in etcd. Non-fatal on error; returns the count of removed nodes. 2. **Push local images** (optional, local deploy only): If `OPENSHELL_PUSH_IMAGES` is set, the comma-separated image refs are exported from the local Docker daemon as a single tar, uploaded into the container via `docker put_archive`, and imported into containerd via `ctr images import` in the `k8s.io` namespace. After import, `kubectl rollout restart deployment/openshell openshell` is run, followed by `kubectl rollout status --timeout=180s` to wait for completion. See `crates/openshell-bootstrap/src/push.rs`. 3. **Wait for gateway health**: `wait_for_gateway_ready()` polls the Docker HEALTHCHECK status up to 180 times, 2 seconds apart (6 min total). A background task streams container logs during this wait. Failure modes: - Container exits during polling: error includes recent log lines. diff --git a/architecture/gateway.md b/architecture/gateway.md index 39f97c8c1..72574410d 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -501,7 +501,7 @@ The Helm chart template is at `deploy/helm/openshell/templates/statefulset.yaml` `SandboxClient` (`crates/openshell-server/src/sandbox/mod.rs`) manages `agents.x-k8s.io/v1alpha1/Sandbox` CRDs. -- **Create**: Translates a `Sandbox` proto into a Kubernetes `DynamicObject` with labels (`openshell.ai/sandbox-id`, `openshell.ai/managed-by: openshell`) and a spec that includes the pod template, environment variables, and gateway-required env vars (`OPENSHELL_SANDBOX_ID`, `OPENSHELL_ENDPOINT`, `OPENSHELL_SSH_LISTEN_ADDR`, etc.). +- **Create**: Translates a `Sandbox` proto into a Kubernetes `DynamicObject` with labels (`openshell.ai/sandbox-id`, `openshell.ai/managed-by: openshell`) and a spec that includes the pod template, environment variables, and gateway-required env vars (`OPENSHELL_SANDBOX_ID`, `OPENSHELL_ENDPOINT`, `OPENSHELL_SSH_LISTEN_ADDR`, etc.). When callers do not provide custom `volumeClaimTemplates`, the server injects a default `workspace` PVC and mounts it at `/sandbox` so the default sandbox home/workdir survives pod rescheduling. - **Delete**: Calls the Kubernetes API to delete the CRD by name. Returns `false` if already gone (404). - **Pod IP resolution**: `agent_pod_ip()` fetches the agent pod and reads `status.podIP`. diff --git a/crates/openshell-bootstrap/src/constants.rs b/crates/openshell-bootstrap/src/constants.rs index 74e381fd2..eee9000d1 100644 --- a/crates/openshell-bootstrap/src/constants.rs +++ b/crates/openshell-bootstrap/src/constants.rs @@ -13,11 +13,66 @@ pub const SERVER_CLIENT_CA_SECRET_NAME: &str = "openshell-server-client-ca"; pub const CLIENT_TLS_SECRET_NAME: &str = "openshell-client-tls"; /// K8s secret holding the SSH handshake HMAC secret (shared by gateway and sandbox pods). pub const SSH_HANDSHAKE_SECRET_NAME: &str = "openshell-ssh-handshake"; +const NODE_NAME_PREFIX: &str = "openshell-"; +const NODE_NAME_FALLBACK_SUFFIX: &str = "gateway"; +const KUBERNETES_MAX_NAME_LEN: usize = 253; pub fn container_name(name: &str) -> String { format!("openshell-cluster-{name}") } +/// Deterministic k3s node name derived from the gateway name. +/// +/// k3s defaults to using the container hostname (= Docker container ID) as +/// the node name. When the container is recreated (e.g. after an image +/// upgrade), the container ID changes, creating a new k3s node. The +/// `clean_stale_nodes` function then deletes PVCs whose backing PVs have +/// node affinity for the old node — wiping the server database and any +/// sandbox persistent volumes. +/// +/// By passing a deterministic `--node-name` to k3s, the node identity +/// survives container recreation, and PVCs are never orphaned. +/// +/// Gateway names allow Docker-friendly separators and uppercase characters, +/// but Kubernetes node names must be DNS-safe. Normalize the gateway name into +/// a single lowercase RFC 1123 label so previously accepted names such as +/// `prod_us` or `Prod.US` still deploy successfully. +pub fn node_name(name: &str) -> String { + format!("{NODE_NAME_PREFIX}{}", normalize_node_name_suffix(name)) +} + +fn normalize_node_name_suffix(name: &str) -> String { + let mut normalized = String::with_capacity(name.len()); + let mut last_was_separator = false; + + for ch in name.chars() { + if ch.is_ascii_alphanumeric() { + normalized.push(ch.to_ascii_lowercase()); + last_was_separator = false; + } else if !last_was_separator { + normalized.push('-'); + last_was_separator = true; + } + } + + let mut normalized = normalized.trim_matches('-').to_string(); + if normalized.is_empty() { + normalized.push_str(NODE_NAME_FALLBACK_SUFFIX); + } + + let max_suffix_len = KUBERNETES_MAX_NAME_LEN.saturating_sub(NODE_NAME_PREFIX.len()); + if normalized.len() > max_suffix_len { + normalized.truncate(max_suffix_len); + normalized.truncate(normalized.trim_end_matches('-').len()); + } + + if normalized.is_empty() { + normalized.push_str(NODE_NAME_FALLBACK_SUFFIX); + } + + normalized +} + pub fn volume_name(name: &str) -> String { format!("openshell-cluster-{name}") } @@ -25,3 +80,33 @@ pub fn volume_name(name: &str) -> String { pub fn network_name(name: &str) -> String { format!("openshell-cluster-{name}") } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn node_name_normalizes_uppercase_and_underscores() { + assert_eq!(node_name("Prod_US"), "openshell-prod-us"); + } + + #[test] + fn node_name_collapses_and_trims_separator_runs() { + assert_eq!(node_name("._Prod..__-Gateway-."), "openshell-prod-gateway"); + } + + #[test] + fn node_name_falls_back_when_gateway_name_has_no_alphanumerics() { + assert_eq!(node_name("...___---"), "openshell-gateway"); + } + + #[test] + fn node_name_truncates_to_kubernetes_name_limit() { + let gateway_name = "A".repeat(400); + let node_name = node_name(&gateway_name); + + assert!(node_name.len() <= KUBERNETES_MAX_NAME_LEN); + assert!(node_name.starts_with(NODE_NAME_PREFIX)); + assert!(node_name.ends_with('a')); + } +} diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index d9aaed7f4..d418149f9 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use crate::RemoteOptions; -use crate::constants::{container_name, network_name, volume_name}; +use crate::constants::{container_name, network_name, node_name, volume_name}; use crate::image::{self, DEFAULT_IMAGE_REPO_BASE, DEFAULT_REGISTRY, parse_image_ref}; use bollard::API_DEFAULT_VERSION; use bollard::Docker; @@ -684,6 +684,11 @@ pub async fn ensure_container( format!("REGISTRY_HOST={registry_host}"), format!("REGISTRY_INSECURE={registry_insecure}"), format!("IMAGE_REPO_BASE={image_repo_base}"), + // Deterministic k3s node name so the node identity survives container + // recreation (e.g. after an image upgrade). Without this, k3s uses + // the container ID as the hostname/node name, which changes on every + // container recreate and triggers stale-node PVC cleanup. + format!("OPENSHELL_NODE_NAME={}", node_name(name)), ]; if let Some(endpoint) = registry_endpoint { env_vars.push(format!("REGISTRY_ENDPOINT={endpoint}")); @@ -753,6 +758,14 @@ pub async fn ensure_container( let config = ContainerCreateBody { image: Some(image_ref.to_string()), + // Set the container hostname to the deterministic node name. + // k3s uses the container hostname as its default node name. Without + // this, Docker defaults to the container ID (first 12 hex chars), + // which changes on every container recreation and can cause + // `clean_stale_nodes` to delete the wrong node on resume. The + // hostname persists across container stop/start cycles, ensuring a + // stable node identity. + hostname: Some(node_name(name)), cmd: Some(cmd), env, exposed_ports: Some(exposed_ports), diff --git a/crates/openshell-bootstrap/src/runtime.rs b/crates/openshell-bootstrap/src/runtime.rs index 2a10b2651..0f9a96e6b 100644 --- a/crates/openshell-bootstrap/src/runtime.rs +++ b/crates/openshell-bootstrap/src/runtime.rs @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -use crate::constants::{KUBECONFIG_PATH, container_name}; +use crate::constants::{KUBECONFIG_PATH, container_name, node_name}; use bollard::Docker; use bollard::container::LogOutput; use bollard::exec::CreateExecOptions; @@ -385,11 +385,19 @@ pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result { let container_name = container_name(name); let mut stale_nodes: Vec = Vec::new(); + // Determine the current node name. With the deterministic `--node-name` + // entrypoint change the k3s node is `openshell-{gateway}`. However, older + // cluster images (built before that change) still use the container hostname + // (= Docker container ID) as the node name. We must handle both: + // + // 1. If the expected deterministic name appears in the node list, use it. + // 2. Otherwise fall back to the container hostname (old behaviour). + // + // This ensures backward compatibility during upgrades where the bootstrap + // CLI is newer than the cluster image. + let deterministic_node = node_name(name); + for attempt in 1..=MAX_ATTEMPTS { - // List ALL node names and the container's own hostname. Any node that - // is not the current container is stale — we cannot rely on the Ready - // condition because k3s may not have marked the old node NotReady yet - // when this runs shortly after container start. let (output, exit_code) = exec_capture_with_exit( docker, &container_name, @@ -406,16 +414,27 @@ pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result { .await?; if exit_code == 0 { - // Determine the current node name (container hostname). - let (hostname_out, _) = - exec_capture_with_exit(docker, &container_name, vec!["hostname".to_string()]) - .await?; - let current_hostname = hostname_out.trim().to_string(); - - stale_nodes = output + let all_nodes: Vec<&str> = output .lines() .map(str::trim) - .filter(|l| !l.is_empty() && *l != current_hostname) + .filter(|l| !l.is_empty()) + .collect(); + + // Pick the current node identity: prefer the deterministic name, + // fall back to the container hostname for older cluster images. + let current_node = if all_nodes.contains(&deterministic_node.as_str()) { + deterministic_node.clone() + } else { + // Older cluster image without --node-name: read hostname. + let (hostname_out, _) = + exec_capture_with_exit(docker, &container_name, vec!["hostname".to_string()]) + .await?; + hostname_out.trim().to_string() + }; + + stale_nodes = all_nodes + .into_iter() + .filter(|n| *n != current_node) .map(ToString::to_string) .collect(); break; diff --git a/crates/openshell-server/src/sandbox/mod.rs b/crates/openshell-server/src/sandbox/mod.rs index 3dca66493..c5e9a8335 100644 --- a/crates/openshell-server/src/sandbox/mod.rs +++ b/crates/openshell-server/src/sandbox/mod.rs @@ -34,6 +34,44 @@ const SANDBOX_MANAGED_VALUE: &str = "openshell"; const GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; const GPU_RESOURCE_QUANTITY: &str = "1"; +// --------------------------------------------------------------------------- +// Default workspace persistence (temporary — will be replaced by snapshotting) +// --------------------------------------------------------------------------- +// Every sandbox pod gets a PVC-backed `/sandbox` directory so that user data +// (installed packages, files, dotfiles) survives pod rescheduling across +// gateway stop/start cycles. An init container seeds the PVC with the +// image's original `/sandbox` contents on first use so that the Python venv, +// skills, and shell config are not lost when the empty PVC is mounted. +// +// NOTE: This PVC + init-container approach is a stopgap. It has known +// limitations: image upgrades don't propagate into existing PVCs, the init +// copy adds first-start latency, and the full /sandbox directory is +// duplicated on disk. The plan is to replace this with proper container +// snapshotting so that only the diff from the base image is persisted. + +/// Volume name used for the workspace PVC in the pod spec. +const WORKSPACE_VOLUME_NAME: &str = "workspace"; + +/// Mount path for the workspace PVC in the **agent** container. This shadows +/// the image's `/sandbox` directory — the init container copies the image +/// contents into the PVC before the agent starts. +const WORKSPACE_MOUNT_PATH: &str = "/sandbox"; + +/// Mount path for the workspace PVC in the **init** container. A temporary +/// path so the init container can see the image's original `/sandbox` and +/// copy it into the PVC. +const WORKSPACE_INIT_MOUNT_PATH: &str = "/workspace-pvc"; + +/// Name of the init container that seeds the workspace PVC. +const WORKSPACE_INIT_CONTAINER_NAME: &str = "workspace-init"; + +/// Default storage request for the workspace PVC. +const WORKSPACE_DEFAULT_STORAGE: &str = "2Gi"; + +/// Sentinel file written by the init container after copying the image's +/// `/sandbox` contents. Subsequent pod starts skip the copy. +const WORKSPACE_SENTINEL: &str = ".workspace-initialized"; + #[derive(Clone)] pub struct SandboxClient { client: Client, @@ -733,6 +771,107 @@ fn apply_supervisor_sideload(pod_template: &mut serde_json::Value) { } } +/// Apply workspace persistence transforms to an already-built pod template. +/// +/// This injects: +/// 1. A volume mount on the agent container at `/sandbox`. +/// 2. An init container (same image) that seeds the PVC with the image's +/// original `/sandbox` contents on first use. +/// +/// The PVC volume itself is **not** added here — the Sandbox CRD controller +/// automatically creates a volume for each entry in `volumeClaimTemplates` +/// (following the StatefulSet convention). Adding one here would create a +/// duplicate volume name and fail pod validation. +/// +/// The init container mounts the PVC at a temporary path so it can still see +/// the image's `/sandbox` directory. It checks for a sentinel file and skips +/// the copy if the PVC was already initialised. +fn apply_workspace_persistence(pod_template: &mut serde_json::Value, image: &str) { + let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { + return; + }; + + // 1. Add workspace volume mount to the agent container + let containers = spec.get_mut("containers").and_then(|v| v.as_array_mut()); + if let Some(containers) = containers { + let mut target_index = None; + for (i, c) in containers.iter().enumerate() { + if c.get("name").and_then(|v| v.as_str()) == Some("agent") { + target_index = Some(i); + break; + } + } + let index = target_index.unwrap_or(0); + + if let Some(container) = containers.get_mut(index).and_then(|v| v.as_object_mut()) { + let volume_mounts = container + .entry("volumeMounts") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(volume_mounts) = volume_mounts { + volume_mounts.push(serde_json::json!({ + "name": WORKSPACE_VOLUME_NAME, + "mountPath": WORKSPACE_MOUNT_PATH + })); + } + } + } + + // 3. Add the init container that seeds the PVC from the image + let init_containers = spec + .entry("initContainers") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(init_containers) = init_containers { + // The init container mounts the PVC at a temp path so it can still + // read the image's original /sandbox contents. It copies them into + // the PVC only when the sentinel file is absent. + // + // The inner `[ -d ... ]` guard handles custom images that don't have + // a /sandbox directory — the copy is skipped but the sentinel is + // still written so subsequent starts are instant. + let copy_cmd = format!( + "if [ ! -f {WORKSPACE_INIT_MOUNT_PATH}/{WORKSPACE_SENTINEL} ]; then \ + if [ -d {WORKSPACE_MOUNT_PATH} ]; then \ + cp -a {WORKSPACE_MOUNT_PATH}/. {WORKSPACE_INIT_MOUNT_PATH}/; \ + fi && \ + touch {WORKSPACE_INIT_MOUNT_PATH}/{WORKSPACE_SENTINEL}; \ + fi" + ); + + init_containers.push(serde_json::json!({ + "name": WORKSPACE_INIT_CONTAINER_NAME, + "image": image, + "command": ["sh", "-c", copy_cmd], + "securityContext": { "runAsUser": 0 }, + "volumeMounts": [{ + "name": WORKSPACE_VOLUME_NAME, + "mountPath": WORKSPACE_INIT_MOUNT_PATH + }] + })); + } +} + +/// Build the default `volumeClaimTemplates` array for sandbox pods. +/// +/// Provides a single PVC named "workspace" that backs the `/sandbox` +/// directory. The init container seeds it from the image on first use. +fn default_workspace_volume_claim_templates() -> serde_json::Value { + serde_json::json!([{ + "metadata": { + "name": WORKSPACE_VOLUME_NAME + }, + "spec": { + "accessModes": ["ReadWriteOnce"], + "resources": { + "requests": { + "storage": WORKSPACE_DEFAULT_STORAGE + } + } + } + }]) +} + #[allow(clippy::too_many_arguments)] fn sandbox_to_k8s_spec( spec: Option<&SandboxSpec>, @@ -748,6 +887,18 @@ fn sandbox_to_k8s_spec( host_gateway_ip: &str, ) -> serde_json::Value { let mut root = serde_json::Map::new(); + + // Determine early whether the user provided custom volumeClaimTemplates. + // When they haven't, we inject a default workspace VCT and corresponding + // init container + volume mount so sandbox data persists. We need this + // flag before building the podTemplate because the workspace persistence + // transforms are applied inside sandbox_template_to_k8s. + let user_has_vct = spec + .and_then(|s| s.template.as_ref()) + .and_then(|t| struct_to_json(&t.volume_claim_templates)) + .is_some(); + let inject_workspace = !user_has_vct; + if let Some(spec) = spec { if !spec.log_level.is_empty() { root.insert("logLevel".to_string(), serde_json::json!(spec.log_level)); @@ -775,6 +926,7 @@ fn sandbox_to_k8s_spec( &spec.environment, client_tls_secret_name, host_gateway_ip, + inject_workspace, ), ); if !template.agent_socket.is_empty() { @@ -789,6 +941,15 @@ fn sandbox_to_k8s_spec( } } + // Inject the default workspace volumeClaimTemplate when the user didn't + // provide their own. + if inject_workspace { + root.insert( + "volumeClaimTemplates".to_string(), + default_workspace_volume_claim_templates(), + ); + } + // podTemplate is required by the Kubernetes CRD - ensure it's always present if !root.contains_key("podTemplate") { let empty_env = std::collections::HashMap::new(); @@ -809,6 +970,7 @@ fn sandbox_to_k8s_spec( spec_env, client_tls_secret_name, host_gateway_ip, + inject_workspace, ), ); } @@ -833,6 +995,7 @@ fn sandbox_template_to_k8s( spec_environment: &std::collections::HashMap, client_tls_secret_name: &str, host_gateway_ip: &str, + inject_workspace: bool, ) -> serde_json::Value { // The supervisor binary is always side-loaded from the node filesystem // via a hostPath volume, regardless of which sandbox image is used. @@ -959,6 +1122,13 @@ fn sandbox_template_to_k8s( // Always side-load the supervisor binary from the node filesystem apply_supervisor_sideload(&mut result); + // Inject workspace persistence (init container + PVC volume mount) so + // that /sandbox data survives pod rescheduling. Skipped when the user + // provides custom volumeClaimTemplates to avoid conflicts. + if inject_workspace { + apply_workspace_persistence(&mut result, image); + } + result } @@ -1631,6 +1801,7 @@ mod tests { &std::collections::HashMap::new(), "", "", + true, ); assert_eq!( @@ -1664,6 +1835,7 @@ mod tests { &std::collections::HashMap::new(), "", "", + true, ); assert_eq!( @@ -1693,6 +1865,7 @@ mod tests { &std::collections::HashMap::new(), "", "", + true, ); assert_eq!( @@ -1735,6 +1908,7 @@ mod tests { &std::collections::HashMap::new(), "", "", + true, ); let limits = &pod_template["spec"]["containers"][0]["resources"]["limits"]; @@ -1761,6 +1935,7 @@ mod tests { &std::collections::HashMap::new(), "", "172.17.0.1", + true, ); let host_aliases = pod_template["spec"]["hostAliases"] @@ -1791,6 +1966,7 @@ mod tests { &std::collections::HashMap::new(), "", "", + true, ); assert!( @@ -1816,6 +1992,7 @@ mod tests { &std::collections::HashMap::new(), "my-tls-secret", "", + true, ); let volumes = pod_template["spec"]["volumes"] @@ -1831,4 +2008,151 @@ mod tests { "TLS secret volume must use mode 0400 to prevent sandbox user from reading the private key" ); } + + // ----------------------------------------------------------------------- + // Workspace persistence tests + // ----------------------------------------------------------------------- + + #[test] + fn workspace_persistence_injects_init_container_volume_and_mount() { + let mut pod_template = serde_json::json!({ + "spec": { + "containers": [{ + "name": "agent", + "image": "openshell/sandbox:latest" + }] + } + }); + + apply_workspace_persistence(&mut pod_template, "openshell/sandbox:latest"); + + // Init container + let init_containers = pod_template["spec"]["initContainers"] + .as_array() + .expect("initContainers should exist"); + assert_eq!(init_containers.len(), 1); + assert_eq!(init_containers[0]["name"], WORKSPACE_INIT_CONTAINER_NAME); + assert_eq!(init_containers[0]["image"], "openshell/sandbox:latest"); + assert_eq!(init_containers[0]["securityContext"]["runAsUser"], 0); + + // Init container mounts PVC at temp path, not /sandbox + let init_mounts = init_containers[0]["volumeMounts"] + .as_array() + .expect("init volumeMounts should exist"); + assert_eq!(init_mounts.len(), 1); + assert_eq!(init_mounts[0]["name"], WORKSPACE_VOLUME_NAME); + assert_eq!(init_mounts[0]["mountPath"], WORKSPACE_INIT_MOUNT_PATH); + + // Agent container mounts PVC at /sandbox + let agent_mounts = pod_template["spec"]["containers"][0]["volumeMounts"] + .as_array() + .expect("agent volumeMounts should exist"); + let workspace_mount = agent_mounts + .iter() + .find(|m| m["name"] == WORKSPACE_VOLUME_NAME) + .expect("workspace mount should exist on agent container"); + assert_eq!(workspace_mount["mountPath"], WORKSPACE_MOUNT_PATH); + + // The PVC volume is NOT created by apply_workspace_persistence — the + // Sandbox CRD controller adds it from the volumeClaimTemplates. + // Verify we did not inject one (which would cause a duplicate). + let has_pvc_vol = pod_template["spec"]["volumes"] + .as_array() + .map_or(false, |vols| { + vols.iter().any(|v| v["name"] == WORKSPACE_VOLUME_NAME) + }); + assert!( + !has_pvc_vol, + "apply_workspace_persistence must NOT add a PVC volume (the CRD controller does that)" + ); + } + + #[test] + fn workspace_persistence_uses_same_image_as_agent() { + let mut pod_template = serde_json::json!({ + "spec": { + "containers": [{ + "name": "agent", + "image": "my-custom-image:v2" + }] + } + }); + + apply_workspace_persistence(&mut pod_template, "my-custom-image:v2"); + + let init_image = pod_template["spec"]["initContainers"][0]["image"] + .as_str() + .expect("init container should have image"); + assert_eq!( + init_image, "my-custom-image:v2", + "init container must use the same image as the agent container" + ); + } + + #[test] + fn workspace_init_command_checks_sentinel() { + let mut pod_template = serde_json::json!({ + "spec": { + "containers": [{ + "name": "agent", + "image": "img:latest" + }] + } + }); + + apply_workspace_persistence(&mut pod_template, "img:latest"); + + let cmd = pod_template["spec"]["initContainers"][0]["command"] + .as_array() + .expect("command should be an array"); + let script = cmd[2].as_str().expect("third element should be the script"); + assert!( + script.contains(WORKSPACE_SENTINEL), + "init script must check for sentinel file" + ); + assert!( + script.contains("cp -a"), + "init script must copy image contents" + ); + } + + #[test] + fn workspace_persistence_skipped_when_inject_workspace_false() { + let pod_template = sandbox_template_to_k8s( + &SandboxTemplate::default(), + false, + "openshell/sandbox:latest", + "", + "sandbox-id", + "sandbox-name", + "https://gateway.example.com", + "0.0.0.0:2222", + "secret", + 300, + &std::collections::HashMap::new(), + "", + "", + false, // user provided custom VCTs + ); + + // No init container should be present + assert!( + pod_template["spec"]["initContainers"].is_null() + || pod_template["spec"]["initContainers"] + .as_array() + .is_none_or(|a| a.is_empty()), + "workspace init container must NOT be present when inject_workspace is false" + ); + + // No workspace volume mount on agent + let has_workspace_mount = pod_template["spec"]["containers"][0]["volumeMounts"] + .as_array() + .map_or(false, |mounts| { + mounts.iter().any(|m| m["name"] == WORKSPACE_VOLUME_NAME) + }); + assert!( + !has_workspace_mount, + "workspace mount must NOT be present when inject_workspace is false" + ); + } } diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 367665db3..3ab2f7b75 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -559,8 +559,26 @@ fi # routing to settle first. wait_for_default_route +# --------------------------------------------------------------------------- +# Deterministic k3s node name +# --------------------------------------------------------------------------- +# By default k3s uses the container hostname (= Docker container ID) as the +# node name. When the container is recreated (e.g. after an image upgrade), +# the container ID changes, registering a new k3s node. The bootstrap code +# then deletes PVCs whose backing PVs have node affinity for the old node — +# wiping the server database and any sandbox persistent volumes. +# +# OPENSHELL_NODE_NAME is set by the bootstrap code to a deterministic value +# derived from the gateway name, so the node identity survives container +# recreation and PVCs are never orphaned. +NODE_NAME_ARG="" +if [ -n "${OPENSHELL_NODE_NAME:-}" ]; then + NODE_NAME_ARG="--node-name=${OPENSHELL_NODE_NAME}" + echo "Using deterministic k3s node name: ${OPENSHELL_NODE_NAME}" +fi + # Execute k3s with explicit resolv-conf passed as a kubelet arg. # k3s v1.35.2+ no longer accepts --resolv-conf as a top-level server flag; # it must be passed via --kubelet-arg instead. # shellcheck disable=SC2086 -exec /bin/k3s "$@" --kubelet-arg=resolv-conf="$RESOLV_CONF" $EXTRA_KUBELET_ARGS +exec /bin/k3s "$@" $NODE_NAME_ARG --kubelet-arg=resolv-conf="$RESOLV_CONF" $EXTRA_KUBELET_ARGS From 4cc7e31b71386d3b4163fff796e7a6349ac965d7 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 2 Apr 2026 22:24:22 -0700 Subject: [PATCH 2/3] wip --- crates/openshell-bootstrap/src/docker.rs | 44 ++++++++++------- crates/openshell-bootstrap/src/lib.rs | 60 +++++++++++++----------- 2 files changed, 60 insertions(+), 44 deletions(-) diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index d418149f9..be086e534 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -482,6 +482,7 @@ pub async fn ensure_container( registry_username: Option<&str>, registry_token: Option<&str>, device_ids: &[String], + resume: bool, ) -> Result { let container_name = container_name(name); @@ -491,25 +492,34 @@ pub async fn ensure_container( .await { Ok(info) => { - // Container exists — verify it is using the expected image. - // Resolve the desired image ref to its content-addressable ID so we - // can compare against the container's image field (which Docker - // stores as an ID). - let desired_id = docker - .inspect_image(image_ref) - .await - .ok() - .and_then(|img| img.id); + // On resume we always reuse the existing container — the persistent + // volume holds k3s etcd state, and recreating the container with + // different env vars would cause the entrypoint to rewrite the + // HelmChart manifest, triggering a Helm upgrade that changes the + // StatefulSet image reference while the old pod still runs with the + // previous image. Reusing the container avoids this entirely. + // + // On a non-resume path we check whether the image changed and + // recreate only when necessary. + let reuse = if resume { + true + } else { + let desired_id = docker + .inspect_image(image_ref) + .await + .ok() + .and_then(|img| img.id); - let container_image_id = info.image; + let container_image_id = info.image.clone(); - let image_matches = match (&desired_id, &container_image_id) { - (Some(desired), Some(current)) => desired == current, - _ => false, + match (&desired_id, &container_image_id) { + (Some(desired), Some(current)) => desired == current, + _ => false, + } }; - if image_matches { - // The container exists with the correct image, but its network + if reuse { + // The container exists and should be reused. Its network // attachment may be stale. When the gateway is resumed after a // container kill, `ensure_network` destroys and recreates the // Docker network (giving it a new ID). The stopped container @@ -543,8 +553,8 @@ pub async fn ensure_container( tracing::info!( "Container {} exists but uses a different image (container={}, desired={}), recreating", container_name, - container_image_id.as_deref().map_or("unknown", truncate_id), - desired_id.as_deref().map_or("unknown", truncate_id), + info.image.as_deref().map_or("unknown", truncate_id), + image_ref, ); let _ = docker.stop_container(&container_name, None).await; diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index b569cabe7..d91490706 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -327,33 +327,38 @@ where } } - // Ensure the image is available on the target Docker daemon - if remote_opts.is_some() { - log("[status] Downloading gateway".to_string()); - let on_log_clone = Arc::clone(&on_log); - let progress_cb = move |msg: String| { - if let Ok(mut f) = on_log_clone.lock() { - f(msg); - } - }; - image::pull_remote_image( - &target_docker, - &image_ref, - registry_username.as_deref(), - registry_token.as_deref(), - progress_cb, - ) - .await?; - } else { - // Local deployment: ensure image exists (pull if needed) - log("[status] Downloading gateway".to_string()); - ensure_image( - &target_docker, - &image_ref, - registry_username.as_deref(), - registry_token.as_deref(), - ) - .await?; + // Ensure the image is available on the target Docker daemon. + // On resume the existing container already has its image — skip the + // pull to avoid failures when the original image tag (e.g. a local-only + // `openshell/cluster:dev`) is not available from the default registry. + if !resume { + if remote_opts.is_some() { + log("[status] Downloading gateway".to_string()); + let on_log_clone = Arc::clone(&on_log); + let progress_cb = move |msg: String| { + if let Ok(mut f) = on_log_clone.lock() { + f(msg); + } + }; + image::pull_remote_image( + &target_docker, + &image_ref, + registry_username.as_deref(), + registry_token.as_deref(), + progress_cb, + ) + .await?; + } else { + // Local deployment: ensure image exists (pull if needed) + log("[status] Downloading gateway".to_string()); + ensure_image( + &target_docker, + &image_ref, + registry_username.as_deref(), + registry_token.as_deref(), + ) + .await?; + } } // All subsequent operations use the target Docker (remote or local) @@ -444,6 +449,7 @@ where registry_username.as_deref(), registry_token.as_deref(), &device_ids, + resume, ) .await?; let port = actual_port; From 3e07bdc3583265cf619e321a2735332b7e70b28a Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 2 Apr 2026 22:46:09 -0700 Subject: [PATCH 3/3] fix(bootstrap): skip image pull only when resume container exists On resume, skip the image pull only when the container still exists. When only the volume survives (container was removed), the image pull must proceed so the container can be recreated. This fixes the 'container removal resumes' e2e scenario where the image was not available after the container was force-removed. --- crates/openshell-bootstrap/src/lib.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index d91490706..8ce10703e 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -314,6 +314,7 @@ where // idempotent and will reuse the volume, create a container if needed, // and start it) let mut resume = false; + let mut resume_container_exists = false; if let Some(existing) = check_existing_gateway(&target_docker, &name).await? { if recreate { log("[status] Removing existing gateway".to_string()); @@ -321,17 +322,24 @@ where } else if existing.container_running { log("[status] Gateway is already running".to_string()); resume = true; + resume_container_exists = true; } else { log("[status] Resuming gateway from existing state".to_string()); resume = true; + resume_container_exists = existing.container_exists; } } // Ensure the image is available on the target Docker daemon. - // On resume the existing container already has its image — skip the - // pull to avoid failures when the original image tag (e.g. a local-only + // When both the container and volume exist we can skip the pull entirely + // — the container already references a valid local image. This avoids + // failures when the original image tag (e.g. a local-only // `openshell/cluster:dev`) is not available from the default registry. - if !resume { + // + // When only the volume survives (container was removed), we still need + // the image to recreate the container, so the pull must happen. + let need_image = !resume || !resume_container_exists; + if need_image { if remote_opts.is_some() { log("[status] Downloading gateway".to_string()); let on_log_clone = Arc::clone(&on_log);