Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions crates/openshell-bootstrap/src/docker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,34 @@ pub async fn ensure_container(
}
}

// On Tegra platforms (Jetson) the NVIDIA container toolkit and CDI spec
// generation reads host-file injection config from
// /etc/nvidia-container-runtime/host-files-for-container.d on the host.
// Without this bind mount, the device plugin inside k3s cannot discover
// Tegra GPU devices and fails with "CDI options are only supported on
// NVML-based systems".
//
// We detect Tegra by querying the Docker daemon's kernel version (which
// works for both local and remote/SSH deploys) rather than checking the
// local filesystem.
if !device_ids.is_empty() {
let info = docker.info().await.into_diagnostic()?;
let is_tegra = info
.kernel_version
.as_deref()
.map_or(false, |k| k.contains("tegra"));
if is_tegra {
const HOST_FILES_DIR: &str = "/etc/nvidia-container-runtime/host-files-for-container.d";
tracing::info!(
kernel_version = info.kernel_version.as_deref().unwrap_or("unknown"),
"Detected Tegra platform, bind-mounting {HOST_FILES_DIR} for CDI spec generation"
);
let mut binds = host_config.binds.take().unwrap_or_default();
binds.push(format!("{HOST_FILES_DIR}:{HOST_FILES_DIR}:ro"));
host_config.binds = Some(binds);
}
}

let mut cmd = vec![
"server".to_string(),
"--disable=traefik".to_string(),
Expand Down
39 changes: 39 additions & 0 deletions crates/openshell-sandbox/src/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,22 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> {
target_os = "redox"
)))]
{
let cdi_gids = snapshot_cdi_gids();
nix::unistd::initgroups(user_cstr.as_c_str(), group.gid).into_diagnostic()?;
if !cdi_gids.is_empty() {
let mut merged: Vec<nix::unistd::Gid> =
nix::unistd::getgroups().unwrap_or_default();
for gid in &cdi_gids {
if !merged.contains(gid) {
merged.push(*gid);
}
}
tracing::info!(
gids = ?cdi_gids.iter().map(|g| g.as_raw()).collect::<Vec<_>>(),
"Preserving CDI-injected supplementary GIDs across initgroups"
);
nix::unistd::setgroups(&merged).into_diagnostic()?;
}
}
}

Expand Down Expand Up @@ -458,6 +473,30 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> {
Ok(())
}

/// Snapshot supplementary GIDs injected by the container runtime (e.g. via CDI
/// `additionalGids`) before `initgroups` replaces them.
///
/// Only captures GIDs when GPU devices are present — on non-GPU sandboxes the
/// runtime won't inject device-access GIDs so there is nothing to preserve.
/// GID 0 (root) is always excluded to avoid inadvertent privilege retention.
///
/// GPU presence is detected via device nodes: `/dev/nvidiactl` on discrete
/// (NVML-based) systems, and `/dev/nvgpu` / `/dev/nvhost-ctrl` / `/dev/nvmap`
/// on integrated Tegra platforms (Jetson), where `/dev/nvidiactl` does not
/// exist. Checking only `/dev/nvidiactl` would make this a silent no-op on
/// Tegra — the platform that motivated the GID-preservation logic.
#[cfg(not(any(
    target_os = "macos",
    target_os = "ios",
    target_os = "haiku",
    target_os = "redox"
)))]
fn snapshot_cdi_gids() -> Vec<nix::unistd::Gid> {
    // Discrete GPUs expose /dev/nvidiactl; Tegra iGPUs expose nvgpu/nvhost
    // nodes instead. Any one of these present means the runtime may have
    // injected device-access GIDs worth preserving.
    // NOTE(review): node list assumes standard L4T/NVML device naming —
    // confirm against the CDI specs generated on target hardware.
    const GPU_DEVICE_NODES: &[&str] = &[
        "/dev/nvidiactl",
        "/dev/nvgpu",
        "/dev/nvhost-ctrl",
        "/dev/nvmap",
    ];
    let has_gpu = GPU_DEVICE_NODES
        .iter()
        .any(|p| std::path::Path::new(p).exists());
    if !has_gpu {
        return Vec::new();
    }
    // Never carry root group membership across the privilege drop.
    let root_gid = nix::unistd::Gid::from_raw(0);
    nix::unistd::getgroups()
        .unwrap_or_default()
        .into_iter()
        .filter(|&g| g != root_gid)
        .collect()
}

/// Process exit status.
#[derive(Debug, Clone, Copy)]
pub struct ProcessStatus {
Expand Down
2 changes: 1 addition & 1 deletion deploy/docker/Dockerfile.images
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ ARG K3S_VERSION=v1.35.2-k3s1
ARG K3S_DIGEST=sha256:c3184157c3048112bab0c3e17405991da486cb3413511eba23f7650efd70776b
ARG K9S_VERSION=v0.50.18
ARG HELM_VERSION=v3.17.3
ARG NVIDIA_CONTAINER_TOOLKIT_VERSION=1.18.2-1
ARG NVIDIA_CONTAINER_TOOLKIT_VERSION=1.19.0-1

# ---------------------------------------------------------------------------
# Shared Rust build stages
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ spec:
targetNamespace: nvidia-device-plugin
createNamespace: true
valuesContent: |-
image:
repository: ghcr.io/nvidia/k8s-device-plugin
tag: "2ab68c16"
runtimeClassName: nvidia
deviceListStrategy: cdi-cri
deviceIDStrategy: index
Expand Down
10 changes: 6 additions & 4 deletions e2e/python/test_sandbox_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@ def test_gpu_sandbox_reports_available_gpu(
sandbox: Callable[..., Sandbox],
gpu_sandbox_spec: datamodel_pb2.SandboxSpec,
) -> None:
nvidia_smi_args = ["--query-gpu=name", "--format=csv,noheader"]
with sandbox(spec=gpu_sandbox_spec, delete_on_exit=True) as sb:
result = sb.exec(
["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
timeout_seconds=30,
)
result = sb.exec(["nvidia-smi", *nvidia_smi_args], timeout_seconds=30)
if result.exit_code != 0:
# On some platforms (e.g. Tegra/Jetson) nvidia-smi lives in
# /usr/sbin rather than /usr/bin and may not be on PATH.
result = sb.exec(["/usr/sbin/nvidia-smi", *nvidia_smi_args], timeout_seconds=30)

assert result.exit_code == 0, result.stderr
assert result.stdout.strip()
Loading