Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 200 additions & 0 deletions nvidia-driver-installer/cos/daemonset-confidential-latest.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
# Copyright 2025 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The Dockerfile and other source for this daemonset are in
# https://cos.googlesource.com/cos/tools/+/refs/heads/master/src/cmd/cos_gpu_installer/
#
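# This is based on ../../daemonset.yaml, except that it assumes that the
# docker image is already present on the node instead of downloading it from
# GCR. This allows easier upgrades because GKE can preload the correct image
# on the node and the daemonset can just use that image.
#
# This variant targets confidential GPU nodes: it only schedules onto nodes
# labelled cloud.google.com/gke-confidential-nodes-instance-type=TDX and
# installs the latest driver version (--version=latest).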

# DaemonSet "nvidia-driver-installer" in the kube-system namespace.
# NOTE(review): every line in this copy sits at column 0; the nesting must be
# restored before the manifest can be parsed or applied.
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-driver-installer
namespace: kube-system
labels:
k8s-app: nvidia-driver-installer
spec:
# The selector matches the k8s-app label set on the pod template below.
selector:
matchLabels:
k8s-app: nvidia-driver-installer
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: nvidia-driver-installer
k8s-app: nvidia-driver-installer
spec:
priorityClassName: system-node-critical
# Required node affinity: the expressions listed under matchExpressions are
# combined with AND, so a node must satisfy all of them to be eligible.
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
# The node must carry the gke-accelerator label (any value).
- key: cloud.google.com/gke-accelerator
operator: Exists
# The node must NOT carry a gke-gpu-driver-version label.
- key: cloud.google.com/gke-gpu-driver-version
operator: DoesNotExist
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you tested that the LATEST driver works on cGPU?

# The node's confidential-nodes instance type label must be TDX.
- key: cloud.google.com/gke-confidential-nodes-instance-type
operator: In
values:
- TDX
# A toleration with operator Exists and no key tolerates every taint.
tolerations:
- operator: "Exists"
# The pod shares the host's network and PID namespaces.
hostNetwork: true
hostPID: true
# hostPath volumes exposing host directories to the containers of this pod.
volumes:
# Host /dev.
- name: dev
hostPath:
path: /dev
# Vulkan ICD directory inside the host-side NVIDIA install directory.
- name: vulkan-icd-mount
hostPath:
path: /home/kubernetes/bin/nvidia/vulkan/icd.d
# Host-side NVIDIA install directory.
- name: nvidia-install-dir-host
hostPath:
path: /home/kubernetes/bin/nvidia
# The entire host root filesystem.
- name: root-mount
hostPath:
path: /
- name: cos-tools
hostPath:
path: /var/lib/cos-tools
# Host /etc/nvidia; the installer script writes confidential_node_type.txt here.
- name: nvidia-config
hostPath:
path: /etc/nvidia
# Init container that runs the driver installation script given in `command`.
initContainers:
# imagePullPolicy Never: the image is never pulled, so it has to be present on
# the node already (see the header comment of this file).
- image: "cos-nvidia-installer:fixed"
imagePullPolicy: Never
name: nvidia-driver-installer
resources:
requests:
cpu: 150m
# Runs privileged; the script below loads kernel modules with insmod/modprobe.
securityContext:
privileged: true
# Each *_HOST value equals the hostPath of a volume in this pod spec and each
# *_CONTAINER value equals the mountPath of that volume in volumeMounts below.
# NOTE(review): presumably consumed by /cos-gpu-installer; the script itself
# does not reference these variables.
env:
- name: NVIDIA_INSTALL_DIR_HOST
value: /home/kubernetes/bin/nvidia
- name: NVIDIA_INSTALL_DIR_CONTAINER
value: /usr/local/nvidia
- name: VULKAN_ICD_DIR_HOST
value: /home/kubernetes/bin/nvidia/vulkan/icd.d
- name: VULKAN_ICD_DIR_CONTAINER
value: /etc/vulkan/icd.d
# Matches the mountPath of the root-mount volume (host / mounted at /root).
- name: ROOT_MOUNT_DIR
value: /root
- name: COS_TOOLS_DIR_HOST
value: /var/lib/cos-tools
- name: COS_TOOLS_DIR_CONTAINER
value: /build/cos-tools
volumeMounts:
- name: nvidia-install-dir-host
mountPath: /usr/local/nvidia
- name: vulkan-icd-mount
mountPath: /etc/vulkan/icd.d
- name: dev
mountPath: /dev
- name: root-mount
mountPath: /root
- name: cos-tools
mountPath: /build/cos-tools
- name: nvidia-config
mountPath: /etc/nvidia
# Runs `bash -c <script>`; the script is the literal block scalar that follows.
command:
- bash
- -c
- |
# Driver installation script for the nvidia-driver-installer init container.
#
# 1. Reads the node's kube-labels from the GCE metadata server and records the
#    confidential-nodes instance type in /etc/nvidia/confidential_node_type.txt.
# 2. Exits successfully if an nvidia kernel module is already loaded.
# 3. On TDX nodes, installs the latest driver and loads its kernel modules;
#    on any other node type, skips the installation.
echo "Checking for existing GPU driver modules"
# `|| exit 1` has to follow the command substitution: placed inside `$( ... )`
# it would only terminate the subshell and the script would carry on with an
# empty LABELS value after a failed metadata fetch.
LABELS=$( curl --retry 5 -H "Metadata-Flavor:Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/kube-labels ) || exit 1
# kube-labels is a comma-separated list of key=value pairs. $LABELS is split on
# "," once, when the for loop starts; IFS is then switched to "=" so that read
# splits each pair into LABEL and VALUE.
IFS=,; for label in $LABELS; do
  IFS==; read -r LABEL VALUE <<< "$label"
  if [[ "${LABEL}" == "cloud.google.com/gke-confidential-nodes-instance-type" ]]; then
    CONFIDENTIAL_INSTANCE_TYPE=$VALUE
    echo "${CONFIDENTIAL_INSTANCE_TYPE}" > /etc/nvidia/confidential_node_type.txt
  fi
done
if lsmod | grep nvidia; then
  echo "GPU driver is already installed, the installed version may or may not be the driver version being tried to install, skipping installation"
  exit 0
else
  if [[ "${CONFIDENTIAL_INSTANCE_TYPE}" == "TDX" ]]; then
    echo "No GPU driver module detected, installing now"
    /cos-gpu-installer install --no-verify --version=latest || exit 1
    # Load the kernel modules one by one. As before, a failure of one command
    # does not stop the following ones. All tools are invoked through absolute
    # paths so the script does not depend on the working directory being "/".
    /sbin/modprobe -d /root drm_kms_helper
    /sbin/insmod /usr/local/nvidia/drivers/nvidia.ko
    /sbin/insmod /usr/local/nvidia/drivers/nvidia-uvm.ko
    /sbin/insmod /usr/local/nvidia/drivers/nvidia-modeset.ko
    /sbin/insmod /usr/local/nvidia/drivers/nvidia-drm.ko
    /usr/local/nvidia/bin/nvidia-modprobe -c0 -u -m
    # /root is the host root filesystem mount, so this changes the mode of the
    # host-side install directory /home/kubernetes/bin/nvidia.
    chmod 755 /root/home/kubernetes/bin/nvidia
  else
    echo "Confidential GPU is not supported on this VM, skipping driver installation"
  fi
fi
# Second init container; its image is pinned by sha256 digest.
- image: "gcr.io/gke-release/nvidia-persistenced-installer@sha256:e875101ea7bddcef6e628359e3a8f02fdfbcd05f6efe75bc7ad9457ef4020a04"
name: "nvidia-persistenced-installer"
# restartPolicy Always on an init container makes it a Kubernetes sidecar
# container: it keeps running while the later containers start.
restartPolicy: Always
securityContext:
privileged: true
env:
# Library search path inside the mounted driver install directory.
- name: LD_LIBRARY_PATH
value: /usr/local/nvidia/lib64
# Same host/container path pairs as the nvidia-driver-installer container.
- name: NVIDIA_INSTALL_DIR_HOST
value: /home/kubernetes/bin/nvidia
- name: NVIDIA_INSTALL_DIR_CONTAINER
value: /usr/local/nvidia
- name: VULKAN_ICD_DIR_HOST
value: /home/kubernetes/bin/nvidia/vulkan/icd.d
- name: VULKAN_ICD_DIR_CONTAINER
value: /etc/vulkan/icd.d
- name: ROOT_MOUNT_DIR
value: /root
- name: COS_TOOLS_DIR_HOST
value: /var/lib/cos-tools
- name: COS_TOOLS_DIR_CONTAINER
value: /build/cos-tools
volumeMounts:
# HostToContainer propagation: mounts created on the host under these paths
# after the container starts become visible inside the container.
- name: nvidia-install-dir-host
mountPath: /usr/local/nvidia
mountPropagation: HostToContainer
- name: root-mount
mountPath: /root
mountPropagation: HostToContainer
- name: nvidia-config
mountPath: /etc/nvidia
mountPropagation: HostToContainer
# The remaining mounts do not set mountPropagation.
- name: vulkan-icd-mount
mountPath: /etc/vulkan/icd.d
- name: dev
mountPath: /dev
- name: cos-tools
mountPath: /build/cos-tools
# Third init container, running the nvidia-partition-gpu image (pinned by
# sha256 digest). No command is set here, so the image's own entrypoint runs.
- image: "gcr.io/gke-release/nvidia-partition-gpu@sha256:116be6b7335c1d34366223b9a3780fe80d862fcf06cd2c580426fdc1697af693"
name: partition-gpus
env:
# Library search path inside the mounted driver install directory.
- name: LD_LIBRARY_PATH
value: /usr/local/nvidia/lib64
resources:
requests:
cpu: 150m
securityContext:
privileged: true
# Mounts the driver install directory, host /dev and host /etc/nvidia.
volumeMounts:
- name: nvidia-install-dir-host
mountPath: /usr/local/nvidia
- name: dev
mountPath: /dev
- name: nvidia-config
mountPath: /etc/nvidia
# The pod's only regular container is a pause image, pinned by tag and sha256
# digest; the installation steps are all defined as init containers.
containers:
- image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
name: pause
8 changes: 0 additions & 8 deletions nvidia-driver-installer/cos/daemonset-confidential.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -126,14 +126,6 @@ spec:
echo "${CONFIDENTIAL_INSTANCE_TYPE}" > /etc/nvidia/confidential_node_type.txt
fi
done
LABELS=$( curl --retry 5 -H "Metadata-Flavor:Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/kube-labels || exit 1 )
IFS=,; for label in $LABELS; do
IFS==; read -r LABEL VALUE <<< "$label"
if [[ "${LABEL}" == "cloud.google.com/gke-confidential-nodes-instance-type" ]]; then
CONFIDENTIAL_INSTANCE_TYPE=$VALUE
echo "${CONFIDENTIAL_INSTANCE_TYPE}" > /etc/nvidia/confidential_node_type.txt
fi
done
if lsmod | grep nvidia; then
echo "GPU driver is already installed, the installed version may or may not be the driver version being tried to install, skipping installation"
exit 0
Expand Down