From d5d7f0ac7aeed65478af781a0364e70c3e6c8e09 Mon Sep 17 00:00:00 2001
From: Tariq Ibrahim
Date: Mon, 1 Dec 2025 13:12:27 -0800
Subject: [PATCH] [state-toolkit] add support for mounting runtime NRI sockets

Signed-off-by: Tariq Ibrahim

add nri plugin annotation instead of setting nvidia runtimeclass

Signed-off-by: Tariq Ibrahim
---
 api/nvidia/v1/clusterpolicy_types.go          | 17 +++++
 api/nvidia/v1/zz_generated.deepcopy.go        |  5 ++
 .../manifests/nvidia.com_clusterpolicies.yaml |  6 ++
 .../crd/bases/nvidia.com_clusterpolicies.yaml |  6 ++
 controllers/object_controls.go                | 67 +++++++++++++++--
 controllers/transforms_test.go                | 72 ++++++++++++++++++-
 .../crds/nvidia.com_clusterpolicies.yaml      |  6 ++
 .../gpu-operator/templates/clusterpolicy.yaml |  3 +
 deployments/gpu-operator/values.yaml          |  1 +
 .../manifests/cuda-workload-validation.yaml   |  2 +
 10 files changed, 179 insertions(+), 6 deletions(-)

diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go
index ea4b21d86..121493c7d 100644
--- a/api/nvidia/v1/clusterpolicy_types.go
+++ b/api/nvidia/v1/clusterpolicy_types.go
@@ -1722,6 +1722,14 @@ type CDIConfigSpec struct {
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Deprecated: This field is no longer used"
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch,urn:alm:descriptor:com.tectonic.ui:hidden"
 	Default *bool `json:"default,omitempty"`
+
+	// NRIPluginEnabled indicates whether an NRI plugin should be run as a means of injecting CDI devices into GPU management containers.
+	// +kubebuilder:validation:Optional
+	// +kubebuilder:default=false
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NRI as an additional mechanism for injecting CDI devices into GPU management containers."
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
+	NRIPluginEnabled *bool `json:"nriPluginEnabled,omitempty"`
 }
 
 // MIGStrategy indicates MIG mode
@@ -2176,6 +2184,15 @@ func (c *CDIConfigSpec) IsEnabled() bool {
 	return *c.Enabled
 }
 
+// IsNRIPluginEnabled returns true if the NRI plugin is enabled as a mechanism for
+// injecting CDI devices into containers
+func (c *CDIConfigSpec) IsNRIPluginEnabled() bool {
+	if c.Enabled == nil || c.NRIPluginEnabled == nil {
+		return false
+	}
+	return *c.Enabled && *c.NRIPluginEnabled
+}
+
 // IsEnabled returns true if Kata Manager is enabled
 func (k *KataManagerSpec) IsEnabled() bool {
 	if k.Enabled == nil {
diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go
index 9e68fdb37..3bb70f073 100644
--- a/api/nvidia/v1/zz_generated.deepcopy.go
+++ b/api/nvidia/v1/zz_generated.deepcopy.go
@@ -82,6 +82,11 @@ func (in *CDIConfigSpec) DeepCopyInto(out *CDIConfigSpec) {
 		*out = new(bool)
 		**out = **in
 	}
+	if in.NRIPluginEnabled != nil {
+		in, out := &in.NRIPluginEnabled, &out.NRIPluginEnabled
+		*out = new(bool)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CDIConfigSpec.
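Note that IsNRIPluginEnabled requires cdi.enabled as well, so nriPluginEnabled is strictly additive to CDI mode and has no effect on its own. For illustration, a minimal ClusterPolicy snippet exercising the new field (the instance name is illustrative, and only the cdi stanza is shown):

    apiVersion: nvidia.com/v1
    kind: ClusterPolicy
    metadata:
      name: cluster-policy
    spec:
      cdi:
        # Both flags must be true for IsNRIPluginEnabled() to return true.
        enabled: true
        nriPluginEnabled: true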
diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml
index 379e98d87..d82340856 100644
--- a/bundle/manifests/nvidia.com_clusterpolicies.yaml
+++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml
@@ -146,6 +146,12 @@ spec:
                 (CDI) should be used as the mechanism for making GPUs accessible
                 to containers.
               type: boolean
+            nriPluginEnabled:
+              default: false
+              description: NRIPluginEnabled indicates whether an NRI plugin
+                should be run as a means of injecting CDI devices into GPU management
+                containers.
+              type: boolean
           type: object
       daemonsets:
         description: Daemonset defines common configuration for all Daemonsets
diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml
index 379e98d87..d82340856 100644
--- a/config/crd/bases/nvidia.com_clusterpolicies.yaml
+++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml
@@ -146,6 +146,12 @@ spec:
                 (CDI) should be used as the mechanism for making GPUs accessible
                 to containers.
               type: boolean
+            nriPluginEnabled:
+              default: false
+              description: NRIPluginEnabled indicates whether an NRI plugin
+                should be run as a means of injecting CDI devices into GPU management
+                containers.
+              type: boolean
           type: object
       daemonsets:
         description: Daemonset defines common configuration for all Daemonsets
diff --git a/controllers/object_controls.go b/controllers/object_controls.go
index bbd2f740a..fe7e9df4b 100644
--- a/controllers/object_controls.go
+++ b/controllers/object_controls.go
@@ -66,6 +66,8 @@ const (
 	DefaultDockerConfigFile = "/etc/docker/daemon.json"
 	// DefaultDockerSocketFile indicates default docker socket file
 	DefaultDockerSocketFile = "/var/run/docker.sock"
+	// DefaultRuntimeNRISocketFile indicates the default container runtime NRI socket file
+	DefaultRuntimeNRISocketFile = "/var/run/nri/nri.sock"
 	// DefaultCRIOConfigFile indicates default config file path for cri-o
 	DefaultCRIOConfigFile = "/etc/crio/config.toml"
 	// DefaultCRIODropInConfigFile indicates the default path to the drop-in config file for cri-o
@@ -82,9 +84,11 @@ const (
 	DefaultRuntimeClass = "nvidia"
 	// DriverInstallPathVolName represents volume name for driver install path provided to toolkit
 	DriverInstallPathVolName = "driver-install-path"
-	// DefaultRuntimeSocketTargetDir represents target directory where runtime socket dirctory will be mounted
+	// DefaultRuntimeNRISocketTargetDir represents target directory where runtime NRI socket directory will be mounted
+	DefaultRuntimeNRISocketTargetDir = "/runtime/nri-sock-dir/"
+	// DefaultRuntimeSocketTargetDir represents target directory where runtime socket directory will be mounted
 	DefaultRuntimeSocketTargetDir = "/runtime/sock-dir/"
-	// DefaultRuntimeConfigTargetDir represents target directory where runtime socket dirctory will be mounted
+	// DefaultRuntimeConfigTargetDir represents target directory where runtime config directory will be mounted
 	DefaultRuntimeConfigTargetDir = "/runtime/config-dir/"
 	// DefaultRuntimeDropInConfigTargetDir represents target directory where drop-in config directory will be mounted
 	DefaultRuntimeDropInConfigTargetDir = "/runtime/config-dir.d/"
@@ -144,6 +148,8 @@ const (
 	NvidiaCDIHookPathEnvName = "NVIDIA_CDI_HOOK_PATH"
 	// CRIOConfigModeEnvName is the name of the envvar controlling how the toolkit container updates the cri-o configuration
 	CRIOConfigModeEnvName = "CRIO_CONFIG_MODE"
+	// CDIEnableNRIPlugin is the name of the envvar for enabling the NRI plugin in the toolkit container
+	CDIEnableNRIPlugin = "ENABLE_NRI_PLUGIN"
 	// DeviceListStrategyEnvName is the name of the envvar for configuring the device-list-strategy in the device-plugin
 	DeviceListStrategyEnvName = "DEVICE_LIST_STRATEGY"
 	// CDIAnnotationPrefixEnvName is the name of the device-plugin envvar for configuring the CDI annotation prefix
@@ -175,6 +181,8 @@ const (
 	DriverInstallDirCtrPathEnvName = "DRIVER_INSTALL_DIR_CTR_PATH"
 	// NvidiaRuntimeSetAsDefaultEnvName is the name of the toolkit container env for configuring NVIDIA Container Runtime as the default runtime
 	NvidiaRuntimeSetAsDefaultEnvName = "NVIDIA_RUNTIME_SET_AS_DEFAULT"
+	// NRIAnnotationDomain represents the domain name used in NRI annotations for CDI device injection
+	NRIAnnotationDomain = "nvidia.cdi.k8s.io"
 )
 
 // ContainerProbe defines container probe types
@@ -946,6 +954,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
 	}
 
 	setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
+	setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, obj.Spec.Template.Spec.Containers[0].Name)
 
 	// update env required for MIG support
 	applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)
@@ -953,6 +962,23 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
 	return nil
 }
 
+func setNRIPluginAnnotation(o *metav1.ObjectMeta, cdiConfig *gpuv1.CDIConfigSpec, containerName string) {
+	const (
+		managementCDIDevice = "management.nvidia.com/gpu=all"
+	)
+
+	if !cdiConfig.IsNRIPluginEnabled() {
+		return
+	}
+	annotations := o.Annotations
+	if len(annotations) == 0 {
+		annotations = make(map[string]string)
+	}
+	annotationKey := fmt.Sprintf("%s/container.%s", NRIAnnotationDomain, containerName)
+	annotations[annotationKey] = managementCDIDevice
+	o.Annotations = annotations
+}
+
 // parseOSRelease can be overridden in tests for mocking filesystem access.
// In production, it reads and parses /host-etc/os-release. var parseOSRelease = parseOSReleaseFromFile @@ -1238,7 +1264,7 @@ func getProxyEnv(proxyConfig *apiconfigv1.Proxy) []corev1.EnvVar { return envVars } -func transformToolkitCtrForCDI(container *corev1.Container) { +func transformToolkitCtrForCDI(container *corev1.Container, nriPluginEnabled bool) { // When CDI is enabled in GPU Operator, we leverage native CDI support in containerd / cri-o // to inject GPUs into workloads. We do not configure 'nvidia' as the default runtime. The // 'nvidia' runtime will be set as the runtime class for our management containers so that @@ -1252,6 +1278,10 @@ func transformToolkitCtrForCDI(container *corev1.Container) { setContainerEnv(container, NvidiaRuntimeSetAsDefaultEnvName, "false") setContainerEnv(container, NvidiaCtrRuntimeModeEnvName, "cdi") setContainerEnv(container, CRIOConfigModeEnvName, "config") + + if nriPluginEnabled { + setContainerEnv(container, CDIEnableNRIPlugin, "true") + } } // TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy @@ -1293,7 +1323,7 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n // update env required for CDI support if config.CDI.IsEnabled() { - transformToolkitCtrForCDI(toolkitMainContainer) + transformToolkitCtrForCDI(toolkitMainContainer, config.CDI.IsNRIPluginEnabled()) } else if n.runtime == gpuv1.CRIO { // (cdesiniotis) When CDI is not enabled and cri-o is the container runtime, // we continue to install the OCI prestart hook as opposed to adding nvidia @@ -1464,6 +1494,23 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, socketVol := corev1.Volume{Name: volMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(runtimeSocketFile)}}} obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, socketVol) } + + if config.CDI.IsNRIPluginEnabled() { + // setup mounts for the runtime NRI socket file + nriSocketFile := getContainerEnv(container, "NRI_SOCKET") + if nriSocketFile == "" { + nriSocketFile = DefaultRuntimeNRISocketFile + } + setContainerEnv(container, "NRI_SOCKET", DefaultRuntimeNRISocketTargetDir+path.Base(nriSocketFile)) + + nriVolMountSocketName := "nri-socket" + nriVolMountSocket := corev1.VolumeMount{Name: nriVolMountSocketName, MountPath: DefaultRuntimeNRISocketTargetDir} + container.VolumeMounts = append(container.VolumeMounts, nriVolMountSocket) + + nriSocketVol := corev1.Volume{Name: nriVolMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(nriSocketFile), Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}} + obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, nriSocketVol) + } + return nil } @@ -1536,6 +1583,7 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, devicePluginContainerName) // update env required for MIG support applyMIGConfiguration(devicePluginMainContainer, config.MIG.Strategy) @@ -1616,6 +1664,7 @@ func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic } setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, mpsControlMainContainer.Name) // update env required for MIG support 
applyMIGConfiguration(mpsControlMainContainer, config.MIG.Strategy) @@ -1730,6 +1779,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe obj.Spec.Template.Spec.DNSPolicy = corev1.DNSClusterFirstWithHostNet } + setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, obj.Spec.Template.Spec.Containers[0].Name) setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) // set hostPID if specified for DCGM Exporter @@ -1880,6 +1930,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu } } + setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, obj.Spec.Template.Spec.Containers[0].Name) setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) return nil @@ -1923,6 +1974,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, } setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, obj.Spec.Template.Spec.Containers[0].Name) // set ConfigMap name for "mig-parted-config" Volume for i, vol := range obj.Spec.Template.Spec.Volumes { @@ -2218,6 +2270,11 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + toolkitValidationCtr := findContainerByName(obj.Spec.Template.Spec.InitContainers, "toolkit-validation") + if toolkitValidationCtr != nil && len(toolkitValidationCtr.Name) > 0 { + setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, toolkitValidationCtr.Name) + } + var validatorErr error // apply changes for individual component validators(initContainers) components := []string{ @@ -2573,7 +2630,7 @@ func getRuntimeClassName(config *gpuv1.ClusterPolicySpec) string { } func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec, runtime gpuv1.Runtime) { - if !config.CDI.IsEnabled() && runtime == gpuv1.CRIO { + if !config.CDI.IsEnabled() && runtime == gpuv1.CRIO || config.CDI.IsNRIPluginEnabled() { return } runtimeClassName := getRuntimeClassName(config) diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index ef2923452..986308a16 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -2559,6 +2559,55 @@ func TestTransformValidator(t *testing.T) { WithPullSecret("pull-secret"). WithRuntimeClassName("nvidia"), }, + { + description: "nri plugin enabled", + ds: NewDaemonset(). + WithInitContainer(corev1.Container{Name: "toolkit-validation"}). + WithContainer(corev1.Container{ + Name: "dummy", + Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }). + WithPullSecret("pull-secret"), + cpSpec: &gpuv1.ClusterPolicySpec{ + Validator: gpuv1.ValidatorSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gpu-operator-validator", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{"pull-secret"}, + }, + CDI: gpuv1.CDIConfigSpec{ + Enabled: newBoolPtr(true), + NRIPluginEnabled: newBoolPtr(true), + }, + }, + expectedDs: NewDaemonset(). + WithPodAnnotations(map[string]string{ + "nvidia.cdi.k8s.io/container.toolkit-validation": "management.nvidia.com/gpu=all", + }). 
+				WithInitContainer(corev1.Container{
+					Name:            "toolkit-validation",
+					Image:           "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0",
+					ImagePullPolicy: corev1.PullIfNotPresent,
+					SecurityContext: &corev1.SecurityContext{
+						RunAsUser: rootUID,
+					},
+				},
+				).
+				WithContainer(corev1.Container{
+					Name:            "dummy",
+					Image:           "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0",
+					ImagePullPolicy: corev1.PullIfNotPresent,
+					SecurityContext: &corev1.SecurityContext{
+						RunAsUser: rootUID,
+					},
+				}).
+				WithPullSecret("pull-secret"),
+		},
 	}
 
 	for _, tc := range testCases {
@@ -2834,12 +2883,33 @@ func TestTransformToolkitCtrForCDI(t *testing.T) {
 				},
 			}),
 		},
+		{
+			description: "cdi and nri plugin enabled",
+			ds:          NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}),
+			cpSpec: &gpuv1.ClusterPolicySpec{
+				CDI: gpuv1.CDIConfigSpec{
+					Enabled:          newBoolPtr(true),
+					NRIPluginEnabled: newBoolPtr(true),
+				},
+			},
+			expectedDs: NewDaemonset().WithContainer(
+				corev1.Container{
+					Name: "main-ctr",
+					Env: []corev1.EnvVar{
+						{Name: CDIEnabledEnvName, Value: "true"},
+						{Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"},
+						{Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"},
+						{Name: CRIOConfigModeEnvName, Value: "config"},
+						{Name: CDIEnableNRIPlugin, Value: "true"},
+					},
+				}),
+		},
 	}
 
 	for _, tc := range testCases {
 		t.Run(tc.description, func(t *testing.T) {
 			mainContainer := &tc.ds.Spec.Template.Spec.Containers[0]
-			transformToolkitCtrForCDI(mainContainer)
+			transformToolkitCtrForCDI(mainContainer, tc.cpSpec.CDI.IsNRIPluginEnabled())
 			require.EqualValues(t, tc.expectedDs, tc.ds)
 		})
 	}
diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml
index 379e98d87..d82340856 100644
--- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml
+++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml
@@ -146,6 +146,12 @@ spec:
                 (CDI) should be used as the mechanism for making GPUs accessible
                 to containers.
               type: boolean
+            nriPluginEnabled:
+              default: false
+              description: NRIPluginEnabled indicates whether an NRI plugin
+                should be run as a means of injecting CDI devices into GPU management
+                containers.
+ type: boolean type: object daemonsets: description: Daemonset defines common configuration for all Daemonsets diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index 33efec386..ae8bfd1fb 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -152,6 +152,9 @@ spec: {{- if .Values.cdi.default }} default: {{ .Values.cdi.default }} {{- end }} + {{- if and (.Values.cdi.enabled) (.Values.cdi.nriPluginEnabled) }} + nriPluginEnabled: {{ .Values.cdi.nriPluginEnabled }} + {{- end }} driver: enabled: {{ .Values.driver.enabled }} useNvidiaDriverCRD: {{ .Values.driver.nvidiaDriverCRD.enabled }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 14129a68d..73903bc70 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -14,6 +14,7 @@ psa: cdi: enabled: true + nriPluginEnabled: false sandboxWorkloads: enabled: false diff --git a/validator/manifests/cuda-workload-validation.yaml b/validator/manifests/cuda-workload-validation.yaml index 11aca2a5a..fa47df5f7 100644 --- a/validator/manifests/cuda-workload-validation.yaml +++ b/validator/manifests/cuda-workload-validation.yaml @@ -1,6 +1,8 @@ apiVersion: v1 kind: Pod metadata: + annotations: + nvidia.cdi.k8s.io/container.cuda-validation: "management.nvidia.com/gpu=all" labels: app: nvidia-cuda-validator generateName: nvidia-cuda-validator-
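Taken together, when cdi.enabled and cdi.nriPluginEnabled are both true, the transforms above (a) annotate each management pod for NRI-based CDI injection instead of setting the nvidia runtime class, and (b) mount the runtime's NRI socket into the toolkit container. A rough sketch of the resulting objects, assuming the defaults defined in object_controls.go and an illustrative container name of nvidia-device-plugin:

    # Pod template metadata written by setNRIPluginAnnotation:
    metadata:
      annotations:
        nvidia.cdi.k8s.io/container.nvidia-device-plugin: management.nvidia.com/gpu=all

    # Toolkit container wiring added by transformForRuntime:
    env:
    - name: NRI_SOCKET
      value: /runtime/nri-sock-dir/nri.sock  # target dir + base name of the host socket path
    volumeMounts:
    - name: nri-socket
      mountPath: /runtime/nri-sock-dir/
    volumes:
    - name: nri-socket
      hostPath:
        path: /var/run/nri  # path.Dir(DefaultRuntimeNRISocketFile)
        type: DirectoryOrCreate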