From d5d7f0ac7aeed65478af781a0364e70c3e6c8e09 Mon Sep 17 00:00:00 2001
From: Tariq Ibrahim
Date: Mon, 1 Dec 2025 13:12:27 -0800
Subject: [PATCH] [state-toolkit] add support for mounting runtime NRI sockets

Signed-off-by: Tariq Ibrahim

add nri plugin annotation instead of setting nvidia runtimeclass

Signed-off-by: Tariq Ibrahim
---
 api/nvidia/v1/clusterpolicy_types.go          | 17 +++++
 api/nvidia/v1/zz_generated.deepcopy.go        |  5 ++
 .../manifests/nvidia.com_clusterpolicies.yaml |  6 ++
 .../crd/bases/nvidia.com_clusterpolicies.yaml |  6 ++
 controllers/object_controls.go                | 67 +++++++++++++++--
 controllers/transforms_test.go                | 72 ++++++++++++++++++-
 .../crds/nvidia.com_clusterpolicies.yaml      |  6 ++
 .../gpu-operator/templates/clusterpolicy.yaml |  3 +
 deployments/gpu-operator/values.yaml          |  1 +
 .../manifests/cuda-workload-validation.yaml   |  2 +
 10 files changed, 179 insertions(+), 6 deletions(-)

diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go
index ea4b21d86..121493c7d 100644
--- a/api/nvidia/v1/clusterpolicy_types.go
+++ b/api/nvidia/v1/clusterpolicy_types.go
@@ -1722,6 +1722,14 @@ type CDIConfigSpec struct {
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Deprecated: This field is no longer used"
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch,urn:alm:descriptor:com.tectonic.ui:hidden"
 	Default *bool `json:"default,omitempty"`
+
+	// NRIPluginEnabled indicates whether an NRI plugin should be run as a means of injecting CDI devices into GPU management containers.
+	// +kubebuilder:validation:Optional
+	// +kubebuilder:default=false
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NRI as an additional mechanism for injecting CDI devices into GPU management containers."
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
+	NRIPluginEnabled *bool `json:"nriPluginEnabled,omitempty"`
 }
 
 // MIGStrategy indicates MIG mode
@@ -2176,6 +2184,15 @@ func (c *CDIConfigSpec) IsEnabled() bool {
 	return *c.Enabled
 }
 
+// IsNRIPluginEnabled returns true if the NRI plugin is enabled as a mechanism for
+// injecting CDI devices into containers
+func (c *CDIConfigSpec) IsNRIPluginEnabled() bool {
+	if c.Enabled == nil || c.NRIPluginEnabled == nil {
+		return false
+	}
+	return *c.Enabled && *c.NRIPluginEnabled
+}
+
 // IsEnabled returns true if Kata Manager is enabled
 func (k *KataManagerSpec) IsEnabled() bool {
 	if k.Enabled == nil {
diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go
index 9e68fdb37..3bb70f073 100644
--- a/api/nvidia/v1/zz_generated.deepcopy.go
+++ b/api/nvidia/v1/zz_generated.deepcopy.go
@@ -82,6 +82,11 @@ func (in *CDIConfigSpec) DeepCopyInto(out *CDIConfigSpec) {
 		*out = new(bool)
 		**out = **in
 	}
+	if in.NRIPluginEnabled != nil {
+		in, out := &in.NRIPluginEnabled, &out.NRIPluginEnabled
+		*out = new(bool)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CDIConfigSpec.
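Note that IsNRIPluginEnabled requires cdi.enabled as well, so nriPluginEnabled is strictly additive to CDI mode and has no effect on its own. For illustration, a minimal ClusterPolicy snippet exercising the new field (the instance name is illustrative, and only the cdi stanza is shown):

    apiVersion: nvidia.com/v1
    kind: ClusterPolicy
    metadata:
      name: cluster-policy
    spec:
      cdi:
        # Both flags must be true for IsNRIPluginEnabled() to return true.
        enabled: true
        nriPluginEnabled: true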
diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml
index 379e98d87..d82340856 100644
--- a/bundle/manifests/nvidia.com_clusterpolicies.yaml
+++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml
@@ -146,6 +146,12 @@ spec:
                 (CDI) should be used as the mechanism for making GPUs accessible
                 to containers.
               type: boolean
+            nriPluginEnabled:
+              default: false
+              description: NRIPluginEnabled indicates whether an NRI plugin
+                should be run as a means of injecting CDI devices into GPU management
+                containers.
+              type: boolean
           type: object
       daemonsets:
         description: Daemonset defines common configuration for all Daemonsets
diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml
index 379e98d87..d82340856 100644
--- a/config/crd/bases/nvidia.com_clusterpolicies.yaml
+++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml
@@ -146,6 +146,12 @@ spec:
                 (CDI) should be used as the mechanism for making GPUs accessible
                 to containers.
               type: boolean
+            nriPluginEnabled:
+              default: false
+              description: NRIPluginEnabled indicates whether an NRI plugin
+                should be run as a means of injecting CDI devices into GPU management
+                containers.
+              type: boolean
           type: object
       daemonsets:
         description: Daemonset defines common configuration for all Daemonsets
diff --git a/controllers/object_controls.go b/controllers/object_controls.go
index bbd2f740a..fe7e9df4b 100644
--- a/controllers/object_controls.go
+++ b/controllers/object_controls.go
@@ -66,6 +66,8 @@ const (
 	DefaultDockerConfigFile = "/etc/docker/daemon.json"
 	// DefaultDockerSocketFile indicates default docker socket file
 	DefaultDockerSocketFile = "/var/run/docker.sock"
+	// DefaultRuntimeNRISocketFile indicates the default container runtime NRI socket file
+	DefaultRuntimeNRISocketFile = "/var/run/nri/nri.sock"
 	// DefaultCRIOConfigFile indicates default config file path for cri-o
 	DefaultCRIOConfigFile = "/etc/crio/config.toml"
 	// DefaultCRIODropInConfigFile indicates the default path to the drop-in config file for cri-o
@@ -82,9 +84,11 @@ const (
 	DefaultRuntimeClass = "nvidia"
 	// DriverInstallPathVolName represents volume name for driver install path provided to toolkit
 	DriverInstallPathVolName = "driver-install-path"
-	// DefaultRuntimeSocketTargetDir represents target directory where runtime socket dirctory will be mounted
+	// DefaultRuntimeNRISocketTargetDir represents target directory where runtime NRI socket directory will be mounted
+	DefaultRuntimeNRISocketTargetDir = "/runtime/nri-sock-dir/"
+	// DefaultRuntimeSocketTargetDir represents target directory where runtime socket directory will be mounted
 	DefaultRuntimeSocketTargetDir = "/runtime/sock-dir/"
-	// DefaultRuntimeConfigTargetDir represents target directory where runtime socket dirctory will be mounted
+	// DefaultRuntimeConfigTargetDir represents target directory where runtime config directory will be mounted
 	DefaultRuntimeConfigTargetDir = "/runtime/config-dir/"
 	// DefaultRuntimeDropInConfigTargetDir represents target directory where drop-in config directory will be mounted
 	DefaultRuntimeDropInConfigTargetDir = "/runtime/config-dir.d/"
@@ -144,6 +148,8 @@ const (
 	NvidiaCDIHookPathEnvName = "NVIDIA_CDI_HOOK_PATH"
 	// CRIOConfigModeEnvName is the name of the envvar controlling how the toolkit container updates the cri-o configuration
 	CRIOConfigModeEnvName = "CRIO_CONFIG_MODE"
+	// CDIEnableNRIPlugin is the name of the envvar for enabling the NRI plugin in the toolkit container
+	CDIEnableNRIPlugin = "ENABLE_NRI_PLUGIN"
 	// DeviceListStrategyEnvName is the name of the envvar for configuring the device-list-strategy in the device-plugin
 	DeviceListStrategyEnvName = "DEVICE_LIST_STRATEGY"
 	// CDIAnnotationPrefixEnvName is the name of the device-plugin envvar for configuring the CDI annotation prefix
@@ -175,6 +181,8 @@ const (
 	DriverInstallDirCtrPathEnvName = "DRIVER_INSTALL_DIR_CTR_PATH"
 	// NvidiaRuntimeSetAsDefaultEnvName is the name of the toolkit container env for configuring NVIDIA Container Runtime as the default runtime
 	NvidiaRuntimeSetAsDefaultEnvName = "NVIDIA_RUNTIME_SET_AS_DEFAULT"
+	// NRIAnnotationDomain represents the domain name used in NRI annotations for CDI device injection
+	NRIAnnotationDomain = "nvidia.cdi.k8s.io"
 )
 
 // ContainerProbe defines container probe types
@@ -946,6 +954,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
 	}
 
 	setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
+	setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, obj.Spec.Template.Spec.Containers[0].Name)
 
 	// update env required for MIG support
 	applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)
@@ -953,6 +962,23 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
 	return nil
 }
 
+func setNRIPluginAnnotation(o *metav1.ObjectMeta, cdiConfig *gpuv1.CDIConfigSpec, containerName string) {
+	const (
+		managementCDIDevice = "management.nvidia.com/gpu=all"
+	)
+
+	if !cdiConfig.IsNRIPluginEnabled() {
+		return
+	}
+	annotations := o.Annotations
+	if len(annotations) == 0 {
+		annotations = make(map[string]string)
+	}
+	annotationKey := fmt.Sprintf("%s/container.%s", NRIAnnotationDomain, containerName)
+	annotations[annotationKey] = managementCDIDevice
+	o.Annotations = annotations
+}
+
 // parseOSRelease can be overridden in tests for mocking filesystem access.
// In production, it reads and parses /host-etc/os-release. var parseOSRelease = parseOSReleaseFromFile @@ -1238,7 +1264,7 @@ func getProxyEnv(proxyConfig *apiconfigv1.Proxy) []corev1.EnvVar { return envVars } -func transformToolkitCtrForCDI(container *corev1.Container) { +func transformToolkitCtrForCDI(container *corev1.Container, nriPluginEnabled bool) { // When CDI is enabled in GPU Operator, we leverage native CDI support in containerd / cri-o // to inject GPUs into workloads. We do not configure 'nvidia' as the default runtime. The // 'nvidia' runtime will be set as the runtime class for our management containers so that @@ -1252,6 +1278,10 @@ func transformToolkitCtrForCDI(container *corev1.Container) { setContainerEnv(container, NvidiaRuntimeSetAsDefaultEnvName, "false") setContainerEnv(container, NvidiaCtrRuntimeModeEnvName, "cdi") setContainerEnv(container, CRIOConfigModeEnvName, "config") + + if nriPluginEnabled { + setContainerEnv(container, CDIEnableNRIPlugin, "true") + } } // TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy @@ -1293,7 +1323,7 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n // update env required for CDI support if config.CDI.IsEnabled() { - transformToolkitCtrForCDI(toolkitMainContainer) + transformToolkitCtrForCDI(toolkitMainContainer, config.CDI.IsNRIPluginEnabled()) } else if n.runtime == gpuv1.CRIO { // (cdesiniotis) When CDI is not enabled and cri-o is the container runtime, // we continue to install the OCI prestart hook as opposed to adding nvidia @@ -1464,6 +1494,23 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, socketVol := corev1.Volume{Name: volMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(runtimeSocketFile)}}} obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, socketVol) } + + if config.CDI.IsNRIPluginEnabled() { + // setup mounts for the runtime NRI socket file + nriSocketFile := getContainerEnv(container, "NRI_SOCKET") + if nriSocketFile == "" { + nriSocketFile = DefaultRuntimeNRISocketFile + } + setContainerEnv(container, "NRI_SOCKET", DefaultRuntimeNRISocketTargetDir+path.Base(nriSocketFile)) + + nriVolMountSocketName := "nri-socket" + nriVolMountSocket := corev1.VolumeMount{Name: nriVolMountSocketName, MountPath: DefaultRuntimeNRISocketTargetDir} + container.VolumeMounts = append(container.VolumeMounts, nriVolMountSocket) + + nriSocketVol := corev1.Volume{Name: nriVolMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(nriSocketFile), Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}} + obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, nriSocketVol) + } + return nil } @@ -1536,6 +1583,7 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, devicePluginContainerName) // update env required for MIG support applyMIGConfiguration(devicePluginMainContainer, config.MIG.Strategy) @@ -1616,6 +1664,7 @@ func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic } setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, mpsControlMainContainer.Name) // update env required for MIG support 
applyMIGConfiguration(mpsControlMainContainer, config.MIG.Strategy) @@ -1730,6 +1779,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe obj.Spec.Template.Spec.DNSPolicy = corev1.DNSClusterFirstWithHostNet } + setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, obj.Spec.Template.Spec.Containers[0].Name) setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) // set hostPID if specified for DCGM Exporter @@ -1880,6 +1930,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu } } + setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, obj.Spec.Template.Spec.Containers[0].Name) setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) return nil @@ -1923,6 +1974,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, } setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, obj.Spec.Template.Spec.Containers[0].Name) // set ConfigMap name for "mig-parted-config" Volume for i, vol := range obj.Spec.Template.Spec.Volumes { @@ -2218,6 +2270,11 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + toolkitValidationCtr := findContainerByName(obj.Spec.Template.Spec.InitContainers, "toolkit-validation") + if toolkitValidationCtr != nil && len(toolkitValidationCtr.Name) > 0 { + setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, toolkitValidationCtr.Name) + } + var validatorErr error // apply changes for individual component validators(initContainers) components := []string{ @@ -2573,7 +2630,7 @@ func getRuntimeClassName(config *gpuv1.ClusterPolicySpec) string { } func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec, runtime gpuv1.Runtime) { - if !config.CDI.IsEnabled() && runtime == gpuv1.CRIO { + if !config.CDI.IsEnabled() && runtime == gpuv1.CRIO || config.CDI.IsNRIPluginEnabled() { return } runtimeClassName := getRuntimeClassName(config) diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index ef2923452..986308a16 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -2559,6 +2559,55 @@ func TestTransformValidator(t *testing.T) { WithPullSecret("pull-secret"). WithRuntimeClassName("nvidia"), }, + { + description: "nri plugin enabled", + ds: NewDaemonset(). + WithInitContainer(corev1.Container{Name: "toolkit-validation"}). + WithContainer(corev1.Container{ + Name: "dummy", + Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }). + WithPullSecret("pull-secret"), + cpSpec: &gpuv1.ClusterPolicySpec{ + Validator: gpuv1.ValidatorSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gpu-operator-validator", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{"pull-secret"}, + }, + CDI: gpuv1.CDIConfigSpec{ + Enabled: newBoolPtr(true), + NRIPluginEnabled: newBoolPtr(true), + }, + }, + expectedDs: NewDaemonset(). + WithPodAnnotations(map[string]string{ + "nvidia.cdi.k8s.io/container.toolkit-validation": "management.nvidia.com/gpu=all", + }). 
+				WithInitContainer(corev1.Container{
+					Name:            "toolkit-validation",
+					Image:           "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0",
+					ImagePullPolicy: corev1.PullIfNotPresent,
+					SecurityContext: &corev1.SecurityContext{
+						RunAsUser: rootUID,
+					},
+				},
+				).
+				WithContainer(corev1.Container{
+					Name:            "dummy",
+					Image:           "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0",
+					ImagePullPolicy: corev1.PullIfNotPresent,
+					SecurityContext: &corev1.SecurityContext{
+						RunAsUser: rootUID,
+					},
+				}).
+				WithPullSecret("pull-secret"),
+		},
 	}
 
 	for _, tc := range testCases {
@@ -2834,12 +2883,33 @@ func TestTransformToolkitCtrForCDI(t *testing.T) {
 				},
 			}),
 		},
+		{
+			description: "cdi and nri plugin enabled",
+			ds:          NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}),
+			cpSpec: &gpuv1.ClusterPolicySpec{
+				CDI: gpuv1.CDIConfigSpec{
+					Enabled:          newBoolPtr(true),
+					NRIPluginEnabled: newBoolPtr(true),
+				},
+			},
+			expectedDs: NewDaemonset().WithContainer(
+				corev1.Container{
+					Name: "main-ctr",
+					Env: []corev1.EnvVar{
+						{Name: CDIEnabledEnvName, Value: "true"},
+						{Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"},
+						{Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"},
+						{Name: CRIOConfigModeEnvName, Value: "config"},
+						{Name: CDIEnableNRIPlugin, Value: "true"},
+					},
+				}),
+		},
 	}
 
 	for _, tc := range testCases {
 		t.Run(tc.description, func(t *testing.T) {
 			mainContainer := &tc.ds.Spec.Template.Spec.Containers[0]
-			transformToolkitCtrForCDI(mainContainer)
+			transformToolkitCtrForCDI(mainContainer, tc.cpSpec.CDI.IsNRIPluginEnabled())
 			require.EqualValues(t, tc.expectedDs, tc.ds)
 		})
 	}
diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml
index 379e98d87..d82340856 100644
--- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml
+++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml
@@ -146,6 +146,12 @@ spec:
                 (CDI) should be used as the mechanism for making GPUs accessible
                 to containers.
               type: boolean
+            nriPluginEnabled:
+              default: false
+              description: NRIPluginEnabled indicates whether an NRI plugin
+                should be run as a means of injecting CDI devices into GPU management
+                containers.
+ type: boolean type: object daemonsets: description: Daemonset defines common configuration for all Daemonsets diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index 33efec386..ae8bfd1fb 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -152,6 +152,9 @@ spec: {{- if .Values.cdi.default }} default: {{ .Values.cdi.default }} {{- end }} + {{- if and (.Values.cdi.enabled) (.Values.cdi.nriPluginEnabled) }} + nriPluginEnabled: {{ .Values.cdi.nriPluginEnabled }} + {{- end }} driver: enabled: {{ .Values.driver.enabled }} useNvidiaDriverCRD: {{ .Values.driver.nvidiaDriverCRD.enabled }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 14129a68d..73903bc70 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -14,6 +14,7 @@ psa: cdi: enabled: true + nriPluginEnabled: false sandboxWorkloads: enabled: false diff --git a/validator/manifests/cuda-workload-validation.yaml b/validator/manifests/cuda-workload-validation.yaml index 11aca2a5a..fa47df5f7 100644 --- a/validator/manifests/cuda-workload-validation.yaml +++ b/validator/manifests/cuda-workload-validation.yaml @@ -1,6 +1,8 @@ apiVersion: v1 kind: Pod metadata: + annotations: + nvidia.cdi.k8s.io/container.cuda-validation: "management.nvidia.com/gpu=all" labels: app: nvidia-cuda-validator generateName: nvidia-cuda-validator-
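Taken together, when cdi.enabled and cdi.nriPluginEnabled are both true, the transforms above (a) annotate each management pod for NRI-based CDI injection instead of setting the nvidia runtime class, and (b) mount the runtime's NRI socket into the toolkit container. A rough sketch of the resulting objects, assuming the defaults defined in object_controls.go and an illustrative container name of nvidia-device-plugin:

    # Pod template metadata written by setNRIPluginAnnotation:
    metadata:
      annotations:
        nvidia.cdi.k8s.io/container.nvidia-device-plugin: management.nvidia.com/gpu=all

    # Toolkit container wiring added by transformForRuntime:
    env:
    - name: NRI_SOCKET
      value: /runtime/nri-sock-dir/nri.sock  # target dir + base name of the host socket path
    volumeMounts:
    - name: nri-socket
      mountPath: /runtime/nri-sock-dir/
    volumes:
    - name: nri-socket
      hostPath:
        path: /var/run/nri  # path.Dir(DefaultRuntimeNRISocketFile)
        type: DirectoryOrCreate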