@@ -27,12 +27,14 @@ import (
2727 corev1 "k8s.io/api/core/v1"
2828 apierrors "k8s.io/apimachinery/pkg/api/errors"
2929 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30+ "k8s.io/apimachinery/pkg/labels"
3031 "k8s.io/apimachinery/pkg/runtime"
3132 "k8s.io/apimachinery/pkg/types"
3233 "k8s.io/client-go/util/workqueue"
3334 ctrl "sigs.k8s.io/controller-runtime"
3435 "sigs.k8s.io/controller-runtime/pkg/client"
3536 "sigs.k8s.io/controller-runtime/pkg/controller"
37+ "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
3638 "sigs.k8s.io/controller-runtime/pkg/event"
3739 "sigs.k8s.io/controller-runtime/pkg/handler"
3840 "sigs.k8s.io/controller-runtime/pkg/log"
@@ -49,6 +51,11 @@ import (
4951 "github.com/NVIDIA/gpu-operator/internal/validator"
5052)
5153
const (
	// managedByLabel is the node label key this controller writes; its value
	// is the name of the NVIDIADriver that the node is associated with.
	managedByLabel = "nvidia.com/gpu.driver.managed-by"

	// nvidiaDriverNodeLabelFinalizer is the finalizer placed on NVIDIADriver
	// objects. It is removed only after the managedByLabel entries carrying
	// the object's name have been cleaned off the nodes.
	nvidiaDriverNodeLabelFinalizer = "nvidia.com/nvidiadriver-node-labels"
)
58+
5259// NVIDIADriverReconciler reconciles a NVIDIADriver object
5360type NVIDIADriverReconciler struct {
5461 client.Client
@@ -97,6 +104,19 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
97104 return reconcile.Result {}, wrappedErr
98105 }
99106
107+ // Handle deletion: cleanup labels and finalizer
108+ if ! instance .ObjectMeta .DeletionTimestamp .IsZero () {
109+ return r .reconcileDelete (ctx , instance )
110+ }
111+
112+ // Add finalizer if not present
113+ if ! controllerutil .ContainsFinalizer (instance , nvidiaDriverNodeLabelFinalizer ) {
114+ if err := r .addFinalizer (ctx , instance ); err != nil {
115+ logger .Error (err , "failed to add finalizer" )
116+ return reconcile.Result {}, err
117+ }
118+ }
119+
100120 // Get the singleton NVIDIA ClusterPolicy object in the cluster.
101121 clusterPolicyList := & gpuv1.ClusterPolicyList {}
102122 if err := r .List (ctx , clusterPolicyList ); err != nil {
@@ -151,6 +171,12 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
151171 return reconcile.Result {}, nil
152172 }
153173
174+ // Reconcile node labels
175+ if err := r .reconcileNodeLabels (ctx , instance ); err != nil {
176+ logger .Error (err , "failed to reconcile node labels" )
177+ return reconcile.Result {}, err
178+ }
179+
154180 if instance .Spec .UsePrecompiledDrivers () && (instance .Spec .IsGDSEnabled () || instance .Spec .IsGDRCopyEnabled ()) {
155181 err := errors .New ("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers" )
156182 logger .Error (err , "unsupported driver combination detected" )
@@ -220,6 +246,146 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
220246 return reconcile.Result {}, nil
221247}
222248
249+ // addFinalizer adds a finalizer to the NVIDIADriver resource
250+ func (r * NVIDIADriverReconciler ) addFinalizer (ctx context.Context , instance * nvidiav1alpha1.NVIDIADriver ) error {
251+ logger := log .FromContext (ctx )
252+ logger .Info ("Adding finalizer to NVIDIADriver" )
253+ patch := client .MergeFrom (instance .DeepCopy ())
254+ controllerutil .AddFinalizer (instance , nvidiaDriverNodeLabelFinalizer )
255+ if err := r .Patch (ctx , instance , patch ); err != nil {
256+ return err
257+ }
258+ return nil
259+ }
260+
261+ // reconcileDelete handles the deletion of a NVIDIADriver resource
262+ // It ensures that any node labels managed by this NVIDIADriver are cleaned up
263+ func (r * NVIDIADriverReconciler ) reconcileDelete (ctx context.Context , instance * nvidiav1alpha1.NVIDIADriver ) (reconcile.Result , error ) {
264+ logger := log .FromContext (ctx )
265+
266+ if controllerutil .ContainsFinalizer (instance , nvidiaDriverNodeLabelFinalizer ) {
267+ logger .Info ("NVIDIADriver is being deleted, cleaning up node labels" )
268+
269+ // Remove node labels before deleting
270+ if err := r .cleanupNodeLabels (ctx , instance ); err != nil {
271+ logger .Error (err , "failed to cleanup node labels" )
272+ return reconcile.Result {}, err
273+ }
274+
275+ // Remove the finalizer
276+ patch := client .MergeFrom (instance .DeepCopy ())
277+ controllerutil .RemoveFinalizer (instance , nvidiaDriverNodeLabelFinalizer )
278+ if err := r .Patch (ctx , instance , patch ); err != nil {
279+ return reconcile.Result {}, err
280+ }
281+ logger .Info ("Finalizer removed, NVIDIADriver will be deleted" )
282+ }
283+ return reconcile.Result {}, nil
284+ }
285+
286+ // reconcileNodeLabels ensures that the node labels for the NVIDIADriver resource are correctly set
287+ func (r * NVIDIADriverReconciler ) reconcileNodeLabels (ctx context.Context , nvd * nvidiav1alpha1.NVIDIADriver ) error {
288+ logger := log .FromContext (ctx )
289+
290+ var nodes corev1.NodeList
291+ if err := r .List (ctx , & nodes ); err != nil {
292+ logger .Error (err , "failed to list nodes" )
293+ return err
294+ }
295+
296+ selector := labels .SelectorFromSet (nvd .Spec .NodeSelector )
297+
298+ for i := range nodes .Items {
299+ node := & nodes .Items [i ]
300+
301+ nodeLabels := node .GetLabels ()
302+ if nodeLabels == nil {
303+ nodeLabels = make (map [string ]string )
304+ }
305+
306+ matches := selector .Matches (labels .Set (nodeLabels ))
307+ current , exists := nodeLabels [managedByLabel ]
308+
309+ var desired * string
310+ if matches {
311+ desired = & nvd .Name
312+ }
313+
314+ // Only update if:
315+ // 1. We want to add/change label and it doesn't exist or is different, OR
316+ // 2. We want to remove label and it exists with OUR value
317+ needsUpdate :=
318+ (desired != nil && (! exists || current != * desired )) ||
319+ (desired == nil && exists && current == nvd .Name )
320+
321+ if ! needsUpdate {
322+ continue
323+ }
324+
325+ nodeCopy := node .DeepCopy ()
326+ newLabels := maps .Clone (nodeLabels )
327+
328+ if desired != nil {
329+ logger .Info ("Setting driver management node label" ,
330+ "node" , node .Name ,
331+ "label" , managedByLabel ,
332+ "desired" , * desired ,
333+ )
334+ newLabels [managedByLabel ] = * desired
335+ } else {
336+ logger .Info ("Removing driver management node label" ,
337+ "node" , node .Name ,
338+ "label" , managedByLabel ,
339+ )
340+ delete (newLabels , managedByLabel )
341+ }
342+
343+ node .SetLabels (newLabels )
344+ if err := r .Patch (ctx , node , client .MergeFrom (nodeCopy )); err != nil {
345+ logger .Error (err , "failed to update node label" , "node" , node .Name )
346+ return err
347+ }
348+ }
349+ return nil
350+ }
351+
352+ // cleanupNodeLabels removes the managed-by label from all nodes managed by the given NVIDIADriver
353+ func (r * NVIDIADriverReconciler ) cleanupNodeLabels (ctx context.Context , nvd * nvidiav1alpha1.NVIDIADriver ) error {
354+ logger := log .FromContext (ctx )
355+
356+ nodeList := & corev1.NodeList {}
357+ if err := r .List (ctx , nodeList ); err != nil {
358+ logger .Error (err , "failed to list nodes during cleanup" )
359+ return err
360+ }
361+
362+ for i := range nodeList .Items {
363+ node := & nodeList .Items [i ]
364+ nodeLabels := node .GetLabels ()
365+ if nodeLabels == nil {
366+ continue
367+ }
368+
369+ currentValue , hasLabel := nodeLabels [managedByLabel ]
370+ if hasLabel && currentValue == nvd .Name {
371+ logger .Info ("Removing driver management label from node during cleanup" , "node" , node .Name )
372+ nodeCopy := node .DeepCopy ()
373+ // Clone the labels map to avoid modifying the original
374+ newLabels := maps .Clone (nodeLabels )
375+ delete (newLabels , managedByLabel )
376+ node .SetLabels (newLabels )
377+ patch := client .MergeFrom (nodeCopy )
378+ if err := r .Patch (ctx , node , patch ); err != nil {
379+ logger .Error (err , "failed to remove label from node" , "node" , node .Name )
380+ return err
381+ }
382+ }
383+ }
384+
385+ logger .Info (fmt .Sprintf ("Successfully cleaned up %s node labels for NVIDIADriver %s" , managedByLabel , nvd .Name ))
386+ return nil
387+ }
388+
223389func (r * NVIDIADriverReconciler ) updateCrStatus (
224390 ctx context.Context , cr * nvidiav1alpha1.NVIDIADriver , status state.Results ) error {
225391 reqLogger := log .FromContext (ctx )
0 commit comments