Skip to content

Commit 80c737f

Browse files
committed
use managed-by label with nvidiadriver cr
Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent 23e5272 commit 80c737f

4 files changed

Lines changed: 169 additions & 3 deletions

File tree

controllers/nvidiadriver_controller.go

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,14 @@ import (
2727
corev1 "k8s.io/api/core/v1"
2828
apierrors "k8s.io/apimachinery/pkg/api/errors"
2929
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30+
"k8s.io/apimachinery/pkg/labels"
3031
"k8s.io/apimachinery/pkg/runtime"
3132
"k8s.io/apimachinery/pkg/types"
3233
"k8s.io/client-go/util/workqueue"
3334
ctrl "sigs.k8s.io/controller-runtime"
3435
"sigs.k8s.io/controller-runtime/pkg/client"
3536
"sigs.k8s.io/controller-runtime/pkg/controller"
37+
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
3638
"sigs.k8s.io/controller-runtime/pkg/event"
3739
"sigs.k8s.io/controller-runtime/pkg/handler"
3840
"sigs.k8s.io/controller-runtime/pkg/log"
@@ -49,6 +51,11 @@ import (
4951
"github.com/NVIDIA/gpu-operator/internal/validator"
5052
)
5153

54+
const (
55+
// nvidiaDriverNodeLabelFinalizer is the finalizer placed on NVIDIADriver
// objects by addFinalizer. reconcileDelete removes it only after
// cleanupNodeLabels has succeeded, so the managed-by node labels are
// removed before the object disappears.
nvidiaDriverNodeLabelFinalizer = "nvidia.com/nvidiadriver-node-labels"
56+
// managedByLabel is the node label key written by reconcileNodeLabels.
// Its value is the name of the NVIDIADriver that currently selects the node.
managedByLabel = "nvidia.com/gpu.driver.managed-by"
57+
)
58+
5259
// NVIDIADriverReconciler reconciles a NVIDIADriver object
5360
type NVIDIADriverReconciler struct {
5461
client.Client
@@ -97,6 +104,19 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
97104
return reconcile.Result{}, wrappedErr
98105
}
99106

107+
// Handle deletion: cleanup labels and finalizer
108+
if !instance.ObjectMeta.DeletionTimestamp.IsZero() {
109+
return r.reconcileDelete(ctx, instance)
110+
}
111+
112+
// Add finalizer if not present
113+
if !controllerutil.ContainsFinalizer(instance, nvidiaDriverNodeLabelFinalizer) {
114+
if err := r.addFinalizer(ctx, instance); err != nil {
115+
logger.Error(err, "failed to add finalizer")
116+
return reconcile.Result{}, err
117+
}
118+
}
119+
100120
// Get the singleton NVIDIA ClusterPolicy object in the cluster.
101121
clusterPolicyList := &gpuv1.ClusterPolicyList{}
102122
if err := r.List(ctx, clusterPolicyList); err != nil {
@@ -151,6 +171,12 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
151171
return reconcile.Result{}, nil
152172
}
153173

174+
// Reconcile node labels
175+
if err := r.reconcileNodeLabels(ctx, instance); err != nil {
176+
logger.Error(err, "failed to reconcile node labels")
177+
return reconcile.Result{}, err
178+
}
179+
154180
if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) {
155181
err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
156182
logger.Error(err, "unsupported driver combination detected")
@@ -220,6 +246,146 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
220246
return reconcile.Result{}, nil
221247
}
222248

249+
// addFinalizer adds a finalizer to the NVIDIADriver resource
250+
func (r *NVIDIADriverReconciler) addFinalizer(ctx context.Context, instance *nvidiav1alpha1.NVIDIADriver) error {
251+
logger := log.FromContext(ctx)
252+
logger.Info("Adding finalizer to NVIDIADriver")
253+
patch := client.MergeFrom(instance.DeepCopy())
254+
controllerutil.AddFinalizer(instance, nvidiaDriverNodeLabelFinalizer)
255+
if err := r.Patch(ctx, instance, patch); err != nil {
256+
return err
257+
}
258+
return nil
259+
}
260+
261+
// reconcileDelete handles the deletion of a NVIDIADriver resource
262+
// It ensures that any node labels managed by this NVIDIADriver are cleaned up
263+
func (r *NVIDIADriverReconciler) reconcileDelete(ctx context.Context, instance *nvidiav1alpha1.NVIDIADriver) (reconcile.Result, error) {
264+
logger := log.FromContext(ctx)
265+
266+
if controllerutil.ContainsFinalizer(instance, nvidiaDriverNodeLabelFinalizer) {
267+
logger.Info("NVIDIADriver is being deleted, cleaning up node labels")
268+
269+
// Remove node labels before deleting
270+
if err := r.cleanupNodeLabels(ctx, instance); err != nil {
271+
logger.Error(err, "failed to cleanup node labels")
272+
return reconcile.Result{}, err
273+
}
274+
275+
// Remove the finalizer
276+
patch := client.MergeFrom(instance.DeepCopy())
277+
controllerutil.RemoveFinalizer(instance, nvidiaDriverNodeLabelFinalizer)
278+
if err := r.Patch(ctx, instance, patch); err != nil {
279+
return reconcile.Result{}, err
280+
}
281+
logger.Info("Finalizer removed, NVIDIADriver will be deleted")
282+
}
283+
return reconcile.Result{}, nil
284+
}
285+
286+
// reconcileNodeLabels ensures that the node labels for the NVIDIADriver resource are correctly set
287+
func (r *NVIDIADriverReconciler) reconcileNodeLabels(ctx context.Context, nvd *nvidiav1alpha1.NVIDIADriver) error {
288+
logger := log.FromContext(ctx)
289+
290+
var nodes corev1.NodeList
291+
if err := r.List(ctx, &nodes); err != nil {
292+
logger.Error(err, "failed to list nodes")
293+
return err
294+
}
295+
296+
selector := labels.SelectorFromSet(nvd.Spec.NodeSelector)
297+
298+
for i := range nodes.Items {
299+
node := &nodes.Items[i]
300+
301+
nodeLabels := node.GetLabels()
302+
if nodeLabels == nil {
303+
nodeLabels = make(map[string]string)
304+
}
305+
306+
matches := selector.Matches(labels.Set(nodeLabels))
307+
current, exists := nodeLabels[managedByLabel]
308+
309+
var desired *string
310+
if matches {
311+
desired = &nvd.Name
312+
}
313+
314+
// Only update if:
315+
// 1. We want to add/change label and it doesn't exist or is different, OR
316+
// 2. We want to remove label and it exists with OUR value
317+
needsUpdate :=
318+
(desired != nil && (!exists || current != *desired)) ||
319+
(desired == nil && exists && current == nvd.Name)
320+
321+
if !needsUpdate {
322+
continue
323+
}
324+
325+
nodeCopy := node.DeepCopy()
326+
newLabels := maps.Clone(nodeLabels)
327+
328+
if desired != nil {
329+
logger.Info("Setting driver management node label",
330+
"node", node.Name,
331+
"label", managedByLabel,
332+
"desired", *desired,
333+
)
334+
newLabels[managedByLabel] = *desired
335+
} else {
336+
logger.Info("Removing driver management node label",
337+
"node", node.Name,
338+
"label", managedByLabel,
339+
)
340+
delete(newLabels, managedByLabel)
341+
}
342+
343+
node.SetLabels(newLabels)
344+
if err := r.Patch(ctx, node, client.MergeFrom(nodeCopy)); err != nil {
345+
logger.Error(err, "failed to update node label", "node", node.Name)
346+
return err
347+
}
348+
}
349+
return nil
350+
}
351+
352+
// cleanupNodeLabels removes the managed-by label from all nodes managed by the given NVIDIADriver
353+
func (r *NVIDIADriverReconciler) cleanupNodeLabels(ctx context.Context, nvd *nvidiav1alpha1.NVIDIADriver) error {
354+
logger := log.FromContext(ctx)
355+
356+
nodeList := &corev1.NodeList{}
357+
if err := r.List(ctx, nodeList); err != nil {
358+
logger.Error(err, "failed to list nodes during cleanup")
359+
return err
360+
}
361+
362+
for i := range nodeList.Items {
363+
node := &nodeList.Items[i]
364+
nodeLabels := node.GetLabels()
365+
if nodeLabels == nil {
366+
continue
367+
}
368+
369+
currentValue, hasLabel := nodeLabels[managedByLabel]
370+
if hasLabel && currentValue == nvd.Name {
371+
logger.Info("Removing driver management label from node during cleanup", "node", node.Name)
372+
nodeCopy := node.DeepCopy()
373+
// Clone the labels map to avoid modifying the original
374+
newLabels := maps.Clone(nodeLabels)
375+
delete(newLabels, managedByLabel)
376+
node.SetLabels(newLabels)
377+
patch := client.MergeFrom(nodeCopy)
378+
if err := r.Patch(ctx, node, patch); err != nil {
379+
logger.Error(err, "failed to remove label from node", "node", node.Name)
380+
return err
381+
}
382+
}
383+
}
384+
385+
logger.Info(fmt.Sprintf("Successfully cleaned up %s node labels for NVIDIADriver %s", managedByLabel, nvd.Name))
386+
return nil
387+
}
388+
223389
func (r *NVIDIADriverReconciler) updateCrStatus(
224390
ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver, status state.Results) error {
225391
reqLogger := log.FromContext(ctx)

internal/state/driver.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,7 @@ func getDriverSpec(cr *nvidiav1alpha1.NVIDIADriver, nodePool nodePool) (*driverS
573573
Spec: spec,
574574
AppName: nvidiaDriverAppName,
575575
Name: nvidiaDriverName,
576+
CRName: cr.Name,
576577
ImagePath: imagePath,
577578
ManagerImagePath: managerImagePath,
578579
OSVersion: nodePool.getOS(),

internal/state/types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ type driverSpec struct {
3232
Spec *nvidiav1alpha1.NVIDIADriverSpec
3333
AppName string
3434
Name string
35+
CRName string
3536
ImagePath string
3637
ManagerImagePath string
3738
OSVersion string

manifests/state-driver/0500_daemonset.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,12 @@ spec:
6363
terminationGracePeriodSeconds: 120
6464
{{- end }}
6565
nodeSelector:
66+
nvidia.com/gpu.driver.managed-by: {{ .Driver.CRName }}
6667
{{- if eq .Driver.Spec.DriverType "vgpu-host-manager" }}
6768
nvidia.com/gpu.deploy.vgpu-manager: "true"
6869
{{- else }}
6970
nvidia.com/gpu.deploy.driver: "true"
7071
{{- end }}
71-
{{- if .Driver.Spec.NodeSelector }}
72-
{{- .Driver.Spec.NodeSelector | yaml | nindent 8 }}
73-
{{- end }}
7472
{{- if and (.Openshift) (.Runtime.OpenshiftDriverToolkitEnabled) }}
7573
feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: {{ .Openshift.RHCOSVersion | quote }}
7674
{{- end }}

0 commit comments

Comments
 (0)