@@ -152,6 +152,14 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
152152 return ctrl.Result {}, err
153153 }
154154
155+ // Clear stale upgrade labels from nodes that no longer have driver pods
156+ // Use the built state to avoid duplicate API calls and to skip nodes actively being upgraded
157+ // This is best-effort cleanup that runs every reconciliation
158+ if err := r .clearUpgradeLabelsWhereDriverNotRunning (ctx , state , driverLabel , clusterPolicyCtrl .operatorNamespace ); err != nil {
159+ // Log the error but continue with the upgrade process, as this is a best-effort cleanup and should not block upgrades
160+ r .Log .Error (err , "Failed to clear stale upgrade labels" )
161+ }
162+
155163 reqLogger .Info ("Propagate state to state manager" )
156164 reqLogger .V (consts .LogLevelDebug ).Info ("Current cluster upgrade state" , "state" , state )
157165
@@ -198,6 +206,98 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
198206 return ctrl.Result {Requeue : true , RequeueAfter : plannedRequeueInterval }, nil
199207}
200208
209+ // clearUpgradeLabelsWhereDriverNotRunning removes upgrade labels from nodes where driver pods are no longer scheduled.
210+ // This handles the case where a nodeSelector change causes pods to be terminated from certain nodes,
211+ // but the upgrade labels remain. It skips nodes that are actively being managed by the upgrade process.
212+ func (r * UpgradeReconciler ) clearUpgradeLabelsWhereDriverNotRunning (ctx context.Context , state * upgrade.ClusterUpgradeState , driverLabel map [string ]string , namespace string ) error {
213+ upgradeStateLabel := upgrade .GetUpgradeStateLabelKey ()
214+
215+ // Build a set of nodes being actively managed by the upgrade process first
216+ managedNodes := make (map [string ]bool )
217+ for _ , nodeStates := range state .NodeStates {
218+ for _ , nodeState := range nodeStates {
219+ if nodeState .Node != nil {
220+ managedNodes [nodeState .Node .Name ] = true
221+ }
222+ }
223+ }
224+
225+ // List only nodes that have the upgrade label
226+ nodeList := & corev1.NodeList {}
227+ if err := r .List (ctx , nodeList , client.HasLabels {upgradeStateLabel }); err != nil {
228+ return fmt .Errorf ("failed to list nodes with upgrade labels: %w" , err )
229+ }
230+
231+ if len (nodeList .Items ) == 0 {
232+ return nil
233+ }
234+
235+ // Filter out nodes being actively managed by upgrade process
236+ var nodesToCheck []corev1.Node
237+ for _ , node := range nodeList .Items {
238+ if ! managedNodes [node .Name ] {
239+ nodesToCheck = append (nodesToCheck , node )
240+ }
241+ }
242+
243+ if len (nodesToCheck ) == 0 {
244+ return nil
245+ }
246+
247+ // List driver DaemonSets to check which nodes should have pods
248+ // This protects against removing labels during pod recreation (e.g., during rolling updates)
249+ dsList := & appsv1.DaemonSetList {}
250+ if err := r .List (ctx , dsList , client .InNamespace (namespace ), client .MatchingLabels (driverLabel )); err != nil {
251+ return fmt .Errorf ("failed to list driver DaemonSets: %w" , err )
252+ }
253+
254+ // Build a set of nodes that DaemonSets would schedule to
255+ // This protects nodes during pod recreation (e.g., rolling updates where pod is temporarily missing)
256+ nodesShouldHavePods := make (map [string ]bool )
257+ for _ , ds := range dsList .Items {
258+ // DaemonSet nodeSelector is a map[string]string in spec.template.spec.nodeSelector
259+ nodeSelector := ds .Spec .Template .Spec .NodeSelector
260+ selector := labels .SelectorFromSet (nodeSelector )
261+
262+ for _ , node := range nodesToCheck {
263+ // Check if this DaemonSet would schedule to this node
264+ if selector .Matches (labels .Set (node .Labels )) {
265+ nodesShouldHavePods [node .Name ] = true
266+ }
267+ }
268+ }
269+
270+ // Clear upgrade label from nodes that aren't targeted by any DaemonSet
271+ // nodesToCheck already excludes nodes with pods (filtered by managedNodes)
272+ // The nodesShouldHavePods check protects against removing labels during pod recreation
273+ // (e.g., rolling updates where old pod is deleted but new pod not yet scheduled)
274+ //
275+ // Note: There is a small race condition window where a DaemonSet's nodeSelector is updated
276+ // after we list DaemonSets but before we patch the node. This is acceptable because:
277+ // 1. This is best-effort cleanup that runs every 2 minutes
278+ // 2. The primary use case (nodeSelector narrowing to exclude nodes) is correctly handled
279+ // 3. False negatives (not removing when we should) are safer than false positives
280+ for i := range nodesToCheck {
281+ node := & nodesToCheck [i ]
282+ shouldHaveDriverPod := nodesShouldHavePods [node .Name ]
283+
284+ // Only remove label if no DaemonSet intends to schedule a pod there
285+ if ! shouldHaveDriverPod {
286+ r .Log .Info ("Clearing stale upgrade label from node" , "node" , node .Name )
287+
288+ nodeCopy := node .DeepCopy ()
289+ delete (nodeCopy .Labels , upgradeStateLabel )
290+ if err := r .Patch (ctx , nodeCopy , client .MergeFrom (node )); err != nil {
291+ r .Log .Error (err , "Failed to clear upgrade label from node" , "node" , node .Name )
292+ // Continue with other nodes even if one fails
293+ continue
294+ }
295+ }
296+ }
297+
298+ return nil
299+ }
300+
201301// removeNodeUpgradeStateLabels loops over nodes in the cluster and removes "nvidia.com/gpu-driver-upgrade-state"
202302// It is used for cleanup when autoUpgrade feature gets disabled
203303func (r * UpgradeReconciler ) removeNodeUpgradeStateLabels (ctx context.Context ) error {
0 commit comments