@@ -152,6 +152,14 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
152152 return ctrl.Result {}, err
153153 }
154154
155+ // Clear stale upgrade labels from nodes that no longer have driver pods
156+ // Use the built state to avoid duplicate API calls and to skip nodes actively being upgraded
157+ // This is best-effort cleanup that runs every reconciliation
158+ if err := r .clearStaleUpgradeLabels (ctx , state , driverLabel , clusterPolicyCtrl .operatorNamespace ); err != nil {
159+ // Log the error but continue with the upgrade process, as this is a best-effort cleanup and should not block upgrades
160+ r .Log .V (consts .LogLevelWarning ).Info ("Failed to clear stale upgrade labels" , "error" , err )
161+ }
162+
155163 reqLogger .Info ("Propagate state to state manager" )
156164 reqLogger .V (consts .LogLevelDebug ).Info ("Current cluster upgrade state" , "state" , state )
157165
@@ -198,6 +206,101 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
198206 return ctrl.Result {Requeue : true , RequeueAfter : plannedRequeueInterval }, nil
199207}
200208
209+ // clearStaleUpgradeLabels removes upgrade labels from nodes where driver pods are no longer scheduled.
210+ // This handles the case where a nodeSelector change causes pods to be terminated from certain nodes,
211+ // but the upgrade labels remain. It skips nodes that are actively being managed by the upgrade process.
212+ func (r * UpgradeReconciler ) clearStaleUpgradeLabels (ctx context.Context , state * upgrade.ClusterUpgradeState , driverLabel map [string ]string , namespace string ) error {
213+ upgradeStateLabel := upgrade .GetUpgradeStateLabelKey ()
214+
215+ // Build a set of nodes being actively managed by the upgrade process
216+ // Managed nodes are those in ClusterUpgradeState.NodeStates, which are populated by BuildState()
217+ // and represent nodes that currently have driver pods associated with them
218+ managedNodes := make (map [string ]bool )
219+ for _ , nodeStates := range state .NodeStates {
220+ for _ , nodeState := range nodeStates {
221+ if nodeState .Node != nil {
222+ managedNodes [nodeState .Node .Name ] = true
223+ }
224+ }
225+ }
226+
227+ // List only nodes that have the upgrade label
228+ nodeList := & corev1.NodeList {}
229+ if err := r .List (ctx , nodeList , client.HasLabels {upgradeStateLabel }); err != nil {
230+ return fmt .Errorf ("failed to list nodes with upgrade labels: %w" , err )
231+ }
232+
233+ if len (nodeList .Items ) == 0 {
234+ return nil
235+ }
236+
237+ // Filter out nodes being actively managed by upgrade process (those with driver pods in state.NodeStates)
238+ // This ensures we only clean up labels from nodes that truly have stale labels
239+ var nodesToCheck []corev1.Node
240+ for _ , node := range nodeList .Items {
241+ if ! managedNodes [node .Name ] {
242+ nodesToCheck = append (nodesToCheck , node )
243+ }
244+ }
245+
246+ if len (nodesToCheck ) == 0 {
247+ return nil
248+ }
249+
250+ // List driver DaemonSets to check which nodes should have pods
251+ // This protects against removing labels during pod recreation (e.g., during rolling updates)
252+ dsList := & appsv1.DaemonSetList {}
253+ if err := r .List (ctx , dsList , client .InNamespace (namespace ), client .MatchingLabels (driverLabel )); err != nil {
254+ return fmt .Errorf ("failed to list driver DaemonSets: %w" , err )
255+ }
256+
257+ // Build a set of nodes that DaemonSets would schedule to
258+ // This protects nodes during pod recreation (e.g., rolling updates where pod is temporarily missing)
259+ nodesSelected := make (map [string ]bool )
260+ for _ , ds := range dsList .Items {
261+ // DaemonSet nodeSelector is a map[string]string in spec.template.spec.nodeSelector
262+ nodeSelector := ds .Spec .Template .Spec .NodeSelector
263+ selector := labels .SelectorFromSet (nodeSelector )
264+
265+ for _ , node := range nodesToCheck {
266+ // Check if this DaemonSet would schedule to this node
267+ if selector .Matches (labels .Set (node .Labels )) {
268+ nodesSelected [node .Name ] = true
269+ }
270+ }
271+ }
272+
273+ // Clear upgrade label from nodes that aren't targeted by any DaemonSet
274+ // nodesToCheck excludes nodes actively managed by the upgrade state machine (managedNodes filter)
275+ // These managed nodes are in ClusterUpgradeState.NodeStates and have driver pods associated with them
276+ // The nodesSelected check protects against removing labels during pod recreation
277+ // (e.g., rolling updates where old pod is deleted but new pod not yet scheduled)
278+ //
279+ // Note: There is a small race condition window where a DaemonSet's nodeSelector is updated
280+ // after we list DaemonSets but before we patch the node. This is acceptable because:
281+ // 1. This is best-effort cleanup that runs every 2 minutes
282+ // 2. The primary use case (nodeSelector narrowing to exclude nodes) is correctly handled
283+ // 3. False negatives (not removing when we should) are safer than false positives
284+ for i := range nodesToCheck {
285+ node := & nodesToCheck [i ]
286+
287+ // Only remove label from node if no DaemonSet intends to schedule a pod there
288+ if _ , exists := nodesSelected [node .Name ]; ! exists {
289+ r .Log .Info ("Clearing stale upgrade label from node" , "node" , node .Name )
290+
291+ nodeCopy := node .DeepCopy ()
292+ delete (nodeCopy .Labels , upgradeStateLabel )
293+ if err := r .Patch (ctx , nodeCopy , client .MergeFrom (node )); err != nil {
294+ r .Log .Error (err , "Failed to clear upgrade label from node" , "node" , node .Name )
295+ // Continue with other nodes even if one fails
296+ continue
297+ }
298+ }
299+ }
300+
301+ return nil
302+ }
303+
201304// removeNodeUpgradeStateLabels loops over nodes in the cluster and removes "nvidia.com/gpu-driver-upgrade-state"
202305// It is used for cleanup when autoUpgrade feature gets disabled
203306func (r * UpgradeReconciler ) removeNodeUpgradeStateLabels (ctx context.Context ) error {
0 commit comments