@@ -152,6 +152,13 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
152152 return ctrl.Result {}, err
153153 }
154154
155+ // Clear stale upgrade labels from nodes that no longer have driver pods
156+ // Use the built state so we can avoid removing labels from nodes actively being upgraded
157+ if err := r .clearUpgradeLabelsWhereDriverNotRunning (ctx , state , driverLabel , clusterPolicyCtrl .operatorNamespace ); err != nil {
158+ // Log the error but continue with the upgrade process, as this is a best-effort cleanup and should not block upgrades
159+ r .Log .Error (err , "Failed to clear stale upgrade labels" )
160+ }
161+
155162 reqLogger .Info ("Propagate state to state manager" )
156163 reqLogger .V (consts .LogLevelDebug ).Info ("Current cluster upgrade state" , "state" , state )
157164
@@ -198,6 +205,74 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
198205 return ctrl.Result {Requeue : true , RequeueAfter : plannedRequeueInterval }, nil
199206}
200207
208+ // clearUpgradeLabelsWhereDriverNotRunning removes upgrade labels from nodes where driver pods are no longer scheduled.
209+ // This handles the case where a nodeSelector change causes pods to be terminated from certain nodes,
210+ // but the upgrade labels remain. It skips nodes that are actively being managed by the upgrade process.
211+ func (r * UpgradeReconciler ) clearUpgradeLabelsWhereDriverNotRunning (ctx context.Context , state * upgrade.ClusterUpgradeState , driverLabel map [string ]string , namespace string ) error {
212+ upgradeStateLabel := upgrade .GetUpgradeStateLabelKey ()
213+
214+ // List all nodes
215+ nodeList := & corev1.NodeList {}
216+ if err := r .List (ctx , nodeList ); err != nil {
217+ return fmt .Errorf ("failed to list nodes: %w" , err )
218+ }
219+
220+ // Filter nodes that have the upgrade label (any value)
221+ var nodesWithUpgradeLabel []corev1.Node
222+ for _ , node := range nodeList .Items {
223+ if _ , hasLabel := node .Labels [upgradeStateLabel ]; hasLabel {
224+ nodesWithUpgradeLabel = append (nodesWithUpgradeLabel , node )
225+ }
226+ }
227+
228+ if len (nodesWithUpgradeLabel ) == 0 {
229+ return nil // No nodes with upgrade labels
230+ }
231+
232+ // Build a set of nodes being actively managed by the upgrade process
233+ // We should not remove labels from these nodes as they may be mid-upgrade
234+ managedNodes := make (map [string ]bool )
235+ for nodeName := range state .NodeStates {
236+ managedNodes [nodeName ] = true
237+ }
238+
239+ // List all driver pods (including orphaned ones)
240+ podList := & corev1.PodList {}
241+ if err := r .List (ctx , podList , client .InNamespace (namespace ), client .MatchingLabels (driverLabel )); err != nil {
242+ return fmt .Errorf ("failed to list driver pods: %w" , err )
243+ }
244+
245+ // Create a set of nodes that have driver pods (any driver pods)
246+ nodesWithPods := make (map [string ]bool )
247+ for _ , pod := range podList .Items {
248+ if pod .Spec .NodeName != "" {
249+ nodesWithPods [pod .Spec .NodeName ] = true
250+ }
251+ }
252+
253+ // Clear upgrade label from nodes that don't have driver pods
254+ for i := range nodesWithUpgradeLabel {
255+ node := & nodesWithUpgradeLabel [i ]
256+ // Skip nodes being actively managed by upgrade process
257+ if managedNodes [node .Name ] {
258+ continue
259+ }
260+ if _ , hasDriverPod := nodesWithPods [node .Name ]; ! hasDriverPod {
261+ r .Log .Info ("Clearing stale upgrade label from node" , "node" , node .Name )
262+
263+ nodeCopy := node .DeepCopy ()
264+ delete (node .Labels , upgradeStateLabel )
265+ if err := r .Patch (ctx , node , client .MergeFrom (nodeCopy )); err != nil {
266+ r .Log .Error (err , "Failed to clear upgrade label from node" , "node" , node .Name )
267+ // Continue with other nodes even if one fails
268+ continue
269+ }
270+ }
271+ }
272+
273+ return nil
274+ }
275+
201276// removeNodeUpgradeStateLabels loops over nodes in the cluster and removes "nvidia.com/gpu-driver-upgrade-state"
202277// It is used for cleanup when autoUpgrade feature gets disabled
203278func (r * UpgradeReconciler ) removeNodeUpgradeStateLabels (ctx context.Context ) error {
0 commit comments