Skip to content

Commit b2f136d

Browse files
committed
remove driver upgrade label from nodes
This commit removes the driver upgrade label from nodes which don't have any driver pod running on them.

Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent 688e38d commit b2f136d

2 files changed

Lines changed: 535 additions & 0 deletions

File tree

controllers/upgrade_controller.go

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,14 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
152152
return ctrl.Result{}, err
153153
}
154154

155+
// Clear stale upgrade labels from nodes that no longer have driver pods
156+
// Use the built state to avoid duplicate API calls and to skip nodes actively being upgraded
157+
// This is best-effort cleanup that runs every reconciliation
158+
if err := r.clearUpgradeLabelsWhereDriverNotRunning(ctx, state, driverLabel, clusterPolicyCtrl.operatorNamespace); err != nil {
159+
// Log the error but continue with the upgrade process, as this is a best-effort cleanup and should not block upgrades
160+
r.Log.Error(err, "Failed to clear stale upgrade labels")
161+
}
162+
155163
reqLogger.Info("Propagate state to state manager")
156164
reqLogger.V(consts.LogLevelDebug).Info("Current cluster upgrade state", "state", state)
157165

@@ -198,6 +206,98 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
198206
return ctrl.Result{Requeue: true, RequeueAfter: plannedRequeueInterval}, nil
199207
}
200208

209+
// clearUpgradeLabelsWhereDriverNotRunning removes upgrade labels from nodes where driver pods are no longer scheduled.
210+
// This handles the case where a nodeSelector change causes pods to be terminated from certain nodes,
211+
// but the upgrade labels remain. It skips nodes that are actively being managed by the upgrade process.
212+
func (r *UpgradeReconciler) clearUpgradeLabelsWhereDriverNotRunning(ctx context.Context, state *upgrade.ClusterUpgradeState, driverLabel map[string]string, namespace string) error {
213+
upgradeStateLabel := upgrade.GetUpgradeStateLabelKey()
214+
215+
// Build a set of nodes being actively managed by the upgrade process first
216+
managedNodes := make(map[string]bool)
217+
for _, nodeStates := range state.NodeStates {
218+
for _, nodeState := range nodeStates {
219+
if nodeState.Node != nil {
220+
managedNodes[nodeState.Node.Name] = true
221+
}
222+
}
223+
}
224+
225+
// List only nodes that have the upgrade label
226+
nodeList := &corev1.NodeList{}
227+
if err := r.List(ctx, nodeList, client.HasLabels{upgradeStateLabel}); err != nil {
228+
return fmt.Errorf("failed to list nodes with upgrade labels: %w", err)
229+
}
230+
231+
if len(nodeList.Items) == 0 {
232+
return nil
233+
}
234+
235+
// Filter out nodes being actively managed by upgrade process
236+
var nodesToCheck []corev1.Node
237+
for _, node := range nodeList.Items {
238+
if !managedNodes[node.Name] {
239+
nodesToCheck = append(nodesToCheck, node)
240+
}
241+
}
242+
243+
if len(nodesToCheck) == 0 {
244+
return nil
245+
}
246+
247+
// List driver DaemonSets to check which nodes should have pods
248+
// This protects against removing labels during pod recreation (e.g., during rolling updates)
249+
dsList := &appsv1.DaemonSetList{}
250+
if err := r.List(ctx, dsList, client.InNamespace(namespace), client.MatchingLabels(driverLabel)); err != nil {
251+
return fmt.Errorf("failed to list driver DaemonSets: %w", err)
252+
}
253+
254+
// Build a set of nodes that DaemonSets would schedule to
255+
// This protects nodes during pod recreation (e.g., rolling updates where pod is temporarily missing)
256+
nodesShouldHavePods := make(map[string]bool)
257+
for _, ds := range dsList.Items {
258+
// DaemonSet nodeSelector is a map[string]string in spec.template.spec.nodeSelector
259+
nodeSelector := ds.Spec.Template.Spec.NodeSelector
260+
selector := labels.SelectorFromSet(nodeSelector)
261+
262+
for _, node := range nodesToCheck {
263+
// Check if this DaemonSet would schedule to this node
264+
if selector.Matches(labels.Set(node.Labels)) {
265+
nodesShouldHavePods[node.Name] = true
266+
}
267+
}
268+
}
269+
270+
// Clear upgrade label from nodes that aren't targeted by any DaemonSet
271+
// nodesToCheck already excludes nodes with pods (filtered by managedNodes)
272+
// The nodesShouldHavePods check protects against removing labels during pod recreation
273+
// (e.g., rolling updates where old pod is deleted but new pod not yet scheduled)
274+
//
275+
// Note: There is a small race condition window where a DaemonSet's nodeSelector is updated
276+
// after we list DaemonSets but before we patch the node. This is acceptable because:
277+
// 1. This is best-effort cleanup that runs every 2 minutes
278+
// 2. The primary use case (nodeSelector narrowing to exclude nodes) is correctly handled
279+
// 3. False negatives (not removing when we should) are safer than false positives
280+
for i := range nodesToCheck {
281+
node := &nodesToCheck[i]
282+
shouldHaveDriverPod := nodesShouldHavePods[node.Name]
283+
284+
// Only remove label if no DaemonSet intends to schedule a pod there
285+
if !shouldHaveDriverPod {
286+
r.Log.Info("Clearing stale upgrade label from node", "node", node.Name)
287+
288+
nodeCopy := node.DeepCopy()
289+
delete(nodeCopy.Labels, upgradeStateLabel)
290+
if err := r.Patch(ctx, nodeCopy, client.MergeFrom(node)); err != nil {
291+
r.Log.Error(err, "Failed to clear upgrade label from node", "node", node.Name)
292+
// Continue with other nodes even if one fails
293+
continue
294+
}
295+
}
296+
}
297+
298+
return nil
299+
}
300+
201301
// removeNodeUpgradeStateLabels loops over nodes in the cluster and removes "nvidia.com/gpu-driver-upgrade-state"
202302
// It is used for cleanup when autoUpgrade feature gets disabled
203303
func (r *UpgradeReconciler) removeNodeUpgradeStateLabels(ctx context.Context) error {

0 commit comments

Comments
 (0)