Skip to content

Commit 530ced1

Browse files
committed
remove driver upgrade label from nodes
This commit removes the driver upgrade label from nodes that don't have any driver pod running on them. Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent 688e38d commit 530ced1

1 file changed

Lines changed: 75 additions & 0 deletions

File tree

controllers/upgrade_controller.go

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,13 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
152152
return ctrl.Result{}, err
153153
}
154154

155+
// Clear stale upgrade labels from nodes that no longer have driver pods
156+
// Use the built state so we can avoid removing labels from nodes actively being upgraded
157+
if err := r.clearUpgradeLabelsWhereDriverNotRunning(ctx, state, driverLabel, clusterPolicyCtrl.operatorNamespace); err != nil {
158+
// Log the error but continue with the upgrade process, as this is a best-effort cleanup and should not block upgrades
159+
r.Log.Error(err, "Failed to clear stale upgrade labels")
160+
}
161+
155162
reqLogger.Info("Propagate state to state manager")
156163
reqLogger.V(consts.LogLevelDebug).Info("Current cluster upgrade state", "state", state)
157164

@@ -198,6 +205,74 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
198205
return ctrl.Result{Requeue: true, RequeueAfter: plannedRequeueInterval}, nil
199206
}
200207

208+
// clearUpgradeLabelsWhereDriverNotRunning removes upgrade labels from nodes where driver pods are no longer scheduled.
209+
// This handles the case where a nodeSelector change causes pods to be terminated from certain nodes,
210+
// but the upgrade labels remain. It skips nodes that are actively being managed by the upgrade process.
211+
func (r *UpgradeReconciler) clearUpgradeLabelsWhereDriverNotRunning(ctx context.Context, state *upgrade.ClusterUpgradeState, driverLabel map[string]string, namespace string) error {
212+
upgradeStateLabel := upgrade.GetUpgradeStateLabelKey()
213+
214+
// List all nodes
215+
nodeList := &corev1.NodeList{}
216+
if err := r.List(ctx, nodeList); err != nil {
217+
return fmt.Errorf("failed to list nodes: %w", err)
218+
}
219+
220+
// Filter nodes that have the upgrade label (any value)
221+
var nodesWithUpgradeLabel []corev1.Node
222+
for _, node := range nodeList.Items {
223+
if _, hasLabel := node.Labels[upgradeStateLabel]; hasLabel {
224+
nodesWithUpgradeLabel = append(nodesWithUpgradeLabel, node)
225+
}
226+
}
227+
228+
if len(nodesWithUpgradeLabel) == 0 {
229+
return nil // No nodes with upgrade labels
230+
}
231+
232+
// Build a set of nodes being actively managed by the upgrade process
233+
// We should not remove labels from these nodes as they may be mid-upgrade
234+
managedNodes := make(map[string]bool)
235+
for nodeName := range state.NodeStates {
236+
managedNodes[nodeName] = true
237+
}
238+
239+
// List all driver pods (including orphaned ones)
240+
podList := &corev1.PodList{}
241+
if err := r.List(ctx, podList, client.InNamespace(namespace), client.MatchingLabels(driverLabel)); err != nil {
242+
return fmt.Errorf("failed to list driver pods: %w", err)
243+
}
244+
245+
// Create a set of nodes that have driver pods (any driver pods)
246+
nodesWithPods := make(map[string]bool)
247+
for _, pod := range podList.Items {
248+
if pod.Spec.NodeName != "" {
249+
nodesWithPods[pod.Spec.NodeName] = true
250+
}
251+
}
252+
253+
// Clear upgrade label from nodes that don't have driver pods
254+
for i := range nodesWithUpgradeLabel {
255+
node := &nodesWithUpgradeLabel[i]
256+
// Skip nodes being actively managed by upgrade process
257+
if managedNodes[node.Name] {
258+
continue
259+
}
260+
if _, hasDriverPod := nodesWithPods[node.Name]; !hasDriverPod {
261+
r.Log.Info("Clearing stale upgrade label from node", "node", node.Name)
262+
263+
nodeCopy := node.DeepCopy()
264+
delete(node.Labels, upgradeStateLabel)
265+
if err := r.Patch(ctx, node, client.MergeFrom(nodeCopy)); err != nil {
266+
r.Log.Error(err, "Failed to clear upgrade label from node", "node", node.Name)
267+
// Continue with other nodes even if one fails
268+
continue
269+
}
270+
}
271+
}
272+
273+
return nil
274+
}
275+
201276
// removeNodeUpgradeStateLabels loops over nodes in the cluster and removes "nvidia.com/gpu-driver-upgrade-state"
202277
// It is used for cleanup when autoUpgrade feature gets disabled
203278
func (r *UpgradeReconciler) removeNodeUpgradeStateLabels(ctx context.Context) error {

0 commit comments

Comments
 (0)