From 0fa68337503e3d27f9471e23a4540d331c0c375c Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Wed, 1 Apr 2026 13:11:56 +1000 Subject: [PATCH 1/2] Fix HFS setCondition ignoring ObservedGeneration changes The setCondition function in the HostFirmwareSettings controller was checking whether the condition changed by comparing against the old info.hfs.Status.Conditions rather than the newly updated status. When only ObservedGeneration changed but the condition status remained the same, the function returned false (no change), causing the status update to be skipped. This left ObservedGeneration permanently stale, blocking the BMH controller's generation-based consistency checks. Fix by using the return value of meta.SetStatusCondition, which correctly detects ObservedGeneration changes. Also remove the unused info parameter from setCondition since it is no longer needed. Assisted-By: Claude Opus 4.6 (cherry picked from commit 218158a96ba2b15f004fd7eb97356de0f20ba23a) --- .../hostfirmwaresettings_controller.go | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/internal/controller/metal3.io/hostfirmwaresettings_controller.go b/internal/controller/metal3.io/hostfirmwaresettings_controller.go index 7cc291120f..84390893f2 100644 --- a/internal/controller/metal3.io/hostfirmwaresettings_controller.go +++ b/internal/controller/metal3.io/hostfirmwaresettings_controller.go @@ -247,7 +247,7 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info generation := info.hfs.GetGeneration() if specMismatch { - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") { dirty = true } @@ -255,7 +255,7 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info // Eventually this will be handled by a webhook errors := r.validateHostFirmwareSettings(info, &newStatus, schema) if len(errors) == 0 { - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { dirty = true } } else { @@ -266,15 +266,15 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info } } reason = reasonConfigurationError - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") { dirty = true } } } else { - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { dirty = true } - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") { dirty = true } } @@ -425,7 +425,7 @@ func (r *HostFirmwareSettingsReconciler) publishEvent(ctx context.Context, reque } } -func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus, info *rInfo, +func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus, cond metal3api.SettingsConditionType, newStatus metav1.ConditionStatus, reason conditionReason, message string) bool { newCondition := metav1.Condition{ @@ -435,13 +435,8 @@ func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus Reason: string(reason), Message: message, } - meta.SetStatusCondition(&status.Conditions, newCondition) - currCond := meta.FindStatusCondition(info.hfs.Status.Conditions, string(cond)) - if currCond == nil || currCond.Status != newStatus { - return true - } - return false + return meta.SetStatusCondition(&status.Conditions, newCondition) } // Generate a name based on the schema key and values which should be the same for similar hardware. From c383955fd94e52d85a95b2b55f3c66c91ea9df51 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Wed, 1 Apr 2026 13:25:58 +1000 Subject: [PATCH 2/2] Handle generation mismatch in servicing error recovery When a user clears HFC/HFS specs to abort a failed servicing operation, the BMH controller calls getHostFirmwareSettings/Components which check that metadata.generation matches status.conditions.observedGeneration. Since the HFS/HFC sub-controllers may not have reconciled yet after the spec change, this generation mismatch returns an error that blocks the abort path, leaving the host stuck in ServicingError with exponential backoff. Fix with two complementary changes: 1. Add a fast-path in doServiceIfNeeded: when in ServicingError state, check specs directly via lightweight r.Get() calls (which don't validate generation) before the generation-sensitive functions. If specs are cleared, abort immediately via prov.Service() and clear the error/operational status on success. 2. Watch HostFirmwareSettings and HostFirmwareComponents for generation changes (spec modifications), mapping events to the corresponding BMH. This ensures the BMH controller reconciles promptly when specs change, rather than waiting for error-state exponential backoff. Assisted-By: Claude Opus 4.6 (cherry picked from commit b2357829467a440b8e1c60f3fd8fa88cab856efe) --- .../metal3.io/baremetalhost_controller.go | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/internal/controller/metal3.io/baremetalhost_controller.go b/internal/controller/metal3.io/baremetalhost_controller.go index b2bb51a9f4..42ef4c02a9 100644 --- a/internal/controller/metal3.io/baremetalhost_controller.go +++ b/internal/controller/metal3.io/baremetalhost_controller.go @@ -46,6 +46,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/predicate" ) @@ -1460,6 +1461,42 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr liveFirmwareUpdatesAllowed = (hup.Spec.FirmwareUpdates == metal3api.HostUpdatePolicyOnReboot) } + // Fast-path: when recovering from a servicing error, check if specs are + // cleared before calling getHostFirmwareSettings/Components. Those + // functions fail on generation mismatch (sub-controller hasn't reconciled + // yet), which would block the abort/recovery path indefinitely. + if info.host.Status.ErrorType == metal3api.ServicingError { + specsExist := false + if liveFirmwareSettingsAllowed { + hfsCheck := &metal3api.HostFirmwareSettings{} + if err := r.Get(ctx, info.request.NamespacedName, hfsCheck); err == nil && len(hfsCheck.Spec.Settings) > 0 { + specsExist = true + } + } + if liveFirmwareUpdatesAllowed && !specsExist { + hfcCheck := &metal3api.HostFirmwareComponents{} + if err := r.Get(ctx, info.request.NamespacedName, hfcCheck); err == nil && len(hfcCheck.Spec.Updates) > 0 { + specsExist = true + } + } + if !specsExist { + info.log.Info("specs cleared while in servicing error state, attempting abort") + provResult, _, err := prov.Service(ctx, servicingData, false, false) + if err != nil { + return actionError{fmt.Errorf("failed to abort servicing: %w", err)} + } + if provResult.ErrorMessage != "" { + return actionError{fmt.Errorf("failed to abort servicing: %s", provResult.ErrorMessage)} + } + if provResult.Dirty { + return actionContinue{provResult.RequeueAfter} + } + info.log.Info("successfully recovered from servicing error") + clearErrorWithStatus(info.host, metal3api.OperationalStatusOK) + return actionComplete{} + } + } + if liveFirmwareSettingsAllowed { // handling pre-HFS FirmwareSettings here if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) { @@ -2501,6 +2538,22 @@ func (r *BareMetalHostReconciler) SetupWithManager(mgr ctrl.Manager, preprovImgE controller.Owns(&metal3api.PreprovisioningImage{}) } + // Watch HFC/HFS for spec changes (generation bumps) so that the BMH + // controller reconciles promptly when a user clears firmware specs, + // rather than waiting for error-state exponential backoff to expire. + firmwareEventHandler := handler.EnqueueRequestsFromMapFunc( + func(_ context.Context, obj client.Object) []ctrl.Request { + return []ctrl.Request{{NamespacedName: client.ObjectKey{ + Name: obj.GetName(), + Namespace: obj.GetNamespace(), + }}} + }, + ) + controller.Watches(&metal3api.HostFirmwareSettings{}, firmwareEventHandler, + builder.WithPredicates(predicate.GenerationChangedPredicate{})) + controller.Watches(&metal3api.HostFirmwareComponents{}, firmwareEventHandler, + builder.WithPredicates(predicate.GenerationChangedPredicate{})) + return controller.Complete(r) }