From 80cb2d24793c28c1824dca19f8b6457437d69658 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Fri, 20 Mar 2026 20:06:25 +1000 Subject: [PATCH 1/6] Revert "Enable BMO to abort servicing to back out of failures" This reverts commit aa17d5a5e5acd44c999c417c396d902de434b4b8. --- .../metal3.io/baremetalhost_controller.go | 21 +++++-------- pkg/provisioner/ironic/servicing.go | 31 ++----------------- pkg/provisioner/ironic/servicing_test.go | 25 +-------------- pkg/provisioner/provisioner.go | 3 -- 4 files changed, 10 insertions(+), 70 deletions(-) diff --git a/internal/controller/metal3.io/baremetalhost_controller.go b/internal/controller/metal3.io/baremetalhost_controller.go index b2bb51a9f4..671b18a87b 100644 --- a/internal/controller/metal3.io/baremetalhost_controller.go +++ b/internal/controller/metal3.io/baremetalhost_controller.go @@ -1466,8 +1466,6 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr servicingData.FirmwareConfig = info.host.Spec.Firmware fwDirty = true } - servicingData.HasFirmwareSpec = fwDirty && info.host.Spec.Firmware != nil - // handling HFS based FirmwareSettings here var hfs *metal3api.HostFirmwareSettings var err error @@ -1479,8 +1477,6 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr servicingData.ActualFirmwareSettings = hfs.Status.Settings servicingData.TargetFirmwareSettings = hfs.Spec.Settings } - - servicingData.HasFirmwareSpec = servicingData.HasFirmwareSpec || (hfs != nil && len(hfs.Spec.Settings) > 0) } if liveFirmwareUpdatesAllowed { @@ -1497,8 +1493,6 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr servicingData.TargetFirmwareComponents = hfc.Spec.Updates } } - - servicingData.HasFirmwareSpec = servicingData.HasFirmwareSpec || (hfc != nil && len(hfc.Spec.Updates) > 0) } hasChanges := fwDirty || hfsDirty || hfcDirty @@ -2148,8 +2142,7 @@ func hostObjectHasChanges(conditions []metav1.Condition, changedCondition, valid return false, true, nil } -// Get the stored firmware settings. Returns dirty=true if there are valid pending changes. -// The hfs object is returned when available regardless of validity, so callers can inspect spec contents. +// Get the stored firmware settings if there are valid changes. func (r *BareMetalHostReconciler) getHostFirmwareSettings(ctx context.Context, info *reconcileInfo) (dirty bool, hfs *metal3api.HostFirmwareSettings, err error) { hfs = &metal3api.HostFirmwareSettings{} if err = r.Get(ctx, info.request.NamespacedName, hfs); err != nil { @@ -2169,7 +2162,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareSettings(ctx context.Context, i } if !valid { info.log.Info("hostFirmwareSettings not valid", "namespacename", info.request.NamespacedName) - return false, hfs, nil + return false, nil, nil } if changed { @@ -2183,11 +2176,11 @@ func (r *BareMetalHostReconciler) getHostFirmwareSettings(ctx context.Context, i } info.log.Info("hostFirmwareSettings no updates", "namespacename", info.request.NamespacedName) - return false, hfs, nil + return false, nil, nil } -// Get the stored firmware components. Returns dirty=true if there are valid pending changes. -// The hfc object is returned when available regardless of validity, so callers can inspect spec contents. +// Get the stored firmware settings if there are valid changes. + func (r *BareMetalHostReconciler) getHostFirmwareComponents(ctx context.Context, info *reconcileInfo) (dirty bool, hfc *metal3api.HostFirmwareComponents, err error) { hfc = &metal3api.HostFirmwareComponents{} if err = r.Get(ctx, info.request.NamespacedName, hfc); err != nil { @@ -2207,7 +2200,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareComponents(ctx context.Context, } if !valid { info.log.Info("hostFirmwareComponents not valid", "namespacename", info.request.NamespacedName) - return false, hfc, nil + return false, nil, nil } if changed { info.log.Info("hostFirmwareComponents indicating ChangeDetected", "namespacename", info.request.NamespacedName) @@ -2215,7 +2208,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareComponents(ctx context.Context, } info.log.Info("hostFirmwareComponents no updates", "namespacename", info.request.NamespacedName) - return false, hfc, nil + return false, nil, nil } func setConditionTrue(host *metal3api.BareMetalHost, typ, reason string) { diff --git a/pkg/provisioner/ironic/servicing.go b/pkg/provisioner/ironic/servicing.go index 317adfb365..396e00adae 100644 --- a/pkg/provisioner/ironic/servicing.go +++ b/pkg/provisioner/ironic/servicing.go @@ -78,16 +78,6 @@ func (p *ironicProvisioner) startServicing(ctx context.Context, bmcAccess bmc.Ac return } -func (p *ironicProvisioner) abortServicing(ctx context.Context, ironicNode *nodes.Node) (result provisioner.Result, started bool, err error) { - p.log.Info("aborting servicing because firmware spec was removed") - started, result, err = p.tryChangeNodeProvisionState( - ctx, - ironicNode, - nodes.ProvisionStateOpts{Target: nodes.TargetAbort}, - ) - return -} - func (p *ironicProvisioner) Service(ctx context.Context, data provisioner.ServicingData, unprepared, restartOnFailure bool) (result provisioner.Result, started bool, err error) { bmcAccess, err := p.bmcAccess() if err != nil { @@ -103,13 +93,7 @@ func (p *ironicProvisioner) Service(ctx context.Context, data provisioner.Servic switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.ServiceFail: - // When servicing failed and user removed all firmware specs, - // abort the servicing operation to back out - if !data.HasFirmwareSpec { - return p.abortServicing(ctx, ironicNode) - } - - // When servicing failed and there are pending updates, we need to clean host provisioning settings + // When servicing failed, we need to clean host provisioning settings. // If restartOnFailure is false, it means the settings aren't cleared. if !restartOnFailure { result, err = operationFailed(ironicNode.LastError) @@ -137,18 +121,7 @@ func (p *ironicProvisioner) Service(ctx context.Context, data provisioner.Servic // Servicing finished p.log.Info("servicing finished on the host") result, err = operationComplete() - case nodes.Servicing: - p.log.Info("waiting for host to finish servicing", - "state", ironicNode.ProvisionState, - "serviceStep", ironicNode.ServiceStep) - result, err = operationContinuing(provisionRequeueDelay) - - case nodes.ServiceWait: - // If user removed all firmware specs while in ServiceWait, abort - if !data.HasFirmwareSpec { - return p.abortServicing(ctx, ironicNode) - } - + case nodes.Servicing, nodes.ServiceWait: p.log.Info("waiting for host to become active", "state", ironicNode.ProvisionState, "serviceStep", ironicNode.ServiceStep) diff --git a/pkg/provisioner/ironic/servicing_test.go b/pkg/provisioner/ironic/servicing_test.go index bf8ff09d9a..7dc07d2183 100644 --- a/pkg/provisioner/ironic/servicing_test.go +++ b/pkg/provisioner/ironic/servicing_test.go @@ -86,17 +86,6 @@ func TestService(t *testing.T) { expectedRequestAfter: 0, expectedDirty: false, }, - { - name: "serviceFail state(abort - no firmware spec)", - ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ - ProvisionState: string(nodes.ServiceFail), - UUID: nodeUUID, - }), - skipConfig: true, - expectedStarted: true, - expectedRequestAfter: 10, - expectedDirty: true, - }, { name: "serviceFail state(retry)", ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ @@ -141,17 +130,6 @@ func TestService(t *testing.T) { expectedRequestAfter: 10, expectedDirty: true, }, - { - name: "serviceWait state(abort - no firmware spec)", - ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ - ProvisionState: string(nodes.ServiceWait), - UUID: nodeUUID, - }), - skipConfig: true, - expectedStarted: true, - expectedRequestAfter: 10, - expectedDirty: true, - }, { name: "active state(servicing finished)", ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ @@ -186,14 +164,13 @@ func TestService(t *testing.T) { host.Status.Provisioning.ID = nodeUUID prepData := provisioner.ServicingData{} if !tc.skipConfig { - host.Spec.BMC.Address = "bios-test://test.bmc/" + host.Spec.BMC.Address = "raid-test://test.bmc/" prepData.ActualFirmwareSettings = metal3api.SettingsMap{ "Answer": "unknown", } prepData.TargetFirmwareSettings = metal3api.DesiredSettingsMap{ "Answer": intstr.FromInt(42), } - prepData.HasFirmwareSpec = true } publisher := func(reason, message string) {} diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index ab940823d1..fbe823b3e3 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -121,9 +121,6 @@ type ServicingData struct { TargetFirmwareSettings metal3api.DesiredSettingsMap ActualFirmwareSettings metal3api.SettingsMap TargetFirmwareComponents []metal3api.FirmwareUpdate - // True if any firmware spec exists (settings, components, or legacy FirmwareConfig), - // used to distinguish "no updates" from "user cleared spec". - HasFirmwareSpec bool } type ProvisionData struct { From 3375ac608d09d569973bd0462e492c2046158787 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Wed, 6 Aug 2025 16:49:08 +1000 Subject: [PATCH 2/6] Initial attempt at using wait step to back out of failed servicing. Generated-By: Claude Code Sonnet 4 Signed-off-by: Jacob Anders (cherry picked from commit c40d0ad4c80587ec223b37a599f58ca4e484f81e) (cherry picked from commit b628dd504e8f00100153228992facd6061c3c46e) --- .../metal3.io/baremetalhost_controller.go | 42 +++++++++++++++++ pkg/provisioner/ironic/servicing.go | 47 ++++++++++++++++++- pkg/provisioner/provisioner.go | 3 ++ 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/internal/controller/metal3.io/baremetalhost_controller.go b/internal/controller/metal3.io/baremetalhost_controller.go index 671b18a87b..36d40bedd7 100644 --- a/internal/controller/metal3.io/baremetalhost_controller.go +++ b/internal/controller/metal3.io/baremetalhost_controller.go @@ -1477,6 +1477,12 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr servicingData.ActualFirmwareSettings = hfs.Status.Settings servicingData.TargetFirmwareSettings = hfs.Spec.Settings } + + // Track if HFS spec has actual settings - check independently since getHostFirmwareSettings + // returns nil when no changes even if object exists + hfsExists := &metal3api.HostFirmwareSettings{} + hfsExistsErr := r.Get(info.ctx, info.request.NamespacedName, hfsExists) + servicingData.HasFirmwareSettingsSpec = (hfsExistsErr == nil && len(hfsExists.Spec.Settings) > 0) } if liveFirmwareUpdatesAllowed { @@ -1493,16 +1499,52 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr servicingData.TargetFirmwareComponents = hfc.Spec.Updates } } + + // Track if HFC spec has actual updates - check independently since getHostFirmwareComponents + // returns nil when no changes even if object exists + hfcExists := &metal3api.HostFirmwareComponents{} + hfcExistsErr := r.Get(info.ctx, info.request.NamespacedName, hfcExists) + servicingData.HasFirmwareComponentsSpec = (hfcExistsErr == nil && len(hfcExists.Spec.Updates) > 0) } hasChanges := fwDirty || hfsDirty || hfcDirty + info.log.Info("janders_debug: servicing data flags", + "hasSettingsSpec", servicingData.HasFirmwareSettingsSpec, + "hasComponentsSpec", servicingData.HasFirmwareComponentsSpec, + "fwDirty", fwDirty, "hfsDirty", hfsDirty, "hfcDirty", hfcDirty, + "hasChanges", hasChanges, "note", "hasSpec flags check if spec.settings/spec.updates have content") + // Even if settings are clean, we need to check the result of the current servicing. if !hasChanges && info.host.Status.OperationalStatus != metal3api.OperationalStatusServicing && info.host.Status.ErrorType != metal3api.ServicingError { // If nothing is going on, return control to the power management. return nil } + // If we're in a servicing error state and updates were removed from spec, + // let the provisioner handle the transition back to active + if info.host.Status.ErrorType == metal3api.ServicingError && !hasChanges { + info.log.Info("updates removed from spec while in servicing error state, attempting recovery") + provResult, _, err := prov.Service(servicingData, false, false) + if err != nil { + return actionError{fmt.Errorf("failed to recover from servicing error: %w", err)} + } + if provResult.ErrorMessage != "" { + info.log.Info("failed to recover from servicing error", "error", provResult.ErrorMessage) + return actionError{fmt.Errorf("failed to recover from servicing error: %s", provResult.ErrorMessage)} + } + if provResult.Dirty { + info.log.Info("abort operation in progress, checking back later") + return actionContinue{provResult.RequeueAfter} + } + // If abort completed and no error, we've successfully recovered + info.log.Info("successfully recovered from servicing error") + info.host.Status.ErrorType = "" + info.host.Status.ErrorMessage = "" + info.host.Status.OperationalStatus = metal3api.OperationalStatusOK + return actionComplete{} + } + // FIXME(janders/dtantsur): this implementation may lead to a scenario where if we never actually // succeed before leaving this state (e.g. by deprovisioning) we lose the signal that the // update didn't actually happen. This is deemed an acceptable risk for the moment since it is only diff --git a/pkg/provisioner/ironic/servicing.go b/pkg/provisioner/ironic/servicing.go index 396e00adae..5871684a36 100644 --- a/pkg/provisioner/ironic/servicing.go +++ b/pkg/provisioner/ironic/servicing.go @@ -78,6 +78,25 @@ func (p *ironicProvisioner) startServicing(ctx context.Context, bmcAccess bmc.Ac return } +func (p *ironicProvisioner) abortServicing(ctx context.Context, ironicNode *nodes.Node) (result provisioner.Result, started bool, err error) { + // Clear maintenance flag first if it's set + if ironicNode.Maintenance { + p.log.Info("clearing maintenance flag before aborting servicing") + result, err = p.setMaintenanceFlag(ctx, ironicNode, false, "") + return result, started, err + } + + // Set started to let the controller know about the change + p.log.Info("aborting servicing due to removal of spec.updates/spec.settings") + started, result, err = p.tryChangeNodeProvisionState( + ctx, + ironicNode, + nodes.ProvisionStateOpts{Target: nodes.TargetAbort}, + ) + p.log.Info("janders_debug: abort result", "started", started, "result", result, "error", err) + return +} + func (p *ironicProvisioner) Service(ctx context.Context, data provisioner.ServicingData, unprepared, restartOnFailure bool) (result provisioner.Result, started bool, err error) { bmcAccess, err := p.bmcAccess() if err != nil { @@ -91,9 +110,29 @@ func (p *ironicProvisioner) Service(ctx context.Context, data provisioner.Servic return result, started, err } + // Check if there are any pending updates + serviceSteps, err := p.buildServiceSteps(bmcAccess, data) + if err != nil { + result, err = operationFailed(err.Error()) + return result, started, err + } + + p.log.Info("janders_debug: servicing state check", + "hasSettingsSpec", data.HasFirmwareSettingsSpec, + "hasComponentsSpec", data.HasFirmwareComponentsSpec, + "serviceStepsCount", len(serviceSteps), + "nodeState", ironicNode.ProvisionState) + switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.ServiceFail: - // When servicing failed, we need to clean host provisioning settings. + // When servicing failed and user actually removed the specs (not just no updates calculated), + // we need to abort the servicing operation to back out + if !data.HasFirmwareSettingsSpec && !data.HasFirmwareComponentsSpec { + p.log.Info("aborting servicing because spec.updates/spec.settings was removed") + return p.abortServicing(ctx, ironicNode) + } + + // When servicing failed and there are pending updates, we need to clean host provisioning settings // If restartOnFailure is false, it means the settings aren't cleared. if !restartOnFailure { result, err = operationFailed(ironicNode.LastError) @@ -122,6 +161,12 @@ func (p *ironicProvisioner) Service(ctx context.Context, data provisioner.Servic p.log.Info("servicing finished on the host") result, err = operationComplete() case nodes.Servicing, nodes.ServiceWait: + // If user actually removed spec.updates/spec.settings while servicing is in progress, abort immediately + if !data.HasFirmwareSettingsSpec && !data.HasFirmwareComponentsSpec { + p.log.Info("aborting in-progress servicing because spec.updates/spec.settings was removed") + return p.abortServicing(ctx, ironicNode) + } + p.log.Info("waiting for host to become active", "state", ironicNode.ProvisionState, "serviceStep", ironicNode.ServiceStep) diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index fbe823b3e3..2fb00fdc17 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -121,6 +121,9 @@ type ServicingData struct { TargetFirmwareSettings metal3api.DesiredSettingsMap ActualFirmwareSettings metal3api.SettingsMap TargetFirmwareComponents []metal3api.FirmwareUpdate + // Flags to track if specs exist (vs. just no updates calculated) + HasFirmwareSettingsSpec bool + HasFirmwareComponentsSpec bool } type ProvisionData struct { From 6e524c6f25f877563f0e17509fc58845746200c7 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Fri, 20 Mar 2026 20:24:25 +1000 Subject: [PATCH 3/6] Adapting old patch to current codebase Signed-off-by: Jacob Anders --- internal/controller/metal3.io/baremetalhost_controller.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/controller/metal3.io/baremetalhost_controller.go b/internal/controller/metal3.io/baremetalhost_controller.go index 36d40bedd7..8d35f5eb54 100644 --- a/internal/controller/metal3.io/baremetalhost_controller.go +++ b/internal/controller/metal3.io/baremetalhost_controller.go @@ -1481,7 +1481,7 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr // Track if HFS spec has actual settings - check independently since getHostFirmwareSettings // returns nil when no changes even if object exists hfsExists := &metal3api.HostFirmwareSettings{} - hfsExistsErr := r.Get(info.ctx, info.request.NamespacedName, hfsExists) + hfsExistsErr := r.Get(ctx, info.request.NamespacedName, hfsExists) servicingData.HasFirmwareSettingsSpec = (hfsExistsErr == nil && len(hfsExists.Spec.Settings) > 0) } @@ -1503,7 +1503,7 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr // Track if HFC spec has actual updates - check independently since getHostFirmwareComponents // returns nil when no changes even if object exists hfcExists := &metal3api.HostFirmwareComponents{} - hfcExistsErr := r.Get(info.ctx, info.request.NamespacedName, hfcExists) + hfcExistsErr := r.Get(ctx, info.request.NamespacedName, hfcExists) servicingData.HasFirmwareComponentsSpec = (hfcExistsErr == nil && len(hfcExists.Spec.Updates) > 0) } @@ -1525,7 +1525,7 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr // let the provisioner handle the transition back to active if info.host.Status.ErrorType == metal3api.ServicingError && !hasChanges { info.log.Info("updates removed from spec while in servicing error state, attempting recovery") - provResult, _, err := prov.Service(servicingData, false, false) + provResult, _, err := prov.Service(ctx, servicingData, false, false) if err != nil { return actionError{fmt.Errorf("failed to recover from servicing error: %w", err)} } From 37bc8af0a87ae16770e4ec0ccb1808b82d80a3e7 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Fri, 20 Mar 2026 21:44:11 +1000 Subject: [PATCH 4/6] Bypass generation mismatch on servicing error recovery When a user clears HFC/HFS specs to abort a failed servicing operation, the BMH controller calls getHostFirmwareSettings/Components which check that metadata.generation matches status.observedGeneration. Since the sub-controllers may not have reconciled yet after the spec was cleared, this generation mismatch causes an error that blocks the abort/recovery path indefinitely, leaving the host stuck in servicing failed with exponential backoff. Add a fast-path check early in doServiceIfNeeded: when in ServicingError state, directly check if specs are empty via lightweight r.Get() calls (which don't validate generation) before falling through to the generation-sensitive getHostFirmwareSettings/Components functions. If specs are cleared, proceed directly to the abort via prov.Service(). Also removes debug logging from the previous iteration. Assisted-By: Claude Opus 4.6 --- .../metal3.io/baremetalhost_controller.go | 69 +++++++++++-------- pkg/provisioner/ironic/servicing.go | 11 +-- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/internal/controller/metal3.io/baremetalhost_controller.go b/internal/controller/metal3.io/baremetalhost_controller.go index 8d35f5eb54..b87c789d83 100644 --- a/internal/controller/metal3.io/baremetalhost_controller.go +++ b/internal/controller/metal3.io/baremetalhost_controller.go @@ -1460,6 +1460,45 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr liveFirmwareUpdatesAllowed = (hup.Spec.FirmwareUpdates == metal3api.HostUpdatePolicyOnReboot) } + // Fast-path: when recovering from a servicing error, check if specs are + // cleared before calling getHostFirmwareSettings/Components. Those + // functions fail on generation mismatch (sub-controller hasn't reconciled + // yet), which would block the abort/recovery path indefinitely. + if info.host.Status.ErrorType == metal3api.ServicingError { + specsExist := false + if liveFirmwareSettingsAllowed { + hfsCheck := &metal3api.HostFirmwareSettings{} + if err := r.Get(ctx, info.request.NamespacedName, hfsCheck); err == nil && len(hfsCheck.Spec.Settings) > 0 { + specsExist = true + } + } + if liveFirmwareUpdatesAllowed && !specsExist { + hfcCheck := &metal3api.HostFirmwareComponents{} + if err := r.Get(ctx, info.request.NamespacedName, hfcCheck); err == nil && len(hfcCheck.Spec.Updates) > 0 { + specsExist = true + } + } + if !specsExist { + info.log.Info("specs cleared while in servicing error state, attempting abort") + provResult, _, err := prov.Service(ctx, servicingData, false, false) + if err != nil { + return actionError{fmt.Errorf("failed to abort servicing: %w", err)} + } + if provResult.ErrorMessage != "" { + return actionError{fmt.Errorf("failed to abort servicing: %s", provResult.ErrorMessage)} + } + if provResult.Dirty { + info.log.Info("abort operation in progress, checking back later") + return actionContinue{provResult.RequeueAfter} + } + info.log.Info("successfully recovered from servicing error") + info.host.Status.ErrorType = "" + info.host.Status.ErrorMessage = "" + info.host.Status.OperationalStatus = metal3api.OperationalStatusOK + return actionComplete{} + } + } + if liveFirmwareSettingsAllowed { // handling pre-HFS FirmwareSettings here if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) { @@ -1509,42 +1548,12 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr hasChanges := fwDirty || hfsDirty || hfcDirty - info.log.Info("janders_debug: servicing data flags", - "hasSettingsSpec", servicingData.HasFirmwareSettingsSpec, - "hasComponentsSpec", servicingData.HasFirmwareComponentsSpec, - "fwDirty", fwDirty, "hfsDirty", hfsDirty, "hfcDirty", hfcDirty, - "hasChanges", hasChanges, "note", "hasSpec flags check if spec.settings/spec.updates have content") - // Even if settings are clean, we need to check the result of the current servicing. if !hasChanges && info.host.Status.OperationalStatus != metal3api.OperationalStatusServicing && info.host.Status.ErrorType != metal3api.ServicingError { // If nothing is going on, return control to the power management. return nil } - // If we're in a servicing error state and updates were removed from spec, - // let the provisioner handle the transition back to active - if info.host.Status.ErrorType == metal3api.ServicingError && !hasChanges { - info.log.Info("updates removed from spec while in servicing error state, attempting recovery") - provResult, _, err := prov.Service(ctx, servicingData, false, false) - if err != nil { - return actionError{fmt.Errorf("failed to recover from servicing error: %w", err)} - } - if provResult.ErrorMessage != "" { - info.log.Info("failed to recover from servicing error", "error", provResult.ErrorMessage) - return actionError{fmt.Errorf("failed to recover from servicing error: %s", provResult.ErrorMessage)} - } - if provResult.Dirty { - info.log.Info("abort operation in progress, checking back later") - return actionContinue{provResult.RequeueAfter} - } - // If abort completed and no error, we've successfully recovered - info.log.Info("successfully recovered from servicing error") - info.host.Status.ErrorType = "" - info.host.Status.ErrorMessage = "" - info.host.Status.OperationalStatus = metal3api.OperationalStatusOK - return actionComplete{} - } - // FIXME(janders/dtantsur): this implementation may lead to a scenario where if we never actually // succeed before leaving this state (e.g. by deprovisioning) we lose the signal that the // update didn't actually happen. This is deemed an acceptable risk for the moment since it is only diff --git a/pkg/provisioner/ironic/servicing.go b/pkg/provisioner/ironic/servicing.go index 5871684a36..54bba8b964 100644 --- a/pkg/provisioner/ironic/servicing.go +++ b/pkg/provisioner/ironic/servicing.go @@ -93,7 +93,6 @@ func (p *ironicProvisioner) abortServicing(ctx context.Context, ironicNode *node ironicNode, nodes.ProvisionStateOpts{Target: nodes.TargetAbort}, ) - p.log.Info("janders_debug: abort result", "started", started, "result", result, "error", err) return } @@ -110,19 +109,11 @@ func (p *ironicProvisioner) Service(ctx context.Context, data provisioner.Servic return result, started, err } - // Check if there are any pending updates - serviceSteps, err := p.buildServiceSteps(bmcAccess, data) - if err != nil { + if _, err = p.buildServiceSteps(bmcAccess, data); err != nil { result, err = operationFailed(err.Error()) return result, started, err } - p.log.Info("janders_debug: servicing state check", - "hasSettingsSpec", data.HasFirmwareSettingsSpec, - "hasComponentsSpec", data.HasFirmwareComponentsSpec, - "serviceStepsCount", len(serviceSteps), - "nodeState", ironicNode.ProvisionState) - switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.ServiceFail: // When servicing failed and user actually removed the specs (not just no updates calculated), From 3e8c53d5a758d93281294c6d22f592c6ee0947cb Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Mon, 23 Mar 2026 13:49:22 +1000 Subject: [PATCH 5/6] Watch HFC/HFS spec changes to trigger BMH reconciliation When a BareMetalHost is in a servicing error state with exponential backoff, clearing the HostFirmwareComponents or HostFirmwareSettings spec would not trigger a BMH reconcile because the BMH controller did not watch those resources. The controller would sit in backoff for hours before discovering the specs were cleared. Add watches on HostFirmwareSettings and HostFirmwareComponents with a GenerationChangedPredicate so that spec changes (which bump generation) trigger an immediate BMH reconcile. Status-only updates from the sub-controllers are filtered out since they don't change generation. Assisted-By: Claude Opus 4.6 --- .../metal3.io/baremetalhost_controller.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/internal/controller/metal3.io/baremetalhost_controller.go b/internal/controller/metal3.io/baremetalhost_controller.go index b87c789d83..30a1207a03 100644 --- a/internal/controller/metal3.io/baremetalhost_controller.go +++ b/internal/controller/metal3.io/baremetalhost_controller.go @@ -46,6 +46,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/predicate" ) @@ -2545,6 +2546,22 @@ func (r *BareMetalHostReconciler) SetupWithManager(mgr ctrl.Manager, preprovImgE controller.Owns(&metal3api.PreprovisioningImage{}) } + // Watch HFC/HFS for spec changes (generation bumps) so that the BMH + // controller reconciles promptly when a user clears firmware specs, + // rather than waiting for the error-state exponential backoff to expire. + firmwareEventHandler := handler.EnqueueRequestsFromMapFunc( + func(_ context.Context, obj client.Object) []ctrl.Request { + return []ctrl.Request{{NamespacedName: client.ObjectKey{ + Name: obj.GetName(), + Namespace: obj.GetNamespace(), + }}} + }, + ) + controller.Watches(&metal3api.HostFirmwareSettings{}, firmwareEventHandler, + builder.WithPredicates(predicate.GenerationChangedPredicate{})) + controller.Watches(&metal3api.HostFirmwareComponents{}, firmwareEventHandler, + builder.WithPredicates(predicate.GenerationChangedPredicate{})) + return controller.Complete(r) } From 41620b320830a6856aa77d88729e88965d69dc7e Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Tue, 31 Mar 2026 16:56:47 +1000 Subject: [PATCH 6/6] Fix HFS setCondition ignoring ObservedGeneration changes The HFS controller's setCondition had a custom dirty-check that only compared the condition Status field (True/False/Unknown) against the old conditions, ignoring ObservedGeneration. When the HFS spec generation changed without altering the condition outcome, the status update was skipped and ObservedGeneration stayed stale. This caused the BMH controller to block indefinitely on the generation mismatch check in getHostFirmwareSettings, preventing power-on after a reboot. Align with the HFC controller's setUpdatesCondition by returning meta.SetStatusCondition directly, which compares all condition fields including ObservedGeneration. Assisted-By: Claude Opus 4.6 Signed-off-by: Jacob Anders --- .../hostfirmwaresettings_controller.go | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/internal/controller/metal3.io/hostfirmwaresettings_controller.go b/internal/controller/metal3.io/hostfirmwaresettings_controller.go index f3000eff53..eae3aa7709 100644 --- a/internal/controller/metal3.io/hostfirmwaresettings_controller.go +++ b/internal/controller/metal3.io/hostfirmwaresettings_controller.go @@ -245,7 +245,7 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info generation := info.hfs.GetGeneration() if specMismatch { - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") { dirty = true } @@ -253,7 +253,7 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info // Eventually this will be handled by a webhook errors := r.validateHostFirmwareSettings(info, &newStatus, schema) if len(errors) == 0 { - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { dirty = true } } else { @@ -264,15 +264,15 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info } } reason = reasonConfigurationError - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") { dirty = true } } } else { - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { dirty = true } - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") { dirty = true } } @@ -422,7 +422,7 @@ func (r *HostFirmwareSettingsReconciler) publishEvent(ctx context.Context, reque } } -func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus, info *rInfo, +func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus, cond metal3api.SettingsConditionType, newStatus metav1.ConditionStatus, reason conditionReason, message string) bool { newCondition := metav1.Condition{ @@ -432,13 +432,8 @@ func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus Reason: string(reason), Message: message, } - meta.SetStatusCondition(&status.Conditions, newCondition) - currCond := meta.FindStatusCondition(info.hfs.Status.Conditions, string(cond)) - if currCond == nil || currCond.Status != newStatus { - return true - } - return false + return meta.SetStatusCondition(&status.Conditions, newCondition) } // Generate a name based on the schema key and values which should be the same for similar hardware.