Commit 0ec02f6

nrbclaude committed

Fix transient error handling in CloudOperatorReconciler and tests
Add handleTransientError/handleDegradeError methods to CloudOperatorReconciler with an aggregatedTransientDegradedThreshold of 2m30s (longer than the sub-controller threshold of 2m, to accommodate sub-controller recovery time).

Fix test: the handleTransientError test was stepping the clock by transientDegradedThreshold (2m, the sub-controller constant) instead of aggregatedTransientDegradedThreshold (2m30s), so the threshold was never exceeded and the degraded condition was never set.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Nolan Brubaker <nolan@nbrubaker.com>

1 parent 165ff0a commit 0ec02f6

3 files changed: 136 additions & 32 deletions
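
The test fix described above comes down to stepping the fake clock past the right constant. A minimal sketch of the corrected pattern, assuming the two constants introduced in this commit and the fake clock from k8s.io/utils/clock/testing (the helper name stepPastAggregatedThreshold is illustrative, not part of the commit):

package controllers

import (
	"time"

	clocktesting "k8s.io/utils/clock/testing"
)

func stepPastAggregatedThreshold() {
	// Start a fake clock; handleTransientError reads time through r.Clock.
	fakeClock := clocktesting.NewFakeClock(time.Now())

	// Broken (old test): stepped by the sub-controller constant (2m), which never
	// crosses the aggregated threshold (2m30s), so Degraded was never set.
	// fakeClock.Step(transientDegradedThreshold)

	// Fixed (this commit): advance past the aggregated threshold so the next
	// handleTransientError call sees the failure window exceeded.
	fakeClock.Step(aggregatedTransientDegradedThreshold + time.Second)
}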

pkg/controllers/cloud_config_sync_controller.go

Lines changed: 2 additions & 1 deletion
@@ -32,8 +32,9 @@ const (
 	cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded"
 
 	// transientDegradedThreshold is how long transient errors must persist before
-	// the controller sets CloudConfigControllerDegraded=True. This prevents brief
+	// the controller sets Degraded=True. This prevents brief
 	// API server blips during upgrades from immediately degrading the operator.
+	// Applies to both CloudConfigController and TrustedCAController.
 	transientDegradedThreshold = 2 * time.Minute
 )
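
For reference, the new aggregated threshold added to clusteroperator_controller.go in this commit is 30 seconds longer than this sub-controller constant. A small standalone illustration of how the two windows relate during a hypothetical outage (local copies of the constants are used so the snippet runs on its own):

package main

import (
	"fmt"
	"time"
)

func main() {
	// Local copies of the two constants for a self-contained example.
	transientDegradedThreshold := 2 * time.Minute                           // sub-controller window
	aggregatedTransientDegradedThreshold := 2*time.Minute + 30*time.Second  // top-level window

	// A hypothetical transient outage of 2m10s.
	outage := 2*time.Minute + 10*time.Second

	fmt.Println(outage > transientDegradedThreshold)           // true: long enough to degrade a sub-controller
	fmt.Println(outage > aggregatedTransientDegradedThreshold) // false: the top-level operator holds off for another 20s
}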

pkg/controllers/clusteroperator_controller.go

Lines changed: 63 additions & 31 deletions
@@ -19,6 +19,7 @@ package controllers
 import (
 	"context"
 	"fmt"
+	"time"
 
 	configv1 "github.com/openshift/api/config/v1"
 	operatorv1 "github.com/openshift/api/operator/v1"
@@ -45,16 +46,24 @@ const (
 
 	// Condition type for Cloud Controller ownership
 	cloudControllerOwnershipCondition = "CloudControllerOwner"
+
+	// aggregatedTransientDegradedThreshold is how long transient errors must persist before
+	// the controller sets Degraded=True.
+	// This prevents brief API server blips during upgrades from immediately degrading the operator.
+	// Applies to top-level operator, and is longer in order
+	// to accomodate changes in the lower-level operators.
+	aggregatedTransientDegradedThreshold = 2*time.Minute + (30 * time.Second)
 )
 
 // CloudOperatorReconciler reconciles a ClusterOperator object
 type CloudOperatorReconciler struct {
 	ClusterOperatorStatusClient
-	Scheme            *runtime.Scheme
-	watcher           ObjectWatcher
-	ImagesFile        string
-	FeatureGateAccess featuregates.FeatureGateAccess
-	TLSProfileSpec    configv1.TLSProfileSpec
+	Scheme                  *runtime.Scheme
+	watcher                 ObjectWatcher
+	ImagesFile              string
+	FeatureGateAccess       featuregates.FeatureGateAccess
+	TLSProfileSpec          configv1.TLSProfileSpec
+	consecutiveFailureSince *time.Time // nil when the last reconcile succeeded
 }
 
 // +kubebuilder:rbac:groups=config.openshift.io,resources=clusteroperators,verbs=get;list;watch;create;update;patch;delete
@@ -69,59 +78,43 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request)
 	infra := &configv1.Infrastructure{}
 	if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) {
 		klog.Infof("Infrastructure cluster does not exist. Skipping...")
-
 		if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil {
 			klog.Errorf("Unable to sync cluster operator status: %s", err)
-			return ctrl.Result{}, err
+			return r.handleTransientError(ctx, conditionOverrides, err)
 		}
-
+		// It's ok for the infrastructure cluster to not exist
+		r.clearFailureWindow()
 		return ctrl.Result{}, nil
 	} else if err != nil {
 		klog.Errorf("Unable to retrive Infrastructure object: %v", err)
-
-		if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil {
-			klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
-			return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err)
-		}
-		return ctrl.Result{}, err
+		return r.handleTransientError(ctx, conditionOverrides, err)
 	}
 
 	allowedToProvision, err := r.provisioningAllowed(ctx, infra, conditionOverrides)
 	if err != nil {
 		klog.Errorf("Unable to determine cluster state to check if provision is allowed: %v", err)
-		return ctrl.Result{}, err
+		return r.handleTransientError(ctx, conditionOverrides, err)
 	} else if !allowedToProvision {
+		// We're not allowed to provision, but didn't have any failures.
+		r.clearFailureWindow()
 		return ctrl.Result{}, nil
 	}
 
 	clusterProxy := &configv1.Proxy{}
 	if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !errors.IsNotFound(err) {
 		klog.Errorf("Unable to retrive Proxy object: %v", err)
-
-		if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil {
-			klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
-			return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err)
-		}
-		return ctrl.Result{}, err
+		return r.handleTransientError(ctx, conditionOverrides, err)
 	}
 
 	operatorConfig, err := config.ComposeConfig(infra, clusterProxy, r.ImagesFile, r.ManagedNamespace, r.FeatureGateAccess, r.TLSProfileSpec)
 	if err != nil {
 		klog.Errorf("Unable to build operator config %s", err)
-		if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil {
-			klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
-			return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err)
-		}
-		return ctrl.Result{}, err
+		return r.handleDegradeError(ctx, conditionOverrides, err)
 	}
 
 	if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil {
 		klog.Errorf("Unable to sync operands: %s", err)
-		if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil {
-			klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
-			return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err)
-		}
-		return ctrl.Result{}, err
+		return r.handleTransientError(ctx, conditionOverrides, err)
 	}
 
 	if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil {
@@ -134,9 +127,48 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request)
 		return ctrl.Result{}, err
 	}
 
+	// successful reconcile, make sure the failure window is cleared.
+	r.clearFailureWindow()
 	return ctrl.Result{}, nil
 }
 
+func (r *CloudOperatorReconciler) clearFailureWindow() {
+	r.consecutiveFailureSince = nil
+}
+
+// handleTransientError records the start of a failure window and degrades the
+// operator only after aggregatedTransientDegradedThreshold has elapsed. It always returns
+// a non-nil error so controller-runtime requeues with exponential backoff.
+func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) {
+	now := r.Clock.Now()
+	if r.consecutiveFailureSince == nil {
+		r.consecutiveFailureSince = &now
+		klog.V(4).Infof("CloudOperatorReconciler: transient failure started (%v), will degrade after %s", err, aggregatedTransientDegradedThreshold)
+		return ctrl.Result{}, err
+	}
+	elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince)
+	if elapsed < aggregatedTransientDegradedThreshold {
+		klog.V(4).Infof("CloudOperatorReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, aggregatedTransientDegradedThreshold, err)
+		return ctrl.Result{}, err
+	}
+	klog.Warningf("CloudOperatorReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err)
+	if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil {
+		return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr)
+	}
+	return ctrl.Result{}, err
+}
+
+// handleDegradeError sets OperatorDegraded=True immediately and returns nil so
+// controller-runtime does NOT requeue. Existing watches on Infrastructure,
+// ConfigMaps, and Secrets will re-trigger reconciliation when the problem is fixed.
+func (r *CloudOperatorReconciler) handleDegradeError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) {
+	klog.Errorf("CloudOperatorReconciler: persistent error, setting degraded: %v", err)
+	if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil {
+		return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr)
+	}
+	return ctrl.Result{}, nil // do not requeue; a watch event will re-trigger
+}
+
 func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) error {
 	// Deploy resources for platform
 	resources, err := cloud.GetResources(config)
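
The failure-window bookkeeping above (consecutiveFailureSince, clearFailureWindow, and the threshold check in handleTransientError) can be read in isolation. A self-contained sketch of the same pattern, using hypothetical names (failureWindow, observeFailure) rather than the controller's actual types:

package main

import (
	"fmt"
	"time"
)

// failureWindow mirrors the consecutiveFailureSince bookkeeping: nil means the
// last reconcile succeeded; otherwise it marks when consecutive failures began.
type failureWindow struct {
	since     *time.Time
	threshold time.Duration
}

// observeFailure records a failure at "now" and reports whether failures have
// persisted for at least the threshold (i.e. the operator should go Degraded).
func (w *failureWindow) observeFailure(now time.Time) bool {
	if w.since == nil {
		w.since = &now
		return false
	}
	return now.Sub(*w.since) >= w.threshold
}

// clear resets the window after a successful reconcile.
func (w *failureWindow) clear() { w.since = nil }

func main() {
	w := &failureWindow{threshold: 2*time.Minute + 30*time.Second}
	start := time.Now()
	fmt.Println(w.observeFailure(start))                      // false: window just opened
	fmt.Println(w.observeFailure(start.Add(2 * time.Minute))) // false: 2m elapsed, below 2m30s
	fmt.Println(w.observeFailure(start.Add(3 * time.Minute))) // true: threshold exceeded, degrade
	w.clear()                                                 // successful reconcile resets the window
}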

pkg/controllers/clusteroperator_controller_test.go

Lines changed: 71 additions & 0 deletions
@@ -2,6 +2,7 @@ package controllers
 
 import (
 	"context"
+	"fmt"
 	"time"
 
 	. "github.com/onsi/ginkgo/v2"
@@ -618,3 +619,73 @@ var _ = Describe("Apply resources should", func() {
 	})
 
 })
+
+var _ = Describe("CloudOperatorReconciler error handling", func() {
+	ctx := context.Background()
+
+	AfterEach(func() {
+		co := &configv1.ClusterOperator{}
+		if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil {
+			Eventually(func() bool {
+				err := cl.Delete(ctx, co)
+				return err == nil || apierrors.IsNotFound(err)
+			}).Should(BeTrue())
+		}
+		Eventually(apierrors.IsNotFound(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co))).Should(BeTrue())
+	})
+
+	It("handleDegradeError should set OperatorDegraded=True immediately and return nil error", func() {
+		reconciler := &CloudOperatorReconciler{
+			ClusterOperatorStatusClient: ClusterOperatorStatusClient{
+				Client:           cl,
+				Clock:            clocktesting.NewFakePassiveClock(time.Now()),
+				ManagedNamespace: defaultManagementNamespace,
+				Recorder:         record.NewFakeRecorder(32),
+			},
+			Scheme: scheme.Scheme,
+		}
+
+		_, err := reconciler.handleDegradeError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test persistent error"))
+		Expect(err).NotTo(HaveOccurred())
+
+		co := &configv1.ClusterOperator{}
+		Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
+		Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue())
+	})
+
+	It("handleTransientError should not degrade before threshold, but degrade after threshold", func() {
+		fakeClock := clocktesting.NewFakeClock(time.Now())
+		reconciler := &CloudOperatorReconciler{
+			ClusterOperatorStatusClient: ClusterOperatorStatusClient{
+				Client:           cl,
+				Clock:            fakeClock,
+				ManagedNamespace: defaultManagementNamespace,
+				Recorder:         record.NewFakeRecorder(32),
+			},
+			Scheme: scheme.Scheme,
+		}
+
+		// Pre-create the ClusterOperator so that setStatusDegraded can update its status
+		// subresource when the threshold is exceeded (status subresource updates require the
+		// object to already exist in the cluster).
+		co := &configv1.ClusterOperator{}
+		co.SetName(clusterOperatorName)
+		Expect(cl.Create(ctx, co)).To(Succeed())
+
+		// First reconcile: transient failure starts; error returned but no degraded condition set.
+		_, err := reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error"))
+		Expect(err).To(HaveOccurred())
+		Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
+		Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeFalse(),
+			"should not be degraded before threshold")
+
+		// Advance clock past the degraded threshold.
+		fakeClock.Step(aggregatedTransientDegradedThreshold + time.Second)
+
+		// Second reconcile: threshold exceeded, controller sets degraded.
+		_, err = reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error"))
+		Expect(err).To(HaveOccurred())
+		Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
+		Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue())
+	})
+})
