Commit 0ec02f6

nrbclaude committed

Fix transient error handling in CloudOperatorReconciler and tests
Add handleTransientError/handleDegradeError methods to CloudOperatorReconciler with an aggregatedTransientDegradedThreshold of 2m30s (longer than the sub-controller threshold of 2m, to accommodate sub-controller recovery time).

Fix test: the handleTransientError test was stepping the clock by transientDegradedThreshold (2m, the sub-controller constant) instead of aggregatedTransientDegradedThreshold (2m30s), so the threshold was never exceeded and the degraded condition was never set.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Nolan Brubaker <nolan@nbrubaker.com>

1 parent 165ff0a commit 0ec02f6

3 files changed: 136 additions & 32 deletions
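
The test fix described above comes down to stepping the fake clock past the right constant. A minimal sketch of the corrected pattern, assuming the two constants introduced in this commit and the fake clock from k8s.io/utils/clock/testing (the helper name stepPastAggregatedThreshold is illustrative, not part of the commit):

package controllers

import (
	"time"

	clocktesting "k8s.io/utils/clock/testing"
)

func stepPastAggregatedThreshold() {
	// Start a fake clock; handleTransientError reads time through r.Clock.
	fakeClock := clocktesting.NewFakeClock(time.Now())

	// Broken (old test): stepped by the sub-controller constant (2m), which never
	// crosses the aggregated threshold (2m30s), so Degraded was never set.
	// fakeClock.Step(transientDegradedThreshold)

	// Fixed (this commit): advance past the aggregated threshold so the next
	// handleTransientError call sees the failure window exceeded.
	fakeClock.Step(aggregatedTransientDegradedThreshold + time.Second)
}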

pkg/controllers/cloud_config_sync_controller.go

Lines changed: 2 additions & 1 deletion
@@ -32,8 +32,9 @@ const (
 	cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded"
 
 	// transientDegradedThreshold is how long transient errors must persist before
-	// the controller sets CloudConfigControllerDegraded=True. This prevents brief
+	// the controller sets Degraded=True. This prevents brief
 	// API server blips during upgrades from immediately degrading the operator.
+	// Applies to both CloudConfigController and TrustedCAController.
 	transientDegradedThreshold = 2 * time.Minute
 )
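
For reference, the new aggregated threshold added to clusteroperator_controller.go in this commit is 30 seconds longer than this sub-controller constant. A small standalone illustration of how the two windows relate during a hypothetical outage (local copies of the constants are used so the snippet runs on its own):

package main

import (
	"fmt"
	"time"
)

func main() {
	// Local copies of the two constants for a self-contained example.
	transientDegradedThreshold := 2 * time.Minute                           // sub-controller window
	aggregatedTransientDegradedThreshold := 2*time.Minute + 30*time.Second  // top-level window

	// A hypothetical transient outage of 2m10s.
	outage := 2*time.Minute + 10*time.Second

	fmt.Println(outage > transientDegradedThreshold)           // true: long enough to degrade a sub-controller
	fmt.Println(outage > aggregatedTransientDegradedThreshold) // false: the top-level operator holds off for another 20s
}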

pkg/controllers/clusteroperator_controller.go

Lines changed: 63 additions & 31 deletions
@@ -19,6 +19,7 @@ package controllers
 import (
 	"context"
 	"fmt"
+	"time"
 
 	configv1 "github.com/openshift/api/config/v1"
 	operatorv1 "github.com/openshift/api/operator/v1"
@@ -45,16 +46,24 @@ const (
 
 	// Condition type for Cloud Controller ownership
 	cloudControllerOwnershipCondition = "CloudControllerOwner"
+
+	// aggregatedTransientDegradedThreshold is how long transient errors must persist before
+	// the controller sets Degraded=True.
+	// This prevents brief API server blips during upgrades from immediately degrading the operator.
+	// Applies to top-level operator, and is longer in order
+	// to accomodate changes in the lower-level operators.
+	aggregatedTransientDegradedThreshold = 2*time.Minute + (30 * time.Second)
 )
 
 // CloudOperatorReconciler reconciles a ClusterOperator object
 type CloudOperatorReconciler struct {
 	ClusterOperatorStatusClient
-	Scheme            *runtime.Scheme
-	watcher           ObjectWatcher
-	ImagesFile        string
-	FeatureGateAccess featuregates.FeatureGateAccess
-	TLSProfileSpec    configv1.TLSProfileSpec
+	Scheme                  *runtime.Scheme
+	watcher                 ObjectWatcher
+	ImagesFile              string
+	FeatureGateAccess       featuregates.FeatureGateAccess
+	TLSProfileSpec          configv1.TLSProfileSpec
+	consecutiveFailureSince *time.Time // nil when the last reconcile succeeded
 }
 
 // +kubebuilder:rbac:groups=config.openshift.io,resources=clusteroperators,verbs=get;list;watch;create;update;patch;delete
@@ -69,59 +78,43 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request)
 	infra := &configv1.Infrastructure{}
 	if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) {
 		klog.Infof("Infrastructure cluster does not exist. Skipping...")
-
 		if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil {
 			klog.Errorf("Unable to sync cluster operator status: %s", err)
-			return ctrl.Result{}, err
+			return r.handleTransientError(ctx, conditionOverrides, err)
 		}
-
+		// It's ok for the infrastructure cluster to not exist
+		r.clearFailureWindow()
 		return ctrl.Result{}, nil
 	} else if err != nil {
 		klog.Errorf("Unable to retrive Infrastructure object: %v", err)
-
-		if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil {
-			klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
-			return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err)
-		}
-		return ctrl.Result{}, err
+		return r.handleTransientError(ctx, conditionOverrides, err)
 	}
 
 	allowedToProvision, err := r.provisioningAllowed(ctx, infra, conditionOverrides)
 	if err != nil {
 		klog.Errorf("Unable to determine cluster state to check if provision is allowed: %v", err)
-		return ctrl.Result{}, err
+		return r.handleTransientError(ctx, conditionOverrides, err)
 	} else if !allowedToProvision {
+		// We're not allowed to provision, but didn't have any failures.
+		r.clearFailureWindow()
 		return ctrl.Result{}, nil
 	}
 
 	clusterProxy := &configv1.Proxy{}
 	if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !errors.IsNotFound(err) {
 		klog.Errorf("Unable to retrive Proxy object: %v", err)
-
-		if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil {
-			klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
-			return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err)
-		}
-		return ctrl.Result{}, err
+		return r.handleTransientError(ctx, conditionOverrides, err)
 	}
 
 	operatorConfig, err := config.ComposeConfig(infra, clusterProxy, r.ImagesFile, r.ManagedNamespace, r.FeatureGateAccess, r.TLSProfileSpec)
 	if err != nil {
 		klog.Errorf("Unable to build operator config %s", err)
-		if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil {
-			klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
-			return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err)
-		}
-		return ctrl.Result{}, err
+		return r.handleDegradeError(ctx, conditionOverrides, err)
 	}
 
 	if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil {
 		klog.Errorf("Unable to sync operands: %s", err)
-		if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil {
-			klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
-			return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err)
-		}
-		return ctrl.Result{}, err
+		return r.handleTransientError(ctx, conditionOverrides, err)
 	}
 
 	if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil {
@@ -134,9 +127,48 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request)
 		return ctrl.Result{}, err
 	}
 
+	// successful reconcile, make sure the failure window is cleared.
+	r.clearFailureWindow()
 	return ctrl.Result{}, nil
 }
 
+func (r *CloudOperatorReconciler) clearFailureWindow() {
+	r.consecutiveFailureSince = nil
+}
+
+// handleTransientError records the start of a failure window and degrades the
+// operator only after aggregatedTransientDegradedThreshold has elapsed. It always returns
+// a non-nil error so controller-runtime requeues with exponential backoff.
+func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) {
+	now := r.Clock.Now()
+	if r.consecutiveFailureSince == nil {
+		r.consecutiveFailureSince = &now
+		klog.V(4).Infof("CloudOperatorReconciler: transient failure started (%v), will degrade after %s", err, aggregatedTransientDegradedThreshold)
+		return ctrl.Result{}, err
+	}
+	elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince)
+	if elapsed < aggregatedTransientDegradedThreshold {
+		klog.V(4).Infof("CloudOperatorReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, aggregatedTransientDegradedThreshold, err)
+		return ctrl.Result{}, err
+	}
+	klog.Warningf("CloudOperatorReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err)
+	if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil {
+		return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr)
+	}
+	return ctrl.Result{}, err
+}
+
+// handleDegradeError sets OperatorDegraded=True immediately and returns nil so
+// controller-runtime does NOT requeue. Existing watches on Infrastructure,
+// ConfigMaps, and Secrets will re-trigger reconciliation when the problem is fixed.
+func (r *CloudOperatorReconciler) handleDegradeError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) {
+	klog.Errorf("CloudOperatorReconciler: persistent error, setting degraded: %v", err)
+	if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil {
+		return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr)
+	}
+	return ctrl.Result{}, nil // do not requeue; a watch event will re-trigger
+}
+
 func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) error {
 	// Deploy resources for platform
 	resources, err := cloud.GetResources(config)
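
The failure-window bookkeeping above (consecutiveFailureSince, clearFailureWindow, and the threshold check in handleTransientError) can be read in isolation. A self-contained sketch of the same pattern, using hypothetical names (failureWindow, observeFailure) rather than the controller's actual types:

package main

import (
	"fmt"
	"time"
)

// failureWindow mirrors the consecutiveFailureSince bookkeeping: nil means the
// last reconcile succeeded; otherwise it marks when consecutive failures began.
type failureWindow struct {
	since     *time.Time
	threshold time.Duration
}

// observeFailure records a failure at "now" and reports whether failures have
// persisted for at least the threshold (i.e. the operator should go Degraded).
func (w *failureWindow) observeFailure(now time.Time) bool {
	if w.since == nil {
		w.since = &now
		return false
	}
	return now.Sub(*w.since) >= w.threshold
}

// clear resets the window after a successful reconcile.
func (w *failureWindow) clear() { w.since = nil }

func main() {
	w := &failureWindow{threshold: 2*time.Minute + 30*time.Second}
	start := time.Now()
	fmt.Println(w.observeFailure(start))                      // false: window just opened
	fmt.Println(w.observeFailure(start.Add(2 * time.Minute))) // false: 2m elapsed, below 2m30s
	fmt.Println(w.observeFailure(start.Add(3 * time.Minute))) // true: threshold exceeded, degrade
	w.clear()                                                 // successful reconcile resets the window
}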

pkg/controllers/clusteroperator_controller_test.go

Lines changed: 71 additions & 0 deletions
@@ -2,6 +2,7 @@ package controllers
 
 import (
 	"context"
+	"fmt"
 	"time"
 
 	. "github.com/onsi/ginkgo/v2"
@@ -618,3 +619,73 @@ var _ = Describe("Apply resources should", func() {
 	})
 
 })
+
+var _ = Describe("CloudOperatorReconciler error handling", func() {
+	ctx := context.Background()
+
+	AfterEach(func() {
+		co := &configv1.ClusterOperator{}
+		if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil {
+			Eventually(func() bool {
+				err := cl.Delete(ctx, co)
+				return err == nil || apierrors.IsNotFound(err)
+			}).Should(BeTrue())
+		}
+		Eventually(apierrors.IsNotFound(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co))).Should(BeTrue())
+	})
+
+	It("handleDegradeError should set OperatorDegraded=True immediately and return nil error", func() {
+		reconciler := &CloudOperatorReconciler{
+			ClusterOperatorStatusClient: ClusterOperatorStatusClient{
+				Client:           cl,
+				Clock:            clocktesting.NewFakePassiveClock(time.Now()),
+				ManagedNamespace: defaultManagementNamespace,
+				Recorder:         record.NewFakeRecorder(32),
+			},
+			Scheme: scheme.Scheme,
+		}
+
+		_, err := reconciler.handleDegradeError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test persistent error"))
+		Expect(err).NotTo(HaveOccurred())
+
+		co := &configv1.ClusterOperator{}
+		Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
+		Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue())
+	})
+
+	It("handleTransientError should not degrade before threshold, but degrade after threshold", func() {
+		fakeClock := clocktesting.NewFakeClock(time.Now())
+		reconciler := &CloudOperatorReconciler{
+			ClusterOperatorStatusClient: ClusterOperatorStatusClient{
+				Client:           cl,
+				Clock:            fakeClock,
+				ManagedNamespace: defaultManagementNamespace,
+				Recorder:         record.NewFakeRecorder(32),
+			},
+			Scheme: scheme.Scheme,
+		}
+
+		// Pre-create the ClusterOperator so that setStatusDegraded can update its status
+		// subresource when the threshold is exceeded (status subresource updates require the
+		// object to already exist in the cluster).
+		co := &configv1.ClusterOperator{}
+		co.SetName(clusterOperatorName)
+		Expect(cl.Create(ctx, co)).To(Succeed())
+
+		// First reconcile: transient failure starts; error returned but no degraded condition set.
+		_, err := reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error"))
+		Expect(err).To(HaveOccurred())
+		Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
+		Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeFalse(),
+			"should not be degraded before threshold")
+
+		// Advance clock past the degraded threshold.
+		fakeClock.Step(aggregatedTransientDegradedThreshold + time.Second)
+
+		// Second reconcile: threshold exceeded, controller sets degraded.
+		_, err = reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error"))
+		Expect(err).To(HaveOccurred())
+		Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
+		Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue())
+	})
+})
