@@ -19,6 +19,7 @@ package controllers
1919import (
2020 "context"
2121 "fmt"
22+ "time"
2223
2324 configv1 "github.com/openshift/api/config/v1"
2425 operatorv1 "github.com/openshift/api/operator/v1"
@@ -45,16 +46,24 @@ const (
4546
4647 // Condition type for Cloud Controller ownership
4748 cloudControllerOwnershipCondition = "CloudControllerOwner"
49+
50+ // aggregatedTransientDegradedThreshold is how long transient errors must persist before
51+ // the controller sets Degraded=True.
52+ // This prevents brief API server blips during upgrades from immediately degrading the operator.
53+ // Applies to the top-level operator, and is longer in order
54+ // to accommodate changes in the lower-level operators.
55+ aggregatedTransientDegradedThreshold = 2 * time .Minute + (30 * time .Second )
4856)
4957
5058// CloudOperatorReconciler reconciles a ClusterOperator object
5159type CloudOperatorReconciler struct {
5260 ClusterOperatorStatusClient
53- Scheme * runtime.Scheme
54- watcher ObjectWatcher
55- ImagesFile string
56- FeatureGateAccess featuregates.FeatureGateAccess
57- TLSProfileSpec configv1.TLSProfileSpec
61+ Scheme * runtime.Scheme
62+ watcher ObjectWatcher
63+ ImagesFile string
64+ FeatureGateAccess featuregates.FeatureGateAccess
65+ TLSProfileSpec configv1.TLSProfileSpec
66+ consecutiveFailureSince * time.Time // start of the current transient-failure window (set by handleTransientError); nil after clearFailureWindow runs
5867}
5968
6069// +kubebuilder:rbac:groups=config.openshift.io,resources=clusteroperators,verbs=get;list;watch;create;update;patch;delete
@@ -69,59 +78,43 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request)
6978 infra := & configv1.Infrastructure {}
7079 if err := r .Get (ctx , client.ObjectKey {Name : infrastructureResourceName }, infra ); errors .IsNotFound (err ) {
7180 klog .Infof ("Infrastructure cluster does not exist. Skipping..." )
72-
7381 if err := r .setStatusAvailable (ctx , conditionOverrides ); err != nil {
7482 klog .Errorf ("Unable to sync cluster operator status: %s" , err )
75- return ctrl. Result {}, err
83+ return r . handleTransientError ( ctx , conditionOverrides , err )
7684 }
77-
85+ // It's ok for the infrastructure cluster to not exist
86+ r .clearFailureWindow ()
7887 return ctrl.Result {}, nil
7988 } else if err != nil {
8089 klog .Errorf ("Unable to retrive Infrastructure object: %v" , err )
81-
82- if err := r .setStatusDegraded (ctx , err , conditionOverrides ); err != nil {
83- klog .Errorf ("Error syncing ClusterOperatorStatus: %v" , err )
84- return ctrl.Result {}, fmt .Errorf ("error syncing ClusterOperatorStatus: %v" , err )
85- }
86- return ctrl.Result {}, err
90+ return r .handleTransientError (ctx , conditionOverrides , err )
8791 }
8892
8993 allowedToProvision , err := r .provisioningAllowed (ctx , infra , conditionOverrides )
9094 if err != nil {
9195 klog .Errorf ("Unable to determine cluster state to check if provision is allowed: %v" , err )
92- return ctrl. Result {}, err
96+ return r . handleTransientError ( ctx , conditionOverrides , err )
9397 } else if ! allowedToProvision {
98+ // We're not allowed to provision, but didn't have any failures.
99+ r .clearFailureWindow ()
94100 return ctrl.Result {}, nil
95101 }
96102
97103 clusterProxy := & configv1.Proxy {}
98104 if err := r .Get (ctx , client.ObjectKey {Name : proxyResourceName }, clusterProxy ); err != nil && ! errors .IsNotFound (err ) {
99105 klog .Errorf ("Unable to retrive Proxy object: %v" , err )
100-
101- if err := r .setStatusDegraded (ctx , err , conditionOverrides ); err != nil {
102- klog .Errorf ("Error syncing ClusterOperatorStatus: %v" , err )
103- return ctrl.Result {}, fmt .Errorf ("error syncing ClusterOperatorStatus: %v" , err )
104- }
105- return ctrl.Result {}, err
106+ return r .handleTransientError (ctx , conditionOverrides , err )
106107 }
107108
108109 operatorConfig , err := config .ComposeConfig (infra , clusterProxy , r .ImagesFile , r .ManagedNamespace , r .FeatureGateAccess , r .TLSProfileSpec )
109110 if err != nil {
110111 klog .Errorf ("Unable to build operator config %s" , err )
111- if err := r .setStatusDegraded (ctx , err , conditionOverrides ); err != nil {
112- klog .Errorf ("Error syncing ClusterOperatorStatus: %v" , err )
113- return ctrl.Result {}, fmt .Errorf ("error syncing ClusterOperatorStatus: %v" , err )
114- }
115- return ctrl.Result {}, err
112+ return r .handleDegradeError (ctx , conditionOverrides , err )
116113 }
117114
118115 if err := r .sync (ctx , operatorConfig , conditionOverrides ); err != nil {
119116 klog .Errorf ("Unable to sync operands: %s" , err )
120- if err := r .setStatusDegraded (ctx , err , conditionOverrides ); err != nil {
121- klog .Errorf ("Error syncing ClusterOperatorStatus: %v" , err )
122- return ctrl.Result {}, fmt .Errorf ("error syncing ClusterOperatorStatus: %v" , err )
123- }
124- return ctrl.Result {}, err
117+ return r .handleTransientError (ctx , conditionOverrides , err )
125118 }
126119
127120 if err := r .setStatusAvailable (ctx , conditionOverrides ); err != nil {
@@ -134,9 +127,48 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request)
134127 return ctrl.Result {}, err
135128 }
136129
130+ // successful reconcile, make sure the failure window is cleared.
131+ r .clearFailureWindow ()
137132 return ctrl.Result {}, nil
138133}
139134
135+ func (r * CloudOperatorReconciler ) clearFailureWindow () {
136+ r .consecutiveFailureSince = nil
137+ }
138+
139+ // handleTransientError records the start of a failure window and degrades the
140+ // operator only after aggregatedTransientDegradedThreshold has elapsed. It always returns
141+ // a non-nil error so controller-runtime requeues with exponential backoff.
142+ func (r * CloudOperatorReconciler ) handleTransientError (ctx context.Context , conditionOverrides []configv1.ClusterOperatorStatusCondition , err error ) (ctrl.Result , error ) {
143+ now := r .Clock .Now ()
144+ if r .consecutiveFailureSince == nil {
145+ r .consecutiveFailureSince = & now
146+ klog .V (4 ).Infof ("CloudOperatorReconciler: transient failure started (%v), will degrade after %s" , err , aggregatedTransientDegradedThreshold )
147+ return ctrl.Result {}, err
148+ }
149+ elapsed := r .Clock .Now ().Sub (* r .consecutiveFailureSince )
150+ if elapsed < aggregatedTransientDegradedThreshold {
151+ klog .V (4 ).Infof ("CloudOperatorReconciler: transient failure ongoing for %s (threshold %s): %v" , elapsed , aggregatedTransientDegradedThreshold , err )
152+ return ctrl.Result {}, err
153+ }
154+ klog .Warningf ("CloudOperatorReconciler: transient failure exceeded threshold (%s), setting degraded: %v" , elapsed , err )
155+ if setErr := r .setStatusDegraded (ctx , err , conditionOverrides ); setErr != nil {
156+ return ctrl.Result {}, fmt .Errorf ("error syncing ClusterOperatorStatus: %v" , setErr )
157+ }
158+ return ctrl.Result {}, err
159+ }
160+
161+ // handleDegradeError sets OperatorDegraded=True immediately and returns nil so
162+ // controller-runtime does NOT requeue. Existing watches on Infrastructure,
163+ // ConfigMaps, and Secrets will re-trigger reconciliation when the problem is fixed.
164+ func (r * CloudOperatorReconciler ) handleDegradeError (ctx context.Context , conditionOverrides []configv1.ClusterOperatorStatusCondition , err error ) (ctrl.Result , error ) {
165+ klog .Errorf ("CloudOperatorReconciler: persistent error, setting degraded: %v" , err )
166+ if setErr := r .setStatusDegraded (ctx , err , conditionOverrides ); setErr != nil {
167+ return ctrl.Result {}, fmt .Errorf ("error syncing ClusterOperatorStatus: %v" , setErr )
168+ }
169+ return ctrl.Result {}, nil // do not requeue; a watch event will re-trigger
170+ }
171+
140172func (r * CloudOperatorReconciler ) sync (ctx context.Context , config config.OperatorConfig , conditionOverrides []configv1.ClusterOperatorStatusCondition ) error {
141173 // Deploy resources for platform
142174 resources , err := cloud .GetResources (config )
0 commit comments