diff --git a/pkg/gatherers/conditional/conditional_gatherer.go b/pkg/gatherers/conditional/conditional_gatherer.go index 3ada944fc0..b6e786cd5e 100644 --- a/pkg/gatherers/conditional/conditional_gatherer.go +++ b/pkg/gatherers/conditional/conditional_gatherer.go @@ -31,6 +31,7 @@ import ( "github.com/openshift/insights-operator/pkg/gatherers/common" "github.com/openshift/insights-operator/pkg/insights/insightsclient" "github.com/openshift/insights-operator/pkg/record" + "github.com/openshift/insights-operator/pkg/retry" "github.com/openshift/insights-operator/pkg/utils" ) @@ -271,35 +272,33 @@ func (g *Gatherer) getRemoteConfiguration(ctx context.Context) ([]byte, error) { Cap: 3 * time.Minute, } endpointWithVersion := fmt.Sprintf(endpoint, ocpVersion) - var remoteConfigData []byte - err = wait.ExponentialBackoffWithContext(ctx, backOff, func(ctx context.Context) (done bool, err error) { + + result, err := retry.RetryWithExpBackOff(ctx, backOff, retry.RetryOnNon200HTTP, func() (retry.Result, error) { resp, err := g.insightsCli.GetWithPathParam(ctx, endpoint, ocpVersion, false) if err != nil { - return false, err + return retry.Result{}, err } + defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - klog.Infof("Received HTTP status code %d, trying again in %s", resp.StatusCode, backOff.Step()) - if backOff.Steps > 1 { - return false, nil - } - return true, insightsclient.HttpError{ - Err: fmt.Errorf("received HTTP %s from %s. Using the default built-in configuration", resp.Status, endpointWithVersion), + return retry.Result{}, insightsclient.HttpError{ + Err: fmt.Errorf("received HTTP %s from %s", resp.Status, endpointWithVersion), StatusCode: resp.StatusCode, } } - remoteConfigData, err = io.ReadAll(resp.Body) - defer resp.Body.Close() + + data, err := io.ReadAll(resp.Body) if err != nil { - return false, nil + return retry.Result{}, err } - return true, nil + return retry.Result{Data: data}, nil }) + if err != nil { return nil, err } - return remoteConfigData, nil + return result.Data, nil } func (g *Gatherer) getRemoteConfigEndpoint() (string, error) { diff --git a/pkg/insights/insightsuploader/insightsuploader.go b/pkg/insights/insightsuploader/insightsuploader.go index 97f23f8609..225dda9ac2 100644 --- a/pkg/insights/insightsuploader/insightsuploader.go +++ b/pkg/insights/insightsuploader/insightsuploader.go @@ -16,6 +16,7 @@ import ( "github.com/openshift/insights-operator/pkg/config/configobserver" "github.com/openshift/insights-operator/pkg/controllerstatus" "github.com/openshift/insights-operator/pkg/insights/insightsclient" + "github.com/openshift/insights-operator/pkg/retry" ) type Authorizer interface { @@ -206,25 +207,18 @@ func (c *Controller) Upload(ctx context.Context, s *insightsclient.Source) (stri start := time.Now() s.ID = start.Format(time.RFC3339) s.Type = "application/vnd.redhat.openshift.periodic" - var requestID string - var statusCode int - err := wait.ExponentialBackoff(c.backoff, func() (done bool, err error) { - requestID, statusCode, err = c.client.SendAndGetID(ctx, c.configurator.Config().DataReporting.UploadEndpoint, *s) - if err != nil { - // do no return the error if it's not the last attempt - if c.backoff.Steps > 1 { - klog.Infof("Unable to upload report after %s: %v", time.Since(start).Truncate(time.Second/100), err) - klog.Errorf("%v. Trying again in %s", err, c.backoff.Step()) - return false, nil - } - } - return true, err + + result, err := retry.RetryWithExpBackOff(ctx, c.backoff, retry.RetryOnAll, func() (retry.Result, error) { + requestID, statusCode, err := c.client.SendAndGetID(ctx, c.configurator.Config().DataReporting.UploadEndpoint, *s) + return retry.Result{RequestID: requestID, StatusCode: statusCode}, err }) + if err != nil { - return "", statusCode, err + klog.Infof("Unable to upload report after %s: %v", time.Since(start).Truncate(time.Second/100), err) + return "", result.StatusCode, err } klog.Infof("Uploaded report successfully in %s", time.Since(start)) - return requestID, statusCode, nil + return result.RequestID, result.StatusCode, nil } func reportToLogs(source io.Reader) error { diff --git a/pkg/ocm/clustertransfer/cluster_transfer.go b/pkg/ocm/clustertransfer/cluster_transfer.go index 25cd7be41c..57c50e4310 100644 --- a/pkg/ocm/clustertransfer/cluster_transfer.go +++ b/pkg/ocm/clustertransfer/cluster_transfer.go @@ -13,6 +13,7 @@ import ( "github.com/openshift/insights-operator/pkg/controllerstatus" "github.com/openshift/insights-operator/pkg/insights/insightsclient" "github.com/openshift/insights-operator/pkg/ocm" + "github.com/openshift/insights-operator/pkg/retry" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" @@ -81,7 +82,7 @@ func (c *Controller) Run(ctx context.Context) { // in the response then check if a secret update is required, and if so, perform the update. func (c *Controller) requestDataAndUpdateSecret(ctx context.Context, endpoint string) { klog.Infof("checking the availability of cluster transfer. Next check is in %s", c.configurator.Config().ClusterTransfer.Interval) - data, err := c.requestClusterTransferWithExponentialBackoff(endpoint) + data, err := c.requestClusterTransferWithExponentialBackoff(ctx, endpoint) if err != nil { msg := fmt.Sprintf("failed to pull cluster transfer: %v", err) httpErr, ok := err.(insightsclient.HttpError) @@ -212,7 +213,7 @@ func (c *Controller) updatePullSecret(ctx context.Context, newData []byte) error // from the OCM API with exponential backoff. // It returns HttpError (see insightsclient.go) in case of any HTTP error response from the OCM API. // The exponential backoff is applied only for HTTP errors >= 500. -func (c *Controller) requestClusterTransferWithExponentialBackoff(endpoint string) ([]byte, error) { +func (c *Controller) requestClusterTransferWithExponentialBackoff(ctx context.Context, endpoint string) ([]byte, error) { bo := wait.Backoff{ Duration: c.configurator.Config().ClusterTransfer.Interval / 24, // 30 min as the first waiting Factor: 2, @@ -221,31 +222,15 @@ func (c *Controller) requestClusterTransferWithExponentialBackoff(endpoint strin Cap: c.configurator.Config().ClusterTransfer.Interval, } - var data []byte - err := wait.ExponentialBackoff(bo, func() (bool, error) { - var err error - data, err = c.client.RecvClusterTransfer(endpoint) - if err == nil { - return true, nil - } - // don't try again in case it's not an HTTP error - it could mean we're in disconnected env - if !insightsclient.IsHttpError(err) { - return true, err - } - httpErr := err.(insightsclient.HttpError) - if httpErr.StatusCode >= http.StatusInternalServerError { - // check the number of steps to prevent "timeout waiting for condition" error - we want to propagate the HTTP error below - if bo.Steps > 1 { - klog.Errorf("Got HTTP %v. Trying again in %s", httpErr.StatusCode, bo.Step()) - return false, nil - } - } - return true, httpErr + result, err := retry.RetryWithExpBackOff(ctx, bo, retry.RetryOn50xHTTP, func() (retry.Result, error) { + data, err := c.client.RecvClusterTransfer(endpoint) + return retry.Result{Data: data}, err }) + if err != nil { return nil, err } - return data, nil + return result.Data, nil } func (c *Controller) updateStatus(healthy bool, msg, reason string, httpErr *insightsclient.HttpError) { diff --git a/pkg/ocm/sca/sca.go b/pkg/ocm/sca/sca.go index d0a7a57655..cbc2da4ea7 100644 --- a/pkg/ocm/sca/sca.go +++ b/pkg/ocm/sca/sca.go @@ -19,6 +19,7 @@ import ( "github.com/openshift/insights-operator/pkg/controllerstatus" "github.com/openshift/insights-operator/pkg/insights/insightsclient" "github.com/openshift/insights-operator/pkg/ocm" + "github.com/openshift/insights-operator/pkg/retry" ) const ( @@ -299,35 +300,17 @@ func (c *Controller) requestSCAWithExpBackoff( Cap: c.configurator.Config().SCA.Interval, } - var err error - var data []byte - err = wait.ExponentialBackoff(bo, func() (bool, error) { - data, err = c.client.RecvSCACerts(ctx, endpoint, nodeArchitectures) - if err != nil { - // don't try again in case it's not an HTTP error - it could mean we're in disconnected env - if !insightsclient.IsHttpError(err) { - return true, err - } - httpErr := err.(insightsclient.HttpError) - // try again only in case of 500 or higher - if httpErr.StatusCode >= http.StatusInternalServerError { - // check the number of steps to prevent "timeout waiting for condition" error - we want to propagate the HTTP error - if bo.Steps > 1 { - klog.Errorf("%v. Trying again in %s", httpErr, bo.Step()) - return false, nil - } - } - return true, httpErr - } - - return true, nil + result, err := retry.RetryWithExpBackOff(ctx, bo, retry.RetryOn50xHTTP, func() (retry.Result, error) { + data, err := c.client.RecvSCACerts(ctx, endpoint, nodeArchitectures) + return retry.Result{Data: data}, err }) + if err != nil { return nil, err } var response Response - err = json.Unmarshal(data, &response) + err = json.Unmarshal(result.Data, &response) if err != nil { klog.Errorf("Unable to decode response: %v", err) return nil, err diff --git a/pkg/retry/retry.go b/pkg/retry/retry.go new file mode 100644 index 0000000000..4548179d09 --- /dev/null +++ b/pkg/retry/retry.go @@ -0,0 +1,138 @@ +// Package retry provides shared retry logic with exponential backoff for HTTP operations. +// +// Usage example: +// +// result, err := retry.RetryWithExpBackOff( +// ctx, +// wait.Backoff{ +// Duration: interval/32, +// Factor: 2, +// Steps: ocm.FailureCountThreshold, +// Cap: interval, +// }, +// retry.RetryOn50xHTTP, +// func() (retry.Result, error) { +// data, err := client.RecvSCACerts(ctx, endpoint, nodeArchs) +// return retry.Result{Data: data}, err +// }, +// ) +package retry + +import ( + "context" + "errors" + "net/http" + + "github.com/openshift/insights-operator/pkg/insights/insightsclient" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/klog/v2" +) + +type RetryStrategy int64 + +const ( + // RetryOn50xHTTP retries only on HTTP 500+ errors, skips retry for non-HTTP errors (disconnected env) + // Used by: sca.go, clustertransfer.go + RetryOn50xHTTP RetryStrategy = iota + + // RetryOnNon200HTTP retries on any non-200 HTTP status code + // Used by: conditional_gatherer.go + RetryOnNon200HTTP + + // RetryOnAll retries on all errors + // Used by: insightsuploader.go + RetryOnAll +) + +// Result holds the response data from retry operations +type Result struct { + Data []byte + StatusCode int + RequestID string +} + +// shouldRetry determines if an error should be retried based on the strategy. +// Returns true if retry should be attempted (when steps remain). +// Returns false immediately if the context is canceled or deadline exceeded. +func shouldRetry(ctx context.Context, err error, strategy RetryStrategy) bool { + // Don't retry if context is canceled or deadline exceeded + if ctx.Err() != nil { + return false + } + + // Don't retry context cancellation or deadline errors + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return false + } + + // Extract status code from HttpError (handles both pointer and non-pointer) + var statusCode int + var isHTTPError bool + + switch e := err.(type) { + case *insightsclient.HttpError: + // Pointer - real-world case from newHTTPErrorFromResponse + statusCode = e.StatusCode + isHTTPError = true + case insightsclient.HttpError: + // Non-pointer - test case + statusCode = e.StatusCode + isHTTPError = true + } + + switch strategy { + case RetryOn50xHTTP: + // Only retry HTTP 500+ errors, skip non-HTTP errors (disconnected env) + if !isHTTPError { + return false + } + return statusCode >= http.StatusInternalServerError + + case RetryOnNon200HTTP: + // Retry on any non-200 HTTP status, or non-HTTP errors + if !isHTTPError { + return true // retry non-HTTP errors + } + return statusCode != http.StatusOK + + case RetryOnAll: + // Retry on all errors + return true + + default: + // Unknown strategy, don't retry + klog.Infof("Unknown strategy %d for retry mechanism", strategy) + return false + } +} + +func RetryWithExpBackOff(ctx context.Context, bo wait.Backoff, strategy RetryStrategy, operation func() (Result, error)) (Result, error) { + var lastErr error + var result Result + + attempt := 0 + maxAttempts := bo.Steps + + err := wait.ExponentialBackoffWithContext(ctx, bo, func(context.Context) (bool, error) { + attempt++ + result, lastErr = operation() + if lastErr != nil { + // Use strategy to determine if we should retry + if shouldRetry(ctx, lastErr, strategy) { + klog.Errorf("%v. Retrying (attempt %d/%d)", lastErr, attempt, maxAttempts) + return false, nil + } + // Don't retry based on strategy + return true, lastErr + } + + return true, nil + }) + + // If we exhausted retries, return the last operation error instead of the timeout error + if wait.Interrupted(err) && lastErr != nil { + return result, lastErr + } + + return result, err +} diff --git a/pkg/retry/retry_test.go b/pkg/retry/retry_test.go new file mode 100644 index 0000000000..af11dc4085 --- /dev/null +++ b/pkg/retry/retry_test.go @@ -0,0 +1,389 @@ +package retry + +import ( + "bytes" + "context" + "fmt" + "net/http" + "strings" + "testing" + + "github.com/openshift/insights-operator/pkg/insights/insightsclient" + "k8s.io/apimachinery/pkg/util/wait" +) + +func Test_ShouldRetry(t *testing.T) { + type testCase struct { + name string + err error + strategy RetryStrategy + shouldRetry bool + } + + testCases := []testCase{ + // RetryOn500HTTP strategy tests + { + name: "RetryOn500HTTP should retry on HTTP 500", + err: insightsclient.HttpError{ + StatusCode: http.StatusInternalServerError, + Err: fmt.Errorf("internal server error"), + }, + strategy: RetryOn50xHTTP, + shouldRetry: true, + }, + { + name: "RetryOn500HTTP should NOT retry on non-HTTP error", + err: fmt.Errorf("network connection error"), + strategy: RetryOn50xHTTP, + shouldRetry: false, + }, + { + name: "RetryOn500HTTP should NOT retry on HTTP 404", + err: insightsclient.HttpError{ + StatusCode: http.StatusNotFound, + Err: fmt.Errorf("not found"), + }, + strategy: RetryOn50xHTTP, + shouldRetry: false, + }, + + // RetryOnNon200HTTP strategy tests + { + name: "RetryOnNon200HTTP should retry on HTTP 404", + err: insightsclient.HttpError{ + StatusCode: http.StatusNotFound, + Err: fmt.Errorf("not found"), + }, + strategy: RetryOnNon200HTTP, + shouldRetry: true, + }, + { + name: "RetryOnNon200HTTP should retry on HTTP 500", + err: insightsclient.HttpError{ + StatusCode: http.StatusInternalServerError, + Err: fmt.Errorf("internal server error"), + }, + strategy: RetryOnNon200HTTP, + shouldRetry: true, + }, + { + name: "RetryOnNon200HTTP should retry on non-HTTP error", + err: fmt.Errorf("network error"), + strategy: RetryOnNon200HTTP, + shouldRetry: true, + }, + { + name: "RetryOnNon200HTTP should NOT retry on HTTP 200", + err: insightsclient.HttpError{ + StatusCode: http.StatusOK, + Err: nil, + }, + strategy: RetryOnNon200HTTP, + shouldRetry: false, + }, + + // RetryOnAll strategy tests + { + name: "RetryOnAll should retry on HTTP 400", + err: insightsclient.HttpError{ + StatusCode: http.StatusBadRequest, + Err: fmt.Errorf("bad request"), + }, + strategy: RetryOnAll, + shouldRetry: true, + }, + { + name: "RetryOnAll should retry on HTTP 500", + err: insightsclient.HttpError{ + StatusCode: http.StatusInternalServerError, + Err: fmt.Errorf("internal server error"), + }, + strategy: RetryOnAll, + shouldRetry: true, + }, + { + name: "RetryOnAll should retry on non-HTTP error", + err: fmt.Errorf("some random error"), + strategy: RetryOnAll, + shouldRetry: true, + }, + { + name: "RetryOnAll should retry even on HTTP 200 with error", + err: insightsclient.HttpError{ + StatusCode: http.StatusOK, + Err: fmt.Errorf("unexpected error despite 200"), + }, + strategy: RetryOnAll, + shouldRetry: true, + }, + + // Unknown strategy tests + { + name: "Unknown strategy should NOT retry", + err: fmt.Errorf("some error"), + strategy: RetryStrategy(999), + shouldRetry: false, + }, + + // Context cancellation tests + { + name: "should NOT retry on context.Canceled error", + err: context.Canceled, + strategy: RetryOnAll, + shouldRetry: false, + }, + { + name: "should NOT retry on context.DeadlineExceeded error", + err: context.DeadlineExceeded, + strategy: RetryOnAll, + shouldRetry: false, + }, + + // Pointer HttpError tests (real-world scenario) + { + name: "RetryOn50xHTTP should retry on *HttpError (pointer) with HTTP 500", + err: &insightsclient.HttpError{ + StatusCode: http.StatusInternalServerError, + Err: fmt.Errorf("internal server error"), + }, + strategy: RetryOn50xHTTP, + shouldRetry: true, + }, + { + name: "RetryOn50xHTTP should NOT retry on *HttpError (pointer) with HTTP 404", + err: &insightsclient.HttpError{ + StatusCode: http.StatusNotFound, + Err: fmt.Errorf("not found"), + }, + strategy: RetryOn50xHTTP, + shouldRetry: false, + }, + { + name: "RetryOnNon200HTTP should retry on *HttpError (pointer) with HTTP 500", + err: &insightsclient.HttpError{ + StatusCode: http.StatusInternalServerError, + Err: fmt.Errorf("internal server error"), + }, + strategy: RetryOnNon200HTTP, + shouldRetry: true, + }, + { + name: "RetryOnNon200HTTP should NOT retry on *HttpError (pointer) with HTTP 200", + err: &insightsclient.HttpError{ + StatusCode: http.StatusOK, + Err: nil, + }, + strategy: RetryOnNon200HTTP, + shouldRetry: false, + }, + } + + for _, tt := range testCases { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + result := shouldRetry(ctx, tt.err, tt.strategy) + if result != tt.shouldRetry { + t.Errorf("shouldRetry() = %v, want %v", result, tt.shouldRetry) + } + }) + } +} + +func Test_RetryWithExpBackOff(t *testing.T) { + type testCase struct { + name string + backoff wait.Backoff + strategy RetryStrategy + operation func() (Result, error) + expectedResult Result + expectedError string + } + + testCases := []testCase{ + { + name: "successful operation on first try", + backoff: wait.Backoff{ + Duration: 1, + Steps: 3, + Factor: 2, + }, + strategy: RetryOnAll, + operation: func() (Result, error) { + return Result{Data: []byte("success")}, nil + }, + expectedResult: Result{Data: []byte("success")}, + expectedError: "", + }, + { + name: "successful after 2 retries", + backoff: wait.Backoff{ + Duration: 1, + Steps: 3, + Factor: 2, + }, + strategy: RetryOnAll, + operation: func() func() (Result, error) { + attempts := 0 + return func() (Result, error) { + attempts++ + if attempts < 3 { + return Result{}, fmt.Errorf("attempt %d failed", attempts) + } + return Result{Data: []byte("success after retries")}, nil + } + }(), + expectedResult: Result{Data: []byte("success after retries")}, + expectedError: "", + }, + { + name: "exhausted retries returns last error", + backoff: wait.Backoff{ + Duration: 1, + Steps: 3, + Factor: 2, + }, + strategy: RetryOnAll, + operation: func() (Result, error) { + return Result{}, fmt.Errorf("persistent failure") + }, + expectedResult: Result{}, + expectedError: "persistent failure", + }, + { + name: "single-step backoff returns original error", + backoff: wait.Backoff{ + Duration: 1, + Steps: 1, + Factor: 2, + }, + strategy: RetryOnAll, + operation: func() (Result, error) { + return Result{}, fmt.Errorf("immediate failure") + }, + expectedResult: Result{}, + expectedError: "immediate failure", + }, + { + name: "RetryOn50xHTTP does not retry on HTTP 404", + backoff: wait.Backoff{ + Duration: 1, + Steps: 3, + Factor: 2, + }, + strategy: RetryOn50xHTTP, + operation: func() (Result, error) { + return Result{}, insightsclient.HttpError{ + StatusCode: http.StatusNotFound, + Err: fmt.Errorf("not found"), + } + }, + expectedResult: Result{}, + expectedError: "not found", + }, + { + name: "RetryOn50xHTTP retries until exhausted on HTTP 500", + backoff: wait.Backoff{ + Duration: 1, + Steps: 3, + Factor: 2, + }, + strategy: RetryOn50xHTTP, + operation: func() (Result, error) { + return Result{}, insightsclient.HttpError{ + StatusCode: http.StatusInternalServerError, + Err: fmt.Errorf("server error"), + } + }, + expectedResult: Result{}, + expectedError: "server error", + }, + { + name: "RetryOnNon200HTTP succeeds after HTTP 500 then 200", + backoff: wait.Backoff{ + Duration: 1, + Steps: 3, + Factor: 2, + }, + strategy: RetryOnNon200HTTP, + operation: func() func() (Result, error) { + attempts := 0 + return func() (Result, error) { + attempts++ + if attempts < 2 { + return Result{}, insightsclient.HttpError{ + StatusCode: http.StatusInternalServerError, + Err: fmt.Errorf("temporary server error"), + } + } + return Result{Data: []byte("recovered")}, nil + } + }(), + expectedResult: Result{Data: []byte("recovered")}, + expectedError: "", + }, + { + name: "RetryOnNon200HTTP does not retry on HTTP 200", + backoff: wait.Backoff{ + Duration: 1, + Steps: 3, + Factor: 2, + }, + strategy: RetryOnNon200HTTP, + operation: func() (Result, error) { + return Result{}, insightsclient.HttpError{ + StatusCode: http.StatusOK, + Err: fmt.Errorf("error despite 200"), + } + }, + expectedResult: Result{}, + expectedError: "error despite 200", + }, + { + name: "counts steps correctly - fails after exact retry count", + backoff: wait.Backoff{ + Duration: 1, + Steps: 2, + Factor: 2, + }, + strategy: RetryOnAll, + operation: func() func() (Result, error) { + attempts := 0 + return func() (Result, error) { + attempts++ + return Result{}, fmt.Errorf("fail attempt %d", attempts) + } + }(), + expectedResult: Result{}, + expectedError: "fail attempt 2", + }, + } + + for _, tt := range testCases { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + result, err := RetryWithExpBackOff(ctx, tt.backoff, tt.strategy, tt.operation) + + // Check error + if tt.expectedError != "" { + if err == nil { + t.Errorf("expected error %q, got nil", tt.expectedError) + } else if !strings.Contains(err.Error(), tt.expectedError) { + t.Errorf("expected error containing %q, got %q", tt.expectedError, err.Error()) + } + } else { + if err != nil { + t.Errorf("expected no error, got %v", err) + } + } + + // Check data + if tt.expectedResult.Data != nil { + if result.Data == nil { + t.Errorf("expected data %q, got nil", string(tt.expectedResult.Data)) + } else if !bytes.Equal(result.Data, tt.expectedResult.Data) { + t.Errorf("expected data %q, got %q", string(tt.expectedResult.Data), string(result.Data)) + } + } + }) + } +}