From 2e4f37da0d6be8dbab8e074055680d54fdde63f1 Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Tue, 28 Apr 2026 16:32:49 -0500 Subject: [PATCH 01/15] Improving integration mode to become more resilient to upstream outages --- chart/templates/replicated-deployment.yaml | 4 + chart/templates/replicated-role.yaml | 6 + chart/values.yaml | 12 + cmd/replicated/api.go | 48 ++-- pkg/apiserver/bootstrap.go | 188 ++++++++++------ pkg/apiserver/bootstrap_test.go | 67 ++++++ pkg/apiserver/server.go | 192 ++++++++++++++-- pkg/apiserver/server_test.go | 127 +++++++++++ pkg/apiserver/testdata_test.go | 57 +++++ pkg/handlers/app.go | 2 +- pkg/handlers/healthz.go | 36 ++- pkg/handlers/healthz_test.go | 44 ++++ pkg/handlers/license.go | 43 +++- pkg/heartbeat/heartbeat.go | 14 +- pkg/license/cache/cache.go | 201 +++++++++++++++++ pkg/license/cache/cache_test.go | 187 ++++++++++++++++ pkg/license/license.go | 9 +- pkg/license/sync.go | 157 +++++++++++++ pkg/license/sync_test.go | 243 +++++++++++++++++++++ pkg/license/testdata_test.go | 54 +++++ pkg/startupstate/startupstate.go | 94 ++++++++ pkg/startupstate/startupstate_test.go | 43 ++++ pkg/store/memory_store.go | 69 +++++- pkg/store/memory_store_test.go | 67 ++++++ 24 files changed, 1840 insertions(+), 124 deletions(-) create mode 100644 pkg/apiserver/bootstrap_test.go create mode 100644 pkg/apiserver/server_test.go create mode 100644 pkg/apiserver/testdata_test.go create mode 100644 pkg/handlers/healthz_test.go create mode 100644 pkg/license/cache/cache.go create mode 100644 pkg/license/cache/cache_test.go create mode 100644 pkg/license/sync.go create mode 100644 pkg/license/sync_test.go create mode 100644 pkg/license/testdata_test.go create mode 100644 pkg/startupstate/startupstate.go create mode 100644 pkg/startupstate/startupstate_test.go diff --git a/chart/templates/replicated-deployment.yaml b/chart/templates/replicated-deployment.yaml index 4cddd9f4..0c74ab5c 100644 --- a/chart/templates/replicated-deployment.yaml +++ b/chart/templates/replicated-deployment.yaml @@ -193,6 +193,10 @@ spec: - name: REPLICATED_HA_ENABLED value: "true" {{- end }} + {{- if .Values.requireUpstreamOnStartup }} + - name: REPLICATED_REQUIRE_UPSTREAM_ON_STARTUP + value: "true" + {{- end }} {{- if (.Values.integration).licenseID }} - name: REPLICATED_INTEGRATION_LICENSE_ID valueFrom: diff --git a/chart/templates/replicated-role.yaml b/chart/templates/replicated-role.yaml index 4944c979..2180870c 100644 --- a/chart/templates/replicated-role.yaml +++ b/chart/templates/replicated-role.yaml @@ -34,12 +34,18 @@ rules: - 'secrets' verbs: - 'update' + # The names below must match the SDK's runtime expectations. If you + # rename one, update the corresponding constant on the Go side: + # replicated-license-cache → pkg/license/cache/cache.SecretName + # replicated-meta-data → pkg/meta/meta.ReplicatedMetadataSecretName + # replicated-support-metadata → pkg/supportbundle.SupportBundleMetadataSecretName resourceNames: - {{ include "replicated.secretName" . }} - replicated-instance-report - replicated-custom-app-metrics-report - replicated-meta-data - replicated-support-metadata + - replicated-license-cache {{ end }} {{ if .Values.tlsCertSecretName }} - apiGroups: diff --git a/chart/values.yaml b/chart/values.yaml index 156a4e10..10074afb 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -340,6 +340,18 @@ reportAllImages: false # See docs for feature availability in this mode. readOnlyMode: false +# When true, the pod will not be marked Ready until the SDK has successfully +# completed the full bootstrap, including a successful upstream license +# refresh and update fetch against replicated.app. This is the pre-Phase-1 +# behavior and is provided as an opt-out for operators who require it. +# +# Default (false): the pod becomes Ready as soon as the local critical +# bootstrap (license parse, signature verify, expiry check, store init) +# succeeds — or after a 30s safety timeout, whichever comes first. Upstream +# license refresh and update fetch run in the background and never block +# readiness. Most users want this. +requireUpstreamOnStartup: false + # Proxy configuration for outbound connections # Configure HTTPS proxy settings for the Replicated SDK # These values can also be set via global.replicated.httpsProxy and global.replicated.noProxy diff --git a/cmd/replicated/api.go b/cmd/replicated/api.go index 5e7ffa02..843785b5 100644 --- a/cmd/replicated/api.go +++ b/cmd/replicated/api.go @@ -31,6 +31,11 @@ func APICmd() *cobra.Command { namespace := v.GetString("namespace") configFilePath := v.GetString("config-file") integrationLicenseID := v.GetString("integration-license-id") + // require-upstream-on-startup auto-binds to + // REPLICATED_REQUIRE_UPSTREAM_ON_STARTUP via root.go's + // SetEnvPrefix("REPLICATED") + AutomaticEnv + + // SetEnvKeyReplacer("-", "_"); no explicit BindEnv needed. + requireUpstreamOnStartup := v.GetBool("require-upstream-on-startup") if configFilePath == "" && integrationLicenseID == "" { return errors.New("either config file or integration license id must be specified") @@ -57,27 +62,28 @@ func APICmd() *cobra.Command { } params := apiserver.APIServerParams{ - Context: cmd.Context(), - LicenseBytes: []byte(replicatedConfig.License), - IntegrationLicenseID: integrationLicenseID, - LicenseFields: replicatedConfig.LicenseFields, - AppName: replicatedConfig.AppName, - ChannelID: replicatedConfig.ChannelID, - ChannelName: replicatedConfig.ChannelName, - ChannelSequence: replicatedConfig.ChannelSequence, - ReleaseSequence: replicatedConfig.ReleaseSequence, - ReleaseCreatedAt: replicatedConfig.ReleaseCreatedAt, - ReleaseNotes: replicatedConfig.ReleaseNotes, - VersionLabel: replicatedConfig.VersionLabel, - ReplicatedAppEndpoint: replicatedConfig.ReplicatedAppEndpoint, - ReleaseImages: replicatedConfig.ReleaseImages, - StatusInformers: replicatedConfig.StatusInformers, - ReplicatedID: replicatedConfig.ReplicatedID, - AppID: replicatedConfig.AppID, - TlsCertSecretName: replicatedConfig.TlsCertSecretName, - ReportAllImages: replicatedConfig.ReportAllImages, - ReadOnlyMode: replicatedConfig.ReadOnlyMode, - Namespace: namespace, + Context: cmd.Context(), + LicenseBytes: []byte(replicatedConfig.License), + IntegrationLicenseID: integrationLicenseID, + LicenseFields: replicatedConfig.LicenseFields, + AppName: replicatedConfig.AppName, + ChannelID: replicatedConfig.ChannelID, + ChannelName: replicatedConfig.ChannelName, + ChannelSequence: replicatedConfig.ChannelSequence, + ReleaseSequence: replicatedConfig.ReleaseSequence, + ReleaseCreatedAt: replicatedConfig.ReleaseCreatedAt, + ReleaseNotes: replicatedConfig.ReleaseNotes, + VersionLabel: replicatedConfig.VersionLabel, + ReplicatedAppEndpoint: replicatedConfig.ReplicatedAppEndpoint, + ReleaseImages: replicatedConfig.ReleaseImages, + StatusInformers: replicatedConfig.StatusInformers, + ReplicatedID: replicatedConfig.ReplicatedID, + AppID: replicatedConfig.AppID, + TlsCertSecretName: replicatedConfig.TlsCertSecretName, + ReportAllImages: replicatedConfig.ReportAllImages, + ReadOnlyMode: replicatedConfig.ReadOnlyMode, + Namespace: namespace, + RequireUpstreamOnStartup: requireUpstreamOnStartup, } apiserver.Start(params) diff --git a/pkg/apiserver/bootstrap.go b/pkg/apiserver/bootstrap.go index 2447aecf..5a7cd946 100644 --- a/pkg/apiserver/bootstrap.go +++ b/pkg/apiserver/bootstrap.go @@ -1,6 +1,7 @@ package apiserver import ( + "context" "log" "github.com/cenkalti/backoff/v4" @@ -21,9 +22,28 @@ import ( "github.com/replicatedhq/replicated-sdk/pkg/upstream" upstreamtypes "github.com/replicatedhq/replicated-sdk/pkg/upstream/types" "github.com/replicatedhq/replicated-sdk/pkg/util" + "k8s.io/client-go/kubernetes" ) -func bootstrap(params APIServerParams) error { +// bootstrapCritical performs the bootstrap work that must succeed before the +// SDK can serve meaningful responses. It loads, signature-verifies, and +// expiry-checks the license, then populates the in-memory store. It also +// initializes the local appstate operator so app/* endpoints can begin +// reporting status as soon as the pod is Ready. +// +// In production mode (LicenseBytes provided by the chart), this path is +// fully local. In integration mode it consults the upstream Vendor Portal +// with cache fallback: a previously-cached license satisfies critical even +// when replicated.app is unreachable, so subsequent boots after one +// successful sync survive offline conditions. First-boot offline (no +// cache, unreachable upstream) returns backoff.Permanent — the SDK refuses +// to start rather than silently run with empty data. +// +// Permanent failures (license parse error, signature invalid, expired, +// first-boot-offline-with-no-cache) are returned as backoff.Permanent so +// the retry loop above gives up immediately. Transient failures bubble up +// unwrapped and the caller will retry. +func bootstrapCritical(params APIServerParams) error { clientset, err := k8sutil.GetClientset() if err != nil { return errors.Wrap(err, "failed to get clientset") @@ -31,7 +51,6 @@ func bootstrap(params APIServerParams) error { replicatedID, appID := params.ReplicatedID, params.AppID if replicatedID == "" || appID == "" { - // retrieve replicated and app ids replicatedID, appID, err = util.GetReplicatedAndAppIDs(clientset, params.Namespace) if err != nil { return errors.Wrap(err, "failed to get replicated and app ids") @@ -47,7 +66,6 @@ func bootstrap(params APIServerParams) error { log.Println("replicatedID:", replicatedID) log.Println("appID:", appID) - // In Embedded Cluster installations, automatically enable reporting all images reportAllImages := params.ReportAllImages if !reportAllImages { distribution := report.GetDistribution(clientset) @@ -57,46 +75,11 @@ func bootstrap(params APIServerParams) error { } } - var unverifiedWrapper licensewrapper.LicenseWrapper - if len(params.LicenseBytes) > 0 { - wrapper, err := sdklicense.LoadLicenseFromBytes(params.LicenseBytes) - if err != nil { - return errors.Wrap(err, "failed to parse license from base64") - } - unverifiedWrapper = wrapper - } else if params.IntegrationLicenseID != "" { - wrapper, err := sdklicense.GetLicenseByID(params.IntegrationLicenseID, params.ReplicatedAppEndpoint) - if err != nil { - return backoff.Permanent(errors.Wrap(err, "failed to get license by id for integration license id")) - } - if wrapper.GetLicenseType() != "dev" { - return errors.New("integration license must be a dev license") - } - unverifiedWrapper = wrapper - } - - err = unverifiedWrapper.VerifySignature() + verifiedWrapper, err := loadAndVerifyLicense(params, clientset) if err != nil { - if licensewrappertypes.IsLicenseDataValidationError(err) { - // this is not a fatal error, it means that the license data outside of the signature was changed - // however, the data inside the signature was still valid, and so the license has been updated to use that data instead - log.Println(err.Error()) - } else { - return backoff.Permanent(errors.Wrap(err, "failed to verify license signature")) - } + return err } - verifiedWrapper := unverifiedWrapper - if !util.IsAirgap() { - // sync license - licenseData, err := sdklicense.GetLatestLicense(verifiedWrapper, params.ReplicatedAppEndpoint) - if err != nil { - return errors.Wrap(err, "failed to get latest license") - } - verifiedWrapper = licenseData.License - } - - // check license expiration expired, err := sdklicense.LicenseIsExpired(verifiedWrapper) if err != nil { return errors.Wrap(err, "failed to check if license is expired") @@ -109,7 +92,6 @@ func bootstrap(params APIServerParams) error { if channelID == "" { channelID = verifiedWrapper.GetChannelID() } - channelName := params.ChannelName if channelName == "" { channelName = verifiedWrapper.GetChannelName() @@ -135,29 +117,9 @@ func bootstrap(params APIServerParams) error { ReadOnlyMode: params.ReadOnlyMode, }) - isIntegrationModeEnabled, err := integration.IsEnabled(params.Context, clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense()) - if err != nil { - return errors.Wrap(err, "failed to check if integration mode is enabled") - } - - if !util.IsAirgap() && !isIntegrationModeEnabled { - // retrieve and cache updates - currentCursor := upstreamtypes.ReplicatedCursor{ - ChannelID: store.GetStore().GetChannelID(), - ChannelName: store.GetStore().GetChannelName(), - ChannelSequence: store.GetStore().GetChannelSequence(), - } - updates, err := upstream.GetUpdates(store.GetStore(), store.GetStore().GetLicense(), currentCursor) - if err != nil { - return errors.Wrap(err, "failed to get updates") - } - store.GetStore().SetUpdates(updates) - } - appStateOperator := appstate.InitOperator(clientset, params.Namespace) appStateOperator.Start() - // if no status informers are provided, generate them from the helm release informers := params.StatusInformers if informers == nil && helm.IsHelmManaged() { helmRelease, err := helm.GetRelease(helm.GetReleaseName()) @@ -175,11 +137,113 @@ func bootstrap(params APIServerParams) error { Informers: informers, }) + return nil +} + +// loadAndVerifyLicense loads the license from chart-embedded bytes +// (production) or from the upstream Vendor Portal by integration-license +// ID with cache fallback, then signature-verifies it. +// +// Integration mode goes through SyncLicenseByID: a successful upstream +// call writes through to the cache; an upstream failure with a populated +// cache transparently uses the cached license; an upstream failure with +// no cache returns a permanent error so the pod refuses to start with no +// usable license. +func loadAndVerifyLicense(params APIServerParams, clientset kubernetes.Interface) (licensewrapper.LicenseWrapper, error) { + var unverifiedWrapper licensewrapper.LicenseWrapper + + switch { + case len(params.LicenseBytes) > 0: + wrapper, err := sdklicense.LoadLicenseFromBytes(params.LicenseBytes) + if err != nil { + return licensewrapper.LicenseWrapper{}, backoff.Permanent(errors.Wrap(err, "failed to parse license from base64")) + } + unverifiedWrapper = wrapper + case params.IntegrationLicenseID != "": + ctx := params.Context + if ctx == nil { + ctx = context.Background() + } + data, _, err := sdklicense.SyncLicenseByID(ctx, clientset, params.Namespace, params.IntegrationLicenseID, params.ReplicatedAppEndpoint) + if err != nil { + return licensewrapper.LicenseWrapper{}, backoff.Permanent(errors.Wrap(err, "integration mode requires either reachable upstream or a previously-cached license; neither was available")) + } + if data.License.GetLicenseType() != "dev" { + return licensewrapper.LicenseWrapper{}, backoff.Permanent(errors.New("integration license must be a dev license")) + } + unverifiedWrapper = data.License + default: + return licensewrapper.LicenseWrapper{}, backoff.Permanent(errors.New("no license source configured: either LicenseBytes or IntegrationLicenseID is required")) + } + + if err := unverifiedWrapper.VerifySignature(); err != nil { + if licensewrappertypes.IsLicenseDataValidationError(err) { + // Non-fatal: license data outside the signature was changed, + // but the data inside the signature was still valid; the + // wrapper has been updated to use that data instead. + log.Println(err.Error()) + } else { + return licensewrapper.LicenseWrapper{}, backoff.Permanent(errors.Wrap(err, "failed to verify license signature")) + } + } + + return unverifiedWrapper, nil +} + +// bootstrapBackground performs upstream-dependent bootstrap work whose +// failure must not prevent the SDK from being marked Ready in the default +// configuration. The caller decides how to interpret returned errors: +// +// - In default (resilient) mode, errors are logged and ignored; handlers +// continue serving from whatever bootstrapCritical placed in the store. +// - With requireUpstreamOnStartup=true, the caller treats any error here +// as fatal and the pod will not be marked Ready. +// +// Upstream calls go through the cache-aware Sync* wrappers so a successful +// refresh writes through to the cache and an upstream failure transparently +// falls back to cached data when available. The in-memory store ends up +// populated either way. +func bootstrapBackground(params APIServerParams) error { + clientset, err := k8sutil.GetClientset() + if err != nil { + return errors.Wrap(err, "failed to get clientset") + } + + ctx := params.Context + if ctx == nil { + ctx = context.Background() + } + + if !util.IsAirgap() { + licenseData, _, err := sdklicense.SyncLatestLicense(ctx, clientset, params.Namespace, store.GetStore().GetLicense(), params.ReplicatedAppEndpoint) + if err != nil { + return errors.Wrap(err, "failed to get latest license") + } + store.GetStore().SetLicense(licenseData.License) + } + + isIntegrationModeEnabled, err := integration.IsEnabled(ctx, clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense()) + if err != nil { + return errors.Wrap(err, "failed to check if integration mode is enabled") + } + + if !util.IsAirgap() && !isIntegrationModeEnabled { + currentCursor := upstreamtypes.ReplicatedCursor{ + ChannelID: store.GetStore().GetChannelID(), + ChannelName: store.GetStore().GetChannelName(), + ChannelSequence: store.GetStore().GetChannelSequence(), + } + updates, err := upstream.GetUpdates(store.GetStore(), store.GetStore().GetLicense(), currentCursor) + if err != nil { + return errors.Wrap(err, "failed to get updates") + } + store.GetStore().SetUpdates(updates) + } + if err := heartbeat.Start(); err != nil { return errors.Wrap(err, "failed to start heartbeat") } - // this is at the end of the bootstrap function so that it doesn't re-run on retry if !util.IsAirgap() && store.GetStore().IsDevLicense() { go func() { if err := util.WarnOnOutdatedReplicatedVersion(); err != nil { diff --git a/pkg/apiserver/bootstrap_test.go b/pkg/apiserver/bootstrap_test.go new file mode 100644 index 00000000..93151f89 --- /dev/null +++ b/pkg/apiserver/bootstrap_test.go @@ -0,0 +1,67 @@ +package apiserver + +import ( + "net/http" + "sync/atomic" + "testing" + + "github.com/cenkalti/backoff/v4" + "github.com/stretchr/testify/require" + "k8s.io/client-go/kubernetes/fake" +) + +// blockingTransport is a RoundTripper that fails any HTTP request and +// records that a dial was attempted. Tests use this in place of +// http.DefaultTransport to assert that a code path is fully local. +type blockingTransport struct { + dials atomic.Int32 +} + +func (b *blockingTransport) RoundTrip(req *http.Request) (*http.Response, error) { + b.dials.Add(1) + return nil, &neverDialError{host: req.URL.Host} +} + +type neverDialError struct{ host string } + +func (e *neverDialError) Error() string { + return "blockingTransport: outbound network access denied to " + e.host +} + +// withBlockingTransport swaps http.DefaultTransport for the duration of the +// test and returns a counter the test can inspect. +func withBlockingTransport(t *testing.T) *blockingTransport { + t.Helper() + bt := &blockingTransport{} + orig := http.DefaultTransport + http.DefaultTransport = bt + t.Cleanup(func() { http.DefaultTransport = orig }) + return bt +} + +func TestLoadAndVerifyLicense_InvalidLicenseBytes_ReturnsPermanent(t *testing.T) { + bt := withBlockingTransport(t) + clientset := fake.NewSimpleClientset() + + _, err := loadAndVerifyLicense(APIServerParams{ + LicenseBytes: []byte("not a license"), + }, clientset) + require.Error(t, err, "expected an error when LicenseBytes is malformed") + + var perm *backoff.PermanentError + require.ErrorAs(t, err, &perm, "expected backoff.Permanent") + require.Zero(t, bt.dials.Load(), "production-mode license parse must not dial upstream") +} + +func TestLoadAndVerifyLicense_ProductionMode_DoesNotDialUpstream(t *testing.T) { + bt := withBlockingTransport(t) + clientset := fake.NewSimpleClientset() + + // We don't care about the success/failure outcome here — only that + // LicenseBytes-mode doesn't reach for the network. Even with bytes + // that fail signature verification, no upstream call is permitted. + _, _ = loadAndVerifyLicense(APIServerParams{ + LicenseBytes: []byte(validLicenseYAML), + }, clientset) + require.Zero(t, bt.dials.Load(), "production-mode license load+verify must not dial upstream") +} diff --git a/pkg/apiserver/server.go b/pkg/apiserver/server.go index 02660b8d..25cfa301 100644 --- a/pkg/apiserver/server.go +++ b/pkg/apiserver/server.go @@ -16,10 +16,23 @@ import ( "github.com/replicatedhq/replicated-sdk/pkg/k8sutil" sdklicensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" "github.com/replicatedhq/replicated-sdk/pkg/logger" + "github.com/replicatedhq/replicated-sdk/pkg/startupstate" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" ) +const ( + // bootstrapRetryInterval is how long the bootstrap retry loop waits + // between attempts when bootstrapCritical returns a transient error. + bootstrapRetryInterval = 10 * time.Second + + // bootstrapCriticalDeadline bounds how long Start() will wait for + // bootstrapCritical before marking the pod Ready in the default + // resilient mode. Critical continues retrying in the background after + // this deadline; the deadline only governs when /healthz flips to 200. + bootstrapCriticalDeadline = 30 * time.Second +) + type APIServerParams struct { Context context.Context LicenseBytes []byte @@ -42,22 +55,48 @@ type APIServerParams struct { TlsCertSecretName string ReportAllImages bool ReadOnlyMode bool + + // RequireUpstreamOnStartup, when true, restores the pre-Phase-1 + // behavior in which the pod does not become Ready until the full + // bootstrap (critical + background) succeeds, including a successful + // upstream license refresh and update fetch. Default false. + RequireUpstreamOnStartup bool } func Start(params APIServerParams) { log.Println("Replicated version:", buildversion.Version()) - backoffDuration := 10 * time.Second - bootstrapFn := func() error { - return bootstrap(params) - } - err := backoff.RetryNotify(bootstrapFn, backoff.NewConstantBackOff(backoffDuration), func(err error, d time.Duration) { - log.Printf("failed to bootstrap, retrying in %s: %v", d, err) - }) + state := startupstate.New() + handlers.SetStartupState(state) + + srv, err := buildServer(params) if err != nil { - log.Fatalf("failed to bootstrap: %v", err) + logger.Error(err) + return + } + + go func() { + if err := runBootstrap(params, state); err != nil { + log.Fatalf("%v", err) + } + }() + + if params.TlsCertSecretName != "" { + log.Printf("Starting Replicated API on port %d with TLS...\n", 3000) + log.Fatal(srv.ListenAndServeTLS("", "")) + } else { + log.Printf("Starting Replicated API on port %d...\n", 3000) + log.Fatal(srv.ListenAndServe()) } +} +// buildServer constructs the HTTP server and registers all routes. It also +// loads TLS material (from a kubernetes Secret) when configured. +// +// This is split out so the listener can be started in the foreground while +// bootstrap runs concurrently, instead of bootstrap blocking listener +// startup as it did pre-Phase-1. +func buildServer(params APIServerParams) (*http.Server, error) { r := mux.NewRouter() r.Use(handlers.CorsMiddleware) @@ -100,26 +139,143 @@ func Start(params APIServerParams) { Addr: ":3000", } - // Configure TLS if certificate name is provided if params.TlsCertSecretName != "" { clientset, err := k8sutil.GetClientset() if err != nil { - logger.Error(errors.Wrap(err, "failed to get clientset")) - return + return nil, errors.Wrap(err, "failed to get clientset") } - tlsConfig, err := loadTLSConfig(clientset, params.Namespace, params.TlsCertSecretName) if err != nil { - log.Fatalf("failed to load TLS config: %v", err) + return nil, errors.Wrap(err, "failed to load TLS config") } srv.TLSConfig = tlsConfig + } - log.Printf("Starting Replicated API on port %d with TLS...\n", 3000) - log.Fatal(srv.ListenAndServeTLS("", "")) - } else { - log.Printf("Starting Replicated API on port %d...\n", 3000) - log.Fatal(srv.ListenAndServe()) + return srv, nil +} + +// bootstrapDeps wires the orchestrator to its collaborators. The default +// production deps are constructed by defaultBootstrapDeps; tests substitute +// fakes by calling runBootstrapWithDeps directly. +type bootstrapDeps struct { + critical func(APIServerParams) error + background func(APIServerParams) error + deadline time.Duration + retryInterval time.Duration +} + +func defaultBootstrapDeps() bootstrapDeps { + return bootstrapDeps{ + critical: bootstrapCritical, + background: bootstrapBackground, + deadline: bootstrapCriticalDeadline, + retryInterval: bootstrapRetryInterval, + } +} + +// runBootstrap drives the bootstrap state machine. The behavior depends on +// params.RequireUpstreamOnStartup: +// +// - false (default): bootstrapCritical is retried with backoff; the pod +// is marked Ready as soon as critical succeeds OR the +// bootstrapCriticalDeadline elapses, whichever comes first. After the +// deadline, critical continues to retry; on eventual success the +// bootstrapBackground phase runs. bootstrapBackground errors are logged +// but never block readiness. +// +// - true: the pre-Phase-1 contract is preserved. bootstrapCritical and +// bootstrapBackground are run in sequence with retry-with-backoff and +// the pod is not marked Ready until both succeed. +// +// A non-nil return value indicates the process should exit fatally; the +// readiness state has already been transitioned to Failed when this happens +// so a final scrape of /healthz reflects what occurred. Callers (typically +// Start) map a non-nil error to log.Fatalf. +// +// runBootstrap blocks until the bootstrap pipeline has fully resolved (or +// has resolved enough to know it must exit). It is intended to be invoked +// from a goroutine while the HTTP listener runs in the foreground. +func runBootstrap(params APIServerParams, state *startupstate.Tracker) error { + return runBootstrapWithDeps(params, state, defaultBootstrapDeps()) +} + +func runBootstrapWithDeps(params APIServerParams, state *startupstate.Tracker, deps bootstrapDeps) error { + if params.RequireUpstreamOnStartup { + return runBootstrapStrict(params, state, deps) + } + return runBootstrapResilient(params, state, deps) +} + +func runBootstrapStrict(params APIServerParams, state *startupstate.Tracker, deps bootstrapDeps) error { + err := backoff.RetryNotify( + func() error { + if err := deps.critical(params); err != nil { + return err + } + return deps.background(params) + }, + backoff.NewConstantBackOff(deps.retryInterval), + func(err error, d time.Duration) { + log.Printf("failed to bootstrap (requireUpstreamOnStartup=true), retrying in %s: %v", d, err) + }, + ) + if err != nil { + state.MarkFailed() + return errors.Wrap(err, "failed to bootstrap") + } + state.MarkReady() + return nil +} + +func runBootstrapResilient(params APIServerParams, state *startupstate.Tracker, deps bootstrapDeps) error { + criticalDone := make(chan error, 1) + go func() { + criticalDone <- backoff.RetryNotify( + func() error { return deps.critical(params) }, + backoff.NewConstantBackOff(deps.retryInterval), + func(err error, d time.Duration) { + log.Printf("failed to bootstrap critical, retrying in %s: %v", d, err) + }, + ) + }() + + timer := time.NewTimer(deps.deadline) + defer timer.Stop() + + select { + case criticalErr := <-criticalDone: + if criticalErr != nil { + state.MarkFailed() + return errors.Wrap(criticalErr, "failed to bootstrap critical") + } + state.MarkReady() + + case <-timer.C: + // Critical hasn't completed in the readiness window. Mark Ready + // anyway — handlers will serve whatever the in-memory store has + // (likely empty until critical completes) but the pod won't be + // stuck in CrashLoopBackOff. Block here until critical does + // resolve so we can decide whether to advance to background. + logger.Warnf( + "sdk_ready_after_critical_timeout: bootstrapCritical did not complete within %s; marking pod Ready and continuing critical bootstrap in background", + deps.deadline, + ) + state.MarkReady() + if criticalErr := <-criticalDone; criticalErr != nil { + state.MarkFailed() + return errors.Wrap(criticalErr, "failed to bootstrap critical (after readiness)") + } + // Symmetric counterpart to sdk_ready_after_critical_timeout — + // gives operators a closing log line when the deferred critical + // path eventually succeeds, instead of leaving them to infer it + // from the absence of further retry warnings. + logger.Infof("sdk_critical_succeeded_after_readiness: bootstrapCritical completed after the readiness deadline; advancing to background phase") + } + + if err := deps.background(params); err != nil { + logger.Warnf("bootstrap background phase completed with errors (handlers continue serving from in-memory store): %v", err) } + return nil } // loadTLSConfig loads TLS certificate and key from a Kubernetes secret diff --git a/pkg/apiserver/server_test.go b/pkg/apiserver/server_test.go new file mode 100644 index 00000000..d116669a --- /dev/null +++ b/pkg/apiserver/server_test.go @@ -0,0 +1,127 @@ +package apiserver + +import ( + "errors" + "sync/atomic" + "testing" + "time" + + "github.com/cenkalti/backoff/v4" + "github.com/replicatedhq/replicated-sdk/pkg/startupstate" + "github.com/stretchr/testify/require" +) + +// fastDeps returns bootstrapDeps with short timing so tests run quickly. +// Tests pass critical and background as parameters because each scenario +// needs its own behavior. +func fastDeps(critical, background func(APIServerParams) error) bootstrapDeps { + return bootstrapDeps{ + critical: critical, + background: background, + deadline: 50 * time.Millisecond, + retryInterval: 5 * time.Millisecond, + } +} + +func TestRunBootstrapResilient_CriticalTimesOutThenSucceeds(t *testing.T) { + state := startupstate.New() + var bgRan atomic.Bool + criticalCalls := 0 + criticalSlow := func(APIServerParams) error { + criticalCalls++ + // First two attempts: pretend the upstream is slow / failing. + // Third attempt: succeed. The first two attempts together must + // outlast the deadline so we exercise the timeout-then-success + // path. + if criticalCalls < 3 { + time.Sleep(40 * time.Millisecond) + return errors.New("transient") + } + return nil + } + + deps := fastDeps( + criticalSlow, + func(APIServerParams) error { bgRan.Store(true); return nil }, + ) + deps.deadline = 30 * time.Millisecond + + start := time.Now() + require.NoError(t, runBootstrapResilient(APIServerParams{}, state, deps)) + elapsed := time.Since(start) + + require.True(t, state.IsReady(), "expected state Ready, got %s", state.Get()) + require.True(t, bgRan.Load(), "expected background to run once critical eventually succeeded") + require.GreaterOrEqual(t, elapsed, deps.deadline, "expected to wait at least the deadline") +} + +func TestRunBootstrapResilient_CriticalPermanentError_FailsFast(t *testing.T) { + state := startupstate.New() + var bgRan atomic.Bool + + deps := fastDeps( + func(APIServerParams) error { + return backoff.Permanent(errors.New("license is expired")) + }, + func(APIServerParams) error { bgRan.Store(true); return nil }, + ) + + err := runBootstrapResilient(APIServerParams{}, state, deps) + require.Error(t, err) + require.Equal(t, startupstate.Failed, state.Get()) + require.False(t, bgRan.Load(), "background must not run after a permanent critical failure") +} + +func TestRunBootstrapResilient_BackgroundFailureDoesNotAffectReady(t *testing.T) { + state := startupstate.New() + + deps := fastDeps( + func(APIServerParams) error { return nil }, + func(APIServerParams) error { return errors.New("upstream sync failed") }, + ) + + require.NoError(t, runBootstrapResilient(APIServerParams{}, state, deps), "background failures should not bubble up") + require.True(t, state.IsReady(), "expected state Ready despite background failure") +} + +func TestRunBootstrapStrict_BlocksReadyUntilFullBootstrapSucceeds(t *testing.T) { + state := startupstate.New() + var phase atomic.Int32 + + deps := fastDeps( + func(APIServerParams) error { + phase.Store(1) + return nil + }, + func(APIServerParams) error { + if !state.IsReady() && phase.Load() == 1 { + // We deliberately observe state HERE — strict mode must + // not have flipped Ready before background returns. + phase.Store(2) + } + return nil + }, + ) + params := APIServerParams{RequireUpstreamOnStartup: true} + + require.NoError(t, runBootstrapWithDeps(params, state, deps)) + require.Equal(t, int32(2), phase.Load(), "background did not observe pre-Ready state") + require.True(t, state.IsReady(), "expected state Ready after strict bootstrap") +} + +func TestRunBootstrapStrict_BackgroundFailure_BlocksReady(t *testing.T) { + state := startupstate.New() + + deps := fastDeps( + func(APIServerParams) error { return nil }, + func(APIServerParams) error { + // Permanent so the retry loop gives up quickly. + return backoff.Permanent(errors.New("upstream sync permanently failed")) + }, + ) + params := APIServerParams{RequireUpstreamOnStartup: true} + + err := runBootstrapWithDeps(params, state, deps) + require.Error(t, err, "expected an error when strict-mode background fails") + require.Equal(t, startupstate.Failed, state.Get()) +} diff --git a/pkg/apiserver/testdata_test.go b/pkg/apiserver/testdata_test.go new file mode 100644 index 00000000..9159c89e --- /dev/null +++ b/pkg/apiserver/testdata_test.go @@ -0,0 +1,57 @@ +package apiserver + +// validLicenseYAML is a signed test license that round-trips through the +// embedded licensewrapper public key. Sourced from +// pkg/license/signature_test.go's "basic valid signature" fixture. +// +// We carry our own copy so the apiserver test suite can verify +// loadAndVerifyLicense end-to-end without importing test code from another +// package. +const validLicenseYAML = `apiVersion: kots.io/v1beta1 +kind: License +metadata: + name: testcustomer +spec: + appSlug: my-app + channelID: 1vusIYZLAVxMG6q760OJmRKj5i5 + channelName: My Channel + customerName: Test Customer + endpoint: https://replicated.app + entitlements: + bool_field: + title: Bool Field + value: true + valueType: Boolean + expires_at: + description: License Expiration + title: Expiration + value: "2030-07-27T00:00:00Z" + valueType: String + hidden_field: + isHidden: true + title: Hidden Field + value: this is secret + valueType: String + int_field: + title: Int Field + value: 123 + valueType: Integer + string_field: + title: StringField + value: single line text + valueType: String + text_field: + title: Text Field + value: |- + multi + line + text + valueType: Text + isAirgapSupported: true + isGitOpsSupported: true + isSnapshotSupported: true + licenseID: 1vusOokxAVp1tkRGuyxnF23PJcq + licenseSequence: 7 + licenseType: prod + signature: eyJsaWNlbnNlRGF0YSI6ImV5SmhjR2xXWlhKemFXOXVJam9pYTI5MGN5NXBieTkyTVdKbGRHRXhJaXdpYTJsdVpDSTZJa3hwWTJWdWMyVWlMQ0p0WlhSaFpHRjBZU0k2ZXlKdVlXMWxJam9pZEdWemRHTjFjM1J2YldWeUluMHNJbk53WldNaU9uc2liR2xqWlc1elpVbEVJam9pTVhaMWMwOXZhM2hCVm5BeGRHdFNSM1Y1ZUc1R01qTlFTbU54SWl3aWJHbGpaVzV6WlZSNWNHVWlPaUp3Y205a0lpd2lZM1Z6ZEc5dFpYSk9ZVzFsSWpvaVZHVnpkQ0JEZFhOMGIyMWxjaUlzSW1Gd2NGTnNkV2NpT2lKdGVTMWhjSEFpTENKamFHRnVibVZzU1VRaU9pSXhkblZ6U1ZsYVRFRldlRTFITm5FM05qQlBTbTFTUzJvMWFUVWlMQ0pqYUdGdWJtVnNUbUZ0WlNJNklrMTVJRU5vWVc1dVpXd2lMQ0pzYVdObGJuTmxVMlZ4ZFdWdVkyVWlPamNzSW1WdVpIQnZhVzUwSWpvaWFIUjBjSE02THk5eVpYQnNhV05oZEdWa0xtRndjQ0lzSW1WdWRHbDBiR1Z0Wlc1MGN5STZleUppYjI5c1gyWnBaV3hrSWpwN0luUnBkR3hsSWpvaVFtOXZiQ0JHYVdWc1pDSXNJblpoYkhWbElqcDBjblZsTENKMllXeDFaVlI1Y0dVaU9pSkNiMjlzWldGdUluMHNJbVY0Y0dseVpYTmZZWFFpT25zaWRHbDBiR1VpT2lKRmVIQnBjbUYwYVc5dUlpd2laR1Z6WTNKcGNIUnBiMjRpT2lKTWFXTmxibk5sSUVWNGNHbHlZWFJwYjI0aUxDSjJZV3gxWlNJNklqSXdNekF0TURjdE1qZFVNREE2TURBNk1EQmFJaXdpZG1Gc2RXVlVlWEJsSWpvaVUzUnlhVzVuSW4wc0ltaHBaR1JsYmw5bWFXVnNaQ0k2ZXlKMGFYUnNaU0k2SWtocFpHUmxiaUJHYVdWc1pDSXNJblpoYkhWbElqb2lkR2hwY3lCcGN5QnpaV055WlhRaUxDSjJZV3gxWlZSNWNHVWlPaUpUZEhKcGJtY2lMQ0pwYzBocFpHUmxiaUk2ZEhKMVpYMHNJbWx1ZEY5bWFXVnNaQ0k2ZXlKMGFYUnNaU0k2SWtsdWRDQkdhV1ZzWkNJc0luWmhiSFZsSWpveE1qTXNJblpoYkhWbFZIbHdaU0k2SWtsdWRHVm5aWElpZlN3aWMzUnlhVzVuWDJacFpXeGtJanA3SW5ScGRHeGxJam9pVTNSeWFXNW5SbWxsYkdRaUxDSjJZV3gxWlNJNkluTnBibWRzWlNCc2FXNWxJSFJsZUhRaUxDSjJZV3gxWlZSNWNHVWlPaUpUZEhKcGJtY2lmU3dpZEdWNGRGOW1hV1ZzWkNJNmV5SjBhWFJzWlNJNklsUmxlSFFnUm1sbGJHUWlMQ0oyWVd4MVpTSTZJbTExYkhScFhHNXNhVzVsWEc1MFpYaDBJaXdpZG1Gc2RXVlVlWEJsSWpvaVZHVjRkQ0o5ZlN3aWFYTkJhWEpuWVhCVGRYQndiM0owWldRaU9uUnlkV1VzSW1selIybDBUM0J6VTNWd2NHOXlkR1ZrSWpwMGNuVmxMQ0pwYzFOdVlYQnphRzkwVTNWd2NHOXlkR1ZrSWpwMGNuVmxmWDA9IiwiaW5uZXJTaWduYXR1cmUiOiJleUpzYVdObGJuTmxVMmxuYm1GMGRYSmxJam9pYUhneE1XTXZUR1ozUTNoVE5YRmtRWEJGU1hGdVRrMU9NMHBLYTJzNFZHZFhSVVpzVDFKVlJ6UjJjR1YzZEZoV1YzbG1lamRZY0hBd1ExazJZamRyUVRSS2N6TklhR3d3YkZJMFdUQTFMemN2UVVkQ2FEZFZNSGczUkhaTVozUXpVM00wYm5GTFZTdFhXRXBTVHpKWVFVRnZSME4xZFRWR1RGcHJRVWhYY1RSUVFtMXphSFY2Y1ZsdmNucHhlbGhGWVZWVlpFUlVkVXhDTW1nNWFIZ3dXRWhQUmxwUk16bHVkbTlPUjJaT2R5OTRTVmRaZEhSUGRYZHZhMncyTVZsb1JVeFZlRmQxU1ZSRmMwTlVhM2xtTVRNd09IazVSbFJzWlRKeVYyZEVlSEZNYTBSUFNXVXlPRWwzUzJSQkwySXdWVUl5VEZGbVRWcHdWemwyUTNCSkwybHlWek5uYmpaeU5WWjNWMjB2U1dweWJtNDNSelJrVmpadVYzcFRkMGhQUTJSdWEwMTRNRXQ1VVVOa0wxQjFaWEpUYjNSdVEwOXRTMDEzWlRSTGJqaERkMU5YVVRRNGRURkRNbTFpV1VzeGRYTlpOM1YzUFQwaUxDSndkV0pzYVdOTFpYa2lPaUl0TFMwdExVSkZSMGxPSUZCVlFreEpReUJMUlZrdExTMHRMVnh1VFVsSlFrbHFRVTVDWjJ0eGFHdHBSemwzTUVKQlVVVkdRVUZQUTBGUk9FRk5TVWxDUTJkTFEwRlJSVUZ6TkhKdlVIcDFhV1JNZVhOMmIxWTJkemxhTkZ4dVdHRmliME5tWTJNeGFHZFZhQ3N3V1VkS2NFNURSVXhyTjBaTFF5OTJhemR6ZERsR05tY3dUMjlrU0VSbGVYZFJXa2hLZFU1TVpsUnNRbEJHUTJOaU5seHVObTlzVEZOeWNGQTRjbFUzU0d4SGJsRkVSMFJNYVhkS1EyaGtSRGRVVUdSM2FXdHBkMHRGY201aldqaEdaalZsU25vd2RETmlUWFpyVDJaVVluSkJiRnh1WWtGQ1kwbzVNVmxVT1hKdVVXOXFkVWN4UldKUVRqaEZWblI2TWxZNE5IZHViR2Q0TUhCd2JEVjRPSFpOYlhwcE1ISnVibEZVV1VGamJ6WnFhMnBJTTF4dVRuTlVkWE4xUzFkdlJGUjVNWE5yZGtSUk9IbEJZV0ptWTNNME4zWnNRazAwU0RGT1JFNHZSSFJhWWxZdllubDJia0o2YkM4eFZrVnpURmRqWlZWcFRGeHVSWEYxT0VkeWF5dFFVRGQyUkdSd2JFUjNjWFpQV2t4RmRYazNkamhuUm01U09WUlVSV3ByTlVvNWRuWlVTR2RtU25VemVubEVPR2xLWTBSRE5YcHFPVnh1YjFGSlJFRlJRVUpjYmkwdExTMHRSVTVFSUZCVlFreEpReUJMUlZrdExTMHRMVnh1SWl3aWEyVjVVMmxuYm1GMGRYSmxJam9pWlhsS2VtRlhaSFZaV0ZJeFkyMVZhVTlwU2pCUldIQjJXVE5LVms1NmFGaFNSMlJzVVRKb2NtTklXa1ZVVlRsRldqQktXVTFGUmtaVFJFNUZVMGhLYkUxclRUTkxNSEJFVkROR2VGTnROVVJVVlRWVlltMDFiVnBGUm5sWldIQjZaRVJqTVZaSGFFeFBXRUpVVWtacmRrd3diek5aTUZaSlVteFdWRXd5T1VoV1JXeHNWa1ZPTUZSSE1WWlJNR04zVkd4R2JGa3pTblJUUm1zMFZVWk9hMVpWU2pCVU1WbDNZbXQwY0ZSclZuQmpia0poVFZjNWFtSldiSEZaYTNob1UyeHNWV0pGUmtWWGJVWnZWakZLVUZkcWJGSmhXRVp1V2xkb1EyRnVRak5TUjNNd1lWWkpOVTVXVmxkV1ZUVnlUMGhLYjFsVlRYbGhiVGcwVjBkYWVGbHFWbFppYlhoeFpFWkZkMDU1Y3pCaFZsSkpWRVpPTm1WRk1IcGxWWFJ2VFVaR1ZtRXdWVFJSVnpsSFVsaEtVRTFZUmxCU01WcFJVMVJDTmxsV2FIcFdWWEJ0WTBSU2JFMVVRazlPVjNSU1ZucFdUMU5XWTNaU1ZYUkZVMGhzYlU5VmJGaGtNMUl3WTFWc1lXTlhSakJTYTA1RVlVWmtjbUo2VmtSU00wSllUREkxUmsxWVl6SmxWM1JKVlZoQk1sVXhTbEppU0Zwd1VrVXdNRlpFVWt0VU1rWnNVVmQwYzFSV1VrMVVWV055V1RCYVRHSXpaRTlUVm05NVlraE9SR1JzVG5aUmFrWmFaVmRPVGxOVlNteGFiRXB1Wld0U2RVMHhSVGxRVTBselNXMWtjMkl5U21oaVJYUnNaVlZzYTBscWIybFpiVkpzV2xSVk1rNVVXWGRaTWxwcFRrUk9hazlYU1hsUFIwcHRUMVJvYkZsWFRtaGFiVVV5VGtSWmFXWlJQVDBpZlE9PSJ9 +` diff --git a/pkg/handlers/app.go b/pkg/handlers/app.go index cbf19784..79a3328e 100644 --- a/pkg/handlers/app.go +++ b/pkg/handlers/app.go @@ -276,7 +276,7 @@ func GetAppUpdates(w http.ResponseWriter, r *http.Request) { license := store.GetStore().GetLicense() updates := store.GetStore().GetUpdates() - licenseData, err := sdklicense.GetLatestLicense(license, store.GetStore().GetReplicatedAppEndpoint()) + licenseData, _, err := sdklicense.SyncLatestLicense(r.Context(), clientset, store.GetStore().GetNamespace(), license, store.GetStore().GetReplicatedAppEndpoint()) if err != nil { logger.Error(errors.Wrap(err, "failed to get latest license")) JSONCached(w, http.StatusOK, updates) diff --git a/pkg/handlers/healthz.go b/pkg/handlers/healthz.go index 37b19969..31d2945d 100644 --- a/pkg/handlers/healthz.go +++ b/pkg/handlers/healthz.go @@ -2,18 +2,50 @@ package handlers import ( "net/http" + "sync/atomic" "github.com/replicatedhq/replicated-sdk/pkg/buildversion" + "github.com/replicatedhq/replicated-sdk/pkg/startupstate" ) type HealthzResponse struct { Version string `json:"version"` + Status string `json:"status,omitempty"` +} + +// startupTracker is a package-level pointer to the bootstrap-state tracker. +// apiserver.Start() installs the tracker via SetStartupState before serving. +// We use atomic.Pointer so reads from /healthz are safe regardless of when +// the tracker is installed. +var startupTracker atomic.Pointer[startupstate.Tracker] + +// SetStartupState installs the bootstrap-state tracker for /healthz to consult. +// Pass nil to clear (used by tests). +func SetStartupState(t *startupstate.Tracker) { + startupTracker.Store(t) } func Healthz(w http.ResponseWriter, r *http.Request) { - healthzResponse := HealthzResponse{ + t := startupTracker.Load() + // Fail closed: if no tracker is installed, treat the SDK as still + // starting. Production wiring always installs a tracker before the + // listener accepts traffic, so reaching this branch indicates either a + // misconfigured caller or a regression — either way, 503 is the safe + // answer. + state := startupstate.Starting + if t != nil { + state = t.Get() + } + + resp := HealthzResponse{ Version: buildversion.Version(), + Status: state.String(), } - JSON(w, http.StatusOK, healthzResponse) + switch state { + case startupstate.Ready: + JSON(w, http.StatusOK, resp) + default: + JSON(w, http.StatusServiceUnavailable, resp) + } } diff --git a/pkg/handlers/healthz_test.go b/pkg/handlers/healthz_test.go new file mode 100644 index 00000000..e29a7b98 --- /dev/null +++ b/pkg/handlers/healthz_test.go @@ -0,0 +1,44 @@ +package handlers + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/replicatedhq/replicated-sdk/pkg/startupstate" + "github.com/stretchr/testify/require" +) + +func TestHealthz_Starting_Returns503(t *testing.T) { + tr := startupstate.New() + SetStartupState(tr) + t.Cleanup(func() { SetStartupState(nil) }) + + w := httptest.NewRecorder() + r := httptest.NewRequest(http.MethodGet, "/healthz", nil) + Healthz(w, r) + + require.Equal(t, http.StatusServiceUnavailable, w.Code) + + var body HealthzResponse + require.NoError(t, json.Unmarshal(w.Body.Bytes(), &body)) + require.Equal(t, "starting", body.Status) +} + +func TestHealthz_Ready_Returns200(t *testing.T) { + tr := startupstate.New() + tr.MarkReady() + SetStartupState(tr) + t.Cleanup(func() { SetStartupState(nil) }) + + w := httptest.NewRecorder() + r := httptest.NewRequest(http.MethodGet, "/healthz", nil) + Healthz(w, r) + + require.Equal(t, http.StatusOK, w.Code) + + var body HealthzResponse + require.NoError(t, json.Unmarshal(w.Body.Bytes(), &body)) + require.Equal(t, "ready", body.Status) +} diff --git a/pkg/handlers/license.go b/pkg/handlers/license.go index b5bb72db..c69c7c24 100644 --- a/pkg/handlers/license.go +++ b/pkg/handlers/license.go @@ -9,6 +9,7 @@ import ( kotsv1beta1 "github.com/replicatedhq/kotskinds/apis/kots/v1beta1" kotsv1beta2 "github.com/replicatedhq/kotskinds/apis/kots/v1beta2" licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" + "github.com/replicatedhq/replicated-sdk/pkg/k8sutil" sdklicense "github.com/replicatedhq/replicated-sdk/pkg/license" sdklicensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" "github.com/replicatedhq/replicated-sdk/pkg/logger" @@ -16,6 +17,12 @@ import ( "github.com/replicatedhq/replicated-sdk/pkg/util" ) +// LicenseCacheHeader signals to clients that a license response was +// served from the local cache rather than a fresh upstream call. The +// value `stale` is set whenever the request-time SyncLatest* call +// returned SourceCache. +const LicenseCacheHeader = "X-Replicated-License-Cache" + type LicenseInfo struct { LicenseID string `json:"licenseID"` AppSlug string `json:"appSlug"` @@ -37,11 +44,26 @@ type LicenseInfo struct { Entitlements interface{} `json:"entitlements,omitempty"` } +// markStaleIfCached sets the cache-stale header when the source indicates +// the response came from the local cache rather than upstream. +func markStaleIfCached(w http.ResponseWriter, source sdklicense.LicenseSource) { + if source == sdklicense.SourceCache { + w.Header().Set(LicenseCacheHeader, "stale") + } +} + func GetLicenseInfo(w http.ResponseWriter, r *http.Request) { wrapper := store.GetStore().GetLicense() if !util.IsAirgap() { - l, err := sdklicense.GetLatestLicense(wrapper, store.GetStore().GetReplicatedAppEndpoint()) + clientset, err := k8sutil.GetClientset() + if err != nil { + logger.Error(errors.Wrap(err, "failed to get clientset")) + JSONCached(w, http.StatusOK, licenseInfoFromWrapper(wrapper)) + return + } + + l, source, err := sdklicense.SyncLatestLicense(r.Context(), clientset, store.GetStore().GetNamespace(), wrapper, store.GetStore().GetReplicatedAppEndpoint()) if err != nil { logger.Error(errors.Wrap(err, "failed to get latest license")) JSONCached(w, http.StatusOK, licenseInfoFromWrapper(wrapper)) @@ -50,6 +72,7 @@ func GetLicenseInfo(w http.ResponseWriter, r *http.Request) { wrapper = l.License store.GetStore().SetLicense(wrapper) + markStaleIfCached(w, source) } JSON(w, http.StatusOK, licenseInfoFromWrapper(wrapper)) @@ -59,7 +82,14 @@ func GetLicenseFields(w http.ResponseWriter, r *http.Request) { licenseFields := store.GetStore().GetLicenseFields() if !util.IsAirgap() { - fields, err := sdklicense.GetLatestLicenseFields(store.GetStore().GetLicense(), store.GetStore().GetReplicatedAppEndpoint()) + clientset, err := k8sutil.GetClientset() + if err != nil { + logger.Error(errors.Wrap(err, "failed to get clientset")) + JSONCached(w, http.StatusOK, licenseFields) + return + } + + fields, source, err := sdklicense.SyncLatestLicenseFields(r.Context(), clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense(), store.GetStore().GetReplicatedAppEndpoint()) if err != nil { logger.Error(errors.Wrap(err, "failed to get latest license fields")) JSONCached(w, http.StatusOK, licenseFields) @@ -68,11 +98,19 @@ func GetLicenseFields(w http.ResponseWriter, r *http.Request) { licenseFields = fields store.GetStore().SetLicenseFields(licenseFields) + markStaleIfCached(w, source) } JSON(w, http.StatusOK, licenseFields) } +// GetLicenseField fetches a single license field. Unlike GetLicenseFields +// this handler does NOT go through a cache wrapper for the singular call: +// the in-memory store is repopulated from cache during bootstrap and on +// every SyncLatestLicenseFields hit, so the existing fall-through to +// licenseFields[fieldName] on upstream failure already serves the cached +// value transparently. Wrapping the singular call would duplicate that +// fallback path without adding cache coverage. func GetLicenseField(w http.ResponseWriter, r *http.Request) { fieldName := mux.Vars(r)["fieldName"] @@ -88,6 +126,7 @@ func GetLicenseField(w http.ResponseWriter, r *http.Request) { if lf, ok := licenseFields[fieldName]; !ok { JSONCached(w, http.StatusNotFound, fmt.Sprintf("license field %q not found", fieldName)) } else { + w.Header().Set(LicenseCacheHeader, "stale") JSONCached(w, http.StatusOK, lf) } return diff --git a/pkg/heartbeat/heartbeat.go b/pkg/heartbeat/heartbeat.go index 4b13c910..84db2868 100644 --- a/pkg/heartbeat/heartbeat.go +++ b/pkg/heartbeat/heartbeat.go @@ -1,6 +1,7 @@ package heartbeat import ( + "context" "fmt" "sync" "time" @@ -52,8 +53,14 @@ func Start() error { _, err := job.AddFunc(cronSpec, func() { logger.Debugf("sending a heartbeat for app %s", appSlug) + clientset, err := k8sutil.GetClientset() + if err != nil { + logger.Error(errors.Wrap(err, "failed to get clientset")) + return + } + if !util.IsAirgap() { - licenseData, err := sdklicense.GetLatestLicense(store.GetStore().GetLicense(), store.GetStore().GetReplicatedAppEndpoint()) + licenseData, _, err := sdklicense.SyncLatestLicense(context.Background(), clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense(), store.GetStore().GetReplicatedAppEndpoint()) if err != nil { logger.Error(errors.Wrap(err, "failed to get latest license")) } else { @@ -62,11 +69,6 @@ func Start() error { } go func() { - clientset, err := k8sutil.GetClientset() - if err != nil { - logger.Error(errors.Wrap(err, "failed to get clientset")) - return - } if err := report.SendInstanceData(clientset, store.GetStore()); err != nil { logger.Error(errors.Wrap(err, "failed to send instance data")) } diff --git a/pkg/license/cache/cache.go b/pkg/license/cache/cache.go new file mode 100644 index 00000000..5b61e905 --- /dev/null +++ b/pkg/license/cache/cache.go @@ -0,0 +1,201 @@ +// Package cache persists the license document and license-fields map to a +// dedicated Kubernetes Secret so the SDK can serve license endpoints when +// replicated.app is unreachable. +// +// The cache lives in a runtime-managed Secret (`replicated-license-cache`) +// rather than in the chart-managed `replicated` Secret, so helm upgrades +// don't clobber it. This follows the same pattern as the SDK's other +// out-of-band Secrets — `replicated-instance-report`, +// `replicated-custom-app-metrics-report`, `replicated-meta-data`, and +// `replicated-support-metadata` — none of which are rendered by any chart +// template either. +// +// Writes serialize through a package-level mutex (matching pkg/meta and +// pkg/supportbundle). Reads do not lock; concurrent reads of a Secret are +// safe. Read-only mode (store.GetReadOnlyMode()) suppresses writes so the +// SDK never attempts to create a Secret it doesn't have RBAC for. +package cache + +import ( + "context" + "encoding/json" + "sync" + "time" + + "github.com/pkg/errors" + licensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" + "github.com/replicatedhq/replicated-sdk/pkg/logger" + "github.com/replicatedhq/replicated-sdk/pkg/store" + "github.com/replicatedhq/replicated-sdk/pkg/util" + corev1 "k8s.io/api/core/v1" + kuberneteserrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +// SecretName is the name of the runtime-managed cache Secret. The chart's +// RBAC Role grants the SDK `update` on this exact resourceName. +const SecretName = "replicated-license-cache" + +// Secret data keys. +const ( + KeyLicense = "license" // raw signed license YAML bytes + KeyLicenseFields = "license-fields" // JSON-encoded LicenseFields map + KeyLastFetched = "last-fetched" // RFC3339 timestamp of most recent successful upstream sync +) + +// ErrLicenseCacheNotFound is returned by Read when the cache Secret does +// not exist or contains no usable license data. Callers use it to +// distinguish a cache miss from a transient k8s API failure. +var ErrLicenseCacheNotFound = errors.New("license cache not found") + +// CachedLicense is the in-memory shape of a successful Read. LicenseFields +// is nil if the fields key was never written; LicenseBytes is always +// populated on a successful Read (the secret is considered "found" only +// when the license key is present). +type CachedLicense struct { + LicenseBytes []byte + LicenseFields licensetypes.LicenseFields + LastFetched time.Time +} + +// cacheLock serializes writes against the cache Secret to avoid lost +// updates when bootstrap and request-time syncs race. Reads are unlocked. +var cacheLock sync.Mutex + +// Read returns the cached license, or ErrLicenseCacheNotFound if no usable +// cache exists. A Secret with no `license` key counts as not-found because +// the license document is the primary artifact — fields without a license +// to attach them to are not useful on their own. +func Read(ctx context.Context, clientset kubernetes.Interface, namespace string) (*CachedLicense, error) { + secret, err := clientset.CoreV1().Secrets(namespace).Get(ctx, SecretName, metav1.GetOptions{}) + if err != nil { + if kuberneteserrors.IsNotFound(err) { + return nil, ErrLicenseCacheNotFound + } + return nil, errors.Wrap(err, "failed to get license cache secret") + } + + licenseBytes, ok := secret.Data[KeyLicense] + if !ok || len(licenseBytes) == 0 { + return nil, ErrLicenseCacheNotFound + } + + cached := &CachedLicense{ + LicenseBytes: licenseBytes, + } + + if fieldsBytes, ok := secret.Data[KeyLicenseFields]; ok && len(fieldsBytes) > 0 { + var fields licensetypes.LicenseFields + if err := json.Unmarshal(fieldsBytes, &fields); err != nil { + // Malformed fields blob is non-fatal — return the license + // without fields and log so an operator can investigate. + logger.Infof("license cache: failed to unmarshal cached license-fields, ignoring: %v", err) + } else { + cached.LicenseFields = fields + } + } + + if tsBytes, ok := secret.Data[KeyLastFetched]; ok && len(tsBytes) > 0 { + if ts, err := time.Parse(time.RFC3339, string(tsBytes)); err == nil { + cached.LastFetched = ts + } + } + + return cached, nil +} + +// WriteLicense upserts the license bytes and refreshes the last-fetched +// timestamp. License-fields, if previously cached, are preserved. +// +// In read-only mode this is a no-op so the SDK never attempts a write it +// doesn't have RBAC for. +func WriteLicense(ctx context.Context, clientset kubernetes.Interface, namespace string, licenseBytes []byte) error { + if len(licenseBytes) == 0 { + return errors.New("refusing to cache empty license bytes") + } + return write(ctx, clientset, namespace, map[string][]byte{ + KeyLicense: licenseBytes, + }) +} + +// WriteLicenseFields upserts the license-fields map and refreshes the +// last-fetched timestamp. The license document, if previously cached, is +// preserved. +func WriteLicenseFields(ctx context.Context, clientset kubernetes.Interface, namespace string, fields licensetypes.LicenseFields) error { + encoded, err := json.Marshal(fields) + if err != nil { + return errors.Wrap(err, "failed to marshal license fields") + } + return write(ctx, clientset, namespace, map[string][]byte{ + KeyLicenseFields: encoded, + }) +} + +// write performs a partial-update upsert: existing keys not in `data` are +// preserved. A `last-fetched` timestamp is always refreshed alongside any +// successful update. +func write(ctx context.Context, clientset kubernetes.Interface, namespace string, data map[string][]byte) error { + if store.GetStore().GetReadOnlyMode() { + logger.Debugf("license cache: skipping write in read-only mode") + return nil + } + + cacheLock.Lock() + defer cacheLock.Unlock() + + merged := make(map[string][]byte, len(data)+1) + for k, v := range data { + merged[k] = v + } + merged[KeyLastFetched] = []byte(time.Now().UTC().Format(time.RFC3339)) + + existing, err := clientset.CoreV1().Secrets(namespace).Get(ctx, SecretName, metav1.GetOptions{}) + if err != nil && !kuberneteserrors.IsNotFound(err) { + return errors.Wrap(err, "failed to get license cache secret") + } + + if kuberneteserrors.IsNotFound(err) { + uid, err := util.GetReplicatedDeploymentUID(clientset, namespace) + if err != nil { + return errors.Wrap(err, "failed to get replicated deployment uid") + } + + secret := &corev1.Secret{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Secret", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: SecretName, + Namespace: namespace, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: util.GetReplicatedDeploymentName(), + UID: uid, + }, + }, + }, + Data: merged, + } + + if _, err := clientset.CoreV1().Secrets(namespace).Create(ctx, secret, metav1.CreateOptions{}); err != nil { + return errors.Wrap(err, "failed to create license cache secret") + } + return nil + } + + if existing.Data == nil { + existing.Data = map[string][]byte{} + } + for k, v := range merged { + existing.Data[k] = v + } + + if _, err := clientset.CoreV1().Secrets(namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return errors.Wrap(err, "failed to update license cache secret") + } + return nil +} diff --git a/pkg/license/cache/cache_test.go b/pkg/license/cache/cache_test.go new file mode 100644 index 00000000..3d3b062f --- /dev/null +++ b/pkg/license/cache/cache_test.go @@ -0,0 +1,187 @@ +package cache + +import ( + "context" + "encoding/json" + "testing" + "time" + + licensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" + "github.com/replicatedhq/replicated-sdk/pkg/store" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + kuberneteserrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + apimachinerytypes "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/fake" +) + +const testNamespace = "test-ns" + +// withDeployment seeds a fake clientset with the SDK's own Deployment so +// that owner-reference resolution succeeds during cache writes. +func withDeployment(t *testing.T) kubernetes.Interface { + t.Helper() + clientset := fake.NewSimpleClientset(&appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "replicated", + Namespace: testNamespace, + UID: apimachinerytypes.UID("deadbeef-0000-0000-0000-000000000000"), + }, + }) + return clientset +} + +func resetStore(t *testing.T) { + t.Helper() + store.SetStore(nil) +} + +func TestRead_NoSecret_ReturnsNotFound(t *testing.T) { + clientset := fake.NewSimpleClientset() + + _, err := Read(context.Background(), clientset, testNamespace) + require.ErrorIs(t, err, ErrLicenseCacheNotFound) +} + +func TestRead_SecretWithoutLicenseKey_ReturnsNotFound(t *testing.T) { + clientset := fake.NewSimpleClientset(&corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: SecretName, Namespace: testNamespace}, + Data: map[string][]byte{ + KeyLicenseFields: []byte("{}"), + }, + }) + + _, err := Read(context.Background(), clientset, testNamespace) + require.ErrorIs(t, err, ErrLicenseCacheNotFound, "fields-only secret must not satisfy a cache read") +} + +func TestWriteLicense_CreatesSecret(t *testing.T) { + store.InitInMemory(store.InitInMemoryStoreOptions{Namespace: testNamespace}) + t.Cleanup(func() { resetStore(t) }) + + clientset := withDeployment(t) + licenseBytes := []byte("license-yaml-document") + + require.NoError(t, WriteLicense(context.Background(), clientset, testNamespace, licenseBytes)) + + secret, err := clientset.CoreV1().Secrets(testNamespace).Get(context.Background(), SecretName, metav1.GetOptions{}) + require.NoError(t, err) + require.Equal(t, licenseBytes, secret.Data[KeyLicense]) + require.NotEmpty(t, secret.Data[KeyLastFetched], "last-fetched must be set on every write") + + require.Len(t, secret.OwnerReferences, 1) + require.Equal(t, "Deployment", secret.OwnerReferences[0].Kind) +} + +func TestWriteLicenseFields_PreservesLicense(t *testing.T) { + store.InitInMemory(store.InitInMemoryStoreOptions{Namespace: testNamespace}) + t.Cleanup(func() { resetStore(t) }) + + clientset := withDeployment(t) + ctx := context.Background() + + licenseBytes := []byte("license-yaml-document") + require.NoError(t, WriteLicense(ctx, clientset, testNamespace, licenseBytes)) + + fields := licensetypes.LicenseFields{ + "my_field": licensetypes.LicenseField{ + Title: "My Field", + Value: "v1", + }, + } + require.NoError(t, WriteLicenseFields(ctx, clientset, testNamespace, fields)) + + cached, err := Read(ctx, clientset, testNamespace) + require.NoError(t, err) + require.Equal(t, licenseBytes, cached.LicenseBytes, "license bytes must survive a fields-only write") + require.Equal(t, "v1", cached.LicenseFields["my_field"].Value) +} + +func TestWrite_RefreshesLastFetched(t *testing.T) { + store.InitInMemory(store.InitInMemoryStoreOptions{Namespace: testNamespace}) + t.Cleanup(func() { resetStore(t) }) + + clientset := withDeployment(t) + ctx := context.Background() + + require.NoError(t, WriteLicense(ctx, clientset, testNamespace, []byte("v1"))) + cached1, err := Read(ctx, clientset, testNamespace) + require.NoError(t, err) + + time.Sleep(1100 * time.Millisecond) // RFC3339 has 1s resolution. + + require.NoError(t, WriteLicense(ctx, clientset, testNamespace, []byte("v2"))) + cached2, err := Read(ctx, clientset, testNamespace) + require.NoError(t, err) + + require.True(t, cached2.LastFetched.After(cached1.LastFetched), "last-fetched must advance on subsequent writes") +} + +func TestWrite_ReadOnlyMode_NoSecretCreated(t *testing.T) { + store.InitInMemory(store.InitInMemoryStoreOptions{Namespace: testNamespace, ReadOnlyMode: true}) + t.Cleanup(func() { resetStore(t) }) + + clientset := fake.NewSimpleClientset() + + require.NoError(t, WriteLicense(context.Background(), clientset, testNamespace, []byte("license"))) + + _, err := clientset.CoreV1().Secrets(testNamespace).Get(context.Background(), SecretName, metav1.GetOptions{}) + require.True(t, kuberneteserrors.IsNotFound(err), "no secret should be created in read-only mode") +} + +func TestWriteLicense_RejectsEmpty(t *testing.T) { + store.InitInMemory(store.InitInMemoryStoreOptions{Namespace: testNamespace}) + t.Cleanup(func() { resetStore(t) }) + + clientset := withDeployment(t) + + require.Error(t, WriteLicense(context.Background(), clientset, testNamespace, nil), + "writing empty license bytes must be rejected to avoid corrupting the cache") +} + +func TestRead_MalformedFields_ReturnsLicenseWithoutFields(t *testing.T) { + clientset := fake.NewSimpleClientset(&corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: SecretName, Namespace: testNamespace}, + Data: map[string][]byte{ + KeyLicense: []byte("license-yaml"), + KeyLicenseFields: []byte("not valid json"), + }, + }) + + cached, err := Read(context.Background(), clientset, testNamespace) + require.NoError(t, err, "malformed fields must not fail the read") + require.NotNil(t, cached.LicenseBytes) + require.Nil(t, cached.LicenseFields, "malformed fields are dropped, not surfaced") +} + +// Sanity check: cached fields round-trip through JSON without losing the +// LicenseFieldSignature struct. +func TestRead_RoundTripsFieldSignatures(t *testing.T) { + encoded, err := json.Marshal(licensetypes.LicenseFields{ + "f1": licensetypes.LicenseField{ + Title: "F1", + Value: 42.0, + Signature: licensetypes.LicenseFieldSignature{ + V1: "sig-v1", + V2: "sig-v2", + }, + }, + }) + require.NoError(t, err) + + clientset := fake.NewSimpleClientset(&corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: SecretName, Namespace: testNamespace}, + Data: map[string][]byte{ + KeyLicense: []byte("license-yaml"), + KeyLicenseFields: encoded, + }, + }) + + cached, err := Read(context.Background(), clientset, testNamespace) + require.NoError(t, err) + require.Equal(t, "sig-v1", cached.LicenseFields["f1"].Signature.V1) + require.Equal(t, "sig-v2", cached.LicenseFields["f1"].Signature.V2) +} diff --git a/pkg/license/license.go b/pkg/license/license.go index 36be71b9..ed25ea6a 100644 --- a/pkg/license/license.go +++ b/pkg/license/license.go @@ -23,7 +23,10 @@ type LicenseData struct { License licensewrapper.LicenseWrapper } -func GetLicenseByID(licenseID string, endpoint string) (licensewrapper.LicenseWrapper, error) { +// GetLicenseByID fetches the license document by ID from the upstream +// Vendor Portal. Returns the parsed wrapper alongside the raw signed +// bytes so callers can persist them (e.g. to the cache). +func GetLicenseByID(licenseID string, endpoint string) (*LicenseData, error) { if endpoint == "" { endpoint = defaultReplicatedAppEndpoint } @@ -31,10 +34,10 @@ func GetLicenseByID(licenseID string, endpoint string) (licensewrapper.LicenseWr licenseData, err := getLicenseFromAPI(url, licenseID) if err != nil { - return licensewrapper.LicenseWrapper{}, errors.Wrap(err, "failed to get license from api") + return nil, errors.Wrap(err, "failed to get license from api") } - return licenseData.License, nil + return licenseData, nil } func GetLatestLicense(wrapper licensewrapper.LicenseWrapper, endpoint string) (*LicenseData, error) { diff --git a/pkg/license/sync.go b/pkg/license/sync.go new file mode 100644 index 00000000..38c40c11 --- /dev/null +++ b/pkg/license/sync.go @@ -0,0 +1,157 @@ +package license + +import ( + "context" + "sync" + + "github.com/pkg/errors" + licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" + "github.com/replicatedhq/replicated-sdk/pkg/license/cache" + "github.com/replicatedhq/replicated-sdk/pkg/license/types" + "github.com/replicatedhq/replicated-sdk/pkg/logger" + "k8s.io/client-go/kubernetes" +) + +// LicenseSource identifies whether a Sync* result came from the live +// upstream or from the local cache. Handlers use this to decide whether +// to set the X-Replicated-License-Cache: stale response header. +type LicenseSource int + +const ( + SourceUpstream LicenseSource = iota + SourceCache +) + +// staleWarnOnce ensures the cache-fallback warning log fires at most once +// per pod lifetime. Subsequent fallbacks are silent to avoid log spam +// during a sustained replicated.app outage. +var staleWarnOnce sync.Once + +// warnStale logs the cache-fallback the first time it fires in a given +// pod. The original upstream error is included to give operators a clue +// about why the upstream call failed. +func warnStale(upstreamErr error) { + staleWarnOnce.Do(func() { + logger.Warnf("license_cache_fallback: serving cached license because upstream is unavailable: %v", upstreamErr) + }) +} + +// SyncLicenseByID fetches a license by ID from the upstream Vendor Portal. +// On success, the license bytes are written through to the cache and +// SourceUpstream is returned. +// +// On upstream failure, the cache is consulted: if a previously-cached +// license is present and parses correctly, it is returned with +// SourceCache. If the cache is missing or unusable, the original upstream +// error is returned wrapped (so callers can inspect it). +// +// This wrapper is the entry point for the integration-mode boot path. +// Production-mode boots use the chart-embedded license bytes and never +// reach here. +func SyncLicenseByID( + ctx context.Context, + clientset kubernetes.Interface, + namespace string, + licenseID string, + endpoint string, +) (*LicenseData, LicenseSource, error) { + data, upstreamErr := GetLicenseByID(licenseID, endpoint) + if upstreamErr == nil { + if writeErr := cache.WriteLicense(ctx, clientset, namespace, data.LicenseBytes); writeErr != nil { + logger.Infof("license cache: write-through after GetLicenseByID failed, continuing: %v", writeErr) + } + return data, SourceUpstream, nil + } + + cached, cacheErr := cache.Read(ctx, clientset, namespace) + if cacheErr != nil { + return nil, SourceUpstream, errors.Wrap(upstreamErr, "upstream failed and license cache miss") + } + + parsed, parseErr := LoadLicenseFromBytes(cached.LicenseBytes) + if parseErr != nil { + return nil, SourceUpstream, errors.Wrapf(upstreamErr, "upstream failed and cached license could not be parsed: %v", parseErr) + } + + warnStale(upstreamErr) + return &LicenseData{ + LicenseBytes: cached.LicenseBytes, + License: parsed, + }, SourceCache, nil +} + +// SyncLatestLicense refreshes the license against the upstream and writes +// through to the cache on success. On upstream failure, falls back to the +// cached license; on cache miss, returns the original upstream error. +// +// The returned LicenseData.LicenseBytes is what gets persisted on the +// success path. Cache hits return the bytes that were written by the most +// recent successful upstream call. +func SyncLatestLicense( + ctx context.Context, + clientset kubernetes.Interface, + namespace string, + wrapper licensewrapper.LicenseWrapper, + endpoint string, +) (*LicenseData, LicenseSource, error) { + data, upstreamErr := GetLatestLicense(wrapper, endpoint) + if upstreamErr == nil { + if writeErr := cache.WriteLicense(ctx, clientset, namespace, data.LicenseBytes); writeErr != nil { + logger.Infof("license cache: write-through after GetLatestLicense failed, continuing: %v", writeErr) + } + return data, SourceUpstream, nil + } + + cached, cacheErr := cache.Read(ctx, clientset, namespace) + if cacheErr != nil { + return nil, SourceUpstream, errors.Wrap(upstreamErr, "upstream failed and license cache miss") + } + + parsed, parseErr := LoadLicenseFromBytes(cached.LicenseBytes) + if parseErr != nil { + return nil, SourceUpstream, errors.Wrapf(upstreamErr, "upstream failed and cached license could not be parsed: %v", parseErr) + } + + warnStale(upstreamErr) + return &LicenseData{ + LicenseBytes: cached.LicenseBytes, + License: parsed, + }, SourceCache, nil +} + +// SyncLatestLicenseFields refreshes the license-fields map against the +// upstream and writes through to the cache on success. On upstream +// failure, falls back to the cached fields; on cache miss, returns the +// original upstream error. +// +// Note: GetLatestLicenseField (singular) is intentionally NOT wrapped. +// Its handler already falls back to the in-memory store on upstream +// failure, and the in-memory store is repopulated from the cache during +// bootstrap. Wrapping it would duplicate that fallback path without +// adding value. +func SyncLatestLicenseFields( + ctx context.Context, + clientset kubernetes.Interface, + namespace string, + wrapper licensewrapper.LicenseWrapper, + endpoint string, +) (types.LicenseFields, LicenseSource, error) { + fields, upstreamErr := GetLatestLicenseFields(wrapper, endpoint) + if upstreamErr == nil { + if writeErr := cache.WriteLicenseFields(ctx, clientset, namespace, fields); writeErr != nil { + logger.Infof("license cache: write-through after GetLatestLicenseFields failed, continuing: %v", writeErr) + } + return fields, SourceUpstream, nil + } + + cached, cacheErr := cache.Read(ctx, clientset, namespace) + if cacheErr != nil { + return nil, SourceUpstream, errors.Wrap(upstreamErr, "upstream failed and license cache miss") + } + if cached.LicenseFields == nil { + return nil, SourceUpstream, errors.Wrap(upstreamErr, "upstream failed and no cached license fields available") + } + + warnStale(upstreamErr) + return cached.LicenseFields, SourceCache, nil +} diff --git a/pkg/license/sync_test.go b/pkg/license/sync_test.go new file mode 100644 index 00000000..0a7ce2cc --- /dev/null +++ b/pkg/license/sync_test.go @@ -0,0 +1,243 @@ +package license + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "sync" + "testing" + + "github.com/replicatedhq/replicated-sdk/pkg/license/cache" + "github.com/replicatedhq/replicated-sdk/pkg/license/types" + "github.com/replicatedhq/replicated-sdk/pkg/store" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + apimachinerytypes "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/fake" +) + +const syncTestNamespace = "test-ns" + +// testFixture bundles the moving parts every Sync* test needs: a fake +// kubernetes clientset (with the SDK Deployment so cache writes can resolve +// owner references), an initialized in-memory store, and a reset hook for +// the package-level once-per-pod warning. +type testFixture struct { + clientset kubernetes.Interface +} + +func newTestFixture(t *testing.T) *testFixture { + t.Helper() + + store.InitInMemory(store.InitInMemoryStoreOptions{ + Namespace: syncTestNamespace, + }) + t.Cleanup(func() { store.SetStore(nil) }) + + resetStaleWarnOnce(t) + + clientset := fake.NewSimpleClientset(&appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "replicated", + Namespace: syncTestNamespace, + UID: apimachinerytypes.UID("deadbeef-0000-0000-0000-000000000000"), + }, + }) + + return &testFixture{clientset: clientset} +} + +// resetStaleWarnOnce makes the once-per-pod warning fire fresh in each +// test. The package-level sync.Once otherwise leaks state across tests. +func resetStaleWarnOnce(t *testing.T) { + t.Helper() + staleWarnOnce = sync.Once{} +} + +// licenseServer returns an httptest server that serves validLicenseYAML +// from any path (so the same server backs GetLicenseByID and +// GetLatestLicense regardless of URL shape). +func licenseServer(t *testing.T) *httptest.Server { + t.Helper() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/yaml") + _, _ = w.Write([]byte(validLicenseYAML)) + })) + t.Cleanup(srv.Close) + return srv +} + +// brokenServer returns an httptest server that 503s every request, +// simulating an unreachable replicated.app. +func brokenServer(t *testing.T) *httptest.Server { + t.Helper() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "upstream broken", http.StatusServiceUnavailable) + })) + t.Cleanup(srv.Close) + return srv +} + +// fieldsServer returns an httptest server that serves a fixed +// LicenseFields payload from any path. +func fieldsServer(t *testing.T, fields types.LicenseFields) *httptest.Server { + t.Helper() + body, err := json.Marshal(fields) + require.NoError(t, err) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write(body) + })) + t.Cleanup(srv.Close) + return srv +} + +// ---- SyncLicenseByID ---- + +func TestSyncLicenseByID_UpstreamSuccess_WritesThroughToCache(t *testing.T) { + f := newTestFixture(t) + srv := licenseServer(t) + + data, source, err := SyncLicenseByID(context.Background(), f.clientset, syncTestNamespace, "license-id", srv.URL) + require.NoError(t, err) + require.Equal(t, SourceUpstream, source) + require.NotEmpty(t, data.LicenseBytes) + + cached, err := cache.Read(context.Background(), f.clientset, syncTestNamespace) + require.NoError(t, err, "successful upstream call must write through to cache") + require.Equal(t, data.LicenseBytes, cached.LicenseBytes) +} + +func TestSyncLicenseByID_UpstreamFailure_CacheHit_ReturnsCached(t *testing.T) { + f := newTestFixture(t) + srv := brokenServer(t) + + require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML))) + + data, source, err := SyncLicenseByID(context.Background(), f.clientset, syncTestNamespace, "license-id", srv.URL) + require.NoError(t, err) + require.Equal(t, SourceCache, source) + require.Equal(t, []byte(validLicenseYAML), data.LicenseBytes) +} + +func TestSyncLicenseByID_UpstreamFailure_CacheMiss_ReturnsWrappedError(t *testing.T) { + f := newTestFixture(t) + srv := brokenServer(t) + + _, source, err := SyncLicenseByID(context.Background(), f.clientset, syncTestNamespace, "license-id", srv.URL) + require.Error(t, err) + require.Equal(t, SourceUpstream, source, "no fallback available, source must reflect the failed attempt") + require.Contains(t, err.Error(), "license cache miss") +} + +// ---- SyncLatestLicense ---- + +func TestSyncLatestLicense_UpstreamSuccess_WritesThroughToCache(t *testing.T) { + f := newTestFixture(t) + srv := licenseServer(t) + + wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) + require.NoError(t, err) + + data, source, err := SyncLatestLicense(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL) + require.NoError(t, err) + require.Equal(t, SourceUpstream, source) + + cached, err := cache.Read(context.Background(), f.clientset, syncTestNamespace) + require.NoError(t, err) + require.Equal(t, data.LicenseBytes, cached.LicenseBytes) +} + +func TestSyncLatestLicense_UpstreamFailure_CacheHit_ReturnsCached(t *testing.T) { + f := newTestFixture(t) + srv := brokenServer(t) + + wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) + require.NoError(t, err) + + require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML))) + + data, source, err := SyncLatestLicense(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL) + require.NoError(t, err) + require.Equal(t, SourceCache, source) + require.Equal(t, []byte(validLicenseYAML), data.LicenseBytes) +} + +func TestSyncLatestLicense_UpstreamFailure_CacheMiss_ReturnsWrappedError(t *testing.T) { + f := newTestFixture(t) + srv := brokenServer(t) + + wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) + require.NoError(t, err) + + _, source, err := SyncLatestLicense(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL) + require.Error(t, err) + require.Equal(t, SourceUpstream, source) + require.Contains(t, err.Error(), "license cache miss") +} + +// ---- SyncLatestLicenseFields ---- + +func TestSyncLatestLicenseFields_UpstreamSuccess_WritesThroughToCache(t *testing.T) { + f := newTestFixture(t) + expected := types.LicenseFields{ + "my_field": types.LicenseField{Title: "My Field", Value: "v1"}, + } + srv := fieldsServer(t, expected) + + wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) + require.NoError(t, err) + + // In production a license is always cached before fields are + // (bootstrapCritical → bootstrapBackground ordering), so seed that + // state before exercising the fields write-through. + require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML))) + + got, source, err := SyncLatestLicenseFields(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL) + require.NoError(t, err) + require.Equal(t, SourceUpstream, source) + require.Equal(t, "v1", got["my_field"].Value) + + cached, err := cache.Read(context.Background(), f.clientset, syncTestNamespace) + require.NoError(t, err) + require.Equal(t, "v1", cached.LicenseFields["my_field"].Value) +} + +func TestSyncLatestLicenseFields_UpstreamFailure_CacheHit_ReturnsCached(t *testing.T) { + f := newTestFixture(t) + srv := brokenServer(t) + + wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) + require.NoError(t, err) + + cached := types.LicenseFields{ + "my_field": types.LicenseField{Title: "Cached", Value: "stale"}, + } + // Seed the cache with both a license (so Read returns success) and fields. + require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML))) + require.NoError(t, cache.WriteLicenseFields(context.Background(), f.clientset, syncTestNamespace, cached)) + + got, source, err := SyncLatestLicenseFields(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL) + require.NoError(t, err) + require.Equal(t, SourceCache, source) + require.Equal(t, "stale", got["my_field"].Value) +} + +func TestSyncLatestLicenseFields_UpstreamFailure_NoCachedFields_ReturnsWrappedError(t *testing.T) { + f := newTestFixture(t) + srv := brokenServer(t) + + wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) + require.NoError(t, err) + + // License cached but fields never written — fields fallback must miss. + require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML))) + + _, source, err := SyncLatestLicenseFields(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL) + require.Error(t, err) + require.Equal(t, SourceUpstream, source) + require.Contains(t, err.Error(), "no cached license fields available") +} diff --git a/pkg/license/testdata_test.go b/pkg/license/testdata_test.go new file mode 100644 index 00000000..897a5300 --- /dev/null +++ b/pkg/license/testdata_test.go @@ -0,0 +1,54 @@ +package license + +// validLicenseYAML is a signed test license that round-trips through the +// embedded licensewrapper public key. Sourced from signature_test.go's +// "basic valid signature" fixture, exposed as a constant here so the +// sync_test.go suite can reuse it. +const validLicenseYAML = `apiVersion: kots.io/v1beta1 +kind: License +metadata: + name: testcustomer +spec: + appSlug: my-app + channelID: 1vusIYZLAVxMG6q760OJmRKj5i5 + channelName: My Channel + customerName: Test Customer + endpoint: https://replicated.app + entitlements: + bool_field: + title: Bool Field + value: true + valueType: Boolean + expires_at: + description: License Expiration + title: Expiration + value: "2030-07-27T00:00:00Z" + valueType: String + hidden_field: + isHidden: true + title: Hidden Field + value: this is secret + valueType: String + int_field: + title: Int Field + value: 123 + valueType: Integer + string_field: + title: StringField + value: single line text + valueType: String + text_field: + title: Text Field + value: |- + multi + line + text + valueType: Text + isAirgapSupported: true + isGitOpsSupported: true + isSnapshotSupported: true + licenseID: 1vusOokxAVp1tkRGuyxnF23PJcq + licenseSequence: 7 + licenseType: prod + signature: eyJsaWNlbnNlRGF0YSI6ImV5SmhjR2xXWlhKemFXOXVJam9pYTI5MGN5NXBieTkyTVdKbGRHRXhJaXdpYTJsdVpDSTZJa3hwWTJWdWMyVWlMQ0p0WlhSaFpHRjBZU0k2ZXlKdVlXMWxJam9pZEdWemRHTjFjM1J2YldWeUluMHNJbk53WldNaU9uc2liR2xqWlc1elpVbEVJam9pTVhaMWMwOXZhM2hCVm5BeGRHdFNSM1Y1ZUc1R01qTlFTbU54SWl3aWJHbGpaVzV6WlZSNWNHVWlPaUp3Y205a0lpd2lZM1Z6ZEc5dFpYSk9ZVzFsSWpvaVZHVnpkQ0JEZFhOMGIyMWxjaUlzSW1Gd2NGTnNkV2NpT2lKdGVTMWhjSEFpTENKamFHRnVibVZzU1VRaU9pSXhkblZ6U1ZsYVRFRldlRTFITm5FM05qQlBTbTFTUzJvMWFUVWlMQ0pqYUdGdWJtVnNUbUZ0WlNJNklrMTVJRU5vWVc1dVpXd2lMQ0pzYVdObGJuTmxVMlZ4ZFdWdVkyVWlPamNzSW1WdVpIQnZhVzUwSWpvaWFIUjBjSE02THk5eVpYQnNhV05oZEdWa0xtRndjQ0lzSW1WdWRHbDBiR1Z0Wlc1MGN5STZleUppYjI5c1gyWnBaV3hrSWpwN0luUnBkR3hsSWpvaVFtOXZiQ0JHYVdWc1pDSXNJblpoYkhWbElqcDBjblZsTENKMllXeDFaVlI1Y0dVaU9pSkNiMjlzWldGdUluMHNJbVY0Y0dseVpYTmZZWFFpT25zaWRHbDBiR1VpT2lKRmVIQnBjbUYwYVc5dUlpd2laR1Z6WTNKcGNIUnBiMjRpT2lKTWFXTmxibk5sSUVWNGNHbHlZWFJwYjI0aUxDSjJZV3gxWlNJNklqSXdNekF0TURjdE1qZFVNREE2TURBNk1EQmFJaXdpZG1Gc2RXVlVlWEJsSWpvaVUzUnlhVzVuSW4wc0ltaHBaR1JsYmw5bWFXVnNaQ0k2ZXlKMGFYUnNaU0k2SWtocFpHUmxiaUJHYVdWc1pDSXNJblpoYkhWbElqb2lkR2hwY3lCcGN5QnpaV055WlhRaUxDSjJZV3gxWlZSNWNHVWlPaUpUZEhKcGJtY2lMQ0pwYzBocFpHUmxiaUk2ZEhKMVpYMHNJbWx1ZEY5bWFXVnNaQ0k2ZXlKMGFYUnNaU0k2SWtsdWRDQkdhV1ZzWkNJc0luWmhiSFZsSWpveE1qTXNJblpoYkhWbFZIbHdaU0k2SWtsdWRHVm5aWElpZlN3aWMzUnlhVzVuWDJacFpXeGtJanA3SW5ScGRHeGxJam9pVTNSeWFXNW5SbWxsYkdRaUxDSjJZV3gxWlNJNkluTnBibWRzWlNCc2FXNWxJSFJsZUhRaUxDSjJZV3gxWlZSNWNHVWlPaUpUZEhKcGJtY2lmU3dpZEdWNGRGOW1hV1ZzWkNJNmV5SjBhWFJzWlNJNklsUmxlSFFnUm1sbGJHUWlMQ0oyWVd4MVpTSTZJbTExYkhScFhHNXNhVzVsWEc1MFpYaDBJaXdpZG1Gc2RXVlVlWEJsSWpvaVZHVjRkQ0o5ZlN3aWFYTkJhWEpuWVhCVGRYQndiM0owWldRaU9uUnlkV1VzSW1selIybDBUM0J6VTNWd2NHOXlkR1ZrSWpwMGNuVmxMQ0pwYzFOdVlYQnphRzkwVTNWd2NHOXlkR1ZrSWpwMGNuVmxmWDA9IiwiaW5uZXJTaWduYXR1cmUiOiJleUpzYVdObGJuTmxVMmxuYm1GMGRYSmxJam9pYUhneE1XTXZUR1ozUTNoVE5YRmtRWEJGU1hGdVRrMU9NMHBLYTJzNFZHZFhSVVpzVDFKVlJ6UjJjR1YzZEZoV1YzbG1lamRZY0hBd1ExazJZamRyUVRSS2N6TklhR3d3YkZJMFdUQTFMemN2UVVkQ2FEZFZNSGczUkhaTVozUXpVM00wYm5GTFZTdFhXRXBTVHpKWVFVRnZSME4xZFRWR1RGcHJRVWhYY1RSUVFtMXphSFY2Y1ZsdmNucHhlbGhGWVZWVlpFUlVkVXhDTW1nNWFIZ3dXRWhQUmxwUk16bHVkbTlPUjJaT2R5OTRTVmRaZEhSUGRYZHZhMncyTVZsb1JVeFZlRmQxU1ZSRmMwTlVhM2xtTVRNd09IazVSbFJzWlRKeVYyZEVlSEZNYTBSUFNXVXlPRWwzUzJSQkwySXdWVUl5VEZGbVRWcHdWemwyUTNCSkwybHlWek5uYmpaeU5WWjNWMjB2U1dweWJtNDNSelJrVmpadVYzcFRkMGhQUTJSdWEwMTRNRXQ1VVVOa0wxQjFaWEpUYjNSdVEwOXRTMDEzWlRSTGJqaERkMU5YVVRRNGRURkRNbTFpV1VzeGRYTlpOM1YzUFQwaUxDSndkV0pzYVdOTFpYa2lPaUl0TFMwdExVSkZSMGxPSUZCVlFreEpReUJMUlZrdExTMHRMVnh1VFVsSlFrbHFRVTVDWjJ0eGFHdHBSemwzTUVKQlVVVkdRVUZQUTBGUk9FRk5TVWxDUTJkTFEwRlJSVUZ6TkhKdlVIcDFhV1JNZVhOMmIxWTJkemxhTkZ4dVdHRmliME5tWTJNeGFHZFZhQ3N3V1VkS2NFNURSVXhyTjBaTFF5OTJhemR6ZERsR05tY3dUMjlrU0VSbGVYZFJXa2hLZFU1TVpsUnNRbEJHUTJOaU5seHVObTlzVEZOeWNGQTRjbFUzU0d4SGJsRkVSMFJNYVhkS1EyaGtSRGRVVUdSM2FXdHBkMHRGY201aldqaEdaalZsU25vd2RETmlUWFpyVDJaVVluSkJiRnh1WWtGQ1kwbzVNVmxVT1hKdVVXOXFkVWN4UldKUVRqaEZWblI2TWxZNE5IZHViR2Q0TUhCd2JEVjRPSFpOYlhwcE1ISnVibEZVV1VGamJ6WnFhMnBJTTF4dVRuTlVkWE4xUzFkdlJGUjVNWE5yZGtSUk9IbEJZV0ptWTNNME4zWnNRazAwU0RGT1JFNHZSSFJhWWxZdllubDJia0o2YkM4eFZrVnpURmRqWlZWcFRGeHVSWEYxT0VkeWF5dFFVRGQyUkdSd2JFUjNjWFpQV2t4RmRYazNkamhuUm01U09WUlVSV3ByTlVvNWRuWlVTR2RtU25VemVubEVPR2xLWTBSRE5YcHFPVnh1YjFGSlJFRlJRVUpjYmkwdExTMHRSVTVFSUZCVlFreEpReUJMUlZrdExTMHRMVnh1SWl3aWEyVjVVMmxuYm1GMGRYSmxJam9pWlhsS2VtRlhaSFZaV0ZJeFkyMVZhVTlwU2pCUldIQjJXVE5LVms1NmFGaFNSMlJzVVRKb2NtTklXa1ZVVlRsRldqQktXVTFGUmtaVFJFNUZVMGhLYkUxclRUTkxNSEJFVkROR2VGTnROVVJVVlRWVlltMDFiVnBGUm5sWldIQjZaRVJqTVZaSGFFeFBXRUpVVWtacmRrd3diek5aTUZaSlVteFdWRXd5T1VoV1JXeHNWa1ZPTUZSSE1WWlJNR04zVkd4R2JGa3pTblJUUm1zMFZVWk9hMVpWU2pCVU1WbDNZbXQwY0ZSclZuQmpia0poVFZjNWFtSldiSEZaYTNob1UyeHNWV0pGUmtWWGJVWnZWakZLVUZkcWJGSmhXRVp1V2xkb1EyRnVRak5TUjNNd1lWWkpOVTVXVmxkV1ZUVnlUMGhLYjFsVlRYbGhiVGcwVjBkYWVGbHFWbFppYlhoeFpFWkZkMDU1Y3pCaFZsSkpWRVpPTm1WRk1IcGxWWFJ2VFVaR1ZtRXdWVFJSVnpsSFVsaEtVRTFZUmxCU01WcFJVMVJDTmxsV2FIcFdWWEJ0WTBSU2JFMVVRazlPVjNSU1ZucFdUMU5XWTNaU1ZYUkZVMGhzYlU5VmJGaGtNMUl3WTFWc1lXTlhSakJTYTA1RVlVWmtjbUo2VmtSU00wSllUREkxUmsxWVl6SmxWM1JKVlZoQk1sVXhTbEppU0Zwd1VrVXdNRlpFVWt0VU1rWnNVVmQwYzFSV1VrMVVWV055V1RCYVRHSXpaRTlUVm05NVlraE9SR1JzVG5aUmFrWmFaVmRPVGxOVlNteGFiRXB1Wld0U2RVMHhSVGxRVTBselNXMWtjMkl5U21oaVJYUnNaVlZzYTBscWIybFpiVkpzV2xSVk1rNVVXWGRaTWxwcFRrUk9hazlYU1hsUFIwcHRUMVJvYkZsWFRtaGFiVVV5VGtSWmFXWlJQVDBpZlE9PSJ9 +` diff --git a/pkg/startupstate/startupstate.go b/pkg/startupstate/startupstate.go new file mode 100644 index 00000000..682938f0 --- /dev/null +++ b/pkg/startupstate/startupstate.go @@ -0,0 +1,94 @@ +// Package startupstate exposes a shared, concurrency-safe enum that tracks the +// lifecycle of the SDK's bootstrap process. It is consumed by the apiserver +// (which transitions the state) and by the /healthz handler (which reports the +// state to readiness probes). +// +// The package lives outside both apiserver and handlers to avoid an import +// cycle between them. +package startupstate + +import "sync/atomic" + +// State enumerates the bootstrap lifecycle phases. +type State int32 + +const ( + // Starting indicates the API server is up but bootstrap has not yet + // completed (or, with requireUpstreamOnStartup, has not yet completed + // the full critical+background path). + Starting State = iota + // Ready indicates the SDK is ready to serve requests. With the default + // configuration this fires after bootstrapCritical succeeds or after + // the readiness timeout elapses, whichever comes first. + Ready + // Failed indicates bootstrap encountered a permanent (non-retryable) + // error. /healthz will surface 503 in this state and the process will + // typically exit shortly after. + Failed +) + +// String returns a stable, lower-case representation suitable for logs and +// JSON payloads. +func (s State) String() string { + switch s { + case Starting: + return "starting" + case Ready: + return "ready" + case Failed: + return "failed" + default: + return "unknown" + } +} + +// Tracker holds the current bootstrap state. The zero value is usable and +// reports Starting. +type Tracker struct { + state atomic.Int32 +} + +// New returns a Tracker initialized to Starting. +func New() *Tracker { + return &Tracker{} +} + +// Get returns the current state. +func (t *Tracker) Get() State { + return State(t.state.Load()) +} + +// Set unconditionally writes the supplied state. +func (t *Tracker) Set(s State) { + t.state.Store(int32(s)) +} + +// MarkReady transitions the tracker to Ready unless it is already Failed. +// Returns true if the call caused the transition. +// +// Failed is sticky: once a permanent bootstrap error has been recorded we +// never silently flip back to Ready. +func (t *Tracker) MarkReady() bool { + for { + current := State(t.state.Load()) + if current == Failed { + return false + } + if current == Ready { + return false + } + if t.state.CompareAndSwap(int32(current), int32(Ready)) { + return true + } + } +} + +// MarkFailed transitions the tracker to Failed. Failed is terminal. +func (t *Tracker) MarkFailed() { + t.state.Store(int32(Failed)) +} + +// IsReady is a convenience that returns true iff the current state is Ready. +func (t *Tracker) IsReady() bool { + return t.Get() == Ready +} diff --git a/pkg/startupstate/startupstate_test.go b/pkg/startupstate/startupstate_test.go new file mode 100644 index 00000000..3af9b3fa --- /dev/null +++ b/pkg/startupstate/startupstate_test.go @@ -0,0 +1,43 @@ +package startupstate + +import ( + "sync" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestMarkFailed_BlocksLaterMarkReady(t *testing.T) { + tr := New() + tr.MarkFailed() + + require.False(t, tr.MarkReady(), "MarkReady after MarkFailed must not transition") + require.Equal(t, Failed, tr.Get(), "Failed state must be sticky") +} + +func TestConcurrentMarkReady(t *testing.T) { + tr := New() + const goroutines = 64 + + var wg sync.WaitGroup + transitions := make(chan bool, goroutines) + wg.Add(goroutines) + for i := 0; i < goroutines; i++ { + go func() { + defer wg.Done() + transitions <- tr.MarkReady() + }() + } + wg.Wait() + close(transitions) + + count := 0 + for ok := range transitions { + if ok { + count++ + } + } + + require.Equal(t, 1, count, "exactly one MarkReady call should report a transition") + require.True(t, tr.IsReady(), "expected Ready after concurrent MarkReady") +} diff --git a/pkg/store/memory_store.go b/pkg/store/memory_store.go index 7eefd516..b59adadf 100644 --- a/pkg/store/memory_store.go +++ b/pkg/store/memory_store.go @@ -2,18 +2,30 @@ package store import ( "strings" + "sync" + licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" appstatetypes "github.com/replicatedhq/replicated-sdk/pkg/appstate/types" licensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" + "github.com/replicatedhq/replicated-sdk/pkg/logger" upstreamtypes "github.com/replicatedhq/replicated-sdk/pkg/upstream/types" - licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" ) +// InMemoryStore is the SDK's process-local source of truth for license and +// app state. +// +// Concurrency: fields written outside of InitInMemory are protected by mu +// (an RWMutex). The fields broken out below as "set once at init" are +// written exclusively by InitInMemory, which runs single-threaded before +// the HTTP listener accepts traffic, so they intentionally do not take +// the lock. Adding a setter for any of those fields requires moving it +// under the mu-protected group. type InMemoryStore struct { + mu sync.RWMutex + + // Set once at init (no locking required for read). replicatedID string appID string - license licensewrapper.LicenseWrapper - licenseFields licensetypes.LicenseFields appSlug string appName string channelID string @@ -26,12 +38,18 @@ type InMemoryStore struct { replicatedAppEndpoint string releaseImages []string namespace string - appStatus appstatetypes.AppStatus - updates []upstreamtypes.ChannelRelease + reportAllImages bool + readOnlyMode bool + + // Mutated at runtime by background goroutines (heartbeat, appstate + // operator) and request handlers concurrently with each other. All + // reads and writes of these fields go through mu. + license licensewrapper.LicenseWrapper + licenseFields licensetypes.LicenseFields + appStatus appstatetypes.AppStatus + updates []upstreamtypes.ChannelRelease // podImages holds namespace -> podUID -> []ImageInfo - podImages map[string]map[string][]appstatetypes.ImageInfo - reportAllImages bool - readOnlyMode bool + podImages map[string]map[string][]appstatetypes.ImageInfo } type InitInMemoryStoreOptions struct { @@ -86,27 +104,44 @@ func (s *InMemoryStore) GetAppID() string { } func (s *InMemoryStore) GetLicense() licensewrapper.LicenseWrapper { + s.mu.RLock() + defer s.mu.RUnlock() return s.license } func (s *InMemoryStore) SetLicense(license licensewrapper.LicenseWrapper) { + s.mu.Lock() + defer s.mu.Unlock() // DeepCopy appropriate version if license.V1 != nil { s.license = licensewrapper.LicenseWrapper{ V1: license.V1.DeepCopy(), } - } else if license.V2 != nil { + return + } + if license.V2 != nil { s.license = licensewrapper.LicenseWrapper{ V2: license.V2.DeepCopy(), } + return } + // Both V1 and V2 are nil — the previous license is preserved + // rather than overwritten with a zero value. Production callers go + // through loadAndVerifyLicense / Sync* which never return an + // empty wrapper alongside a nil error, so reaching this branch is + // a contract violation worth surfacing. + logger.Debugf("InMemoryStore.SetLicense called with empty wrapper; retaining previous license") } func (s *InMemoryStore) GetLicenseFields() licensetypes.LicenseFields { + s.mu.RLock() + defer s.mu.RUnlock() return s.licenseFields } func (s *InMemoryStore) SetLicenseFields(licenseFields licensetypes.LicenseFields) { + s.mu.Lock() + defer s.mu.Unlock() // copy by value not reference if licenseFields == nil { s.licenseFields = nil @@ -121,6 +156,8 @@ func (s *InMemoryStore) SetLicenseFields(licenseFields licensetypes.LicenseField } func (s *InMemoryStore) IsDevLicense() bool { + s.mu.RLock() + defer s.mu.RUnlock() return s.license.GetLicenseType() == "dev" } @@ -173,14 +210,20 @@ func (s *InMemoryStore) GetNamespace() string { } func (s *InMemoryStore) GetAppStatus() appstatetypes.AppStatus { + s.mu.RLock() + defer s.mu.RUnlock() return s.appStatus } func (s *InMemoryStore) SetAppStatus(status appstatetypes.AppStatus) { + s.mu.Lock() + defer s.mu.Unlock() s.appStatus = status } func (s *InMemoryStore) SetPodImages(namespace string, podUID string, images []appstatetypes.ImageInfo) { + s.mu.Lock() + defer s.mu.Unlock() if s.podImages == nil { s.podImages = make(map[string]map[string][]appstatetypes.ImageInfo) } @@ -280,6 +323,8 @@ func canonicalNameTag(s string) string { } func (s *InMemoryStore) DeletePodImages(namespace string, podUID string) { + s.mu.Lock() + defer s.mu.Unlock() if s.podImages == nil { return } @@ -293,6 +338,8 @@ func (s *InMemoryStore) DeletePodImages(namespace string, podUID string) { } func (s *InMemoryStore) GetRunningImages() map[string][]string { + s.mu.RLock() + defer s.mu.RUnlock() // Aggregate image -> unique SHA set across all namespaces/pods resultSet := make(map[string]map[string]struct{}) for _, pods := range s.podImages { @@ -320,10 +367,14 @@ func (s *InMemoryStore) GetRunningImages() map[string][]string { } func (s *InMemoryStore) GetUpdates() []upstreamtypes.ChannelRelease { + s.mu.RLock() + defer s.mu.RUnlock() return s.updates } func (s *InMemoryStore) SetUpdates(updates []upstreamtypes.ChannelRelease) { + s.mu.Lock() + defer s.mu.Unlock() s.updates = updates } diff --git a/pkg/store/memory_store_test.go b/pkg/store/memory_store_test.go index 196ec38c..78fcf985 100644 --- a/pkg/store/memory_store_test.go +++ b/pkg/store/memory_store_test.go @@ -1,9 +1,13 @@ package store import ( + "sync" "testing" + kotsv1beta1 "github.com/replicatedhq/kotskinds/apis/kots/v1beta1" + licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" appstatetypes "github.com/replicatedhq/replicated-sdk/pkg/appstate/types" + licensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" "github.com/stretchr/testify/require" ) @@ -205,3 +209,66 @@ func TestInMemoryStore_SetPodImages_PrivateRegistryRewrite_DoesNotOvermatchSimil require.Nil(t, got["registry.example.com/team/myapp/mynginx:latest"]) // not matched require.ElementsMatch(t, []string{"sha256:ok"}, got["registry.example.com/team/myapp/nginx:latest"]) // matched } + +// TestInMemoryStore_ConcurrentReadWrite exercises the runtime-mutated +// fields under simultaneous read and write pressure. Without the RWMutex +// added in Phase 2, `go test -race` flags this as a data race. +// +// The scenario mirrors production: bootstrapBackground / heartbeat / +// request handlers all calling Set* on license, license-fields, app +// status, updates, and pod images while readers (request handlers, +// reporters) call the corresponding Get*. +func TestInMemoryStore_ConcurrentReadWrite(t *testing.T) { + s := &InMemoryStore{} + + const goroutines = 32 + const iterations = 50 + + license := licensewrapper.LicenseWrapper{V1: &kotsv1beta1.License{}} + fields := licensetypes.LicenseFields{ + "f1": licensetypes.LicenseField{Title: "F1", Value: "v"}, + } + + var wg sync.WaitGroup + wg.Add(goroutines * 5) + + for i := 0; i < goroutines; i++ { + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + s.SetLicense(license) + } + }() + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + _ = s.GetLicense() + _ = s.IsDevLicense() + } + }() + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + s.SetLicenseFields(fields) + _ = s.GetLicenseFields() + } + }() + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + s.SetAppStatus(appstatetypes.AppStatus{}) + _ = s.GetAppStatus() + } + }() + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + s.SetPodImages("ns", "pod", []appstatetypes.ImageInfo{{Name: "nginx", SHA: "sha256:x"}}) + _ = s.GetRunningImages() + s.DeletePodImages("ns", "pod") + } + }() + } + + wg.Wait() +} From 15048a0f0684df63b6724741283f03d43b251d9f Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Tue, 28 Apr 2026 17:13:16 -0500 Subject: [PATCH 02/15] bugbot --- chart/templates/replicated-role.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/chart/templates/replicated-role.yaml b/chart/templates/replicated-role.yaml index 2180870c..a8149602 100644 --- a/chart/templates/replicated-role.yaml +++ b/chart/templates/replicated-role.yaml @@ -94,6 +94,7 @@ rules: - {{ include "replicated.secretName" . }} - replicated-meta-data - replicated-support-metadata + - replicated-license-cache # the legacy replicated-sdk configmap is required when the SDK secret is not found - apiGroups: - '' From ed7b2a85c9daa05cdc68330a568f5dad3dc27764 Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Tue, 28 Apr 2026 21:42:54 -0500 Subject: [PATCH 03/15] bugbot --- pkg/apiserver/bootstrap.go | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/pkg/apiserver/bootstrap.go b/pkg/apiserver/bootstrap.go index 5a7cd946..6e5e2ce5 100644 --- a/pkg/apiserver/bootstrap.go +++ b/pkg/apiserver/bootstrap.go @@ -2,6 +2,7 @@ package apiserver import ( "context" + stderrors "errors" "log" "github.com/cenkalti/backoff/v4" @@ -203,6 +204,16 @@ func loadAndVerifyLicense(params APIServerParams, clientset kubernetes.Interface // refresh writes through to the cache and an upstream failure transparently // falls back to cached data when available. The in-memory store ends up // populated either way. +// +// Each step runs independently and accumulates its error rather than +// returning early. This guarantees that a transient failure in one step +// (e.g. SyncLatestLicense when upstream is briefly unreachable) does not +// silently disable downstream steps for the pod's lifetime in resilient +// mode — most importantly, heartbeat.Start() always gets a chance to run +// so the instance continues to check in. Strict mode still observes the +// joined error and retries the whole function via backoff; all steps are +// idempotent (heartbeat.Start clears and re-adds its cron entries; Set* +// store writes are last-write-wins). func bootstrapBackground(params APIServerParams) error { clientset, err := k8sutil.GetClientset() if err != nil { @@ -214,17 +225,20 @@ func bootstrapBackground(params APIServerParams) error { ctx = context.Background() } + var errs []error + if !util.IsAirgap() { licenseData, _, err := sdklicense.SyncLatestLicense(ctx, clientset, params.Namespace, store.GetStore().GetLicense(), params.ReplicatedAppEndpoint) if err != nil { - return errors.Wrap(err, "failed to get latest license") + errs = append(errs, errors.Wrap(err, "failed to get latest license")) + } else { + store.GetStore().SetLicense(licenseData.License) } - store.GetStore().SetLicense(licenseData.License) } isIntegrationModeEnabled, err := integration.IsEnabled(ctx, clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense()) if err != nil { - return errors.Wrap(err, "failed to check if integration mode is enabled") + errs = append(errs, errors.Wrap(err, "failed to check if integration mode is enabled")) } if !util.IsAirgap() && !isIntegrationModeEnabled { @@ -235,13 +249,14 @@ func bootstrapBackground(params APIServerParams) error { } updates, err := upstream.GetUpdates(store.GetStore(), store.GetStore().GetLicense(), currentCursor) if err != nil { - return errors.Wrap(err, "failed to get updates") + errs = append(errs, errors.Wrap(err, "failed to get updates")) + } else { + store.GetStore().SetUpdates(updates) } - store.GetStore().SetUpdates(updates) } if err := heartbeat.Start(); err != nil { - return errors.Wrap(err, "failed to start heartbeat") + errs = append(errs, errors.Wrap(err, "failed to start heartbeat")) } if !util.IsAirgap() && store.GetStore().IsDevLicense() { @@ -252,5 +267,5 @@ func bootstrapBackground(params APIServerParams) error { }() } - return nil + return stderrors.Join(errs...) } From 50ed4269ae5c411c96c64ff04780a833860b4994 Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Tue, 28 Apr 2026 22:16:44 -0500 Subject: [PATCH 04/15] bugbot --- dagger/e2e.go | 45 ++++++++++++++++++++++++++++---------- pkg/apiserver/bootstrap.go | 34 +++++++++++++++++++++------- pkg/apiserver/server.go | 34 +++++++++++++++++++--------- 3 files changed, 84 insertions(+), 29 deletions(-) diff --git a/dagger/e2e.go b/dagger/e2e.go index 268a9440..ae1c0624 100644 --- a/dagger/e2e.go +++ b/dagger/e2e.go @@ -267,24 +267,47 @@ spec: } fmt.Println(out) - // wait for the replicated-ssl-test deployment to be ready + // Wait for the replicated-ssl-test deployment to be ready. The probe + // inside the pod (curl -k https://replicated:3000/health) only passes + // once the replicated service has Ready endpoints AND TLS is serving, + // but the pod itself also has to be scheduled, image-pulled, and + // started. On slower CI clusters (notably GKE during image-pull + // contention) a 1m budget is not enough — observed failures show the + // alpine/curl pod still in ContainerCreating at the timeout. 3m gives + // the cluster room to schedule without masking real readiness failures. ctr = dag.Container().From("bitnami/kubectl:latest"). WithFile(kubeconfigPath, kubeconfigSource.File("/kubeconfig")). WithEnvVariable("KUBECONFIG", kubeconfigPath). - WithExec([]string{"kubectl", "wait", "--for=condition=available", "deployment/replicated-ssl-test", "--timeout=1m"}) + WithExec([]string{"kubectl", "wait", "--for=condition=available", "deployment/replicated-ssl-test", "--timeout=3m"}) out, err = ctr.Stdout(ctx) if err != nil { - ctr = dag.Container().From("bitnami/kubectl:latest"). - WithFile(kubeconfigPath, kubeconfigSource.File("/kubeconfig")). - WithEnvVariable("KUBECONFIG", kubeconfigPath). - WithExec([]string{"kubectl", "logs", "-p", "-l", "app.kubernetes.io/name=replicated"}) - out, err2 := ctr.Stdout(ctx) - if err2 != nil { - return fmt.Errorf("failed to get logs for replicated deployment: %w", err2) + // Best-effort diagnostics. None of these may individually fail + // the test — the original wait error is what we report. In + // particular, kubectl logs without -p must be used here: the + // previous use of "-p" returned BadRequest ("previous + // terminated container ... not found") whenever the replicated + // pod had not crashed, which masked the real wait timeout with + // a misleading "log fetch" error. + dumpCmd := func(args []string) { + c := dag.Container().From("bitnami/kubectl:latest"). + WithFile(kubeconfigPath, kubeconfigSource.File("/kubeconfig")). + WithEnvVariable("KUBECONFIG", kubeconfigPath). + With(CacheBustingExec(args)) + if stdout, derr := c.Stdout(ctx); derr == nil { + fmt.Printf("$ %s\n%s\n", strings.Join(args, " "), stdout) + } else if stderr, _ := c.Stderr(ctx); stderr != "" { + fmt.Printf("$ %s\n(diagnostic command failed: %v)\n%s\n", strings.Join(args, " "), derr, stderr) + } else { + fmt.Printf("$ %s\n(diagnostic command failed: %v)\n", strings.Join(args, " "), derr) + } } - fmt.Println(out) + dumpCmd([]string{"kubectl", "get", "pods", "-o", "wide"}) + dumpCmd([]string{"kubectl", "describe", "deployment/replicated-ssl-test"}) + dumpCmd([]string{"kubectl", "describe", "pods", "-l", "app=replicated-ssl-test"}) + dumpCmd([]string{"kubectl", "logs", "-l", "app=replicated-ssl-test", "--tail=100"}) + dumpCmd([]string{"kubectl", "logs", "-l", "app.kubernetes.io/name=replicated", "--tail=100"}) - return fmt.Errorf("failed to wait for replicated deployment to be ready: %w", err) + return fmt.Errorf("failed to wait for replicated-ssl-test deployment to be ready: %w", err) } fmt.Println(out) diff --git a/pkg/apiserver/bootstrap.go b/pkg/apiserver/bootstrap.go index 6e5e2ce5..ecd6e419 100644 --- a/pkg/apiserver/bootstrap.go +++ b/pkg/apiserver/bootstrap.go @@ -118,9 +118,13 @@ func bootstrapCritical(params APIServerParams) error { ReadOnlyMode: params.ReadOnlyMode, }) - appStateOperator := appstate.InitOperator(clientset, params.Namespace) - appStateOperator.Start() - + // Resolve informers BEFORE starting the appstate operator goroutine + // so a helm.GetRelease error on this attempt doesn't leave a + // runAppStateMonitor goroutine running that the next retry would + // duplicate. InitOperator overwrites the package-level `operator` + // pointer and Operator.Start unconditionally spawns a new goroutine + // without shutting down the previous monitor, so the start must be + // the last fallible-or-not step in the function. informers := params.StatusInformers if informers == nil && helm.IsHelmManaged() { helmRelease, err := helm.GetRelease(helm.GetReleaseName()) @@ -132,6 +136,8 @@ func bootstrapCritical(params APIServerParams) error { } } + appStateOperator := appstate.InitOperator(clientset, params.Namespace) + appStateOperator.Start() appStateOperator.ApplyAppInformers(appstatetypes.AppInformersArgs{ AppSlug: store.GetStore().GetAppSlug(), Sequence: store.GetStore().GetReleaseSequence(), @@ -210,10 +216,13 @@ func loadAndVerifyLicense(params APIServerParams, clientset kubernetes.Interface // (e.g. SyncLatestLicense when upstream is briefly unreachable) does not // silently disable downstream steps for the pod's lifetime in resilient // mode — most importantly, heartbeat.Start() always gets a chance to run -// so the instance continues to check in. Strict mode still observes the -// joined error and retries the whole function via backoff; all steps are -// idempotent (heartbeat.Start clears and re-adds its cron entries; Set* -// store writes are last-write-wins). +// so the instance continues to check in. Strict mode observes the joined +// error and retries this function via backoff; the steps here are safe +// to retry (heartbeat.Start clears and re-adds its cron entries; Set* +// store writes are last-write-wins). Critical-phase initializers that +// would leak resources on retry (notably appstate.InitOperator) live in +// bootstrapCritical and are retried by their own loop, which terminates +// on the first success. func bootstrapBackground(params APIServerParams) error { clientset, err := k8sutil.GetClientset() if err != nil { @@ -236,12 +245,21 @@ func bootstrapBackground(params APIServerParams) error { } } + // integrationCheckOK distinguishes "we know the mode" from "we don't + // know yet". The zero value of isIntegrationModeEnabled is false, so + // without this flag a transient IsEnabled failure would silently fall + // into the !isIntegrationModeEnabled branch below and call + // upstream.GetUpdates against the Vendor Portal even when the pod is + // actually running in integration (dev) mode. We instead skip the + // updates fetch entirely on this turn and let the next bootstrap + // retry (strict) or the heartbeat-driven refresh (resilient) recover. isIntegrationModeEnabled, err := integration.IsEnabled(ctx, clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense()) + integrationCheckOK := err == nil if err != nil { errs = append(errs, errors.Wrap(err, "failed to check if integration mode is enabled")) } - if !util.IsAirgap() && !isIntegrationModeEnabled { + if !util.IsAirgap() && integrationCheckOK && !isIntegrationModeEnabled { currentCursor := upstreamtypes.ReplicatedCursor{ ChannelID: store.GetStore().GetChannelID(), ChannelName: store.GetStore().GetChannelName(), diff --git a/pkg/apiserver/server.go b/pkg/apiserver/server.go index 25cfa301..902d2758 100644 --- a/pkg/apiserver/server.go +++ b/pkg/apiserver/server.go @@ -207,22 +207,36 @@ func runBootstrapWithDeps(params APIServerParams, state *startupstate.Tracker, d } func runBootstrapStrict(params APIServerParams, state *startupstate.Tracker, deps bootstrapDeps) error { - err := backoff.RetryNotify( - func() error { - if err := deps.critical(params); err != nil { - return err - } - return deps.background(params) + // Retry critical and background as separate retry loops rather than + // re-running both inside a single function. If we re-ran critical on + // a transient background failure, each retry would re-invoke + // appstate.InitOperator + Operator.Start, leaking the previously + // spawned runAppStateMonitor goroutine (Operator.Start does not call + // Shutdown on a prior instance, and InitOperator overwrites the + // package-level operator pointer). Splitting the loops ensures + // critical runs exactly once and background can retry independently. + if err := backoff.RetryNotify( + func() error { return deps.critical(params) }, + backoff.NewConstantBackOff(deps.retryInterval), + func(err error, d time.Duration) { + log.Printf("failed to bootstrap critical (requireUpstreamOnStartup=true), retrying in %s: %v", d, err) }, + ); err != nil { + state.MarkFailed() + return errors.Wrap(err, "failed to bootstrap critical") + } + + if err := backoff.RetryNotify( + func() error { return deps.background(params) }, backoff.NewConstantBackOff(deps.retryInterval), func(err error, d time.Duration) { - log.Printf("failed to bootstrap (requireUpstreamOnStartup=true), retrying in %s: %v", d, err) + log.Printf("failed to bootstrap background (requireUpstreamOnStartup=true), retrying in %s: %v", d, err) }, - ) - if err != nil { + ); err != nil { state.MarkFailed() - return errors.Wrap(err, "failed to bootstrap") + return errors.Wrap(err, "failed to bootstrap background") } + state.MarkReady() return nil } From f4cebfb2a4247e94e1cbfea44a3907ade2e7c83e Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Wed, 29 Apr 2026 09:43:36 -0500 Subject: [PATCH 05/15] bugbot --- pkg/apiserver/bootstrap.go | 25 ++++++++++++++++++++----- pkg/apiserver/server.go | 19 +++++++++++++++++-- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/pkg/apiserver/bootstrap.go b/pkg/apiserver/bootstrap.go index ecd6e419..a521b836 100644 --- a/pkg/apiserver/bootstrap.go +++ b/pkg/apiserver/bootstrap.go @@ -4,6 +4,7 @@ import ( "context" stderrors "errors" "log" + "sync" "github.com/cenkalti/backoff/v4" "github.com/pkg/errors" @@ -277,13 +278,27 @@ func bootstrapBackground(params APIServerParams) error { errs = append(errs, errors.Wrap(err, "failed to start heartbeat")) } + // In strict mode bootstrapBackground is wrapped in a retry loop, so + // any goroutine launched here would otherwise be re-spawned on every + // retry and leak. The dev-version check is a one-time observability + // signal — gate it behind a package-level sync.Once so it runs at most + // once per process regardless of how many times bootstrapBackground + // is invoked. if !util.IsAirgap() && store.GetStore().IsDevLicense() { - go func() { - if err := util.WarnOnOutdatedReplicatedVersion(); err != nil { - logger.Infof("Failed to check if running an outdated replicated version: %v", err) - } - }() + warnOnOutdatedReplicatedVersionOnce.Do(func() { + go func() { + if err := util.WarnOnOutdatedReplicatedVersion(); err != nil { + logger.Infof("Failed to check if running an outdated replicated version: %v", err) + } + }() + }) } return stderrors.Join(errs...) } + +// warnOnOutdatedReplicatedVersionOnce ensures the dev-mode upstream-version +// warning goroutine is launched at most once per process. bootstrapBackground +// can be invoked multiple times (strict-mode retry loop) and we don't want +// to spawn a fresh goroutine on every retry. +var warnOnOutdatedReplicatedVersionOnce sync.Once diff --git a/pkg/apiserver/server.go b/pkg/apiserver/server.go index 902d2758..6435abaf 100644 --- a/pkg/apiserver/server.go +++ b/pkg/apiserver/server.go @@ -276,8 +276,23 @@ func runBootstrapResilient(params APIServerParams, state *startupstate.Tracker, ) state.MarkReady() if criticalErr := <-criticalDone; criticalErr != nil { - state.MarkFailed() - return errors.Wrap(criticalErr, "failed to bootstrap critical (after readiness)") + // We have already signaled Ready, so the kubelet has + // (or is about to) added this pod to its Service + // endpoints. Flipping back to Failed → log.Fatalf would + // produce a Ready→crash→restart→Ready→crash loop where + // every cycle briefly exposes an under-initialized + // store to traffic. The resilient-mode contract is + // "serve degraded rather than flap" — log loudly and + // keep the pod alive. Background work below is skipped + // because it depends on store state that critical + // never populated; the heartbeat won't run, but + // existing endpoints will continue to answer with + // whatever defaults the in-memory store has. + logger.Errorf( + "sdk_critical_failed_after_readiness: bootstrapCritical failed after the readiness deadline; pod stays Ready to avoid a Ready→crash flap, but the in-memory store may be under-initialized and downstream handlers may return degraded data: %v", + criticalErr, + ) + return nil } // Symmetric counterpart to sdk_ready_after_critical_timeout — // gives operators a closing log line when the deferred critical From 10164409a79b386f44ad2c66f4565384e4b34dbf Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Wed, 29 Apr 2026 10:06:54 -0500 Subject: [PATCH 06/15] bugbot --- pkg/store/memory_store.go | 17 +++++++++++- pkg/store/memory_store_test.go | 51 +++++++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/pkg/store/memory_store.go b/pkg/store/memory_store.go index b59adadf..1935bb6e 100644 --- a/pkg/store/memory_store.go +++ b/pkg/store/memory_store.go @@ -133,10 +133,25 @@ func (s *InMemoryStore) SetLicense(license licensewrapper.LicenseWrapper) { logger.Debugf("InMemoryStore.SetLicense called with empty wrapper; retaining previous license") } +// GetLicenseFields returns a defensive copy of the license-fields map so +// callers may safely mutate the returned map (e.g. handlers.GetLicenseField +// does `delete(m, key)` and `m[key] = v` against this result, then writes +// it back via SetLicenseFields). Returning the internal map by reference +// would race with concurrent SetLicenseFields writers under mu, because +// the RLock only guards the pointer load — not the map's contents — once +// the function returns. A shallow copy is sufficient: LicenseField values +// are stored by value, and no caller mutates the inner struct fields. func (s *InMemoryStore) GetLicenseFields() licensetypes.LicenseFields { s.mu.RLock() defer s.mu.RUnlock() - return s.licenseFields + if s.licenseFields == nil { + return nil + } + out := make(licensetypes.LicenseFields, len(s.licenseFields)) + for k, v := range s.licenseFields { + out[k] = v + } + return out } func (s *InMemoryStore) SetLicenseFields(licenseFields licensetypes.LicenseFields) { diff --git a/pkg/store/memory_store_test.go b/pkg/store/memory_store_test.go index 78fcf985..991676f1 100644 --- a/pkg/store/memory_store_test.go +++ b/pkg/store/memory_store_test.go @@ -210,6 +210,36 @@ func TestInMemoryStore_SetPodImages_PrivateRegistryRewrite_DoesNotOvermatchSimil require.ElementsMatch(t, []string{"sha256:ok"}, got["registry.example.com/team/myapp/nginx:latest"]) // matched } +// TestInMemoryStore_GetLicenseFields_ReturnsCopy verifies that mutating +// the map returned by GetLicenseFields does not affect the store's +// internal state. This invariant is what lets handlers like +// GetLicenseField (which `delete(m, k)` and `m[k] = v` on the result) +// run concurrently with SetLicenseFields without corrupting either +// goroutine's view of the map. +func TestInMemoryStore_GetLicenseFields_ReturnsCopy(t *testing.T) { + s := &InMemoryStore{} + s.SetLicenseFields(licensetypes.LicenseFields{ + "seats": {Name: "seats", Value: 10}, + "name": {Name: "name", Value: "acme"}, + }) + + got := s.GetLicenseFields() + require.Len(t, got, 2) + + delete(got, "seats") + got["injected"] = licensetypes.LicenseField{Name: "injected", Value: "should-not-leak"} + + stillThere := s.GetLicenseFields() + require.Len(t, stillThere, 2, "store mutated by external map edit") + require.Contains(t, stillThere, "seats", "deletion on returned map leaked into store") + require.NotContains(t, stillThere, "injected", "insertion on returned map leaked into store") +} + +func TestInMemoryStore_GetLicenseFields_NilWhenEmpty(t *testing.T) { + s := &InMemoryStore{} + require.Nil(t, s.GetLicenseFields(), "empty store should return nil, not an empty map, to preserve pre-PR semantics") +} + // TestInMemoryStore_ConcurrentReadWrite exercises the runtime-mutated // fields under simultaneous read and write pressure. Without the RWMutex // added in Phase 2, `go test -race` flags this as a data race. @@ -230,7 +260,7 @@ func TestInMemoryStore_ConcurrentReadWrite(t *testing.T) { } var wg sync.WaitGroup - wg.Add(goroutines * 5) + wg.Add(goroutines * 6) for i := 0; i < goroutines; i++ { go func() { @@ -253,6 +283,25 @@ func TestInMemoryStore_ConcurrentReadWrite(t *testing.T) { _ = s.GetLicenseFields() } }() + // Mirror the handlers.GetLicenseField pattern: pull the + // fields map out, mutate it locally, then push it back. With + // the pre-fix GetLicenseFields (which returned the internal + // map by reference), this races against the SetLicenseFields + // goroutine above and `go test -race` flags it. With the + // defensive copy in place, the mutation operates on a local + // map and never touches the store's internal state. + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + m := s.GetLicenseFields() + if m == nil { + m = licensetypes.LicenseFields{} + } + m["transient"] = licensetypes.LicenseField{Name: "transient"} + delete(m, "transient") + s.SetLicenseFields(m) + } + }() go func() { defer wg.Done() for j := 0; j < iterations; j++ { From c3ce71c3ff2b922e54f69f4114a1d8af76cb9a5b Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Wed, 29 Apr 2026 10:15:17 -0500 Subject: [PATCH 07/15] bugbot --- pkg/store/memory_store.go | 25 ++++++++++++---- pkg/store/memory_store_test.go | 54 ++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 5 deletions(-) diff --git a/pkg/store/memory_store.go b/pkg/store/memory_store.go index 1935bb6e..60c55ab0 100644 --- a/pkg/store/memory_store.go +++ b/pkg/store/memory_store.go @@ -154,20 +154,35 @@ func (s *InMemoryStore) GetLicenseFields() licensetypes.LicenseFields { return out } +// SetLicenseFields stores the caller-provided map as the new authoritative +// license-fields state, replacing whatever was there before. +// +// Replace (not merge) semantics matter for two callers in particular: +// +// - handlers.GetLicenseField, which deletes a key from a copy returned by +// GetLicenseFields when the upstream Vendor Portal reports the field +// no longer exists, then calls SetLicenseFields with that copy. With +// merge semantics the deletion is silently lost. +// - handlers.GetLicenseFields, which assigns the full upstream-fetched +// map to its local variable and writes it back. With merge semantics a +// field removed upstream would persist forever in the local store. +// +// We also defensive-copy on the way in so that a caller mutating +// `licenseFields` after this call cannot race with subsequent readers. +// Combined with GetLicenseFields' returned-copy contract, this means the +// store's internal map is never aliased outside this package. func (s *InMemoryStore) SetLicenseFields(licenseFields licensetypes.LicenseFields) { s.mu.Lock() defer s.mu.Unlock() - // copy by value not reference if licenseFields == nil { s.licenseFields = nil return } - if s.licenseFields == nil { - s.licenseFields = licensetypes.LicenseFields{} - } + cp := make(licensetypes.LicenseFields, len(licenseFields)) for k, v := range licenseFields { - s.licenseFields[k] = v + cp[k] = v } + s.licenseFields = cp } func (s *InMemoryStore) IsDevLicense() bool { diff --git a/pkg/store/memory_store_test.go b/pkg/store/memory_store_test.go index 991676f1..5ad595c8 100644 --- a/pkg/store/memory_store_test.go +++ b/pkg/store/memory_store_test.go @@ -240,6 +240,60 @@ func TestInMemoryStore_GetLicenseFields_NilWhenEmpty(t *testing.T) { require.Nil(t, s.GetLicenseFields(), "empty store should return nil, not an empty map, to preserve pre-PR semantics") } +// TestInMemoryStore_SetLicenseFields_ReplacesNotMerges locks in +// replace-not-merge semantics. handlers.GetLicenseField removes a key +// from a copy of the map and writes it back to indicate the upstream +// Vendor Portal has dropped that field; handlers.GetLicenseFields +// likewise overwrites with the full upstream-fetched map. With merge +// semantics, a field that was deleted on either side would persist +// forever in the local store after the next handler write. +func TestInMemoryStore_SetLicenseFields_ReplacesNotMerges(t *testing.T) { + s := &InMemoryStore{} + s.SetLicenseFields(licensetypes.LicenseFields{ + "seats": {Name: "seats", Value: 10}, + "tier": {Name: "tier", Value: "gold"}, + }) + + // Mirror handlers.GetLicenseField's removal pattern: pull a copy, + // drop a key, write it back. + m := s.GetLicenseFields() + delete(m, "seats") + s.SetLicenseFields(m) + + got := s.GetLicenseFields() + require.NotContains(t, got, "seats", "deleted key persisted because SetLicenseFields merged instead of replacing") + require.Contains(t, got, "tier") + require.Len(t, got, 1) + + // Replacing with a totally different shape must drop the old keys. + s.SetLicenseFields(licensetypes.LicenseFields{ + "newField": {Name: "newField", Value: "x"}, + }) + got = s.GetLicenseFields() + require.Len(t, got, 1) + require.Contains(t, got, "newField") + require.NotContains(t, got, "tier") +} + +// TestInMemoryStore_SetLicenseFields_DefensiveCopyOnWrite verifies the +// store doesn't alias the caller's input map. Without the defensive +// copy, a caller mutating its own variable after writing would corrupt +// what subsequent readers see. +func TestInMemoryStore_SetLicenseFields_DefensiveCopyOnWrite(t *testing.T) { + s := &InMemoryStore{} + input := licensetypes.LicenseFields{ + "seats": {Name: "seats", Value: 10}, + } + s.SetLicenseFields(input) + + input["seats"] = licensetypes.LicenseField{Name: "seats", Value: 999} + input["leak"] = licensetypes.LicenseField{Name: "leak"} + + got := s.GetLicenseFields() + require.Equal(t, 10, got["seats"].Value, "store aliased caller's input map") + require.NotContains(t, got, "leak", "store aliased caller's input map") +} + // TestInMemoryStore_ConcurrentReadWrite exercises the runtime-mutated // fields under simultaneous read and write pressure. Without the RWMutex // added in Phase 2, `go test -race` flags this as a data race. From 13412f0bfe2b9b851bc622e813f61e558f0fa43a Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Wed, 29 Apr 2026 10:27:02 -0500 Subject: [PATCH 08/15] bugbot --- pkg/store/store_interface.go | 47 ++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/pkg/store/store_interface.go b/pkg/store/store_interface.go index c8fbfb5b..6f7db5f8 100644 --- a/pkg/store/store_interface.go +++ b/pkg/store/store_interface.go @@ -1,17 +1,32 @@ package store import ( + "sync/atomic" + + licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" appstatetypes "github.com/replicatedhq/replicated-sdk/pkg/appstate/types" licensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" upstreamtypes "github.com/replicatedhq/replicated-sdk/pkg/upstream/types" - licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" ) -var ( - store Store +// storePtr holds the package-level Store as an atomic.Pointer so that +// handler goroutines (calling GetStore on every request) can safely +// observe stores written by bootstrapCritical's InitInMemory call. +// +// Pre-Phase-1, the HTTP listener didn't accept traffic until after +// InitInMemory had returned, so a plain interface variable was safe by +// virtue of happens-before via process startup ordering. With the +// listener-up-before-bootstrap refactor, request handlers and the +// bootstrap goroutine race on this variable: an unsynchronized interface +// (itab + data pointer = two words) can be torn under concurrent +// reads/writes. +// +// We store *Store rather than Store directly because atomic.Pointer +// requires a pointer type. SetStore(nil) is preserved as "clear the +// store" (used by test cleanup) by storing a nil *Store. +var storePtr atomic.Pointer[Store] - _ Store = (*InMemoryStore)(nil) -) +var _ Store = (*InMemoryStore)(nil) type Store interface { GetReplicatedID() string @@ -45,13 +60,29 @@ type Store interface { GetReadOnlyMode() bool } +// SetStore atomically installs s as the package-level store. Passing nil +// clears the store; subsequent GetStore calls will return a fresh empty +// InMemoryStore until a non-nil value is installed. Test cleanup paths +// rely on the nil-clear behavior. func SetStore(s Store) { - store = s + if s == nil { + storePtr.Store(nil) + return + } + storePtr.Store(&s) } +// GetStore returns the currently installed store. If no store has been +// installed yet (or after SetStore(nil)), it returns a fresh empty +// InMemoryStore so handlers always have a usable, non-panicking target. +// Note that the empty-store fallback is per-call and not persisted: the +// pod is briefly observable in this state if the listener accepts a +// request before bootstrapCritical's InitInMemory completes, which is +// the same window /healthz reports as Starting → 503. func GetStore() Store { - if store == nil { + p := storePtr.Load() + if p == nil { return &InMemoryStore{} } - return store + return *p } From a54f86d03e323c4ebc77c1f5f7f42dc01bbbd14e Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Wed, 29 Apr 2026 10:46:16 -0500 Subject: [PATCH 09/15] bugbot --- pkg/apiserver/server.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pkg/apiserver/server.go b/pkg/apiserver/server.go index 6435abaf..ead10444 100644 --- a/pkg/apiserver/server.go +++ b/pkg/apiserver/server.go @@ -71,8 +71,14 @@ func Start(params APIServerParams) { srv, err := buildServer(params) if err != nil { - logger.Error(err) - return + // Pre-Phase-1 this same condition (most commonly: TLS secret + // missing or malformed) terminated the process with exit code + // 1 via log.Fatalf. Returning early instead would cause the + // cobra RunE to return nil and the binary to exit 0, masking + // a misconfigured deployment as a "successful" run. Keep the + // fatal exit so the kubelet sees a CrashLoopBackOff signal + // and operators get an obvious failure mode. + log.Fatalf("failed to build server: %v", err) } go func() { From a0fe529f2623cec122badc48260ceeb28d12b4a6 Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Wed, 29 Apr 2026 11:19:03 -0500 Subject: [PATCH 10/15] bugbot --- pkg/handlers/app.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pkg/handlers/app.go b/pkg/handlers/app.go index 79a3328e..981b62e0 100644 --- a/pkg/handlers/app.go +++ b/pkg/handlers/app.go @@ -276,12 +276,23 @@ func GetAppUpdates(w http.ResponseWriter, r *http.Request) { license := store.GetStore().GetLicense() updates := store.GetStore().GetUpdates() - licenseData, _, err := sdklicense.SyncLatestLicense(r.Context(), clientset, store.GetStore().GetNamespace(), license, store.GetStore().GetReplicatedAppEndpoint()) + // The updates response is driven by the just-refreshed license + // (channel ID/name/sequence pulled from the wrapper). When the + // underlying SyncLatestLicense fell back to the cache because + // upstream was unreachable, the channel/cursor info we use to + // compute updates is stale by definition — surface that to clients + // the same way GetLicenseInfo and GetLicenseFields do, via the + // X-Replicated-License-Cache: stale header. Without this, a client + // seeing /api/v1/app/updates can't distinguish a genuinely + // up-to-date "no updates available" from a cache-fallback "we + // can't actually tell, here's what we knew before." + licenseData, source, err := sdklicense.SyncLatestLicense(r.Context(), clientset, store.GetStore().GetNamespace(), license, store.GetStore().GetReplicatedAppEndpoint()) if err != nil { logger.Error(errors.Wrap(err, "failed to get latest license")) JSONCached(w, http.StatusOK, updates) return } + markStaleIfCached(w, source) license = licenseData.License store.GetStore().SetLicense(license) From cf4fee52bac546d26afacf82dc3ae7b4f2e22313 Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Wed, 29 Apr 2026 11:44:50 -0500 Subject: [PATCH 11/15] bugbot --- pkg/apiserver/bootstrap.go | 4 +-- pkg/apiserver/server.go | 21 ++++++++++++-- pkg/apiserver/server_test.go | 37 ++++++++++++++++++++++-- pkg/handlers/app.go | 2 +- pkg/handlers/license.go | 4 +-- pkg/heartbeat/heartbeat.go | 2 +- pkg/license/cache/cache.go | 32 ++++++++++++++------- pkg/license/cache/cache_test.go | 51 ++++++++++++++++----------------- pkg/license/sync.go | 24 ++++++++++++---- pkg/license/sync_test.go | 30 +++++++++---------- pkg/store/store_interface.go | 47 +++++++++++++++++++++++------- 11 files changed, 174 insertions(+), 80 deletions(-) diff --git a/pkg/apiserver/bootstrap.go b/pkg/apiserver/bootstrap.go index a521b836..cdf2204d 100644 --- a/pkg/apiserver/bootstrap.go +++ b/pkg/apiserver/bootstrap.go @@ -172,7 +172,7 @@ func loadAndVerifyLicense(params APIServerParams, clientset kubernetes.Interface if ctx == nil { ctx = context.Background() } - data, _, err := sdklicense.SyncLicenseByID(ctx, clientset, params.Namespace, params.IntegrationLicenseID, params.ReplicatedAppEndpoint) + data, _, err := sdklicense.SyncLicenseByID(ctx, clientset, params.Namespace, params.IntegrationLicenseID, params.ReplicatedAppEndpoint, params.ReadOnlyMode) if err != nil { return licensewrapper.LicenseWrapper{}, backoff.Permanent(errors.Wrap(err, "integration mode requires either reachable upstream or a previously-cached license; neither was available")) } @@ -238,7 +238,7 @@ func bootstrapBackground(params APIServerParams) error { var errs []error if !util.IsAirgap() { - licenseData, _, err := sdklicense.SyncLatestLicense(ctx, clientset, params.Namespace, store.GetStore().GetLicense(), params.ReplicatedAppEndpoint) + licenseData, _, err := sdklicense.SyncLatestLicense(ctx, clientset, params.Namespace, store.GetStore().GetLicense(), params.ReplicatedAppEndpoint, params.ReadOnlyMode) if err != nil { errs = append(errs, errors.Wrap(err, "failed to get latest license")) } else { diff --git a/pkg/apiserver/server.go b/pkg/apiserver/server.go index ead10444..5fb78839 100644 --- a/pkg/apiserver/server.go +++ b/pkg/apiserver/server.go @@ -307,8 +307,25 @@ func runBootstrapResilient(params APIServerParams, state *startupstate.Tracker, logger.Infof("sdk_critical_succeeded_after_readiness: bootstrapCritical completed after the readiness deadline; advancing to background phase") } - if err := deps.background(params); err != nil { - logger.Warnf("bootstrap background phase completed with errors (handlers continue serving from in-memory store): %v", err) + // Retry bootstrapBackground until it succeeds or returns a + // permanent error. Without this loop, a transient failure on the + // first attempt (e.g. heartbeat.Start hitting a momentary cron + // init issue, or upstream being briefly unreachable while the + // pod was already marked Ready via the timeout path) would leave + // the heartbeat cron job and any subsequent license refreshes + // disabled for the entire pod lifetime. Strict mode already + // retries; resilient mode must too — the only difference between + // the two modes is whether retry blocks readiness, not whether + // it happens at all. Errors are still logged at Warn level on + // each attempt and never block the pod from staying Ready. + if err := backoff.RetryNotify( + func() error { return deps.background(params) }, + backoff.NewConstantBackOff(deps.retryInterval), + func(err error, d time.Duration) { + logger.Warnf("bootstrap background phase failed, retrying in %s (handlers continue serving from in-memory store): %v", d, err) + }, + ); err != nil { + logger.Warnf("bootstrap background phase gave up with permanent error (handlers continue serving from in-memory store): %v", err) } return nil } diff --git a/pkg/apiserver/server_test.go b/pkg/apiserver/server_test.go index d116669a..853f41ee 100644 --- a/pkg/apiserver/server_test.go +++ b/pkg/apiserver/server_test.go @@ -72,18 +72,51 @@ func TestRunBootstrapResilient_CriticalPermanentError_FailsFast(t *testing.T) { require.False(t, bgRan.Load(), "background must not run after a permanent critical failure") } -func TestRunBootstrapResilient_BackgroundFailureDoesNotAffectReady(t *testing.T) { +func TestRunBootstrapResilient_BackgroundPermanentFailureDoesNotAffectReady(t *testing.T) { state := startupstate.New() deps := fastDeps( func(APIServerParams) error { return nil }, - func(APIServerParams) error { return errors.New("upstream sync failed") }, + func(APIServerParams) error { + // Permanent so the resilient retry loop gives up quickly + // — otherwise this test would block forever, since the + // loop must keep retrying transient background errors + // (heartbeat startup, upstream sync) for the pod's + // entire lifetime. + return backoff.Permanent(errors.New("upstream sync permanently failed")) + }, ) require.NoError(t, runBootstrapResilient(APIServerParams{}, state, deps), "background failures should not bubble up") require.True(t, state.IsReady(), "expected state Ready despite background failure") } +// TestRunBootstrapResilient_BackgroundRetriesUntilSuccess verifies that +// resilient mode keeps retrying bootstrapBackground after transient +// failures rather than swallowing them. Without this retry, a momentary +// hiccup on the first background attempt (e.g. heartbeat cron init, +// upstream license sync) would silently disable the heartbeat job and +// every subsequent license refresh for the entire pod lifetime. +func TestRunBootstrapResilient_BackgroundRetriesUntilSuccess(t *testing.T) { + state := startupstate.New() + var bgCalls atomic.Int32 + + deps := fastDeps( + func(APIServerParams) error { return nil }, + func(APIServerParams) error { + n := bgCalls.Add(1) + if n < 3 { + return errors.New("transient") + } + return nil + }, + ) + + require.NoError(t, runBootstrapResilient(APIServerParams{}, state, deps)) + require.True(t, state.IsReady()) + require.GreaterOrEqual(t, bgCalls.Load(), int32(3), "resilient mode must retry transient background failures") +} + func TestRunBootstrapStrict_BlocksReadyUntilFullBootstrapSucceeds(t *testing.T) { state := startupstate.New() var phase atomic.Int32 diff --git a/pkg/handlers/app.go b/pkg/handlers/app.go index 981b62e0..8fb79871 100644 --- a/pkg/handlers/app.go +++ b/pkg/handlers/app.go @@ -286,7 +286,7 @@ func GetAppUpdates(w http.ResponseWriter, r *http.Request) { // seeing /api/v1/app/updates can't distinguish a genuinely // up-to-date "no updates available" from a cache-fallback "we // can't actually tell, here's what we knew before." - licenseData, source, err := sdklicense.SyncLatestLicense(r.Context(), clientset, store.GetStore().GetNamespace(), license, store.GetStore().GetReplicatedAppEndpoint()) + licenseData, source, err := sdklicense.SyncLatestLicense(r.Context(), clientset, store.GetStore().GetNamespace(), license, store.GetStore().GetReplicatedAppEndpoint(), store.GetStore().GetReadOnlyMode()) if err != nil { logger.Error(errors.Wrap(err, "failed to get latest license")) JSONCached(w, http.StatusOK, updates) diff --git a/pkg/handlers/license.go b/pkg/handlers/license.go index c69c7c24..9e9261f8 100644 --- a/pkg/handlers/license.go +++ b/pkg/handlers/license.go @@ -63,7 +63,7 @@ func GetLicenseInfo(w http.ResponseWriter, r *http.Request) { return } - l, source, err := sdklicense.SyncLatestLicense(r.Context(), clientset, store.GetStore().GetNamespace(), wrapper, store.GetStore().GetReplicatedAppEndpoint()) + l, source, err := sdklicense.SyncLatestLicense(r.Context(), clientset, store.GetStore().GetNamespace(), wrapper, store.GetStore().GetReplicatedAppEndpoint(), store.GetStore().GetReadOnlyMode()) if err != nil { logger.Error(errors.Wrap(err, "failed to get latest license")) JSONCached(w, http.StatusOK, licenseInfoFromWrapper(wrapper)) @@ -89,7 +89,7 @@ func GetLicenseFields(w http.ResponseWriter, r *http.Request) { return } - fields, source, err := sdklicense.SyncLatestLicenseFields(r.Context(), clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense(), store.GetStore().GetReplicatedAppEndpoint()) + fields, source, err := sdklicense.SyncLatestLicenseFields(r.Context(), clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense(), store.GetStore().GetReplicatedAppEndpoint(), store.GetStore().GetReadOnlyMode()) if err != nil { logger.Error(errors.Wrap(err, "failed to get latest license fields")) JSONCached(w, http.StatusOK, licenseFields) diff --git a/pkg/heartbeat/heartbeat.go b/pkg/heartbeat/heartbeat.go index 84db2868..44512c19 100644 --- a/pkg/heartbeat/heartbeat.go +++ b/pkg/heartbeat/heartbeat.go @@ -60,7 +60,7 @@ func Start() error { } if !util.IsAirgap() { - licenseData, _, err := sdklicense.SyncLatestLicense(context.Background(), clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense(), store.GetStore().GetReplicatedAppEndpoint()) + licenseData, _, err := sdklicense.SyncLatestLicense(context.Background(), clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense(), store.GetStore().GetReplicatedAppEndpoint(), store.GetStore().GetReadOnlyMode()) if err != nil { logger.Error(errors.Wrap(err, "failed to get latest license")) } else { diff --git a/pkg/license/cache/cache.go b/pkg/license/cache/cache.go index 5b61e905..8a25090b 100644 --- a/pkg/license/cache/cache.go +++ b/pkg/license/cache/cache.go @@ -12,8 +12,11 @@ // // Writes serialize through a package-level mutex (matching pkg/meta and // pkg/supportbundle). Reads do not lock; concurrent reads of a Secret are -// safe. Read-only mode (store.GetReadOnlyMode()) suppresses writes so the -// SDK never attempts to create a Secret it doesn't have RBAC for. +// safe. Read-only mode is honored via an explicit readOnlyMode argument +// on the public Write* functions rather than by consulting a package- +// level store, because some callers (notably bootstrapCritical) write to +// the cache before the runtime store has been initialized — taking the +// flag as a parameter eliminates that ordering hazard. package cache import ( @@ -25,7 +28,6 @@ import ( "github.com/pkg/errors" licensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" "github.com/replicatedhq/replicated-sdk/pkg/logger" - "github.com/replicatedhq/replicated-sdk/pkg/store" "github.com/replicatedhq/replicated-sdk/pkg/util" corev1 "k8s.io/api/core/v1" kuberneteserrors "k8s.io/apimachinery/pkg/api/errors" @@ -108,35 +110,43 @@ func Read(ctx context.Context, clientset kubernetes.Interface, namespace string) // WriteLicense upserts the license bytes and refreshes the last-fetched // timestamp. License-fields, if previously cached, are preserved. // -// In read-only mode this is a no-op so the SDK never attempts a write it -// doesn't have RBAC for. -func WriteLicense(ctx context.Context, clientset kubernetes.Interface, namespace string, licenseBytes []byte) error { +// readOnlyMode must be passed in by the caller rather than read from the +// package-level store: bootstrapCritical's loadAndVerifyLicense calls +// this through SyncLicenseByID before store.InitInMemory has installed +// the runtime config, so an empty fallback InMemoryStore would report +// readOnlyMode=false and we'd attempt a write that an operator +// explicitly opted out of. Passing the bool from the call site (params +// at bootstrap, store getter elsewhere) eliminates that ordering trap. +func WriteLicense(ctx context.Context, clientset kubernetes.Interface, namespace string, licenseBytes []byte, readOnlyMode bool) error { if len(licenseBytes) == 0 { return errors.New("refusing to cache empty license bytes") } return write(ctx, clientset, namespace, map[string][]byte{ KeyLicense: licenseBytes, - }) + }, readOnlyMode) } // WriteLicenseFields upserts the license-fields map and refreshes the // last-fetched timestamp. The license document, if previously cached, is // preserved. -func WriteLicenseFields(ctx context.Context, clientset kubernetes.Interface, namespace string, fields licensetypes.LicenseFields) error { +// +// See WriteLicense for why readOnlyMode is an explicit parameter rather +// than read from the package-level store. +func WriteLicenseFields(ctx context.Context, clientset kubernetes.Interface, namespace string, fields licensetypes.LicenseFields, readOnlyMode bool) error { encoded, err := json.Marshal(fields) if err != nil { return errors.Wrap(err, "failed to marshal license fields") } return write(ctx, clientset, namespace, map[string][]byte{ KeyLicenseFields: encoded, - }) + }, readOnlyMode) } // write performs a partial-update upsert: existing keys not in `data` are // preserved. A `last-fetched` timestamp is always refreshed alongside any // successful update. -func write(ctx context.Context, clientset kubernetes.Interface, namespace string, data map[string][]byte) error { - if store.GetStore().GetReadOnlyMode() { +func write(ctx context.Context, clientset kubernetes.Interface, namespace string, data map[string][]byte, readOnlyMode bool) error { + if readOnlyMode { logger.Debugf("license cache: skipping write in read-only mode") return nil } diff --git a/pkg/license/cache/cache_test.go b/pkg/license/cache/cache_test.go index 3d3b062f..5e232498 100644 --- a/pkg/license/cache/cache_test.go +++ b/pkg/license/cache/cache_test.go @@ -34,11 +34,6 @@ func withDeployment(t *testing.T) kubernetes.Interface { return clientset } -func resetStore(t *testing.T) { - t.Helper() - store.SetStore(nil) -} - func TestRead_NoSecret_ReturnsNotFound(t *testing.T) { clientset := fake.NewSimpleClientset() @@ -59,13 +54,10 @@ func TestRead_SecretWithoutLicenseKey_ReturnsNotFound(t *testing.T) { } func TestWriteLicense_CreatesSecret(t *testing.T) { - store.InitInMemory(store.InitInMemoryStoreOptions{Namespace: testNamespace}) - t.Cleanup(func() { resetStore(t) }) - clientset := withDeployment(t) licenseBytes := []byte("license-yaml-document") - require.NoError(t, WriteLicense(context.Background(), clientset, testNamespace, licenseBytes)) + require.NoError(t, WriteLicense(context.Background(), clientset, testNamespace, licenseBytes, false)) secret, err := clientset.CoreV1().Secrets(testNamespace).Get(context.Background(), SecretName, metav1.GetOptions{}) require.NoError(t, err) @@ -77,14 +69,11 @@ func TestWriteLicense_CreatesSecret(t *testing.T) { } func TestWriteLicenseFields_PreservesLicense(t *testing.T) { - store.InitInMemory(store.InitInMemoryStoreOptions{Namespace: testNamespace}) - t.Cleanup(func() { resetStore(t) }) - clientset := withDeployment(t) ctx := context.Background() licenseBytes := []byte("license-yaml-document") - require.NoError(t, WriteLicense(ctx, clientset, testNamespace, licenseBytes)) + require.NoError(t, WriteLicense(ctx, clientset, testNamespace, licenseBytes, false)) fields := licensetypes.LicenseFields{ "my_field": licensetypes.LicenseField{ @@ -92,7 +81,7 @@ func TestWriteLicenseFields_PreservesLicense(t *testing.T) { Value: "v1", }, } - require.NoError(t, WriteLicenseFields(ctx, clientset, testNamespace, fields)) + require.NoError(t, WriteLicenseFields(ctx, clientset, testNamespace, fields, false)) cached, err := Read(ctx, clientset, testNamespace) require.NoError(t, err) @@ -101,19 +90,16 @@ func TestWriteLicenseFields_PreservesLicense(t *testing.T) { } func TestWrite_RefreshesLastFetched(t *testing.T) { - store.InitInMemory(store.InitInMemoryStoreOptions{Namespace: testNamespace}) - t.Cleanup(func() { resetStore(t) }) - clientset := withDeployment(t) ctx := context.Background() - require.NoError(t, WriteLicense(ctx, clientset, testNamespace, []byte("v1"))) + require.NoError(t, WriteLicense(ctx, clientset, testNamespace, []byte("v1"), false)) cached1, err := Read(ctx, clientset, testNamespace) require.NoError(t, err) time.Sleep(1100 * time.Millisecond) // RFC3339 has 1s resolution. - require.NoError(t, WriteLicense(ctx, clientset, testNamespace, []byte("v2"))) + require.NoError(t, WriteLicense(ctx, clientset, testNamespace, []byte("v2"), false)) cached2, err := Read(ctx, clientset, testNamespace) require.NoError(t, err) @@ -121,24 +107,35 @@ func TestWrite_RefreshesLastFetched(t *testing.T) { } func TestWrite_ReadOnlyMode_NoSecretCreated(t *testing.T) { - store.InitInMemory(store.InitInMemoryStoreOptions{Namespace: testNamespace, ReadOnlyMode: true}) - t.Cleanup(func() { resetStore(t) }) - clientset := fake.NewSimpleClientset() - require.NoError(t, WriteLicense(context.Background(), clientset, testNamespace, []byte("license"))) + require.NoError(t, WriteLicense(context.Background(), clientset, testNamespace, []byte("license"), true)) _, err := clientset.CoreV1().Secrets(testNamespace).Get(context.Background(), SecretName, metav1.GetOptions{}) require.True(t, kuberneteserrors.IsNotFound(err), "no secret should be created in read-only mode") } -func TestWriteLicense_RejectsEmpty(t *testing.T) { - store.InitInMemory(store.InitInMemoryStoreOptions{Namespace: testNamespace}) - t.Cleanup(func() { resetStore(t) }) +// TestWrite_ReadOnlyMode_BeforeStoreInit verifies that the read-only +// gate works during early bootstrap, before store.InitInMemory has run. +// Pre-fix this would have observed the empty fallback InMemoryStore's +// readOnlyMode=false default, attempted the write, and only failed via +// RBAC (with a confusing log line). Now the read-only flag is passed +// explicitly so no global-state observation is needed. +func TestWrite_ReadOnlyMode_BeforeStoreInit(t *testing.T) { + store.SetStore(nil) // explicitly: no store installed. + clientset := fake.NewSimpleClientset() + require.NoError(t, WriteLicense(context.Background(), clientset, testNamespace, []byte("license"), true)) + require.NoError(t, WriteLicenseFields(context.Background(), clientset, testNamespace, licensetypes.LicenseFields{}, true)) + + _, err := clientset.CoreV1().Secrets(testNamespace).Get(context.Background(), SecretName, metav1.GetOptions{}) + require.True(t, kuberneteserrors.IsNotFound(err), "no secret should be created when caller declares read-only mode, regardless of store init state") +} + +func TestWriteLicense_RejectsEmpty(t *testing.T) { clientset := withDeployment(t) - require.Error(t, WriteLicense(context.Background(), clientset, testNamespace, nil), + require.Error(t, WriteLicense(context.Background(), clientset, testNamespace, nil, false), "writing empty license bytes must be rejected to avoid corrupting the cache") } diff --git a/pkg/license/sync.go b/pkg/license/sync.go index 38c40c11..c97d3a81 100644 --- a/pkg/license/sync.go +++ b/pkg/license/sync.go @@ -45,19 +45,23 @@ func warnStale(upstreamErr error) { // SourceCache. If the cache is missing or unusable, the original upstream // error is returned wrapped (so callers can inspect it). // -// This wrapper is the entry point for the integration-mode boot path. -// Production-mode boots use the chart-embedded license bytes and never -// reach here. +// readOnlyMode is propagated explicitly because this is the integration- +// mode boot entry point — it runs before store.InitInMemory installs the +// runtime config, so reading the read-only flag from the package-level +// store would observe the empty fallback's zero-value (false) instead of +// the operator's actual setting. Production-mode boots use the chart- +// embedded license bytes and never reach here. func SyncLicenseByID( ctx context.Context, clientset kubernetes.Interface, namespace string, licenseID string, endpoint string, + readOnlyMode bool, ) (*LicenseData, LicenseSource, error) { data, upstreamErr := GetLicenseByID(licenseID, endpoint) if upstreamErr == nil { - if writeErr := cache.WriteLicense(ctx, clientset, namespace, data.LicenseBytes); writeErr != nil { + if writeErr := cache.WriteLicense(ctx, clientset, namespace, data.LicenseBytes, readOnlyMode); writeErr != nil { logger.Infof("license cache: write-through after GetLicenseByID failed, continuing: %v", writeErr) } return data, SourceUpstream, nil @@ -87,16 +91,20 @@ func SyncLicenseByID( // The returned LicenseData.LicenseBytes is what gets persisted on the // success path. Cache hits return the bytes that were written by the most // recent successful upstream call. +// +// readOnlyMode is propagated explicitly so the cache layer never has to +// consult a package-level store; see SyncLicenseByID for the rationale. func SyncLatestLicense( ctx context.Context, clientset kubernetes.Interface, namespace string, wrapper licensewrapper.LicenseWrapper, endpoint string, + readOnlyMode bool, ) (*LicenseData, LicenseSource, error) { data, upstreamErr := GetLatestLicense(wrapper, endpoint) if upstreamErr == nil { - if writeErr := cache.WriteLicense(ctx, clientset, namespace, data.LicenseBytes); writeErr != nil { + if writeErr := cache.WriteLicense(ctx, clientset, namespace, data.LicenseBytes, readOnlyMode); writeErr != nil { logger.Infof("license cache: write-through after GetLatestLicense failed, continuing: %v", writeErr) } return data, SourceUpstream, nil @@ -129,16 +137,20 @@ func SyncLatestLicense( // failure, and the in-memory store is repopulated from the cache during // bootstrap. Wrapping it would duplicate that fallback path without // adding value. +// +// readOnlyMode is propagated explicitly so the cache layer never has to +// consult a package-level store; see SyncLicenseByID for the rationale. func SyncLatestLicenseFields( ctx context.Context, clientset kubernetes.Interface, namespace string, wrapper licensewrapper.LicenseWrapper, endpoint string, + readOnlyMode bool, ) (types.LicenseFields, LicenseSource, error) { fields, upstreamErr := GetLatestLicenseFields(wrapper, endpoint) if upstreamErr == nil { - if writeErr := cache.WriteLicenseFields(ctx, clientset, namespace, fields); writeErr != nil { + if writeErr := cache.WriteLicenseFields(ctx, clientset, namespace, fields, readOnlyMode); writeErr != nil { logger.Infof("license cache: write-through after GetLatestLicenseFields failed, continuing: %v", writeErr) } return fields, SourceUpstream, nil diff --git a/pkg/license/sync_test.go b/pkg/license/sync_test.go index 0a7ce2cc..3c65deef 100644 --- a/pkg/license/sync_test.go +++ b/pkg/license/sync_test.go @@ -101,7 +101,7 @@ func TestSyncLicenseByID_UpstreamSuccess_WritesThroughToCache(t *testing.T) { f := newTestFixture(t) srv := licenseServer(t) - data, source, err := SyncLicenseByID(context.Background(), f.clientset, syncTestNamespace, "license-id", srv.URL) + data, source, err := SyncLicenseByID(context.Background(), f.clientset, syncTestNamespace, "license-id", srv.URL, false) require.NoError(t, err) require.Equal(t, SourceUpstream, source) require.NotEmpty(t, data.LicenseBytes) @@ -115,9 +115,9 @@ func TestSyncLicenseByID_UpstreamFailure_CacheHit_ReturnsCached(t *testing.T) { f := newTestFixture(t) srv := brokenServer(t) - require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML))) + require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML), false)) - data, source, err := SyncLicenseByID(context.Background(), f.clientset, syncTestNamespace, "license-id", srv.URL) + data, source, err := SyncLicenseByID(context.Background(), f.clientset, syncTestNamespace, "license-id", srv.URL, false) require.NoError(t, err) require.Equal(t, SourceCache, source) require.Equal(t, []byte(validLicenseYAML), data.LicenseBytes) @@ -127,7 +127,7 @@ func TestSyncLicenseByID_UpstreamFailure_CacheMiss_ReturnsWrappedError(t *testin f := newTestFixture(t) srv := brokenServer(t) - _, source, err := SyncLicenseByID(context.Background(), f.clientset, syncTestNamespace, "license-id", srv.URL) + _, source, err := SyncLicenseByID(context.Background(), f.clientset, syncTestNamespace, "license-id", srv.URL, false) require.Error(t, err) require.Equal(t, SourceUpstream, source, "no fallback available, source must reflect the failed attempt") require.Contains(t, err.Error(), "license cache miss") @@ -142,7 +142,7 @@ func TestSyncLatestLicense_UpstreamSuccess_WritesThroughToCache(t *testing.T) { wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) require.NoError(t, err) - data, source, err := SyncLatestLicense(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL) + data, source, err := SyncLatestLicense(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL, false) require.NoError(t, err) require.Equal(t, SourceUpstream, source) @@ -158,9 +158,9 @@ func TestSyncLatestLicense_UpstreamFailure_CacheHit_ReturnsCached(t *testing.T) wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) require.NoError(t, err) - require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML))) + require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML), false)) - data, source, err := SyncLatestLicense(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL) + data, source, err := SyncLatestLicense(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL, false) require.NoError(t, err) require.Equal(t, SourceCache, source) require.Equal(t, []byte(validLicenseYAML), data.LicenseBytes) @@ -173,7 +173,7 @@ func TestSyncLatestLicense_UpstreamFailure_CacheMiss_ReturnsWrappedError(t *test wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) require.NoError(t, err) - _, source, err := SyncLatestLicense(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL) + _, source, err := SyncLatestLicense(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL, false) require.Error(t, err) require.Equal(t, SourceUpstream, source) require.Contains(t, err.Error(), "license cache miss") @@ -194,9 +194,9 @@ func TestSyncLatestLicenseFields_UpstreamSuccess_WritesThroughToCache(t *testing // In production a license is always cached before fields are // (bootstrapCritical → bootstrapBackground ordering), so seed that // state before exercising the fields write-through. - require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML))) + require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML), false)) - got, source, err := SyncLatestLicenseFields(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL) + got, source, err := SyncLatestLicenseFields(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL, false) require.NoError(t, err) require.Equal(t, SourceUpstream, source) require.Equal(t, "v1", got["my_field"].Value) @@ -217,10 +217,10 @@ func TestSyncLatestLicenseFields_UpstreamFailure_CacheHit_ReturnsCached(t *testi "my_field": types.LicenseField{Title: "Cached", Value: "stale"}, } // Seed the cache with both a license (so Read returns success) and fields. - require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML))) - require.NoError(t, cache.WriteLicenseFields(context.Background(), f.clientset, syncTestNamespace, cached)) + require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML), false)) + require.NoError(t, cache.WriteLicenseFields(context.Background(), f.clientset, syncTestNamespace, cached, false)) - got, source, err := SyncLatestLicenseFields(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL) + got, source, err := SyncLatestLicenseFields(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL, false) require.NoError(t, err) require.Equal(t, SourceCache, source) require.Equal(t, "stale", got["my_field"].Value) @@ -234,9 +234,9 @@ func TestSyncLatestLicenseFields_UpstreamFailure_NoCachedFields_ReturnsWrappedEr require.NoError(t, err) // License cached but fields never written — fields fallback must miss. - require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML))) + require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML), false)) - _, source, err := SyncLatestLicenseFields(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL) + _, source, err := SyncLatestLicenseFields(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL, false) require.Error(t, err) require.Equal(t, SourceUpstream, source) require.Contains(t, err.Error(), "no cached license fields available") diff --git a/pkg/store/store_interface.go b/pkg/store/store_interface.go index 6f7db5f8..b1b8c92c 100644 --- a/pkg/store/store_interface.go +++ b/pkg/store/store_interface.go @@ -1,6 +1,7 @@ package store import ( + "sync" "sync/atomic" licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" @@ -26,6 +27,22 @@ import ( // store" (used by test cleanup) by storing a nil *Store. var storePtr atomic.Pointer[Store] +// fallbackStore is the single empty InMemoryStore that GetStore returns +// when no real store has been installed yet. Lazily created on first +// access so unused code paths (especially tests that never call any +// store function) don't pay the allocation. We deliberately reuse the +// same instance across all "store not yet installed" GetStore calls so +// that a sequence like `store.GetStore().SetX(v); store.GetStore().GetX()` +// observes its own write rather than seeing a fresh zero-value store on +// the second call. This window is observable in the resilient-mode +// timeout path, where the pod is marked Ready (and accepts traffic) +// before bootstrapCritical has had a chance to install the real store +// via InitInMemory. +var ( + fallbackStoreOnce sync.Once + fallbackStore *InMemoryStore +) + var _ Store = (*InMemoryStore)(nil) type Store interface { @@ -72,17 +89,25 @@ func SetStore(s Store) { storePtr.Store(&s) } -// GetStore returns the currently installed store. If no store has been -// installed yet (or after SetStore(nil)), it returns a fresh empty -// InMemoryStore so handlers always have a usable, non-panicking target. -// Note that the empty-store fallback is per-call and not persisted: the -// pod is briefly observable in this state if the listener accepts a -// request before bootstrapCritical's InitInMemory completes, which is -// the same window /healthz reports as Starting → 503. +// GetStore returns the currently installed store, or a single shared +// empty fallback InMemoryStore when none has been installed yet. The +// fallback is process-wide (allocated lazily on first need) rather than +// per-call so that mutations performed against it persist across +// consecutive GetStore reads — handlers that do, e.g., +// `s := GetStore(); s.SetX(v); s2 := GetStore(); _ = s2.GetX()` must +// observe their own writes even before InitInMemory installs the real +// store. Once SetStore is called with a real store, GetStore returns +// that real store and the fallback becomes orphaned; any writes that +// landed on the fallback during the bootstrap window are lost. That is +// an accepted trade-off: the alternative (blocking handlers behind +// bootstrap) is exactly the CrashLoopBackOff regression Phase 1 set +// out to fix. func GetStore() Store { - p := storePtr.Load() - if p == nil { - return &InMemoryStore{} + if p := storePtr.Load(); p != nil { + return *p } - return *p + fallbackStoreOnce.Do(func() { + fallbackStore = &InMemoryStore{} + }) + return fallbackStore } From 82ace5d48240daa116d86a697d744ca6d4b8b37f Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Wed, 29 Apr 2026 16:12:57 -0500 Subject: [PATCH 12/15] addressing comments --- chart/templates/replicated-deployment.yaml | 4 +- chart/templates/replicated-role.yaml | 3 - chart/values.yaml | 17 +- cmd/replicated/api.go | 51 +++-- dagger/e2e.go | 27 +++ pkg/apiserver/bootstrap.go | 126 ++++++----- pkg/apiserver/bootstrap_test.go | 71 +++++- pkg/apiserver/server.go | 95 ++------ pkg/apiserver/server_test.go | 107 ++++----- pkg/handlers/app.go | 13 +- pkg/handlers/license.go | 43 +--- pkg/heartbeat/heartbeat.go | 3 +- pkg/license/cache/cache.go | 211 ------------------ pkg/license/cache/cache_test.go | 184 ---------------- pkg/license/license.go | 9 +- pkg/license/sync.go | 169 -------------- pkg/license/sync_test.go | 243 --------------------- pkg/startupstate/startupstate.go | 5 +- pkg/store/memory_store.go | 8 +- pkg/store/store_interface.go | 6 +- pkg/store/store_interface_test.go | 93 ++++++++ pkg/util/env.go | 45 +++- pkg/util/env_test.go | 79 +++++++ 23 files changed, 495 insertions(+), 1117 deletions(-) delete mode 100644 pkg/license/cache/cache.go delete mode 100644 pkg/license/cache/cache_test.go delete mode 100644 pkg/license/sync.go delete mode 100644 pkg/license/sync_test.go create mode 100644 pkg/store/store_interface_test.go create mode 100644 pkg/util/env_test.go diff --git a/chart/templates/replicated-deployment.yaml b/chart/templates/replicated-deployment.yaml index 0c74ab5c..69d67f76 100644 --- a/chart/templates/replicated-deployment.yaml +++ b/chart/templates/replicated-deployment.yaml @@ -193,8 +193,8 @@ spec: - name: REPLICATED_HA_ENABLED value: "true" {{- end }} - {{- if .Values.requireUpstreamOnStartup }} - - name: REPLICATED_REQUIRE_UPSTREAM_ON_STARTUP + {{- if .Values.devOffline }} + - name: REPLICATED_DEV_OFFLINE value: "true" {{- end }} {{- if (.Values.integration).licenseID }} diff --git a/chart/templates/replicated-role.yaml b/chart/templates/replicated-role.yaml index a8149602..30ec299b 100644 --- a/chart/templates/replicated-role.yaml +++ b/chart/templates/replicated-role.yaml @@ -36,7 +36,6 @@ rules: - 'update' # The names below must match the SDK's runtime expectations. If you # rename one, update the corresponding constant on the Go side: - # replicated-license-cache → pkg/license/cache/cache.SecretName # replicated-meta-data → pkg/meta/meta.ReplicatedMetadataSecretName # replicated-support-metadata → pkg/supportbundle.SupportBundleMetadataSecretName resourceNames: @@ -45,7 +44,6 @@ rules: - replicated-custom-app-metrics-report - replicated-meta-data - replicated-support-metadata - - replicated-license-cache {{ end }} {{ if .Values.tlsCertSecretName }} - apiGroups: @@ -94,7 +92,6 @@ rules: - {{ include "replicated.secretName" . }} - replicated-meta-data - replicated-support-metadata - - replicated-license-cache # the legacy replicated-sdk configmap is required when the SDK secret is not found - apiGroups: - '' diff --git a/chart/values.yaml b/chart/values.yaml index 10074afb..fd2f2afa 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -340,17 +340,12 @@ reportAllImages: false # See docs for feature availability in this mode. readOnlyMode: false -# When true, the pod will not be marked Ready until the SDK has successfully -# completed the full bootstrap, including a successful upstream license -# refresh and update fetch against replicated.app. This is the pre-Phase-1 -# behavior and is provided as an opt-out for operators who require it. -# -# Default (false): the pod becomes Ready as soon as the local critical -# bootstrap (license parse, signature verify, expiry check, store init) -# succeeds — or after a 30s safety timeout, whichever comes first. Upstream -# license refresh and update fetch run in the background and never block -# readiness. Most users want this. -requireUpstreamOnStartup: false +# When true (and license type is "dev"), the SDK treats the install as +# airgap — it never calls replicated.app and serves the license from the +# chart-embedded bytes. Intended for local development with a dev license +# when working offline (VPN, flaky network, air-gapped lab). Setting this +# with a non-dev license causes the SDK to refuse to start. +devOffline: false # Proxy configuration for outbound connections # Configure HTTPS proxy settings for the Replicated SDK diff --git a/cmd/replicated/api.go b/cmd/replicated/api.go index 843785b5..3eadf4c3 100644 --- a/cmd/replicated/api.go +++ b/cmd/replicated/api.go @@ -31,11 +31,10 @@ func APICmd() *cobra.Command { namespace := v.GetString("namespace") configFilePath := v.GetString("config-file") integrationLicenseID := v.GetString("integration-license-id") - // require-upstream-on-startup auto-binds to - // REPLICATED_REQUIRE_UPSTREAM_ON_STARTUP via root.go's - // SetEnvPrefix("REPLICATED") + AutomaticEnv + + // dev-offline auto-binds to REPLICATED_DEV_OFFLINE via + // root.go's SetEnvPrefix("REPLICATED") + AutomaticEnv + // SetEnvKeyReplacer("-", "_"); no explicit BindEnv needed. - requireUpstreamOnStartup := v.GetBool("require-upstream-on-startup") + devOffline := v.GetBool("dev-offline") if configFilePath == "" && integrationLicenseID == "" { return errors.New("either config file or integration license id must be specified") @@ -62,28 +61,28 @@ func APICmd() *cobra.Command { } params := apiserver.APIServerParams{ - Context: cmd.Context(), - LicenseBytes: []byte(replicatedConfig.License), - IntegrationLicenseID: integrationLicenseID, - LicenseFields: replicatedConfig.LicenseFields, - AppName: replicatedConfig.AppName, - ChannelID: replicatedConfig.ChannelID, - ChannelName: replicatedConfig.ChannelName, - ChannelSequence: replicatedConfig.ChannelSequence, - ReleaseSequence: replicatedConfig.ReleaseSequence, - ReleaseCreatedAt: replicatedConfig.ReleaseCreatedAt, - ReleaseNotes: replicatedConfig.ReleaseNotes, - VersionLabel: replicatedConfig.VersionLabel, - ReplicatedAppEndpoint: replicatedConfig.ReplicatedAppEndpoint, - ReleaseImages: replicatedConfig.ReleaseImages, - StatusInformers: replicatedConfig.StatusInformers, - ReplicatedID: replicatedConfig.ReplicatedID, - AppID: replicatedConfig.AppID, - TlsCertSecretName: replicatedConfig.TlsCertSecretName, - ReportAllImages: replicatedConfig.ReportAllImages, - ReadOnlyMode: replicatedConfig.ReadOnlyMode, - Namespace: namespace, - RequireUpstreamOnStartup: requireUpstreamOnStartup, + Context: cmd.Context(), + LicenseBytes: []byte(replicatedConfig.License), + IntegrationLicenseID: integrationLicenseID, + LicenseFields: replicatedConfig.LicenseFields, + AppName: replicatedConfig.AppName, + ChannelID: replicatedConfig.ChannelID, + ChannelName: replicatedConfig.ChannelName, + ChannelSequence: replicatedConfig.ChannelSequence, + ReleaseSequence: replicatedConfig.ReleaseSequence, + ReleaseCreatedAt: replicatedConfig.ReleaseCreatedAt, + ReleaseNotes: replicatedConfig.ReleaseNotes, + VersionLabel: replicatedConfig.VersionLabel, + ReplicatedAppEndpoint: replicatedConfig.ReplicatedAppEndpoint, + ReleaseImages: replicatedConfig.ReleaseImages, + StatusInformers: replicatedConfig.StatusInformers, + ReplicatedID: replicatedConfig.ReplicatedID, + AppID: replicatedConfig.AppID, + TlsCertSecretName: replicatedConfig.TlsCertSecretName, + ReportAllImages: replicatedConfig.ReportAllImages, + ReadOnlyMode: replicatedConfig.ReadOnlyMode, + Namespace: namespace, + DevOffline: devOffline, } apiserver.Start(params) diff --git a/dagger/e2e.go b/dagger/e2e.go index ae1c0624..b6efa7bd 100644 --- a/dagger/e2e.go +++ b/dagger/e2e.go @@ -1539,3 +1539,30 @@ func upgradeChartAndRestart( return nil } + +// Future: dagger e2e scenarios for the devOffline opt-in (Phase 3.5 of +// proposals/sdk_local_dev_mode.md). Implemented in a separate PR; the +// unit tests in pkg/apiserver/bootstrap_test.go (TestApplyDevOfflineGuard_*) +// already cover both validation paths at the logic level. Scenario C +// (default behavior unchanged, devOffline=false) is already covered by +// the existing e2e flow above. Scenarios A and B need dagger plumbing +// that does not exist yet — a dev-license fixture alongside the e2e +// secrets, and a NetworkPolicy denying egress to replicated.app from +// the SDK pod. +// +// Scenario A (devOffline=true + dev license + blocked upstream): +// +// helm install ... \ +// --set-file license=$DEV_LICENSE \ +// --set devOffline=true +// kubectl apply -f testdata/networkpolicy-deny-replicated-app.yaml +// # Expected: pod becomes Ready, /api/v1/license/info serves from +// # chart bytes, no upstream dial attempted. +// +// Scenario B (devOffline=true + non-dev license): +// +// helm install ... \ +// --set-file license=$PROD_LICENSE \ +// --set devOffline=true +// # Expected: pod CrashLoopBackOff with "devOffline=true requires a +// # dev license" in the logs. diff --git a/pkg/apiserver/bootstrap.go b/pkg/apiserver/bootstrap.go index cdf2204d..a8c19ece 100644 --- a/pkg/apiserver/bootstrap.go +++ b/pkg/apiserver/bootstrap.go @@ -24,7 +24,6 @@ import ( "github.com/replicatedhq/replicated-sdk/pkg/upstream" upstreamtypes "github.com/replicatedhq/replicated-sdk/pkg/upstream/types" "github.com/replicatedhq/replicated-sdk/pkg/util" - "k8s.io/client-go/kubernetes" ) // bootstrapCritical performs the bootstrap work that must succeed before the @@ -33,17 +32,17 @@ import ( // initializes the local appstate operator so app/* endpoints can begin // reporting status as soon as the pod is Ready. // -// In production mode (LicenseBytes provided by the chart), this path is -// fully local. In integration mode it consults the upstream Vendor Portal -// with cache fallback: a previously-cached license satisfies critical even -// when replicated.app is unreachable, so subsequent boots after one -// successful sync survive offline conditions. First-boot offline (no -// cache, unreachable upstream) returns backoff.Permanent — the SDK refuses -// to start rather than silently run with empty data. +// In production mode (LicenseBytes provided by the chart) this path is +// fully local — no upstream call is made. In integration mode the +// license document is fetched from the Vendor Portal by ID, so an +// unreachable upstream on first boot causes critical to retry. The +// devOffline opt-in (production mode + a dev license) flips the runtime +// airgap flag here so subsequent !util.IsAirgap() gates skip their +// upstream calls; it does not change the license source. // // Permanent failures (license parse error, signature invalid, expired, -// first-boot-offline-with-no-cache) are returned as backoff.Permanent so -// the retry loop above gives up immediately. Transient failures bubble up +// devOffline + non-dev license) are returned as backoff.Permanent so the +// retry loop above gives up immediately. Transient failures bubble up // unwrapped and the caller will retry. func bootstrapCritical(params APIServerParams) error { clientset, err := k8sutil.GetClientset() @@ -77,11 +76,15 @@ func bootstrapCritical(params APIServerParams) error { } } - verifiedWrapper, err := loadAndVerifyLicense(params, clientset) + verifiedWrapper, err := loadAndVerifyLicense(params) if err != nil { return err } + // Expiry is unconditional and has no side effects, so check it + // before applyDevOfflineGuard — which flips the process-global + // airgap override on success. Reordering keeps that side effect + // from landing on a license we are about to reject anyway. expired, err := sdklicense.LicenseIsExpired(verifiedWrapper) if err != nil { return errors.Wrap(err, "failed to check if license is expired") @@ -90,6 +93,10 @@ func bootstrapCritical(params APIServerParams) error { return backoff.Permanent(errors.New("License is expired")) } + if err := applyDevOfflineGuard(verifiedWrapper, params.DevOffline); err != nil { + return err + } + channelID := params.ChannelID if channelID == "" { channelID = verifiedWrapper.GetChannelID() @@ -150,14 +157,8 @@ func bootstrapCritical(params APIServerParams) error { // loadAndVerifyLicense loads the license from chart-embedded bytes // (production) or from the upstream Vendor Portal by integration-license -// ID with cache fallback, then signature-verifies it. -// -// Integration mode goes through SyncLicenseByID: a successful upstream -// call writes through to the cache; an upstream failure with a populated -// cache transparently uses the cached license; an upstream failure with -// no cache returns a permanent error so the pod refuses to start with no -// usable license. -func loadAndVerifyLicense(params APIServerParams, clientset kubernetes.Interface) (licensewrapper.LicenseWrapper, error) { +// ID, then signature-verifies it. +func loadAndVerifyLicense(params APIServerParams) (licensewrapper.LicenseWrapper, error) { var unverifiedWrapper licensewrapper.LicenseWrapper switch { @@ -168,18 +169,14 @@ func loadAndVerifyLicense(params APIServerParams, clientset kubernetes.Interface } unverifiedWrapper = wrapper case params.IntegrationLicenseID != "": - ctx := params.Context - if ctx == nil { - ctx = context.Background() - } - data, _, err := sdklicense.SyncLicenseByID(ctx, clientset, params.Namespace, params.IntegrationLicenseID, params.ReplicatedAppEndpoint, params.ReadOnlyMode) + wrapper, err := sdklicense.GetLicenseByID(params.IntegrationLicenseID, params.ReplicatedAppEndpoint) if err != nil { - return licensewrapper.LicenseWrapper{}, backoff.Permanent(errors.Wrap(err, "integration mode requires either reachable upstream or a previously-cached license; neither was available")) + return licensewrapper.LicenseWrapper{}, errors.Wrap(err, "failed to get license by id") } - if data.License.GetLicenseType() != "dev" { + if wrapper.GetLicenseType() != "dev" { return licensewrapper.LicenseWrapper{}, backoff.Permanent(errors.New("integration license must be a dev license")) } - unverifiedWrapper = data.License + unverifiedWrapper = wrapper default: return licensewrapper.LicenseWrapper{}, backoff.Permanent(errors.New("no license source configured: either LicenseBytes or IntegrationLicenseID is required")) } @@ -198,30 +195,43 @@ func loadAndVerifyLicense(params APIServerParams, clientset kubernetes.Interface return unverifiedWrapper, nil } -// bootstrapBackground performs upstream-dependent bootstrap work whose -// failure must not prevent the SDK from being marked Ready in the default -// configuration. The caller decides how to interpret returned errors: +// applyDevOfflineGuard enforces the dev-only contract of the devOffline +// opt-in: when the operator sets replicated.devOffline=true, the loaded +// license must be a dev license, and on success the runtime airgap flag +// is flipped so all !util.IsAirgap() gates skip their upstream calls. // -// - In default (resilient) mode, errors are logged and ignored; handlers -// continue serving from whatever bootstrapCritical placed in the store. -// - With requireUpstreamOnStartup=true, the caller treats any error here -// as fatal and the pod will not be marked Ready. -// -// Upstream calls go through the cache-aware Sync* wrappers so a successful -// refresh writes through to the cache and an upstream failure transparently -// falls back to cached data when available. The in-memory store ends up -// populated either way. +// A non-dev license with devOffline=true returns backoff.Permanent so +// the bootstrap retry loop gives up immediately — the install is +// misconfigured and no amount of retrying will fix it. Production +// licenses cannot silently end up running offline this way; operators +// who want true airgap must use the existing isAirgap chart value with +// its accompanying ops requirements. +func applyDevOfflineGuard(wrapper licensewrapper.LicenseWrapper, devOffline bool) error { + if !devOffline { + return nil + } + if wrapper.GetLicenseType() != "dev" { + return backoff.Permanent(errors.New("devOffline=true requires a dev license")) + } + util.SetAirgapOverride(true) + log.Println("devOffline enabled: SDK will not call replicated.app for this install") + return nil +} + +// bootstrapBackground performs upstream-dependent bootstrap work whose +// failure must not prevent the SDK from being marked Ready. Errors are +// logged and the call is retried by the caller; handlers continue serving +// from whatever bootstrapCritical placed in the store. // // Each step runs independently and accumulates its error rather than // returning early. This guarantees that a transient failure in one step -// (e.g. SyncLatestLicense when upstream is briefly unreachable) does not -// silently disable downstream steps for the pod's lifetime in resilient -// mode — most importantly, heartbeat.Start() always gets a chance to run -// so the instance continues to check in. Strict mode observes the joined -// error and retries this function via backoff; the steps here are safe -// to retry (heartbeat.Start clears and re-adds its cron entries; Set* -// store writes are last-write-wins). Critical-phase initializers that -// would leak resources on retry (notably appstate.InitOperator) live in +// (e.g. GetLatestLicense when upstream is briefly unreachable) does not +// silently disable downstream steps for the pod's lifetime — most +// importantly, heartbeat.Start() always gets a chance to run so the +// instance continues to check in. The steps here are safe to retry +// (heartbeat.Start clears and re-adds its cron entries; Set* store writes +// are last-write-wins). Critical-phase initializers that would leak +// resources on retry (notably appstate.InitOperator) live in // bootstrapCritical and are retried by their own loop, which terminates // on the first success. func bootstrapBackground(params APIServerParams) error { @@ -238,7 +248,7 @@ func bootstrapBackground(params APIServerParams) error { var errs []error if !util.IsAirgap() { - licenseData, _, err := sdklicense.SyncLatestLicense(ctx, clientset, params.Namespace, store.GetStore().GetLicense(), params.ReplicatedAppEndpoint, params.ReadOnlyMode) + licenseData, err := sdklicense.GetLatestLicense(store.GetStore().GetLicense(), params.ReplicatedAppEndpoint) if err != nil { errs = append(errs, errors.Wrap(err, "failed to get latest license")) } else { @@ -252,8 +262,8 @@ func bootstrapBackground(params APIServerParams) error { // into the !isIntegrationModeEnabled branch below and call // upstream.GetUpdates against the Vendor Portal even when the pod is // actually running in integration (dev) mode. We instead skip the - // updates fetch entirely on this turn and let the next bootstrap - // retry (strict) or the heartbeat-driven refresh (resilient) recover. + // updates fetch entirely on this turn and let the heartbeat-driven + // refresh recover. isIntegrationModeEnabled, err := integration.IsEnabled(ctx, clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense()) integrationCheckOK := err == nil if err != nil { @@ -278,12 +288,12 @@ func bootstrapBackground(params APIServerParams) error { errs = append(errs, errors.Wrap(err, "failed to start heartbeat")) } - // In strict mode bootstrapBackground is wrapped in a retry loop, so - // any goroutine launched here would otherwise be re-spawned on every - // retry and leak. The dev-version check is a one-time observability - // signal — gate it behind a package-level sync.Once so it runs at most - // once per process regardless of how many times bootstrapBackground - // is invoked. + // bootstrapBackground is wrapped in a retry loop, so any goroutine + // launched here would otherwise be re-spawned on every retry and + // leak. The dev-version check is a one-time observability signal — + // gate it behind a package-level sync.Once so it runs at most once + // per process regardless of how many times bootstrapBackground is + // invoked. if !util.IsAirgap() && store.GetStore().IsDevLicense() { warnOnOutdatedReplicatedVersionOnce.Do(func() { go func() { @@ -299,6 +309,6 @@ func bootstrapBackground(params APIServerParams) error { // warnOnOutdatedReplicatedVersionOnce ensures the dev-mode upstream-version // warning goroutine is launched at most once per process. bootstrapBackground -// can be invoked multiple times (strict-mode retry loop) and we don't want -// to spawn a fresh goroutine on every retry. +// can be invoked multiple times by the retry loop and we don't want to +// spawn a fresh goroutine on every retry. var warnOnOutdatedReplicatedVersionOnce sync.Once diff --git a/pkg/apiserver/bootstrap_test.go b/pkg/apiserver/bootstrap_test.go index 93151f89..3d700b6e 100644 --- a/pkg/apiserver/bootstrap_test.go +++ b/pkg/apiserver/bootstrap_test.go @@ -6,8 +6,10 @@ import ( "testing" "github.com/cenkalti/backoff/v4" + kotsv1beta1 "github.com/replicatedhq/kotskinds/apis/kots/v1beta1" + licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" + "github.com/replicatedhq/replicated-sdk/pkg/util" "github.com/stretchr/testify/require" - "k8s.io/client-go/kubernetes/fake" ) // blockingTransport is a RoundTripper that fails any HTTP request and @@ -41,11 +43,10 @@ func withBlockingTransport(t *testing.T) *blockingTransport { func TestLoadAndVerifyLicense_InvalidLicenseBytes_ReturnsPermanent(t *testing.T) { bt := withBlockingTransport(t) - clientset := fake.NewSimpleClientset() _, err := loadAndVerifyLicense(APIServerParams{ LicenseBytes: []byte("not a license"), - }, clientset) + }) require.Error(t, err, "expected an error when LicenseBytes is malformed") var perm *backoff.PermanentError @@ -55,13 +56,73 @@ func TestLoadAndVerifyLicense_InvalidLicenseBytes_ReturnsPermanent(t *testing.T) func TestLoadAndVerifyLicense_ProductionMode_DoesNotDialUpstream(t *testing.T) { bt := withBlockingTransport(t) - clientset := fake.NewSimpleClientset() // We don't care about the success/failure outcome here — only that // LicenseBytes-mode doesn't reach for the network. Even with bytes // that fail signature verification, no upstream call is permitted. _, _ = loadAndVerifyLicense(APIServerParams{ LicenseBytes: []byte(validLicenseYAML), - }, clientset) + }) require.Zero(t, bt.dials.Load(), "production-mode license load+verify must not dial upstream") } + +// resetAirgapOverride restores the package-level airgap override after a +// test that flipped it. Without this, the override leaks into subsequent +// tests in the package and silently disables their upstream gates. +func resetAirgapOverride(t *testing.T) { + t.Helper() + t.Cleanup(util.ResetAirgapOverride) +} + +func devLicenseWrapper() licensewrapper.LicenseWrapper { + return licensewrapper.LicenseWrapper{ + V1: &kotsv1beta1.License{ + Spec: kotsv1beta1.LicenseSpec{LicenseType: "dev"}, + }, + } +} + +func prodLicenseWrapper() licensewrapper.LicenseWrapper { + return licensewrapper.LicenseWrapper{ + V1: &kotsv1beta1.License{ + Spec: kotsv1beta1.LicenseSpec{LicenseType: "prod"}, + }, + } +} + +func TestApplyDevOfflineGuard_OffByDefault_NoOp(t *testing.T) { + resetAirgapOverride(t) + require.NoError(t, applyDevOfflineGuard(prodLicenseWrapper(), false)) + require.False(t, util.IsAirgap(), "devOffline=false must not flip airgap override") +} + +func TestApplyDevOfflineGuard_ProdLicense_Rejected(t *testing.T) { + resetAirgapOverride(t) + err := applyDevOfflineGuard(prodLicenseWrapper(), true) + require.Error(t, err) + + var perm *backoff.PermanentError + require.ErrorAs(t, err, &perm, "non-dev license + devOffline must be a permanent failure") + require.False(t, util.IsAirgap(), "rejected guard must not have flipped the airgap override") +} + +func TestApplyDevOfflineGuard_DevLicense_FlipsAirgap(t *testing.T) { + resetAirgapOverride(t) + require.False(t, util.IsAirgap(), "precondition: airgap should be off before guard runs") + + require.NoError(t, applyDevOfflineGuard(devLicenseWrapper(), true)) + require.True(t, util.IsAirgap(), "devOffline=true + dev license must flip the airgap override") +} + +// TestApplyDevOfflineGuard_NoUpstreamDial defends the contract that the +// guard itself never reaches for the network. The guard only inspects +// already-parsed wrapper fields and writes a process-local atomic; if a +// future refactor accidentally introduced an upstream call here it would +// undermine the entire purpose of the offline opt-in. +func TestApplyDevOfflineGuard_NoUpstreamDial(t *testing.T) { + resetAirgapOverride(t) + bt := withBlockingTransport(t) + + require.NoError(t, applyDevOfflineGuard(devLicenseWrapper(), true)) + require.Zero(t, bt.dials.Load(), "devOffline guard must not dial upstream") +} diff --git a/pkg/apiserver/server.go b/pkg/apiserver/server.go index 5fb78839..272ec210 100644 --- a/pkg/apiserver/server.go +++ b/pkg/apiserver/server.go @@ -56,11 +56,13 @@ type APIServerParams struct { ReportAllImages bool ReadOnlyMode bool - // RequireUpstreamOnStartup, when true, restores the pre-Phase-1 - // behavior in which the pod does not become Ready until the full - // bootstrap (critical + background) succeeds, including a successful - // upstream license refresh and update fetch. Default false. - RequireUpstreamOnStartup bool + // DevOffline, when true, instructs bootstrapCritical to validate + // the loaded license is a dev license and then flip the runtime + // airgap flag so all !util.IsAirgap() gates skip their upstream + // calls. Production licenses are rejected at bootstrap with a + // permanent error so this opt-in cannot silently disable upstream + // behavior in production. Default false. + DevOffline bool } func Start(params APIServerParams) { @@ -179,19 +181,12 @@ func defaultBootstrapDeps() bootstrapDeps { } } -// runBootstrap drives the bootstrap state machine. The behavior depends on -// params.RequireUpstreamOnStartup: -// -// - false (default): bootstrapCritical is retried with backoff; the pod -// is marked Ready as soon as critical succeeds OR the -// bootstrapCriticalDeadline elapses, whichever comes first. After the -// deadline, critical continues to retry; on eventual success the -// bootstrapBackground phase runs. bootstrapBackground errors are logged -// but never block readiness. -// -// - true: the pre-Phase-1 contract is preserved. bootstrapCritical and -// bootstrapBackground are run in sequence with retry-with-backoff and -// the pod is not marked Ready until both succeed. +// runBootstrap drives the bootstrap state machine. bootstrapCritical is +// retried with backoff; the pod is marked Ready as soon as critical +// succeeds OR the bootstrapCriticalDeadline elapses, whichever comes +// first. After the deadline, critical continues to retry; on eventual +// success the bootstrapBackground phase runs. bootstrapBackground errors +// are logged but never block readiness. // // A non-nil return value indicates the process should exit fatally; the // readiness state has already been transitioned to Failed when this happens @@ -206,48 +201,6 @@ func runBootstrap(params APIServerParams, state *startupstate.Tracker) error { } func runBootstrapWithDeps(params APIServerParams, state *startupstate.Tracker, deps bootstrapDeps) error { - if params.RequireUpstreamOnStartup { - return runBootstrapStrict(params, state, deps) - } - return runBootstrapResilient(params, state, deps) -} - -func runBootstrapStrict(params APIServerParams, state *startupstate.Tracker, deps bootstrapDeps) error { - // Retry critical and background as separate retry loops rather than - // re-running both inside a single function. If we re-ran critical on - // a transient background failure, each retry would re-invoke - // appstate.InitOperator + Operator.Start, leaking the previously - // spawned runAppStateMonitor goroutine (Operator.Start does not call - // Shutdown on a prior instance, and InitOperator overwrites the - // package-level operator pointer). Splitting the loops ensures - // critical runs exactly once and background can retry independently. - if err := backoff.RetryNotify( - func() error { return deps.critical(params) }, - backoff.NewConstantBackOff(deps.retryInterval), - func(err error, d time.Duration) { - log.Printf("failed to bootstrap critical (requireUpstreamOnStartup=true), retrying in %s: %v", d, err) - }, - ); err != nil { - state.MarkFailed() - return errors.Wrap(err, "failed to bootstrap critical") - } - - if err := backoff.RetryNotify( - func() error { return deps.background(params) }, - backoff.NewConstantBackOff(deps.retryInterval), - func(err error, d time.Duration) { - log.Printf("failed to bootstrap background (requireUpstreamOnStartup=true), retrying in %s: %v", d, err) - }, - ); err != nil { - state.MarkFailed() - return errors.Wrap(err, "failed to bootstrap background") - } - - state.MarkReady() - return nil -} - -func runBootstrapResilient(params APIServerParams, state *startupstate.Tracker, deps bootstrapDeps) error { criticalDone := make(chan error, 1) go func() { criticalDone <- backoff.RetryNotify( @@ -287,13 +240,13 @@ func runBootstrapResilient(params APIServerParams, state *startupstate.Tracker, // endpoints. Flipping back to Failed → log.Fatalf would // produce a Ready→crash→restart→Ready→crash loop where // every cycle briefly exposes an under-initialized - // store to traffic. The resilient-mode contract is - // "serve degraded rather than flap" — log loudly and - // keep the pod alive. Background work below is skipped - // because it depends on store state that critical - // never populated; the heartbeat won't run, but - // existing endpoints will continue to answer with - // whatever defaults the in-memory store has. + // store to traffic. The contract is "serve degraded + // rather than flap" — log loudly and keep the pod + // alive. Background work below is skipped because it + // depends on store state that critical never populated; + // the heartbeat won't run, but existing endpoints will + // continue to answer with whatever defaults the + // in-memory store has. logger.Errorf( "sdk_critical_failed_after_readiness: bootstrapCritical failed after the readiness deadline; pod stays Ready to avoid a Ready→crash flap, but the in-memory store may be under-initialized and downstream handlers may return degraded data: %v", criticalErr, @@ -313,11 +266,9 @@ func runBootstrapResilient(params APIServerParams, state *startupstate.Tracker, // init issue, or upstream being briefly unreachable while the // pod was already marked Ready via the timeout path) would leave // the heartbeat cron job and any subsequent license refreshes - // disabled for the entire pod lifetime. Strict mode already - // retries; resilient mode must too — the only difference between - // the two modes is whether retry blocks readiness, not whether - // it happens at all. Errors are still logged at Warn level on - // each attempt and never block the pod from staying Ready. + // disabled for the entire pod lifetime. Errors are still logged + // at Warn level on each attempt and never block the pod from + // staying Ready. if err := backoff.RetryNotify( func() error { return deps.background(params) }, backoff.NewConstantBackOff(deps.retryInterval), diff --git a/pkg/apiserver/server_test.go b/pkg/apiserver/server_test.go index 853f41ee..094e1a61 100644 --- a/pkg/apiserver/server_test.go +++ b/pkg/apiserver/server_test.go @@ -23,7 +23,7 @@ func fastDeps(critical, background func(APIServerParams) error) bootstrapDeps { } } -func TestRunBootstrapResilient_CriticalTimesOutThenSucceeds(t *testing.T) { +func TestRunBootstrap_CriticalTimesOutThenSucceeds(t *testing.T) { state := startupstate.New() var bgRan atomic.Bool criticalCalls := 0 @@ -47,7 +47,7 @@ func TestRunBootstrapResilient_CriticalTimesOutThenSucceeds(t *testing.T) { deps.deadline = 30 * time.Millisecond start := time.Now() - require.NoError(t, runBootstrapResilient(APIServerParams{}, state, deps)) + require.NoError(t, runBootstrapWithDeps(APIServerParams{}, state, deps)) elapsed := time.Since(start) require.True(t, state.IsReady(), "expected state Ready, got %s", state.Get()) @@ -55,7 +55,7 @@ func TestRunBootstrapResilient_CriticalTimesOutThenSucceeds(t *testing.T) { require.GreaterOrEqual(t, elapsed, deps.deadline, "expected to wait at least the deadline") } -func TestRunBootstrapResilient_CriticalPermanentError_FailsFast(t *testing.T) { +func TestRunBootstrap_CriticalPermanentError_FailsFast(t *testing.T) { state := startupstate.New() var bgRan atomic.Bool @@ -66,38 +66,37 @@ func TestRunBootstrapResilient_CriticalPermanentError_FailsFast(t *testing.T) { func(APIServerParams) error { bgRan.Store(true); return nil }, ) - err := runBootstrapResilient(APIServerParams{}, state, deps) + err := runBootstrapWithDeps(APIServerParams{}, state, deps) require.Error(t, err) require.Equal(t, startupstate.Failed, state.Get()) require.False(t, bgRan.Load(), "background must not run after a permanent critical failure") } -func TestRunBootstrapResilient_BackgroundPermanentFailureDoesNotAffectReady(t *testing.T) { +func TestRunBootstrap_BackgroundPermanentFailureDoesNotAffectReady(t *testing.T) { state := startupstate.New() deps := fastDeps( func(APIServerParams) error { return nil }, func(APIServerParams) error { - // Permanent so the resilient retry loop gives up quickly - // — otherwise this test would block forever, since the - // loop must keep retrying transient background errors - // (heartbeat startup, upstream sync) for the pod's - // entire lifetime. + // Permanent so the retry loop gives up quickly — otherwise + // this test would block forever, since the loop must keep + // retrying transient background errors (heartbeat startup, + // upstream sync) for the pod's entire lifetime. return backoff.Permanent(errors.New("upstream sync permanently failed")) }, ) - require.NoError(t, runBootstrapResilient(APIServerParams{}, state, deps), "background failures should not bubble up") + require.NoError(t, runBootstrapWithDeps(APIServerParams{}, state, deps), "background failures should not bubble up") require.True(t, state.IsReady(), "expected state Ready despite background failure") } -// TestRunBootstrapResilient_BackgroundRetriesUntilSuccess verifies that -// resilient mode keeps retrying bootstrapBackground after transient +// TestRunBootstrap_BackgroundRetriesUntilSuccess verifies that the +// orchestrator keeps retrying bootstrapBackground after transient // failures rather than swallowing them. Without this retry, a momentary // hiccup on the first background attempt (e.g. heartbeat cron init, // upstream license sync) would silently disable the heartbeat job and // every subsequent license refresh for the entire pod lifetime. -func TestRunBootstrapResilient_BackgroundRetriesUntilSuccess(t *testing.T) { +func TestRunBootstrap_BackgroundRetriesUntilSuccess(t *testing.T) { state := startupstate.New() var bgCalls atomic.Int32 @@ -112,49 +111,55 @@ func TestRunBootstrapResilient_BackgroundRetriesUntilSuccess(t *testing.T) { }, ) - require.NoError(t, runBootstrapResilient(APIServerParams{}, state, deps)) + require.NoError(t, runBootstrapWithDeps(APIServerParams{}, state, deps)) require.True(t, state.IsReady()) - require.GreaterOrEqual(t, bgCalls.Load(), int32(3), "resilient mode must retry transient background failures") + require.GreaterOrEqual(t, bgCalls.Load(), int32(3), "must retry transient background failures") } -func TestRunBootstrapStrict_BlocksReadyUntilFullBootstrapSucceeds(t *testing.T) { - state := startupstate.New() - var phase atomic.Int32 - - deps := fastDeps( - func(APIServerParams) error { - phase.Store(1) - return nil - }, - func(APIServerParams) error { - if !state.IsReady() && phase.Load() == 1 { - // We deliberately observe state HERE — strict mode must - // not have flipped Ready before background returns. - phase.Store(2) - } - return nil - }, - ) - params := APIServerParams{RequireUpstreamOnStartup: true} - - require.NoError(t, runBootstrapWithDeps(params, state, deps)) - require.Equal(t, int32(2), phase.Load(), "background did not observe pre-Ready state") - require.True(t, state.IsReady(), "expected state Ready after strict bootstrap") -} - -func TestRunBootstrapStrict_BackgroundFailure_BlocksReady(t *testing.T) { +// TestRunBootstrap_CriticalFailsAfterDeadline_StaysReadyButSkipsBackground +// pins the deliberate "false-readiness over Ready→crash flap" trade-off +// in runBootstrapWithDeps: when bootstrapCritical outlasts the readiness +// timer (so the pod has already been marked Ready) and then ultimately +// returns a permanent error, the pod stays Ready and the background +// phase is intentionally skipped. The orchestrator returns nil because +// flipping back to Failed → log.Fatalf would produce a +// Ready→crash→restart→Ready→crash loop where every cycle briefly exposes +// an under-initialized store to traffic. +// +// A future change that "fixes" this perceived false-Ready by transitioning +// to Failed, by running background anyway, or by returning a non-nil +// error from runBootstrap would silently alter a documented contract. +// This test should fail in that case so the change is conscious. +func TestRunBootstrap_CriticalFailsAfterDeadline_StaysReadyButSkipsBackground(t *testing.T) { state := startupstate.New() + var bgRan atomic.Bool + var criticalCalls atomic.Int32 + + criticalSlowThenPermanent := func(APIServerParams) error { + n := criticalCalls.Add(1) + if n == 1 { + // First attempt: sleep past the deadline so the timer + // fires and the orchestrator marks Ready before we + // return. The returned error must be transient so + // RetryNotify schedules a second attempt. + time.Sleep(40 * time.Millisecond) + return errors.New("transient first failure outlasting deadline") + } + // Second attempt: permanent so RetryNotify gives up and the + // orchestrator observes a non-nil criticalErr post-deadline. + return backoff.Permanent(errors.New("license is unrecoverable")) + } deps := fastDeps( - func(APIServerParams) error { return nil }, - func(APIServerParams) error { - // Permanent so the retry loop gives up quickly. - return backoff.Permanent(errors.New("upstream sync permanently failed")) - }, + criticalSlowThenPermanent, + func(APIServerParams) error { bgRan.Store(true); return nil }, ) - params := APIServerParams{RequireUpstreamOnStartup: true} + deps.deadline = 30 * time.Millisecond - err := runBootstrapWithDeps(params, state, deps) - require.Error(t, err, "expected an error when strict-mode background fails") - require.Equal(t, startupstate.Failed, state.Get()) + require.NoError(t, runBootstrapWithDeps(APIServerParams{}, state, deps), + "post-deadline critical failure must not bubble up — pod stays Ready") + require.True(t, state.IsReady(), + "pod must stay Ready after the Ready→fail transition; flipping to Failed would cause a Ready→crash flap") + require.False(t, bgRan.Load(), + "background must be skipped when critical permanently fails after the readiness deadline") } diff --git a/pkg/handlers/app.go b/pkg/handlers/app.go index 8fb79871..cbf19784 100644 --- a/pkg/handlers/app.go +++ b/pkg/handlers/app.go @@ -276,23 +276,12 @@ func GetAppUpdates(w http.ResponseWriter, r *http.Request) { license := store.GetStore().GetLicense() updates := store.GetStore().GetUpdates() - // The updates response is driven by the just-refreshed license - // (channel ID/name/sequence pulled from the wrapper). When the - // underlying SyncLatestLicense fell back to the cache because - // upstream was unreachable, the channel/cursor info we use to - // compute updates is stale by definition — surface that to clients - // the same way GetLicenseInfo and GetLicenseFields do, via the - // X-Replicated-License-Cache: stale header. Without this, a client - // seeing /api/v1/app/updates can't distinguish a genuinely - // up-to-date "no updates available" from a cache-fallback "we - // can't actually tell, here's what we knew before." - licenseData, source, err := sdklicense.SyncLatestLicense(r.Context(), clientset, store.GetStore().GetNamespace(), license, store.GetStore().GetReplicatedAppEndpoint(), store.GetStore().GetReadOnlyMode()) + licenseData, err := sdklicense.GetLatestLicense(license, store.GetStore().GetReplicatedAppEndpoint()) if err != nil { logger.Error(errors.Wrap(err, "failed to get latest license")) JSONCached(w, http.StatusOK, updates) return } - markStaleIfCached(w, source) license = licenseData.License store.GetStore().SetLicense(license) diff --git a/pkg/handlers/license.go b/pkg/handlers/license.go index 9e9261f8..b5bb72db 100644 --- a/pkg/handlers/license.go +++ b/pkg/handlers/license.go @@ -9,7 +9,6 @@ import ( kotsv1beta1 "github.com/replicatedhq/kotskinds/apis/kots/v1beta1" kotsv1beta2 "github.com/replicatedhq/kotskinds/apis/kots/v1beta2" licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" - "github.com/replicatedhq/replicated-sdk/pkg/k8sutil" sdklicense "github.com/replicatedhq/replicated-sdk/pkg/license" sdklicensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" "github.com/replicatedhq/replicated-sdk/pkg/logger" @@ -17,12 +16,6 @@ import ( "github.com/replicatedhq/replicated-sdk/pkg/util" ) -// LicenseCacheHeader signals to clients that a license response was -// served from the local cache rather than a fresh upstream call. The -// value `stale` is set whenever the request-time SyncLatest* call -// returned SourceCache. -const LicenseCacheHeader = "X-Replicated-License-Cache" - type LicenseInfo struct { LicenseID string `json:"licenseID"` AppSlug string `json:"appSlug"` @@ -44,26 +37,11 @@ type LicenseInfo struct { Entitlements interface{} `json:"entitlements,omitempty"` } -// markStaleIfCached sets the cache-stale header when the source indicates -// the response came from the local cache rather than upstream. -func markStaleIfCached(w http.ResponseWriter, source sdklicense.LicenseSource) { - if source == sdklicense.SourceCache { - w.Header().Set(LicenseCacheHeader, "stale") - } -} - func GetLicenseInfo(w http.ResponseWriter, r *http.Request) { wrapper := store.GetStore().GetLicense() if !util.IsAirgap() { - clientset, err := k8sutil.GetClientset() - if err != nil { - logger.Error(errors.Wrap(err, "failed to get clientset")) - JSONCached(w, http.StatusOK, licenseInfoFromWrapper(wrapper)) - return - } - - l, source, err := sdklicense.SyncLatestLicense(r.Context(), clientset, store.GetStore().GetNamespace(), wrapper, store.GetStore().GetReplicatedAppEndpoint(), store.GetStore().GetReadOnlyMode()) + l, err := sdklicense.GetLatestLicense(wrapper, store.GetStore().GetReplicatedAppEndpoint()) if err != nil { logger.Error(errors.Wrap(err, "failed to get latest license")) JSONCached(w, http.StatusOK, licenseInfoFromWrapper(wrapper)) @@ -72,7 +50,6 @@ func GetLicenseInfo(w http.ResponseWriter, r *http.Request) { wrapper = l.License store.GetStore().SetLicense(wrapper) - markStaleIfCached(w, source) } JSON(w, http.StatusOK, licenseInfoFromWrapper(wrapper)) @@ -82,14 +59,7 @@ func GetLicenseFields(w http.ResponseWriter, r *http.Request) { licenseFields := store.GetStore().GetLicenseFields() if !util.IsAirgap() { - clientset, err := k8sutil.GetClientset() - if err != nil { - logger.Error(errors.Wrap(err, "failed to get clientset")) - JSONCached(w, http.StatusOK, licenseFields) - return - } - - fields, source, err := sdklicense.SyncLatestLicenseFields(r.Context(), clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense(), store.GetStore().GetReplicatedAppEndpoint(), store.GetStore().GetReadOnlyMode()) + fields, err := sdklicense.GetLatestLicenseFields(store.GetStore().GetLicense(), store.GetStore().GetReplicatedAppEndpoint()) if err != nil { logger.Error(errors.Wrap(err, "failed to get latest license fields")) JSONCached(w, http.StatusOK, licenseFields) @@ -98,19 +68,11 @@ func GetLicenseFields(w http.ResponseWriter, r *http.Request) { licenseFields = fields store.GetStore().SetLicenseFields(licenseFields) - markStaleIfCached(w, source) } JSON(w, http.StatusOK, licenseFields) } -// GetLicenseField fetches a single license field. Unlike GetLicenseFields -// this handler does NOT go through a cache wrapper for the singular call: -// the in-memory store is repopulated from cache during bootstrap and on -// every SyncLatestLicenseFields hit, so the existing fall-through to -// licenseFields[fieldName] on upstream failure already serves the cached -// value transparently. Wrapping the singular call would duplicate that -// fallback path without adding cache coverage. func GetLicenseField(w http.ResponseWriter, r *http.Request) { fieldName := mux.Vars(r)["fieldName"] @@ -126,7 +88,6 @@ func GetLicenseField(w http.ResponseWriter, r *http.Request) { if lf, ok := licenseFields[fieldName]; !ok { JSONCached(w, http.StatusNotFound, fmt.Sprintf("license field %q not found", fieldName)) } else { - w.Header().Set(LicenseCacheHeader, "stale") JSONCached(w, http.StatusOK, lf) } return diff --git a/pkg/heartbeat/heartbeat.go b/pkg/heartbeat/heartbeat.go index 44512c19..5322cb13 100644 --- a/pkg/heartbeat/heartbeat.go +++ b/pkg/heartbeat/heartbeat.go @@ -1,7 +1,6 @@ package heartbeat import ( - "context" "fmt" "sync" "time" @@ -60,7 +59,7 @@ func Start() error { } if !util.IsAirgap() { - licenseData, _, err := sdklicense.SyncLatestLicense(context.Background(), clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense(), store.GetStore().GetReplicatedAppEndpoint(), store.GetStore().GetReadOnlyMode()) + licenseData, err := sdklicense.GetLatestLicense(store.GetStore().GetLicense(), store.GetStore().GetReplicatedAppEndpoint()) if err != nil { logger.Error(errors.Wrap(err, "failed to get latest license")) } else { diff --git a/pkg/license/cache/cache.go b/pkg/license/cache/cache.go deleted file mode 100644 index 8a25090b..00000000 --- a/pkg/license/cache/cache.go +++ /dev/null @@ -1,211 +0,0 @@ -// Package cache persists the license document and license-fields map to a -// dedicated Kubernetes Secret so the SDK can serve license endpoints when -// replicated.app is unreachable. -// -// The cache lives in a runtime-managed Secret (`replicated-license-cache`) -// rather than in the chart-managed `replicated` Secret, so helm upgrades -// don't clobber it. This follows the same pattern as the SDK's other -// out-of-band Secrets — `replicated-instance-report`, -// `replicated-custom-app-metrics-report`, `replicated-meta-data`, and -// `replicated-support-metadata` — none of which are rendered by any chart -// template either. -// -// Writes serialize through a package-level mutex (matching pkg/meta and -// pkg/supportbundle). Reads do not lock; concurrent reads of a Secret are -// safe. Read-only mode is honored via an explicit readOnlyMode argument -// on the public Write* functions rather than by consulting a package- -// level store, because some callers (notably bootstrapCritical) write to -// the cache before the runtime store has been initialized — taking the -// flag as a parameter eliminates that ordering hazard. -package cache - -import ( - "context" - "encoding/json" - "sync" - "time" - - "github.com/pkg/errors" - licensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" - "github.com/replicatedhq/replicated-sdk/pkg/logger" - "github.com/replicatedhq/replicated-sdk/pkg/util" - corev1 "k8s.io/api/core/v1" - kuberneteserrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" -) - -// SecretName is the name of the runtime-managed cache Secret. The chart's -// RBAC Role grants the SDK `update` on this exact resourceName. -const SecretName = "replicated-license-cache" - -// Secret data keys. -const ( - KeyLicense = "license" // raw signed license YAML bytes - KeyLicenseFields = "license-fields" // JSON-encoded LicenseFields map - KeyLastFetched = "last-fetched" // RFC3339 timestamp of most recent successful upstream sync -) - -// ErrLicenseCacheNotFound is returned by Read when the cache Secret does -// not exist or contains no usable license data. Callers use it to -// distinguish a cache miss from a transient k8s API failure. -var ErrLicenseCacheNotFound = errors.New("license cache not found") - -// CachedLicense is the in-memory shape of a successful Read. LicenseFields -// is nil if the fields key was never written; LicenseBytes is always -// populated on a successful Read (the secret is considered "found" only -// when the license key is present). -type CachedLicense struct { - LicenseBytes []byte - LicenseFields licensetypes.LicenseFields - LastFetched time.Time -} - -// cacheLock serializes writes against the cache Secret to avoid lost -// updates when bootstrap and request-time syncs race. Reads are unlocked. -var cacheLock sync.Mutex - -// Read returns the cached license, or ErrLicenseCacheNotFound if no usable -// cache exists. A Secret with no `license` key counts as not-found because -// the license document is the primary artifact — fields without a license -// to attach them to are not useful on their own. -func Read(ctx context.Context, clientset kubernetes.Interface, namespace string) (*CachedLicense, error) { - secret, err := clientset.CoreV1().Secrets(namespace).Get(ctx, SecretName, metav1.GetOptions{}) - if err != nil { - if kuberneteserrors.IsNotFound(err) { - return nil, ErrLicenseCacheNotFound - } - return nil, errors.Wrap(err, "failed to get license cache secret") - } - - licenseBytes, ok := secret.Data[KeyLicense] - if !ok || len(licenseBytes) == 0 { - return nil, ErrLicenseCacheNotFound - } - - cached := &CachedLicense{ - LicenseBytes: licenseBytes, - } - - if fieldsBytes, ok := secret.Data[KeyLicenseFields]; ok && len(fieldsBytes) > 0 { - var fields licensetypes.LicenseFields - if err := json.Unmarshal(fieldsBytes, &fields); err != nil { - // Malformed fields blob is non-fatal — return the license - // without fields and log so an operator can investigate. - logger.Infof("license cache: failed to unmarshal cached license-fields, ignoring: %v", err) - } else { - cached.LicenseFields = fields - } - } - - if tsBytes, ok := secret.Data[KeyLastFetched]; ok && len(tsBytes) > 0 { - if ts, err := time.Parse(time.RFC3339, string(tsBytes)); err == nil { - cached.LastFetched = ts - } - } - - return cached, nil -} - -// WriteLicense upserts the license bytes and refreshes the last-fetched -// timestamp. License-fields, if previously cached, are preserved. -// -// readOnlyMode must be passed in by the caller rather than read from the -// package-level store: bootstrapCritical's loadAndVerifyLicense calls -// this through SyncLicenseByID before store.InitInMemory has installed -// the runtime config, so an empty fallback InMemoryStore would report -// readOnlyMode=false and we'd attempt a write that an operator -// explicitly opted out of. Passing the bool from the call site (params -// at bootstrap, store getter elsewhere) eliminates that ordering trap. -func WriteLicense(ctx context.Context, clientset kubernetes.Interface, namespace string, licenseBytes []byte, readOnlyMode bool) error { - if len(licenseBytes) == 0 { - return errors.New("refusing to cache empty license bytes") - } - return write(ctx, clientset, namespace, map[string][]byte{ - KeyLicense: licenseBytes, - }, readOnlyMode) -} - -// WriteLicenseFields upserts the license-fields map and refreshes the -// last-fetched timestamp. The license document, if previously cached, is -// preserved. -// -// See WriteLicense for why readOnlyMode is an explicit parameter rather -// than read from the package-level store. -func WriteLicenseFields(ctx context.Context, clientset kubernetes.Interface, namespace string, fields licensetypes.LicenseFields, readOnlyMode bool) error { - encoded, err := json.Marshal(fields) - if err != nil { - return errors.Wrap(err, "failed to marshal license fields") - } - return write(ctx, clientset, namespace, map[string][]byte{ - KeyLicenseFields: encoded, - }, readOnlyMode) -} - -// write performs a partial-update upsert: existing keys not in `data` are -// preserved. A `last-fetched` timestamp is always refreshed alongside any -// successful update. -func write(ctx context.Context, clientset kubernetes.Interface, namespace string, data map[string][]byte, readOnlyMode bool) error { - if readOnlyMode { - logger.Debugf("license cache: skipping write in read-only mode") - return nil - } - - cacheLock.Lock() - defer cacheLock.Unlock() - - merged := make(map[string][]byte, len(data)+1) - for k, v := range data { - merged[k] = v - } - merged[KeyLastFetched] = []byte(time.Now().UTC().Format(time.RFC3339)) - - existing, err := clientset.CoreV1().Secrets(namespace).Get(ctx, SecretName, metav1.GetOptions{}) - if err != nil && !kuberneteserrors.IsNotFound(err) { - return errors.Wrap(err, "failed to get license cache secret") - } - - if kuberneteserrors.IsNotFound(err) { - uid, err := util.GetReplicatedDeploymentUID(clientset, namespace) - if err != nil { - return errors.Wrap(err, "failed to get replicated deployment uid") - } - - secret := &corev1.Secret{ - TypeMeta: metav1.TypeMeta{ - APIVersion: "v1", - Kind: "Secret", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: SecretName, - Namespace: namespace, - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "apps/v1", - Kind: "Deployment", - Name: util.GetReplicatedDeploymentName(), - UID: uid, - }, - }, - }, - Data: merged, - } - - if _, err := clientset.CoreV1().Secrets(namespace).Create(ctx, secret, metav1.CreateOptions{}); err != nil { - return errors.Wrap(err, "failed to create license cache secret") - } - return nil - } - - if existing.Data == nil { - existing.Data = map[string][]byte{} - } - for k, v := range merged { - existing.Data[k] = v - } - - if _, err := clientset.CoreV1().Secrets(namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { - return errors.Wrap(err, "failed to update license cache secret") - } - return nil -} diff --git a/pkg/license/cache/cache_test.go b/pkg/license/cache/cache_test.go deleted file mode 100644 index 5e232498..00000000 --- a/pkg/license/cache/cache_test.go +++ /dev/null @@ -1,184 +0,0 @@ -package cache - -import ( - "context" - "encoding/json" - "testing" - "time" - - licensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" - "github.com/replicatedhq/replicated-sdk/pkg/store" - "github.com/stretchr/testify/require" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - kuberneteserrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - apimachinerytypes "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/fake" -) - -const testNamespace = "test-ns" - -// withDeployment seeds a fake clientset with the SDK's own Deployment so -// that owner-reference resolution succeeds during cache writes. -func withDeployment(t *testing.T) kubernetes.Interface { - t.Helper() - clientset := fake.NewSimpleClientset(&appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "replicated", - Namespace: testNamespace, - UID: apimachinerytypes.UID("deadbeef-0000-0000-0000-000000000000"), - }, - }) - return clientset -} - -func TestRead_NoSecret_ReturnsNotFound(t *testing.T) { - clientset := fake.NewSimpleClientset() - - _, err := Read(context.Background(), clientset, testNamespace) - require.ErrorIs(t, err, ErrLicenseCacheNotFound) -} - -func TestRead_SecretWithoutLicenseKey_ReturnsNotFound(t *testing.T) { - clientset := fake.NewSimpleClientset(&corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{Name: SecretName, Namespace: testNamespace}, - Data: map[string][]byte{ - KeyLicenseFields: []byte("{}"), - }, - }) - - _, err := Read(context.Background(), clientset, testNamespace) - require.ErrorIs(t, err, ErrLicenseCacheNotFound, "fields-only secret must not satisfy a cache read") -} - -func TestWriteLicense_CreatesSecret(t *testing.T) { - clientset := withDeployment(t) - licenseBytes := []byte("license-yaml-document") - - require.NoError(t, WriteLicense(context.Background(), clientset, testNamespace, licenseBytes, false)) - - secret, err := clientset.CoreV1().Secrets(testNamespace).Get(context.Background(), SecretName, metav1.GetOptions{}) - require.NoError(t, err) - require.Equal(t, licenseBytes, secret.Data[KeyLicense]) - require.NotEmpty(t, secret.Data[KeyLastFetched], "last-fetched must be set on every write") - - require.Len(t, secret.OwnerReferences, 1) - require.Equal(t, "Deployment", secret.OwnerReferences[0].Kind) -} - -func TestWriteLicenseFields_PreservesLicense(t *testing.T) { - clientset := withDeployment(t) - ctx := context.Background() - - licenseBytes := []byte("license-yaml-document") - require.NoError(t, WriteLicense(ctx, clientset, testNamespace, licenseBytes, false)) - - fields := licensetypes.LicenseFields{ - "my_field": licensetypes.LicenseField{ - Title: "My Field", - Value: "v1", - }, - } - require.NoError(t, WriteLicenseFields(ctx, clientset, testNamespace, fields, false)) - - cached, err := Read(ctx, clientset, testNamespace) - require.NoError(t, err) - require.Equal(t, licenseBytes, cached.LicenseBytes, "license bytes must survive a fields-only write") - require.Equal(t, "v1", cached.LicenseFields["my_field"].Value) -} - -func TestWrite_RefreshesLastFetched(t *testing.T) { - clientset := withDeployment(t) - ctx := context.Background() - - require.NoError(t, WriteLicense(ctx, clientset, testNamespace, []byte("v1"), false)) - cached1, err := Read(ctx, clientset, testNamespace) - require.NoError(t, err) - - time.Sleep(1100 * time.Millisecond) // RFC3339 has 1s resolution. - - require.NoError(t, WriteLicense(ctx, clientset, testNamespace, []byte("v2"), false)) - cached2, err := Read(ctx, clientset, testNamespace) - require.NoError(t, err) - - require.True(t, cached2.LastFetched.After(cached1.LastFetched), "last-fetched must advance on subsequent writes") -} - -func TestWrite_ReadOnlyMode_NoSecretCreated(t *testing.T) { - clientset := fake.NewSimpleClientset() - - require.NoError(t, WriteLicense(context.Background(), clientset, testNamespace, []byte("license"), true)) - - _, err := clientset.CoreV1().Secrets(testNamespace).Get(context.Background(), SecretName, metav1.GetOptions{}) - require.True(t, kuberneteserrors.IsNotFound(err), "no secret should be created in read-only mode") -} - -// TestWrite_ReadOnlyMode_BeforeStoreInit verifies that the read-only -// gate works during early bootstrap, before store.InitInMemory has run. -// Pre-fix this would have observed the empty fallback InMemoryStore's -// readOnlyMode=false default, attempted the write, and only failed via -// RBAC (with a confusing log line). Now the read-only flag is passed -// explicitly so no global-state observation is needed. -func TestWrite_ReadOnlyMode_BeforeStoreInit(t *testing.T) { - store.SetStore(nil) // explicitly: no store installed. - clientset := fake.NewSimpleClientset() - - require.NoError(t, WriteLicense(context.Background(), clientset, testNamespace, []byte("license"), true)) - require.NoError(t, WriteLicenseFields(context.Background(), clientset, testNamespace, licensetypes.LicenseFields{}, true)) - - _, err := clientset.CoreV1().Secrets(testNamespace).Get(context.Background(), SecretName, metav1.GetOptions{}) - require.True(t, kuberneteserrors.IsNotFound(err), "no secret should be created when caller declares read-only mode, regardless of store init state") -} - -func TestWriteLicense_RejectsEmpty(t *testing.T) { - clientset := withDeployment(t) - - require.Error(t, WriteLicense(context.Background(), clientset, testNamespace, nil, false), - "writing empty license bytes must be rejected to avoid corrupting the cache") -} - -func TestRead_MalformedFields_ReturnsLicenseWithoutFields(t *testing.T) { - clientset := fake.NewSimpleClientset(&corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{Name: SecretName, Namespace: testNamespace}, - Data: map[string][]byte{ - KeyLicense: []byte("license-yaml"), - KeyLicenseFields: []byte("not valid json"), - }, - }) - - cached, err := Read(context.Background(), clientset, testNamespace) - require.NoError(t, err, "malformed fields must not fail the read") - require.NotNil(t, cached.LicenseBytes) - require.Nil(t, cached.LicenseFields, "malformed fields are dropped, not surfaced") -} - -// Sanity check: cached fields round-trip through JSON without losing the -// LicenseFieldSignature struct. -func TestRead_RoundTripsFieldSignatures(t *testing.T) { - encoded, err := json.Marshal(licensetypes.LicenseFields{ - "f1": licensetypes.LicenseField{ - Title: "F1", - Value: 42.0, - Signature: licensetypes.LicenseFieldSignature{ - V1: "sig-v1", - V2: "sig-v2", - }, - }, - }) - require.NoError(t, err) - - clientset := fake.NewSimpleClientset(&corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{Name: SecretName, Namespace: testNamespace}, - Data: map[string][]byte{ - KeyLicense: []byte("license-yaml"), - KeyLicenseFields: encoded, - }, - }) - - cached, err := Read(context.Background(), clientset, testNamespace) - require.NoError(t, err) - require.Equal(t, "sig-v1", cached.LicenseFields["f1"].Signature.V1) - require.Equal(t, "sig-v2", cached.LicenseFields["f1"].Signature.V2) -} diff --git a/pkg/license/license.go b/pkg/license/license.go index ed25ea6a..36be71b9 100644 --- a/pkg/license/license.go +++ b/pkg/license/license.go @@ -23,10 +23,7 @@ type LicenseData struct { License licensewrapper.LicenseWrapper } -// GetLicenseByID fetches the license document by ID from the upstream -// Vendor Portal. Returns the parsed wrapper alongside the raw signed -// bytes so callers can persist them (e.g. to the cache). -func GetLicenseByID(licenseID string, endpoint string) (*LicenseData, error) { +func GetLicenseByID(licenseID string, endpoint string) (licensewrapper.LicenseWrapper, error) { if endpoint == "" { endpoint = defaultReplicatedAppEndpoint } @@ -34,10 +31,10 @@ func GetLicenseByID(licenseID string, endpoint string) (*LicenseData, error) { licenseData, err := getLicenseFromAPI(url, licenseID) if err != nil { - return nil, errors.Wrap(err, "failed to get license from api") + return licensewrapper.LicenseWrapper{}, errors.Wrap(err, "failed to get license from api") } - return licenseData, nil + return licenseData.License, nil } func GetLatestLicense(wrapper licensewrapper.LicenseWrapper, endpoint string) (*LicenseData, error) { diff --git a/pkg/license/sync.go b/pkg/license/sync.go deleted file mode 100644 index c97d3a81..00000000 --- a/pkg/license/sync.go +++ /dev/null @@ -1,169 +0,0 @@ -package license - -import ( - "context" - "sync" - - "github.com/pkg/errors" - licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" - "github.com/replicatedhq/replicated-sdk/pkg/license/cache" - "github.com/replicatedhq/replicated-sdk/pkg/license/types" - "github.com/replicatedhq/replicated-sdk/pkg/logger" - "k8s.io/client-go/kubernetes" -) - -// LicenseSource identifies whether a Sync* result came from the live -// upstream or from the local cache. Handlers use this to decide whether -// to set the X-Replicated-License-Cache: stale response header. -type LicenseSource int - -const ( - SourceUpstream LicenseSource = iota - SourceCache -) - -// staleWarnOnce ensures the cache-fallback warning log fires at most once -// per pod lifetime. Subsequent fallbacks are silent to avoid log spam -// during a sustained replicated.app outage. -var staleWarnOnce sync.Once - -// warnStale logs the cache-fallback the first time it fires in a given -// pod. The original upstream error is included to give operators a clue -// about why the upstream call failed. -func warnStale(upstreamErr error) { - staleWarnOnce.Do(func() { - logger.Warnf("license_cache_fallback: serving cached license because upstream is unavailable: %v", upstreamErr) - }) -} - -// SyncLicenseByID fetches a license by ID from the upstream Vendor Portal. -// On success, the license bytes are written through to the cache and -// SourceUpstream is returned. -// -// On upstream failure, the cache is consulted: if a previously-cached -// license is present and parses correctly, it is returned with -// SourceCache. If the cache is missing or unusable, the original upstream -// error is returned wrapped (so callers can inspect it). -// -// readOnlyMode is propagated explicitly because this is the integration- -// mode boot entry point — it runs before store.InitInMemory installs the -// runtime config, so reading the read-only flag from the package-level -// store would observe the empty fallback's zero-value (false) instead of -// the operator's actual setting. Production-mode boots use the chart- -// embedded license bytes and never reach here. -func SyncLicenseByID( - ctx context.Context, - clientset kubernetes.Interface, - namespace string, - licenseID string, - endpoint string, - readOnlyMode bool, -) (*LicenseData, LicenseSource, error) { - data, upstreamErr := GetLicenseByID(licenseID, endpoint) - if upstreamErr == nil { - if writeErr := cache.WriteLicense(ctx, clientset, namespace, data.LicenseBytes, readOnlyMode); writeErr != nil { - logger.Infof("license cache: write-through after GetLicenseByID failed, continuing: %v", writeErr) - } - return data, SourceUpstream, nil - } - - cached, cacheErr := cache.Read(ctx, clientset, namespace) - if cacheErr != nil { - return nil, SourceUpstream, errors.Wrap(upstreamErr, "upstream failed and license cache miss") - } - - parsed, parseErr := LoadLicenseFromBytes(cached.LicenseBytes) - if parseErr != nil { - return nil, SourceUpstream, errors.Wrapf(upstreamErr, "upstream failed and cached license could not be parsed: %v", parseErr) - } - - warnStale(upstreamErr) - return &LicenseData{ - LicenseBytes: cached.LicenseBytes, - License: parsed, - }, SourceCache, nil -} - -// SyncLatestLicense refreshes the license against the upstream and writes -// through to the cache on success. On upstream failure, falls back to the -// cached license; on cache miss, returns the original upstream error. -// -// The returned LicenseData.LicenseBytes is what gets persisted on the -// success path. Cache hits return the bytes that were written by the most -// recent successful upstream call. -// -// readOnlyMode is propagated explicitly so the cache layer never has to -// consult a package-level store; see SyncLicenseByID for the rationale. -func SyncLatestLicense( - ctx context.Context, - clientset kubernetes.Interface, - namespace string, - wrapper licensewrapper.LicenseWrapper, - endpoint string, - readOnlyMode bool, -) (*LicenseData, LicenseSource, error) { - data, upstreamErr := GetLatestLicense(wrapper, endpoint) - if upstreamErr == nil { - if writeErr := cache.WriteLicense(ctx, clientset, namespace, data.LicenseBytes, readOnlyMode); writeErr != nil { - logger.Infof("license cache: write-through after GetLatestLicense failed, continuing: %v", writeErr) - } - return data, SourceUpstream, nil - } - - cached, cacheErr := cache.Read(ctx, clientset, namespace) - if cacheErr != nil { - return nil, SourceUpstream, errors.Wrap(upstreamErr, "upstream failed and license cache miss") - } - - parsed, parseErr := LoadLicenseFromBytes(cached.LicenseBytes) - if parseErr != nil { - return nil, SourceUpstream, errors.Wrapf(upstreamErr, "upstream failed and cached license could not be parsed: %v", parseErr) - } - - warnStale(upstreamErr) - return &LicenseData{ - LicenseBytes: cached.LicenseBytes, - License: parsed, - }, SourceCache, nil -} - -// SyncLatestLicenseFields refreshes the license-fields map against the -// upstream and writes through to the cache on success. On upstream -// failure, falls back to the cached fields; on cache miss, returns the -// original upstream error. -// -// Note: GetLatestLicenseField (singular) is intentionally NOT wrapped. -// Its handler already falls back to the in-memory store on upstream -// failure, and the in-memory store is repopulated from the cache during -// bootstrap. Wrapping it would duplicate that fallback path without -// adding value. -// -// readOnlyMode is propagated explicitly so the cache layer never has to -// consult a package-level store; see SyncLicenseByID for the rationale. -func SyncLatestLicenseFields( - ctx context.Context, - clientset kubernetes.Interface, - namespace string, - wrapper licensewrapper.LicenseWrapper, - endpoint string, - readOnlyMode bool, -) (types.LicenseFields, LicenseSource, error) { - fields, upstreamErr := GetLatestLicenseFields(wrapper, endpoint) - if upstreamErr == nil { - if writeErr := cache.WriteLicenseFields(ctx, clientset, namespace, fields, readOnlyMode); writeErr != nil { - logger.Infof("license cache: write-through after GetLatestLicenseFields failed, continuing: %v", writeErr) - } - return fields, SourceUpstream, nil - } - - cached, cacheErr := cache.Read(ctx, clientset, namespace) - if cacheErr != nil { - return nil, SourceUpstream, errors.Wrap(upstreamErr, "upstream failed and license cache miss") - } - if cached.LicenseFields == nil { - return nil, SourceUpstream, errors.Wrap(upstreamErr, "upstream failed and no cached license fields available") - } - - warnStale(upstreamErr) - return cached.LicenseFields, SourceCache, nil -} diff --git a/pkg/license/sync_test.go b/pkg/license/sync_test.go deleted file mode 100644 index 3c65deef..00000000 --- a/pkg/license/sync_test.go +++ /dev/null @@ -1,243 +0,0 @@ -package license - -import ( - "context" - "encoding/json" - "net/http" - "net/http/httptest" - "sync" - "testing" - - "github.com/replicatedhq/replicated-sdk/pkg/license/cache" - "github.com/replicatedhq/replicated-sdk/pkg/license/types" - "github.com/replicatedhq/replicated-sdk/pkg/store" - "github.com/stretchr/testify/require" - appsv1 "k8s.io/api/apps/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - apimachinerytypes "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/fake" -) - -const syncTestNamespace = "test-ns" - -// testFixture bundles the moving parts every Sync* test needs: a fake -// kubernetes clientset (with the SDK Deployment so cache writes can resolve -// owner references), an initialized in-memory store, and a reset hook for -// the package-level once-per-pod warning. -type testFixture struct { - clientset kubernetes.Interface -} - -func newTestFixture(t *testing.T) *testFixture { - t.Helper() - - store.InitInMemory(store.InitInMemoryStoreOptions{ - Namespace: syncTestNamespace, - }) - t.Cleanup(func() { store.SetStore(nil) }) - - resetStaleWarnOnce(t) - - clientset := fake.NewSimpleClientset(&appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "replicated", - Namespace: syncTestNamespace, - UID: apimachinerytypes.UID("deadbeef-0000-0000-0000-000000000000"), - }, - }) - - return &testFixture{clientset: clientset} -} - -// resetStaleWarnOnce makes the once-per-pod warning fire fresh in each -// test. The package-level sync.Once otherwise leaks state across tests. -func resetStaleWarnOnce(t *testing.T) { - t.Helper() - staleWarnOnce = sync.Once{} -} - -// licenseServer returns an httptest server that serves validLicenseYAML -// from any path (so the same server backs GetLicenseByID and -// GetLatestLicense regardless of URL shape). -func licenseServer(t *testing.T) *httptest.Server { - t.Helper() - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/yaml") - _, _ = w.Write([]byte(validLicenseYAML)) - })) - t.Cleanup(srv.Close) - return srv -} - -// brokenServer returns an httptest server that 503s every request, -// simulating an unreachable replicated.app. -func brokenServer(t *testing.T) *httptest.Server { - t.Helper() - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "upstream broken", http.StatusServiceUnavailable) - })) - t.Cleanup(srv.Close) - return srv -} - -// fieldsServer returns an httptest server that serves a fixed -// LicenseFields payload from any path. -func fieldsServer(t *testing.T, fields types.LicenseFields) *httptest.Server { - t.Helper() - body, err := json.Marshal(fields) - require.NoError(t, err) - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - _, _ = w.Write(body) - })) - t.Cleanup(srv.Close) - return srv -} - -// ---- SyncLicenseByID ---- - -func TestSyncLicenseByID_UpstreamSuccess_WritesThroughToCache(t *testing.T) { - f := newTestFixture(t) - srv := licenseServer(t) - - data, source, err := SyncLicenseByID(context.Background(), f.clientset, syncTestNamespace, "license-id", srv.URL, false) - require.NoError(t, err) - require.Equal(t, SourceUpstream, source) - require.NotEmpty(t, data.LicenseBytes) - - cached, err := cache.Read(context.Background(), f.clientset, syncTestNamespace) - require.NoError(t, err, "successful upstream call must write through to cache") - require.Equal(t, data.LicenseBytes, cached.LicenseBytes) -} - -func TestSyncLicenseByID_UpstreamFailure_CacheHit_ReturnsCached(t *testing.T) { - f := newTestFixture(t) - srv := brokenServer(t) - - require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML), false)) - - data, source, err := SyncLicenseByID(context.Background(), f.clientset, syncTestNamespace, "license-id", srv.URL, false) - require.NoError(t, err) - require.Equal(t, SourceCache, source) - require.Equal(t, []byte(validLicenseYAML), data.LicenseBytes) -} - -func TestSyncLicenseByID_UpstreamFailure_CacheMiss_ReturnsWrappedError(t *testing.T) { - f := newTestFixture(t) - srv := brokenServer(t) - - _, source, err := SyncLicenseByID(context.Background(), f.clientset, syncTestNamespace, "license-id", srv.URL, false) - require.Error(t, err) - require.Equal(t, SourceUpstream, source, "no fallback available, source must reflect the failed attempt") - require.Contains(t, err.Error(), "license cache miss") -} - -// ---- SyncLatestLicense ---- - -func TestSyncLatestLicense_UpstreamSuccess_WritesThroughToCache(t *testing.T) { - f := newTestFixture(t) - srv := licenseServer(t) - - wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) - require.NoError(t, err) - - data, source, err := SyncLatestLicense(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL, false) - require.NoError(t, err) - require.Equal(t, SourceUpstream, source) - - cached, err := cache.Read(context.Background(), f.clientset, syncTestNamespace) - require.NoError(t, err) - require.Equal(t, data.LicenseBytes, cached.LicenseBytes) -} - -func TestSyncLatestLicense_UpstreamFailure_CacheHit_ReturnsCached(t *testing.T) { - f := newTestFixture(t) - srv := brokenServer(t) - - wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) - require.NoError(t, err) - - require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML), false)) - - data, source, err := SyncLatestLicense(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL, false) - require.NoError(t, err) - require.Equal(t, SourceCache, source) - require.Equal(t, []byte(validLicenseYAML), data.LicenseBytes) -} - -func TestSyncLatestLicense_UpstreamFailure_CacheMiss_ReturnsWrappedError(t *testing.T) { - f := newTestFixture(t) - srv := brokenServer(t) - - wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) - require.NoError(t, err) - - _, source, err := SyncLatestLicense(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL, false) - require.Error(t, err) - require.Equal(t, SourceUpstream, source) - require.Contains(t, err.Error(), "license cache miss") -} - -// ---- SyncLatestLicenseFields ---- - -func TestSyncLatestLicenseFields_UpstreamSuccess_WritesThroughToCache(t *testing.T) { - f := newTestFixture(t) - expected := types.LicenseFields{ - "my_field": types.LicenseField{Title: "My Field", Value: "v1"}, - } - srv := fieldsServer(t, expected) - - wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) - require.NoError(t, err) - - // In production a license is always cached before fields are - // (bootstrapCritical → bootstrapBackground ordering), so seed that - // state before exercising the fields write-through. - require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML), false)) - - got, source, err := SyncLatestLicenseFields(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL, false) - require.NoError(t, err) - require.Equal(t, SourceUpstream, source) - require.Equal(t, "v1", got["my_field"].Value) - - cached, err := cache.Read(context.Background(), f.clientset, syncTestNamespace) - require.NoError(t, err) - require.Equal(t, "v1", cached.LicenseFields["my_field"].Value) -} - -func TestSyncLatestLicenseFields_UpstreamFailure_CacheHit_ReturnsCached(t *testing.T) { - f := newTestFixture(t) - srv := brokenServer(t) - - wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) - require.NoError(t, err) - - cached := types.LicenseFields{ - "my_field": types.LicenseField{Title: "Cached", Value: "stale"}, - } - // Seed the cache with both a license (so Read returns success) and fields. - require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML), false)) - require.NoError(t, cache.WriteLicenseFields(context.Background(), f.clientset, syncTestNamespace, cached, false)) - - got, source, err := SyncLatestLicenseFields(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL, false) - require.NoError(t, err) - require.Equal(t, SourceCache, source) - require.Equal(t, "stale", got["my_field"].Value) -} - -func TestSyncLatestLicenseFields_UpstreamFailure_NoCachedFields_ReturnsWrappedError(t *testing.T) { - f := newTestFixture(t) - srv := brokenServer(t) - - wrapper, err := LoadLicenseFromBytes([]byte(validLicenseYAML)) - require.NoError(t, err) - - // License cached but fields never written — fields fallback must miss. - require.NoError(t, cache.WriteLicense(context.Background(), f.clientset, syncTestNamespace, []byte(validLicenseYAML), false)) - - _, source, err := SyncLatestLicenseFields(context.Background(), f.clientset, syncTestNamespace, wrapper, srv.URL, false) - require.Error(t, err) - require.Equal(t, SourceUpstream, source) - require.Contains(t, err.Error(), "no cached license fields available") -} diff --git a/pkg/startupstate/startupstate.go b/pkg/startupstate/startupstate.go index 682938f0..d7ac2242 100644 --- a/pkg/startupstate/startupstate.go +++ b/pkg/startupstate/startupstate.go @@ -13,9 +13,8 @@ import "sync/atomic" type State int32 const ( - // Starting indicates the API server is up but bootstrap has not yet - // completed (or, with requireUpstreamOnStartup, has not yet completed - // the full critical+background path). + // Starting indicates the API server is up but bootstrapCritical has + // not yet completed. Starting State = iota // Ready indicates the SDK is ready to serve requests. With the default // configuration this fires after bootstrapCritical succeeds or after diff --git a/pkg/store/memory_store.go b/pkg/store/memory_store.go index 60c55ab0..cbe81258 100644 --- a/pkg/store/memory_store.go +++ b/pkg/store/memory_store.go @@ -127,10 +127,10 @@ func (s *InMemoryStore) SetLicense(license licensewrapper.LicenseWrapper) { } // Both V1 and V2 are nil — the previous license is preserved // rather than overwritten with a zero value. Production callers go - // through loadAndVerifyLicense / Sync* which never return an - // empty wrapper alongside a nil error, so reaching this branch is - // a contract violation worth surfacing. - logger.Debugf("InMemoryStore.SetLicense called with empty wrapper; retaining previous license") + // through loadAndVerifyLicense / GetLatestLicense which never + // return an empty wrapper alongside a nil error, so reaching this + // branch is a contract violation worth surfacing. + logger.Warnf("InMemoryStore.SetLicense called with empty wrapper; retaining previous license") } // GetLicenseFields returns a defensive copy of the license-fields map so diff --git a/pkg/store/store_interface.go b/pkg/store/store_interface.go index b1b8c92c..856153bf 100644 --- a/pkg/store/store_interface.go +++ b/pkg/store/store_interface.go @@ -78,9 +78,9 @@ type Store interface { } // SetStore atomically installs s as the package-level store. Passing nil -// clears the store; subsequent GetStore calls will return a fresh empty -// InMemoryStore until a non-nil value is installed. Test cleanup paths -// rely on the nil-clear behavior. +// clears the store; subsequent GetStore calls will return the shared +// fallback InMemoryStore (see GetStore) until a non-nil value is +// installed. Test cleanup paths rely on the nil-clear behavior. func SetStore(s Store) { if s == nil { storePtr.Store(nil) diff --git a/pkg/store/store_interface_test.go b/pkg/store/store_interface_test.go new file mode 100644 index 00000000..51f65a62 --- /dev/null +++ b/pkg/store/store_interface_test.go @@ -0,0 +1,93 @@ +package store + +import ( + "sync" + "sync/atomic" + "testing" + + appstatetypes "github.com/replicatedhq/replicated-sdk/pkg/appstate/types" + "github.com/stretchr/testify/require" +) + +// TestSetStore_GetStore_ConcurrentSwapAndRead verifies that the +// package-level store pointer can be swapped while readers are calling +// GetStore() in parallel. Without atomic.Pointer guarding `storePtr`, +// `go test -race` would flag the read at GetStore against the write at +// SetStore — and even without -race, an unsynchronized interface read +// could observe a torn (itab, data) pair on platforms where two-word +// loads aren't atomic. This is the production scenario the listener + +// background-bootstrap refactor introduced: handlers calling GetStore +// race with bootstrapCritical's InitInMemory → SetStore. +func TestSetStore_GetStore_ConcurrentSwapAndRead(t *testing.T) { + t.Cleanup(func() { SetStore(nil) }) + + const goroutines = 64 + const iterations = 1000 + + a := &InMemoryStore{} + b := &InMemoryStore{} + + var wg sync.WaitGroup + wg.Add(goroutines * 2) + + var reads atomic.Uint64 + for i := 0; i < goroutines; i++ { + go func(swapToA bool) { + defer wg.Done() + for j := 0; j < iterations; j++ { + if swapToA { + SetStore(a) + } else { + SetStore(b) + } + } + }(i%2 == 0) + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + s := GetStore() + require.NotNil(t, s, "GetStore must never return nil") + reads.Add(1) + } + }() + } + + wg.Wait() + require.Equal(t, uint64(goroutines*iterations), reads.Load()) +} + +func TestSetStore_NilClearsStore(t *testing.T) { + t.Cleanup(func() { SetStore(nil) }) + + s := &InMemoryStore{} + SetStore(s) + require.Same(t, s, GetStore(), "SetStore should install the exact provided store") + + SetStore(nil) + got := GetStore() + require.NotNil(t, got, "GetStore must return a non-nil fallback after SetStore(nil)") + require.NotSame(t, s, got, "SetStore(nil) should clear; subsequent GetStore must not return the previously installed store") +} + +// TestGetStore_FallbackIsSharedAcrossCalls locks in the contract that +// when no store has been installed (the resilient-mode-timeout window +// where the pod is Ready but bootstrapCritical hasn't run InitInMemory +// yet), consecutive GetStore calls return the SAME fallback instance — +// not a fresh empty one each time. Otherwise a handler doing +// `s := GetStore(); s.SetX(v)` followed by `GetStore().GetX()` would +// silently lose the write because the second call would see a different +// zero-value store. This test would fail with the per-call +// `&InMemoryStore{}` fallback that pre-dated the fix. +func TestGetStore_FallbackIsSharedAcrossCalls(t *testing.T) { + t.Cleanup(func() { SetStore(nil) }) + SetStore(nil) + + a := GetStore() + b := GetStore() + require.Same(t, a, b, "fallback must be shared across consecutive GetStore calls") + + // Mutate via one handle, observe via another to prove the writes + // land on a single underlying instance and aren't discarded. + a.SetAppStatus(appstatetypes.AppStatus{State: appstatetypes.StateReady}) + require.Equal(t, appstatetypes.StateReady, b.GetAppStatus().State, "writes against the fallback must persist across GetStore calls") +} diff --git a/pkg/util/env.go b/pkg/util/env.go index 54e8049f..7e369fc7 100644 --- a/pkg/util/env.go +++ b/pkg/util/env.go @@ -2,23 +2,46 @@ package util import ( "os" - "regexp" - - kotsv1beta1 "github.com/replicatedhq/kotskinds/apis/kots/v1beta1" + "sync/atomic" ) +// airgapOverride is a runtime opt-in that forces IsAirgap() to report +// true even when DISABLE_OUTBOUND_CONNECTIONS is unset. It exists so the +// devOffline workflow (a dev-license-only convenience for working +// offline without configuring full air-gap support) can reuse the +// existing !IsAirgap() gates throughout the codebase rather than +// introducing a parallel set of checks. +// +// The override is set once during bootstrapCritical when params.DevOffline +// is true and the loaded license is a dev license. Tests that flip the +// override must call ResetAirgapOverride in t.Cleanup so state does not +// leak between tests. +// +// The override and DISABLE_OUTBOUND_CONNECTIONS are independent inputs +// to the same boolean: if either (or both) is set, IsAirgap() returns +// true and behavior is identical to either alone. There is no precedence +// to reason about because there is nothing to disagree on. +var airgapOverride atomic.Bool + +// SetAirgapOverride forces IsAirgap to return true regardless of the +// DISABLE_OUTBOUND_CONNECTIONS env var. Call once during bootstrap. +func SetAirgapOverride(on bool) { + airgapOverride.Store(on) +} + +// ResetAirgapOverride clears the override. Intended for test cleanup +// (t.Cleanup); production callers do not unset the override. +func ResetAirgapOverride() { + airgapOverride.Store(false) +} + func IsAirgap() bool { + if airgapOverride.Load() { + return true + } return os.Getenv("DISABLE_OUTBOUND_CONNECTIONS") == "true" } func IsDevEnv() bool { return os.Getenv("REPLICATED_ENV") == "dev" } - -func IsDevLicense(license *kotsv1beta1.License) bool { - if license == nil { - return false - } - result, _ := regexp.MatchString(`replicated-app`, license.Spec.Endpoint) - return result -} diff --git a/pkg/util/env_test.go b/pkg/util/env_test.go new file mode 100644 index 00000000..45f786bb --- /dev/null +++ b/pkg/util/env_test.go @@ -0,0 +1,79 @@ +package util + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +// TestIsAirgap_TruthTable defends the doc claim in env.go that the +// airgapOverride and DISABLE_OUTBOUND_CONNECTIONS env var are independent +// inputs to the same boolean — IsAirgap returns true if either or both +// is set, and there is no precedence between them. A future contributor +// who introduces a precedence rule (e.g. "env wins" or "override wins") +// would silently change a documented invariant; this table makes the +// invariant testable. +func TestIsAirgap_TruthTable(t *testing.T) { + cases := []struct { + name string + envValue string + override bool + want bool + }{ + {"both off", "", false, false}, + {"override only", "", true, true}, + {"env only", "true", false, true}, + {"both on", "true", true, true}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Setenv("DISABLE_OUTBOUND_CONNECTIONS", tc.envValue) + t.Cleanup(ResetAirgapOverride) + SetAirgapOverride(tc.override) + + require.Equal(t, tc.want, IsAirgap()) + }) + } +} + +// TestIsAirgap_EnvVarRequiresExactTrue locks in the strict-string +// contract of the env-var path: only the literal "true" enables airgap. +// "1", "TRUE", "yes" and friends do NOT. Without this test, a future +// "be more lenient with env values" change would silently broaden the +// airgap surface, which is the wrong direction for a flag that disables +// an important class of behavior. +func TestIsAirgap_EnvVarRequiresExactTrue(t *testing.T) { + t.Cleanup(ResetAirgapOverride) + ResetAirgapOverride() + + for _, v := range []string{"", "1", "yes", "TRUE", "True", "false", "0"} { + t.Run("env="+v, func(t *testing.T) { + t.Setenv("DISABLE_OUTBOUND_CONNECTIONS", v) + require.False(t, IsAirgap(), + "env=%q must not enable airgap; only the exact string \"true\" does", v) + }) + } +} + +// TestSetAirgapOverride_Idempotent verifies that flipping the override +// to true repeatedly is safe. bootstrapCritical can be re-invoked by +// the orchestrator's retry loop and applyDevOfflineGuard re-runs each +// time, so SetAirgapOverride(true) is called more than once in normal +// operation. The atomic.Bool semantics make this trivially correct, but +// codifying the expectation prevents a future "guard against double-set" +// refactor from introducing accidental panics or state-machine logic. +func TestSetAirgapOverride_Idempotent(t *testing.T) { + t.Setenv("DISABLE_OUTBOUND_CONNECTIONS", "") + t.Cleanup(ResetAirgapOverride) + ResetAirgapOverride() + require.False(t, IsAirgap(), "precondition: override and env both off") + + SetAirgapOverride(true) + SetAirgapOverride(true) + SetAirgapOverride(true) + require.True(t, IsAirgap()) + + ResetAirgapOverride() + require.False(t, IsAirgap()) +} From b8c88509f7479932bdbf0b5d2a7bc63d67b6dfcd Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Wed, 29 Apr 2026 17:09:03 -0500 Subject: [PATCH 13/15] bugbot --- dagger/e2e.go | 17 ++++++++--------- pkg/heartbeat/heartbeat.go | 17 +++++++++++------ 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/dagger/e2e.go b/dagger/e2e.go index b6efa7bd..7e29cd20 100644 --- a/dagger/e2e.go +++ b/dagger/e2e.go @@ -1540,15 +1540,14 @@ func upgradeChartAndRestart( return nil } -// Future: dagger e2e scenarios for the devOffline opt-in (Phase 3.5 of -// proposals/sdk_local_dev_mode.md). Implemented in a separate PR; the -// unit tests in pkg/apiserver/bootstrap_test.go (TestApplyDevOfflineGuard_*) -// already cover both validation paths at the logic level. Scenario C -// (default behavior unchanged, devOffline=false) is already covered by -// the existing e2e flow above. Scenarios A and B need dagger plumbing -// that does not exist yet — a dev-license fixture alongside the e2e -// secrets, and a NetworkPolicy denying egress to replicated.app from -// the SDK pod. +// Future: dagger e2e scenarios for the devOffline opt-in. Tracked as a +// follow-up PR. The unit tests in pkg/apiserver/bootstrap_test.go +// (TestApplyDevOfflineGuard_*) already cover both validation paths at +// the logic level. Scenario C (default behavior unchanged, +// devOffline=false) is already covered by the existing e2e flow above. +// Scenarios A and B need dagger plumbing that does not exist yet — a +// dev-license fixture alongside the e2e secrets, and a NetworkPolicy +// denying egress to replicated.app from the SDK pod. // // Scenario A (devOffline=true + dev license + blocked upstream): // diff --git a/pkg/heartbeat/heartbeat.go b/pkg/heartbeat/heartbeat.go index 5322cb13..8dee525d 100644 --- a/pkg/heartbeat/heartbeat.go +++ b/pkg/heartbeat/heartbeat.go @@ -52,12 +52,6 @@ func Start() error { _, err := job.AddFunc(cronSpec, func() { logger.Debugf("sending a heartbeat for app %s", appSlug) - clientset, err := k8sutil.GetClientset() - if err != nil { - logger.Error(errors.Wrap(err, "failed to get clientset")) - return - } - if !util.IsAirgap() { licenseData, err := sdklicense.GetLatestLicense(store.GetStore().GetLicense(), store.GetStore().GetReplicatedAppEndpoint()) if err != nil { @@ -67,7 +61,18 @@ func Start() error { } } + // Clientset acquisition is scoped to the report call that needs + // it. License sync above no longer needs a clientset (it talks + // directly to the upstream Vendor Portal), so a transient k8s + // API hiccup must not gate it. Hoisting GetClientset above the + // license sync would couple the two and silently regress the + // per-tick resilience of license refresh. go func() { + clientset, err := k8sutil.GetClientset() + if err != nil { + logger.Error(errors.Wrap(err, "failed to get clientset")) + return + } if err := report.SendInstanceData(clientset, store.GetStore()); err != nil { logger.Error(errors.Wrap(err, "failed to send instance data")) } From 53c18b99e0bf6c294ac180f71362cc85c5732d67 Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Wed, 29 Apr 2026 17:25:17 -0500 Subject: [PATCH 14/15] bugbot --- pkg/apiserver/bootstrap.go | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pkg/apiserver/bootstrap.go b/pkg/apiserver/bootstrap.go index a8c19ece..2354a4e3 100644 --- a/pkg/apiserver/bootstrap.go +++ b/pkg/apiserver/bootstrap.go @@ -235,11 +235,6 @@ func applyDevOfflineGuard(wrapper licensewrapper.LicenseWrapper, devOffline bool // bootstrapCritical and are retried by their own loop, which terminates // on the first success. func bootstrapBackground(params APIServerParams) error { - clientset, err := k8sutil.GetClientset() - if err != nil { - return errors.Wrap(err, "failed to get clientset") - } - ctx := params.Context if ctx == nil { ctx = context.Background() @@ -264,10 +259,23 @@ func bootstrapBackground(params APIServerParams) error { // actually running in integration (dev) mode. We instead skip the // updates fetch entirely on this turn and let the heartbeat-driven // refresh recover. - isIntegrationModeEnabled, err := integration.IsEnabled(ctx, clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense()) - integrationCheckOK := err == nil + // + // Clientset is acquired here, scoped to the integration check that + // needs it. Hoisting GetClientset to the top of the function would + // gate license sync, heartbeat.Start, and the outdated-version check + // on a transient k8s API hiccup, contradicting the error-accumulation + // pattern this function commits to. + var isIntegrationModeEnabled bool + integrationCheckOK := false + clientset, err := k8sutil.GetClientset() if err != nil { - errs = append(errs, errors.Wrap(err, "failed to check if integration mode is enabled")) + errs = append(errs, errors.Wrap(err, "failed to get clientset")) + } else { + isIntegrationModeEnabled, err = integration.IsEnabled(ctx, clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense()) + integrationCheckOK = err == nil + if err != nil { + errs = append(errs, errors.Wrap(err, "failed to check if integration mode is enabled")) + } } if !util.IsAirgap() && integrationCheckOK && !isIntegrationModeEnabled { From 1bfa37099be4d410a83cf37ba1cd325479d48448 Mon Sep 17 00:00:00 2001 From: Benny Yang Date: Wed, 29 Apr 2026 17:49:28 -0500 Subject: [PATCH 15/15] bugbot --- pkg/apiserver/server.go | 49 +++++++++++++++++++++++++----------- pkg/apiserver/server_test.go | 34 +++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 15 deletions(-) diff --git a/pkg/apiserver/server.go b/pkg/apiserver/server.go index 272ec210..c2352cc2 100644 --- a/pkg/apiserver/server.go +++ b/pkg/apiserver/server.go @@ -31,6 +31,16 @@ const ( // resilient mode. Critical continues retrying in the background after // this deadline; the deadline only governs when /healthz flips to 200. bootstrapCriticalDeadline = 30 * time.Second + + // bootstrapBackgroundMaxRetries caps how many times the background + // phase is retried before the orchestrator gives up. bootstrapBackground + // joins its step errors via stderrors.Join, which strips any + // backoff.Permanent wrapping from individual steps, so without an + // explicit cap the retry loop would spin forever on a persistent + // failure (log-spamming every retryInterval) instead of giving up. + // 30 attempts × 10s ≈ 5 minutes of recovery window; after that the + // heartbeat cron tick (every 4 hours) is the long-term refresh path. + bootstrapBackgroundMaxRetries = 30 ) type APIServerParams struct { @@ -166,10 +176,11 @@ func buildServer(params APIServerParams) (*http.Server, error) { // production deps are constructed by defaultBootstrapDeps; tests substitute // fakes by calling runBootstrapWithDeps directly. type bootstrapDeps struct { - critical func(APIServerParams) error - background func(APIServerParams) error - deadline time.Duration - retryInterval time.Duration + critical func(APIServerParams) error + background func(APIServerParams) error + deadline time.Duration + retryInterval time.Duration + bgMaxRetries uint64 } func defaultBootstrapDeps() bootstrapDeps { @@ -178,6 +189,7 @@ func defaultBootstrapDeps() bootstrapDeps { background: bootstrapBackground, deadline: bootstrapCriticalDeadline, retryInterval: bootstrapRetryInterval, + bgMaxRetries: bootstrapBackgroundMaxRetries, } } @@ -260,23 +272,30 @@ func runBootstrapWithDeps(params APIServerParams, state *startupstate.Tracker, d logger.Infof("sdk_critical_succeeded_after_readiness: bootstrapCritical completed after the readiness deadline; advancing to background phase") } - // Retry bootstrapBackground until it succeeds or returns a - // permanent error. Without this loop, a transient failure on the - // first attempt (e.g. heartbeat.Start hitting a momentary cron - // init issue, or upstream being briefly unreachable while the - // pod was already marked Ready via the timeout path) would leave - // the heartbeat cron job and any subsequent license refreshes - // disabled for the entire pod lifetime. Errors are still logged - // at Warn level on each attempt and never block the pod from - // staying Ready. + // Retry bootstrapBackground for a bounded number of attempts. Without + // this loop, a transient failure on the first attempt (e.g. + // heartbeat.Start hitting a momentary cron init issue, or upstream + // being briefly unreachable while the pod was already marked Ready + // via the timeout path) would leave the heartbeat cron job and any + // subsequent license refreshes disabled for the entire pod lifetime. + // + // The cap matters because bootstrapBackground returns + // stderrors.Join(errs...), which strips any backoff.Permanent + // wrapping from inner steps; without WithMaxRetries, a persistent + // background failure would log-spam every retryInterval forever and + // the give-up branch below would be dead code. After the cap, the + // heartbeat cron tick (every 4 hours) takes over as the long-term + // refresh path. Errors are still logged at Warn level on each + // attempt and never block the pod from staying Ready. + bgBackoff := backoff.WithMaxRetries(backoff.NewConstantBackOff(deps.retryInterval), deps.bgMaxRetries) if err := backoff.RetryNotify( func() error { return deps.background(params) }, - backoff.NewConstantBackOff(deps.retryInterval), + bgBackoff, func(err error, d time.Duration) { logger.Warnf("bootstrap background phase failed, retrying in %s (handlers continue serving from in-memory store): %v", d, err) }, ); err != nil { - logger.Warnf("bootstrap background phase gave up with permanent error (handlers continue serving from in-memory store): %v", err) + logger.Warnf("bootstrap background phase gave up (handlers continue serving from in-memory store; heartbeat cron will retry on next tick): %v", err) } return nil } diff --git a/pkg/apiserver/server_test.go b/pkg/apiserver/server_test.go index 094e1a61..0f47d812 100644 --- a/pkg/apiserver/server_test.go +++ b/pkg/apiserver/server_test.go @@ -20,6 +20,7 @@ func fastDeps(critical, background func(APIServerParams) error) bootstrapDeps { background: background, deadline: 50 * time.Millisecond, retryInterval: 5 * time.Millisecond, + bgMaxRetries: 100, // generous; individual tests override when they want to exercise the cap } } @@ -116,6 +117,39 @@ func TestRunBootstrap_BackgroundRetriesUntilSuccess(t *testing.T) { require.GreaterOrEqual(t, bgCalls.Load(), int32(3), "must retry transient background failures") } +// TestRunBootstrap_BackgroundGivesUpAfterMaxRetries verifies the retry +// loop terminates in finite time when bootstrapBackground returns +// transient errors that never resolve. bootstrapBackground in production +// returns stderrors.Join(errs...), which strips backoff.Permanent +// wrapping from inner steps; without an explicit max-retries cap, the +// loop would log-spam every retryInterval forever and the give-up +// branch in runBootstrapWithDeps would be dead code. This test exercises +// the cap. +func TestRunBootstrap_BackgroundGivesUpAfterMaxRetries(t *testing.T) { + state := startupstate.New() + var bgCalls atomic.Int32 + + deps := fastDeps( + func(APIServerParams) error { return nil }, + func(APIServerParams) error { + bgCalls.Add(1) + // Plain transient error — never wrapped in + // backoff.Permanent. Without WithMaxRetries this + // would retry forever. + return errors.New("persistent transient failure") + }, + ) + deps.bgMaxRetries = 3 + + require.NoError(t, runBootstrapWithDeps(APIServerParams{}, state, deps), + "background give-up must not bubble up — pod stays Ready") + require.True(t, state.IsReady()) + // WithMaxRetries(b, n) allows n retries AFTER the initial attempt, + // so total calls is n+1. + require.Equal(t, int32(deps.bgMaxRetries+1), bgCalls.Load(), + "loop must terminate after bgMaxRetries+1 total attempts, not retry forever") +} + // TestRunBootstrap_CriticalFailsAfterDeadline_StaysReadyButSkipsBackground // pins the deliberate "false-readiness over Ready→crash flap" trade-off // in runBootstrapWithDeps: when bootstrapCritical outlasts the readiness