diff --git a/chart/templates/replicated-deployment.yaml b/chart/templates/replicated-deployment.yaml index 4cddd9f4..69d67f76 100644 --- a/chart/templates/replicated-deployment.yaml +++ b/chart/templates/replicated-deployment.yaml @@ -193,6 +193,10 @@ spec: - name: REPLICATED_HA_ENABLED value: "true" {{- end }} + {{- if .Values.devOffline }} + - name: REPLICATED_DEV_OFFLINE + value: "true" + {{- end }} {{- if (.Values.integration).licenseID }} - name: REPLICATED_INTEGRATION_LICENSE_ID valueFrom: diff --git a/chart/templates/replicated-role.yaml b/chart/templates/replicated-role.yaml index 4944c979..30ec299b 100644 --- a/chart/templates/replicated-role.yaml +++ b/chart/templates/replicated-role.yaml @@ -34,6 +34,10 @@ rules: - 'secrets' verbs: - 'update' + # The names below must match the SDK's runtime expectations. If you + # rename one, update the corresponding constant on the Go side: + # replicated-meta-data → pkg/meta/meta.ReplicatedMetadataSecretName + # replicated-support-metadata → pkg/supportbundle.SupportBundleMetadataSecretName resourceNames: - {{ include "replicated.secretName" . }} - replicated-instance-report diff --git a/chart/values.yaml b/chart/values.yaml index 156a4e10..fd2f2afa 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -340,6 +340,13 @@ reportAllImages: false # See docs for feature availability in this mode. readOnlyMode: false +# When true (and license type is "dev"), the SDK treats the install as +# airgap — it never calls replicated.app and serves the license from the +# chart-embedded bytes. Intended for local development with a dev license +# when working offline (VPN, flaky network, air-gapped lab). Setting this +# with a non-dev license causes the SDK to refuse to start. +devOffline: false + # Proxy configuration for outbound connections # Configure HTTPS proxy settings for the Replicated SDK # These values can also be set via global.replicated.httpsProxy and global.replicated.noProxy diff --git a/cmd/replicated/api.go b/cmd/replicated/api.go index 5e7ffa02..3eadf4c3 100644 --- a/cmd/replicated/api.go +++ b/cmd/replicated/api.go @@ -31,6 +31,10 @@ func APICmd() *cobra.Command { namespace := v.GetString("namespace") configFilePath := v.GetString("config-file") integrationLicenseID := v.GetString("integration-license-id") + // dev-offline auto-binds to REPLICATED_DEV_OFFLINE via + // root.go's SetEnvPrefix("REPLICATED") + AutomaticEnv + + // SetEnvKeyReplacer("-", "_"); no explicit BindEnv needed. + devOffline := v.GetBool("dev-offline") if configFilePath == "" && integrationLicenseID == "" { return errors.New("either config file or integration license id must be specified") @@ -78,6 +82,7 @@ func APICmd() *cobra.Command { ReportAllImages: replicatedConfig.ReportAllImages, ReadOnlyMode: replicatedConfig.ReadOnlyMode, Namespace: namespace, + DevOffline: devOffline, } apiserver.Start(params) diff --git a/dagger/e2e.go b/dagger/e2e.go index 268a9440..7e29cd20 100644 --- a/dagger/e2e.go +++ b/dagger/e2e.go @@ -267,24 +267,47 @@ spec: } fmt.Println(out) - // wait for the replicated-ssl-test deployment to be ready + // Wait for the replicated-ssl-test deployment to be ready. The probe + // inside the pod (curl -k https://replicated:3000/health) only passes + // once the replicated service has Ready endpoints AND TLS is serving, + // but the pod itself also has to be scheduled, image-pulled, and + // started. On slower CI clusters (notably GKE during image-pull + // contention) a 1m budget is not enough — observed failures show the + // alpine/curl pod still in ContainerCreating at the timeout. 3m gives + // the cluster room to schedule without masking real readiness failures. ctr = dag.Container().From("bitnami/kubectl:latest"). WithFile(kubeconfigPath, kubeconfigSource.File("/kubeconfig")). WithEnvVariable("KUBECONFIG", kubeconfigPath). - WithExec([]string{"kubectl", "wait", "--for=condition=available", "deployment/replicated-ssl-test", "--timeout=1m"}) + WithExec([]string{"kubectl", "wait", "--for=condition=available", "deployment/replicated-ssl-test", "--timeout=3m"}) out, err = ctr.Stdout(ctx) if err != nil { - ctr = dag.Container().From("bitnami/kubectl:latest"). - WithFile(kubeconfigPath, kubeconfigSource.File("/kubeconfig")). - WithEnvVariable("KUBECONFIG", kubeconfigPath). - WithExec([]string{"kubectl", "logs", "-p", "-l", "app.kubernetes.io/name=replicated"}) - out, err2 := ctr.Stdout(ctx) - if err2 != nil { - return fmt.Errorf("failed to get logs for replicated deployment: %w", err2) + // Best-effort diagnostics. None of these may individually fail + // the test — the original wait error is what we report. In + // particular, kubectl logs without -p must be used here: the + // previous use of "-p" returned BadRequest ("previous + // terminated container ... not found") whenever the replicated + // pod had not crashed, which masked the real wait timeout with + // a misleading "log fetch" error. + dumpCmd := func(args []string) { + c := dag.Container().From("bitnami/kubectl:latest"). + WithFile(kubeconfigPath, kubeconfigSource.File("/kubeconfig")). + WithEnvVariable("KUBECONFIG", kubeconfigPath). + With(CacheBustingExec(args)) + if stdout, derr := c.Stdout(ctx); derr == nil { + fmt.Printf("$ %s\n%s\n", strings.Join(args, " "), stdout) + } else if stderr, _ := c.Stderr(ctx); stderr != "" { + fmt.Printf("$ %s\n(diagnostic command failed: %v)\n%s\n", strings.Join(args, " "), derr, stderr) + } else { + fmt.Printf("$ %s\n(diagnostic command failed: %v)\n", strings.Join(args, " "), derr) + } } - fmt.Println(out) + dumpCmd([]string{"kubectl", "get", "pods", "-o", "wide"}) + dumpCmd([]string{"kubectl", "describe", "deployment/replicated-ssl-test"}) + dumpCmd([]string{"kubectl", "describe", "pods", "-l", "app=replicated-ssl-test"}) + dumpCmd([]string{"kubectl", "logs", "-l", "app=replicated-ssl-test", "--tail=100"}) + dumpCmd([]string{"kubectl", "logs", "-l", "app.kubernetes.io/name=replicated", "--tail=100"}) - return fmt.Errorf("failed to wait for replicated deployment to be ready: %w", err) + return fmt.Errorf("failed to wait for replicated-ssl-test deployment to be ready: %w", err) } fmt.Println(out) @@ -1516,3 +1539,29 @@ func upgradeChartAndRestart( return nil } + +// Future: dagger e2e scenarios for the devOffline opt-in. Tracked as a +// follow-up PR. The unit tests in pkg/apiserver/bootstrap_test.go +// (TestApplyDevOfflineGuard_*) already cover both validation paths at +// the logic level. Scenario C (default behavior unchanged, +// devOffline=false) is already covered by the existing e2e flow above. +// Scenarios A and B need dagger plumbing that does not exist yet — a +// dev-license fixture alongside the e2e secrets, and a NetworkPolicy +// denying egress to replicated.app from the SDK pod. +// +// Scenario A (devOffline=true + dev license + blocked upstream): +// +// helm install ... \ +// --set-file license=$DEV_LICENSE \ +// --set devOffline=true +// kubectl apply -f testdata/networkpolicy-deny-replicated-app.yaml +// # Expected: pod becomes Ready, /api/v1/license/info serves from +// # chart bytes, no upstream dial attempted. +// +// Scenario B (devOffline=true + non-dev license): +// +// helm install ... \ +// --set-file license=$PROD_LICENSE \ +// --set devOffline=true +// # Expected: pod CrashLoopBackOff with "devOffline=true requires a +// # dev license" in the logs. diff --git a/pkg/apiserver/bootstrap.go b/pkg/apiserver/bootstrap.go index 2447aecf..2354a4e3 100644 --- a/pkg/apiserver/bootstrap.go +++ b/pkg/apiserver/bootstrap.go @@ -1,7 +1,10 @@ package apiserver import ( + "context" + stderrors "errors" "log" + "sync" "github.com/cenkalti/backoff/v4" "github.com/pkg/errors" @@ -23,7 +26,25 @@ import ( "github.com/replicatedhq/replicated-sdk/pkg/util" ) -func bootstrap(params APIServerParams) error { +// bootstrapCritical performs the bootstrap work that must succeed before the +// SDK can serve meaningful responses. It loads, signature-verifies, and +// expiry-checks the license, then populates the in-memory store. It also +// initializes the local appstate operator so app/* endpoints can begin +// reporting status as soon as the pod is Ready. +// +// In production mode (LicenseBytes provided by the chart) this path is +// fully local — no upstream call is made. In integration mode the +// license document is fetched from the Vendor Portal by ID, so an +// unreachable upstream on first boot causes critical to retry. The +// devOffline opt-in (production mode + a dev license) flips the runtime +// airgap flag here so subsequent !util.IsAirgap() gates skip their +// upstream calls; it does not change the license source. +// +// Permanent failures (license parse error, signature invalid, expired, +// devOffline + non-dev license) are returned as backoff.Permanent so the +// retry loop above gives up immediately. Transient failures bubble up +// unwrapped and the caller will retry. +func bootstrapCritical(params APIServerParams) error { clientset, err := k8sutil.GetClientset() if err != nil { return errors.Wrap(err, "failed to get clientset") @@ -31,7 +52,6 @@ func bootstrap(params APIServerParams) error { replicatedID, appID := params.ReplicatedID, params.AppID if replicatedID == "" || appID == "" { - // retrieve replicated and app ids replicatedID, appID, err = util.GetReplicatedAndAppIDs(clientset, params.Namespace) if err != nil { return errors.Wrap(err, "failed to get replicated and app ids") @@ -47,7 +67,6 @@ func bootstrap(params APIServerParams) error { log.Println("replicatedID:", replicatedID) log.Println("appID:", appID) - // In Embedded Cluster installations, automatically enable reporting all images reportAllImages := params.ReportAllImages if !reportAllImages { distribution := report.GetDistribution(clientset) @@ -57,46 +76,15 @@ func bootstrap(params APIServerParams) error { } } - var unverifiedWrapper licensewrapper.LicenseWrapper - if len(params.LicenseBytes) > 0 { - wrapper, err := sdklicense.LoadLicenseFromBytes(params.LicenseBytes) - if err != nil { - return errors.Wrap(err, "failed to parse license from base64") - } - unverifiedWrapper = wrapper - } else if params.IntegrationLicenseID != "" { - wrapper, err := sdklicense.GetLicenseByID(params.IntegrationLicenseID, params.ReplicatedAppEndpoint) - if err != nil { - return backoff.Permanent(errors.Wrap(err, "failed to get license by id for integration license id")) - } - if wrapper.GetLicenseType() != "dev" { - return errors.New("integration license must be a dev license") - } - unverifiedWrapper = wrapper - } - - err = unverifiedWrapper.VerifySignature() + verifiedWrapper, err := loadAndVerifyLicense(params) if err != nil { - if licensewrappertypes.IsLicenseDataValidationError(err) { - // this is not a fatal error, it means that the license data outside of the signature was changed - // however, the data inside the signature was still valid, and so the license has been updated to use that data instead - log.Println(err.Error()) - } else { - return backoff.Permanent(errors.Wrap(err, "failed to verify license signature")) - } + return err } - verifiedWrapper := unverifiedWrapper - if !util.IsAirgap() { - // sync license - licenseData, err := sdklicense.GetLatestLicense(verifiedWrapper, params.ReplicatedAppEndpoint) - if err != nil { - return errors.Wrap(err, "failed to get latest license") - } - verifiedWrapper = licenseData.License - } - - // check license expiration + // Expiry is unconditional and has no side effects, so check it + // before applyDevOfflineGuard — which flips the process-global + // airgap override on success. Reordering keeps that side effect + // from landing on a license we are about to reject anyway. expired, err := sdklicense.LicenseIsExpired(verifiedWrapper) if err != nil { return errors.Wrap(err, "failed to check if license is expired") @@ -105,11 +93,14 @@ func bootstrap(params APIServerParams) error { return backoff.Permanent(errors.New("License is expired")) } + if err := applyDevOfflineGuard(verifiedWrapper, params.DevOffline); err != nil { + return err + } + channelID := params.ChannelID if channelID == "" { channelID = verifiedWrapper.GetChannelID() } - channelName := params.ChannelName if channelName == "" { channelName = verifiedWrapper.GetChannelName() @@ -135,29 +126,13 @@ func bootstrap(params APIServerParams) error { ReadOnlyMode: params.ReadOnlyMode, }) - isIntegrationModeEnabled, err := integration.IsEnabled(params.Context, clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense()) - if err != nil { - return errors.Wrap(err, "failed to check if integration mode is enabled") - } - - if !util.IsAirgap() && !isIntegrationModeEnabled { - // retrieve and cache updates - currentCursor := upstreamtypes.ReplicatedCursor{ - ChannelID: store.GetStore().GetChannelID(), - ChannelName: store.GetStore().GetChannelName(), - ChannelSequence: store.GetStore().GetChannelSequence(), - } - updates, err := upstream.GetUpdates(store.GetStore(), store.GetStore().GetLicense(), currentCursor) - if err != nil { - return errors.Wrap(err, "failed to get updates") - } - store.GetStore().SetUpdates(updates) - } - - appStateOperator := appstate.InitOperator(clientset, params.Namespace) - appStateOperator.Start() - - // if no status informers are provided, generate them from the helm release + // Resolve informers BEFORE starting the appstate operator goroutine + // so a helm.GetRelease error on this attempt doesn't leave a + // runAppStateMonitor goroutine running that the next retry would + // duplicate. InitOperator overwrites the package-level `operator` + // pointer and Operator.Start unconditionally spawns a new goroutine + // without shutting down the previous monitor, so the start must be + // the last fallible-or-not step in the function. informers := params.StatusInformers if informers == nil && helm.IsHelmManaged() { helmRelease, err := helm.GetRelease(helm.GetReleaseName()) @@ -169,24 +144,179 @@ func bootstrap(params APIServerParams) error { } } + appStateOperator := appstate.InitOperator(clientset, params.Namespace) + appStateOperator.Start() appStateOperator.ApplyAppInformers(appstatetypes.AppInformersArgs{ AppSlug: store.GetStore().GetAppSlug(), Sequence: store.GetStore().GetReleaseSequence(), Informers: informers, }) + return nil +} + +// loadAndVerifyLicense loads the license from chart-embedded bytes +// (production) or from the upstream Vendor Portal by integration-license +// ID, then signature-verifies it. +func loadAndVerifyLicense(params APIServerParams) (licensewrapper.LicenseWrapper, error) { + var unverifiedWrapper licensewrapper.LicenseWrapper + + switch { + case len(params.LicenseBytes) > 0: + wrapper, err := sdklicense.LoadLicenseFromBytes(params.LicenseBytes) + if err != nil { + return licensewrapper.LicenseWrapper{}, backoff.Permanent(errors.Wrap(err, "failed to parse license from base64")) + } + unverifiedWrapper = wrapper + case params.IntegrationLicenseID != "": + wrapper, err := sdklicense.GetLicenseByID(params.IntegrationLicenseID, params.ReplicatedAppEndpoint) + if err != nil { + return licensewrapper.LicenseWrapper{}, errors.Wrap(err, "failed to get license by id") + } + if wrapper.GetLicenseType() != "dev" { + return licensewrapper.LicenseWrapper{}, backoff.Permanent(errors.New("integration license must be a dev license")) + } + unverifiedWrapper = wrapper + default: + return licensewrapper.LicenseWrapper{}, backoff.Permanent(errors.New("no license source configured: either LicenseBytes or IntegrationLicenseID is required")) + } + + if err := unverifiedWrapper.VerifySignature(); err != nil { + if licensewrappertypes.IsLicenseDataValidationError(err) { + // Non-fatal: license data outside the signature was changed, + // but the data inside the signature was still valid; the + // wrapper has been updated to use that data instead. + log.Println(err.Error()) + } else { + return licensewrapper.LicenseWrapper{}, backoff.Permanent(errors.Wrap(err, "failed to verify license signature")) + } + } + + return unverifiedWrapper, nil +} + +// applyDevOfflineGuard enforces the dev-only contract of the devOffline +// opt-in: when the operator sets replicated.devOffline=true, the loaded +// license must be a dev license, and on success the runtime airgap flag +// is flipped so all !util.IsAirgap() gates skip their upstream calls. +// +// A non-dev license with devOffline=true returns backoff.Permanent so +// the bootstrap retry loop gives up immediately — the install is +// misconfigured and no amount of retrying will fix it. Production +// licenses cannot silently end up running offline this way; operators +// who want true airgap must use the existing isAirgap chart value with +// its accompanying ops requirements. +func applyDevOfflineGuard(wrapper licensewrapper.LicenseWrapper, devOffline bool) error { + if !devOffline { + return nil + } + if wrapper.GetLicenseType() != "dev" { + return backoff.Permanent(errors.New("devOffline=true requires a dev license")) + } + util.SetAirgapOverride(true) + log.Println("devOffline enabled: SDK will not call replicated.app for this install") + return nil +} + +// bootstrapBackground performs upstream-dependent bootstrap work whose +// failure must not prevent the SDK from being marked Ready. Errors are +// logged and the call is retried by the caller; handlers continue serving +// from whatever bootstrapCritical placed in the store. +// +// Each step runs independently and accumulates its error rather than +// returning early. This guarantees that a transient failure in one step +// (e.g. GetLatestLicense when upstream is briefly unreachable) does not +// silently disable downstream steps for the pod's lifetime — most +// importantly, heartbeat.Start() always gets a chance to run so the +// instance continues to check in. The steps here are safe to retry +// (heartbeat.Start clears and re-adds its cron entries; Set* store writes +// are last-write-wins). Critical-phase initializers that would leak +// resources on retry (notably appstate.InitOperator) live in +// bootstrapCritical and are retried by their own loop, which terminates +// on the first success. +func bootstrapBackground(params APIServerParams) error { + ctx := params.Context + if ctx == nil { + ctx = context.Background() + } + + var errs []error + + if !util.IsAirgap() { + licenseData, err := sdklicense.GetLatestLicense(store.GetStore().GetLicense(), params.ReplicatedAppEndpoint) + if err != nil { + errs = append(errs, errors.Wrap(err, "failed to get latest license")) + } else { + store.GetStore().SetLicense(licenseData.License) + } + } + + // integrationCheckOK distinguishes "we know the mode" from "we don't + // know yet". The zero value of isIntegrationModeEnabled is false, so + // without this flag a transient IsEnabled failure would silently fall + // into the !isIntegrationModeEnabled branch below and call + // upstream.GetUpdates against the Vendor Portal even when the pod is + // actually running in integration (dev) mode. We instead skip the + // updates fetch entirely on this turn and let the heartbeat-driven + // refresh recover. + // + // Clientset is acquired here, scoped to the integration check that + // needs it. Hoisting GetClientset to the top of the function would + // gate license sync, heartbeat.Start, and the outdated-version check + // on a transient k8s API hiccup, contradicting the error-accumulation + // pattern this function commits to. + var isIntegrationModeEnabled bool + integrationCheckOK := false + clientset, err := k8sutil.GetClientset() + if err != nil { + errs = append(errs, errors.Wrap(err, "failed to get clientset")) + } else { + isIntegrationModeEnabled, err = integration.IsEnabled(ctx, clientset, store.GetStore().GetNamespace(), store.GetStore().GetLicense()) + integrationCheckOK = err == nil + if err != nil { + errs = append(errs, errors.Wrap(err, "failed to check if integration mode is enabled")) + } + } + + if !util.IsAirgap() && integrationCheckOK && !isIntegrationModeEnabled { + currentCursor := upstreamtypes.ReplicatedCursor{ + ChannelID: store.GetStore().GetChannelID(), + ChannelName: store.GetStore().GetChannelName(), + ChannelSequence: store.GetStore().GetChannelSequence(), + } + updates, err := upstream.GetUpdates(store.GetStore(), store.GetStore().GetLicense(), currentCursor) + if err != nil { + errs = append(errs, errors.Wrap(err, "failed to get updates")) + } else { + store.GetStore().SetUpdates(updates) + } + } + if err := heartbeat.Start(); err != nil { - return errors.Wrap(err, "failed to start heartbeat") + errs = append(errs, errors.Wrap(err, "failed to start heartbeat")) } - // this is at the end of the bootstrap function so that it doesn't re-run on retry + // bootstrapBackground is wrapped in a retry loop, so any goroutine + // launched here would otherwise be re-spawned on every retry and + // leak. The dev-version check is a one-time observability signal — + // gate it behind a package-level sync.Once so it runs at most once + // per process regardless of how many times bootstrapBackground is + // invoked. if !util.IsAirgap() && store.GetStore().IsDevLicense() { - go func() { - if err := util.WarnOnOutdatedReplicatedVersion(); err != nil { - logger.Infof("Failed to check if running an outdated replicated version: %v", err) - } - }() + warnOnOutdatedReplicatedVersionOnce.Do(func() { + go func() { + if err := util.WarnOnOutdatedReplicatedVersion(); err != nil { + logger.Infof("Failed to check if running an outdated replicated version: %v", err) + } + }() + }) } - return nil + return stderrors.Join(errs...) } + +// warnOnOutdatedReplicatedVersionOnce ensures the dev-mode upstream-version +// warning goroutine is launched at most once per process. bootstrapBackground +// can be invoked multiple times by the retry loop and we don't want to +// spawn a fresh goroutine on every retry. +var warnOnOutdatedReplicatedVersionOnce sync.Once diff --git a/pkg/apiserver/bootstrap_test.go b/pkg/apiserver/bootstrap_test.go new file mode 100644 index 00000000..3d700b6e --- /dev/null +++ b/pkg/apiserver/bootstrap_test.go @@ -0,0 +1,128 @@ +package apiserver + +import ( + "net/http" + "sync/atomic" + "testing" + + "github.com/cenkalti/backoff/v4" + kotsv1beta1 "github.com/replicatedhq/kotskinds/apis/kots/v1beta1" + licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" + "github.com/replicatedhq/replicated-sdk/pkg/util" + "github.com/stretchr/testify/require" +) + +// blockingTransport is a RoundTripper that fails any HTTP request and +// records that a dial was attempted. Tests use this in place of +// http.DefaultTransport to assert that a code path is fully local. +type blockingTransport struct { + dials atomic.Int32 +} + +func (b *blockingTransport) RoundTrip(req *http.Request) (*http.Response, error) { + b.dials.Add(1) + return nil, &neverDialError{host: req.URL.Host} +} + +type neverDialError struct{ host string } + +func (e *neverDialError) Error() string { + return "blockingTransport: outbound network access denied to " + e.host +} + +// withBlockingTransport swaps http.DefaultTransport for the duration of the +// test and returns a counter the test can inspect. +func withBlockingTransport(t *testing.T) *blockingTransport { + t.Helper() + bt := &blockingTransport{} + orig := http.DefaultTransport + http.DefaultTransport = bt + t.Cleanup(func() { http.DefaultTransport = orig }) + return bt +} + +func TestLoadAndVerifyLicense_InvalidLicenseBytes_ReturnsPermanent(t *testing.T) { + bt := withBlockingTransport(t) + + _, err := loadAndVerifyLicense(APIServerParams{ + LicenseBytes: []byte("not a license"), + }) + require.Error(t, err, "expected an error when LicenseBytes is malformed") + + var perm *backoff.PermanentError + require.ErrorAs(t, err, &perm, "expected backoff.Permanent") + require.Zero(t, bt.dials.Load(), "production-mode license parse must not dial upstream") +} + +func TestLoadAndVerifyLicense_ProductionMode_DoesNotDialUpstream(t *testing.T) { + bt := withBlockingTransport(t) + + // We don't care about the success/failure outcome here — only that + // LicenseBytes-mode doesn't reach for the network. Even with bytes + // that fail signature verification, no upstream call is permitted. + _, _ = loadAndVerifyLicense(APIServerParams{ + LicenseBytes: []byte(validLicenseYAML), + }) + require.Zero(t, bt.dials.Load(), "production-mode license load+verify must not dial upstream") +} + +// resetAirgapOverride restores the package-level airgap override after a +// test that flipped it. Without this, the override leaks into subsequent +// tests in the package and silently disables their upstream gates. +func resetAirgapOverride(t *testing.T) { + t.Helper() + t.Cleanup(util.ResetAirgapOverride) +} + +func devLicenseWrapper() licensewrapper.LicenseWrapper { + return licensewrapper.LicenseWrapper{ + V1: &kotsv1beta1.License{ + Spec: kotsv1beta1.LicenseSpec{LicenseType: "dev"}, + }, + } +} + +func prodLicenseWrapper() licensewrapper.LicenseWrapper { + return licensewrapper.LicenseWrapper{ + V1: &kotsv1beta1.License{ + Spec: kotsv1beta1.LicenseSpec{LicenseType: "prod"}, + }, + } +} + +func TestApplyDevOfflineGuard_OffByDefault_NoOp(t *testing.T) { + resetAirgapOverride(t) + require.NoError(t, applyDevOfflineGuard(prodLicenseWrapper(), false)) + require.False(t, util.IsAirgap(), "devOffline=false must not flip airgap override") +} + +func TestApplyDevOfflineGuard_ProdLicense_Rejected(t *testing.T) { + resetAirgapOverride(t) + err := applyDevOfflineGuard(prodLicenseWrapper(), true) + require.Error(t, err) + + var perm *backoff.PermanentError + require.ErrorAs(t, err, &perm, "non-dev license + devOffline must be a permanent failure") + require.False(t, util.IsAirgap(), "rejected guard must not have flipped the airgap override") +} + +func TestApplyDevOfflineGuard_DevLicense_FlipsAirgap(t *testing.T) { + resetAirgapOverride(t) + require.False(t, util.IsAirgap(), "precondition: airgap should be off before guard runs") + + require.NoError(t, applyDevOfflineGuard(devLicenseWrapper(), true)) + require.True(t, util.IsAirgap(), "devOffline=true + dev license must flip the airgap override") +} + +// TestApplyDevOfflineGuard_NoUpstreamDial defends the contract that the +// guard itself never reaches for the network. The guard only inspects +// already-parsed wrapper fields and writes a process-local atomic; if a +// future refactor accidentally introduced an upstream call here it would +// undermine the entire purpose of the offline opt-in. +func TestApplyDevOfflineGuard_NoUpstreamDial(t *testing.T) { + resetAirgapOverride(t) + bt := withBlockingTransport(t) + + require.NoError(t, applyDevOfflineGuard(devLicenseWrapper(), true)) + require.Zero(t, bt.dials.Load(), "devOffline guard must not dial upstream") +} diff --git a/pkg/apiserver/server.go b/pkg/apiserver/server.go index 02660b8d..c2352cc2 100644 --- a/pkg/apiserver/server.go +++ b/pkg/apiserver/server.go @@ -16,10 +16,33 @@ import ( "github.com/replicatedhq/replicated-sdk/pkg/k8sutil" sdklicensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" "github.com/replicatedhq/replicated-sdk/pkg/logger" + "github.com/replicatedhq/replicated-sdk/pkg/startupstate" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" ) +const ( + // bootstrapRetryInterval is how long the bootstrap retry loop waits + // between attempts when bootstrapCritical returns a transient error. + bootstrapRetryInterval = 10 * time.Second + + // bootstrapCriticalDeadline bounds how long Start() will wait for + // bootstrapCritical before marking the pod Ready in the default + // resilient mode. Critical continues retrying in the background after + // this deadline; the deadline only governs when /healthz flips to 200. + bootstrapCriticalDeadline = 30 * time.Second + + // bootstrapBackgroundMaxRetries caps how many times the background + // phase is retried before the orchestrator gives up. bootstrapBackground + // joins its step errors via stderrors.Join, which strips any + // backoff.Permanent wrapping from individual steps, so without an + // explicit cap the retry loop would spin forever on a persistent + // failure (log-spamming every retryInterval) instead of giving up. + // 30 attempts × 10s ≈ 5 minutes of recovery window; after that the + // heartbeat cron tick (every 4 hours) is the long-term refresh path. + bootstrapBackgroundMaxRetries = 30 +) + type APIServerParams struct { Context context.Context LicenseBytes []byte @@ -42,22 +65,56 @@ type APIServerParams struct { TlsCertSecretName string ReportAllImages bool ReadOnlyMode bool + + // DevOffline, when true, instructs bootstrapCritical to validate + // the loaded license is a dev license and then flip the runtime + // airgap flag so all !util.IsAirgap() gates skip their upstream + // calls. Production licenses are rejected at bootstrap with a + // permanent error so this opt-in cannot silently disable upstream + // behavior in production. Default false. + DevOffline bool } func Start(params APIServerParams) { log.Println("Replicated version:", buildversion.Version()) - backoffDuration := 10 * time.Second - bootstrapFn := func() error { - return bootstrap(params) - } - err := backoff.RetryNotify(bootstrapFn, backoff.NewConstantBackOff(backoffDuration), func(err error, d time.Duration) { - log.Printf("failed to bootstrap, retrying in %s: %v", d, err) - }) + state := startupstate.New() + handlers.SetStartupState(state) + + srv, err := buildServer(params) if err != nil { - log.Fatalf("failed to bootstrap: %v", err) + // Pre-Phase-1 this same condition (most commonly: TLS secret + // missing or malformed) terminated the process with exit code + // 1 via log.Fatalf. Returning early instead would cause the + // cobra RunE to return nil and the binary to exit 0, masking + // a misconfigured deployment as a "successful" run. Keep the + // fatal exit so the kubelet sees a CrashLoopBackOff signal + // and operators get an obvious failure mode. + log.Fatalf("failed to build server: %v", err) } + go func() { + if err := runBootstrap(params, state); err != nil { + log.Fatalf("%v", err) + } + }() + + if params.TlsCertSecretName != "" { + log.Printf("Starting Replicated API on port %d with TLS...\n", 3000) + log.Fatal(srv.ListenAndServeTLS("", "")) + } else { + log.Printf("Starting Replicated API on port %d...\n", 3000) + log.Fatal(srv.ListenAndServe()) + } +} + +// buildServer constructs the HTTP server and registers all routes. It also +// loads TLS material (from a kubernetes Secret) when configured. +// +// This is split out so the listener can be started in the foreground while +// bootstrap runs concurrently, instead of bootstrap blocking listener +// startup as it did pre-Phase-1. +func buildServer(params APIServerParams) (*http.Server, error) { r := mux.NewRouter() r.Use(handlers.CorsMiddleware) @@ -100,26 +157,147 @@ func Start(params APIServerParams) { Addr: ":3000", } - // Configure TLS if certificate name is provided if params.TlsCertSecretName != "" { clientset, err := k8sutil.GetClientset() if err != nil { - logger.Error(errors.Wrap(err, "failed to get clientset")) - return + return nil, errors.Wrap(err, "failed to get clientset") } - tlsConfig, err := loadTLSConfig(clientset, params.Namespace, params.TlsCertSecretName) if err != nil { - log.Fatalf("failed to load TLS config: %v", err) + return nil, errors.Wrap(err, "failed to load TLS config") } srv.TLSConfig = tlsConfig + } - log.Printf("Starting Replicated API on port %d with TLS...\n", 3000) - log.Fatal(srv.ListenAndServeTLS("", "")) - } else { - log.Printf("Starting Replicated API on port %d...\n", 3000) - log.Fatal(srv.ListenAndServe()) + return srv, nil +} + +// bootstrapDeps wires the orchestrator to its collaborators. The default +// production deps are constructed by defaultBootstrapDeps; tests substitute +// fakes by calling runBootstrapWithDeps directly. +type bootstrapDeps struct { + critical func(APIServerParams) error + background func(APIServerParams) error + deadline time.Duration + retryInterval time.Duration + bgMaxRetries uint64 +} + +func defaultBootstrapDeps() bootstrapDeps { + return bootstrapDeps{ + critical: bootstrapCritical, + background: bootstrapBackground, + deadline: bootstrapCriticalDeadline, + retryInterval: bootstrapRetryInterval, + bgMaxRetries: bootstrapBackgroundMaxRetries, + } +} + +// runBootstrap drives the bootstrap state machine. bootstrapCritical is +// retried with backoff; the pod is marked Ready as soon as critical +// succeeds OR the bootstrapCriticalDeadline elapses, whichever comes +// first. After the deadline, critical continues to retry; on eventual +// success the bootstrapBackground phase runs. bootstrapBackground errors +// are logged but never block readiness. +// +// A non-nil return value indicates the process should exit fatally; the +// readiness state has already been transitioned to Failed when this happens +// so a final scrape of /healthz reflects what occurred. Callers (typically +// Start) map a non-nil error to log.Fatalf. +// +// runBootstrap blocks until the bootstrap pipeline has fully resolved (or +// has resolved enough to know it must exit). It is intended to be invoked +// from a goroutine while the HTTP listener runs in the foreground. +func runBootstrap(params APIServerParams, state *startupstate.Tracker) error { + return runBootstrapWithDeps(params, state, defaultBootstrapDeps()) +} + +func runBootstrapWithDeps(params APIServerParams, state *startupstate.Tracker, deps bootstrapDeps) error { + criticalDone := make(chan error, 1) + go func() { + criticalDone <- backoff.RetryNotify( + func() error { return deps.critical(params) }, + backoff.NewConstantBackOff(deps.retryInterval), + func(err error, d time.Duration) { + log.Printf("failed to bootstrap critical, retrying in %s: %v", d, err) + }, + ) + }() + + timer := time.NewTimer(deps.deadline) + defer timer.Stop() + + select { + case criticalErr := <-criticalDone: + if criticalErr != nil { + state.MarkFailed() + return errors.Wrap(criticalErr, "failed to bootstrap critical") + } + state.MarkReady() + + case <-timer.C: + // Critical hasn't completed in the readiness window. Mark Ready + // anyway — handlers will serve whatever the in-memory store has + // (likely empty until critical completes) but the pod won't be + // stuck in CrashLoopBackOff. Block here until critical does + // resolve so we can decide whether to advance to background. + logger.Warnf( + "sdk_ready_after_critical_timeout: bootstrapCritical did not complete within %s; marking pod Ready and continuing critical bootstrap in background", + deps.deadline, + ) + state.MarkReady() + if criticalErr := <-criticalDone; criticalErr != nil { + // We have already signaled Ready, so the kubelet has + // (or is about to) added this pod to its Service + // endpoints. Flipping back to Failed → log.Fatalf would + // produce a Ready→crash→restart→Ready→crash loop where + // every cycle briefly exposes an under-initialized + // store to traffic. The contract is "serve degraded + // rather than flap" — log loudly and keep the pod + // alive. Background work below is skipped because it + // depends on store state that critical never populated; + // the heartbeat won't run, but existing endpoints will + // continue to answer with whatever defaults the + // in-memory store has. + logger.Errorf( + "sdk_critical_failed_after_readiness: bootstrapCritical failed after the readiness deadline; pod stays Ready to avoid a Ready→crash flap, but the in-memory store may be under-initialized and downstream handlers may return degraded data: %v", + criticalErr, + ) + return nil + } + // Symmetric counterpart to sdk_ready_after_critical_timeout — + // gives operators a closing log line when the deferred critical + // path eventually succeeds, instead of leaving them to infer it + // from the absence of further retry warnings. + logger.Infof("sdk_critical_succeeded_after_readiness: bootstrapCritical completed after the readiness deadline; advancing to background phase") + } + + // Retry bootstrapBackground for a bounded number of attempts. Without + // this loop, a transient failure on the first attempt (e.g. + // heartbeat.Start hitting a momentary cron init issue, or upstream + // being briefly unreachable while the pod was already marked Ready + // via the timeout path) would leave the heartbeat cron job and any + // subsequent license refreshes disabled for the entire pod lifetime. + // + // The cap matters because bootstrapBackground returns + // stderrors.Join(errs...), which strips any backoff.Permanent + // wrapping from inner steps; without WithMaxRetries, a persistent + // background failure would log-spam every retryInterval forever and + // the give-up branch below would be dead code. After the cap, the + // heartbeat cron tick (every 4 hours) takes over as the long-term + // refresh path. Errors are still logged at Warn level on each + // attempt and never block the pod from staying Ready. + bgBackoff := backoff.WithMaxRetries(backoff.NewConstantBackOff(deps.retryInterval), deps.bgMaxRetries) + if err := backoff.RetryNotify( + func() error { return deps.background(params) }, + bgBackoff, + func(err error, d time.Duration) { + logger.Warnf("bootstrap background phase failed, retrying in %s (handlers continue serving from in-memory store): %v", d, err) + }, + ); err != nil { + logger.Warnf("bootstrap background phase gave up (handlers continue serving from in-memory store; heartbeat cron will retry on next tick): %v", err) } + return nil } // loadTLSConfig loads TLS certificate and key from a Kubernetes secret diff --git a/pkg/apiserver/server_test.go b/pkg/apiserver/server_test.go new file mode 100644 index 00000000..0f47d812 --- /dev/null +++ b/pkg/apiserver/server_test.go @@ -0,0 +1,199 @@ +package apiserver + +import ( + "errors" + "sync/atomic" + "testing" + "time" + + "github.com/cenkalti/backoff/v4" + "github.com/replicatedhq/replicated-sdk/pkg/startupstate" + "github.com/stretchr/testify/require" +) + +// fastDeps returns bootstrapDeps with short timing so tests run quickly. +// Tests pass critical and background as parameters because each scenario +// needs its own behavior. +func fastDeps(critical, background func(APIServerParams) error) bootstrapDeps { + return bootstrapDeps{ + critical: critical, + background: background, + deadline: 50 * time.Millisecond, + retryInterval: 5 * time.Millisecond, + bgMaxRetries: 100, // generous; individual tests override when they want to exercise the cap + } +} + +func TestRunBootstrap_CriticalTimesOutThenSucceeds(t *testing.T) { + state := startupstate.New() + var bgRan atomic.Bool + criticalCalls := 0 + criticalSlow := func(APIServerParams) error { + criticalCalls++ + // First two attempts: pretend the upstream is slow / failing. + // Third attempt: succeed. The first two attempts together must + // outlast the deadline so we exercise the timeout-then-success + // path. + if criticalCalls < 3 { + time.Sleep(40 * time.Millisecond) + return errors.New("transient") + } + return nil + } + + deps := fastDeps( + criticalSlow, + func(APIServerParams) error { bgRan.Store(true); return nil }, + ) + deps.deadline = 30 * time.Millisecond + + start := time.Now() + require.NoError(t, runBootstrapWithDeps(APIServerParams{}, state, deps)) + elapsed := time.Since(start) + + require.True(t, state.IsReady(), "expected state Ready, got %s", state.Get()) + require.True(t, bgRan.Load(), "expected background to run once critical eventually succeeded") + require.GreaterOrEqual(t, elapsed, deps.deadline, "expected to wait at least the deadline") +} + +func TestRunBootstrap_CriticalPermanentError_FailsFast(t *testing.T) { + state := startupstate.New() + var bgRan atomic.Bool + + deps := fastDeps( + func(APIServerParams) error { + return backoff.Permanent(errors.New("license is expired")) + }, + func(APIServerParams) error { bgRan.Store(true); return nil }, + ) + + err := runBootstrapWithDeps(APIServerParams{}, state, deps) + require.Error(t, err) + require.Equal(t, startupstate.Failed, state.Get()) + require.False(t, bgRan.Load(), "background must not run after a permanent critical failure") +} + +func TestRunBootstrap_BackgroundPermanentFailureDoesNotAffectReady(t *testing.T) { + state := startupstate.New() + + deps := fastDeps( + func(APIServerParams) error { return nil }, + func(APIServerParams) error { + // Permanent so the retry loop gives up quickly — otherwise + // this test would block forever, since the loop must keep + // retrying transient background errors (heartbeat startup, + // upstream sync) for the pod's entire lifetime. + return backoff.Permanent(errors.New("upstream sync permanently failed")) + }, + ) + + require.NoError(t, runBootstrapWithDeps(APIServerParams{}, state, deps), "background failures should not bubble up") + require.True(t, state.IsReady(), "expected state Ready despite background failure") +} + +// TestRunBootstrap_BackgroundRetriesUntilSuccess verifies that the +// orchestrator keeps retrying bootstrapBackground after transient +// failures rather than swallowing them. Without this retry, a momentary +// hiccup on the first background attempt (e.g. heartbeat cron init, +// upstream license sync) would silently disable the heartbeat job and +// every subsequent license refresh for the entire pod lifetime. +func TestRunBootstrap_BackgroundRetriesUntilSuccess(t *testing.T) { + state := startupstate.New() + var bgCalls atomic.Int32 + + deps := fastDeps( + func(APIServerParams) error { return nil }, + func(APIServerParams) error { + n := bgCalls.Add(1) + if n < 3 { + return errors.New("transient") + } + return nil + }, + ) + + require.NoError(t, runBootstrapWithDeps(APIServerParams{}, state, deps)) + require.True(t, state.IsReady()) + require.GreaterOrEqual(t, bgCalls.Load(), int32(3), "must retry transient background failures") +} + +// TestRunBootstrap_BackgroundGivesUpAfterMaxRetries verifies the retry +// loop terminates in finite time when bootstrapBackground returns +// transient errors that never resolve. bootstrapBackground in production +// returns stderrors.Join(errs...), which strips backoff.Permanent +// wrapping from inner steps; without an explicit max-retries cap, the +// loop would log-spam every retryInterval forever and the give-up +// branch in runBootstrapWithDeps would be dead code. This test exercises +// the cap. +func TestRunBootstrap_BackgroundGivesUpAfterMaxRetries(t *testing.T) { + state := startupstate.New() + var bgCalls atomic.Int32 + + deps := fastDeps( + func(APIServerParams) error { return nil }, + func(APIServerParams) error { + bgCalls.Add(1) + // Plain transient error — never wrapped in + // backoff.Permanent. Without WithMaxRetries this + // would retry forever. + return errors.New("persistent transient failure") + }, + ) + deps.bgMaxRetries = 3 + + require.NoError(t, runBootstrapWithDeps(APIServerParams{}, state, deps), + "background give-up must not bubble up — pod stays Ready") + require.True(t, state.IsReady()) + // WithMaxRetries(b, n) allows n retries AFTER the initial attempt, + // so total calls is n+1. + require.Equal(t, int32(deps.bgMaxRetries+1), bgCalls.Load(), + "loop must terminate after bgMaxRetries+1 total attempts, not retry forever") +} + +// TestRunBootstrap_CriticalFailsAfterDeadline_StaysReadyButSkipsBackground +// pins the deliberate "false-readiness over Ready→crash flap" trade-off +// in runBootstrapWithDeps: when bootstrapCritical outlasts the readiness +// timer (so the pod has already been marked Ready) and then ultimately +// returns a permanent error, the pod stays Ready and the background +// phase is intentionally skipped. The orchestrator returns nil because +// flipping back to Failed → log.Fatalf would produce a +// Ready→crash→restart→Ready→crash loop where every cycle briefly exposes +// an under-initialized store to traffic. +// +// A future change that "fixes" this perceived false-Ready by transitioning +// to Failed, by running background anyway, or by returning a non-nil +// error from runBootstrap would silently alter a documented contract. +// This test should fail in that case so the change is conscious. +func TestRunBootstrap_CriticalFailsAfterDeadline_StaysReadyButSkipsBackground(t *testing.T) { + state := startupstate.New() + var bgRan atomic.Bool + var criticalCalls atomic.Int32 + + criticalSlowThenPermanent := func(APIServerParams) error { + n := criticalCalls.Add(1) + if n == 1 { + // First attempt: sleep past the deadline so the timer + // fires and the orchestrator marks Ready before we + // return. The returned error must be transient so + // RetryNotify schedules a second attempt. + time.Sleep(40 * time.Millisecond) + return errors.New("transient first failure outlasting deadline") + } + // Second attempt: permanent so RetryNotify gives up and the + // orchestrator observes a non-nil criticalErr post-deadline. + return backoff.Permanent(errors.New("license is unrecoverable")) + } + + deps := fastDeps( + criticalSlowThenPermanent, + func(APIServerParams) error { bgRan.Store(true); return nil }, + ) + deps.deadline = 30 * time.Millisecond + + require.NoError(t, runBootstrapWithDeps(APIServerParams{}, state, deps), + "post-deadline critical failure must not bubble up — pod stays Ready") + require.True(t, state.IsReady(), + "pod must stay Ready after the Ready→fail transition; flipping to Failed would cause a Ready→crash flap") + require.False(t, bgRan.Load(), + "background must be skipped when critical permanently fails after the readiness deadline") +} diff --git a/pkg/apiserver/testdata_test.go b/pkg/apiserver/testdata_test.go new file mode 100644 index 00000000..9159c89e --- /dev/null +++ b/pkg/apiserver/testdata_test.go @@ -0,0 +1,57 @@ +package apiserver + +// validLicenseYAML is a signed test license that round-trips through the +// embedded licensewrapper public key. Sourced from +// pkg/license/signature_test.go's "basic valid signature" fixture. +// +// We carry our own copy so the apiserver test suite can verify +// loadAndVerifyLicense end-to-end without importing test code from another +// package. +const validLicenseYAML = `apiVersion: kots.io/v1beta1 +kind: License +metadata: + name: testcustomer +spec: + appSlug: my-app + channelID: 1vusIYZLAVxMG6q760OJmRKj5i5 + channelName: My Channel + customerName: Test Customer + endpoint: https://replicated.app + entitlements: + bool_field: + title: Bool Field + value: true + valueType: Boolean + expires_at: + description: License Expiration + title: Expiration + value: "2030-07-27T00:00:00Z" + valueType: String + hidden_field: + isHidden: true + title: Hidden Field + value: this is secret + valueType: String + int_field: + title: Int Field + value: 123 + valueType: Integer + string_field: + title: StringField + value: single line text + valueType: String + text_field: + title: Text Field + value: |- + multi + line + text + valueType: Text + isAirgapSupported: true + isGitOpsSupported: true + isSnapshotSupported: true + licenseID: 1vusOokxAVp1tkRGuyxnF23PJcq + licenseSequence: 7 + licenseType: prod + signature: eyJsaWNlbnNlRGF0YSI6ImV5SmhjR2xXWlhKemFXOXVJam9pYTI5MGN5NXBieTkyTVdKbGRHRXhJaXdpYTJsdVpDSTZJa3hwWTJWdWMyVWlMQ0p0WlhSaFpHRjBZU0k2ZXlKdVlXMWxJam9pZEdWemRHTjFjM1J2YldWeUluMHNJbk53WldNaU9uc2liR2xqWlc1elpVbEVJam9pTVhaMWMwOXZhM2hCVm5BeGRHdFNSM1Y1ZUc1R01qTlFTbU54SWl3aWJHbGpaVzV6WlZSNWNHVWlPaUp3Y205a0lpd2lZM1Z6ZEc5dFpYSk9ZVzFsSWpvaVZHVnpkQ0JEZFhOMGIyMWxjaUlzSW1Gd2NGTnNkV2NpT2lKdGVTMWhjSEFpTENKamFHRnVibVZzU1VRaU9pSXhkblZ6U1ZsYVRFRldlRTFITm5FM05qQlBTbTFTUzJvMWFUVWlMQ0pqYUdGdWJtVnNUbUZ0WlNJNklrMTVJRU5vWVc1dVpXd2lMQ0pzYVdObGJuTmxVMlZ4ZFdWdVkyVWlPamNzSW1WdVpIQnZhVzUwSWpvaWFIUjBjSE02THk5eVpYQnNhV05oZEdWa0xtRndjQ0lzSW1WdWRHbDBiR1Z0Wlc1MGN5STZleUppYjI5c1gyWnBaV3hrSWpwN0luUnBkR3hsSWpvaVFtOXZiQ0JHYVdWc1pDSXNJblpoYkhWbElqcDBjblZsTENKMllXeDFaVlI1Y0dVaU9pSkNiMjlzWldGdUluMHNJbVY0Y0dseVpYTmZZWFFpT25zaWRHbDBiR1VpT2lKRmVIQnBjbUYwYVc5dUlpd2laR1Z6WTNKcGNIUnBiMjRpT2lKTWFXTmxibk5sSUVWNGNHbHlZWFJwYjI0aUxDSjJZV3gxWlNJNklqSXdNekF0TURjdE1qZFVNREE2TURBNk1EQmFJaXdpZG1Gc2RXVlVlWEJsSWpvaVUzUnlhVzVuSW4wc0ltaHBaR1JsYmw5bWFXVnNaQ0k2ZXlKMGFYUnNaU0k2SWtocFpHUmxiaUJHYVdWc1pDSXNJblpoYkhWbElqb2lkR2hwY3lCcGN5QnpaV055WlhRaUxDSjJZV3gxWlZSNWNHVWlPaUpUZEhKcGJtY2lMQ0pwYzBocFpHUmxiaUk2ZEhKMVpYMHNJbWx1ZEY5bWFXVnNaQ0k2ZXlKMGFYUnNaU0k2SWtsdWRDQkdhV1ZzWkNJc0luWmhiSFZsSWpveE1qTXNJblpoYkhWbFZIbHdaU0k2SWtsdWRHVm5aWElpZlN3aWMzUnlhVzVuWDJacFpXeGtJanA3SW5ScGRHeGxJam9pVTNSeWFXNW5SbWxsYkdRaUxDSjJZV3gxWlNJNkluTnBibWRzWlNCc2FXNWxJSFJsZUhRaUxDSjJZV3gxWlZSNWNHVWlPaUpUZEhKcGJtY2lmU3dpZEdWNGRGOW1hV1ZzWkNJNmV5SjBhWFJzWlNJNklsUmxlSFFnUm1sbGJHUWlMQ0oyWVd4MVpTSTZJbTExYkhScFhHNXNhVzVsWEc1MFpYaDBJaXdpZG1Gc2RXVlVlWEJsSWpvaVZHVjRkQ0o5ZlN3aWFYTkJhWEpuWVhCVGRYQndiM0owWldRaU9uUnlkV1VzSW1selIybDBUM0J6VTNWd2NHOXlkR1ZrSWpwMGNuVmxMQ0pwYzFOdVlYQnphRzkwVTNWd2NHOXlkR1ZrSWpwMGNuVmxmWDA9IiwiaW5uZXJTaWduYXR1cmUiOiJleUpzYVdObGJuTmxVMmxuYm1GMGRYSmxJam9pYUhneE1XTXZUR1ozUTNoVE5YRmtRWEJGU1hGdVRrMU9NMHBLYTJzNFZHZFhSVVpzVDFKVlJ6UjJjR1YzZEZoV1YzbG1lamRZY0hBd1ExazJZamRyUVRSS2N6TklhR3d3YkZJMFdUQTFMemN2UVVkQ2FEZFZNSGczUkhaTVozUXpVM00wYm5GTFZTdFhXRXBTVHpKWVFVRnZSME4xZFRWR1RGcHJRVWhYY1RSUVFtMXphSFY2Y1ZsdmNucHhlbGhGWVZWVlpFUlVkVXhDTW1nNWFIZ3dXRWhQUmxwUk16bHVkbTlPUjJaT2R5OTRTVmRaZEhSUGRYZHZhMncyTVZsb1JVeFZlRmQxU1ZSRmMwTlVhM2xtTVRNd09IazVSbFJzWlRKeVYyZEVlSEZNYTBSUFNXVXlPRWwzUzJSQkwySXdWVUl5VEZGbVRWcHdWemwyUTNCSkwybHlWek5uYmpaeU5WWjNWMjB2U1dweWJtNDNSelJrVmpadVYzcFRkMGhQUTJSdWEwMTRNRXQ1VVVOa0wxQjFaWEpUYjNSdVEwOXRTMDEzWlRSTGJqaERkMU5YVVRRNGRURkRNbTFpV1VzeGRYTlpOM1YzUFQwaUxDSndkV0pzYVdOTFpYa2lPaUl0TFMwdExVSkZSMGxPSUZCVlFreEpReUJMUlZrdExTMHRMVnh1VFVsSlFrbHFRVTVDWjJ0eGFHdHBSemwzTUVKQlVVVkdRVUZQUTBGUk9FRk5TVWxDUTJkTFEwRlJSVUZ6TkhKdlVIcDFhV1JNZVhOMmIxWTJkemxhTkZ4dVdHRmliME5tWTJNeGFHZFZhQ3N3V1VkS2NFNURSVXhyTjBaTFF5OTJhemR6ZERsR05tY3dUMjlrU0VSbGVYZFJXa2hLZFU1TVpsUnNRbEJHUTJOaU5seHVObTlzVEZOeWNGQTRjbFUzU0d4SGJsRkVSMFJNYVhkS1EyaGtSRGRVVUdSM2FXdHBkMHRGY201aldqaEdaalZsU25vd2RETmlUWFpyVDJaVVluSkJiRnh1WWtGQ1kwbzVNVmxVT1hKdVVXOXFkVWN4UldKUVRqaEZWblI2TWxZNE5IZHViR2Q0TUhCd2JEVjRPSFpOYlhwcE1ISnVibEZVV1VGamJ6WnFhMnBJTTF4dVRuTlVkWE4xUzFkdlJGUjVNWE5yZGtSUk9IbEJZV0ptWTNNME4zWnNRazAwU0RGT1JFNHZSSFJhWWxZdllubDJia0o2YkM4eFZrVnpURmRqWlZWcFRGeHVSWEYxT0VkeWF5dFFVRGQyUkdSd2JFUjNjWFpQV2t4RmRYazNkamhuUm01U09WUlVSV3ByTlVvNWRuWlVTR2RtU25VemVubEVPR2xLWTBSRE5YcHFPVnh1YjFGSlJFRlJRVUpjYmkwdExTMHRSVTVFSUZCVlFreEpReUJMUlZrdExTMHRMVnh1SWl3aWEyVjVVMmxuYm1GMGRYSmxJam9pWlhsS2VtRlhaSFZaV0ZJeFkyMVZhVTlwU2pCUldIQjJXVE5LVms1NmFGaFNSMlJzVVRKb2NtTklXa1ZVVlRsRldqQktXVTFGUmtaVFJFNUZVMGhLYkUxclRUTkxNSEJFVkROR2VGTnROVVJVVlRWVlltMDFiVnBGUm5sWldIQjZaRVJqTVZaSGFFeFBXRUpVVWtacmRrd3diek5aTUZaSlVteFdWRXd5T1VoV1JXeHNWa1ZPTUZSSE1WWlJNR04zVkd4R2JGa3pTblJUUm1zMFZVWk9hMVpWU2pCVU1WbDNZbXQwY0ZSclZuQmpia0poVFZjNWFtSldiSEZaYTNob1UyeHNWV0pGUmtWWGJVWnZWakZLVUZkcWJGSmhXRVp1V2xkb1EyRnVRak5TUjNNd1lWWkpOVTVXVmxkV1ZUVnlUMGhLYjFsVlRYbGhiVGcwVjBkYWVGbHFWbFppYlhoeFpFWkZkMDU1Y3pCaFZsSkpWRVpPTm1WRk1IcGxWWFJ2VFVaR1ZtRXdWVFJSVnpsSFVsaEtVRTFZUmxCU01WcFJVMVJDTmxsV2FIcFdWWEJ0WTBSU2JFMVVRazlPVjNSU1ZucFdUMU5XWTNaU1ZYUkZVMGhzYlU5VmJGaGtNMUl3WTFWc1lXTlhSakJTYTA1RVlVWmtjbUo2VmtSU00wSllUREkxUmsxWVl6SmxWM1JKVlZoQk1sVXhTbEppU0Zwd1VrVXdNRlpFVWt0VU1rWnNVVmQwYzFSV1VrMVVWV055V1RCYVRHSXpaRTlUVm05NVlraE9SR1JzVG5aUmFrWmFaVmRPVGxOVlNteGFiRXB1Wld0U2RVMHhSVGxRVTBselNXMWtjMkl5U21oaVJYUnNaVlZzYTBscWIybFpiVkpzV2xSVk1rNVVXWGRaTWxwcFRrUk9hazlYU1hsUFIwcHRUMVJvYkZsWFRtaGFiVVV5VGtSWmFXWlJQVDBpZlE9PSJ9 +` diff --git a/pkg/handlers/healthz.go b/pkg/handlers/healthz.go index 37b19969..31d2945d 100644 --- a/pkg/handlers/healthz.go +++ b/pkg/handlers/healthz.go @@ -2,18 +2,50 @@ package handlers import ( "net/http" + "sync/atomic" "github.com/replicatedhq/replicated-sdk/pkg/buildversion" + "github.com/replicatedhq/replicated-sdk/pkg/startupstate" ) type HealthzResponse struct { Version string `json:"version"` + Status string `json:"status,omitempty"` +} + +// startupTracker is a package-level pointer to the bootstrap-state tracker. +// apiserver.Start() installs the tracker via SetStartupState before serving. +// We use atomic.Pointer so reads from /healthz are safe regardless of when +// the tracker is installed. +var startupTracker atomic.Pointer[startupstate.Tracker] + +// SetStartupState installs the bootstrap-state tracker for /healthz to consult. +// Pass nil to clear (used by tests). +func SetStartupState(t *startupstate.Tracker) { + startupTracker.Store(t) } func Healthz(w http.ResponseWriter, r *http.Request) { - healthzResponse := HealthzResponse{ + t := startupTracker.Load() + // Fail closed: if no tracker is installed, treat the SDK as still + // starting. Production wiring always installs a tracker before the + // listener accepts traffic, so reaching this branch indicates either a + // misconfigured caller or a regression — either way, 503 is the safe + // answer. + state := startupstate.Starting + if t != nil { + state = t.Get() + } + + resp := HealthzResponse{ Version: buildversion.Version(), + Status: state.String(), } - JSON(w, http.StatusOK, healthzResponse) + switch state { + case startupstate.Ready: + JSON(w, http.StatusOK, resp) + default: + JSON(w, http.StatusServiceUnavailable, resp) + } } diff --git a/pkg/handlers/healthz_test.go b/pkg/handlers/healthz_test.go new file mode 100644 index 00000000..e29a7b98 --- /dev/null +++ b/pkg/handlers/healthz_test.go @@ -0,0 +1,44 @@ +package handlers + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/replicatedhq/replicated-sdk/pkg/startupstate" + "github.com/stretchr/testify/require" +) + +func TestHealthz_Starting_Returns503(t *testing.T) { + tr := startupstate.New() + SetStartupState(tr) + t.Cleanup(func() { SetStartupState(nil) }) + + w := httptest.NewRecorder() + r := httptest.NewRequest(http.MethodGet, "/healthz", nil) + Healthz(w, r) + + require.Equal(t, http.StatusServiceUnavailable, w.Code) + + var body HealthzResponse + require.NoError(t, json.Unmarshal(w.Body.Bytes(), &body)) + require.Equal(t, "starting", body.Status) +} + +func TestHealthz_Ready_Returns200(t *testing.T) { + tr := startupstate.New() + tr.MarkReady() + SetStartupState(tr) + t.Cleanup(func() { SetStartupState(nil) }) + + w := httptest.NewRecorder() + r := httptest.NewRequest(http.MethodGet, "/healthz", nil) + Healthz(w, r) + + require.Equal(t, http.StatusOK, w.Code) + + var body HealthzResponse + require.NoError(t, json.Unmarshal(w.Body.Bytes(), &body)) + require.Equal(t, "ready", body.Status) +} diff --git a/pkg/heartbeat/heartbeat.go b/pkg/heartbeat/heartbeat.go index 4b13c910..8dee525d 100644 --- a/pkg/heartbeat/heartbeat.go +++ b/pkg/heartbeat/heartbeat.go @@ -61,6 +61,12 @@ func Start() error { } } + // Clientset acquisition is scoped to the report call that needs + // it. License sync above no longer needs a clientset (it talks + // directly to the upstream Vendor Portal), so a transient k8s + // API hiccup must not gate it. Hoisting GetClientset above the + // license sync would couple the two and silently regress the + // per-tick resilience of license refresh. go func() { clientset, err := k8sutil.GetClientset() if err != nil { diff --git a/pkg/license/testdata_test.go b/pkg/license/testdata_test.go new file mode 100644 index 00000000..897a5300 --- /dev/null +++ b/pkg/license/testdata_test.go @@ -0,0 +1,54 @@ +package license + +// validLicenseYAML is a signed test license that round-trips through the +// embedded licensewrapper public key. Sourced from signature_test.go's +// "basic valid signature" fixture, exposed as a constant here so the +// sync_test.go suite can reuse it. +const validLicenseYAML = `apiVersion: kots.io/v1beta1 +kind: License +metadata: + name: testcustomer +spec: + appSlug: my-app + channelID: 1vusIYZLAVxMG6q760OJmRKj5i5 + channelName: My Channel + customerName: Test Customer + endpoint: https://replicated.app + entitlements: + bool_field: + title: Bool Field + value: true + valueType: Boolean + expires_at: + description: License Expiration + title: Expiration + value: "2030-07-27T00:00:00Z" + valueType: String + hidden_field: + isHidden: true + title: Hidden Field + value: this is secret + valueType: String + int_field: + title: Int Field + value: 123 + valueType: Integer + string_field: + title: StringField + value: single line text + valueType: String + text_field: + title: Text Field + value: |- + multi + line + text + valueType: Text + isAirgapSupported: true + isGitOpsSupported: true + isSnapshotSupported: true + licenseID: 1vusOokxAVp1tkRGuyxnF23PJcq + licenseSequence: 7 + licenseType: prod + signature: eyJsaWNlbnNlRGF0YSI6ImV5SmhjR2xXWlhKemFXOXVJam9pYTI5MGN5NXBieTkyTVdKbGRHRXhJaXdpYTJsdVpDSTZJa3hwWTJWdWMyVWlMQ0p0WlhSaFpHRjBZU0k2ZXlKdVlXMWxJam9pZEdWemRHTjFjM1J2YldWeUluMHNJbk53WldNaU9uc2liR2xqWlc1elpVbEVJam9pTVhaMWMwOXZhM2hCVm5BeGRHdFNSM1Y1ZUc1R01qTlFTbU54SWl3aWJHbGpaVzV6WlZSNWNHVWlPaUp3Y205a0lpd2lZM1Z6ZEc5dFpYSk9ZVzFsSWpvaVZHVnpkQ0JEZFhOMGIyMWxjaUlzSW1Gd2NGTnNkV2NpT2lKdGVTMWhjSEFpTENKamFHRnVibVZzU1VRaU9pSXhkblZ6U1ZsYVRFRldlRTFITm5FM05qQlBTbTFTUzJvMWFUVWlMQ0pqYUdGdWJtVnNUbUZ0WlNJNklrMTVJRU5vWVc1dVpXd2lMQ0pzYVdObGJuTmxVMlZ4ZFdWdVkyVWlPamNzSW1WdVpIQnZhVzUwSWpvaWFIUjBjSE02THk5eVpYQnNhV05oZEdWa0xtRndjQ0lzSW1WdWRHbDBiR1Z0Wlc1MGN5STZleUppYjI5c1gyWnBaV3hrSWpwN0luUnBkR3hsSWpvaVFtOXZiQ0JHYVdWc1pDSXNJblpoYkhWbElqcDBjblZsTENKMllXeDFaVlI1Y0dVaU9pSkNiMjlzWldGdUluMHNJbVY0Y0dseVpYTmZZWFFpT25zaWRHbDBiR1VpT2lKRmVIQnBjbUYwYVc5dUlpd2laR1Z6WTNKcGNIUnBiMjRpT2lKTWFXTmxibk5sSUVWNGNHbHlZWFJwYjI0aUxDSjJZV3gxWlNJNklqSXdNekF0TURjdE1qZFVNREE2TURBNk1EQmFJaXdpZG1Gc2RXVlVlWEJsSWpvaVUzUnlhVzVuSW4wc0ltaHBaR1JsYmw5bWFXVnNaQ0k2ZXlKMGFYUnNaU0k2SWtocFpHUmxiaUJHYVdWc1pDSXNJblpoYkhWbElqb2lkR2hwY3lCcGN5QnpaV055WlhRaUxDSjJZV3gxWlZSNWNHVWlPaUpUZEhKcGJtY2lMQ0pwYzBocFpHUmxiaUk2ZEhKMVpYMHNJbWx1ZEY5bWFXVnNaQ0k2ZXlKMGFYUnNaU0k2SWtsdWRDQkdhV1ZzWkNJc0luWmhiSFZsSWpveE1qTXNJblpoYkhWbFZIbHdaU0k2SWtsdWRHVm5aWElpZlN3aWMzUnlhVzVuWDJacFpXeGtJanA3SW5ScGRHeGxJam9pVTNSeWFXNW5SbWxsYkdRaUxDSjJZV3gxWlNJNkluTnBibWRzWlNCc2FXNWxJSFJsZUhRaUxDSjJZV3gxWlZSNWNHVWlPaUpUZEhKcGJtY2lmU3dpZEdWNGRGOW1hV1ZzWkNJNmV5SjBhWFJzWlNJNklsUmxlSFFnUm1sbGJHUWlMQ0oyWVd4MVpTSTZJbTExYkhScFhHNXNhVzVsWEc1MFpYaDBJaXdpZG1Gc2RXVlVlWEJsSWpvaVZHVjRkQ0o5ZlN3aWFYTkJhWEpuWVhCVGRYQndiM0owWldRaU9uUnlkV1VzSW1selIybDBUM0J6VTNWd2NHOXlkR1ZrSWpwMGNuVmxMQ0pwYzFOdVlYQnphRzkwVTNWd2NHOXlkR1ZrSWpwMGNuVmxmWDA9IiwiaW5uZXJTaWduYXR1cmUiOiJleUpzYVdObGJuTmxVMmxuYm1GMGRYSmxJam9pYUhneE1XTXZUR1ozUTNoVE5YRmtRWEJGU1hGdVRrMU9NMHBLYTJzNFZHZFhSVVpzVDFKVlJ6UjJjR1YzZEZoV1YzbG1lamRZY0hBd1ExazJZamRyUVRSS2N6TklhR3d3YkZJMFdUQTFMemN2UVVkQ2FEZFZNSGczUkhaTVozUXpVM00wYm5GTFZTdFhXRXBTVHpKWVFVRnZSME4xZFRWR1RGcHJRVWhYY1RSUVFtMXphSFY2Y1ZsdmNucHhlbGhGWVZWVlpFUlVkVXhDTW1nNWFIZ3dXRWhQUmxwUk16bHVkbTlPUjJaT2R5OTRTVmRaZEhSUGRYZHZhMncyTVZsb1JVeFZlRmQxU1ZSRmMwTlVhM2xtTVRNd09IazVSbFJzWlRKeVYyZEVlSEZNYTBSUFNXVXlPRWwzUzJSQkwySXdWVUl5VEZGbVRWcHdWemwyUTNCSkwybHlWek5uYmpaeU5WWjNWMjB2U1dweWJtNDNSelJrVmpadVYzcFRkMGhQUTJSdWEwMTRNRXQ1VVVOa0wxQjFaWEpUYjNSdVEwOXRTMDEzWlRSTGJqaERkMU5YVVRRNGRURkRNbTFpV1VzeGRYTlpOM1YzUFQwaUxDSndkV0pzYVdOTFpYa2lPaUl0TFMwdExVSkZSMGxPSUZCVlFreEpReUJMUlZrdExTMHRMVnh1VFVsSlFrbHFRVTVDWjJ0eGFHdHBSemwzTUVKQlVVVkdRVUZQUTBGUk9FRk5TVWxDUTJkTFEwRlJSVUZ6TkhKdlVIcDFhV1JNZVhOMmIxWTJkemxhTkZ4dVdHRmliME5tWTJNeGFHZFZhQ3N3V1VkS2NFNURSVXhyTjBaTFF5OTJhemR6ZERsR05tY3dUMjlrU0VSbGVYZFJXa2hLZFU1TVpsUnNRbEJHUTJOaU5seHVObTlzVEZOeWNGQTRjbFUzU0d4SGJsRkVSMFJNYVhkS1EyaGtSRGRVVUdSM2FXdHBkMHRGY201aldqaEdaalZsU25vd2RETmlUWFpyVDJaVVluSkJiRnh1WWtGQ1kwbzVNVmxVT1hKdVVXOXFkVWN4UldKUVRqaEZWblI2TWxZNE5IZHViR2Q0TUhCd2JEVjRPSFpOYlhwcE1ISnVibEZVV1VGamJ6WnFhMnBJTTF4dVRuTlVkWE4xUzFkdlJGUjVNWE5yZGtSUk9IbEJZV0ptWTNNME4zWnNRazAwU0RGT1JFNHZSSFJhWWxZdllubDJia0o2YkM4eFZrVnpURmRqWlZWcFRGeHVSWEYxT0VkeWF5dFFVRGQyUkdSd2JFUjNjWFpQV2t4RmRYazNkamhuUm01U09WUlVSV3ByTlVvNWRuWlVTR2RtU25VemVubEVPR2xLWTBSRE5YcHFPVnh1YjFGSlJFRlJRVUpjYmkwdExTMHRSVTVFSUZCVlFreEpReUJMUlZrdExTMHRMVnh1SWl3aWEyVjVVMmxuYm1GMGRYSmxJam9pWlhsS2VtRlhaSFZaV0ZJeFkyMVZhVTlwU2pCUldIQjJXVE5LVms1NmFGaFNSMlJzVVRKb2NtTklXa1ZVVlRsRldqQktXVTFGUmtaVFJFNUZVMGhLYkUxclRUTkxNSEJFVkROR2VGTnROVVJVVlRWVlltMDFiVnBGUm5sWldIQjZaRVJqTVZaSGFFeFBXRUpVVWtacmRrd3diek5aTUZaSlVteFdWRXd5T1VoV1JXeHNWa1ZPTUZSSE1WWlJNR04zVkd4R2JGa3pTblJUUm1zMFZVWk9hMVpWU2pCVU1WbDNZbXQwY0ZSclZuQmpia0poVFZjNWFtSldiSEZaYTNob1UyeHNWV0pGUmtWWGJVWnZWakZLVUZkcWJGSmhXRVp1V2xkb1EyRnVRak5TUjNNd1lWWkpOVTVXVmxkV1ZUVnlUMGhLYjFsVlRYbGhiVGcwVjBkYWVGbHFWbFppYlhoeFpFWkZkMDU1Y3pCaFZsSkpWRVpPTm1WRk1IcGxWWFJ2VFVaR1ZtRXdWVFJSVnpsSFVsaEtVRTFZUmxCU01WcFJVMVJDTmxsV2FIcFdWWEJ0WTBSU2JFMVVRazlPVjNSU1ZucFdUMU5XWTNaU1ZYUkZVMGhzYlU5VmJGaGtNMUl3WTFWc1lXTlhSakJTYTA1RVlVWmtjbUo2VmtSU00wSllUREkxUmsxWVl6SmxWM1JKVlZoQk1sVXhTbEppU0Zwd1VrVXdNRlpFVWt0VU1rWnNVVmQwYzFSV1VrMVVWV055V1RCYVRHSXpaRTlUVm05NVlraE9SR1JzVG5aUmFrWmFaVmRPVGxOVlNteGFiRXB1Wld0U2RVMHhSVGxRVTBselNXMWtjMkl5U21oaVJYUnNaVlZzYTBscWIybFpiVkpzV2xSVk1rNVVXWGRaTWxwcFRrUk9hazlYU1hsUFIwcHRUMVJvYkZsWFRtaGFiVVV5VGtSWmFXWlJQVDBpZlE9PSJ9 +` diff --git a/pkg/startupstate/startupstate.go b/pkg/startupstate/startupstate.go new file mode 100644 index 00000000..d7ac2242 --- /dev/null +++ b/pkg/startupstate/startupstate.go @@ -0,0 +1,93 @@ +// Package startupstate exposes a shared, concurrency-safe enum that tracks the +// lifecycle of the SDK's bootstrap process. It is consumed by the apiserver +// (which transitions the state) and by the /healthz handler (which reports the +// state to readiness probes). +// +// The package lives outside both apiserver and handlers to avoid an import +// cycle between them. +package startupstate + +import "sync/atomic" + +// State enumerates the bootstrap lifecycle phases. +type State int32 + +const ( + // Starting indicates the API server is up but bootstrapCritical has + // not yet completed. + Starting State = iota + // Ready indicates the SDK is ready to serve requests. With the default + // configuration this fires after bootstrapCritical succeeds or after + // the readiness timeout elapses, whichever comes first. + Ready + // Failed indicates bootstrap encountered a permanent (non-retryable) + // error. /healthz will surface 503 in this state and the process will + // typically exit shortly after. + Failed +) + +// String returns a stable, lower-case representation suitable for logs and +// JSON payloads. +func (s State) String() string { + switch s { + case Starting: + return "starting" + case Ready: + return "ready" + case Failed: + return "failed" + default: + return "unknown" + } +} + +// Tracker holds the current bootstrap state. The zero value is usable and +// reports Starting. +type Tracker struct { + state atomic.Int32 +} + +// New returns a Tracker initialized to Starting. +func New() *Tracker { + return &Tracker{} +} + +// Get returns the current state. +func (t *Tracker) Get() State { + return State(t.state.Load()) +} + +// Set unconditionally writes the supplied state. +func (t *Tracker) Set(s State) { + t.state.Store(int32(s)) +} + +// MarkReady transitions the tracker to Ready unless it is already Failed. +// Returns true if the call caused the transition. +// +// Failed is sticky: once a permanent bootstrap error has been recorded we +// never silently flip back to Ready. +func (t *Tracker) MarkReady() bool { + for { + current := State(t.state.Load()) + if current == Failed { + return false + } + if current == Ready { + return false + } + if t.state.CompareAndSwap(int32(current), int32(Ready)) { + return true + } + } +} + +// MarkFailed transitions the tracker to Failed. Failed is terminal. +func (t *Tracker) MarkFailed() { + t.state.Store(int32(Failed)) +} + +// IsReady is a convenience that returns true iff the current state is Ready. +func (t *Tracker) IsReady() bool { + return t.Get() == Ready +} diff --git a/pkg/startupstate/startupstate_test.go b/pkg/startupstate/startupstate_test.go new file mode 100644 index 00000000..3af9b3fa --- /dev/null +++ b/pkg/startupstate/startupstate_test.go @@ -0,0 +1,43 @@ +package startupstate + +import ( + "sync" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestMarkFailed_BlocksLaterMarkReady(t *testing.T) { + tr := New() + tr.MarkFailed() + + require.False(t, tr.MarkReady(), "MarkReady after MarkFailed must not transition") + require.Equal(t, Failed, tr.Get(), "Failed state must be sticky") +} + +func TestConcurrentMarkReady(t *testing.T) { + tr := New() + const goroutines = 64 + + var wg sync.WaitGroup + transitions := make(chan bool, goroutines) + wg.Add(goroutines) + for i := 0; i < goroutines; i++ { + go func() { + defer wg.Done() + transitions <- tr.MarkReady() + }() + } + wg.Wait() + close(transitions) + + count := 0 + for ok := range transitions { + if ok { + count++ + } + } + + require.Equal(t, 1, count, "exactly one MarkReady call should report a transition") + require.True(t, tr.IsReady(), "expected Ready after concurrent MarkReady") +} diff --git a/pkg/store/memory_store.go b/pkg/store/memory_store.go index 7eefd516..cbe81258 100644 --- a/pkg/store/memory_store.go +++ b/pkg/store/memory_store.go @@ -2,18 +2,30 @@ package store import ( "strings" + "sync" + licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" appstatetypes "github.com/replicatedhq/replicated-sdk/pkg/appstate/types" licensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" + "github.com/replicatedhq/replicated-sdk/pkg/logger" upstreamtypes "github.com/replicatedhq/replicated-sdk/pkg/upstream/types" - licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" ) +// InMemoryStore is the SDK's process-local source of truth for license and +// app state. +// +// Concurrency: fields written outside of InitInMemory are protected by mu +// (an RWMutex). The fields broken out below as "set once at init" are +// written exclusively by InitInMemory, which runs single-threaded before +// the HTTP listener accepts traffic, so they intentionally do not take +// the lock. Adding a setter for any of those fields requires moving it +// under the mu-protected group. type InMemoryStore struct { + mu sync.RWMutex + + // Set once at init (no locking required for read). replicatedID string appID string - license licensewrapper.LicenseWrapper - licenseFields licensetypes.LicenseFields appSlug string appName string channelID string @@ -26,12 +38,18 @@ type InMemoryStore struct { replicatedAppEndpoint string releaseImages []string namespace string - appStatus appstatetypes.AppStatus - updates []upstreamtypes.ChannelRelease + reportAllImages bool + readOnlyMode bool + + // Mutated at runtime by background goroutines (heartbeat, appstate + // operator) and request handlers concurrently with each other. All + // reads and writes of these fields go through mu. + license licensewrapper.LicenseWrapper + licenseFields licensetypes.LicenseFields + appStatus appstatetypes.AppStatus + updates []upstreamtypes.ChannelRelease // podImages holds namespace -> podUID -> []ImageInfo - podImages map[string]map[string][]appstatetypes.ImageInfo - reportAllImages bool - readOnlyMode bool + podImages map[string]map[string][]appstatetypes.ImageInfo } type InitInMemoryStoreOptions struct { @@ -86,41 +104,90 @@ func (s *InMemoryStore) GetAppID() string { } func (s *InMemoryStore) GetLicense() licensewrapper.LicenseWrapper { + s.mu.RLock() + defer s.mu.RUnlock() return s.license } func (s *InMemoryStore) SetLicense(license licensewrapper.LicenseWrapper) { + s.mu.Lock() + defer s.mu.Unlock() // DeepCopy appropriate version if license.V1 != nil { s.license = licensewrapper.LicenseWrapper{ V1: license.V1.DeepCopy(), } - } else if license.V2 != nil { + return + } + if license.V2 != nil { s.license = licensewrapper.LicenseWrapper{ V2: license.V2.DeepCopy(), } + return } -} - + // Both V1 and V2 are nil — the previous license is preserved + // rather than overwritten with a zero value. Production callers go + // through loadAndVerifyLicense / GetLatestLicense which never + // return an empty wrapper alongside a nil error, so reaching this + // branch is a contract violation worth surfacing. + logger.Warnf("InMemoryStore.SetLicense called with empty wrapper; retaining previous license") +} + +// GetLicenseFields returns a defensive copy of the license-fields map so +// callers may safely mutate the returned map (e.g. handlers.GetLicenseField +// does `delete(m, key)` and `m[key] = v` against this result, then writes +// it back via SetLicenseFields). Returning the internal map by reference +// would race with concurrent SetLicenseFields writers under mu, because +// the RLock only guards the pointer load — not the map's contents — once +// the function returns. A shallow copy is sufficient: LicenseField values +// are stored by value, and no caller mutates the inner struct fields. func (s *InMemoryStore) GetLicenseFields() licensetypes.LicenseFields { - return s.licenseFields + s.mu.RLock() + defer s.mu.RUnlock() + if s.licenseFields == nil { + return nil + } + out := make(licensetypes.LicenseFields, len(s.licenseFields)) + for k, v := range s.licenseFields { + out[k] = v + } + return out } +// SetLicenseFields stores the caller-provided map as the new authoritative +// license-fields state, replacing whatever was there before. +// +// Replace (not merge) semantics matter for two callers in particular: +// +// - handlers.GetLicenseField, which deletes a key from a copy returned by +// GetLicenseFields when the upstream Vendor Portal reports the field +// no longer exists, then calls SetLicenseFields with that copy. With +// merge semantics the deletion is silently lost. +// - handlers.GetLicenseFields, which assigns the full upstream-fetched +// map to its local variable and writes it back. With merge semantics a +// field removed upstream would persist forever in the local store. +// +// We also defensive-copy on the way in so that a caller mutating +// `licenseFields` after this call cannot race with subsequent readers. +// Combined with GetLicenseFields' returned-copy contract, this means the +// store's internal map is never aliased outside this package. func (s *InMemoryStore) SetLicenseFields(licenseFields licensetypes.LicenseFields) { - // copy by value not reference + s.mu.Lock() + defer s.mu.Unlock() if licenseFields == nil { s.licenseFields = nil return } - if s.licenseFields == nil { - s.licenseFields = licensetypes.LicenseFields{} - } + cp := make(licensetypes.LicenseFields, len(licenseFields)) for k, v := range licenseFields { - s.licenseFields[k] = v + cp[k] = v } + s.licenseFields = cp } func (s *InMemoryStore) IsDevLicense() bool { + s.mu.RLock() + defer s.mu.RUnlock() return s.license.GetLicenseType() == "dev" } @@ -173,14 +240,20 @@ func (s *InMemoryStore) GetNamespace() string { } func (s *InMemoryStore) GetAppStatus() appstatetypes.AppStatus { + s.mu.RLock() + defer s.mu.RUnlock() return s.appStatus } func (s *InMemoryStore) SetAppStatus(status appstatetypes.AppStatus) { + s.mu.Lock() + defer s.mu.Unlock() s.appStatus = status } func (s *InMemoryStore) SetPodImages(namespace string, podUID string, images []appstatetypes.ImageInfo) { + s.mu.Lock() + defer s.mu.Unlock() if s.podImages == nil { s.podImages = make(map[string]map[string][]appstatetypes.ImageInfo) } @@ -280,6 +353,8 @@ func canonicalNameTag(s string) string { } func (s *InMemoryStore) DeletePodImages(namespace string, podUID string) { + s.mu.Lock() + defer s.mu.Unlock() if s.podImages == nil { return } @@ -293,6 +368,8 @@ func (s *InMemoryStore) DeletePodImages(namespace string, podUID string) { } func (s *InMemoryStore) GetRunningImages() map[string][]string { + s.mu.RLock() + defer s.mu.RUnlock() // Aggregate image -> unique SHA set across all namespaces/pods resultSet := make(map[string]map[string]struct{}) for _, pods := range s.podImages { @@ -320,10 +397,14 @@ func (s *InMemoryStore) GetRunningImages() map[string][]string { } func (s *InMemoryStore) GetUpdates() []upstreamtypes.ChannelRelease { + s.mu.RLock() + defer s.mu.RUnlock() return s.updates } func (s *InMemoryStore) SetUpdates(updates []upstreamtypes.ChannelRelease) { + s.mu.Lock() + defer s.mu.Unlock() s.updates = updates } diff --git a/pkg/store/memory_store_test.go b/pkg/store/memory_store_test.go index 196ec38c..5ad595c8 100644 --- a/pkg/store/memory_store_test.go +++ b/pkg/store/memory_store_test.go @@ -1,9 +1,13 @@ package store import ( + "sync" "testing" + kotsv1beta1 "github.com/replicatedhq/kotskinds/apis/kots/v1beta1" + licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" appstatetypes "github.com/replicatedhq/replicated-sdk/pkg/appstate/types" + licensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" "github.com/stretchr/testify/require" ) @@ -205,3 +209,169 @@ func TestInMemoryStore_SetPodImages_PrivateRegistryRewrite_DoesNotOvermatchSimil require.Nil(t, got["registry.example.com/team/myapp/mynginx:latest"]) // not matched require.ElementsMatch(t, []string{"sha256:ok"}, got["registry.example.com/team/myapp/nginx:latest"]) // matched } + +// TestInMemoryStore_GetLicenseFields_ReturnsCopy verifies that mutating +// the map returned by GetLicenseFields does not affect the store's +// internal state. This invariant is what lets handlers like +// GetLicenseField (which `delete(m, k)` and `m[k] = v` on the result) +// run concurrently with SetLicenseFields without corrupting either +// goroutine's view of the map. +func TestInMemoryStore_GetLicenseFields_ReturnsCopy(t *testing.T) { + s := &InMemoryStore{} + s.SetLicenseFields(licensetypes.LicenseFields{ + "seats": {Name: "seats", Value: 10}, + "name": {Name: "name", Value: "acme"}, + }) + + got := s.GetLicenseFields() + require.Len(t, got, 2) + + delete(got, "seats") + got["injected"] = licensetypes.LicenseField{Name: "injected", Value: "should-not-leak"} + + stillThere := s.GetLicenseFields() + require.Len(t, stillThere, 2, "store mutated by external map edit") + require.Contains(t, stillThere, "seats", "deletion on returned map leaked into store") + require.NotContains(t, stillThere, "injected", "insertion on returned map leaked into store") +} + +func TestInMemoryStore_GetLicenseFields_NilWhenEmpty(t *testing.T) { + s := &InMemoryStore{} + require.Nil(t, s.GetLicenseFields(), "empty store should return nil, not an empty map, to preserve pre-PR semantics") +} + +// TestInMemoryStore_SetLicenseFields_ReplacesNotMerges locks in +// replace-not-merge semantics. handlers.GetLicenseField removes a key +// from a copy of the map and writes it back to indicate the upstream +// Vendor Portal has dropped that field; handlers.GetLicenseFields +// likewise overwrites with the full upstream-fetched map. With merge +// semantics, a field that was deleted on either side would persist +// forever in the local store after the next handler write. +func TestInMemoryStore_SetLicenseFields_ReplacesNotMerges(t *testing.T) { + s := &InMemoryStore{} + s.SetLicenseFields(licensetypes.LicenseFields{ + "seats": {Name: "seats", Value: 10}, + "tier": {Name: "tier", Value: "gold"}, + }) + + // Mirror handlers.GetLicenseField's removal pattern: pull a copy, + // drop a key, write it back. + m := s.GetLicenseFields() + delete(m, "seats") + s.SetLicenseFields(m) + + got := s.GetLicenseFields() + require.NotContains(t, got, "seats", "deleted key persisted because SetLicenseFields merged instead of replacing") + require.Contains(t, got, "tier") + require.Len(t, got, 1) + + // Replacing with a totally different shape must drop the old keys. + s.SetLicenseFields(licensetypes.LicenseFields{ + "newField": {Name: "newField", Value: "x"}, + }) + got = s.GetLicenseFields() + require.Len(t, got, 1) + require.Contains(t, got, "newField") + require.NotContains(t, got, "tier") +} + +// TestInMemoryStore_SetLicenseFields_DefensiveCopyOnWrite verifies the +// store doesn't alias the caller's input map. Without the defensive +// copy, a caller mutating its own variable after writing would corrupt +// what subsequent readers see. +func TestInMemoryStore_SetLicenseFields_DefensiveCopyOnWrite(t *testing.T) { + s := &InMemoryStore{} + input := licensetypes.LicenseFields{ + "seats": {Name: "seats", Value: 10}, + } + s.SetLicenseFields(input) + + input["seats"] = licensetypes.LicenseField{Name: "seats", Value: 999} + input["leak"] = licensetypes.LicenseField{Name: "leak"} + + got := s.GetLicenseFields() + require.Equal(t, 10, got["seats"].Value, "store aliased caller's input map") + require.NotContains(t, got, "leak", "store aliased caller's input map") +} + +// TestInMemoryStore_ConcurrentReadWrite exercises the runtime-mutated +// fields under simultaneous read and write pressure. Without the RWMutex +// added in Phase 2, `go test -race` flags this as a data race. +// +// The scenario mirrors production: bootstrapBackground / heartbeat / +// request handlers all calling Set* on license, license-fields, app +// status, updates, and pod images while readers (request handlers, +// reporters) call the corresponding Get*. +func TestInMemoryStore_ConcurrentReadWrite(t *testing.T) { + s := &InMemoryStore{} + + const goroutines = 32 + const iterations = 50 + + license := licensewrapper.LicenseWrapper{V1: &kotsv1beta1.License{}} + fields := licensetypes.LicenseFields{ + "f1": licensetypes.LicenseField{Title: "F1", Value: "v"}, + } + + var wg sync.WaitGroup + wg.Add(goroutines * 6) + + for i := 0; i < goroutines; i++ { + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + s.SetLicense(license) + } + }() + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + _ = s.GetLicense() + _ = s.IsDevLicense() + } + }() + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + s.SetLicenseFields(fields) + _ = s.GetLicenseFields() + } + }() + // Mirror the handlers.GetLicenseField pattern: pull the + // fields map out, mutate it locally, then push it back. With + // the pre-fix GetLicenseFields (which returned the internal + // map by reference), this races against the SetLicenseFields + // goroutine above and `go test -race` flags it. With the + // defensive copy in place, the mutation operates on a local + // map and never touches the store's internal state. + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + m := s.GetLicenseFields() + if m == nil { + m = licensetypes.LicenseFields{} + } + m["transient"] = licensetypes.LicenseField{Name: "transient"} + delete(m, "transient") + s.SetLicenseFields(m) + } + }() + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + s.SetAppStatus(appstatetypes.AppStatus{}) + _ = s.GetAppStatus() + } + }() + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + s.SetPodImages("ns", "pod", []appstatetypes.ImageInfo{{Name: "nginx", SHA: "sha256:x"}}) + _ = s.GetRunningImages() + s.DeletePodImages("ns", "pod") + } + }() + } + + wg.Wait() +} diff --git a/pkg/store/store_interface.go b/pkg/store/store_interface.go index c8fbfb5b..856153bf 100644 --- a/pkg/store/store_interface.go +++ b/pkg/store/store_interface.go @@ -1,18 +1,50 @@ package store import ( + "sync" + "sync/atomic" + + licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" appstatetypes "github.com/replicatedhq/replicated-sdk/pkg/appstate/types" licensetypes "github.com/replicatedhq/replicated-sdk/pkg/license/types" upstreamtypes "github.com/replicatedhq/replicated-sdk/pkg/upstream/types" - licensewrapper "github.com/replicatedhq/kotskinds/pkg/licensewrapper" ) -var ( - store Store +// storePtr holds the package-level Store as an atomic.Pointer so that +// handler goroutines (calling GetStore on every request) can safely +// observe stores written by bootstrapCritical's InitInMemory call. +// +// Pre-Phase-1, the HTTP listener didn't accept traffic until after +// InitInMemory had returned, so a plain interface variable was safe by +// virtue of happens-before via process startup ordering. With the +// listener-up-before-bootstrap refactor, request handlers and the +// bootstrap goroutine race on this variable: an unsynchronized interface +// (itab + data pointer = two words) can be torn under concurrent +// reads/writes. +// +// We store *Store rather than Store directly because atomic.Pointer +// requires a pointer type. SetStore(nil) is preserved as "clear the +// store" (used by test cleanup) by storing a nil *Store. +var storePtr atomic.Pointer[Store] - _ Store = (*InMemoryStore)(nil) +// fallbackStore is the single empty InMemoryStore that GetStore returns +// when no real store has been installed yet. Lazily created on first +// access so unused code paths (especially tests that never call any +// store function) don't pay the allocation. We deliberately reuse the +// same instance across all "store not yet installed" GetStore calls so +// that a sequence like `store.GetStore().SetX(v); store.GetStore().GetX()` +// observes its own write rather than seeing a fresh zero-value store on +// the second call. This window is observable in the resilient-mode +// timeout path, where the pod is marked Ready (and accepts traffic) +// before bootstrapCritical has had a chance to install the real store +// via InitInMemory. +var ( + fallbackStoreOnce sync.Once + fallbackStore *InMemoryStore ) +var _ Store = (*InMemoryStore)(nil) + type Store interface { GetReplicatedID() string GetAppID() string @@ -45,13 +77,37 @@ type Store interface { GetReadOnlyMode() bool } +// SetStore atomically installs s as the package-level store. Passing nil +// clears the store; subsequent GetStore calls will return the shared +// fallback InMemoryStore (see GetStore) until a non-nil value is +// installed. Test cleanup paths rely on the nil-clear behavior. func SetStore(s Store) { - store = s + if s == nil { + storePtr.Store(nil) + return + } + storePtr.Store(&s) } +// GetStore returns the currently installed store, or a single shared +// empty fallback InMemoryStore when none has been installed yet. The +// fallback is process-wide (allocated lazily on first need) rather than +// per-call so that mutations performed against it persist across +// consecutive GetStore reads — handlers that do, e.g., +// `s := GetStore(); s.SetX(v); s2 := GetStore(); _ = s2.GetX()` must +// observe their own writes even before InitInMemory installs the real +// store. Once SetStore is called with a real store, GetStore returns +// that real store and the fallback becomes orphaned; any writes that +// landed on the fallback during the bootstrap window are lost. That is +// an accepted trade-off: the alternative (blocking handlers behind +// bootstrap) is exactly the CrashLoopBackOff regression Phase 1 set +// out to fix. func GetStore() Store { - if store == nil { - return &InMemoryStore{} + if p := storePtr.Load(); p != nil { + return *p } - return store + fallbackStoreOnce.Do(func() { + fallbackStore = &InMemoryStore{} + }) + return fallbackStore } diff --git a/pkg/store/store_interface_test.go b/pkg/store/store_interface_test.go new file mode 100644 index 00000000..51f65a62 --- /dev/null +++ b/pkg/store/store_interface_test.go @@ -0,0 +1,93 @@ +package store + +import ( + "sync" + "sync/atomic" + "testing" + + appstatetypes "github.com/replicatedhq/replicated-sdk/pkg/appstate/types" + "github.com/stretchr/testify/require" +) + +// TestSetStore_GetStore_ConcurrentSwapAndRead verifies that the +// package-level store pointer can be swapped while readers are calling +// GetStore() in parallel. Without atomic.Pointer guarding `storePtr`, +// `go test -race` would flag the read at GetStore against the write at +// SetStore — and even without -race, an unsynchronized interface read +// could observe a torn (itab, data) pair on platforms where two-word +// loads aren't atomic. This is the production scenario the listener + +// background-bootstrap refactor introduced: handlers calling GetStore +// race with bootstrapCritical's InitInMemory → SetStore. +func TestSetStore_GetStore_ConcurrentSwapAndRead(t *testing.T) { + t.Cleanup(func() { SetStore(nil) }) + + const goroutines = 64 + const iterations = 1000 + + a := &InMemoryStore{} + b := &InMemoryStore{} + + var wg sync.WaitGroup + wg.Add(goroutines * 2) + + var reads atomic.Uint64 + for i := 0; i < goroutines; i++ { + go func(swapToA bool) { + defer wg.Done() + for j := 0; j < iterations; j++ { + if swapToA { + SetStore(a) + } else { + SetStore(b) + } + } + }(i%2 == 0) + go func() { + defer wg.Done() + for j := 0; j < iterations; j++ { + s := GetStore() + require.NotNil(t, s, "GetStore must never return nil") + reads.Add(1) + } + }() + } + + wg.Wait() + require.Equal(t, uint64(goroutines*iterations), reads.Load()) +} + +func TestSetStore_NilClearsStore(t *testing.T) { + t.Cleanup(func() { SetStore(nil) }) + + s := &InMemoryStore{} + SetStore(s) + require.Same(t, s, GetStore(), "SetStore should install the exact provided store") + + SetStore(nil) + got := GetStore() + require.NotNil(t, got, "GetStore must return a non-nil fallback after SetStore(nil)") + require.NotSame(t, s, got, "SetStore(nil) should clear; subsequent GetStore must not return the previously installed store") +} + +// TestGetStore_FallbackIsSharedAcrossCalls locks in the contract that +// when no store has been installed (the resilient-mode-timeout window +// where the pod is Ready but bootstrapCritical hasn't run InitInMemory +// yet), consecutive GetStore calls return the SAME fallback instance — +// not a fresh empty one each time. Otherwise a handler doing +// `s := GetStore(); s.SetX(v)` followed by `GetStore().GetX()` would +// silently lose the write because the second call would see a different +// zero-value store. This test would fail with the per-call +// `&InMemoryStore{}` fallback that pre-dated the fix. +func TestGetStore_FallbackIsSharedAcrossCalls(t *testing.T) { + t.Cleanup(func() { SetStore(nil) }) + SetStore(nil) + + a := GetStore() + b := GetStore() + require.Same(t, a, b, "fallback must be shared across consecutive GetStore calls") + + // Mutate via one handle, observe via another to prove the writes + // land on a single underlying instance and aren't discarded. + a.SetAppStatus(appstatetypes.AppStatus{State: appstatetypes.StateReady}) + require.Equal(t, appstatetypes.StateReady, b.GetAppStatus().State, "writes against the fallback must persist across GetStore calls") +} diff --git a/pkg/util/env.go b/pkg/util/env.go index 54e8049f..7e369fc7 100644 --- a/pkg/util/env.go +++ b/pkg/util/env.go @@ -2,23 +2,46 @@ package util import ( "os" - "regexp" - - kotsv1beta1 "github.com/replicatedhq/kotskinds/apis/kots/v1beta1" + "sync/atomic" ) +// airgapOverride is a runtime opt-in that forces IsAirgap() to report +// true even when DISABLE_OUTBOUND_CONNECTIONS is unset. It exists so the +// devOffline workflow (a dev-license-only convenience for working +// offline without configuring full air-gap support) can reuse the +// existing !IsAirgap() gates throughout the codebase rather than +// introducing a parallel set of checks. +// +// The override is set once during bootstrapCritical when params.DevOffline +// is true and the loaded license is a dev license. Tests that flip the +// override must call ResetAirgapOverride in t.Cleanup so state does not +// leak between tests. +// +// The override and DISABLE_OUTBOUND_CONNECTIONS are independent inputs +// to the same boolean: if either (or both) is set, IsAirgap() returns +// true and behavior is identical to either alone. There is no precedence +// to reason about because there is nothing to disagree on. +var airgapOverride atomic.Bool + +// SetAirgapOverride forces IsAirgap to return true regardless of the +// DISABLE_OUTBOUND_CONNECTIONS env var. Call once during bootstrap. +func SetAirgapOverride(on bool) { + airgapOverride.Store(on) +} + +// ResetAirgapOverride clears the override. Intended for test cleanup +// (t.Cleanup); production callers do not unset the override. +func ResetAirgapOverride() { + airgapOverride.Store(false) +} + func IsAirgap() bool { + if airgapOverride.Load() { + return true + } return os.Getenv("DISABLE_OUTBOUND_CONNECTIONS") == "true" } func IsDevEnv() bool { return os.Getenv("REPLICATED_ENV") == "dev" } - -func IsDevLicense(license *kotsv1beta1.License) bool { - if license == nil { - return false - } - result, _ := regexp.MatchString(`replicated-app`, license.Spec.Endpoint) - return result -} diff --git a/pkg/util/env_test.go b/pkg/util/env_test.go new file mode 100644 index 00000000..45f786bb --- /dev/null +++ b/pkg/util/env_test.go @@ -0,0 +1,79 @@ +package util + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +// TestIsAirgap_TruthTable defends the doc claim in env.go that the +// airgapOverride and DISABLE_OUTBOUND_CONNECTIONS env var are independent +// inputs to the same boolean — IsAirgap returns true if either or both +// is set, and there is no precedence between them. A future contributor +// who introduces a precedence rule (e.g. "env wins" or "override wins") +// would silently change a documented invariant; this table makes the +// invariant testable. +func TestIsAirgap_TruthTable(t *testing.T) { + cases := []struct { + name string + envValue string + override bool + want bool + }{ + {"both off", "", false, false}, + {"override only", "", true, true}, + {"env only", "true", false, true}, + {"both on", "true", true, true}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Setenv("DISABLE_OUTBOUND_CONNECTIONS", tc.envValue) + t.Cleanup(ResetAirgapOverride) + SetAirgapOverride(tc.override) + + require.Equal(t, tc.want, IsAirgap()) + }) + } +} + +// TestIsAirgap_EnvVarRequiresExactTrue locks in the strict-string +// contract of the env-var path: only the literal "true" enables airgap. +// "1", "TRUE", "yes" and friends do NOT. Without this test, a future +// "be more lenient with env values" change would silently broaden the +// airgap surface, which is the wrong direction for a flag that disables +// an important class of behavior. +func TestIsAirgap_EnvVarRequiresExactTrue(t *testing.T) { + t.Cleanup(ResetAirgapOverride) + ResetAirgapOverride() + + for _, v := range []string{"", "1", "yes", "TRUE", "True", "false", "0"} { + t.Run("env="+v, func(t *testing.T) { + t.Setenv("DISABLE_OUTBOUND_CONNECTIONS", v) + require.False(t, IsAirgap(), + "env=%q must not enable airgap; only the exact string \"true\" does", v) + }) + } +} + +// TestSetAirgapOverride_Idempotent verifies that flipping the override +// to true repeatedly is safe. bootstrapCritical can be re-invoked by +// the orchestrator's retry loop and applyDevOfflineGuard re-runs each +// time, so SetAirgapOverride(true) is called more than once in normal +// operation. The atomic.Bool semantics make this trivially correct, but +// codifying the expectation prevents a future "guard against double-set" +// refactor from introducing accidental panics or state-machine logic. +func TestSetAirgapOverride_Idempotent(t *testing.T) { + t.Setenv("DISABLE_OUTBOUND_CONNECTIONS", "") + t.Cleanup(ResetAirgapOverride) + ResetAirgapOverride() + require.False(t, IsAirgap(), "precondition: override and env both off") + + SetAirgapOverride(true) + SetAirgapOverride(true) + SetAirgapOverride(true) + require.True(t, IsAirgap()) + + ResetAirgapOverride() + require.False(t, IsAirgap()) +}