diff --git a/changes/46291-dynamic-scep-ndes-expired-challenge-self-heal b/changes/46291-dynamic-scep-ndes-expired-challenge-self-heal new file mode 100644 index 00000000000..60abd163438 --- /dev/null +++ b/changes/46291-dynamic-scep-ndes-expired-challenge-self-heal @@ -0,0 +1,2 @@ +- Fixed an issue where macOS hosts using dynamic SCEP (NDES) could land in a terminal `failed` state with "challenge password has expired" and not recover on their own. An expired challenge now resends the Apple profile with a reset retry counter and a freshly fetched challenge, so affected hosts self-heal instead of requiring a manual reissue. +- Fixed an issue where a large backlog of macOS hosts needing dynamic SCEP (NDES) certificates (for example after an outage or upgrade) could overrun the NDES password cache and leave most hosts failed. Fleet now treats a full NDES password cache as a temporary condition: it stops requesting new challenges for the rest of that run and retries the remaining hosts on later runs, draining the backlog gradually instead of failing them. diff --git a/ee/server/service/scep/scep_proxy.go b/ee/server/service/scep/scep_proxy.go index 61d2037621d..59ff00d9e55 100644 --- a/ee/server/service/scep/scep_proxy.go +++ b/ee/server/service/scep/scep_proxy.go @@ -366,8 +366,12 @@ func (svc *scepProxyService) validateIdentifier(ctx context.Context, identifier if status == "" { status = "null" } - // FIXME: MDM client will report a failed status for the profile when we return bad request, which consumes the sole retry attempt. - // Seems like we should proactively use ResendHostCertificateProfile here too? + // Unlike the challenge-expiry path below (which now requeues via + // ResendHostCertificateProfile, see #46291), we intentionally do NOT requeue here: + // a non-pending status means Fleet's DB already advanced this profile (e.g. it was + // re-sent or moved to verifying/verified) and the host is simply redeeming an + // outdated command. We expect a fresh certificate request once the profile updates + // on the host, so proactively resending here would fight with the current state. return "", &scepserver.BadRequestError{Message: fmt.Sprintf("profile status (%s) is not 'pending' for host:%s profile:%s", status, hostUUID, profileUUID)} } @@ -384,9 +388,44 @@ func (svc *scepProxyService) validateIdentifier(ctx context.Context, identifier // The challenge password was retrieved for this profile, and is now invalid. // We need to resend the profile with a new challenge password. // Note: we don't actually know if it is invalid, and we can't get that exact feedback from SCEP server. - if err := svc.ds.ResendHostMDMProfile(ctx, hostUUID, profileUUID); err != nil { + // + // Returning a bad request makes the MDM client mark the profile failed, which + // consumes the profile's automatic retry. For Apple profiles we resend via + // ResendHostCertificateProfile (rather than the generic ResendHostMDMProfile), + // which resets the retry counter and blanks the command UUID. Blanking the + // command UUID forces the reconcile cron to re-render the profile from scratch + // rather than re-enqueue the stale, already-expired command bytes: the NULL + // status re-selects the profile for install (server/mdm/apple/reconcile.go, + // ComputeReconcileDeltas: OperationType==Install && Status==nil), and + // preprocessProfileContents re-fetches a fresh NDES challenge and upserts a new + // challenge_retrieved_at (server/mdm/apple/profile_processor.go, the + // FleetVarNDESSCEPChallenge case + BulkUpsertMDMManagedCertificates). This lets + // the host self-heal rather than landing in a terminal failed state (see #46291). + // + // Known limitation: ResendHostCertificateProfile only operates on + // host_mdm_apple_profiles, so Windows NDES profiles fall back to the generic + // ResendHostMDMProfile and remain subject to the single-retry trap. Tracked as a + // follow-up; out of scope for the macOS issue reported in #46291. + if strings.HasPrefix(profileUUID, fleet.MDMAppleProfileUUIDPrefix) { + if err := svc.ds.ResendHostCertificateProfile(ctx, hostUUID, profileUUID); err != nil { + return "", ctxerr.Wrap(ctx, err, "resending host mdm certificate profile") + } + } else if err := svc.ds.ResendHostMDMProfile(ctx, hostUUID, profileUUID); err != nil { return "", ctxerr.Wrap(ctx, err, "resending host mdm profile") } + // Log the requeue. Because each expiry resets retries, a host that chronically + // checks in more than NDESChallengeInvalidAfter after the profile is built will + // re-fetch a fresh NDES challenge on every reconcile instead of going terminal. + // That is the intended self-heal behavior, but it also means a backlog of + // chronically-late hosts can repeatedly hit the (Okta/WAF-fronted) NDES admin URL, + // so we surface it here for operators monitoring SCEP admin-URL load. + svc.debugLogger.WarnContext(ctx, "dynamic SCEP challenge expired before device redemption; requeuing profile with a fresh challenge", + "host_uuid", hostUUID, + "profile_uuid", profileUUID, + "ca_type", fleet.CAConfigNDES, + "challenge_retrieved_at", challengeRetrievedAt, + "challenge_invalid_after", NDESChallengeInvalidAfter.String(), + ) return "", &scepserver.BadRequestError{Message: "challenge password has expired"} } scepURL = groupedCAs.NDESSCEP.URL diff --git a/ee/server/service/scep/scep_proxy_test.go b/ee/server/service/scep/scep_proxy_test.go index fbaabe9e69d..f299a51db0c 100644 --- a/ee/server/service/scep/scep_proxy_test.go +++ b/ee/server/service/scep/scep_proxy_test.go @@ -440,7 +440,7 @@ func TestValidateIdentifier(t *testing.T) { assert.Equal(t, "https://ndes.example.com/scep", scepURL) }) - t.Run("NDES challenge expired triggers requeue", func(t *testing.T) { + t.Run("NDES challenge expired triggers requeue for Apple profile", func(t *testing.T) { ds := new(mock.DataStore) ds.GetGroupedCertificateAuthoritiesFunc = func(ctx context.Context, includeSecrets bool) (*fleet.GroupedCertificateAuthorities, error) { return &fleet.GroupedCertificateAuthorities{ @@ -459,19 +459,74 @@ func TestValidateIdentifier(t *testing.T) { ChallengeRetrievedAt: &expiredTime, }, nil } - ds.ResendHostMDMProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error { + // Apple profiles must resend via ResendHostCertificateProfile so the retry + // counter is reset and a fresh challenge is fetched (see #46291), not via the + // generic ResendHostMDMProfile which would resend the stale command bytes. + // + // This asserts the routing decision only. The end-to-end self-heal (blanked + // command_uuid -> reconcile re-renders -> fresh NDES challenge + new + // challenge_retrieved_at) relies on the reconcile path referenced in + // validateIdentifier's comment: server/mdm/apple/reconcile.go + // (ComputeReconcileDeltas selects Status==nil installs) and + // server/mdm/apple/profile_processor.go (FleetVarNDESSCEPChallenge re-fetch + + // BulkUpsertMDMManagedCertificates). + ds.ResendHostCertificateProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error { assert.Equal(t, "host-uuid", hostUUID) assert.Equal(t, "a-profile-uuid", profileUUID) return nil } + ds.ResendHostMDMProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error { + t.Errorf("ResendHostMDMProfile should not be called for Apple NDES profiles") + return nil + } svc := newTestService(ds) identifier := makeIdentifier("host-uuid", "a-profile-uuid", "NDES", "") _, err := svc.validateIdentifier(ctx, identifier, true) // checkChallenge=true require.Error(t, err) assert.Contains(t, err.Error(), "challenge password has expired") + assert.True(t, ds.ResendHostCertificateProfileFuncInvoked) + assert.False(t, ds.ResendHostMDMProfileFuncInvoked) + }) + + t.Run("NDES challenge expired triggers requeue for Windows profile", func(t *testing.T) { + ds := new(mock.DataStore) + ds.GetGroupedCertificateAuthoritiesFunc = func(ctx context.Context, includeSecrets bool) (*fleet.GroupedCertificateAuthorities, error) { + return &fleet.GroupedCertificateAuthorities{ + NDESSCEP: &fleet.NDESSCEPProxyCA{URL: "https://ndes.example.com/scep"}, + }, nil + } + pendingStatus := fleet.MDMDeliveryPending + expiredTime := time.Now().Add(-58 * time.Minute) // Expired (>57 minutes) + ds.GetWindowsHostMDMCertificateProfileFunc = func(ctx context.Context, hostUUID, profileUUID, caName string) (*fleet.HostMDMCertificateProfile, error) { + return &fleet.HostMDMCertificateProfile{ + HostUUID: hostUUID, + ProfileUUID: profileUUID, + Status: &pendingStatus, + Type: fleet.CAConfigNDES, + CAName: "NDES", + ChallengeRetrievedAt: &expiredTime, + }, nil + } + // ResendHostCertificateProfile only operates on host_mdm_apple_profiles, so + // Windows profiles must continue to use the generic ResendHostMDMProfile. + ds.ResendHostMDMProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error { + assert.Equal(t, "host-uuid", hostUUID) + assert.Equal(t, "w-profile-uuid", profileUUID) + return nil + } + ds.ResendHostCertificateProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error { + t.Errorf("ResendHostCertificateProfile should not be called for Windows NDES profiles") + return nil + } + svc := newTestService(ds) + + identifier := makeIdentifier("host-uuid", "w-profile-uuid", "NDES", "") + _, err := svc.validateIdentifier(ctx, identifier, true) // checkChallenge=true + require.Error(t, err) + assert.Contains(t, err.Error(), "challenge password has expired") assert.True(t, ds.ResendHostMDMProfileFuncInvoked) - ds.ResendHostMDMProfileFuncInvoked = false + assert.False(t, ds.ResendHostCertificateProfileFuncInvoked) }) t.Run("NDES challenge not expired", func(t *testing.T) { @@ -822,7 +877,7 @@ func TestValidateIdentifier(t *testing.T) { ChallengeRetrievedAt: &expiredTime, }, nil } - ds.ResendHostMDMProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error { + ds.ResendHostCertificateProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error { return errors.New("resend failed") } svc := newTestService(ds) @@ -830,7 +885,7 @@ func TestValidateIdentifier(t *testing.T) { identifier := makeIdentifier("host-uuid", "a-profile-uuid", "NDES", "") _, err := svc.validateIdentifier(ctx, identifier, true) require.Error(t, err) - assert.Contains(t, err.Error(), "resending host mdm profile") + assert.Contains(t, err.Error(), "resending host mdm certificate profile") }) t.Run("default CA name is NDES", func(t *testing.T) { diff --git a/server/mdm/apple/profile_processor.go b/server/mdm/apple/profile_processor.go index d3fb4bf487e..5e746e6cc31 100644 --- a/server/mdm/apple/profile_processor.go +++ b/server/mdm/apple/profile_processor.go @@ -179,6 +179,18 @@ func preprocessProfileContents( // need to look it up more than once per host. hostIDForUUIDCache := make(map[string]uint) + // ndesCacheFull is set once the NDES admin URL reports its password cache is + // full during this reconcile pass. NDES only holds a small number of + // outstanding (unredeemed) challenge passwords at once (PasswordMax, default + // 5), and a password is a transient, retryable resource: slots free up as + // devices redeem or as passwords expire. Once the cache is full, every further + // fetch this pass would also fail, so we stop requesting new challenges and + // defer the remaining NDES profiles to a later reconcile run rather than + // hammering the admin URL or terminally failing a large backlog all at once. + // This both throttles our fetches to NDES's capacity and lets the backlog + // drain gradually over successive runs (see #46291). + ndesCacheFull := false + var addedTargets map[string]*fleet.CmdTarget for profUUID, target := range targets { contents, ok := profileContents[profUUID] @@ -352,6 +364,11 @@ func preprocessProfileContents( hostContents := contentsStr failed := false + // deferred is set when this host's profile cannot be rendered right now + // but should be retried on a later reconcile (e.g. the NDES password cache + // is full). Unlike failed, it leaves the profile status untouched (NULL) so + // it is re-selected next run instead of landing in a terminal failed state. + deferred := false fleetVarLoop: for _, fleetVar := range fleetVars { @@ -365,10 +382,30 @@ func preprocessProfileContents( } ndesConfig = groupedCAs.NDESSCEP } + if ndesCacheFull { + // The NDES password cache filled earlier in this reconcile pass. + // Defer this host (leave its status NULL) so it is retried on a + // later run once slots free up, instead of issuing a fetch that we + // know would fail or terminally failing the profile. + deferred = true + break fleetVarLoop + } logger.DebugContext(ctx, "fetching NDES challenge", "host_uuid", hostUUID, "profile_uuid", profUUID) // Insert the SCEP challenge into the profile contents challenge, err := scepConfig.GetNDESSCEPChallenge(ctx, *ndesConfig) if err != nil { + // A full password cache is a transient, retryable condition: NDES + // caps the number of outstanding challenges, and slots free up as + // devices redeem or passwords expire. Don't burn the profile's retry + // or mark it failed; defer it and stop fetching more challenges this + // pass so the backlog drains over successive reconcile runs (#46291). + if errors.As(err, &scep.NDESPasswordCacheFullError{}) { + ndesCacheFull = true + deferred = true + logger.WarnContext(ctx, "NDES password cache full; deferring remaining NDES profiles to a later reconcile", + "host_uuid", hostUUID, "profile_uuid", profUUID) + break fleetVarLoop + } detail := scep.NDESChallengeErrorToDetail(err) err := ds.UpdateOrDeleteHostMDMAppleProfile(ctx, &fleet.HostMDMAppleProfile{ CommandUUID: target.CmdUUID, @@ -631,7 +668,7 @@ func preprocessProfileContents( // This was handled in the above switch statement, so we should never reach this case } } - if !failed { + if !failed && !deferred { addedTargets[tempProfUUID] = &fleet.CmdTarget{ CmdUUID: tempCmdUUID, ProfileIdentifier: target.ProfileIdentifier, diff --git a/server/mdm/apple/profile_processor_test.go b/server/mdm/apple/profile_processor_test.go index 82323f58706..d65062f4df3 100644 --- a/server/mdm/apple/profile_processor_test.go +++ b/server/mdm/apple/profile_processor_test.go @@ -168,7 +168,10 @@ func TestPreprocessProfileContents(t *testing.T) { assert.NotNil(t, updatedProfile.VariablesUpdatedAt) assert.Empty(t, targets) - // Password cache full + // Password cache full is a transient condition (NDES caps the number of + // outstanding challenges). The profile must NOT be marked failed; it is + // deferred and left pending (status untouched) so the next reconcile retries + // it instead of landing in a terminal failed state (see #46291). scepConfig.GetNDESSCEPChallengeFunc = func(ctx context.Context, proxy fleet.NDESSCEPProxyCA) (string, error) { assert.Equal(t, ndesPassword, proxy.Password) return "", scep.NewNDESPasswordCacheFullError("NDES error") @@ -177,11 +180,8 @@ func TestPreprocessProfileContents(t *testing.T) { populateTargets() err = preprocessProfileContents(ctx, appCfg, ds, scepConfig, digiCertService, logger, targets, profileContents, hostProfilesToInstallMap, userEnrollmentsToHostUUIDsMap, groupedCAs) require.NoError(t, err) - require.NotNil(t, updatedProfile) - assert.Contains(t, updatedProfile.Detail, "FLEET_VAR_"+fleet.FleetVarNDESSCEPChallenge) - assert.Contains(t, updatedProfile.Detail, "cached passwords") - assert.NotNil(t, updatedProfile.VariablesUpdatedAt) - assert.Empty(t, targets) + assert.Nil(t, updatedProfile, "cache-full must not mark the profile failed") + assert.Empty(t, targets, "deferred profile must not be enqueued") // Insufficient permissions scepConfig.GetNDESSCEPChallengeFunc = func(ctx context.Context, proxy fleet.NDESSCEPProxyCA) (string, error) { @@ -415,6 +415,107 @@ func TestPreprocessProfileContents(t *testing.T) { } } +// TestPreprocessProfileContentsNDESCacheFullDefersBacklog verifies the throttle / +// self-heal behavior added for https://github.com/fleetdm/fleet/issues/46291: when +// the NDES password cache fills part-way through a batch, Fleet stops fetching +// further challenges for the rest of the reconcile pass and DEFERS the remaining +// hosts (leaving them pending) instead of terminally failing them or continuing to +// hammer the admin URL. The hosts that did receive a challenge are still enqueued, +// and the deferred ones are re-selected on the next reconcile run. +func TestPreprocessProfileContentsNDESCacheFullDefersBacklog(t *testing.T) { + ctx := context.Background() + ctx = license.NewContext(ctx, &fleet.LicenseInfo{Tier: fleet.TierPremium}) + logger := slog.New(slog.NewJSONHandler(os.Stdout, nil)) + appCfg := &fleet.AppConfig{} + appCfg.ServerSettings.ServerURL = "https://test.example.com" + appCfg.MDM.EnabledAndConfigured = true + ds := new(mock.Store) + digiCertService := digicert.NewService(digicert.WithLogger(logger)) + + hostUUIDs := []string{"host-1", "host-2", "host-3", "host-4"} + const profUUID = "p1" + const cmdUUID = "cmd-1" + + targets := map[string]*fleet.CmdTarget{ + profUUID: {CmdUUID: cmdUUID, ProfileIdentifier: "com.add.profile", EnrollmentIDs: hostUUIDs}, + } + hostProfilesToInstallMap := make(map[fleet.HostProfileUUID]*fleet.MDMAppleBulkUpsertHostProfilePayload, len(hostUUIDs)) + for _, h := range hostUUIDs { + hostProfilesToInstallMap[fleet.HostProfileUUID{HostUUID: h, ProfileUUID: profUUID}] = &fleet.MDMAppleBulkUpsertHostProfilePayload{ + ProfileUUID: profUUID, + ProfileIdentifier: "com.add.profile", + HostUUID: h, + OperationType: fleet.MDMOperationTypeInstall, + Status: &fleet.MDMDeliveryPending, + CommandUUID: cmdUUID, + Scope: fleet.PayloadScopeSystem, + } + } + userEnrollmentsToHostUUIDsMap := make(map[string]string) + profileContents := map[string]mobileconfig.Mobileconfig{ + profUUID: []byte("$FLEET_VAR_" + fleet.FleetVarNDESSCEPChallenge), + } + + ndesPassword := "test-password" + ds.GetAllMDMConfigAssetsByNameFunc = func(ctx context.Context, assetNames []fleet.MDMAssetName, _ sqlx.QueryerContext) (map[fleet.MDMAssetName]fleet.MDMConfigAsset, error) { + return map[fleet.MDMAssetName]fleet.MDMConfigAsset{ + fleet.MDMAssetNDESPassword: {Value: []byte(ndesPassword)}, + }, nil + } + + var failedCalls int + ds.UpdateOrDeleteHostMDMAppleProfileFunc = func(ctx context.Context, profile *fleet.HostMDMAppleProfile) error { + failedCalls++ + return nil + } + var enqueuedHosts []string + ds.BulkUpsertMDMAppleHostProfilesFunc = func(ctx context.Context, payload []*fleet.MDMAppleBulkUpsertHostProfilePayload) error { + for _, p := range payload { + enqueuedHosts = append(enqueuedHosts, p.HostUUID) + } + return nil + } + var managedCertCount int + ds.BulkUpsertMDMManagedCertificatesFunc = func(ctx context.Context, payload []*fleet.MDMManagedCertificate) error { + managedCertCount += len(payload) + return nil + } + + groupedCAs := &fleet.GroupedCertificateAuthorities{ + NDESSCEP: &fleet.NDESSCEPProxyCA{ + URL: "https://test-example.com", + AdminURL: "https://example.com", + Username: "admin", + Password: ndesPassword, + }, + } + + // Succeed for the first 2 challenge fetches, then report the cache is full. + var fetchCalls int + scepConfig := &scep_mock.SCEPConfigService{} + scepConfig.GetNDESSCEPChallengeFunc = func(ctx context.Context, proxy fleet.NDESSCEPProxyCA) (string, error) { + fetchCalls++ + if fetchCalls > 2 { + return "", scep.NewNDESPasswordCacheFullError("cache full") + } + return "ndes-challenge", nil + } + + err := preprocessProfileContents(ctx, appCfg, ds, scepConfig, digiCertService, logger, targets, profileContents, hostProfilesToInstallMap, userEnrollmentsToHostUUIDsMap, groupedCAs) + require.NoError(t, err) + + // 2 successful fetches + 1 cache-full signal; after that we stop fetching, so + // host-4 is deferred without another admin-URL hit. + assert.Equal(t, 3, fetchCalls, "must stop fetching once the cache reports full") + // A transient cache-full must not terminally fail any profile. + assert.Zero(t, failedCalls, "cache-full must not terminally fail any profile") + // The 2 hosts that got a challenge are enqueued; the other 2 are deferred and + // left pending for the next reconcile. + assert.ElementsMatch(t, []string{"host-1", "host-2"}, enqueuedHosts) + assert.Equal(t, 2, managedCertCount) + assert.Len(t, targets, 2, "only the hosts with a fresh challenge are enqueued; the rest are deferred") +} + // TestPreprocessProfileContentsDigiCertUPNMultiHost is a regression test for // https://github.com/fleetdm/fleet/issues/39324. When the same DigiCert CA is // used for multiple hosts in a single batch, the CertificateUserPrincipalNames