Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions changes/46291-dynamic-scep-ndes-expired-challenge-self-heal
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
- Fixed an issue where macOS hosts using dynamic SCEP (NDES) could land in a terminal `failed` state with "challenge password has expired" and not recover on their own. An expired challenge now resends the Apple profile with a reset retry counter and a freshly fetched challenge, so affected hosts self-heal instead of requiring a manual reissue.
- Fixed an issue where a large backlog of macOS hosts needing dynamic SCEP (NDES) certificates (for example after an outage or upgrade) could overrun the NDES password cache and leave most hosts failed. Fleet now treats a full NDES password cache as a temporary condition: it stops requesting new challenges for the rest of that run and retries the remaining hosts on later runs, draining the backlog gradually instead of failing them.
45 changes: 42 additions & 3 deletions ee/server/service/scep/scep_proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,8 +366,12 @@ func (svc *scepProxyService) validateIdentifier(ctx context.Context, identifier
if status == "" {
status = "null"
}
// FIXME: MDM client will report a failed status for the profile when we return bad request, which consumes the sole retry attempt.
// Seems like we should proactively use ResendHostCertificateProfile here too?
// Unlike the challenge-expiry path below (which now requeues via
// ResendHostCertificateProfile, see #46291), we intentionally do NOT requeue here:
// a non-pending status means Fleet's DB already advanced this profile (e.g. it was
// re-sent or moved to verifying/verified) and the host is simply redeeming an
// outdated command. We expect a fresh certificate request once the profile updates
// on the host, so proactively resending here would fight with the current state.
return "", &scepserver.BadRequestError{Message: fmt.Sprintf("profile status (%s) is not 'pending' for host:%s profile:%s", status,
hostUUID, profileUUID)}
}
Expand All @@ -384,9 +388,44 @@ func (svc *scepProxyService) validateIdentifier(ctx context.Context, identifier
// The challenge password was retrieved for this profile, and is now invalid.
// We need to resend the profile with a new challenge password.
// Note: we don't actually know if it is invalid, and we can't get that exact feedback from SCEP server.
if err := svc.ds.ResendHostMDMProfile(ctx, hostUUID, profileUUID); err != nil {
//
// Returning a bad request makes the MDM client mark the profile failed, which
// consumes the profile's automatic retry. For Apple profiles we resend via
// ResendHostCertificateProfile (rather than the generic ResendHostMDMProfile),
// which resets the retry counter and blanks the command UUID. Blanking the
// command UUID forces the reconcile cron to re-render the profile from scratch
// rather than re-enqueue the stale, already-expired command bytes: the NULL
// status re-selects the profile for install (server/mdm/apple/reconcile.go,
// ComputeReconcileDeltas: OperationType==Install && Status==nil), and
// preprocessProfileContents re-fetches a fresh NDES challenge and upserts a new
// challenge_retrieved_at (server/mdm/apple/profile_processor.go, the
// FleetVarNDESSCEPChallenge case + BulkUpsertMDMManagedCertificates). This lets
// the host self-heal rather than landing in a terminal failed state (see #46291).
//
// Known limitation: ResendHostCertificateProfile only operates on
// host_mdm_apple_profiles, so Windows NDES profiles fall back to the generic
// ResendHostMDMProfile and remain subject to the single-retry trap. Tracked as a
// follow-up; out of scope for the macOS issue reported in #46291.
if strings.HasPrefix(profileUUID, fleet.MDMAppleProfileUUIDPrefix) {
if err := svc.ds.ResendHostCertificateProfile(ctx, hostUUID, profileUUID); err != nil {
return "", ctxerr.Wrap(ctx, err, "resending host mdm certificate profile")
}
} else if err := svc.ds.ResendHostMDMProfile(ctx, hostUUID, profileUUID); err != nil {
return "", ctxerr.Wrap(ctx, err, "resending host mdm profile")
}
// Log the requeue. Because each expiry resets retries, a host that chronically
// checks in more than NDESChallengeInvalidAfter after the profile is built will
// re-fetch a fresh NDES challenge on every reconcile instead of going terminal.
// That is the intended self-heal behavior, but it also means a backlog of
// chronically-late hosts can repeatedly hit the (Okta/WAF-fronted) NDES admin URL,
// so we surface it here for operators monitoring SCEP admin-URL load.
svc.debugLogger.WarnContext(ctx, "dynamic SCEP challenge expired before device redemption; requeuing profile with a fresh challenge",
"host_uuid", hostUUID,
"profile_uuid", profileUUID,
"ca_type", fleet.CAConfigNDES,
"challenge_retrieved_at", challengeRetrievedAt,
"challenge_invalid_after", NDESChallengeInvalidAfter.String(),
)
return "", &scepserver.BadRequestError{Message: "challenge password has expired"}
}
scepURL = groupedCAs.NDESSCEP.URL
Expand Down
65 changes: 60 additions & 5 deletions ee/server/service/scep/scep_proxy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ func TestValidateIdentifier(t *testing.T) {
assert.Equal(t, "https://ndes.example.com/scep", scepURL)
})

t.Run("NDES challenge expired triggers requeue", func(t *testing.T) {
t.Run("NDES challenge expired triggers requeue for Apple profile", func(t *testing.T) {
ds := new(mock.DataStore)
ds.GetGroupedCertificateAuthoritiesFunc = func(ctx context.Context, includeSecrets bool) (*fleet.GroupedCertificateAuthorities, error) {
return &fleet.GroupedCertificateAuthorities{
Expand All @@ -459,19 +459,74 @@ func TestValidateIdentifier(t *testing.T) {
ChallengeRetrievedAt: &expiredTime,
}, nil
}
ds.ResendHostMDMProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error {
// Apple profiles must resend via ResendHostCertificateProfile so the retry
// counter is reset and a fresh challenge is fetched (see #46291), not via the
// generic ResendHostMDMProfile which would resend the stale command bytes.
//
// This asserts the routing decision only. The end-to-end self-heal (blanked
// command_uuid -> reconcile re-renders -> fresh NDES challenge + new
// challenge_retrieved_at) relies on the reconcile path referenced in
// validateIdentifier's comment: server/mdm/apple/reconcile.go
// (ComputeReconcileDeltas selects Status==nil installs) and
// server/mdm/apple/profile_processor.go (FleetVarNDESSCEPChallenge re-fetch +
// BulkUpsertMDMManagedCertificates).
ds.ResendHostCertificateProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error {
assert.Equal(t, "host-uuid", hostUUID)
assert.Equal(t, "a-profile-uuid", profileUUID)
return nil
}
ds.ResendHostMDMProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error {
t.Errorf("ResendHostMDMProfile should not be called for Apple NDES profiles")
return nil
}
svc := newTestService(ds)

identifier := makeIdentifier("host-uuid", "a-profile-uuid", "NDES", "")
_, err := svc.validateIdentifier(ctx, identifier, true) // checkChallenge=true
require.Error(t, err)
assert.Contains(t, err.Error(), "challenge password has expired")
assert.True(t, ds.ResendHostCertificateProfileFuncInvoked)
assert.False(t, ds.ResendHostMDMProfileFuncInvoked)
})

t.Run("NDES challenge expired triggers requeue for Windows profile", func(t *testing.T) {
ds := new(mock.DataStore)
ds.GetGroupedCertificateAuthoritiesFunc = func(ctx context.Context, includeSecrets bool) (*fleet.GroupedCertificateAuthorities, error) {
return &fleet.GroupedCertificateAuthorities{
NDESSCEP: &fleet.NDESSCEPProxyCA{URL: "https://ndes.example.com/scep"},
}, nil
}
pendingStatus := fleet.MDMDeliveryPending
expiredTime := time.Now().Add(-58 * time.Minute) // Expired (>57 minutes)
ds.GetWindowsHostMDMCertificateProfileFunc = func(ctx context.Context, hostUUID, profileUUID, caName string) (*fleet.HostMDMCertificateProfile, error) {
return &fleet.HostMDMCertificateProfile{
HostUUID: hostUUID,
ProfileUUID: profileUUID,
Status: &pendingStatus,
Type: fleet.CAConfigNDES,
CAName: "NDES",
ChallengeRetrievedAt: &expiredTime,
}, nil
}
// ResendHostCertificateProfile only operates on host_mdm_apple_profiles, so
// Windows profiles must continue to use the generic ResendHostMDMProfile.
ds.ResendHostMDMProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error {
assert.Equal(t, "host-uuid", hostUUID)
assert.Equal(t, "w-profile-uuid", profileUUID)
return nil
}
ds.ResendHostCertificateProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error {
t.Errorf("ResendHostCertificateProfile should not be called for Windows NDES profiles")
return nil
}
svc := newTestService(ds)

identifier := makeIdentifier("host-uuid", "w-profile-uuid", "NDES", "")
_, err := svc.validateIdentifier(ctx, identifier, true) // checkChallenge=true
require.Error(t, err)
assert.Contains(t, err.Error(), "challenge password has expired")
assert.True(t, ds.ResendHostMDMProfileFuncInvoked)
ds.ResendHostMDMProfileFuncInvoked = false
assert.False(t, ds.ResendHostCertificateProfileFuncInvoked)
})

t.Run("NDES challenge not expired", func(t *testing.T) {
Expand Down Expand Up @@ -822,15 +877,15 @@ func TestValidateIdentifier(t *testing.T) {
ChallengeRetrievedAt: &expiredTime,
}, nil
}
ds.ResendHostMDMProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error {
ds.ResendHostCertificateProfileFunc = func(ctx context.Context, hostUUID, profileUUID string) error {
return errors.New("resend failed")
}
svc := newTestService(ds)

identifier := makeIdentifier("host-uuid", "a-profile-uuid", "NDES", "")
_, err := svc.validateIdentifier(ctx, identifier, true)
require.Error(t, err)
assert.Contains(t, err.Error(), "resending host mdm profile")
assert.Contains(t, err.Error(), "resending host mdm certificate profile")
})

t.Run("default CA name is NDES", func(t *testing.T) {
Expand Down
39 changes: 38 additions & 1 deletion server/mdm/apple/profile_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,18 @@ func preprocessProfileContents(
// need to look it up more than once per host.
hostIDForUUIDCache := make(map[string]uint)

// ndesCacheFull is set once the NDES admin URL reports its password cache is
// full during this reconcile pass. NDES only holds a small number of
// outstanding (unredeemed) challenge passwords at once (PasswordMax, default
// 5), and a password is a transient, retryable resource: slots free up as
// devices redeem or as passwords expire. Once the cache is full, every further
// fetch this pass would also fail, so we stop requesting new challenges and
// defer the remaining NDES profiles to a later reconcile run rather than
// hammering the admin URL or terminally failing a large backlog all at once.
// This both throttles our fetches to NDES's capacity and lets the backlog
// drain gradually over successive runs (see #46291).
ndesCacheFull := false

var addedTargets map[string]*fleet.CmdTarget
for profUUID, target := range targets {
contents, ok := profileContents[profUUID]
Expand Down Expand Up @@ -352,6 +364,11 @@ func preprocessProfileContents(

hostContents := contentsStr
failed := false
// deferred is set when this host's profile cannot be rendered right now
// but should be retried on a later reconcile (e.g. the NDES password cache
// is full). Unlike failed, it leaves the profile status untouched (NULL) so
// it is re-selected next run instead of landing in a terminal failed state.
deferred := false

fleetVarLoop:
for _, fleetVar := range fleetVars {
Expand All @@ -365,10 +382,30 @@ func preprocessProfileContents(
}
ndesConfig = groupedCAs.NDESSCEP
}
if ndesCacheFull {
// The NDES password cache filled earlier in this reconcile pass.
// Defer this host (leave its status NULL) so it is retried on a
// later run once slots free up, instead of issuing a fetch that we
// know would fail or terminally failing the profile.
deferred = true
break fleetVarLoop
}
logger.DebugContext(ctx, "fetching NDES challenge", "host_uuid", hostUUID, "profile_uuid", profUUID)
// Insert the SCEP challenge into the profile contents
challenge, err := scepConfig.GetNDESSCEPChallenge(ctx, *ndesConfig)
if err != nil {
// A full password cache is a transient, retryable condition: NDES
// caps the number of outstanding challenges, and slots free up as
// devices redeem or passwords expire. Don't burn the profile's retry
// or mark it failed; defer it and stop fetching more challenges this
// pass so the backlog drains over successive reconcile runs (#46291).
if errors.As(err, &scep.NDESPasswordCacheFullError{}) {
ndesCacheFull = true
deferred = true
logger.WarnContext(ctx, "NDES password cache full; deferring remaining NDES profiles to a later reconcile",
"host_uuid", hostUUID, "profile_uuid", profUUID)
break fleetVarLoop
}
detail := scep.NDESChallengeErrorToDetail(err)
err := ds.UpdateOrDeleteHostMDMAppleProfile(ctx, &fleet.HostMDMAppleProfile{
CommandUUID: target.CmdUUID,
Expand Down Expand Up @@ -631,7 +668,7 @@ func preprocessProfileContents(
// This was handled in the above switch statement, so we should never reach this case
}
}
if !failed {
if !failed && !deferred {
addedTargets[tempProfUUID] = &fleet.CmdTarget{
CmdUUID: tempCmdUUID,
ProfileIdentifier: target.ProfileIdentifier,
Expand Down
113 changes: 107 additions & 6 deletions server/mdm/apple/profile_processor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,10 @@ func TestPreprocessProfileContents(t *testing.T) {
assert.NotNil(t, updatedProfile.VariablesUpdatedAt)
assert.Empty(t, targets)

// Password cache full
// Password cache full is a transient condition (NDES caps the number of
// outstanding challenges). The profile must NOT be marked failed; it is
// deferred and left pending (status untouched) so the next reconcile retries
// it instead of landing in a terminal failed state (see #46291).
scepConfig.GetNDESSCEPChallengeFunc = func(ctx context.Context, proxy fleet.NDESSCEPProxyCA) (string, error) {
assert.Equal(t, ndesPassword, proxy.Password)
return "", scep.NewNDESPasswordCacheFullError("NDES error")
Expand All @@ -177,11 +180,8 @@ func TestPreprocessProfileContents(t *testing.T) {
populateTargets()
err = preprocessProfileContents(ctx, appCfg, ds, scepConfig, digiCertService, logger, targets, profileContents, hostProfilesToInstallMap, userEnrollmentsToHostUUIDsMap, groupedCAs)
require.NoError(t, err)
require.NotNil(t, updatedProfile)
assert.Contains(t, updatedProfile.Detail, "FLEET_VAR_"+fleet.FleetVarNDESSCEPChallenge)
assert.Contains(t, updatedProfile.Detail, "cached passwords")
assert.NotNil(t, updatedProfile.VariablesUpdatedAt)
assert.Empty(t, targets)
assert.Nil(t, updatedProfile, "cache-full must not mark the profile failed")
assert.Empty(t, targets, "deferred profile must not be enqueued")

// Insufficient permissions
scepConfig.GetNDESSCEPChallengeFunc = func(ctx context.Context, proxy fleet.NDESSCEPProxyCA) (string, error) {
Expand Down Expand Up @@ -415,6 +415,107 @@ func TestPreprocessProfileContents(t *testing.T) {
}
}

// TestPreprocessProfileContentsNDESCacheFullDefersBacklog verifies the throttle /
// self-heal behavior added for https://github.com/fleetdm/fleet/issues/46291: when
// the NDES password cache fills part-way through a batch, Fleet stops fetching
// further challenges for the rest of the reconcile pass and DEFERS the remaining
// hosts (leaving them pending) instead of terminally failing them or continuing to
// hammer the admin URL. The hosts that did receive a challenge are still enqueued,
// and the deferred ones are re-selected on the next reconcile run.
func TestPreprocessProfileContentsNDESCacheFullDefersBacklog(t *testing.T) {
ctx := context.Background()
ctx = license.NewContext(ctx, &fleet.LicenseInfo{Tier: fleet.TierPremium})
logger := slog.New(slog.NewJSONHandler(os.Stdout, nil))
appCfg := &fleet.AppConfig{}
appCfg.ServerSettings.ServerURL = "https://test.example.com"
appCfg.MDM.EnabledAndConfigured = true
ds := new(mock.Store)
digiCertService := digicert.NewService(digicert.WithLogger(logger))

hostUUIDs := []string{"host-1", "host-2", "host-3", "host-4"}
const profUUID = "p1"
const cmdUUID = "cmd-1"

targets := map[string]*fleet.CmdTarget{
profUUID: {CmdUUID: cmdUUID, ProfileIdentifier: "com.add.profile", EnrollmentIDs: hostUUIDs},
}
hostProfilesToInstallMap := make(map[fleet.HostProfileUUID]*fleet.MDMAppleBulkUpsertHostProfilePayload, len(hostUUIDs))
for _, h := range hostUUIDs {
hostProfilesToInstallMap[fleet.HostProfileUUID{HostUUID: h, ProfileUUID: profUUID}] = &fleet.MDMAppleBulkUpsertHostProfilePayload{
ProfileUUID: profUUID,
ProfileIdentifier: "com.add.profile",
HostUUID: h,
OperationType: fleet.MDMOperationTypeInstall,
Status: &fleet.MDMDeliveryPending,
CommandUUID: cmdUUID,
Scope: fleet.PayloadScopeSystem,
}
}
userEnrollmentsToHostUUIDsMap := make(map[string]string)
profileContents := map[string]mobileconfig.Mobileconfig{
profUUID: []byte("$FLEET_VAR_" + fleet.FleetVarNDESSCEPChallenge),
}

ndesPassword := "test-password"
ds.GetAllMDMConfigAssetsByNameFunc = func(ctx context.Context, assetNames []fleet.MDMAssetName, _ sqlx.QueryerContext) (map[fleet.MDMAssetName]fleet.MDMConfigAsset, error) {
return map[fleet.MDMAssetName]fleet.MDMConfigAsset{
fleet.MDMAssetNDESPassword: {Value: []byte(ndesPassword)},
}, nil
}

var failedCalls int
ds.UpdateOrDeleteHostMDMAppleProfileFunc = func(ctx context.Context, profile *fleet.HostMDMAppleProfile) error {
failedCalls++
return nil
}
var enqueuedHosts []string
ds.BulkUpsertMDMAppleHostProfilesFunc = func(ctx context.Context, payload []*fleet.MDMAppleBulkUpsertHostProfilePayload) error {
for _, p := range payload {
enqueuedHosts = append(enqueuedHosts, p.HostUUID)
}
return nil
}
var managedCertCount int
ds.BulkUpsertMDMManagedCertificatesFunc = func(ctx context.Context, payload []*fleet.MDMManagedCertificate) error {
managedCertCount += len(payload)
return nil
}

groupedCAs := &fleet.GroupedCertificateAuthorities{
NDESSCEP: &fleet.NDESSCEPProxyCA{
URL: "https://test-example.com",
AdminURL: "https://example.com",
Username: "admin",
Password: ndesPassword,
},
}

// Succeed for the first 2 challenge fetches, then report the cache is full.
var fetchCalls int
scepConfig := &scep_mock.SCEPConfigService{}
scepConfig.GetNDESSCEPChallengeFunc = func(ctx context.Context, proxy fleet.NDESSCEPProxyCA) (string, error) {
fetchCalls++
if fetchCalls > 2 {
return "", scep.NewNDESPasswordCacheFullError("cache full")
}
return "ndes-challenge", nil
}

err := preprocessProfileContents(ctx, appCfg, ds, scepConfig, digiCertService, logger, targets, profileContents, hostProfilesToInstallMap, userEnrollmentsToHostUUIDsMap, groupedCAs)
require.NoError(t, err)

// 2 successful fetches + 1 cache-full signal; after that we stop fetching, so
// host-4 is deferred without another admin-URL hit.
assert.Equal(t, 3, fetchCalls, "must stop fetching once the cache reports full")
// A transient cache-full must not terminally fail any profile.
assert.Zero(t, failedCalls, "cache-full must not terminally fail any profile")
// The 2 hosts that got a challenge are enqueued; the other 2 are deferred and
// left pending for the next reconcile.
assert.ElementsMatch(t, []string{"host-1", "host-2"}, enqueuedHosts)
assert.Equal(t, 2, managedCertCount)
assert.Len(t, targets, 2, "only the hosts with a fresh challenge are enqueued; the rest are deferred")
}

// TestPreprocessProfileContentsDigiCertUPNMultiHost is a regression test for
// https://github.com/fleetdm/fleet/issues/39324. When the same DigiCert CA is
// used for multiple hosts in a single batch, the CertificateUserPrincipalNames
Expand Down
Loading