@@ -12,6 +12,7 @@ import (
1212 "path/filepath"
1313 "reflect"
1414 goruntime "runtime"
15+ "slices"
1516 "strconv"
1617 "strings"
1718 "sync"
@@ -1233,7 +1234,14 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi
12331234 logSystem ("Starting update from %s to %s: %+v" , oldConfigName , newConfigName , diff )
12341235
12351236 diffFileSet := ctrlcommon .CalculateConfigFileDiffs (& oldIgnConfig , & newIgnConfig )
1236- diffUnitSet := ctrlcommon .CalculateConfigUnitDiffs (& oldIgnConfig , & newIgnConfig )
1237+ // Get the added and updated units
1238+ unitDiff := ctrlcommon .GetChangedConfigUnitsByType (& oldIgnConfig , & newIgnConfig )
1239+ addedOrChangedUnits := slices .Concat (unitDiff .Added , unitDiff .Updated )
1240+ // Get the names of all units changed in some way (added, removed, or updated)
1241+ var allChangedUnitNames []string
1242+ for _ , unit := range append (addedOrChangedUnits , unitDiff .Removed ... ) {
1243+ allChangedUnitNames = append (allChangedUnitNames , unit .Name )
1244+ }
12371245
12381246 var fg featuregates.FeatureGate
12391247
@@ -1247,6 +1255,10 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi
12471255 }
12481256 }
12491257
1258+ // Check for forcefile before calculatePostConfigChange* functions delete it.
1259+ // This is needed for updateFiles to know whether to write all units (OCPBUGS-74692).
1260+ forceFilePresent := forceFileExists ()
1261+
12501262 var nodeDisruptionActions []opv1.NodeDisruptionPolicyStatusAction
12511263 var actions []string
12521264 // If FeatureGateNodeDisruptionPolicy is set, calculate NodeDisruptionPolicy based actions for this MC diff
@@ -1359,13 +1371,13 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi
13591371 }
13601372
13611373 // update files on disk that need updating
1362- if err := dn .updateFiles (oldIgnConfig , newIgnConfig , skipCertificateWrite ); err != nil {
1374+ if err := dn .updateFiles (oldIgnConfig , newIgnConfig , addedOrChangedUnits , skipCertificateWrite , forceFilePresent ); err != nil {
13631375 return err
13641376 }
13651377
13661378 defer func () {
13671379 if retErr != nil {
1368- if err := dn .updateFiles (newIgnConfig , oldIgnConfig , skipCertificateWrite ); err != nil {
1380+ if err := dn .updateFiles (newIgnConfig , oldIgnConfig , addedOrChangedUnits , skipCertificateWrite , false ); err != nil {
13691381 errs := kubeErrs .NewAggregate ([]error {err , retErr })
13701382 retErr = fmt .Errorf ("error rolling back files writes: %w" , errs )
13711383 return
@@ -1491,15 +1503,21 @@ func (dn *Daemon) updateHypershift(oldConfig, newConfig *mcfgv1.MachineConfig, d
14911503 return fmt .Errorf ("parsing new Ignition config failed: %w" , err )
14921504 }
14931505
1506+ unitDiff := ctrlcommon .GetChangedConfigUnitsByType (& oldIgnConfig , & newIgnConfig )
1507+ addedOrChangedUnits := slices .Concat (unitDiff .Added , unitDiff .Updated )
1508+
1509+ // Check for forcefile to support config drift recovery (OCPBUGS-74692)
1510+ forceFilePresent := forceFileExists ()
1511+
14941512 // update files on disk that need updating
14951513 // We shouldn't skip the certificate write in HyperShift since it does not run the extra daemon process
1496- if err := dn .updateFiles (oldIgnConfig , newIgnConfig , false ); err != nil {
1514+ if err := dn .updateFiles (oldIgnConfig , newIgnConfig , addedOrChangedUnits , false , forceFilePresent ); err != nil {
14971515 return err
14981516 }
14991517
15001518 defer func () {
15011519 if retErr != nil {
1502- if err := dn .updateFiles (newIgnConfig , oldIgnConfig , false ); err != nil {
1520+ if err := dn .updateFiles (newIgnConfig , oldIgnConfig , addedOrChangedUnits , false , false ); err != nil {
15031521 errs := kubeErrs .NewAggregate ([]error {err , retErr })
15041522 retErr = fmt .Errorf ("error rolling back files writes: %w" , errs )
15051523 return
@@ -1990,12 +2008,26 @@ func (dn *CoreOSDaemon) switchKernel(oldConfig, newConfig *mcfgv1.MachineConfig)
19902008// whatever has been written is picked up by the appropriate daemons, if
19912009// required. in particular, a daemon-reload and restart for any unit files
19922010// touched.
1993- func (dn * Daemon ) updateFiles (oldIgnConfig , newIgnConfig ign3types.Config , skipCertificateWrite bool ) error {
2011+ func (dn * Daemon ) updateFiles (oldIgnConfig , newIgnConfig ign3types.Config , addedOrChangedUnits []ign3types. Unit , skipCertificateWrite , forceFilePresent bool ) error {
19942012 klog .Info ("Updating files" )
19952013 if err := dn .writeFiles (newIgnConfig .Storage .Files , skipCertificateWrite ); err != nil {
19962014 return err
19972015 }
1998- if err := dn .writeUnits (newIgnConfig .Systemd .Units ); err != nil {
2016+
2017+ // With OCPBUGS-58023, we updated this flow to only write units that were either added or
2018+ // updated. As can be seen in OCPBUGS-74692, this impacted the traditional method to recover
2019+ // from config drifts with systemd units. It makes the `touch /run/machine-config-daemon-force`
2020+ // command useless since the new flow does not rewrite all files, only the ones that have been
2021+ // added or changed with the latest MC. To keep the fix for OCPBUGS-58023 while continuing to
2022+ // support the traditional config drift recovery for systemd units, all units should be
2023+ // written when a forcefile exists.
2024+ unitsToWrite := addedOrChangedUnits
2025+ if forceFilePresent {
2026+ klog .Info ("Forcefile exists, writing all units" )
2027+ unitsToWrite = newIgnConfig .Systemd .Units
2028+ }
2029+
2030+ if err := dn .writeUnits (unitsToWrite ); err != nil {
19992031 return err
20002032 }
20012033 return dn .deleteStaleData (oldIgnConfig , newIgnConfig )
0 commit comments