Skip to content

Commit 62edc82

Browse files
Merge pull request #1151 from slintes/mhc_feature_gate
Add feature gate for disabling the MHC controller
2 parents 8f0f041 + ce3fe45 commit 62edc82

50 files changed

Lines changed: 5206 additions & 80 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

cmd/machine-api-operator/start.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"strconv"
1111

1212
osconfigv1 "github.com/openshift/api/config/v1"
13+
"github.com/openshift/library-go/pkg/operator/events"
1314
"github.com/openshift/machine-api-operator/pkg/metrics"
1415
"github.com/openshift/machine-api-operator/pkg/operator"
1516
"github.com/openshift/machine-api-operator/pkg/util"
@@ -120,7 +121,7 @@ func initMachineAPIInformers(ctx *ControllerContext) {
120121
klog.Info("Synced up machine api informer caches")
121122
}
122123

123-
func initRecorder(kubeClient kubernetes.Interface) (record.EventRecorder, error) {
124+
func initEventRecorder(kubeClient kubernetes.Interface) (record.EventRecorder, error) {
124125
eventRecorderScheme := runtime.NewScheme()
125126
if err := osconfigv1.Install(eventRecorderScheme); err != nil {
126127
return nil, fmt.Errorf("failed to create event recorder scheme: %v", err)
@@ -131,26 +132,41 @@ func initRecorder(kubeClient kubernetes.Interface) (record.EventRecorder, error)
131132
return eventBroadcaster.NewRecorder(eventRecorderScheme, v1.EventSource{Component: "machineapioperator"}), nil
132133
}
133134

135+
func initRecorder(kubeClient kubernetes.Interface) (events.Recorder, error) {
136+
controllerRef, err := events.GetControllerReferenceForCurrentPod(context.Background(), kubeClient, componentNamespace, nil)
137+
if err != nil {
138+
return nil, fmt.Errorf("failed to create controller ref for recorder: %v", err)
139+
}
140+
recorder := events.NewKubeRecorder(kubeClient.CoreV1().Events(componentNamespace), "machineapioperator", controllerRef)
141+
return recorder, nil
142+
}
143+
134144
func startControllersOrDie(ctx *ControllerContext) {
135145
kubeClient := ctx.ClientBuilder.KubeClientOrDie(componentName)
136-
recorder, err := initRecorder(kubeClient)
146+
eventRecorder, err := initEventRecorder(kubeClient)
137147
if err != nil {
138148
klog.Fatalf("failed to create event recorder: %v", err)
139149
}
150+
recorder, err := initRecorder(kubeClient)
151+
if err != nil {
152+
klog.Fatalf("failed to create recorder: %v", err)
153+
}
140154
optr, err := operator.New(
141155
componentNamespace, componentName,
142156
startOpts.imagesFile,
143157
config,
144158
ctx.KubeNamespacedInformerFactory.Apps().V1().Deployments(),
145159
ctx.KubeNamespacedInformerFactory.Apps().V1().DaemonSets(),
146160
ctx.ConfigInformerFactory.Config().V1().FeatureGates(),
161+
ctx.ConfigInformerFactory.Config().V1().ClusterVersions(),
147162
ctx.KubeNamespacedInformerFactory.Admissionregistration().V1().ValidatingWebhookConfigurations(),
148163
ctx.KubeNamespacedInformerFactory.Admissionregistration().V1().MutatingWebhookConfigurations(),
149164
ctx.ConfigInformerFactory.Config().V1().Proxies(),
150165
ctx.ClientBuilder.KubeClientOrDie(componentName),
151166
ctx.ClientBuilder.OpenshiftClientOrDie(componentName),
152167
ctx.ClientBuilder.MachineClientOrDie(componentName),
153168
ctx.ClientBuilder.DynamicClientOrDie(componentName),
169+
eventRecorder,
154170
recorder,
155171
)
156172
if err != nil {

go.mod

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ require (
99
github.com/google/uuid v1.3.0
1010
github.com/onsi/ginkgo/v2 v2.9.5
1111
github.com/onsi/gomega v1.27.7
12-
github.com/openshift/api v0.0.0-20230703162140-6e9853e4c905
12+
github.com/openshift/api v0.0.0-20230711095040-ca06f4a23b64
1313
github.com/openshift/client-go v0.0.0-20230503144108-75015d2347cb
14-
github.com/openshift/library-go v0.0.0-20230508110756-9b7abe2c9cbf
14+
github.com/openshift/library-go v0.0.0-20230706195801-561433066966
1515
github.com/operator-framework/operator-sdk v0.5.1-0.20190301204940-c2efe6f74e7b
1616
github.com/prometheus/client_golang v1.15.1
1717
github.com/spf13/cobra v1.6.1
@@ -189,6 +189,7 @@ require (
189189
github.com/quasilyte/regex/syntax v0.0.0-20210819130434-b3f0c404a727 // indirect
190190
github.com/quasilyte/stdinfo v0.0.0-20220114132959-f7386bf02567 // indirect
191191
github.com/rivo/uniseg v0.4.2 // indirect
192+
github.com/robfig/cron v1.2.0 // indirect
192193
github.com/russross/blackfriday/v2 v2.1.0 // indirect
193194
github.com/ryancurrah/gomodguard v1.3.0 // indirect
194195
github.com/ryanrolds/sqlclosecheck v0.4.0 // indirect

go.sum

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -631,12 +631,12 @@ github.com/onsi/gomega v0.0.0-20170829124025-dcabb60a477c/go.mod h1:C1qb7wdrVGGV
631631
github.com/onsi/gomega v1.7.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
632632
github.com/onsi/gomega v1.27.7 h1:fVih9JD6ogIiHUN6ePK7HJidyEDpWGVB5mzM7cWNXoU=
633633
github.com/onsi/gomega v1.27.7/go.mod h1:1p8OOlwo2iUUDsHnOrjE5UKYJ+e3W8eQ3qSlRahPmr4=
634-
github.com/openshift/api v0.0.0-20230703162140-6e9853e4c905 h1:zvzzN6z/QxwQ6KiHnGb/DuVSOhHkYX6SqkPILdwc/3s=
635-
github.com/openshift/api v0.0.0-20230703162140-6e9853e4c905/go.mod h1:4VWG+W22wrB4HfBL88P40DxLEpSOaiBVxUnfalfJo9k=
634+
github.com/openshift/api v0.0.0-20230711095040-ca06f4a23b64 h1:j7LIIr4Vrdy4Dpd4bw2j53UXUSjA1eXXC0x89g9kyAI=
635+
github.com/openshift/api v0.0.0-20230711095040-ca06f4a23b64/go.mod h1:yimSGmjsI+XF1mr+AKBs2//fSXIOhhetHGbMlBEfXbs=
636636
github.com/openshift/client-go v0.0.0-20230503144108-75015d2347cb h1:Nij5OnaECrkmcRQMAE9LMbQXPo95aqFnf+12B7SyFVI=
637637
github.com/openshift/client-go v0.0.0-20230503144108-75015d2347cb/go.mod h1:Rhb3moCqeiTuGHAbXBOlwPubUMlOZEkrEWTRjIF3jzs=
638-
github.com/openshift/library-go v0.0.0-20230508110756-9b7abe2c9cbf h1:ZpFAN2qprgp7jEhGPrOAwP8mmuYC9BRYzvDefg+k4GM=
639-
github.com/openshift/library-go v0.0.0-20230508110756-9b7abe2c9cbf/go.mod h1:PJVatR/oS/EaFciwylyAr9hORSqQHrC+5bXf4L0wsBY=
638+
github.com/openshift/library-go v0.0.0-20230706195801-561433066966 h1:qJZaVzxJy7s6Cp1908rkSR64YCrpiKMZAkfYhsZPPCw=
639+
github.com/openshift/library-go v0.0.0-20230706195801-561433066966/go.mod h1:PegtilvJPBJXjJG3AV8uL1a0SAnBr6K67ShNiWVb40M=
640640
github.com/operator-framework/operator-sdk v0.5.1-0.20190301204940-c2efe6f74e7b h1:Q1q8w51pAZdx6LEkaYdSbUaaEOHXTyTXLhtGgIiKaiA=
641641
github.com/operator-framework/operator-sdk v0.5.1-0.20190301204940-c2efe6f74e7b/go.mod h1:iVyukRkam5JZa8AnjYf+/G3rk7JI1+M6GsU0sq0B9NA=
642642
github.com/otiai10/copy v1.2.0 h1:HvG945u96iNadPoG2/Ja2+AUJeW5YuFQMixq9yirC+k=
@@ -700,6 +700,8 @@ github.com/remyoudompheng/bigfft v0.0.0-20170806203942-52369c62f446/go.mod h1:uY
700700
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
701701
github.com/rivo/uniseg v0.4.2 h1:YwD0ulJSJytLpiaWua0sBDusfsCZohxjxzVTYjwxfV8=
702702
github.com/rivo/uniseg v0.4.2/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
703+
github.com/robfig/cron v1.2.0 h1:ZjScXvvxeQ63Dbyxy76Fj3AT3Ut0aKsyd2/tl3DTMuQ=
704+
github.com/robfig/cron v1.2.0/go.mod h1:JGuDeoQd7Z6yL4zQhZ3OPEVHB7fL6Ka6skscFHfmt2k=
703705
github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg=
704706
github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
705707
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=

install/0000_30_machine-api-operator_09_rbac.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,24 @@ rules:
314314
- patch
315315
- delete
316316

317+
- apiGroups:
318+
- apps
319+
resources:
320+
- replicasets
321+
verbs:
322+
- get
323+
- list
324+
- watch
325+
326+
- apiGroups:
327+
- ""
328+
resources:
329+
- pods
330+
verbs:
331+
- get
332+
- list
333+
- watch
334+
317335
- apiGroups:
318336
- machine.openshift.io
319337
resources:
@@ -394,6 +412,7 @@ rules:
394412
- featuregates
395413
- featuregates/status
396414
- proxies
415+
- clusterversions
397416
verbs:
398417
- get
399418
- list

install/0000_30_machine-api-operator_11_deployment.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ spec:
6868
fieldRef:
6969
apiVersion: v1
7070
fieldPath: metadata.namespace
71+
- name: POD_NAME
72+
valueFrom:
73+
fieldRef:
74+
fieldPath: metadata.name
7175
- name: METRICS_PORT
7276
value: "8080"
7377
resources:

pkg/operator/config.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ func getImagesFromJSONFile(filePath string) (*Images, error) {
7575
return &i, nil
7676
}
7777

78-
func getProviderControllerFromImages(platform configv1.PlatformType, featureGate *configv1.FeatureGate, images Images) (string, error) {
78+
func getProviderControllerFromImages(platform configv1.PlatformType, images Images) (string, error) {
7979
switch platform {
8080
case configv1.AWSPlatformType:
8181
return images.ClusterAPIControllerAWS, nil

pkg/operator/config_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ func TestGetProviderControllerFromImages(t *testing.T) {
350350
}
351351

352352
for _, test := range tests {
353-
res, err := getProviderControllerFromImages(test.provider, &test.featureGate, *img)
353+
res, err := getProviderControllerFromImages(test.provider, *img)
354354
if err != nil {
355355
t.Errorf("failed getProviderControllerFromImages: %v", err)
356356
}

pkg/operator/operator.go

Lines changed: 75 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package operator
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"os"
78
"time"
@@ -12,9 +13,10 @@ import (
1213
configinformersv1 "github.com/openshift/client-go/config/informers/externalversions/config/v1"
1314
configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
1415
machineclientset "github.com/openshift/client-go/machine/clientset/versioned"
16+
"github.com/openshift/library-go/pkg/operator/configobserver/featuregates"
17+
"github.com/openshift/library-go/pkg/operator/events"
1518
"github.com/openshift/library-go/pkg/operator/resource/resourceapply"
1619
admissionregistrationv1 "k8s.io/api/admissionregistration/v1"
17-
"k8s.io/apimachinery/pkg/api/errors"
1820
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1921
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
2022
"k8s.io/apimachinery/pkg/util/wait"
@@ -39,6 +41,9 @@ const (
3941
// 5ms, 10ms, 20ms, 40ms, 80ms, 160ms, 320ms, 640ms, 1.3s, 2.6s, 5.1s, 10.2s, 20.4s, 41s, 82s
4042
maxRetries = 15
4143
maoOwnedAnnotation = "machine.openshift.io/owned"
44+
45+
releaseVersionEnvVariableName = "RELEASE_VERSION"
46+
releaseVersionUnknownValue = "unknown"
4247
)
4348

4449
// Operator defines machine api operator.
@@ -53,6 +58,7 @@ type Operator struct {
5358
machineClient machineclientset.Interface
5459
dynamicClient dynamic.Interface
5560
eventRecorder record.EventRecorder
61+
recorder events.Recorder
5662

5763
syncHandler func(ic string) (reconcile.Result, error)
5864

@@ -71,8 +77,7 @@ type Operator struct {
7177
mutatingWebhookLister admissionlisterv1.MutatingWebhookConfigurationLister
7278
mutatingWebhookListerSynced cache.InformerSynced
7379

74-
featureGateLister configlistersv1.FeatureGateLister
75-
featureGateCacheSynced cache.InformerSynced
80+
featureGateAccessor featuregates.FeatureGateAccess
7681

7782
// queue only ever has one item, but it has nice error handling backoff/retry semantics
7883
queue workqueue.RateLimitingInterface
@@ -91,6 +96,7 @@ func New(
9196
deployInformer appsinformersv1.DeploymentInformer,
9297
daemonsetInformer appsinformersv1.DaemonSetInformer,
9398
featureGateInformer configinformersv1.FeatureGateInformer,
99+
clusterVersionInformer configinformersv1.ClusterVersionInformer,
94100
validatingWebhookInformer admissioninformersv1.ValidatingWebhookConfigurationInformer,
95101
mutatingWebhookInformer admissioninformersv1.MutatingWebhookConfigurationInformer,
96102
proxyInformer configinformersv1.ProxyInformer,
@@ -99,13 +105,18 @@ func New(
99105
machineClient machineclientset.Interface,
100106
dynamicClient dynamic.Interface,
101107

102-
recorder record.EventRecorder,
108+
eventRecorder record.EventRecorder,
109+
recorder events.Recorder,
103110
) (*Operator, error) {
104111
// we must report the version from the release payload when we report available at that level
105112
// TODO we will report the version of the operands (so our machine api implementation version)
106113
operandVersions := []osconfigv1.OperandVersion{}
107-
if releaseVersion := os.Getenv("RELEASE_VERSION"); len(releaseVersion) > 0 {
114+
releaseVersion := os.Getenv(releaseVersionEnvVariableName)
115+
if len(releaseVersion) > 0 {
108116
operandVersions = append(operandVersions, osconfigv1.OperandVersion{Name: "operator", Version: releaseVersion})
117+
} else {
118+
klog.Infof("%s environment variable is missing, defaulting to %q", releaseVersionEnvVariableName, releaseVersionUnknownValue)
119+
releaseVersion = releaseVersionUnknownValue
109120
}
110121

111122
optr := &Operator{
@@ -116,7 +127,8 @@ func New(
116127
osClient: osClient,
117128
machineClient: machineClient,
118129
dynamicClient: dynamicClient,
119-
eventRecorder: recorder,
130+
eventRecorder: eventRecorder,
131+
recorder: recorder,
120132
queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machineapioperator"),
121133
operandVersions: operandVersions,
122134
}
@@ -137,10 +149,36 @@ func New(
137149
if err != nil {
138150
return nil, fmt.Errorf("error adding event handler to mutatingwebhook informer: %v", err)
139151
}
140-
_, err = featureGateInformer.Informer().AddEventHandler(optr.eventHandler())
141-
if err != nil {
142-
return nil, fmt.Errorf("error adding event handler to featuregates informer: %v", err)
143-
}
152+
153+
desiredVersion := releaseVersion
154+
missingVersion := "0.0.1-snapshot"
155+
featureGateAccessor := featuregates.NewFeatureGateAccess(
156+
desiredVersion, missingVersion,
157+
clusterVersionInformer, featureGateInformer,
158+
recorder,
159+
)
160+
featureGateAccessor.SetChangeHandler(func(featureChange featuregates.FeatureChange) {
161+
if featureChange.Previous == nil {
162+
// When the initial featuregate is set, the previous version is nil.
163+
// Nothing to do in this case, it's handled by the 1st sync, which only runs after the initial feature set was received.
164+
return
165+
}
166+
167+
klog.V(4).InfoS("FeatureGates changed", "enabled", featureChange.New.Enabled, "disabled", featureChange.New.Disabled)
168+
prevDisableMHC := featuregates.NewFeatureGate(featureChange.Previous.Enabled, featureChange.Previous.Disabled).
169+
Enabled(osconfigv1.FeatureGateMachineAPIOperatorDisableMachineHealthCheckController)
170+
newDisableMHC := featuregates.NewFeatureGate(featureChange.New.Enabled, featureChange.New.Disabled).
171+
Enabled(osconfigv1.FeatureGateMachineAPIOperatorDisableMachineHealthCheckController)
172+
173+
if prevDisableMHC != newDisableMHC {
174+
klog.V(2).InfoS("Resync for modified feature gate",
175+
"FeatureGateMachineAPIOperatorDisableMachineHealthCheckController enabled", newDisableMHC,
176+
)
177+
workQueueKey := fmt.Sprintf("%s/%s", optr.namespace, optr.name)
178+
optr.queue.Add(workQueueKey)
179+
}
180+
})
181+
optr.featureGateAccessor = featureGateAccessor
144182

145183
optr.config = config
146184
optr.syncHandler = optr.sync
@@ -160,9 +198,6 @@ func New(
160198
optr.mutatingWebhookLister = mutatingWebhookInformer.Lister()
161199
optr.mutatingWebhookListerSynced = mutatingWebhookInformer.Informer().HasSynced
162200

163-
optr.featureGateLister = featureGateInformer.Lister()
164-
optr.featureGateCacheSynced = featureGateInformer.Informer().HasSynced
165-
166201
return optr, nil
167202
}
168203

@@ -179,12 +214,24 @@ func (optr *Operator) Run(workers int, stopCh <-chan struct{}) {
179214
optr.validatingWebhookListerSynced,
180215
optr.deployListerSynced,
181216
optr.daemonsetListerSynced,
182-
optr.proxyListerSynced,
183-
optr.featureGateCacheSynced) {
217+
optr.proxyListerSynced) {
184218
klog.Error("Failed to sync caches")
185219
return
186220
}
187221
klog.Info("Synced up caches")
222+
223+
ctx, cancelFeatureGateAccessor := context.WithCancel(context.Background())
224+
defer cancelFeatureGateAccessor()
225+
go optr.featureGateAccessor.Run(ctx)
226+
klog.Info("Started feature gate accessor")
227+
select {
228+
case <-optr.featureGateAccessor.InitialFeatureGatesObserved():
229+
klog.V(4).Info("FeatureGates initialized")
230+
case <-time.After(1 * time.Minute):
231+
klog.Error(errors.New("timed out waiting for FeatureGate detection"), "unable to start operator")
232+
return
233+
}
234+
188235
for i := 0; i < workers; i++ {
189236
go wait.Until(optr.worker, time.Second, stopCh)
190237
}
@@ -353,19 +400,6 @@ func (optr *Operator) sync(key string) (reconcile.Result, error) {
353400
return optr.syncAll(operatorConfig)
354401
}
355402

356-
func getFeatureGate(lister configlistersv1.FeatureGateLister) (*osconfigv1.FeatureGate, error) {
357-
featureGate, err := lister.Get("cluster")
358-
if errors.IsNotFound(err) {
359-
// No feature gate is set, therefore cannot be external.
360-
// This is not an error as the feature gate is an optional resource.
361-
return nil, nil
362-
} else if err != nil {
363-
return nil, fmt.Errorf("could not fetch featuregate: %v", err)
364-
}
365-
366-
return featureGate, nil
367-
}
368-
369403
func (optr *Operator) maoConfigFromInfrastructure() (*OperatorConfig, error) {
370404
infra, err := optr.osClient.ConfigV1().Infrastructures().Get(context.Background(), "cluster", metav1.GetOptions{})
371405
if err != nil {
@@ -382,12 +416,7 @@ func (optr *Operator) maoConfigFromInfrastructure() (*OperatorConfig, error) {
382416
return nil, err
383417
}
384418

385-
featureGate, err := getFeatureGate(optr.featureGateLister)
386-
if err != nil {
387-
return nil, err
388-
}
389-
390-
providerControllerImage, err := getProviderControllerFromImages(provider, featureGate, *images)
419+
providerControllerImage, err := getProviderControllerFromImages(provider, *images)
391420
if err != nil {
392421
return nil, err
393422
}
@@ -412,14 +441,25 @@ func (optr *Operator) maoConfigFromInfrastructure() (*OperatorConfig, error) {
412441
return nil, err
413442
}
414443

444+
// in case the MHC controller is disabled, leave its image empty
445+
mhcImage := machineAPIOperatorImage
446+
featureGates, err := optr.featureGateAccessor.CurrentFeatureGates()
447+
if err != nil {
448+
return nil, err
449+
}
450+
if featureGates.Enabled(osconfigv1.FeatureGateMachineAPIOperatorDisableMachineHealthCheckController) {
451+
klog.V(2).Info("Disabling MHC controller")
452+
mhcImage = ""
453+
}
454+
415455
return &OperatorConfig{
416456
TargetNamespace: optr.namespace,
417457
Proxy: clusterWideProxy,
418458
Controllers: Controllers{
419459
Provider: providerControllerImage,
420460
MachineSet: machineAPIOperatorImage,
421461
NodeLink: machineAPIOperatorImage,
422-
MachineHealthCheck: machineAPIOperatorImage,
462+
MachineHealthCheck: mhcImage,
423463
KubeRBACProxy: kubeRBACProxy,
424464
TerminationHandler: terminationHandlerImage,
425465
},

0 commit comments

Comments
 (0)