Skip to content

Commit bf00a05

Browse files
committed
[ci-operator]: expand the pod lifecycle metrics to include the state of the machinesets
Signed-off-by: Nikolaos Moraitis <nmoraiti@redhat.com>
1 parent 548ee8d commit bf00a05

19 files changed

Lines changed: 1018 additions & 36 deletions

File tree

go.mod

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ require (
3333
github.com/montanaflynn/stats v0.6.3
3434
github.com/openhistogram/circonusllhist v0.3.1-0.20210608220433-1bd1bfa6c998
3535
github.com/openshift-eng/openshift-goimports v0.0.0-20220201193023-4f8ea117352c
36-
github.com/openshift/api v0.0.0-20240918231400-8f6ded478e8a
36+
github.com/openshift/api v0.0.0-20240919193929-2669d1ebc910
3737
github.com/openshift/imagebuilder v1.2.15
3838
github.com/openshift/openshift-apiserver v0.0.0-alpha.0
3939
github.com/pkg/errors v0.9.1
@@ -185,11 +185,12 @@ require (
185185
github.com/golang-jwt/jwt v3.2.2+incompatible
186186
github.com/jhump/protoreflect v1.17.0
187187
github.com/openshift/builder v0.0.0-20240610114444-739f5270219e
188-
github.com/openshift/client-go v0.0.0-20240528061634-b054aa794d87
188+
github.com/openshift/client-go v0.0.0-20240918182115-6a8ead8397fd
189189
github.com/openshift/cloud-credential-operator v0.0.0-20250120201329-db5f2531a5b4
190+
github.com/openshift/cluster-autoscaler-operator v0.0.1-0.20241204142113-43631b045675
190191
github.com/openshift/hive/apis v0.0.0-20230525214126-ab571664f899
191192
github.com/openshift/installer v1.4.17
192-
github.com/openshift/library-go v0.0.0-20240207105404-126b47137408
193+
github.com/openshift/library-go v0.0.0-20240919205913-c96b82b3762b
193194
github.com/ovn-org/ovn-kubernetes/go-controller v0.0.0-20240710195803-425a328cd172
194195
github.com/robfig/cron/v3 v3.0.1
195196
github.com/stretchr/testify v1.10.0

go.sum

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -890,14 +890,16 @@ github.com/openhistogram/circonusllhist v0.3.1-0.20210608220433-1bd1bfa6c998 h1:
890890
github.com/openhistogram/circonusllhist v0.3.1-0.20210608220433-1bd1bfa6c998/go.mod h1:PfeYJ/RW2+Jfv3wTz0upbY2TRour/LLqIm2K2Kw5zg0=
891891
github.com/openshift-eng/openshift-goimports v0.0.0-20220201193023-4f8ea117352c h1:J1LrylwCrM+K2KiY1VVPYgHTZ56bOBUTUG/WO09oXfA=
892892
github.com/openshift-eng/openshift-goimports v0.0.0-20220201193023-4f8ea117352c/go.mod h1:Az5+ON7o5C4Ag2GKFbwBjQO/aQNMNzmD1JLse5w4KRs=
893-
github.com/openshift/api v0.0.0-20240918231400-8f6ded478e8a h1:PVk9YmhCVGDSTFzOkhT81vp062LtYjA4M12iWIuqJRs=
894-
github.com/openshift/api v0.0.0-20240918231400-8f6ded478e8a/go.mod h1:OOh6Qopf21pSzqNVCB5gomomBXb8o5sGKZxG2KNpaXM=
893+
github.com/openshift/api v0.0.0-20240919193929-2669d1ebc910 h1:8pA9Ugq0hhUbVaVWi5lgF4PGaV1ozpTI1NYn/QWqspg=
894+
github.com/openshift/api v0.0.0-20240919193929-2669d1ebc910/go.mod h1:OOh6Qopf21pSzqNVCB5gomomBXb8o5sGKZxG2KNpaXM=
895895
github.com/openshift/builder v0.0.0-20240610114444-739f5270219e h1:XmTo1vVHVAcyd9I2UIYnW3PCHUmA8y/gowm3k8Yq7ww=
896896
github.com/openshift/builder v0.0.0-20240610114444-739f5270219e/go.mod h1:nsFLJ3C4RC+6qP2tino47TxLyDpFRxAABrsIvIuap1E=
897-
github.com/openshift/client-go v0.0.0-20240528061634-b054aa794d87 h1:JtLhaGpSEconE+1IKmIgCOof/Len5ceG6H1pk43yv5U=
898-
github.com/openshift/client-go v0.0.0-20240528061634-b054aa794d87/go.mod h1:3IPD4U0qyovZS4EFady2kqY32m8lGcbs/Wx+yprg9z8=
897+
github.com/openshift/client-go v0.0.0-20240918182115-6a8ead8397fd h1:Gd0+bYdcfGIsDOJ8BwTJJjQeXoziyIsTwqp/s38rKyM=
898+
github.com/openshift/client-go v0.0.0-20240918182115-6a8ead8397fd/go.mod h1:EB7GeA/vpf9AHklMgnnT0+uG6l/3f8cChtCFbJFrk4g=
899899
github.com/openshift/cloud-credential-operator v0.0.0-20250120201329-db5f2531a5b4 h1:nrD3npDGt5bvwNXZKTzzEuZTI/4Uo5PbrkpAjfxhxtE=
900900
github.com/openshift/cloud-credential-operator v0.0.0-20250120201329-db5f2531a5b4/go.mod h1:Lzu29TMne5LsgPnyw2n9jrPiD5t6uyG5aE6KFy8cz6w=
901+
github.com/openshift/cluster-autoscaler-operator v0.0.1-0.20241204142113-43631b045675 h1:hPFyXtaR42wqKKGMVP4G7M2vmp5iBxBqKetMguh4Td0=
902+
github.com/openshift/cluster-autoscaler-operator v0.0.1-0.20241204142113-43631b045675/go.mod h1:0tGCwMCgKq7KhJWDGr6Tsqqb6Sk3epz/b6tfFDFK1Ug=
901903
github.com/openshift/custom-resource-status v1.1.3-0.20220503160415-f2fdb4999d87 h1:cHyxR+Y8rAMT6m1jQCaYGRwikqahI0OjjUDhFNf3ySQ=
902904
github.com/openshift/custom-resource-status v1.1.3-0.20220503160415-f2fdb4999d87/go.mod h1:DB/Mf2oTeiAmVVX1gN+NEqweonAPY0TKUwADizj8+ZA=
903905
github.com/openshift/hive/apis v0.0.0-20230525214126-ab571664f899 h1:+HkBwPi47wWXKNLAx82Bh567S6dE6dLD3oMD/VQuXy8=
@@ -906,8 +908,8 @@ github.com/openshift/imagebuilder v1.2.15 h1:MNn1OztEE/l8pSEDPYAQ71Ys6rpXA2P00UF
906908
github.com/openshift/imagebuilder v1.2.15/go.mod h1:cK6MLyBl1IHmIYGLY/2SLOG6p0PtEDUOC7khxsFYUXE=
907909
github.com/openshift/installer v1.4.17 h1:63iijBBgYqQX/p2+Q74gPqnfBN5VNSWX5LxQKuLlj6g=
908910
github.com/openshift/installer v1.4.17/go.mod h1:CtlMEGKJDVMZl4qVBC/xMUXM24YnleT6bakI+KXFAhk=
909-
github.com/openshift/library-go v0.0.0-20240207105404-126b47137408 h1:Evg6GEvEuyj9toFX14YenXI6hGRnhLWqYx/rHO7VnQ4=
910-
github.com/openshift/library-go v0.0.0-20240207105404-126b47137408/go.mod h1:ePlaOqUiPplRc++6aYdMe+2FmXb2xTNS9Nz5laG2YmI=
911+
github.com/openshift/library-go v0.0.0-20240919205913-c96b82b3762b h1:y2DduJug7UZqTu0QTkRPAu73nskuUbFA66fmgxVf/fI=
912+
github.com/openshift/library-go v0.0.0-20240919205913-c96b82b3762b/go.mod h1:f8QcnrooSwGa96xI4UaKbKGJZskhTCGeimXKyc4t/ZU=
911913
github.com/openshift/openshift-apiserver v0.0.0-alpha.0 h1:Wk9BYBcIhamxOrK9OyNbmDM5Qb2nzuePr0Pvg7psWKA=
912914
github.com/openshift/openshift-apiserver v0.0.0-alpha.0/go.mod h1:c/cKFsTkfxQC+RhDNdKBwLhPS9ytARfzSiotFRDOFWE=
913915
github.com/ovn-org/ovn-kubernetes/go-controller v0.0.0-20240710195803-425a328cd172 h1:ZvyylzsX7bEpJOq8upPRQRpVJBI8mp/YTCica9PIRAc=

pkg/metrics/machines.go

Lines changed: 203 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,203 @@
1+
package metrics
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"sync"
7+
"time"
8+
9+
"github.com/sirupsen/logrus"
10+
11+
corev1 "k8s.io/api/core/v1"
12+
ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
13+
14+
machinev1beta1 "github.com/openshift/api/machine/v1beta1"
15+
autoscalingv1beta1 "github.com/openshift/cluster-autoscaler-operator/pkg/apis/autoscaling/v1beta1"
16+
)
17+
18+
// MachinesEventType names the pod lifecycle moment a MachinesEvent reports.
type MachinesEventType string

const (
	// PodCreation is the event type for a pod creation; it serializes as
	// "pod_creation".
	PodCreation MachinesEventType = "pod_creation"
	// PodCompletion is the event type for a pod completion; it serializes as
	// "pod_completion".
	PodCompletion MachinesEventType = "pod_completion"
)
24+
25+
// MachineInfo is the per-machine detail attached to a MachineSetCount.
type MachineInfo struct {
	// Name is the Machine object's name.
	Name string `json:"name"`
	// Phase is the Machine's status phase, or "Unknown" when the Machine
	// reports none.
	Phase string `json:"phase"`
}
29+
30+
// AutoscalerInfo summarizes the MachineAutoscaler that targets a MachineSet.
type AutoscalerInfo struct {
	// Name is the MachineAutoscaler object's name.
	Name string `json:"name"`
	// Min is the autoscaler's configured minimum replica count.
	Min int `json:"min"`
	// Max is the autoscaler's configured maximum replica count.
	Max int `json:"max"`
}
35+
36+
type MachineSetCount struct {
37+
Name string `json:"name"`
38+
Current int `json:"current"`
39+
Autoscaler *AutoscalerInfo `json:"autoscaler,omitempty"`
40+
Machines []MachineInfo `json:"machines,omitempty"`
41+
}
42+
43+
type WorkloadNodeCount struct {
44+
Workload string `json:"workload"`
45+
TotalMachines int `json:"total_machines"`
46+
MachineSets []MachineSetCount `json:"machine_sets"`
47+
}
48+
49+
type MachinesEvent struct {
50+
Type MachinesEventType `json:"type"`
51+
PodName string `json:"pod_name"`
52+
Namespace string `json:"namespace"`
53+
Workload string `json:"workload"`
54+
WorkloadCapacity WorkloadNodeCount `json:"workload_capacity"`
55+
Timestamp time.Time `json:"timestamp"`
56+
}
57+
58+
func (e *MachinesEvent) SetTimestamp(ts time.Time) {
59+
e.Timestamp = ts
60+
}
61+
62+
type MachinesPlugin struct {
63+
ctx context.Context
64+
logger *logrus.Entry
65+
mu sync.Mutex
66+
events []MachinesEvent
67+
client ctrlruntimeclient.Client
68+
autoscalers []autoscalingv1beta1.MachineAutoscaler
69+
}
70+
71+
func NewMachinesPlugin(ctx context.Context, logger *logrus.Entry, client ctrlruntimeclient.Client, autoscalers []autoscalingv1beta1.MachineAutoscaler) *MachinesPlugin {
72+
return &MachinesPlugin{
73+
ctx: ctx,
74+
logger: logger.WithField("plugin", "machines"),
75+
client: client,
76+
autoscalers: autoscalers,
77+
}
78+
}
79+
80+
func (p *MachinesPlugin) Name() string {
81+
return "machines"
82+
}
83+
84+
func (p *MachinesPlugin) Record(ev MetricsEvent) {
85+
e, ok := ev.(*MachinesEvent)
86+
if !ok {
87+
return
88+
}
89+
90+
if e.Type == PodCreation && e.Workload == "" {
91+
workload, err := p.waitForWorkloadLabel(e.Namespace, e.PodName)
92+
if err != nil {
93+
p.logger.WithError(err).Warnf("Failed to get workload label for pod %s/%s", e.Namespace, e.PodName)
94+
return
95+
}
96+
e.Workload = workload
97+
}
98+
99+
e.WorkloadCapacity = p.getWorkloadCounts(e.Workload)
100+
101+
p.mu.Lock()
102+
defer p.mu.Unlock()
103+
p.logger.WithField("event", e).Debug("Recorded machines event")
104+
p.events = append(p.events, *e)
105+
}
106+
107+
func (p *MachinesPlugin) waitForWorkloadLabel(namespace, podName string) (string, error) {
108+
ticker := time.NewTicker(time.Second)
109+
defer ticker.Stop()
110+
111+
timeout := time.After(time.Minute)
112+
for {
113+
select {
114+
case <-p.ctx.Done():
115+
return "", fmt.Errorf("context cancelled")
116+
case <-timeout:
117+
return "", fmt.Errorf("timed out waiting for ci-workload label")
118+
case <-ticker.C:
119+
pod := &corev1.Pod{}
120+
if err := p.client.Get(p.ctx, ctrlruntimeclient.ObjectKey{Namespace: namespace, Name: podName}, pod); err != nil {
121+
p.logger.WithError(err).Debugf("Failed to get pod %s/%s while waiting for workload label", namespace, podName)
122+
continue
123+
}
124+
125+
workload := pod.Labels[CIWorkloadLabel]
126+
if workload != "" {
127+
return workload, nil
128+
}
129+
}
130+
}
131+
}
132+
133+
func (p *MachinesPlugin) Events() []MetricsEvent {
134+
p.mu.Lock()
135+
defer p.mu.Unlock()
136+
out := make([]MetricsEvent, len(p.events))
137+
for i := range p.events {
138+
out[i] = &p.events[i]
139+
}
140+
return out
141+
}
142+
143+
func (p *MachinesPlugin) getAutoscaler(machineSetName string) *AutoscalerInfo {
144+
for _, autoscaler := range p.autoscalers {
145+
if autoscaler.Spec.ScaleTargetRef.Name == machineSetName {
146+
return &AutoscalerInfo{
147+
Name: autoscaler.Name,
148+
Min: int(autoscaler.Spec.MinReplicas),
149+
Max: int(autoscaler.Spec.MaxReplicas),
150+
}
151+
}
152+
}
153+
return nil
154+
}
155+
156+
func (p *MachinesPlugin) getWorkloadCounts(workload string) WorkloadNodeCount {
157+
ret := WorkloadNodeCount{Workload: workload}
158+
machineSetList := &machinev1beta1.MachineSetList{}
159+
if err := p.client.List(p.ctx, machineSetList, ctrlruntimeclient.InNamespace(MachineAPINamespace)); err != nil {
160+
p.logger.WithError(err).Warn("Failed to list MachineSets")
161+
return WorkloadNodeCount{}
162+
}
163+
164+
for _, ms := range machineSetList.Items {
165+
msWorkload := ms.Spec.Template.Spec.ObjectMeta.Labels[CIWorkloadLabel]
166+
if msWorkload != workload {
167+
continue
168+
}
169+
170+
current := int(ms.Status.Replicas)
171+
autoscaler := p.getAutoscaler(ms.Name)
172+
173+
machineList := &machinev1beta1.MachineList{}
174+
if err := p.client.List(p.ctx, machineList,
175+
ctrlruntimeclient.InNamespace(MachineAPINamespace),
176+
ctrlruntimeclient.MatchingLabels{MachineSetLabel: ms.Name}); err != nil {
177+
p.logger.WithError(err).Warnf("Failed to list Machines for MachineSet %s", ms.Name)
178+
continue
179+
}
180+
181+
var machines []MachineInfo
182+
for _, machine := range machineList.Items {
183+
phase := "Unknown"
184+
if machine.Status.Phase != nil {
185+
phase = *machine.Status.Phase
186+
}
187+
machines = append(machines, MachineInfo{
188+
Name: machine.Name,
189+
Phase: phase,
190+
})
191+
}
192+
193+
ret.TotalMachines += current
194+
ret.MachineSets = append(ret.MachineSets, MachineSetCount{
195+
Name: ms.Name,
196+
Current: current,
197+
Autoscaler: autoscaler,
198+
Machines: machines,
199+
})
200+
}
201+
202+
return ret
203+
}

pkg/metrics/machines_test.go

Lines changed: 132 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,132 @@
1+
package metrics
2+
3+
import (
4+
"context"
5+
"testing"
6+
7+
"github.com/google/go-cmp/cmp"
8+
"github.com/sirupsen/logrus"
9+
10+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
11+
"k8s.io/client-go/kubernetes/scheme"
12+
ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
13+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
14+
15+
machinev1beta1 "github.com/openshift/api/machine/v1beta1"
16+
autoscalingv1beta1 "github.com/openshift/cluster-autoscaler-operator/pkg/apis/autoscaling/v1beta1"
17+
)
18+
19+
func init() {
20+
_ = machinev1beta1.AddToScheme(scheme.Scheme)
21+
_ = autoscalingv1beta1.SchemeBuilder.AddToScheme(scheme.Scheme)
22+
}
23+
24+
func TestMachinesPlugin_Record(t *testing.T) {
25+
testCases := []struct {
26+
name string
27+
objects []ctrlruntimeclient.Object
28+
autoscalers []autoscalingv1beta1.MachineAutoscaler
29+
event *MachinesEvent
30+
expected []MetricsEvent
31+
}{
32+
{
33+
name: "machines event with workload",
34+
autoscalers: []autoscalingv1beta1.MachineAutoscaler{
35+
{
36+
ObjectMeta: metav1.ObjectMeta{Name: "tests-amd64-us-east-1a-autoscaler"},
37+
Spec: autoscalingv1beta1.MachineAutoscalerSpec{MinReplicas: 0, MaxReplicas: 40, ScaleTargetRef: autoscalingv1beta1.CrossVersionObjectReference{Name: "tests-amd64-us-east-1a"}},
38+
},
39+
},
40+
objects: []ctrlruntimeclient.Object{
41+
&machinev1beta1.MachineSet{
42+
ObjectMeta: metav1.ObjectMeta{
43+
Name: "tests-amd64-us-east-1a",
44+
Namespace: MachineAPINamespace,
45+
},
46+
Spec: machinev1beta1.MachineSetSpec{
47+
Template: machinev1beta1.MachineTemplateSpec{
48+
Spec: machinev1beta1.MachineSpec{
49+
ObjectMeta: machinev1beta1.ObjectMeta{
50+
Labels: map[string]string{CIWorkloadLabel: "tests"},
51+
},
52+
},
53+
},
54+
},
55+
Status: machinev1beta1.MachineSetStatus{Replicas: 3},
56+
},
57+
&machinev1beta1.Machine{
58+
ObjectMeta: metav1.ObjectMeta{
59+
Name: "tests-machine-1",
60+
Namespace: MachineAPINamespace,
61+
Labels: map[string]string{MachineSetLabel: "tests-amd64-us-east-1a"},
62+
},
63+
Status: machinev1beta1.MachineStatus{Phase: stringPtr("Running")},
64+
},
65+
&machinev1beta1.Machine{
66+
ObjectMeta: metav1.ObjectMeta{
67+
Name: "tests-machine-2",
68+
Namespace: MachineAPINamespace,
69+
Labels: map[string]string{MachineSetLabel: "tests-amd64-us-east-1a"},
70+
},
71+
Status: machinev1beta1.MachineStatus{Phase: stringPtr("Running")},
72+
},
73+
&machinev1beta1.Machine{
74+
ObjectMeta: metav1.ObjectMeta{
75+
Name: "tests-machine-3",
76+
Namespace: MachineAPINamespace,
77+
Labels: map[string]string{MachineSetLabel: "tests-amd64-us-east-1a"},
78+
},
79+
Status: machinev1beta1.MachineStatus{Phase: stringPtr("Provisioning")},
80+
},
81+
},
82+
event: &MachinesEvent{
83+
Type: PodCreation,
84+
PodName: "test-pod",
85+
Namespace: "test-ns",
86+
Workload: "tests",
87+
},
88+
expected: []MetricsEvent{
89+
&MachinesEvent{
90+
Type: PodCreation,
91+
PodName: "test-pod",
92+
Namespace: "test-ns",
93+
Workload: "tests",
94+
WorkloadCapacity: WorkloadNodeCount{
95+
Workload: "tests",
96+
TotalMachines: 3,
97+
MachineSets: []MachineSetCount{
98+
{
99+
Name: "tests-amd64-us-east-1a",
100+
Current: 3,
101+
Autoscaler: &AutoscalerInfo{Name: "tests-amd64-us-east-1a-autoscaler", Min: 0, Max: 40},
102+
Machines: []MachineInfo{
103+
{Name: "tests-machine-1", Phase: "Running"},
104+
{Name: "tests-machine-2", Phase: "Running"},
105+
{Name: "tests-machine-3", Phase: "Provisioning"},
106+
},
107+
},
108+
},
109+
},
110+
},
111+
},
112+
},
113+
}
114+
115+
for _, tc := range testCases {
116+
t.Run(tc.name, func(t *testing.T) {
117+
plugin := NewMachinesPlugin(
118+
context.Background(),
119+
logrus.WithField("test", tc.name),
120+
fake.NewClientBuilder().WithObjects(tc.objects...).Build(),
121+
tc.autoscalers,
122+
)
123+
124+
plugin.Record(tc.event)
125+
126+
events := plugin.Events()
127+
if diff := cmp.Diff(tc.expected, events); diff != "" {
128+
t.Errorf("unexpected events (-want +got):\n%s", diff)
129+
}
130+
})
131+
}
132+
}

0 commit comments

Comments
 (0)