Skip to content

Commit 05a3a18

Browse files
committed
[ci-operator]: expand the pod lifecycle metrics to include the state of the machinesets
Signed-off-by: Nikolaos Moraitis <nmoraiti@redhat.com>
1 parent d7866ae commit 05a3a18

14 files changed

Lines changed: 693 additions & 43 deletions

File tree

go.mod

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ require (
3333
github.com/montanaflynn/stats v0.6.3
3434
github.com/openhistogram/circonusllhist v0.3.1-0.20210608220433-1bd1bfa6c998
3535
github.com/openshift-eng/openshift-goimports v0.0.0-20220201193023-4f8ea117352c
36-
github.com/openshift/api v0.0.0-20240918231400-8f6ded478e8a
36+
github.com/openshift/api v0.0.0-20240919193929-2669d1ebc910
3737
github.com/openshift/imagebuilder v1.2.15
3838
github.com/openshift/openshift-apiserver v0.0.0-alpha.0
3939
github.com/pkg/errors v0.9.1
@@ -185,11 +185,12 @@ require (
185185
github.com/golang-jwt/jwt v3.2.2+incompatible
186186
github.com/jhump/protoreflect v1.17.0
187187
github.com/openshift/builder v0.0.0-20240610114444-739f5270219e
188-
github.com/openshift/client-go v0.0.0-20240528061634-b054aa794d87
188+
github.com/openshift/client-go v0.0.0-20240918182115-6a8ead8397fd
189189
github.com/openshift/cloud-credential-operator v0.0.0-20250120201329-db5f2531a5b4
190+
github.com/openshift/cluster-autoscaler-operator v0.0.1-0.20241204142113-43631b045675
190191
github.com/openshift/hive/apis v0.0.0-20230525214126-ab571664f899
191192
github.com/openshift/installer v1.4.17
192-
github.com/openshift/library-go v0.0.0-20240207105404-126b47137408
193+
github.com/openshift/library-go v0.0.0-20240919205913-c96b82b3762b
193194
github.com/ovn-org/ovn-kubernetes/go-controller v0.0.0-20240710195803-425a328cd172
194195
github.com/robfig/cron/v3 v3.0.1
195196
github.com/stretchr/testify v1.10.0

go.sum

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -890,14 +890,16 @@ github.com/openhistogram/circonusllhist v0.3.1-0.20210608220433-1bd1bfa6c998 h1:
890890
github.com/openhistogram/circonusllhist v0.3.1-0.20210608220433-1bd1bfa6c998/go.mod h1:PfeYJ/RW2+Jfv3wTz0upbY2TRour/LLqIm2K2Kw5zg0=
891891
github.com/openshift-eng/openshift-goimports v0.0.0-20220201193023-4f8ea117352c h1:J1LrylwCrM+K2KiY1VVPYgHTZ56bOBUTUG/WO09oXfA=
892892
github.com/openshift-eng/openshift-goimports v0.0.0-20220201193023-4f8ea117352c/go.mod h1:Az5+ON7o5C4Ag2GKFbwBjQO/aQNMNzmD1JLse5w4KRs=
893-
github.com/openshift/api v0.0.0-20240918231400-8f6ded478e8a h1:PVk9YmhCVGDSTFzOkhT81vp062LtYjA4M12iWIuqJRs=
894-
github.com/openshift/api v0.0.0-20240918231400-8f6ded478e8a/go.mod h1:OOh6Qopf21pSzqNVCB5gomomBXb8o5sGKZxG2KNpaXM=
893+
github.com/openshift/api v0.0.0-20240919193929-2669d1ebc910 h1:8pA9Ugq0hhUbVaVWi5lgF4PGaV1ozpTI1NYn/QWqspg=
894+
github.com/openshift/api v0.0.0-20240919193929-2669d1ebc910/go.mod h1:OOh6Qopf21pSzqNVCB5gomomBXb8o5sGKZxG2KNpaXM=
895895
github.com/openshift/builder v0.0.0-20240610114444-739f5270219e h1:XmTo1vVHVAcyd9I2UIYnW3PCHUmA8y/gowm3k8Yq7ww=
896896
github.com/openshift/builder v0.0.0-20240610114444-739f5270219e/go.mod h1:nsFLJ3C4RC+6qP2tino47TxLyDpFRxAABrsIvIuap1E=
897-
github.com/openshift/client-go v0.0.0-20240528061634-b054aa794d87 h1:JtLhaGpSEconE+1IKmIgCOof/Len5ceG6H1pk43yv5U=
898-
github.com/openshift/client-go v0.0.0-20240528061634-b054aa794d87/go.mod h1:3IPD4U0qyovZS4EFady2kqY32m8lGcbs/Wx+yprg9z8=
897+
github.com/openshift/client-go v0.0.0-20240918182115-6a8ead8397fd h1:Gd0+bYdcfGIsDOJ8BwTJJjQeXoziyIsTwqp/s38rKyM=
898+
github.com/openshift/client-go v0.0.0-20240918182115-6a8ead8397fd/go.mod h1:EB7GeA/vpf9AHklMgnnT0+uG6l/3f8cChtCFbJFrk4g=
899899
github.com/openshift/cloud-credential-operator v0.0.0-20250120201329-db5f2531a5b4 h1:nrD3npDGt5bvwNXZKTzzEuZTI/4Uo5PbrkpAjfxhxtE=
900900
github.com/openshift/cloud-credential-operator v0.0.0-20250120201329-db5f2531a5b4/go.mod h1:Lzu29TMne5LsgPnyw2n9jrPiD5t6uyG5aE6KFy8cz6w=
901+
github.com/openshift/cluster-autoscaler-operator v0.0.1-0.20241204142113-43631b045675 h1:hPFyXtaR42wqKKGMVP4G7M2vmp5iBxBqKetMguh4Td0=
902+
github.com/openshift/cluster-autoscaler-operator v0.0.1-0.20241204142113-43631b045675/go.mod h1:0tGCwMCgKq7KhJWDGr6Tsqqb6Sk3epz/b6tfFDFK1Ug=
901903
github.com/openshift/custom-resource-status v1.1.3-0.20220503160415-f2fdb4999d87 h1:cHyxR+Y8rAMT6m1jQCaYGRwikqahI0OjjUDhFNf3ySQ=
902904
github.com/openshift/custom-resource-status v1.1.3-0.20220503160415-f2fdb4999d87/go.mod h1:DB/Mf2oTeiAmVVX1gN+NEqweonAPY0TKUwADizj8+ZA=
903905
github.com/openshift/hive/apis v0.0.0-20230525214126-ab571664f899 h1:+HkBwPi47wWXKNLAx82Bh567S6dE6dLD3oMD/VQuXy8=
@@ -906,8 +908,8 @@ github.com/openshift/imagebuilder v1.2.15 h1:MNn1OztEE/l8pSEDPYAQ71Ys6rpXA2P00UF
906908
github.com/openshift/imagebuilder v1.2.15/go.mod h1:cK6MLyBl1IHmIYGLY/2SLOG6p0PtEDUOC7khxsFYUXE=
907909
github.com/openshift/installer v1.4.17 h1:63iijBBgYqQX/p2+Q74gPqnfBN5VNSWX5LxQKuLlj6g=
908910
github.com/openshift/installer v1.4.17/go.mod h1:CtlMEGKJDVMZl4qVBC/xMUXM24YnleT6bakI+KXFAhk=
909-
github.com/openshift/library-go v0.0.0-20240207105404-126b47137408 h1:Evg6GEvEuyj9toFX14YenXI6hGRnhLWqYx/rHO7VnQ4=
910-
github.com/openshift/library-go v0.0.0-20240207105404-126b47137408/go.mod h1:ePlaOqUiPplRc++6aYdMe+2FmXb2xTNS9Nz5laG2YmI=
911+
github.com/openshift/library-go v0.0.0-20240919205913-c96b82b3762b h1:y2DduJug7UZqTu0QTkRPAu73nskuUbFA66fmgxVf/fI=
912+
github.com/openshift/library-go v0.0.0-20240919205913-c96b82b3762b/go.mod h1:f8QcnrooSwGa96xI4UaKbKGJZskhTCGeimXKyc4t/ZU=
911913
github.com/openshift/openshift-apiserver v0.0.0-alpha.0 h1:Wk9BYBcIhamxOrK9OyNbmDM5Qb2nzuePr0Pvg7psWKA=
912914
github.com/openshift/openshift-apiserver v0.0.0-alpha.0/go.mod h1:c/cKFsTkfxQC+RhDNdKBwLhPS9ytARfzSiotFRDOFWE=
913915
github.com/ovn-org/ovn-kubernetes/go-controller v0.0.0-20240710195803-425a328cd172 h1:ZvyylzsX7bEpJOq8upPRQRpVJBI8mp/YTCica9PIRAc=

pkg/metrics/metrics.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,32 @@ import (
1111
"github.com/sirupsen/logrus"
1212

1313
corev1 "k8s.io/api/core/v1"
14+
"k8s.io/client-go/kubernetes/scheme"
1415
"k8s.io/client-go/rest"
1516
metricsclient "k8s.io/metrics/pkg/client/clientset/versioned"
1617
ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
1718

1819
configv1 "github.com/openshift/api/config/v1"
20+
machinev1beta1 "github.com/openshift/api/machine/v1beta1"
21+
autoscalingv1beta1 "github.com/openshift/cluster-autoscaler-operator/pkg/apis/autoscaling/v1beta1"
1922

2023
"github.com/openshift/ci-tools/pkg/api"
2124
"github.com/openshift/ci-tools/pkg/lease"
2225
"github.com/openshift/ci-tools/pkg/secrets"
2326
)
2427

28+
func init() {
29+
if err := machinev1beta1.AddToScheme(scheme.Scheme); err != nil {
30+
logrus.WithError(err).Error("failed to add machinev1beta1 scheme")
31+
}
32+
if err := autoscalingv1beta1.SchemeBuilder.AddToScheme(scheme.Scheme); err != nil {
33+
logrus.WithError(err).Error("failed to add autoscalingv1beta1 scheme")
34+
}
35+
}
36+
2537
const (
2638
CIOperatorMetricsJSON = "ci-operator-metrics.json"
39+
CIWorkloadLabel = "ci-workload"
2740
)
2841

2942
// MetricsEvent is the interface that every metric event must implement.
@@ -65,6 +78,12 @@ func NewMetricsAgent(ctx context.Context, clusterConfig *rest.Config, censor *se
6578
}
6679

6780
logger := logrus.WithField("component", "metricsAgent")
81+
82+
autoscalerList := &autoscalingv1beta1.MachineAutoscalerList{}
83+
if err := client.List(ctx, autoscalerList); err != nil {
84+
logger.WithError(err).Warn("Failed to list MachineAutoscalers at initialization")
85+
}
86+
6887
return &MetricsAgent{
6988
ctx: ctx,
7089
events: make(chan MetricsEvent, 100),
@@ -76,7 +95,7 @@ func NewMetricsAgent(ctx context.Context, clusterConfig *rest.Config, censor *se
7695
buildPlugin: newBuildPlugin(ctx, logger, client),
7796
nodesPlugin: newNodesMetricsPlugin(ctx, logger, client, metricsClient, nodesCh),
7897
leasePlugin: newLeasesPlugin(logger),
79-
podPlugin: NewPodLifecyclePlugin(ctx, logger, client),
98+
podPlugin: NewPodLifecyclePlugin(ctx, logger, client, autoscalerList.Items),
8099
imagesPlugin: newImagesPlugin(ctx, logger, client),
81100
}, nil
82101
}

pkg/metrics/pods.go

Lines changed: 72 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,26 @@ import (
1010
corev1 "k8s.io/api/core/v1"
1111
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1212
ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
13+
14+
machinev1beta1 "github.com/openshift/api/machine/v1beta1"
15+
autoscalingv1beta1 "github.com/openshift/cluster-autoscaler-operator/pkg/apis/autoscaling/v1beta1"
1316
)
1417

18+
// MachineSetCount is the JSON record emitted for a single MachineSet: its
// name, its current replica count, and the min/max replica bounds associated
// with it. All fields are always serialized (no omitempty).
type MachineSetCount struct {
	Name    string `json:"name"`    // name of the MachineSet
	Current int    `json:"current"` // replica count currently reported for the MachineSet
	Min     int    `json:"min"`     // lower replica bound; 0 when none is known
	Max     int    `json:"max"`     // upper replica bound; 0 when none is known
}
24+
25+
type WorkloadNodeCount struct {
26+
Workload string `json:"workload"`
27+
Current int `json:"current"`
28+
Min int `json:"min"`
29+
Max int `json:"max"`
30+
MachineSets []MachineSetCount `json:"machine_sets"`
31+
}
32+
1533
type PodLifecycleMetricsEvent struct {
1634
PodName string `json:"pod_name,omitempty"`
1735
Namespace string `json:"namespace,omitempty"`
@@ -31,22 +49,30 @@ type PodLifecycleMetricsEvent struct {
3149
InitContainerRestarts int `json:"init_container_restarts,omitempty"`
3250
InitContainerLastError string `json:"init_container_last_error,omitempty"`
3351
Timestamp time.Time `json:"timestamp,omitempty"`
52+
53+
WorkloadCapacity WorkloadNodeCount `json:"workload_capacity,omitempty"`
3454
}
3555

3656
func (e *PodLifecycleMetricsEvent) SetTimestamp(ts time.Time) {
3757
e.Timestamp = ts
3858
}
3959

4060
type PodLifecyclePlugin struct {
41-
ctx context.Context
42-
logger *logrus.Entry
43-
mu sync.Mutex
44-
events []PodLifecycleMetricsEvent
45-
client ctrlruntimeclient.Client
61+
ctx context.Context
62+
logger *logrus.Entry
63+
mu sync.Mutex
64+
events []PodLifecycleMetricsEvent
65+
client ctrlruntimeclient.Client
66+
autoscalers []autoscalingv1beta1.MachineAutoscaler
4667
}
4768

48-
func NewPodLifecyclePlugin(ctx context.Context, logger *logrus.Entry, client ctrlruntimeclient.Client) *PodLifecyclePlugin {
49-
return &PodLifecyclePlugin{ctx: ctx, logger: logger.WithField("plugin", "pods"), client: client}
69+
func NewPodLifecyclePlugin(ctx context.Context, logger *logrus.Entry, client ctrlruntimeclient.Client, autoscalers []autoscalingv1beta1.MachineAutoscaler) *PodLifecyclePlugin {
70+
return &PodLifecyclePlugin{
71+
ctx: ctx,
72+
logger: logger.WithField("plugin", "pods"),
73+
client: client,
74+
autoscalers: autoscalers,
75+
}
5076
}
5177

5278
func (p *PodLifecyclePlugin) Name() string {
@@ -70,7 +96,10 @@ func (p *PodLifecyclePlugin) Record(ev MetricsEvent) {
7096
e.CreationTime = &pod.CreationTimestamp.Time
7197
e.StartTime = &pod.Status.StartTime.Time
7298
e.CompletionTime = getPodCompletionTime(pod)
73-
e.CIWorkload = pod.Labels["ci-workload"]
99+
e.CIWorkload = pod.Labels[CIWorkloadLabel]
100+
if e.CIWorkload != "" {
101+
e.WorkloadCapacity = p.getWorkloadCounts(e.CIWorkload)
102+
}
74103

75104
// Only set pod phase if not already set by caller (preserves success/failure determination)
76105
if e.PodPhase == "" {
@@ -125,6 +154,41 @@ func (p *PodLifecyclePlugin) Events() []MetricsEvent {
125154
return out
126155
}
127156

157+
func (p *PodLifecyclePlugin) getMinMax(machineSetName string) (int, int) {
158+
for _, autoscaler := range p.autoscalers {
159+
if autoscaler.Spec.ScaleTargetRef.Name == machineSetName {
160+
return int(autoscaler.Spec.MinReplicas), int(autoscaler.Spec.MaxReplicas)
161+
}
162+
}
163+
return 0, 0
164+
}
165+
166+
func (p *PodLifecyclePlugin) getWorkloadCounts(workload string) WorkloadNodeCount {
167+
ret := WorkloadNodeCount{Workload: workload}
168+
machineSetList := &machinev1beta1.MachineSetList{}
169+
if err := p.client.List(p.ctx, machineSetList); err != nil {
170+
p.logger.WithError(err).Warn("Failed to list MachineSets")
171+
return WorkloadNodeCount{}
172+
}
173+
174+
for _, ms := range machineSetList.Items {
175+
msWorkload := ms.Spec.Template.Spec.ObjectMeta.Labels[CIWorkloadLabel]
176+
if msWorkload != workload {
177+
continue
178+
}
179+
180+
current := int(ms.Status.Replicas)
181+
min, max := p.getMinMax(ms.Name)
182+
183+
ret.Current += current
184+
ret.Min += min
185+
ret.Max += max
186+
ret.MachineSets = append(ret.MachineSets, MachineSetCount{Name: ms.Name, Current: current, Min: min, Max: max})
187+
}
188+
189+
return ret
190+
}
191+
128192
func getPodCompletionTime(pod *corev1.Pod) *time.Time {
129193
var end metav1.Time
130194
for _, status := range pod.Status.ContainerStatuses {

0 commit comments

Comments
 (0)