Skip to content

Commit b0b3eec

Browse files
committed
Add authoritative mode and improvements to pod scaler resource recommendations
1 parent 4c0387d commit b0b3eec

9 files changed

Lines changed: 1061 additions & 17 deletions

File tree

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ index.js
1010
# default working dir
1111
/job-aggregator-working-dir
1212
# go built binary
13-
/job-run-aggregator
13+
/job-run-aggregator
14+
pod-scaler

cmd/pod-scaler/admission.go

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -196,8 +196,14 @@ func mutatePodLabels(pod *corev1.Pod, build *buildv1.Build) {
196196
}
197197
}
198198

199-
// useOursIfLarger updates fields in theirs when ours are larger
200-
func useOursIfLarger(allOfOurs, allOfTheirs *corev1.ResourceRequirements, workloadName, workloadType string, reporter results.PodScalerReporter, logger *logrus.Entry) {
199+
// applyRecommendationsBasedOnRecentData applies resource recommendations based on recent usage data
200+
// (see resourceRecommendationWindow). If they used more, we increase resources. If they used less,
201+
// we decrease them (pod-scaler is always authoritative).
202+
//
203+
// The reduction behavior is exercised in admission_test.go as part of
204+
// TestUseOursIfLarger, whose test cases verify the gradual-reduction path
205+
// and handle ResourceQuantity comparison correctly.
206+
func applyRecommendationsBasedOnRecentData(allOfOurs, allOfTheirs *corev1.ResourceRequirements, workloadName, workloadType string, reporter results.PodScalerReporter, logger *logrus.Entry) {
201207
for _, item := range []*corev1.ResourceRequirements{allOfOurs, allOfTheirs} {
202208
if item.Requests == nil {
203209
item.Requests = corev1.ResourceList{}
@@ -215,6 +221,10 @@ func useOursIfLarger(allOfOurs, allOfTheirs *corev1.ResourceRequirements, worklo
215221
} {
216222
for _, field := range []corev1.ResourceName{corev1.ResourceCPU, corev1.ResourceMemory} {
217223
our := (*pair.ours)[field]
224+
// If we have no recommendation for this resource, skip it
225+
if our.IsZero() {
226+
continue
227+
}
218228
//TODO(sgoeddel): this is a temporary experiment to see what effect setting values that are 120% of what has
219229
// been determined has on the rate of OOMKilled and similar termination of workloads
220230
increased := our.AsApproximateFloat64() * 1.2
@@ -231,13 +241,36 @@ func useOursIfLarger(allOfOurs, allOfTheirs *corev1.ResourceRequirements, worklo
231241
})
232242
cmp := our.Cmp(their)
233243
if cmp == 1 {
234-
fieldLogger.Debug("determined amount larger than configured")
244+
fieldLogger.Debug("determined amount larger than configured, increasing resources")
235245
(*pair.theirs)[field] = our
236246
if their.Value() > 0 && our.Value() > (their.Value()*10) {
237247
reporter.ReportResourceConfigurationWarning(workloadName, workloadType, their.String(), our.String(), field.String())
238248
}
239249
} else if cmp < 0 {
240-
fieldLogger.Debug("determined amount smaller than configured")
250+
// Apply gradual reduction with safety limits: max 25% reduction per cycle, minimum 5% difference
251+
ourValue := our.AsApproximateFloat64()
252+
theirValue := their.AsApproximateFloat64()
253+
if theirValue == 0 {
254+
fieldLogger.Debug("theirs is zero, applying recommendation")
255+
(*pair.theirs)[field] = our
256+
continue
257+
}
258+
259+
reductionPercent := 1.0 - (ourValue / theirValue)
260+
if reductionPercent < 0.05 {
261+
fieldLogger.Debug("difference less than 5%, skipping micro-adjustment")
262+
continue
263+
}
264+
265+
maxReductionPercent := 0.25
266+
if reductionPercent > maxReductionPercent {
267+
maxAllowed := theirValue * (1.0 - maxReductionPercent)
268+
our.Set(int64(maxAllowed))
269+
fieldLogger.Debugf("applying gradual reduction (limited to 25%% per cycle)")
270+
} else {
271+
fieldLogger.Debug("reducing resources based on recent usage")
272+
}
273+
(*pair.theirs)[field] = our
241274
} else {
242275
fieldLogger.Debug("determined amount equal to configured")
243276
}
@@ -301,7 +334,7 @@ func mutatePodResources(pod *corev1.Pod, server *resourceServer, mutateResourceL
301334
logger.Debugf("recommendation exists for: %s", containers[i].Name)
302335
workloadType := determineWorkloadType(pod.Annotations, pod.Labels)
303336
workloadName := determineWorkloadName(pod.Name, containers[i].Name, workloadType, pod.Labels)
304-
useOursIfLarger(&resources, &containers[i].Resources, workloadName, workloadType, reporter, logger)
337+
applyRecommendationsBasedOnRecentData(&resources, &containers[i].Resources, workloadName, workloadType, reporter, logger)
305338
if mutateResourceLimits {
306339
reconcileLimits(&containers[i].Resources)
307340
}

cmd/pod-scaler/admission_test.go

Lines changed: 51 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -661,7 +661,7 @@ func TestUseOursIfLarger(t *testing.T) {
661661
},
662662
},
663663
{
664-
name: "nothing in ours is larger",
664+
name: "ours are smaller - should reduce resources based on recent usage",
665665
ours: corev1.ResourceRequirements{
666666
Limits: corev1.ResourceList{
667667
corev1.ResourceCPU: *resource.NewQuantity(10, resource.DecimalSI),
@@ -684,12 +684,16 @@ func TestUseOursIfLarger(t *testing.T) {
684684
},
685685
expected: corev1.ResourceRequirements{
686686
Limits: corev1.ResourceList{
687-
corev1.ResourceCPU: *resource.NewQuantity(200, resource.DecimalSI),
688-
corev1.ResourceMemory: *resource.NewQuantity(3e10, resource.BinarySI),
687+
// Ours: 10 * 1.2 = 12, Theirs: 200, Reduction: 94% > 25%, so limit to 25%: 200 * 0.75 = 150
688+
corev1.ResourceCPU: *resource.NewQuantity(150, resource.DecimalSI),
689+
// Ours: 10 * 1.2 = 12, Theirs: 3e10, Reduction: >99% > 25%, so limit to 25%: 3e10 * 0.75 = 2.25e10
690+
corev1.ResourceMemory: *resource.NewQuantity(225e8, resource.BinarySI),
689691
},
690692
Requests: corev1.ResourceList{
691-
corev1.ResourceCPU: *resource.NewQuantity(100, resource.DecimalSI),
692-
corev1.ResourceMemory: *resource.NewQuantity(2e10, resource.BinarySI),
693+
// Ours: 10 * 1.2 = 12, Theirs: 100, Reduction: 88% > 25%, so limit to 25%: 100 * 0.75 = 75
694+
corev1.ResourceCPU: *resource.NewQuantity(75, resource.DecimalSI),
695+
// Ours: 10 * 1.2 = 12, Theirs: 2e10, Reduction: >99% > 25%, so limit to 25%: 2e10 * 0.75 = 1.5e10
696+
corev1.ResourceMemory: *resource.NewQuantity(15e9, resource.BinarySI),
693697
},
694698
},
695699
},
@@ -717,19 +721,57 @@ func TestUseOursIfLarger(t *testing.T) {
717721
},
718722
expected: corev1.ResourceRequirements{
719723
Limits: corev1.ResourceList{
720-
corev1.ResourceCPU: *resource.NewQuantity(480, resource.DecimalSI),
721-
corev1.ResourceMemory: *resource.NewQuantity(3e10, resource.BinarySI),
724+
corev1.ResourceCPU: *resource.NewQuantity(480, resource.DecimalSI),
725+
// Ours: 10 * 1.2 = 12, Theirs: 3e10, Reduction: >99% > 25%, so limit to 25%: 3e10 * 0.75 = 2.25e10
726+
corev1.ResourceMemory: *resource.NewQuantity(225e8, resource.BinarySI),
722727
},
723728
Requests: corev1.ResourceList{
724729
corev1.ResourceCPU: *resource.NewQuantity(1200, resource.DecimalSI),
725730
corev1.ResourceMemory: *resource.NewQuantity(48e9, resource.BinarySI),
726731
},
727732
},
728733
},
734+
{
735+
name: "ours are moderately smaller - gradual reduction limited to 25% per cycle",
736+
ours: corev1.ResourceRequirements{
737+
Limits: corev1.ResourceList{
738+
corev1.ResourceCPU: *resource.NewQuantity(50, resource.DecimalSI),
739+
corev1.ResourceMemory: *resource.NewQuantity(1e9, resource.BinarySI),
740+
},
741+
Requests: corev1.ResourceList{
742+
corev1.ResourceCPU: *resource.NewQuantity(25, resource.DecimalSI),
743+
corev1.ResourceMemory: *resource.NewQuantity(5e9, resource.BinarySI),
744+
},
745+
},
746+
theirs: corev1.ResourceRequirements{
747+
Limits: corev1.ResourceList{
748+
corev1.ResourceCPU: *resource.NewQuantity(200, resource.DecimalSI),
749+
corev1.ResourceMemory: *resource.NewQuantity(3e10, resource.BinarySI),
750+
},
751+
Requests: corev1.ResourceList{
752+
corev1.ResourceCPU: *resource.NewQuantity(100, resource.DecimalSI),
753+
corev1.ResourceMemory: *resource.NewQuantity(2e10, resource.BinarySI),
754+
},
755+
},
756+
expected: corev1.ResourceRequirements{
757+
Limits: corev1.ResourceList{
758+
// Ours: 50 * 1.2 = 60, Theirs: 200, Reduction: 70% > 25%, so limit to 25%: 200 * 0.75 = 150
759+
corev1.ResourceCPU: *resource.NewQuantity(150, resource.DecimalSI),
760+
// Ours: 1e9 * 1.2 = 1.2e9, Theirs: 3e10, Reduction: 96% > 25%, so limit to 25%: 3e10 * 0.75 = 2.25e10
761+
corev1.ResourceMemory: *resource.NewQuantity(225e8, resource.BinarySI),
762+
},
763+
Requests: corev1.ResourceList{
764+
// Ours: 25 * 1.2 = 30, Theirs: 100, Reduction: 70% > 25%, so limit to 25%: 100 * 0.75 = 75
765+
corev1.ResourceCPU: *resource.NewQuantity(75, resource.DecimalSI),
766+
// Ours: 5e9 * 1.2 = 6e9, Theirs: 2e10, Reduction: 70% > 25%, so limit to 25%: 2e10 * 0.75 = 1.5e10
767+
corev1.ResourceMemory: *resource.NewQuantity(15e9, resource.BinarySI),
768+
},
769+
},
770+
},
729771
}
730772
for _, testCase := range testCases {
731773
t.Run(testCase.name, func(t *testing.T) {
732-
useOursIfLarger(&testCase.ours, &testCase.theirs, "test", "build", &defaultReporter, logrus.WithField("test", testCase.name))
774+
applyRecommendationsBasedOnRecentData(&testCase.ours, &testCase.theirs, "test", "build", &defaultReporter, logrus.WithField("test", testCase.name))
733775
if diff := cmp.Diff(testCase.theirs, testCase.expected); diff != "" {
734776
t.Errorf("%s: got incorrect resources after mutation: %v", testCase.name, diff)
735777
}
@@ -814,7 +856,7 @@ func TestUseOursIsLarger_ReporterReports(t *testing.T) {
814856

815857
for _, tc := range testCases {
816858
t.Run(tc.name, func(t *testing.T) {
817-
useOursIfLarger(&tc.ours, &tc.theirs, "test", "build", &tc.reporter, logrus.WithField("test", tc.name))
859+
applyRecommendationsBasedOnRecentData(&tc.ours, &tc.theirs, "test", "build", &tc.reporter, logrus.WithField("test", tc.name))
818860

819861
if diff := cmp.Diff(tc.reporter.called, tc.expected); diff != "" {
820862
t.Errorf("actual and expected reporter states don't match, : %v", diff)

0 commit comments

Comments
 (0)