Commit d70644b

openshift-merge-bot[bot] and deepsm007 authored and committed

Add measured pods feature with BigQuery integration

2 parents a66adc9 + e37f16d

19 files changed

Lines changed: 363 additions & 118 deletions

.gitignore

Lines changed: 4 additions & 1 deletion
@@ -10,4 +10,7 @@ index.js
 # default working dir
 /job-aggregator-working-dir
 # go built binary
-/job-run-aggregator
+/job-run-aggregator
+
+pod-scaler
+job-run-aggregator

cmd/ci-operator/main.go

Lines changed: 32 additions & 1 deletion
@@ -91,6 +91,7 @@ import (
 	"github.com/openshift/ci-tools/pkg/results"
 	"github.com/openshift/ci-tools/pkg/secrets"
 	"github.com/openshift/ci-tools/pkg/steps"
+	tooldetector "github.com/openshift/ci-tools/pkg/tool-detector"
 	"github.com/openshift/ci-tools/pkg/util"
 	"github.com/openshift/ci-tools/pkg/util/gzip"
 	"github.com/openshift/ci-tools/pkg/validation"
@@ -451,6 +452,8 @@ type options struct {
 	enableSecretsStoreCSIDriver bool

 	metricsAgent *metrics.MetricsAgent
+
+	skippedImages sets.Set[string]
 }

 func bindOptions(flag *flag.FlagSet) *options {
@@ -606,6 +609,7 @@ func (o *options) Complete() error {
 	if err := validation.IsValidResolvedConfiguration(o.configSpec, mergedConfig); err != nil {
 		return results.ForReason("validating_config").ForError(err)
 	}
+	o.skippedImages = determineSkippedImages(o.configSpec, o.jobSpec, o.targets.values)
 	o.graphConfig = defaults.FromConfigStatic(o.configSpec)
 	if err := validation.IsValidGraphConfiguration(o.graphConfig.Steps); err != nil {
 		return results.ForReason("validating_config").ForError(err)
@@ -956,7 +960,7 @@ func (o *options) Run() []error {
 	// load the graph from the configuration
 	buildSteps, promotionSteps, err := defaults.FromConfig(ctx, o.configSpec, &o.graphConfig, o.jobSpec, o.templates, o.writeParams, o.promote, o.clusterConfig,
 		o.podPendingTimeout, leaseClient, o.targets.values, o.cloneAuthConfig, o.pullSecret, o.pushSecret, o.censor, o.hiveKubeconfig,
-		o.nodeName, nodeArchitectures, o.targetAdditionalSuffix, o.manifestToolDockerCfg, o.localRegistryDNS, streams, injectedTest, o.enableSecretsStoreCSIDriver, o.metricsAgent)
+		o.nodeName, nodeArchitectures, o.targetAdditionalSuffix, o.manifestToolDockerCfg, o.localRegistryDNS, streams, injectedTest, o.enableSecretsStoreCSIDriver, o.metricsAgent, o.skippedImages)
 	if err != nil {
 		return []error{results.ForReason("defaulting_config").WithError(err).Errorf("failed to generate steps from config: %v", err)}
 	}
@@ -1078,6 +1082,33 @@ func (o *options) Run() []error {
 	})
 }

+// determineSkippedImages determines which images can be skipped when
+// build_images_if_affected is enabled and the [images] target is requested.
+func determineSkippedImages(config *api.ReleaseBuildConfiguration, jobSpec *api.JobSpec, targets []string) sets.Set[string] {
+	if config == nil || jobSpec == nil || !config.BuildImagesIfAffected {
+		return nil
+	}
+
+	if !slices.Contains(targets, "[images]") {
+		return nil
+	}
+
+	detector := tooldetector.New(jobSpec, config)
+	affectedTools, err := detector.AffectedTools()
+	if err != nil {
+		logrus.WithError(err).Warn("Failed to detect affected tools; building all images")
+		return nil
+	}
+
+	skipped := sets.New[string]()
+	for _, img := range config.Images {
+		if !affectedTools.Has(string(img.To)) {
+			skipped.Insert(string(img.To))
+		}
+	}
+	return skipped
+}
+
 func runPromotionStep(ctx context.Context, step api.Step, detailsChan chan<- api.CIOperatorStepDetails, errChan chan<- error, metricsAgent *metrics.MetricsAgent) {
 	details, err := runStep(ctx, step, metricsAgent)
 	if err != nil {
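The new determineSkippedImages helper is easiest to read through an example. The following sketch is illustration only, not part of the commit: it uses hypothetical image and tool names and a plain map in place of the apimachinery sets helper, and shows that every image target the tool detector did not mark as affected ends up in the skipped set.

package main

import "fmt"

// skippedImages mirrors the filtering in determineSkippedImages above: any
// image target not named in the affected-tools set is marked as skippable.
func skippedImages(imageTargets []string, affectedTools map[string]bool) []string {
	var skipped []string
	for _, to := range imageTargets {
		if !affectedTools[to] {
			skipped = append(skipped, to)
		}
	}
	return skipped
}

func main() {
	// Hypothetical example: only pod-scaler was affected by the change under test.
	targets := []string{"ci-operator", "pod-scaler", "prow-job-dispatcher"}
	affected := map[string]bool{"pod-scaler": true}
	fmt.Println(skippedImages(targets, affected)) // [ci-operator prow-job-dispatcher]
}

In the real flow the resulting set is threaded into defaults.FromConfig via o.skippedImages, as the hunk above shows.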

cmd/pipeline-controller/config_watcher.go

Lines changed: 26 additions & 21 deletions
@@ -2,10 +2,11 @@ package main
 
 import (
 	"os"
+	"reflect"
 	"sync"
+	"time"
 
 	"github.com/sirupsen/logrus"
-	"gopkg.in/fsnotify.v1"
 	"gopkg.in/yaml.v2"
 )
 
@@ -82,35 +83,39 @@ func newWatcher(filePath string, logger *logrus.Entry) *watcher {
 }
 
 func (w *watcher) watch() {
-	fileWatcher, err := fsnotify.NewWatcher()
-	if err != nil {
-		w.logger.Fatal(err)
+	// Load initial config
+	if err := w.reloadConfig(); err != nil {
+		w.logger.WithError(err).Error("Failed to load initial config")
 	}
 
-	defer fileWatcher.Close()
-
-	err = fileWatcher.Add(w.filePath)
-	if err != nil {
-		w.logger.Fatal(err)
-	}
+	// Use polling instead of fsnotify because git-sync doesn't trigger filesystem events
+	ticker := time.NewTicker(3 * time.Minute)
+	defer ticker.Stop()
 
-	err = w.reloadConfig()
-	if err != nil {
-		w.logger.WithError(err)
-	}
+	// Store previous config for comparison
+	prevConfig := w.getConfigCopy()
 
-	for {
-		event := <-fileWatcher.Events
-		if event.Op&fsnotify.Write == fsnotify.Write {
-			err = w.reloadConfig()
-			if err != nil {
-				w.logger.WithError(err)
-			}
+	for range ticker.C {
+		if err := w.reloadConfig(); err != nil {
+			w.logger.WithError(err).Error("Failed to reload config")
+			continue
 		}
 
+		currentConfig := w.getConfigCopy()
+		if !reflect.DeepEqual(currentConfig, prevConfig) {
+			w.logger.Info("Config change detected, config reloaded successfully")
+			prevConfig = currentConfig
+		}
 	}
 }
 
+// getConfigCopy returns a deep copy of the current config for comparison
+func (w *watcher) getConfigCopy() enabledConfig {
+	w.mutex.Lock()
+	defer w.mutex.Unlock()
+	return w.config
+}
+
 func (w *watcher) reloadConfig() error {
 	w.mutex.Lock()
 	defer w.mutex.Unlock()
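The move from fsnotify to polling is motivated by the comment in the hunk: git-sync refreshes the mounted config by swapping symlinks, which does not reliably emit write events on the watched path. A minimal standalone sketch of the poll-and-compare pattern follows; the shortened interval, file path, and config fields are made up purely for illustration.

package main

import (
	"fmt"
	"os"
	"reflect"
	"time"

	"gopkg.in/yaml.v2"
)

// hypothetical config shape for the sketch; the real enabledConfig lives in config_watcher.go
type enabledConfig struct {
	Orgs []string `yaml:"orgs"`
}

func main() {
	path := "/etc/pipeline-controller/config.yaml" // hypothetical mount path
	var prev enabledConfig

	ticker := time.NewTicker(10 * time.Second) // the real watcher polls every 3 minutes
	defer ticker.Stop()
	for range ticker.C {
		raw, err := os.ReadFile(path)
		if err != nil {
			fmt.Println("failed to read config:", err)
			continue
		}
		var current enabledConfig
		if err := yaml.Unmarshal(raw, &current); err != nil {
			fmt.Println("failed to parse config:", err)
			continue
		}
		// reflect.DeepEqual detects a semantic change even when no filesystem
		// event fired for the watched path (git-sync symlink swaps).
		if !reflect.DeepEqual(current, prev) {
			fmt.Println("config change detected")
			prev = current
		}
	}
}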

cmd/pod-scaler/admission.go

Lines changed: 128 additions & 23 deletions
@@ -33,34 +33,51 @@ import (
 	"github.com/openshift/ci-tools/pkg/steps"
 )
 
-func admit(port, healthPort int, certDir string, client buildclientv1.BuildV1Interface, loaders map[string][]*cacheReloader, mutateResourceLimits bool, cpuCap int64, memoryCap string, cpuPriorityScheduling int64, reporter results.PodScalerReporter) {
+func admit(port, healthPort int, certDir string, client buildclientv1.BuildV1Interface, loaders map[string][]*cacheReloader, mutateResourceLimits bool, cpuCap int64, memoryCap string, cpuPriorityScheduling int64, authoritativeCPURequests, authoritativeMemoryRequests bool, enableMeasuredPods bool, bigQueryProjectID, bigQueryDatasetID, bigQueryCredentialsFile string, reporter results.PodScalerReporter) {
 	logger := logrus.WithField("component", "pod-scaler admission")
 	logger.Infof("Initializing admission webhook server with %d loaders.", len(loaders))
 	health := pjutil.NewHealthOnPort(healthPort)
 	resources := newResourceServer(loaders, health)
 	decoder := admission.NewDecoder(scheme.Scheme)
 
+	var bqClient *BigQueryClient
+	if enableMeasuredPods {
+		if bigQueryProjectID == "" || bigQueryDatasetID == "" {
+			logrus.Fatal("bigquery-project-id and bigquery-dataset-id are required when enable-measured-pods is true")
+		}
+		cache := NewMeasuredPodCache(logger)
+		var err error
+		bqClient, err = NewBigQueryClient(bigQueryProjectID, bigQueryDatasetID, bigQueryCredentialsFile, cache, logger)
+		if err != nil {
+			logrus.WithError(err).Fatal("Failed to create BigQuery client for measured pods")
+		}
+		logger.Info("Measured pods feature enabled with BigQuery integration")
+	}
+
 	server := webhook.NewServer(webhook.Options{
 		Port:    port,
 		CertDir: certDir,
 	})
-	server.Register("/pods", &webhook.Admission{Handler: &podMutator{logger: logger, client: client, decoder: decoder, resources: resources, mutateResourceLimits: mutateResourceLimits, cpuCap: cpuCap, memoryCap: memoryCap, cpuPriorityScheduling: cpuPriorityScheduling, reporter: reporter}})
+	server.Register("/pods", &webhook.Admission{Handler: &podMutator{logger: logger, client: client, decoder: decoder, resources: resources, mutateResourceLimits: mutateResourceLimits, cpuCap: cpuCap, memoryCap: memoryCap, cpuPriorityScheduling: cpuPriorityScheduling, authoritativeCPURequests: authoritativeCPURequests, authoritativeMemoryRequests: authoritativeMemoryRequests, bqClient: bqClient, reporter: reporter}})
 	logger.Info("Serving admission webhooks.")
 	if err := server.Start(interrupts.Context()); err != nil {
 		logrus.WithError(err).Fatal("Failed to serve webhooks.")
 	}
 }
 
 type podMutator struct {
-	logger                *logrus.Entry
-	client                buildclientv1.BuildV1Interface
-	resources             *resourceServer
-	mutateResourceLimits  bool
-	decoder               admission.Decoder
-	cpuCap                int64
-	memoryCap             string
-	cpuPriorityScheduling int64
-	reporter              results.PodScalerReporter
+	logger                      *logrus.Entry
+	client                      buildclientv1.BuildV1Interface
+	resources                   *resourceServer
+	mutateResourceLimits        bool
+	decoder                     admission.Decoder
+	cpuCap                      int64
+	memoryCap                   string
+	cpuPriorityScheduling       int64
+	authoritativeCPURequests    bool
+	authoritativeMemoryRequests bool
+	bqClient                    *BigQueryClient
+	reporter                    results.PodScalerReporter
 }
 
 func (m *podMutator) Handle(ctx context.Context, req admission.Request) admission.Response {
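The new admit parameters imply matching command-line options, but the flag registration is not part of this excerpt. The following is only a hedged sketch of how that wiring might look: the enable-measured-pods, bigquery-project-id, and bigquery-dataset-id names come from the fatal error message in admit, while the credentials-file flag name is an assumption.

// Illustration only - hypothetical flag wiring for the new admit parameters.
package main

import (
	"flag"
	"fmt"
)

type options struct {
	enableMeasuredPods      bool
	bigQueryProjectID       string
	bigQueryDatasetID       string
	bigQueryCredentialsFile string
}

func bindOptions(fs *flag.FlagSet) *options {
	o := &options{}
	fs.BoolVar(&o.enableMeasuredPods, "enable-measured-pods", false, "Classify and size measured pods using BigQuery data.")
	fs.StringVar(&o.bigQueryProjectID, "bigquery-project-id", "", "GCP project that holds the measured-pod dataset (required with enable-measured-pods).")
	fs.StringVar(&o.bigQueryDatasetID, "bigquery-dataset-id", "", "BigQuery dataset with measured-pod resource data (required with enable-measured-pods).")
	fs.StringVar(&o.bigQueryCredentialsFile, "bigquery-credentials-file", "", "Path to GCP credentials file; assumed flag name, likely optional when ambient credentials exist.")
	return o
}

func main() {
	o := bindOptions(flag.CommandLine)
	flag.Parse()
	fmt.Printf("measured pods enabled: %v (project %q, dataset %q)\n", o.enableMeasuredPods, o.bigQueryProjectID, o.bigQueryDatasetID)
}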
@@ -97,7 +114,16 @@ func (m *podMutator) Handle(ctx context.Context, req admission.Request) admissio
 		logger.WithError(err).Error("Failed to handle rehearsal Pod.")
 		return admission.Allowed("Failed to handle rehearsal Pod, ignoring.")
 	}
-	mutatePodResources(pod, m.resources, m.mutateResourceLimits, m.cpuCap, m.memoryCap, m.reporter, logger)
+
+	// Classify pod as normal or measured (if enabled)
+	if m.bqClient != nil {
+		ClassifyPod(pod, m.bqClient, logger)
+		AddPodAntiAffinity(pod, logger)
+		// Apply measured pod resources before regular resource mutation
+		ApplyMeasuredPodResources(pod, m.bqClient, logger)
+	}
+
+	mutatePodResources(pod, m.resources, m.mutateResourceLimits, m.cpuCap, m.memoryCap, m.authoritativeCPURequests, m.authoritativeMemoryRequests, m.reporter, logger)
 	m.addPriorityClass(pod)
 
 	marshaledPod, err := json.Marshal(pod)
@@ -196,8 +222,14 @@ func mutatePodLabels(pod *corev1.Pod, build *buildv1.Build) {
 	}
 }
 
-// useOursIfLarger updates fields in theirs when ours are larger
-func useOursIfLarger(allOfOurs, allOfTheirs *corev1.ResourceRequirements, workloadName, workloadType string, reporter results.PodScalerReporter, logger *logrus.Entry) {
+// applyRecommendationsBasedOnRecentData applies resource recommendations based on recent usage data
+// (see resourceRecommendationWindow). If they used more, we increase resources. If they used less
+// and authoritative mode is enabled for that resource, we decrease them.
+//
+// Note: The reduction functionality (authoritative mode) is tested in admission_test.go as part
+// of TestUseOursIfLarger. The test cases there properly handle ResourceQuantity comparison
+// and verify the gradual reduction logic with safety limits.
+func applyRecommendationsBasedOnRecentData(allOfOurs, allOfTheirs *corev1.ResourceRequirements, workloadName, workloadType string, authoritativeCPU, authoritativeMemory bool, reporter results.PodScalerReporter, logger *logrus.Entry, pod *corev1.Pod) {
 	for _, item := range []*corev1.ResourceRequirements{allOfOurs, allOfTheirs} {
 		if item.Requests == nil {
 			item.Requests = corev1.ResourceList{}
@@ -215,12 +247,30 @@ func useOursIfLarger(allOfOurs, allOfTheirs *corev1.ResourceRequirements, worklo
 	} {
 		for _, field := range []corev1.ResourceName{corev1.ResourceCPU, corev1.ResourceMemory} {
 			our := (*pair.ours)[field]
-			//TODO(sgoeddel): this is a temporary experiment to see what effect setting values that are 120% of what has
-			// been determined has on the rate of OOMKilled and similar termination of workloads
-			increased := our.AsApproximateFloat64() * 1.2
-			our.Set(int64(increased))
-
+			// If we have no recommendation for this resource, skip it
+			if our.IsZero() {
+				continue
+			}
 			their := (*pair.theirs)[field]
+
+			// Check if this is a measured pod. Measured pods have resources set by ApplyMeasuredPodResources
+			// which already applies a 1.2x buffer, so we skip applying the buffer again to avoid double
+			// buffering (1.2 * 1.2 = 1.44x instead of 1.2x). We use the existing pod label to determine
+			// this, which is more reliable than inferring from value comparisons.
+			isMeasuredPod := pod != nil && pod.Labels != nil && pod.Labels[PodScalerLabelKey] == PodScalerLabelValueMeasured
+
+			if !isMeasuredPod {
+				// Apply a 1.2x safety buffer to resource recommendations to reduce the rate of OOMKilled
+				// and similar workload terminations. This buffer accounts for:
+				// - Natural variance in resource usage patterns
+				// - Transient spikes in CPU/memory consumption
+				// - Measurement inaccuracies in historical data
+				// The 20% overhead provides a safety margin while still allowing for efficient resource utilization.
+				increased := our.AsApproximateFloat64() * 1.2
+				our.Set(int64(increased))
+			} else {
+				logger.Debugf("Skipping 1.2x buffer for %s %s - pod is marked as measured and resources were set by measured pods logic", pair.resource, field)
+			}
 			fieldLogger := logger.WithFields(logrus.Fields{
 				"workloadName": workloadName,
 				"workloadType": workloadType,
@@ -231,13 +281,40 @@ func useOursIfLarger(allOfOurs, allOfTheirs *corev1.ResourceRequirements, worklo
 			})
 			cmp := our.Cmp(their)
 			if cmp == 1 {
-				fieldLogger.Debug("determined amount larger than configured")
+				fieldLogger.Debug("determined amount larger than configured, increasing resources")
 				(*pair.theirs)[field] = our
 				if their.Value() > 0 && our.Value() > (their.Value()*10) {
 					reporter.ReportResourceConfigurationWarning(workloadName, workloadType, their.String(), our.String(), field.String())
 				}
 			} else if cmp < 0 {
-				fieldLogger.Debug("determined amount smaller than configured")
+				authoritative := (field == corev1.ResourceCPU && authoritativeCPU) || (field == corev1.ResourceMemory && authoritativeMemory)
+				if authoritative {
+					// Apply gradual reduction with safety limits: max 25% reduction per cycle, minimum 5% difference
+					ourValue := our.AsApproximateFloat64()
+					theirValue := their.AsApproximateFloat64()
+					if theirValue > 0 {
+						reductionPercent := 1.0 - (ourValue / theirValue)
+						maxReductionPercent := 0.25
+
+						if reductionPercent >= 0.05 {
+							if reductionPercent > maxReductionPercent {
+								maxAllowed := theirValue * (1.0 - maxReductionPercent)
+								our.Set(int64(maxAllowed))
+								fieldLogger.Debugf("applying gradual reduction (limited to 25%% per cycle)")
+							} else {
+								fieldLogger.Debug("reducing resources based on recent usage")
+							}
+							(*pair.theirs)[field] = our
+						} else {
+							fieldLogger.Debug("difference less than 5%, skipping micro-adjustment")
+						}
+					} else {
+						fieldLogger.Debug("theirs is zero, applying recommendation")
+						(*pair.theirs)[field] = our
+					}
+				} else {
+					fieldLogger.Debug("authoritative mode disabled, keeping existing value")
+				}
 			} else {
 				fieldLogger.Debug("determined amount equal to configured")
 			}
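To make the reduction limits concrete, here is a small standalone example (hypothetical values, not part of the commit) of the clamping rules encoded above: differences under 5% are ignored, larger differences are applied, and no single cycle cuts more than 25%.

package main

import "fmt"

// clampReduction mirrors the gradual-reduction rules above:
// ignore differences under 5%, cap any single reduction at 25%.
func clampReduction(recommended, configured float64) float64 {
	if configured <= 0 {
		return recommended
	}
	reduction := 1.0 - (recommended / configured)
	switch {
	case reduction < 0.05:
		return configured // micro-adjustment, keep the existing value
	case reduction > 0.25:
		return configured * 0.75 // at most a 25% cut per cycle
	default:
		return recommended
	}
}

func main() {
	fmt.Println(clampReduction(2.0, 4.0))  // 50% below -> clamped to 3 (25% cut)
	fmt.Println(clampReduction(3.88, 4.0)) // 3% below -> 4 (unchanged)
	fmt.Println(clampReduction(3.6, 4.0))  // 10% below -> 3.6 (applied as-is)
}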
@@ -292,16 +369,44 @@ func preventUnschedulable(resources *corev1.ResourceRequirements, cpuCap int64,
 	}
 }
 
-func mutatePodResources(pod *corev1.Pod, server *resourceServer, mutateResourceLimits bool, cpuCap int64, memoryCap string, reporter results.PodScalerReporter, logger *logrus.Entry) {
+func mutatePodResources(pod *corev1.Pod, server *resourceServer, mutateResourceLimits bool, cpuCap int64, memoryCap string, authoritativeCPU, authoritativeMemory bool, reporter results.PodScalerReporter, logger *logrus.Entry) {
+	// Check if this is a measured pod - measured pods have resources set by ApplyMeasuredPodResources
+	// and we should preserve those instead of overwriting with Prometheus recommendations
+	isMeasuredPod := pod.Labels != nil && pod.Labels[PodScalerLabelKey] == PodScalerLabelValueMeasured
+
 	mutateResources := func(containers []corev1.Container) {
 		for i := range containers {
+			// For measured pods, skip Prometheus-based recommendations if resources were already set
+			// by ApplyMeasuredPodResources (which uses BigQuery measured data)
+			if isMeasuredPod {
+				hasCPURequest := false
+				hasMemoryRequest := false
+				if containers[i].Resources.Requests != nil {
+					if cpuReq, ok := containers[i].Resources.Requests[corev1.ResourceCPU]; ok && cpuReq.Sign() > 0 {
+						hasCPURequest = true
+					}
+					if memReq, ok := containers[i].Resources.Requests[corev1.ResourceMemory]; ok && memReq.Sign() > 0 {
+						hasMemoryRequest = true
+					}
+				}
+				if hasCPURequest || hasMemoryRequest {
+					logger.Debugf("Skipping Prometheus recommendations for measured pod container %s - resources already set from BigQuery data", containers[i].Name)
+					// Still apply caps and limits even for measured pods
+					preventUnschedulable(&containers[i].Resources, cpuCap, memoryCap, logger)
+					if mutateResourceLimits {
+						reconcileLimits(&containers[i].Resources)
+					}
+					continue
+				}
+			}
+
 			meta := podscaler.MetadataFor(pod.ObjectMeta.Labels, pod.ObjectMeta.Name, containers[i].Name)
 			resources, recommendationExists := server.recommendedRequestFor(meta)
 			if recommendationExists {
 				logger.Debugf("recommendation exists for: %s", containers[i].Name)
 				workloadType := determineWorkloadType(pod.Annotations, pod.Labels)
 				workloadName := determineWorkloadName(pod.Name, containers[i].Name, workloadType, pod.Labels)
-				useOursIfLarger(&resources, &containers[i].Resources, workloadName, workloadType, reporter, logger)
+				applyRecommendationsBasedOnRecentData(&resources, &containers[i].Resources, workloadName, workloadType, authoritativeCPU, authoritativeMemory, reporter, logger, pod)
 				if mutateResourceLimits {
 					reconcileLimits(&containers[i].Resources)
 				}
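A rough usage sketch of the measured-pod short-circuit above: a pod carrying the measured label whose container already has a positive, BigQuery-derived request keeps it rather than receiving a Prometheus recommendation. The label key and value strings below are placeholders; the real constants (PodScalerLabelKey, PodScalerLabelValueMeasured) are defined elsewhere in the package and are not shown in this diff.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Placeholder label key/value standing in for PodScalerLabelKey and
// PodScalerLabelValueMeasured (assumed names/values for illustration).
const (
	labelKey           = "pod-scaler.openshift.io/classification" // assumption
	labelValueMeasured = "measured"                               // assumption
)

func main() {
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "e2e-test-pod",
			Labels: map[string]string{labelKey: labelValueMeasured},
		},
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{{
				Name: "test",
				Resources: corev1.ResourceRequirements{
					Requests: corev1.ResourceList{
						corev1.ResourceMemory: resource.MustParse("2Gi"), // set from BigQuery data
					},
				},
			}},
		},
	}

	// Mirrors the short-circuit in mutatePodResources: a measured pod whose
	// container already has a positive request skips Prometheus recommendations.
	isMeasured := pod.Labels[labelKey] == labelValueMeasured
	req, ok := pod.Spec.Containers[0].Resources.Requests[corev1.ResourceMemory]
	fmt.Println(isMeasured && ok && req.Sign() > 0) // true -> keep BigQuery-derived requests
}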
