From 6003ae2d6c9fd53f273187ec320b68e31ff73569 Mon Sep 17 00:00:00 2001 From: mblos Date: Wed, 25 Mar 2026 13:11:03 +0100 Subject: [PATCH] Committed resources syncer with more alerts --- .../cortex-nova/alerts/nova.alerts.yaml | 112 +++++++++++++++++ .../commitments/api_change_commitments.go | 8 +- .../commitments/reservation_manager.go | 62 +++++---- .../commitments/reservation_manager_test.go | 48 +++---- .../reservations/commitments/syncer.go | 53 +++++++- .../commitments/syncer_monitor.go | 119 +++++++++++++++--- 6 files changed, 335 insertions(+), 67 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index a69f3be51..784830aac 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -483,3 +483,115 @@ groups: The committed resource capacity API (Limes LIQUID integration) is experiencing high latency (p95 > 5s). This may indicate slow database queries or knowledge CRD retrieval. Limes scrapes may time out, affecting capacity reporting. + + # Committed Resource Syncer Alerts + - alert: CortexNovaCommittedResourceSyncerNotRunning + expr: increase(cortex_committed_resource_syncer_runs_total{service="cortex-nova-metrics"}[2h]) == 0 + for: 5m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer not running" + description: > + The committed resource syncer has not run in the last 2 hours. This indicates + that the syncer may have stopped or is encountering errors. Check the syncer logs for errors. 
+ + - alert: CortexNovaCommittedResourceSyncerErrorsHigh + expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3 + for: 5m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer experiencing errors" + description: > + The committed resource syncer has encountered multiple errors in the last hour. + This may indicate connectivity issues with Limes. Check the syncer logs for error details. + + - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh + expr: | + rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0.05 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer unit mismatch rate >5%" + description: > + More than 5% of commitments are being skipped due to unit mismatches between + Limes and Cortex flavor groups. This happens when Limes has not yet been + updated to use the new unit format after a flavor group change. The affected + commitments will keep their existing reservations until Limes notices the update. + Check the logs if this error persists for a longer time.
+ + - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh + expr: | + rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer unknown flavor group rate >0%" + description: > + Some commitments reference flavor groups that don't exist in + Cortex Knowledge (anymore). This may indicate that flavor group configuration is + out of sync between Limes and Cortex, or that Knowledge extraction is failing. + Check the flavor group Knowledge CRD and history to see what was changed. + + - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh + expr: | + ( + rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) + ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0.01 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer local change rate >1%" + description: > + More than 1% of synced commitments are requiring reservation changes + (creates, deletes, or repairs). This is higher than expected for steady-state + operation and may indicate data inconsistencies, external modifications to + reservations, or issues with the CRDs. Check Cortex logs for details. 
+ + - alert: CortexNovaCommittedResourceSyncerRepairRateHigh + expr: | + rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer repair rate >0%" + description: > + Some commitments have reservations that needed repair + (wrong metadata like project ID or flavor group). This may indicate data + corruption, bugs in reservation creation, or external modifications. + Reservations are automatically repaired, but the root cause should be + investigated if this alert persists. diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go index 953319dbe..3281fc6ec 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -253,15 +253,15 @@ ProcessLoop: logger.V(1).Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldMemory", stateBefore.TotalMemoryBytes, "desiredMemory", stateDesired.TotalMemoryBytes) - touchedReservations, deletedReservations, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi") + applyResult, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi") if err != nil { failedCommitments[string(commitment.UUID)] = "failed to apply commitment state" logger.Info("failed to apply commitment state for commitment", "commitmentUUID", commitment.UUID, "error", err) requireRollback = true break ProcessLoop } - logger.V(1).Info("applied commitment state change", "commitmentUUID", 
commitment.UUID, "touchedReservations", len(touchedReservations), "deletedReservations", len(deletedReservations)) - reservationsToWatch = append(reservationsToWatch, touchedReservations...) + logger.V(1).Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(applyResult.TouchedReservations), "deletedReservations", len(applyResult.RemovedReservations)) + reservationsToWatch = append(reservationsToWatch, applyResult.TouchedReservations...) } } } @@ -305,7 +305,7 @@ ProcessLoop: for commitmentUUID, state := range statesBefore { // Rollback to statesBefore for this commitment logger.Info("applying rollback for commitment", "commitmentUUID", commitmentUUID, "stateBefore", state) - _, _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, "changeCommitmentsApiRollback") + _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, "changeCommitmentsApiRollback") if err != nil { logger.Info("failed to apply rollback state for commitment", "commitmentUUID", commitmentUUID, "error", err) // continue with best effort rollback for other projects diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go index 40cfa08ef..6db72d21a 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager.go +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -17,6 +17,20 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +// ApplyResult contains the result of applying a commitment state. 
+type ApplyResult struct { + // Created is the number of reservations created + Created int + // Deleted is the number of reservations deleted + Deleted int + // Repaired is the number of reservations repaired (metadata sync or recreated due to wrong config) + Repaired int + // TouchedReservations are reservations that were created or updated + TouchedReservations []v1alpha1.Reservation + // RemovedReservations are reservations that were deleted + RemovedReservations []v1alpha1.Reservation +} + // ReservationManager handles CRUD operations for Reservation CRDs. type ReservationManager struct { client.Client @@ -42,14 +56,16 @@ func NewReservationManager(k8sClient client.Client) *ReservationManager { // - Deleting unused/excess slots when capacity decreases // - Syncing reservation metadata for all remaining slots // -// Returns touched reservations (created/updated) and removed reservations for caller tracking. +// Returns ApplyResult containing touched/removed reservations and counts for metrics. 
func (m *ReservationManager) ApplyCommitmentState( ctx context.Context, log logr.Logger, desiredState *CommitmentState, flavorGroups map[string]compute.FlavorGroupFeature, creator string, -) (touchedReservations, removedReservations []v1alpha1.Reservation, err error) { +) (*ApplyResult, error) { + + result := &ApplyResult{} log = log.WithName("ReservationManager") @@ -58,7 +74,7 @@ func (m *ReservationManager) ApplyCommitmentState( if err := m.List(ctx, &allReservations, client.MatchingLabels{ v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, }); err != nil { - return nil, nil, fmt.Errorf("failed to list reservations: %w", err) + return nil, fmt.Errorf("failed to list reservations: %w", err) } // Filter by name prefix to find reservations for this commitment @@ -74,7 +90,7 @@ func (m *ReservationManager) ApplyCommitmentState( flavorGroup, exists := flavorGroups[desiredState.FlavorGroupName] if !exists { - return nil, nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName) + return nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName) } deltaMemoryBytes := desiredState.TotalMemoryBytes for _, res := range existing { @@ -90,7 +106,6 @@ func (m *ReservationManager) ApplyCommitmentState( // Phase 3 (DELETE): Delete inconsistent reservations (wrong flavor group/project) // They will be recreated with correct metadata in subsequent phases. 
var validReservations []v1alpha1.Reservation - var repairedCount int for _, res := range existing { if res.Spec.CommittedResourceReservation.ResourceGroup != desiredState.FlavorGroupName || res.Spec.CommittedResourceReservation.ProjectID != desiredState.ProjectID { @@ -101,13 +116,13 @@ func (m *ReservationManager) ApplyCommitmentState( "actualFlavorGroup", res.Spec.CommittedResourceReservation.ResourceGroup, "expectedProjectID", desiredState.ProjectID, "actualProjectID", res.Spec.CommittedResourceReservation.ProjectID) - repairedCount++ - removedReservations = append(removedReservations, res) + result.Repaired++ + result.RemovedReservations = append(result.RemovedReservations, res) memValue := res.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes += memValue.Value() if err := m.Delete(ctx, &res); err != nil { - return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", res.Name, err) + return result, fmt.Errorf("failed to delete reservation %s: %w", res.Name, err) } } else { validReservations = append(validReservations, res) @@ -139,33 +154,33 @@ func (m *ReservationManager) ApplyCommitmentState( reservationToDelete = &existing[len(existing)-1] existing = existing[:len(existing)-1] // remove from existing list } - removedReservations = append(removedReservations, *reservationToDelete) + result.RemovedReservations = append(result.RemovedReservations, *reservationToDelete) + result.Deleted++ memValue := reservationToDelete.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes += memValue.Value() if err := m.Delete(ctx, reservationToDelete); err != nil { - return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err) + return result, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err) } } // Phase 5 (CREATE): Create new reservations (capacity increased) - var createdCount int for deltaMemoryBytes > 0 { // Need to create new 
reservation slots, always prefer largest flavor within the group // TODO more sophisticated flavor selection, especially with flavors of different cpu/memory ratio reservation := m.newReservation(desiredState, nextSlotIndex, deltaMemoryBytes, flavorGroup, creator) - touchedReservations = append(touchedReservations, *reservation) + result.TouchedReservations = append(result.TouchedReservations, *reservation) memValue := reservation.Spec.Resources[hv1.ResourceMemory] deltaMemoryBytes -= memValue.Value() - createdCount++ + result.Created++ if err := m.Create(ctx, reservation); err != nil { if apierrors.IsAlreadyExists(err) { - return touchedReservations, removedReservations, fmt.Errorf( + return result, fmt.Errorf( "reservation %s already exists (collision detected): %w", reservation.Name, err) } - return touchedReservations, removedReservations, fmt.Errorf( + return result, fmt.Errorf( "failed to create reservation slot %d: %w", nextSlotIndex, err) } @@ -177,24 +192,25 @@ func (m *ReservationManager) ApplyCommitmentState( for i := range existing { updated, err := m.syncReservationMetadata(ctx, log, &existing[i], desiredState) if err != nil { - return touchedReservations, removedReservations, err + return result, err } if updated != nil { - touchedReservations = append(touchedReservations, *updated) + result.TouchedReservations = append(result.TouchedReservations, *updated) + result.Repaired++ } } // Only log if there were actual changes - if hasChanges || createdCount > 0 || len(removedReservations) > 0 || repairedCount > 0 { + if hasChanges || result.Created > 0 || len(result.RemovedReservations) > 0 || result.Repaired > 0 { log.Info("commitment state sync completed", "commitmentUUID", desiredState.CommitmentUUID, - "created", createdCount, - "deleted", len(removedReservations), - "repaired", repairedCount, - "total", len(existing)+createdCount) + "created", result.Created, + "deleted", result.Deleted, + "repaired", result.Repaired, + "total", 
len(existing)+result.Created) } - return touchedReservations, removedReservations, nil + return result, nil } // syncReservationMetadata updates reservation metadata if it differs from desired state. diff --git a/internal/scheduling/reservations/commitments/reservation_manager_test.go b/internal/scheduling/reservations/commitments/reservation_manager_test.go index 2f20f33cc..7733cb6c2 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager_test.go +++ b/internal/scheduling/reservations/commitments/reservation_manager_test.go @@ -42,7 +42,7 @@ func TestApplyCommitmentState_CreatesNewReservations(t *testing.T) { TotalMemoryBytes: 3 * 8192 * 1024 * 1024, } - touched, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -54,18 +54,18 @@ func TestApplyCommitmentState_CreatesNewReservations(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - if len(removed) != 0 { - t.Errorf("expected 0 removed reservations, got %d", len(removed)) + if len(applyResult.RemovedReservations) != 0 { + t.Errorf("expected 0 applyResult.RemovedReservations reservations, got %d", len(applyResult.RemovedReservations)) } // Should create reservations to fulfill the commitment - if len(touched) == 0 { + if len(applyResult.TouchedReservations) == 0 { t.Fatal("expected at least one reservation to be created") } // Verify created reservations sum to desired state totalMemory := int64(0) - for _, res := range touched { + for _, res := range applyResult.TouchedReservations { memQuantity := res.Spec.Resources[hv1.ResourceMemory] totalMemory += memQuantity.Value() } @@ -142,7 +142,7 @@ func TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) { TotalMemoryBytes: 8 * 1024 * 1024 * 1024, } - _, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -158,7 +158,7 @@ func 
TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) { // This is expected behavior based on the slot sizing algorithm // Should remove excess reservations - if len(removed) == 0 { + if len(applyResult.RemovedReservations) == 0 { t.Fatal("expected reservations to be removed") } @@ -481,7 +481,7 @@ func TestApplyCommitmentState_DeletionPriority(t *testing.T) { TotalMemoryBytes: tt.desiredMemoryBytes, } - _, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -493,12 +493,12 @@ func TestApplyCommitmentState_DeletionPriority(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - if len(removed) != tt.expectedRemovedCount { - t.Fatalf("expected %d removed reservations, got %d", tt.expectedRemovedCount, len(removed)) + if len(applyResult.RemovedReservations) != tt.expectedRemovedCount { + t.Fatalf("expected %d removed reservations, got %d", tt.expectedRemovedCount, len(applyResult.RemovedReservations)) } if tt.validateRemoved != nil { - tt.validateRemoved(t, removed) + tt.validateRemoved(t, applyResult.RemovedReservations) } // Get remaining reservations @@ -560,7 +560,7 @@ func TestApplyCommitmentState_HandlesZeroCapacity(t *testing.T) { TotalMemoryBytes: 0, } - touched, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -572,13 +572,13 @@ func TestApplyCommitmentState_HandlesZeroCapacity(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - if len(touched) != 0 { - t.Errorf("expected 0 new reservations, got %d", len(touched)) + if len(applyResult.TouchedReservations) != 0 { + t.Errorf("expected 0 new reservations, got %d", len(applyResult.TouchedReservations)) } // Should remove all reservations - if len(removed) != 1 { - t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + if len(applyResult.RemovedReservations) != 1 { + 
t.Fatalf("expected 1 removed reservation, got %d", len(applyResult.RemovedReservations)) } // Verify no reservations remain @@ -638,7 +638,7 @@ func TestApplyCommitmentState_FixesWrongFlavorGroup(t *testing.T) { TotalMemoryBytes: 8 * 1024 * 1024 * 1024, } - touched, removed, err := manager.ApplyCommitmentState( + applyResult, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, @@ -651,18 +651,18 @@ func TestApplyCommitmentState_FixesWrongFlavorGroup(t *testing.T) { } // Should remove wrong reservation and create new one - if len(removed) != 1 { - t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + if len(applyResult.RemovedReservations) != 1 { + t.Fatalf("expected 1 removed reservation, got %d", len(applyResult.RemovedReservations)) } - if len(touched) != 1 { - t.Fatalf("expected 1 new reservation, got %d", len(touched)) + if len(applyResult.TouchedReservations) != 1 { + t.Fatalf("expected 1 new reservation, got %d", len(applyResult.TouchedReservations)) } // Verify new reservation has correct flavor group - if touched[0].Spec.CommittedResourceReservation.ResourceGroup != "test-group" { + if applyResult.TouchedReservations[0].Spec.CommittedResourceReservation.ResourceGroup != "test-group" { t.Errorf("expected flavor group test-group, got %s", - touched[0].Spec.CommittedResourceReservation.ResourceGroup) + applyResult.TouchedReservations[0].Spec.CommittedResourceReservation.ResourceGroup) } } @@ -686,7 +686,7 @@ func TestApplyCommitmentState_UnknownFlavorGroup(t *testing.T) { TotalMemoryBytes: 8 * 1024 * 1024 * 1024, } - _, _, err := manager.ApplyCommitmentState( + _, err := manager.ApplyCommitmentState( context.Background(), logr.Discard(), desiredState, diff --git a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go index d6588a3a8..cb3d2fdb4 100644 --- a/internal/scheduling/reservations/commitments/syncer.go +++ 
b/internal/scheduling/reservations/commitments/syncer.go @@ -85,8 +85,16 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo skippedUUIDs: make(map[string]bool), } for id, commitment := range commitments { + // Record each commitment seen from Limes + if s.monitor != nil { + s.monitor.RecordCommitmentSeen() + } + if commitment.ServiceType != "compute" { log.Info("skipping non-compute commitment", "id", id, "serviceType", commitment.ServiceType) + if s.monitor != nil { + s.monitor.RecordCommitmentSkipped(SkipReasonNonCompute) + } continue } @@ -97,6 +105,9 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo "id", id, "resourceName", commitment.ResourceName, "error", err) + if s.monitor != nil { + s.monitor.RecordCommitmentSkipped(SkipReasonInvalidResource) + } continue } @@ -106,6 +117,9 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo log.Info("skipping commitment with unknown flavor group", "id", id, "flavorGroup", flavorGroupName) + if s.monitor != nil { + s.monitor.RecordCommitmentSkipped(SkipReasonUnknownFlavorGroup) + } continue } @@ -136,6 +150,9 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo if commitment.UUID == "" { log.Info("skipping commitment with empty UUID", "id", id) + if s.monitor != nil { + s.monitor.RecordCommitmentSkipped(SkipReasonEmptyUUID) + } continue } @@ -155,6 +172,11 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo "totalMemoryBytes", state.TotalMemoryBytes) result.states = append(result.states, state) + + // Record successfully processed commitment + if s.monitor != nil { + s.monitor.RecordCommitmentProcessed() + } } return result, nil @@ -171,6 +193,11 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { logger.Info("starting commitment sync with sync interval", "interval", DefaultSyncerConfig().SyncInterval) + // Record sync run + if s.monitor != nil { + 
s.monitor.RecordSyncRun() + } + // Check if flavor group knowledge is ready knowledge := &reservations.FlavorGroupKnowledgeClient{Client: s.Client} knowledgeCRD, err := knowledge.Get(ctx) @@ -201,6 +228,7 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { manager := NewReservationManager(s.Client) // Apply each commitment state using the manager + var totalCreated, totalDeleted, totalRepaired int for _, state := range commitmentResult.states { logger.Info("applying commitment state", "commitmentUUID", state.CommitmentUUID, @@ -208,13 +236,17 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { "flavorGroup", state.FlavorGroupName, "totalMemoryBytes", state.TotalMemoryBytes) - _, _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, CreatorValue) + applyResult, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, CreatorValue) if err != nil { logger.Error(err, "failed to apply commitment state", "commitmentUUID", state.CommitmentUUID) // Continue with other commitments even if one fails continue } + + totalCreated += applyResult.Created + totalDeleted += applyResult.Deleted + totalRepaired += applyResult.Repaired } // Delete reservations that are no longer in commitments @@ -255,11 +287,28 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { logger.Info("deleted reservation for expired commitment", "name", existing.Name, "commitmentUUID", commitmentUUID) + totalDeleted++ + } + } + + // Record reservation change metrics + if s.monitor != nil { + if totalCreated > 0 { + s.monitor.RecordReservationsCreated(totalCreated) + } + if totalDeleted > 0 { + s.monitor.RecordReservationsDeleted(totalDeleted) + } + if totalRepaired > 0 { + s.monitor.RecordReservationsRepaired(totalRepaired) } } logger.Info("synced reservations", "processedCount", len(commitmentResult.states), - "skippedCount", len(commitmentResult.skippedUUIDs)) + "skippedCount", len(commitmentResult.skippedUUIDs), + "created", 
totalCreated, + "deleted", totalDeleted, + "repaired", totalRepaired) return nil } diff --git a/internal/scheduling/reservations/commitments/syncer_monitor.go b/internal/scheduling/reservations/commitments/syncer_monitor.go index 0bbc7fe7f..efa25b0a4 100644 --- a/internal/scheduling/reservations/commitments/syncer_monitor.go +++ b/internal/scheduling/reservations/commitments/syncer_monitor.go @@ -7,11 +7,30 @@ import ( "github.com/prometheus/client_golang/prometheus" ) +// Skip reason labels for commitment processing +const ( + SkipReasonUnitMismatch = "unit_mismatch" + SkipReasonUnknownFlavorGroup = "unknown_flavor_group" + SkipReasonInvalidResource = "invalid_resource_name" + SkipReasonEmptyUUID = "empty_uuid" + SkipReasonNonCompute = "non_compute" +) + // SyncerMonitor provides metrics for the commitment syncer. type SyncerMonitor struct { - syncRuns prometheus.Counter - syncErrors prometheus.Counter - unitMismatch *prometheus.CounterVec + // Sync lifecycle + syncRuns prometheus.Counter + syncErrors prometheus.Counter + + // Commitment processing + commitmentsTotal prometheus.Counter // all commitments seen from Limes + commitmentsProcessed prometheus.Counter // successfully processed + commitmentsSkipped *prometheus.CounterVec // skipped with reason label + + // Reservation changes + reservationsCreated prometheus.Counter + reservationsDeleted prometheus.Counter + reservationsRepaired prometheus.Counter } // NewSyncerMonitor creates a new monitor with Prometheus metrics. 
@@ -25,17 +44,44 @@ func NewSyncerMonitor() *SyncerMonitor { Name: "cortex_committed_resource_syncer_errors_total", Help: "Total number of commitment syncer errors", }), - unitMismatch: prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_committed_resource_syncer_unit_mismatch_total", - Help: "Total number of commitments with unit mismatch between Limes and Cortex flavor group knowledge", - }, []string{"flavor_group"}), + commitmentsTotal: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_commitments_total", + Help: "Total number of commitments seen from Limes", + }), + commitmentsProcessed: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_commitments_processed_total", + Help: "Total number of commitments successfully processed", + }), + commitmentsSkipped: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_commitments_skipped_total", + Help: "Total number of commitments skipped during sync", + }, []string{"reason"}), + reservationsCreated: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_reservations_created_total", + Help: "Total number of reservations created during sync", + }), + reservationsDeleted: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_reservations_deleted_total", + Help: "Total number of reservations deleted during sync", + }), + reservationsRepaired: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_reservations_repaired_total", + Help: "Total number of reservations repaired during sync (wrong metadata)", + }), } - return m -} -// RecordUnitMismatch records a unit mismatch for a flavor group. 
-func (m *SyncerMonitor) RecordUnitMismatch(flavorGroup string) { - m.unitMismatch.WithLabelValues(flavorGroup).Inc() + // Pre-initialize skip reason labels + for _, reason := range []string{ + SkipReasonUnitMismatch, + SkipReasonUnknownFlavorGroup, + SkipReasonInvalidResource, + SkipReasonEmptyUUID, + SkipReasonNonCompute, + } { + m.commitmentsSkipped.WithLabelValues(reason) + } + + return m } // RecordSyncRun records a syncer run. @@ -48,16 +94,61 @@ func (m *SyncerMonitor) RecordSyncError() { m.syncErrors.Inc() } +// RecordCommitmentSeen records a commitment seen from Limes. +func (m *SyncerMonitor) RecordCommitmentSeen() { + m.commitmentsTotal.Inc() +} + +// RecordCommitmentProcessed records a commitment successfully processed. +func (m *SyncerMonitor) RecordCommitmentProcessed() { + m.commitmentsProcessed.Inc() +} + +// RecordCommitmentSkipped records a commitment skipped with a reason. +func (m *SyncerMonitor) RecordCommitmentSkipped(reason string) { + m.commitmentsSkipped.WithLabelValues(reason).Inc() +} + +// RecordUnitMismatch records a unit mismatch skip (convenience method). +func (m *SyncerMonitor) RecordUnitMismatch(_ string) { + m.commitmentsSkipped.WithLabelValues(SkipReasonUnitMismatch).Inc() +} + +// RecordReservationsCreated records reservations created. +func (m *SyncerMonitor) RecordReservationsCreated(count int) { + m.reservationsCreated.Add(float64(count)) +} + +// RecordReservationsDeleted records reservations deleted. +func (m *SyncerMonitor) RecordReservationsDeleted(count int) { + m.reservationsDeleted.Add(float64(count)) +} + +// RecordReservationsRepaired records reservations repaired. +func (m *SyncerMonitor) RecordReservationsRepaired(count int) { + m.reservationsRepaired.Add(float64(count)) +} + // Describe implements prometheus.Collector. 
func (m *SyncerMonitor) Describe(ch chan<- *prometheus.Desc) { m.syncRuns.Describe(ch) m.syncErrors.Describe(ch) - m.unitMismatch.Describe(ch) + m.commitmentsTotal.Describe(ch) + m.commitmentsProcessed.Describe(ch) + m.commitmentsSkipped.Describe(ch) + m.reservationsCreated.Describe(ch) + m.reservationsDeleted.Describe(ch) + m.reservationsRepaired.Describe(ch) } // Collect implements prometheus.Collector. func (m *SyncerMonitor) Collect(ch chan<- prometheus.Metric) { m.syncRuns.Collect(ch) m.syncErrors.Collect(ch) - m.unitMismatch.Collect(ch) + m.commitmentsTotal.Collect(ch) + m.commitmentsProcessed.Collect(ch) + m.commitmentsSkipped.Collect(ch) + m.reservationsCreated.Collect(ch) + m.reservationsDeleted.Collect(ch) + m.reservationsRepaired.Collect(ch) }