Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Tiltfile
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,13 @@ if 'nova' in ACTIVE_DEPLOYMENTS:
trigger_mode=TRIGGER_MODE_MANUAL,
auto_init=False,
)
# Manually triggered Tilt resource (not started automatically on `tilt up`):
# runs the `e2e-commitments` subcommand of /manager inside the
# cortex-nova-scheduling-controller-manager deployment via `kubectl exec`.
local_resource(
'Commitments E2E Tests',
'/bin/sh -c "kubectl exec deploy/cortex-nova-scheduling-controller-manager -- /manager e2e-commitments"',
labels=['Cortex-Nova'],
trigger_mode=TRIGGER_MODE_MANUAL,
auto_init=False,
)

if 'manila' in ACTIVE_DEPLOYMENTS:
print("Activating Cortex Manila bundle")
Expand Down
8 changes: 7 additions & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,10 @@ func main() {
manilaChecksConfig := conf.GetConfigOrDie[manila.ChecksConfig]()
manila.RunChecks(ctx, client, manilaChecksConfig)
return
case "e2e-commitments":
commitmentsChecksConfig := conf.GetConfigOrDie[commitments.E2EChecksConfig]()
commitments.RunCommitmentsE2EChecks(ctx, commitmentsChecksConfig)
return
}
}

Expand Down Expand Up @@ -665,7 +669,9 @@ func main() {

if slices.Contains(mainConfig.EnabledTasks, "commitments-sync-task") {
setupLog.Info("starting commitments syncer")
syncer := commitments.NewSyncer(multiclusterClient)
syncerMonitor := commitments.NewSyncerMonitor()
must.Succeed(metrics.Registry.Register(syncerMonitor))
syncer := commitments.NewSyncer(multiclusterClient, syncerMonitor)
syncerConfig := conf.GetConfigOrDie[commitments.SyncerConfig]()
syncerDefaults := commitments.DefaultSyncerConfig()
if syncerConfig.SyncInterval == 0 {
Expand Down
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ require (
github.com/poy/onpar v0.3.5 // indirect
github.com/prometheus/common v0.67.5 // indirect
github.com/prometheus/procfs v0.17.0 // indirect
github.com/sapcc/go-api-declarations v1.20.2
github.com/sapcc/go-api-declarations v1.21.0
github.com/sirupsen/logrus v1.9.3 // indirect
github.com/spf13/cobra v1.10.1 // indirect
github.com/spf13/pflag v1.0.10 // indirect
Expand All @@ -98,7 +98,7 @@ require (
golang.org/x/oauth2 v0.34.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/sys v0.42.0 // indirect
golang.org/x/term v0.41.0 // indirect
golang.org/x/term v0.41.0
golang.org/x/text v0.33.0 // indirect
golang.org/x/time v0.14.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect
Expand All @@ -108,7 +108,7 @@ require (
google.golang.org/protobuf v1.36.11 // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
gopkg.in/yaml.v3 v3.0.1
gotest.tools v2.2.0+incompatible // indirect
k8s.io/apiextensions-apiserver v0.35.0 // indirect
k8s.io/apiserver v0.35.0 // indirect
Expand Down
8 changes: 2 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUO
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/sapcc/go-api-declarations v1.20.2 h1:GWqv8VgsF4k9id6N051AVTaEpcjT02APsOuz2yCvTPQ=
github.com/sapcc/go-api-declarations v1.20.2/go.mod h1:eiRrXXUeQS5C/1kKn8/KMjk0Y0goUzgDQswj30rH0Zc=
github.com/sapcc/go-api-declarations v1.21.0 h1:Ag6GXgJLTFdBDKmrJU4QFllQbgGSenSGeHpLuvuxeDk=
github.com/sapcc/go-api-declarations v1.21.0/go.mod h1:eiRrXXUeQS5C/1kKn8/KMjk0Y0goUzgDQswj30rH0Zc=
github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e h1:4wgkrfAlnL6ffM7HTNoHn1HrBBurCRR71WNOszdiDNQ=
github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e/go.mod h1:NZjMiGVm04U25vwR6ZWvMw0XOOnvS1jkmXpjiepOeUw=
github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw=
Expand Down Expand Up @@ -251,12 +251,8 @@ golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY=
golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww=
golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU=
golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A=
golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
Expand Down
20 changes: 19 additions & 1 deletion helm/bundles/cortex-nova/alerts/nova.alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,25 @@ groups:
configuration. It is recommended to investigate the
pipeline status and logs for more details.

# Committed Resource (Limes Integration) Alerts
# Committed Resource Info API Alerts
# Fires when the per-second rate of info API responses with a 5xx status code,
# averaged over 5 minutes, stays above 0.1 for 5 minutes.
# NOTE(review): status_code=~"5.." also matches HTTP 503, which the info
# handler returns for the expected "service info not available yet" case, so
# this alert is not limited to HTTP 500 responses - confirm this is intended.
- alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh
expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
for: 5m
labels:
context: committed-resource-api
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Committed Resource info API HTTP 500 errors too high"
description: >
The committed resource info API (Limes LIQUID integration) is responding
with HTTP 5xx errors. This indicates internal problems building service info,
such as invalid flavor group data. Limes will not be able to discover available
resources until the issue is resolved.

# Committed Resource Change API Alerts
- alert: CortexNovaCommittedResourceHttpRequest400sTooHigh
expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
for: 5m
Expand Down
3 changes: 3 additions & 0 deletions internal/scheduling/reservations/commitments/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ type HTTPAPI struct {
monitor ChangeCommitmentsAPIMonitor
usageMonitor ReportUsageAPIMonitor
capacityMonitor ReportCapacityAPIMonitor
infoMonitor InfoAPIMonitor
// Mutex to serialize change-commitments requests
changeMutex sync.Mutex
}
Expand All @@ -44,13 +45,15 @@ func NewAPIWithConfig(client client.Client, config Config, novaClient UsageNovaC
monitor: NewChangeCommitmentsAPIMonitor(),
usageMonitor: NewReportUsageAPIMonitor(),
capacityMonitor: NewReportCapacityAPIMonitor(),
infoMonitor: NewInfoAPIMonitor(),
}
}

func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer, log logr.Logger) {
registry.MustRegister(&api.monitor)
registry.MustRegister(&api.usageMonitor)
registry.MustRegister(&api.capacityMonitor)
registry.MustRegister(&api.infoMonitor)
mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments)
mux.HandleFunc("/v1/commitments/report-capacity", api.HandleReportCapacity)
mux.HandleFunc("/v1/commitments/info", api.HandleInfo)
Expand Down
55 changes: 47 additions & 8 deletions internal/scheduling/reservations/commitments/api_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,29 @@ package commitments
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"strconv"
"strings"
"time"

"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
"github.com/go-logr/logr"
"github.com/google/uuid"
liquid "github.com/sapcc/go-api-declarations/liquid"
)

// errInternalServiceInfo indicates an internal error while building service info (e.g., invalid unit configuration)
var errInternalServiceInfo = errors.New("internal error building service info")

// HandleInfo handles GET /v1/info requests from Limes:
// See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go
// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid
func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) {
startTime := time.Now()
statusCode := http.StatusOK

// Extract or generate request ID for tracing
requestID := r.Header.Get("X-Request-ID")
if requestID == "" {
Expand All @@ -33,7 +42,9 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) {

// Only accept GET method
if r.Method != http.MethodGet {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
statusCode = http.StatusMethodNotAllowed
http.Error(w, "Method not allowed", statusCode)
api.recordInfoMetrics(statusCode, startTime)
return
}

Expand All @@ -42,20 +53,35 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) {
// Build info response
info, err := api.buildServiceInfo(ctx, logger)
if err != nil {
// Use Info level for expected conditions like knowledge not being ready yet
logger.Info("service info not available yet", "error", err.Error())
http.Error(w, "Service temporarily unavailable: "+err.Error(),
http.StatusServiceUnavailable)
if errors.Is(err, errInternalServiceInfo) {
logger.Error(err, "internal error building service info")
statusCode = http.StatusInternalServerError
http.Error(w, "Internal server error: "+err.Error(), statusCode)
} else {
// Use Info level for expected conditions like knowledge not being ready yet
logger.Info("service info not available yet", "error", err.Error())
statusCode = http.StatusServiceUnavailable
http.Error(w, "Service temporarily unavailable: "+err.Error(), statusCode)
}
api.recordInfoMetrics(statusCode, startTime)
return
}

// Return response
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
w.WriteHeader(statusCode)
if err := json.NewEncoder(w).Encode(info); err != nil {
logger.Error(err, "failed to encode service info")
return
}
api.recordInfoMetrics(statusCode, startTime)
}

// recordInfoMetrics updates the info API Prometheus metrics for one finished
// request. The request counter is incremented and the time elapsed since
// startTime (in seconds) is observed on the duration histogram; both series
// are labeled with the decimal string form of statusCode.
func (api *HTTPAPI) recordInfoMetrics(statusCode int, startTime time.Time) {
	elapsedSeconds := time.Since(startTime).Seconds()
	codeLabel := strconv.Itoa(statusCode)

	monitor := &api.infoMonitor
	monitor.requestCounter.WithLabelValues(codeLabel).Inc()
	monitor.requestDuration.WithLabelValues(codeLabel).Observe(elapsedSeconds)
}

// resourceAttributes holds the custom attributes for a resource in the info API response.
Expand Down Expand Up @@ -108,9 +134,22 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l
attrsJSON = nil
}

// Build unit from smallest flavor memory (e.g., "131072 MiB" for 128 GiB)
// Validate memory is positive to avoid panic in MultiplyBy (which panics on factor=0)
if groupData.SmallestFlavor.MemoryMB == 0 {
return liquid.ServiceInfo{}, fmt.Errorf("%w: flavor group %q has invalid smallest flavor with memoryMB=0",
errInternalServiceInfo, groupName)
}
unit, err := liquid.UnitMebibytes.MultiplyBy(groupData.SmallestFlavor.MemoryMB)
if err != nil {
// Note: This error only occurs on uint64 overflow, which is unrealistic for memory values
return liquid.ServiceInfo{}, fmt.Errorf("%w: failed to create unit for flavor group %q: %w",
errInternalServiceInfo, groupName, err)
}

resources[resourceName] = liquid.ResourceInfo{
DisplayName: displayName,
Unit: liquid.UnitNone, // Countable: multiples of smallest flavor instances
Unit: unit, // Non-standard unit: multiples of smallest flavor RAM
Topology: liquid.AZAwareTopology, // Commitments are per-AZ
NeedsResourceDemand: false, // Capacity planning out of scope for now
HasCapacity: handlesCommitments, // We report capacity via /v1/report-capacity only for groups that accept commitments
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Copyright SAP SE
// SPDX-License-Identifier: Apache-2.0

package commitments

import (
"github.com/prometheus/client_golang/prometheus"
)

// InfoAPIMonitor provides metrics for the CR info API.
// Both metrics are vectors partitioned by the "status_code" label.
type InfoAPIMonitor struct {
	// requestCounter counts info API requests per HTTP status code.
	requestCounter *prometheus.CounterVec
	// requestDuration records info API request durations in seconds per HTTP status code.
	requestDuration *prometheus.HistogramVec
}

// NewInfoAPIMonitor builds the Prometheus collectors for the committed
// resource info API: a request counter and a request duration histogram,
// each partitioned by the "status_code" label.
//
// The label values "200", "405", "500" and "503" are touched up front so the
// corresponding series exist before the first request is served, which
// prevents "metric missing" warnings in alerting rules.
func NewInfoAPIMonitor() InfoAPIMonitor {
	const statusLabel = "status_code"

	counter := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "cortex_committed_resource_info_api_requests_total",
			Help: "Total number of committed resource info API requests by HTTP status code",
		},
		[]string{statusLabel},
	)
	histogram := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "cortex_committed_resource_info_api_request_duration_seconds",
			Help:    "Duration of committed resource info API requests in seconds by HTTP status code",
			Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10},
		},
		[]string{statusLabel},
	)

	// Requesting a label combination is enough to pre-initialize it; the
	// returned child metrics are intentionally not used here.
	for _, code := range [...]string{"200", "405", "500", "503"} {
		counter.WithLabelValues(code)
		histogram.WithLabelValues(code)
	}

	return InfoAPIMonitor{
		requestCounter:  counter,
		requestDuration: histogram,
	}
}

// Describe implements prometheus.Collector by forwarding the descriptors of
// the request counter and then the request duration histogram to ch.
func (m *InfoAPIMonitor) Describe(ch chan<- *prometheus.Desc) {
	for _, collector := range []prometheus.Collector{m.requestCounter, m.requestDuration} {
		collector.Describe(ch)
	}
}

// Collect implements prometheus.Collector by forwarding the current metrics
// of the request counter and then the request duration histogram to ch.
func (m *InfoAPIMonitor) Collect(ch chan<- prometheus.Metric) {
	for _, collector := range []prometheus.Collector{m.requestCounter, m.requestDuration} {
		collector.Collect(ch)
	}
}
71 changes: 64 additions & 7 deletions internal/scheduling/reservations/commitments/api_info_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,7 @@ func TestHandleInfo_KnowledgeNotReady(t *testing.T) {
WithScheme(scheme).
Build()

api := &HTTPAPI{
client: k8sClient,
}
api := NewAPI(k8sClient)

req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody)
w := httptest.NewRecorder()
Expand Down Expand Up @@ -62,9 +60,7 @@ func TestHandleInfo_MethodNotAllowed(t *testing.T) {
WithScheme(scheme).
Build()

api := &HTTPAPI{
client: k8sClient,
}
api := NewAPI(k8sClient)

// Use POST instead of GET
req := httptest.NewRequest(http.MethodPost, "/v1/info", http.NoBody)
Expand All @@ -80,6 +76,67 @@ func TestHandleInfo_MethodNotAllowed(t *testing.T) {
}
}

// TestHandleInfo_InvalidFlavorMemory checks that HandleInfo answers with
// HTTP 500 Internal Server Error when a flavor group carries invalid data.
//
// A flavor with memoryMB=0 is invalid; such data could occur from a bug in
// the flavor groups extractor.
func TestHandleInfo_InvalidFlavorMemory(t *testing.T) {
	scheme := runtime.NewScheme()
	if err := v1alpha1.AddToScheme(scheme); err != nil {
		t.Fatalf("failed to add scheme: %v", err)
	}

	// The group consists solely of one zero-memory flavor. A fresh map is
	// built for every occurrence so that no entry aliases another.
	zeroMemoryFlavor := func() map[string]any {
		return map[string]any{
			"name":     "zero_memory_flavor",
			"vcpus":    4,
			"memoryMB": 0,
			"diskGB":   50,
		}
	}
	invalidGroup := map[string]any{
		"name":           "invalid_group",
		"flavors":        []map[string]any{zeroMemoryFlavor()},
		"largestFlavor":  zeroMemoryFlavor(),
		"smallestFlavor": zeroMemoryFlavor(),
		"ramCoreRatio":   4096,
	}

	raw, err := v1alpha1.BoxFeatureList([]map[string]any{invalidGroup})
	if err != nil {
		t.Fatalf("failed to box features: %v", err)
	}

	// Knowledge object marked as ready, so the handler gets as far as
	// evaluating the (invalid) flavor group content.
	readyCondition := v1.Condition{Type: v1alpha1.KnowledgeConditionReady, Status: "True"}
	knowledge := &v1alpha1.Knowledge{
		ObjectMeta: v1.ObjectMeta{Name: "flavor-groups"},
		Spec: v1alpha1.KnowledgeSpec{
			SchedulingDomain: v1alpha1.SchedulingDomainNova,
			Extractor:        v1alpha1.KnowledgeExtractorSpec{Name: "flavor_groups"},
		},
		Status: v1alpha1.KnowledgeStatus{
			Conditions:        []v1.Condition{readyCondition},
			Raw:               raw,
			LastContentChange: v1.Now(),
		},
	}

	builder := fake.NewClientBuilder().WithScheme(scheme).WithObjects(knowledge)
	handler := NewAPI(builder.Build())

	recorder := httptest.NewRecorder()
	handler.HandleInfo(recorder, httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody))

	resp := recorder.Result()
	defer resp.Body.Close()

	// Unit creation must fail for memoryMB=0 and surface as a 500.
	if got, want := resp.StatusCode, http.StatusInternalServerError; got != want {
		t.Errorf("expected status code %d (Internal Server Error), got %d", want, got)
	}
}

func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) {
// Test that HasCapacity == HandlesCommitments for all resources
// Both should be true only for groups with fixed RAM/core ratio
Expand Down Expand Up @@ -138,7 +195,7 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) {
WithObjects(knowledge).
Build()

api := &HTTPAPI{client: k8sClient}
api := NewAPI(k8sClient)

req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody)
w := httptest.NewRecorder()
Expand Down
Loading
Loading