diff --git a/cmd/main.go b/cmd/main.go index 5d8acf1e7..be202f604 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -329,6 +329,9 @@ func main() { os.Exit(1) } novaAPIConfig := conf.GetConfigOrDie[nova.HTTPAPIConfig]() + setupLog.Info("loaded nova API config", + "evacuationShuffleK", novaAPIConfig.EvacuationShuffleK, + "novaLimitHostsToRequest", novaAPIConfig.NovaLimitHostsToRequest) nova.NewAPI(novaAPIConfig, filterWeigherController).Init(mux) // Detector pipeline controller setup. diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index 192d714c2..b466e519c 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -141,6 +141,9 @@ cortex-scheduling-controllers: # If true, the external scheduler API will limit the list of hosts in its # response to those included in the scheduling request. novaLimitHostsToRequest: true + # Number of top hosts to shuffle for evacuation requests. + # Set to 0 or negative to disable shuffling. + evacuationShuffleK: 3 # CommittedResourceFlavorGroupPipelines maps flavor group IDs to pipeline names for CR reservations # This allows different scheduling strategies per flavor group (e.g., HANA vs GP) committedResourceFlavorGroupPipelines: diff --git a/internal/scheduling/nova/external_scheduler_api.go b/internal/scheduling/nova/external_scheduler_api.go index 59a5b377c..d731f8ffa 100644 --- a/internal/scheduling/nova/external_scheduler_api.go +++ b/internal/scheduling/nova/external_scheduler_api.go @@ -11,6 +11,7 @@ import ( "fmt" "io" "log/slog" + "math/rand" "net/http" api "github.com/cobaltcore-dev/cortex/api/external/nova" @@ -24,6 +25,16 @@ import ( "sigs.k8s.io/controller-runtime/pkg/metrics" ) +// Custom configuration for the Nova external scheduler api. +type HTTPAPIConfig struct { + // Number of top hosts to shuffle for evacuation requests. + // Set to 0 or negative to disable shuffling. + EvacuationShuffleK int `json:"evacuationShuffleK,omitempty"` + // NovaLimitHostsToRequest, if true, will filter the Nova scheduler response + // to only include hosts that were in the original request. + NovaLimitHostsToRequest bool `json:"novaLimitHostsToRequest,omitempty"` +} + type HTTPAPIDelegate interface { // Process the decision from the API. Should create and return the updated decision. ProcessNewDecisionFromAPI(ctx context.Context, decision *v1alpha1.Decision) error @@ -34,12 +45,6 @@ type HTTPAPI interface { Init(*http.ServeMux) } -type HTTPAPIConfig struct { - // NovaLimitHostsToRequest, if true, will filter the Nova scheduler response - // to only include hosts that were in the original request. - NovaLimitHostsToRequest bool `json:"novaLimitHostsToRequest,omitempty"` -} - type httpAPI struct { monitor scheduling.APIMonitor delegate HTTPAPIDelegate @@ -116,6 +121,26 @@ func (httpAPI *httpAPI) inferPipelineName(requestData api.ExternalSchedulerReque } } +// shuffleTopHosts randomly reorders the first k hosts if the request +// is an evacuation. This helps distribute evacuated VMs across multiple hosts +// rather than concentrating them on the single "best" host. +func shuffleTopHosts(hosts []string, k int) []string { + if k <= 0 { + return hosts + } + n := min(k, len(hosts)) + if n <= 1 { + return hosts + } + result := make([]string, len(hosts)) + copy(result, hosts) + rand.Shuffle(n, func(i, j int) { + result[i], result[j] = result[j], result[i] + }) + slog.Info("shuffled top hosts for evacuation", "k", n, "hosts", result[:n]) + return result +} + // Limit the external scheduler response to the hosts provided in the external // scheduler request. i.e. don't provide new hosts that weren't in the request, // since the Nova scheduler won't know how to handle them. @@ -235,6 +260,12 @@ func (httpAPI *httpAPI) NovaExternalScheduler(w http.ResponseWriter, r *http.Req slog.Info("limited hosts to request", "hosts", hosts, "originalHosts", decision.Status.Result.OrderedHosts) } + // This is a hack to address the problem that Nova only uses the first host in hosts for evacuation requests. + // Only for evacuation we shuffle the first k hosts to ensure that we do not get stuck on a single host + intent, err := requestData.GetIntent() + if err == nil && intent == api.EvacuateIntent { + hosts = shuffleTopHosts(hosts, httpAPI.config.EvacuationShuffleK) + } response := api.ExternalSchedulerResponse{Hosts: hosts} w.Header().Set("Content-Type", "application/json") if err = json.NewEncoder(w).Encode(response); err != nil { diff --git a/internal/scheduling/nova/external_scheduler_api_test.go b/internal/scheduling/nova/external_scheduler_api_test.go index e394eb77a..70ce3c334 100644 --- a/internal/scheduling/nova/external_scheduler_api_test.go +++ b/internal/scheduling/nova/external_scheduler_api_test.go @@ -506,6 +506,114 @@ func TestLimitHostsToRequest(t *testing.T) { } } +func TestShuffleTopHosts(t *testing.T) { + tests := []struct { + name string + hosts []string + k int + unchangedTailFrom int // index from which hosts should be unchanged (-1 if all can change) + expectUnchanged bool + }{ + { + name: "empty hosts returns empty", + hosts: []string{}, + k: 3, + }, + { + name: "single host returns unchanged", + hosts: []string{"host1"}, + k: 3, + unchangedTailFrom: 0, + }, + { + name: "two hosts with k=3 shuffles all", + hosts: []string{"host1", "host2"}, + k: 3, + unchangedTailFrom: -1, + }, + { + name: "three hosts with k=3 shuffles all", + hosts: []string{"host1", "host2", "host3"}, + k: 3, + unchangedTailFrom: -1, + }, + { + name: "four hosts with k=3 shuffles first 3", + hosts: []string{"host1", "host2", "host3", "host4"}, + k: 3, + unchangedTailFrom: 3, + }, + { + name: "shuffles only first k hosts", + hosts: []string{"host1", "host2", "host3", "host4", "host5"}, + k: 3, + unchangedTailFrom: 3, + }, + { + name: "k=0 disables shuffling", + hosts: []string{"host1", "host2", "host3", "host4", "host5"}, + k: 0, + expectUnchanged: true, + }, + { + name: "negative k disables shuffling", + hosts: []string{"host1", "host2", "host3", "host4", "host5"}, + k: -1, + expectUnchanged: true, + }, + { + name: "k larger than hosts shuffles all", + hosts: []string{"host1", "host2"}, + k: 10, + unchangedTailFrom: -1, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + original := make([]string, len(tt.hosts)) + copy(original, tt.hosts) + + result := shuffleTopHosts(tt.hosts, tt.k) + + if len(result) != len(tt.hosts) { + t.Fatalf("expected %d hosts, got %d", len(tt.hosts), len(result)) + } + // Verify original slice not modified + for i, h := range original { + if tt.hosts[i] != h { + t.Errorf("original slice modified at %d: expected %s, got %s", i, h, tt.hosts[i]) + } + } + // When k <= 0, expect the same slice returned (no copy) + if tt.expectUnchanged { + if len(result) > 0 && &result[0] != &tt.hosts[0] { + t.Error("expected same slice returned when k <= 0") + } + return + } + // Verify tail unchanged + if tt.unchangedTailFrom >= 0 { + for i := tt.unchangedTailFrom; i < len(original); i++ { + if result[i] != original[i] { + t.Errorf("expected host[%d] = %s unchanged, got %s", i, original[i], result[i]) + } + } + } + // Verify all hosts present + hostSet := make(map[string]bool) + for _, h := range result { + hostSet[h] = true + } + for _, h := range original { + if !hostSet[h] { + t.Errorf("host %s missing from result", h) + } + } + }) + } +} + func TestHTTPAPI_inferPipelineName(t *testing.T) { delegate := &mockHTTPAPIDelegate{} api := NewAPI(HTTPAPIConfig{}, delegate).(*httpAPI) diff --git a/internal/scheduling/nova/integration_test.go b/internal/scheduling/nova/integration_test.go index 47f371f0b..f4bb38a79 100644 --- a/internal/scheduling/nova/integration_test.go +++ b/internal/scheduling/nova/integration_test.go @@ -283,7 +283,9 @@ func NewIntegrationTestServer(t *testing.T, pipelineConfig PipelineConfig, objec controller.PipelineConfigs[testPipeline.Name] = testPipeline // Create the HTTP API with the controller as delegate - skip metrics registration + // Set EvacuationShuffleK=0 to disable shuffle for deterministic test results api := &httpAPI{ + config: HTTPAPIConfig{EvacuationShuffleK: 0}, monitor: lib.NewSchedulerMonitor(), // Create new monitor but don't register delegate: controller, }