Skip to content

Commit 72e6a96

Browse files
authored
Add multicluster support for the event recorder (#619)
## Changes - Add cluster routing capabilities to EventRecorder
1 parent 39d1957 commit 72e6a96

13 files changed

Lines changed: 636 additions & 11 deletions

docs/guides/multicluster/readme.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Cortex Multi-Cluster Testing
22

33
> [!NOTE]
4-
> If you want to skip the reading part, there's `run.sh` and `cleanup.sh` scripts in this directory that will set up and tear down the multi-cluster environment for you.
4+
> If you want to skip the reading part, there are `run.sh` and `cleanup.sh` scripts in this directory that will set up and tear down the multi-cluster environment for you. If you want to test the multi-cluster setup, you can run the `schedule.sh` script, which will create a scheduling request and show you how it gets processed across the clusters.
55
66
Cortex provides support for multi-cluster deployments, where a "home" cluster hosts the cortex pods and one or more "remote" clusters are used to persist CRDs. A typical use case for this would be to offload the etcd storage for Cortex CRDs to a remote cluster, reducing the resource usage on the home cluster. Similarly, another use case is to have multiple remote clusters that maintain all the compute workloads and expose resources that Cortex needs to access, such as the `Hypervisor` resource.
77

docs/guides/multicluster/run.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,16 +52,20 @@ global:
5252
gvks:
5353
- kvm.cloud.sap/v1/Hypervisor
5454
- kvm.cloud.sap/v1/HypervisorList
55+
- cortex.cloud/v1alpha1/History
56+
- cortex.cloud/v1alpha1/HistoryList
5557
labels:
56-
az: cortex-remote-az-a
58+
availabilityZone: cortex-remote-az-a
5759
caCert: |
5860
$(cat /tmp/root-ca-remote-az-a.pem | sed 's/^/ /')
5961
- host: https://host.docker.internal:8445
6062
gvks:
6163
- kvm.cloud.sap/v1/Hypervisor
6264
- kvm.cloud.sap/v1/HypervisorList
65+
- cortex.cloud/v1alpha1/History
66+
- cortex.cloud/v1alpha1/HistoryList
6367
labels:
64-
az: cortex-remote-az-b
68+
availabilityZone: cortex-remote-az-b
6569
caCert: |
6670
$(cat /tmp/root-ca-remote-az-b.pem | sed 's/^/ /')
6771
EOF
Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
#!/bin/bash
2+
3+
set -e
4+
5+
API_URL="http://localhost:8001/scheduler/nova/external"
6+
INSTANCE_UUID="cortex-test-instance-001"
7+
HISTORY_NAME="nova-$INSTANCE_UUID"
8+
9+
# --- Step 1: Apply the test pipeline -----------------------------------------
10+
11+
echo "=== Step 1: Apply test pipeline ==="
12+
echo ""
13+
echo "The test pipeline is a minimal filter-weigher pipeline with:"
14+
echo " - createHistory: true (so a History CRD is created for each decision)"
15+
echo " - filter_correct_az (filters hosts not matching the requested AZ)"
16+
echo " - no weighers (hosts are returned in their original order)"
17+
echo ""
18+
19+
kubectl --context kind-cortex-home apply -f docs/guides/multicluster/test-pipeline.yaml
20+
21+
echo ""
22+
echo "Press enter to send a scheduling request..."
23+
read -r
24+
25+
# --- Step 2: Send scheduling request -----------------------------------------
26+
27+
echo "=== Step 2: Send scheduling request ==="
28+
echo ""
29+
echo "Sending a Nova external scheduler request to the cortex API."
30+
echo ""
31+
echo " Instance UUID: $INSTANCE_UUID"
32+
echo " Availability Zone: cortex-remote-az-b"
33+
echo " Pipeline: multicluster-test"
34+
echo " Candidate hosts: hypervisor-{1,2}-az-{a,b} (4 hosts across 2 AZs)"
35+
echo ""
36+
echo "The pipeline's filter_correct_az step should filter out the az-a hosts,"
37+
echo "leaving only hypervisor-1-az-b and hypervisor-2-az-b."
38+
echo ""
39+
40+
RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$API_URL" \
41+
-H "Content-Type: application/json" \
42+
-d @- <<EOF
43+
{
44+
"spec": {
45+
"nova_object.name": "RequestSpec",
46+
"nova_object.namespace": "nova",
47+
"nova_object.version": "1.14",
48+
"nova_object.changes": [],
49+
"nova_object.data": {
50+
"project_id": "test-project",
51+
"user_id": "test-user",
52+
"instance_uuid": "$INSTANCE_UUID",
53+
"availability_zone": "cortex-remote-az-b",
54+
"num_instances": 1,
55+
"is_bfv": false,
56+
"scheduler_hints": {},
57+
"ignore_hosts": null,
58+
"force_hosts": null,
59+
"force_nodes": null,
60+
"image": {
61+
"nova_object.name": "ImageMeta",
62+
"nova_object.namespace": "nova",
63+
"nova_object.version": "1.8",
64+
"nova_object.changes": [],
65+
"nova_object.data": {
66+
"id": "00000000-0000-0000-0000-000000000001",
67+
"name": "test-image",
68+
"status": "active",
69+
"checksum": "0000000000000000",
70+
"owner": "test-project",
71+
"size": 1024,
72+
"container_format": "bare",
73+
"disk_format": "raw",
74+
"created_at": "2025-01-01T00:00:00Z",
75+
"updated_at": "2025-01-01T00:00:00Z",
76+
"min_ram": 0,
77+
"min_disk": 0,
78+
"properties": {
79+
"nova_object.name": "ImageMetaProps",
80+
"nova_object.namespace": "nova",
81+
"nova_object.version": "1.36",
82+
"nova_object.changes": [],
83+
"nova_object.data": {}
84+
}
85+
}
86+
},
87+
"flavor": {
88+
"nova_object.name": "Flavor",
89+
"nova_object.namespace": "nova",
90+
"nova_object.version": "1.2",
91+
"nova_object.changes": [],
92+
"nova_object.data": {
93+
"id": 1,
94+
"name": "m1.small",
95+
"memory_mb": 2048,
96+
"vcpus": 1,
97+
"root_gb": 20,
98+
"ephemeral_gb": 0,
99+
"flavorid": "1",
100+
"swap": 0,
101+
"rxtx_factor": 1.0,
102+
"vcpu_weight": 0,
103+
"disabled": false,
104+
"is_public": true,
105+
"extra_specs": {
106+
"capabilities:hypervisor_type": "qemu"
107+
},
108+
"description": null,
109+
"created_at": "2025-01-01T00:00:00Z",
110+
"updated_at": null
111+
}
112+
},
113+
"request_level_params": {
114+
"nova_object.name": "RequestLevelParams",
115+
"nova_object.namespace": "nova",
116+
"nova_object.version": "1.1",
117+
"nova_object.changes": [],
118+
"nova_object.data": {
119+
"root_required": [],
120+
"root_forbidden": [],
121+
"same_subtree": []
122+
}
123+
},
124+
"network_metadata": {
125+
"nova_object.name": "NetworkMetadata",
126+
"nova_object.namespace": "nova",
127+
"nova_object.version": "1.0",
128+
"nova_object.changes": [],
129+
"nova_object.data": {
130+
"physnets": [],
131+
"tunneled": false
132+
}
133+
},
134+
"limits": {
135+
"nova_object.name": "SchedulerLimits",
136+
"nova_object.namespace": "nova",
137+
"nova_object.version": "1.0",
138+
"nova_object.changes": [],
139+
"nova_object.data": {}
140+
},
141+
"requested_networks": {
142+
"objects": null
143+
},
144+
"security_groups": {
145+
"objects": null
146+
}
147+
}
148+
},
149+
"context": {
150+
"user": "test-user",
151+
"project_id": "test-project",
152+
"system_scope": null,
153+
"project": "test-project",
154+
"domain": null,
155+
"user_domain": "Default",
156+
"project_domain": "Default",
157+
"is_admin": false,
158+
"read_only": false,
159+
"show_deleted": false,
160+
"request_id": "req-test-001",
161+
"global_request_id": null,
162+
"resource_uuid": null,
163+
"roles": [],
164+
"user_identity": "test-user test-project - Default -",
165+
"is_admin_project": false,
166+
"read_deleted": "no",
167+
"remote_address": "127.0.0.1",
168+
"timestamp": "2025-01-01T00:00:00.000000",
169+
"quota_class": null,
170+
"user_name": "test-user",
171+
"project_name": "test-project"
172+
},
173+
"hosts": [
174+
{"host": "hypervisor-1-az-a", "hypervisor_hostname": "hypervisor-1-az-a"},
175+
{"host": "hypervisor-2-az-a", "hypervisor_hostname": "hypervisor-2-az-a"},
176+
{"host": "hypervisor-1-az-b", "hypervisor_hostname": "hypervisor-1-az-b"},
177+
{"host": "hypervisor-2-az-b", "hypervisor_hostname": "hypervisor-2-az-b"}
178+
],
179+
"weights": {
180+
"hypervisor-1-az-a": 1.0,
181+
"hypervisor-2-az-a": 2.0,
182+
"hypervisor-1-az-b": 3.0,
183+
"hypervisor-2-az-b": 4.0
184+
},
185+
"pipeline": "multicluster-test"
186+
}
187+
EOF
188+
)
189+
190+
HTTP_CODE=$(echo "$RESPONSE" | tail -1)
191+
BODY=$(echo "$RESPONSE" | sed '$d')
192+
193+
echo "Response (HTTP $HTTP_CODE):"
194+
echo "$BODY" | python3 -m json.tool 2>/dev/null || echo "$BODY"
195+
196+
if [ "$HTTP_CODE" != "200" ]; then
197+
echo ""
198+
echo "ERROR: Scheduling request failed. Check the controller logs:"
199+
echo " kubectl --context kind-cortex-home logs deploy/cortex-nova-scheduling-controller-manager"
200+
exit 1
201+
fi
202+
203+
echo ""
204+
echo "Press enter to check History CRDs and events across all clusters..."
205+
read -r
206+
207+
# --- Step 3: Check History and Events ----------------------------------------
208+
209+
echo "=== Step 3: Check History CRDs and Events ==="
210+
echo ""
211+
echo "The pipeline has createHistory: true, so a History CRD named '$HISTORY_NAME'"
212+
echo "should have been created. An event should also have been recorded on it."
213+
echo "Based on the multicluster config, this should be on the remote cluster cortex-remote-az-b."
214+
echo ""
215+
216+
sleep 1
217+
218+
for CLUSTER in kind-cortex-home kind-cortex-remote-az-a kind-cortex-remote-az-b; do
219+
echo "--- $CLUSTER ---"
220+
echo "Histories:"
221+
kubectl --context "$CLUSTER" get histories 2>/dev/null || echo " (none)"
222+
echo "Events:"
223+
kubectl --context "$CLUSTER" get events --field-selector reason=SchedulingSucceeded 2>/dev/null || echo " (none)"
224+
echo ""
225+
done
226+
227+
echo "Press enter to describe the History CRD and see the full scheduling result..."
228+
read -r
229+
230+
# --- Step 4: Describe History ------------------------------------------------
231+
232+
echo "=== Step 4: Describe History CRD ==="
233+
echo ""
234+
echo "The History CRD contains the full scheduling decision context:"
235+
echo " - Which pipeline was used"
236+
echo " - The target host that was selected"
237+
echo " - An explanation of each filter/weigher step"
238+
echo " - The Ready condition (True = host selected, False = no host found)"
239+
echo ""
240+
241+
# Try all clusters to find where the History ended up.
242+
for CLUSTER in kind-cortex-home kind-cortex-remote-az-a kind-cortex-remote-az-b; do
243+
if kubectl --context "$CLUSTER" get history "$HISTORY_NAME" &>/dev/null; then
244+
echo "Found History '$HISTORY_NAME' in $CLUSTER:"
245+
echo ""
246+
kubectl --context "$CLUSTER" describe history "$HISTORY_NAME"
247+
exit 0
248+
fi
249+
done
250+
251+
echo "WARNING: History '$HISTORY_NAME' was not found in any cluster."
252+
echo "Check the controller logs for errors:"
253+
echo " kubectl --context kind-cortex-home logs deploy/cortex-nova-scheduling-controller-manager | grep -i history"
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
apiVersion: cortex.cloud/v1alpha1
kind: Pipeline
metadata:
  name: multicluster-test
spec:
  schedulingDomain: nova
  description: Minimal test pipeline for the multicluster guide.
  type: filter-weigher
  # Persist a History CRD for every scheduling decision made by this pipeline.
  createHistory: true
  # Single filter, no weighers: hosts that pass keep their original order.
  filters:
    - name: filter_correct_az
  weighers: []

internal/scheduling/cinder/filter_weigher_pipeline_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ func (c *FilterWeigherPipelineController) InitPipeline(
148148
func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error {
149149
c.Initializer = c
150150
c.SchedulingDomain = v1alpha1.SchedulingDomainCinder
151-
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-cinder-scheduler")}
151+
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-cinder-scheduler")}
152152
if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil {
153153
return err
154154
}

internal/scheduling/machines/filter_weigher_pipeline_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ func (c *FilterWeigherPipelineController) handleMachine() handler.EventHandler {
222222
func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error {
223223
c.Initializer = c
224224
c.SchedulingDomain = v1alpha1.SchedulingDomainMachines
225-
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-machines-scheduler")}
225+
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-machines-scheduler")}
226226
if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil {
227227
return err
228228
}

internal/scheduling/manila/filter_weigher_pipeline_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ func (c *FilterWeigherPipelineController) InitPipeline(
148148
func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error {
149149
c.Initializer = c
150150
c.SchedulingDomain = v1alpha1.SchedulingDomainManila
151-
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-manila-scheduler")}
151+
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-manila-scheduler")}
152152
if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil {
153153
return err
154154
}

internal/scheduling/nova/filter_weigher_pipeline_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ func (c *FilterWeigherPipelineController) InitPipeline(
199199
func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error {
200200
c.Initializer = c
201201
c.SchedulingDomain = v1alpha1.SchedulingDomainNova
202-
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-nova-scheduler")}
202+
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-nova-scheduler")}
203203
c.gatherer = &candidateGatherer{Client: mcl}
204204
if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil {
205205
return err

internal/scheduling/pods/filter_weigher_pipeline_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ func (c *FilterWeigherPipelineController) handlePod() handler.EventHandler {
234234
func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error {
235235
c.Initializer = c
236236
c.SchedulingDomain = v1alpha1.SchedulingDomainPods
237-
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-pods-scheduler")}
237+
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-pods-scheduler")}
238238
if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil {
239239
return err
240240
}

internal/scheduling/reservations/failover/controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -766,7 +766,7 @@ func (c *FailoverReservationController) patchReservationStatus(ctx context.Conte
766766
// SetupWithManager sets up the watch-based reconciler with the Manager.
767767
// This handles per-reservation reconciliation triggered by CRD changes.
768768
func (c *FailoverReservationController) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error {
769-
c.Recorder = mgr.GetEventRecorder("failover-reservation-controller")
769+
c.Recorder = mcl.GetEventRecorder("failover-reservation-controller")
770770

771771
bldr := multicluster.BuildController(mcl, mgr)
772772
bldr, err := bldr.WatchesMulticluster(

0 commit comments

Comments
 (0)