Skip to content

Commit 7210bde

Browse files
authored
feat(supervisor): custom tolerations for scheduled runs (#3297)
Adds support for taint tolerations for scheduled runs, which is useful for selectively tolerating taints on dedicated node pools. The new `KUBERNETES_SCHEDULED_RUN_TOLERATIONS` env variable accepts a comma-separated list of entries in the format `key=value:effect` (or `key:effect` for the Exists operator). Drive-by: renames all `KUBERNETES_SCHEDULE_*` affinity env vars to `KUBERNETES_SCHEDULED_RUN_*` for clarity. The affinity feature isn't used in production yet and hasn't been published in a tagged image, so the rename is safe.
1 parent 66b7010 commit 7210bde

File tree

2 files changed

+102
-14
lines changed

2 files changed

+102
-14
lines changed

apps/supervisor/src/env.ts

Lines changed: 86 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,16 @@ const Env = z
123123
KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods
124124
// Large machine affinity settings - large-* presets prefer a dedicated pool
125125
KUBERNETES_LARGE_MACHINE_AFFINITY_ENABLED: BoolEnv.default(false),
126-
KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY: z.string().trim().min(1).default("node.cluster.x-k8s.io/machinepool"),
127-
KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE: z.string().trim().min(1).default("large-machines"),
126+
KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY: z
127+
.string()
128+
.trim()
129+
.min(1)
130+
.default("node.cluster.x-k8s.io/machinepool"),
131+
KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE: z
132+
.string()
133+
.trim()
134+
.min(1)
135+
.default("large-machines"),
128136
KUBERNETES_LARGE_MACHINE_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(100),
129137

130138
// Project affinity settings - pods from the same project prefer the same node
@@ -137,11 +145,82 @@ const Env = z
137145
.default("kubernetes.io/hostname"),
138146

139147
// Scheduled run affinity settings - runs from schedule trees prefer a dedicated pool
140-
KUBERNETES_SCHEDULE_AFFINITY_ENABLED: BoolEnv.default(false),
141-
KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_KEY: z.string().trim().min(1).default("node.cluster.x-k8s.io/machinepool"),
142-
KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_VALUE: z.string().trim().min(1).default("scheduled-runs"),
143-
KUBERNETES_SCHEDULE_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(80),
144-
KUBERNETES_SCHEDULE_ANTI_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(20),
148+
KUBERNETES_SCHEDULED_RUN_AFFINITY_ENABLED: BoolEnv.default(false),
149+
KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY: z
150+
.string()
151+
.trim()
152+
.min(1)
153+
.default("node.cluster.x-k8s.io/machinepool"),
154+
KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE: z
155+
.string()
156+
.trim()
157+
.min(1)
158+
.default("scheduled-runs"),
159+
KUBERNETES_SCHEDULED_RUN_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(80),
160+
KUBERNETES_SCHEDULED_RUN_ANTI_AFFINITY_WEIGHT: z.coerce
161+
.number()
162+
.int()
163+
.min(1)
164+
.max(100)
165+
.default(20),
166+
167+
// Scheduled run toleration settings - scheduled runs tolerate taints on the dedicated pool
168+
// Comma-separated list of tolerations in the format: key=value:effect
169+
// For Exists operator (no value): key:effect
170+
KUBERNETES_SCHEDULED_RUN_TOLERATIONS: z
171+
.string()
172+
.transform((val, ctx) => {
173+
const tolerations = val
174+
.split(",")
175+
.map((entry) => entry.trim())
176+
.filter((entry) => entry.length > 0)
177+
.map((entry) => {
178+
const colonIdx = entry.lastIndexOf(":");
179+
if (colonIdx === -1) {
180+
ctx.addIssue({
181+
code: z.ZodIssueCode.custom,
182+
message: `Invalid toleration format (missing effect): "${entry}"`,
183+
});
184+
return z.NEVER;
185+
}
186+
187+
const effect = entry.slice(colonIdx + 1);
188+
const validEffects = ["NoSchedule", "NoExecute", "PreferNoSchedule"];
189+
if (!validEffects.includes(effect)) {
190+
ctx.addIssue({
191+
code: z.ZodIssueCode.custom,
192+
message: `Invalid toleration effect "${effect}" in "${entry}". Must be one of: ${validEffects.join(", ")}`,
193+
});
194+
return z.NEVER;
195+
}
196+
197+
const keyValue = entry.slice(0, colonIdx);
198+
const eqIdx = keyValue.indexOf("=");
199+
const key = eqIdx === -1 ? keyValue : keyValue.slice(0, eqIdx);
200+
201+
if (!key) {
202+
ctx.addIssue({
203+
code: z.ZodIssueCode.custom,
204+
message: `Invalid toleration format (empty key): "${entry}"`,
205+
});
206+
return z.NEVER;
207+
}
208+
209+
if (eqIdx === -1) {
210+
return { key, operator: "Exists" as const, effect };
211+
}
212+
213+
return {
214+
key,
215+
operator: "Equal" as const,
216+
value: keyValue.slice(eqIdx + 1),
217+
effect,
218+
};
219+
});
220+
221+
return tolerations;
222+
})
223+
.optional(),
145224

146225
// Placement tags settings
147226
PLACEMENT_TAGS_ENABLED: BoolEnv.default(false),

apps/supervisor/src/workloadManager/kubernetes.ts

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ export class KubernetesWorkloadManager implements WorkloadManager {
121121
spec: {
122122
...this.addPlacementTags(this.#defaultPodSpec, opts.placementTags),
123123
affinity: this.#getAffinity(opts),
124+
tolerations: this.#getScheduleTolerations(this.#isScheduledRun(opts)),
124125
terminationGracePeriodSeconds: 60 * 60,
125126
containers: [
126127
{
@@ -485,7 +486,7 @@ export class KubernetesWorkloadManager implements WorkloadManager {
485486
}
486487

487488
#getScheduleNodeAffinityRules(isScheduledRun: boolean): k8s.V1NodeAffinity | undefined {
488-
if (!env.KUBERNETES_SCHEDULE_AFFINITY_ENABLED || !env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_VALUE) {
489+
if (!env.KUBERNETES_SCHEDULED_RUN_AFFINITY_ENABLED || !env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE) {
489490
return undefined;
490491
}
491492

@@ -494,13 +495,13 @@ export class KubernetesWorkloadManager implements WorkloadManager {
494495
return {
495496
preferredDuringSchedulingIgnoredDuringExecution: [
496497
{
497-
weight: env.KUBERNETES_SCHEDULE_AFFINITY_WEIGHT,
498+
weight: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_WEIGHT,
498499
preference: {
499500
matchExpressions: [
500501
{
501-
key: env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_KEY,
502+
key: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY,
502503
operator: "In",
503-
values: [env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_VALUE],
504+
values: [env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE],
504505
},
505506
],
506507
},
@@ -513,13 +514,13 @@ export class KubernetesWorkloadManager implements WorkloadManager {
513514
return {
514515
preferredDuringSchedulingIgnoredDuringExecution: [
515516
{
516-
weight: env.KUBERNETES_SCHEDULE_ANTI_AFFINITY_WEIGHT,
517+
weight: env.KUBERNETES_SCHEDULED_RUN_ANTI_AFFINITY_WEIGHT,
517518
preference: {
518519
matchExpressions: [
519520
{
520-
key: env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_KEY,
521+
key: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY,
521522
operator: "NotIn",
522-
values: [env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_VALUE],
523+
values: [env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE],
523524
},
524525
],
525526
},
@@ -528,6 +529,14 @@ export class KubernetesWorkloadManager implements WorkloadManager {
528529
};
529530
}
530531

532+
/**
 * Selects the tolerations to place on a run's pod spec.
 *
 * @param isScheduledRun - Whether the run being created is a scheduled run.
 * @returns The parsed `KUBERNETES_SCHEDULED_RUN_TOLERATIONS` list when the run
 *   is a scheduled run and the list is non-empty; otherwise `undefined`.
 *
 * NOTE(review): the caller assigns this result to `spec.tolerations` after
 * spreading the default pod spec, so an `undefined` result would replace any
 * tolerations coming from that default spec - confirm none are expected there.
 */
#getScheduleTolerations(isScheduledRun: boolean): k8s.V1Toleration[] | undefined {
533+
// The optional chain covers both an unset env var and an empty parsed list.
if (!isScheduledRun || !env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS?.length) {
534+
return undefined;
535+
}
536+
537+
return env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS;
538+
}
539+
531540
#getProjectPodAffinity(projectId: string): k8s.V1PodAffinity | undefined {
532541
if (!env.KUBERNETES_PROJECT_AFFINITY_ENABLED) {
533542
return undefined;

0 commit comments

Comments
 (0)