From 2ed57940672d895aa4c6f3534b33e3be150f0803 Mon Sep 17 00:00:00 2001 From: Saadi Myftija Date: Mon, 30 Mar 2026 12:29:23 +0200 Subject: [PATCH 1/3] feat(supervisor): custom tolerations for scheduled runs --- apps/supervisor/src/env.ts | 40 ++++++++++++--- .../src/workloadManager/kubernetes.ts | 49 ++++++++++++++++--- 2 files changed, 75 insertions(+), 14 deletions(-) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index fa5b23efbb..7e9dacbc2a 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -123,8 +123,16 @@ const Env = z KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods // Large machine affinity settings - large-* presets prefer a dedicated pool KUBERNETES_LARGE_MACHINE_AFFINITY_ENABLED: BoolEnv.default(false), - KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY: z.string().trim().min(1).default("node.cluster.x-k8s.io/machinepool"), - KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE: z.string().trim().min(1).default("large-machines"), + KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY: z + .string() + .trim() + .min(1) + .default("node.cluster.x-k8s.io/machinepool"), + KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE: z + .string() + .trim() + .min(1) + .default("large-machines"), KUBERNETES_LARGE_MACHINE_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(100), // Project affinity settings - pods from the same project prefer the same node @@ -137,11 +145,29 @@ const Env = z .default("kubernetes.io/hostname"), // Schedule affinity settings - runs from schedule trees prefer a dedicated pool - KUBERNETES_SCHEDULE_AFFINITY_ENABLED: BoolEnv.default(false), - KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_KEY: z.string().trim().min(1).default("node.cluster.x-k8s.io/machinepool"), - KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_VALUE: z.string().trim().min(1).default("scheduled-runs"), - KUBERNETES_SCHEDULE_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(80), - KUBERNETES_SCHEDULE_ANTI_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(20), + KUBERNETES_SCHEDULED_RUN_AFFINITY_ENABLED: BoolEnv.default(false), + KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY: z + .string() + .trim() + .min(1) + .default("node.cluster.x-k8s.io/machinepool"), + KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE: z + .string() + .trim() + .min(1) + .default("scheduled-runs"), + KUBERNETES_SCHEDULED_RUN_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(80), + KUBERNETES_SCHEDULED_RUN_ANTI_AFFINITY_WEIGHT: z.coerce + .number() + .int() + .min(1) + .max(100) + .default(20), + + // Schedule toleration settings - scheduled runs tolerate taints on the dedicated pool + // Comma-separated list of tolerations in the format: key=value:effect + // For Exists operator (no value): key:effect + KUBERNETES_SCHEDULED_RUN_TOLERATIONS: z.string().optional(), // Placement tags settings PLACEMENT_TAGS_ENABLED: BoolEnv.default(false), diff --git a/apps/supervisor/src/workloadManager/kubernetes.ts b/apps/supervisor/src/workloadManager/kubernetes.ts index a7402e9a34..58bea2f8b8 100644 --- a/apps/supervisor/src/workloadManager/kubernetes.ts +++ b/apps/supervisor/src/workloadManager/kubernetes.ts @@ -121,6 +121,7 @@ export class KubernetesWorkloadManager implements WorkloadManager { spec: { ...this.addPlacementTags(this.#defaultPodSpec, opts.placementTags), affinity: this.#getAffinity(opts), + tolerations: this.#getScheduleTolerations(this.#isScheduledRun(opts)), terminationGracePeriodSeconds: 60 * 60, containers: [ { @@ -485,7 +486,7 @@ export class KubernetesWorkloadManager implements WorkloadManager { } #getScheduleNodeAffinityRules(isScheduledRun: boolean): k8s.V1NodeAffinity | undefined { - if (!env.KUBERNETES_SCHEDULE_AFFINITY_ENABLED || !env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_VALUE) { + if (!env.KUBERNETES_SCHEDULED_RUN_AFFINITY_ENABLED || !env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE) { return undefined; } @@ -494,13 +495,13 @@ export class KubernetesWorkloadManager implements WorkloadManager { return { preferredDuringSchedulingIgnoredDuringExecution: [ { - weight: env.KUBERNETES_SCHEDULE_AFFINITY_WEIGHT, + weight: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_WEIGHT, preference: { matchExpressions: [ { - key: env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_KEY, + key: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY, operator: "In", - values: [env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_VALUE], + values: [env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE], }, ], }, @@ -513,13 +514,13 @@ export class KubernetesWorkloadManager implements WorkloadManager { return { preferredDuringSchedulingIgnoredDuringExecution: [ { - weight: env.KUBERNETES_SCHEDULE_ANTI_AFFINITY_WEIGHT, + weight: env.KUBERNETES_SCHEDULED_RUN_ANTI_AFFINITY_WEIGHT, preference: { matchExpressions: [ { - key: env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_KEY, + key: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY, operator: "NotIn", - values: [env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_VALUE], + values: [env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE], }, ], }, @@ -528,6 +529,40 @@ export class KubernetesWorkloadManager implements WorkloadManager { }; } + #getScheduleTolerations(isScheduledRun: boolean): k8s.V1Toleration[] | undefined { + if (!isScheduledRun || !env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS) { + return undefined; + } + + return env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS.split(",") + .map((entry) => entry.trim()) + .filter((entry) => entry.length > 0) + .map((entry) => { + // Format: key=value:effect or key:effect (Exists operator) + const colonIdx = entry.lastIndexOf(":"); + if (colonIdx === -1) { + throw new Error(`Invalid toleration format (missing effect): "${entry}"`); + } + + const effect = entry.slice(colonIdx + 1) as k8s.V1Toleration["effect"]; + const keyValue = entry.slice(0, colonIdx); + const eqIdx = keyValue.indexOf("="); + + if (eqIdx === -1) { + // key:effect -> Exists operator + return { key: keyValue, operator: "Exists" as const, effect }; + } + + // key=value:effect -> Equal operator + return { + key: keyValue.slice(0, eqIdx), + operator: "Equal" as const, + value: keyValue.slice(eqIdx + 1), + effect, + }; + }); + } + #getProjectPodAffinity(projectId: string): k8s.V1PodAffinity | undefined { if (!env.KUBERNETES_PROJECT_AFFINITY_ENABLED) { return undefined; From 46521c5f81652549ad07efc4c3728e3410f9f54f Mon Sep 17 00:00:00 2001 From: Saadi Myftija Date: Mon, 30 Mar 2026 12:39:44 +0200 Subject: [PATCH 2/3] Do the parsing on startup instead of at run time --- apps/supervisor/src/env.ts | 46 ++++++++++++++++++- .../src/workloadManager/kubernetes.ts | 30 +----------- 2 files changed, 47 insertions(+), 29 deletions(-) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index 7e9dacbc2a..d662f16ea7 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -167,7 +167,51 @@ const Env = z // Schedule toleration settings - scheduled runs tolerate taints on the dedicated pool // Comma-separated list of tolerations in the format: key=value:effect // For Exists operator (no value): key:effect - KUBERNETES_SCHEDULED_RUN_TOLERATIONS: z.string().optional(), + KUBERNETES_SCHEDULED_RUN_TOLERATIONS: z + .string() + .transform((val, ctx) => { + const tolerations = val + .split(",") + .map((entry) => entry.trim()) + .filter((entry) => entry.length > 0) + .map((entry) => { + const colonIdx = entry.lastIndexOf(":"); + if (colonIdx === -1) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration format (missing effect): "${entry}"`, + }); + return z.NEVER; + } + + const effect = entry.slice(colonIdx + 1); + const validEffects = ["NoSchedule", "NoExecute", "PreferNoSchedule"]; + if (!validEffects.includes(effect)) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration effect "${effect}" in "${entry}". Must be one of: ${validEffects.join(", ")}`, + }); + return z.NEVER; + } + + const keyValue = entry.slice(0, colonIdx); + const eqIdx = keyValue.indexOf("="); + + if (eqIdx === -1) { + return { key: keyValue, operator: "Exists" as const, effect }; + } + + return { + key: keyValue.slice(0, eqIdx), + operator: "Equal" as const, + value: keyValue.slice(eqIdx + 1), + effect, + }; + }); + + return tolerations; + }) + .optional(), // Placement tags settings PLACEMENT_TAGS_ENABLED: BoolEnv.default(false), diff --git a/apps/supervisor/src/workloadManager/kubernetes.ts b/apps/supervisor/src/workloadManager/kubernetes.ts index 58bea2f8b8..ec08926721 100644 --- a/apps/supervisor/src/workloadManager/kubernetes.ts +++ b/apps/supervisor/src/workloadManager/kubernetes.ts @@ -530,37 +530,11 @@ export class KubernetesWorkloadManager implements WorkloadManager { } #getScheduleTolerations(isScheduledRun: boolean): k8s.V1Toleration[] | undefined { - if (!isScheduledRun || !env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS) { + if (!isScheduledRun || !env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS?.length) { return undefined; } - return env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS.split(",") - .map((entry) => entry.trim()) - .filter((entry) => entry.length > 0) - .map((entry) => { - // Format: key=value:effect or key:effect (Exists operator) - const colonIdx = entry.lastIndexOf(":"); - if (colonIdx === -1) { - throw new Error(`Invalid toleration format (missing effect): "${entry}"`); - } - - const effect = entry.slice(colonIdx + 1) as k8s.V1Toleration["effect"]; - const keyValue = entry.slice(0, colonIdx); - const eqIdx = keyValue.indexOf("="); - - if (eqIdx === -1) { - // key:effect -> Exists operator - return { key: keyValue, operator: "Exists" as const, effect }; - } - - // key=value:effect -> Equal operator - return { - key: keyValue.slice(0, eqIdx), - operator: "Equal" as const, - value: keyValue.slice(eqIdx + 1), - effect, - }; - }); + return env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS; } #getProjectPodAffinity(projectId: string): k8s.V1PodAffinity | undefined { From 450910e982b9871bcb69dc0fbcab4e13cc87dfda Mon Sep 17 00:00:00 2001 From: Saadi Myftija Date: Mon, 30 Mar 2026 12:49:36 +0200 Subject: [PATCH 3/3] Add validation guard for empty keys --- apps/supervisor/src/env.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index d662f16ea7..b69fb24d73 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -196,13 +196,22 @@ const Env = z const keyValue = entry.slice(0, colonIdx); const eqIdx = keyValue.indexOf("="); + const key = eqIdx === -1 ? keyValue : keyValue.slice(0, eqIdx); + + if (!key) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration format (empty key): "${entry}"`, + }); + return z.NEVER; + } if (eqIdx === -1) { - return { key: keyValue, operator: "Exists" as const, effect }; + return { key, operator: "Exists" as const, effect }; } return { - key: keyValue.slice(0, eqIdx), + key, operator: "Equal" as const, value: keyValue.slice(eqIdx + 1), effect,