diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index fa5b23efbb..b69fb24d73 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -123,8 +123,16 @@ const Env = z KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods // Large machine affinity settings - large-* presets prefer a dedicated pool KUBERNETES_LARGE_MACHINE_AFFINITY_ENABLED: BoolEnv.default(false), - KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY: z.string().trim().min(1).default("node.cluster.x-k8s.io/machinepool"), - KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE: z.string().trim().min(1).default("large-machines"), + KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY: z + .string() + .trim() + .min(1) + .default("node.cluster.x-k8s.io/machinepool"), + KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE: z + .string() + .trim() + .min(1) + .default("large-machines"), KUBERNETES_LARGE_MACHINE_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(100), // Project affinity settings - pods from the same project prefer the same node @@ -137,11 +145,82 @@ const Env = z .default("kubernetes.io/hostname"), // Schedule affinity settings - runs from schedule trees prefer a dedicated pool - KUBERNETES_SCHEDULE_AFFINITY_ENABLED: BoolEnv.default(false), - KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_KEY: z.string().trim().min(1).default("node.cluster.x-k8s.io/machinepool"), - KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_VALUE: z.string().trim().min(1).default("scheduled-runs"), - KUBERNETES_SCHEDULE_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(80), - KUBERNETES_SCHEDULE_ANTI_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(20), + KUBERNETES_SCHEDULED_RUN_AFFINITY_ENABLED: BoolEnv.default(false), + KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY: z + .string() + .trim() + .min(1) + .default("node.cluster.x-k8s.io/machinepool"), + KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE: z + .string() + .trim() + .min(1) + .default("scheduled-runs"), + KUBERNETES_SCHEDULED_RUN_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(80), + KUBERNETES_SCHEDULED_RUN_ANTI_AFFINITY_WEIGHT: z.coerce + .number() + .int() + .min(1) + .max(100) + .default(20), + + // Schedule toleration settings - scheduled runs tolerate taints on the dedicated pool + // Comma-separated list of tolerations in the format: key=value:effect + // For Exists operator (no value): key:effect + KUBERNETES_SCHEDULED_RUN_TOLERATIONS: z + .string() + .transform((val, ctx) => { + const tolerations = val + .split(",") + .map((entry) => entry.trim()) + .filter((entry) => entry.length > 0) + .map((entry) => { + const colonIdx = entry.lastIndexOf(":"); + if (colonIdx === -1) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration format (missing effect): "${entry}"`, + }); + return z.NEVER; + } + + const effect = entry.slice(colonIdx + 1); + const validEffects = ["NoSchedule", "NoExecute", "PreferNoSchedule"]; + if (!validEffects.includes(effect)) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration effect "${effect}" in "${entry}". Must be one of: ${validEffects.join(", ")}`, + }); + return z.NEVER; + } + + const keyValue = entry.slice(0, colonIdx); + const eqIdx = keyValue.indexOf("="); + const key = eqIdx === -1 ? keyValue : keyValue.slice(0, eqIdx); + + if (!key) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration format (empty key): "${entry}"`, + }); + return z.NEVER; + } + + if (eqIdx === -1) { + return { key, operator: "Exists" as const, effect }; + } + + return { + key, + operator: "Equal" as const, + value: keyValue.slice(eqIdx + 1), + effect, + }; + }); + + return tolerations; + }) + .optional(), // Placement tags settings PLACEMENT_TAGS_ENABLED: BoolEnv.default(false), diff --git a/apps/supervisor/src/workloadManager/kubernetes.ts b/apps/supervisor/src/workloadManager/kubernetes.ts index a7402e9a34..ec08926721 100644 --- a/apps/supervisor/src/workloadManager/kubernetes.ts +++ b/apps/supervisor/src/workloadManager/kubernetes.ts @@ -121,6 +121,7 @@ export class KubernetesWorkloadManager implements WorkloadManager { spec: { ...this.addPlacementTags(this.#defaultPodSpec, opts.placementTags), affinity: this.#getAffinity(opts), + tolerations: this.#getScheduleTolerations(this.#isScheduledRun(opts)), terminationGracePeriodSeconds: 60 * 60, containers: [ { @@ -485,7 +486,7 @@ export class KubernetesWorkloadManager implements WorkloadManager { } #getScheduleNodeAffinityRules(isScheduledRun: boolean): k8s.V1NodeAffinity | undefined { - if (!env.KUBERNETES_SCHEDULE_AFFINITY_ENABLED || !env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_VALUE) { + if (!env.KUBERNETES_SCHEDULED_RUN_AFFINITY_ENABLED || !env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE) { return undefined; } @@ -494,13 +495,13 @@ export class KubernetesWorkloadManager implements WorkloadManager { return { preferredDuringSchedulingIgnoredDuringExecution: [ { - weight: env.KUBERNETES_SCHEDULE_AFFINITY_WEIGHT, + weight: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_WEIGHT, preference: { matchExpressions: [ { - key: env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_KEY, + key: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY, operator: "In", - values: [env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_VALUE], + values: [env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE], }, ], }, @@ -513,13 +514,13 @@ export class KubernetesWorkloadManager implements WorkloadManager { return { preferredDuringSchedulingIgnoredDuringExecution: [ { - weight: env.KUBERNETES_SCHEDULE_ANTI_AFFINITY_WEIGHT, + weight: env.KUBERNETES_SCHEDULED_RUN_ANTI_AFFINITY_WEIGHT, preference: { matchExpressions: [ { - key: env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_KEY, + key: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY, operator: "NotIn", - values: [env.KUBERNETES_SCHEDULE_AFFINITY_POOL_LABEL_VALUE], + values: [env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE], }, ], }, @@ -528,6 +529,14 @@ export class KubernetesWorkloadManager implements WorkloadManager { }; } + #getScheduleTolerations(isScheduledRun: boolean): k8s.V1Toleration[] | undefined { + if (!isScheduledRun || !env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS?.length) { + return undefined; + } + + return env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS; + } + #getProjectPodAffinity(projectId: string): k8s.V1PodAffinity | undefined { if (!env.KUBERNETES_PROJECT_AFFINITY_ENABLED) { return undefined;