diff --git a/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java b/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java index 2eb231897..d44fb7bfd 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java @@ -76,13 +76,37 @@ public String getName() { * on memory usage. * * Fast mode books all the idle cores on the the host at one time. - * - * @param host - * @param frame - * @return */ public static final VirtualProc build(DispatchHost host, DispatchFrame frame, String... selfishServices) { + VirtualProc proc = seedProc(host, frame); + + // Give away all existing stranded cores to drift host back to a balanced state + // Read "docs > User Guides > Stranded Cores in Cuebot" for more info + if (host.strandedCores > 0) { + proc.coresReserved += host.strandedCores; + } + proc.canHandleNegativeCoresRequest = host.canHandleNegativeCoresRequest(proc.coresReserved); + + int requested = proc.coresReserved; + if (requested == 0) { + logger.debug("Reserving all cores"); + proc.coresReserved = host.cores; + } else if (requested < 0) { + logger.debug("Reserving all cores minus " + requested); + proc.coresReserved = host.cores + requested; + } else if (requested >= 100) { + proc.coresReserved = + computeMultiCoreReservation(host, frame, requested, selfishServices); + } + + if (!frame.threadable && proc.coresReserved > 100) { + proc.coresReserved = 100; + } + return proc; + } + + private static VirtualProc seedProc(DispatchHost host, DispatchFrame frame) { VirtualProc proc = new VirtualProc(); proc.allocationId = host.getAllocationId(); proc.hostId = host.getHostId(); @@ -101,108 +125,89 @@ public static final VirtualProc build(DispatchHost host, DispatchFrame frame, proc.memoryReserved = frame.getMinMemory(); proc.gpusReserved = frame.minGpus; proc.gpuMemoryReserved = frame.minGpuMemory; + return proc; + } - /* - * Frames that are announcing cores less than 100 are not multi-threaded so there is no - * reason for the frame to span more than a single core. - * - * If we are in "fast mode", we just book all the cores If the host is nimby, desktops are - * automatically fast mode. - */ - - if (host.strandedCores > 0) { - proc.coresReserved = proc.coresReserved + host.strandedCores; - } - - proc.canHandleNegativeCoresRequest = host.canHandleNegativeCoresRequest(proc.coresReserved); + private static int computeMultiCoreReservation(DispatchHost host, DispatchFrame frame, + int requested, String[] selfishServices) { + int wholeCores = wholeIdleCores(host); + requireWholeCore(wholeCores, frame); - if (proc.coresReserved == 0) { - logger.debug("Reserving all cores"); - proc.coresReserved = host.cores; - } else if (proc.coresReserved < 0) { - logger.debug("Reserving all cores minus " + proc.coresReserved); - proc.coresReserved = host.cores + proc.coresReserved; - } else if (proc.coresReserved >= 100) { - - int originalCores = proc.coresReserved; - - /* - * wholeCores could be 0 if we have a fraction of a core, we can just throw a re - */ - int wholeCores = (int) (Math.floor(host.idleCores / 100.0)); - if (wholeCores == 0) { - throw new EntityException("The host had only a fraction of a core remaining " - + "but the frame required " + frame.minCores); - } + int cores = baseReservationByMode(host, frame, wholeCores, requested, selfishServices); + cores = enforceVariableFloor(host, frame, cores); + cores = applySanityBounds(cores, requested, frame.maxCores); + cores = clampToIdle(host, frame, cores, wholeCores); + return cores; + } - // if (host.threadMode == ThreadMode.Variable.value() && - // CueUtil.isDayTime()) { - if (host.threadMode == ThreadMode.ALL_VALUE) { - proc.coresReserved = wholeCores * 100; - } else { - if (frame.threadable) { - if (selfishServices != null && frame.services != null - && containsSelfishService(frame.services.split(","), selfishServices)) { - proc.coresReserved = wholeCores * 100; - } else { - if (host.idleMemory - - frame.getMinMemory() <= Dispatcher.MEM_STRANDED_THRESHHOLD) { - proc.coresReserved = wholeCores * 100; - } else { - proc.coresReserved = - getCoreSpan(host, frame.getMinMemory(), frame.maxCores); - } - } - if (host.threadMode == ThreadMode.VARIABLE_VALUE && proc.coresReserved <= 200) { - proc.coresReserved = 200; - if (proc.coresReserved > host.idleCores) { - // Do not allow threadable frame running on 1 core. - throw new JobDispatchException( - "Do not allow threadable frame running one core on a ThreadMode.Variable host."); - } - } - } - } + private static int wholeIdleCores(DispatchHost host) { + return (int) Math.floor(host.idleCores / 100.0); + } - /* - * Sanity checks to ensure coreUnits are not to high or to low. - */ - if (proc.coresReserved < 100) { - proc.coresReserved = 100; - } + private static void requireWholeCore(int wholeCores, DispatchFrame frame) { + if (wholeCores == 0) { + throw new EntityException("The host had only a fraction of a core remaining " + + "but the frame required " + frame.minCores); + } + } - /* - * If the core value is changed it can never fall below the original. - */ - if (proc.coresReserved < originalCores) { - proc.coresReserved = originalCores; - } + private static int baseReservationByMode(DispatchHost host, DispatchFrame frame, int wholeCores, + int requested, String[] selfishServices) { + if (host.threadMode == ThreadMode.ALL_VALUE) { + return wholeCores * 100; + } + if (!frame.threadable) { + return requested; + } + if (isSelfishService(frame, selfishServices)) { + return wholeCores * 100; + } + if (host.idleMemory - frame.getMinMemory() <= Dispatcher.MEM_STRANDED_THRESHHOLD) { + return wholeCores * 100; + } + return getCoreSpan(host, frame.getMinMemory(), frame.maxCores); + } - /* - * Check to ensure we haven't exceeded max cores. - */ - if (frame.maxCores > 0 && proc.coresReserved > frame.maxCores) { - proc.coresReserved = frame.maxCores; - } + private static int enforceVariableFloor(DispatchHost host, DispatchFrame frame, int cores) { + if (host.threadMode != ThreadMode.VARIABLE_VALUE || !frame.threadable || cores > 200) { + return cores; + } + if (200 > host.idleCores) { + // Do not allow threadable frame running on 1 core. + throw new JobDispatchException( + "Do not allow threadable frame running one core on a ThreadMode.Variable host."); + } + return 200; + } - if (proc.coresReserved > host.idleCores) { - if (host.threadMode == ThreadMode.VARIABLE_VALUE && frame.threadable - && wholeCores == 1) { - throw new JobDispatchException( - "Do not allow threadable frame running one core on a ThreadMode.Variable host."); - } - proc.coresReserved = wholeCores * 100; - } + private static int applySanityBounds(int cores, int originalCores, int maxCores) { + if (cores < 100) { + cores = 100; + } + if (cores < originalCores) { + cores = originalCores; } + if (maxCores > 0 && cores > maxCores) { + cores = maxCores; + } + return cores; + } - /* - * Don't thread non-threadable layers, no matter what people put for the number of cores. - */ - if (!frame.threadable && proc.coresReserved > 100) { - proc.coresReserved = 100; + private static int clampToIdle(DispatchHost host, DispatchFrame frame, int cores, + int wholeCores) { + if (cores <= host.idleCores) { + return cores; } + if (host.threadMode == ThreadMode.VARIABLE_VALUE && frame.threadable && wholeCores == 1) { + throw new JobDispatchException( + "Do not allow threadable frame running one core on a ThreadMode.Variable host."); + } + return wholeCores * 100; + } - return proc; + private static boolean isSelfishService(DispatchFrame frame, String[] selfishServices) { + return selfishServices != null && frame.services != null + && containsSelfishService(frame.services.split(","), selfishServices); } private static final boolean containsSelfishService(String[] frameServices, diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/util/VirtualProcBuildTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/util/VirtualProcBuildTests.java new file mode 100644 index 000000000..df7e59b78 --- /dev/null +++ b/cuebot/src/test/java/com/imageworks/spcue/test/util/VirtualProcBuildTests.java @@ -0,0 +1,405 @@ + +/* + * Copyright Contributors to the OpenCue Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +package com.imageworks.spcue.test.util; + +import junit.framework.TestCase; +import org.junit.Before; + +import com.imageworks.spcue.DispatchFrame; +import com.imageworks.spcue.DispatchHost; +import com.imageworks.spcue.EntityException; +import com.imageworks.spcue.JobDispatchException; +import com.imageworks.spcue.VirtualProc; +import com.imageworks.spcue.dispatcher.Dispatcher; +import com.imageworks.spcue.grpc.host.ThreadMode; +import com.imageworks.spcue.util.CueUtil; + +/** + * Characterization tests for {@link VirtualProc#build(DispatchHost, DispatchFrame, String...)}. + * + * Pin every observable branch of the current implementation. Used as a safety net for a subsequent + * readability-only refactor of the method body. + */ +public class VirtualProcBuildTests extends TestCase { + + DispatchHost host; + + @Before + public void setUp() throws Exception { + host = new DispatchHost(); + host.isNimby = false; + } + + private DispatchFrame newThreadableFrame(int minCores, long minMemory) { + DispatchFrame frame = new DispatchFrame(); + frame.minCores = minCores; + frame.setMinMemory(minMemory); + frame.threadable = true; + return frame; + } + + // ---------- Zero / negative core requests ---------- + + public void testBooksAllCoresWhenMinCoresIsZero() { + host.cores = 800; + host.idleCores = 800; + + DispatchFrame frame = new DispatchFrame(); + frame.minCores = 0; + frame.threadable = true; + + VirtualProc proc = VirtualProc.build(host, frame); + assertEquals(800, proc.coresReserved); + } + + public void testBooksAllMinusNWhenMinCoresNegativeAndHostFullyIdle() { + host.cores = 800; + host.idleCores = 800; + + DispatchFrame frame = new DispatchFrame(); + frame.minCores = -100; + frame.threadable = true; + + VirtualProc proc = VirtualProc.build(host, frame); + // host.cores + minCores = 800 + (-100) = 700. + assertEquals(700, proc.coresReserved); + assertTrue(proc.canHandleNegativeCoresRequest); + } + + public void testNegativeMinCoresFlagsHostBusyWhenNotFullyIdle() { + host.cores = 800; + host.idleCores = 400; + + DispatchFrame frame = new DispatchFrame(); + frame.minCores = -100; + frame.threadable = true; + + VirtualProc proc = VirtualProc.build(host, frame); + // coresReserved is still computed; only the flag flips off. + assertEquals(700, proc.coresReserved); + assertFalse(proc.canHandleNegativeCoresRequest); + } + + // ---------- Stranded cores ---------- + + public void testStrandedCoresAreAddedToReservation() { + // strandedCores=200 pushes a 100-core request up to 300. Memory inflation via + // getCoreSpan returns 100 (small frame relative to per-core memory), so the + // never-below-original floor keeps the result at 300. + host.strandedCores = 200; + host.cores = 800; + host.idleCores = 800; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB8; + + DispatchFrame frame = newThreadableFrame(100, CueUtil.GB); + + VirtualProc proc = VirtualProc.build(host, frame); + assertEquals(300, proc.coresReserved); + } + + // ---------- Fractional idle cores ---------- + + public void testFractionalIdleCoresThrowsEntityException() { + host.cores = 800; + host.idleCores = 50; + + DispatchFrame frame = newThreadableFrame(100, CueUtil.GB); + + try { + VirtualProc.build(host, frame); + fail("Expected EntityException for fractional idle cores"); + } catch (EntityException expected) { + // ok + } + } + + // ---------- ThreadMode.ALL ---------- + + public void testThreadModeAllBooksAllWholeIdleCores() { + host.threadMode = ThreadMode.ALL_VALUE; + host.cores = 800; + host.idleCores = 750; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB8; + + DispatchFrame frame = newThreadableFrame(100, CueUtil.GB); + + VirtualProc proc = VirtualProc.build(host, frame); + // floor(750/100) * 100 = 700 + assertEquals(700, proc.coresReserved); + } + + // ---------- Selfish service short-circuit ---------- + + public void testSelfishServiceBooksAllWholeCores() { + host.threadMode = ThreadMode.AUTO_VALUE; + host.cores = 800; + host.idleCores = 700; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB16; + + DispatchFrame frame = newThreadableFrame(100, CueUtil.GB); + frame.services = "shell,arnold"; + + VirtualProc proc = VirtualProc.build(host, frame, "arnold"); + // wholeCores * 100 = 7 * 100 = 700 + assertEquals(700, proc.coresReserved); + } + + public void testNonSelfishServiceFallsThroughToMemoryBranch() { + host.threadMode = ThreadMode.AUTO_VALUE; + host.cores = 800; + host.idleCores = 700; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB16; + + DispatchFrame frame = newThreadableFrame(100, CueUtil.GB); + frame.services = "shell"; + + // No selfish match. idleMemory(GB16) - minMemory(GB) = GB15, well above the + // stranded threshold, so getCoreSpan drives the result: memPerCore = GB16/8 = + // GB2, procs = GB/GB2 = 0.5, rounds to 1, reserveCores = 100. + VirtualProc proc = VirtualProc.build(host, frame, "arnold"); + assertEquals(100, proc.coresReserved); + } + + public void testSelfishWithNullServicesDoesNotShortCircuit() { + host.threadMode = ThreadMode.AUTO_VALUE; + host.cores = 800; + host.idleCores = 700; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB16; + + DispatchFrame frame = newThreadableFrame(100, CueUtil.GB); + // frame.services left null. + + VirtualProc proc = VirtualProc.build(host, frame, "arnold"); + // Same getCoreSpan result as above. + assertEquals(100, proc.coresReserved); + } + + // ---------- Memory stranded threshold ---------- + + public void testMemoryStrandedThresholdBooksAllWholeCores() { + // idleMemory - minMemory = GB8 - GB7 = GB, which is <= MEM_STRANDED_THRESHHOLD + // (GB + MB512). + host.threadMode = ThreadMode.AUTO_VALUE; + host.cores = 800; + host.idleCores = 700; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB8; + assertTrue(host.idleMemory - (CueUtil.GB * 7) <= Dispatcher.MEM_STRANDED_THRESHHOLD); + + DispatchFrame frame = newThreadableFrame(100, CueUtil.GB * 7); + + VirtualProc proc = VirtualProc.build(host, frame); + // wholeCores * 100 = 7 * 100 = 700 + assertEquals(700, proc.coresReserved); + } + + public void testMemoryProportionalSpanWhenNotStranded() { + // idleMemory(GB8) - minMemory(~3.2GB) ≈ 4.8GB > threshold, so getCoreSpan runs. + host.threadMode = ThreadMode.AUTO_VALUE; + host.cores = 800; + host.idleCores = 780; + host.memory = CueUtil.GB8; + host.idleMemory = CueUtil.GB8; + long minMemory = 3355443L; // ~3.2 GB + + DispatchFrame frame = newThreadableFrame(100, minMemory); + + VirtualProc proc = VirtualProc.build(host, frame); + // memPerCore = GB8/8 = GB; procs = 3355443/GB ≈ 3.2; round=3; reserveCores=300. + assertEquals(300, proc.coresReserved); + } + + // ---------- ThreadMode.VARIABLE ---------- + + public void testVariableThreadModeFloorsAtTwoCores() { + // Base reservation lands at or below 200 (memory-proportional with tiny + // minMemory yields 0), so the VARIABLE floor pushes to 200. + host.threadMode = ThreadMode.VARIABLE_VALUE; + host.cores = 800; + host.idleCores = 400; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB32; + + DispatchFrame frame = newThreadableFrame(100, CueUtil.MB128); + + VirtualProc proc = VirtualProc.build(host, frame); + assertEquals(200, proc.coresReserved); + } + + public void testVariableThreadModeFloorThrowsWhenIdleBelowTwoCores() { + // wholeCores == 1, memory-stranded path lands at 100, floor bumps to 200, + // 200 > idleCores(150) -> first JobDispatchException raise site. + host.threadMode = ThreadMode.VARIABLE_VALUE; + host.cores = 800; + host.idleCores = 150; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB; + + DispatchFrame frame = newThreadableFrame(100, CueUtil.GB); + + try { + VirtualProc.build(host, frame); + fail("Expected JobDispatchException from the VARIABLE floor + idle check"); + } catch (JobDispatchException expected) { + // ok + } + } + + public void testVariableThreadModeFinalClampThrowsWhenSingleCoreHost() { + // wholeCores == 1, getCoreSpan returns 400 (> 200, skipping the floor), then + // the final >idleCores clamp triggers the second JobDispatchException raise. + host.threadMode = ThreadMode.VARIABLE_VALUE; + host.cores = 800; + host.idleCores = 100; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB16; + + DispatchFrame frame = newThreadableFrame(100, CueUtil.GB * 8); + + try { + VirtualProc.build(host, frame); + fail("Expected JobDispatchException from the final idle-cores clamp"); + } catch (JobDispatchException expected) { + // ok + } + } + + // ---------- Sanity / never-below-original / maxCores ---------- + + public void testNeverFallsBelowOriginalRequest() { + // originalCores = 400, getCoreSpan would return 100, but the never-below-original + // floor bumps the result back to 400. + host.threadMode = ThreadMode.AUTO_VALUE; + host.cores = 800; + host.idleCores = 800; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB8; + + DispatchFrame frame = newThreadableFrame(400, CueUtil.GB); + + VirtualProc proc = VirtualProc.build(host, frame); + assertEquals(400, proc.coresReserved); + } + + public void testMaxCoresCapsReservation() { + // ThreadMode.ALL would book all 800 cores; maxCores=200 clamps the result. + host.threadMode = ThreadMode.ALL_VALUE; + host.cores = 800; + host.idleCores = 800; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB8; + + DispatchFrame frame = newThreadableFrame(100, CueUtil.GB); + frame.maxCores = 200; + + VirtualProc proc = VirtualProc.build(host, frame); + assertEquals(200, proc.coresReserved); + } + + public void testFinalClampWhenExceedsIdleCores() { + // getCoreSpan returns 400 but idleCores only allows 200, so the final clamp + // overrides via wholeCores * 100. + host.threadMode = ThreadMode.AUTO_VALUE; + host.cores = 800; + host.idleCores = 200; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB16; + + DispatchFrame frame = newThreadableFrame(100, CueUtil.GB * 8); + + VirtualProc proc = VirtualProc.build(host, frame); + assertEquals(200, proc.coresReserved); + } + + public void testSanityFloorYieldsAtLeastHundredWhenSpanReturnsZero() { + // Tiny minMemory makes getCoreSpan round to 0; the <100 sanity floor and the + // never-below-original floor both push the result back to 100 (the entry + // condition requires originalCores >= 100, so the two floors converge here). + host.threadMode = ThreadMode.AUTO_VALUE; + host.cores = 800; + host.idleCores = 800; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB16; + + DispatchFrame frame = newThreadableFrame(100, CueUtil.MB128); + + VirtualProc proc = VirtualProc.build(host, frame); + assertEquals(100, proc.coresReserved); + } + + // ---------- Non-threadable cap ---------- + + public void testNonThreadableCappedAtHundred() { + host.threadMode = ThreadMode.ALL_VALUE; + host.cores = 800; + host.idleCores = 800; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB8; + + DispatchFrame frame = new DispatchFrame(); + frame.minCores = 400; + frame.setMinMemory(CueUtil.GB); + frame.threadable = false; + + VirtualProc proc = VirtualProc.build(host, frame); + assertEquals(100, proc.coresReserved); + } + + // ---------- Field propagation ---------- + + public void testFieldsArePropagatedFromHostAndFrame() { + host.id = "host-id-1"; + host.name = "host-name-1"; + host.allocationId = "alloc-id-1"; + host.isLocalDispatch = true; + host.cores = 800; + host.idleCores = 800; + host.memory = CueUtil.GB32; + host.idleMemory = CueUtil.GB8; + host.threadMode = ThreadMode.ALL_VALUE; + + DispatchFrame frame = newThreadableFrame(100, CueUtil.GB); + frame.layerId = "layer-id-1"; + frame.jobId = "job-id-1"; + frame.showId = "show-id-1"; + frame.facilityId = "facility-id-1"; + frame.os = "linux"; + frame.minGpus = 2; + frame.minGpuMemory = CueUtil.GB2; + + VirtualProc proc = VirtualProc.build(host, frame); + + assertEquals("host-id-1", proc.hostId); + assertEquals("host-name-1", proc.hostName); + assertEquals("alloc-id-1", proc.allocationId); + assertTrue(proc.isLocalDispatch); + assertNull(proc.frameId); + assertEquals("layer-id-1", proc.layerId); + assertEquals("job-id-1", proc.jobId); + assertEquals("show-id-1", proc.showId); + assertEquals("facility-id-1", proc.facilityId); + assertEquals("linux", proc.os); + assertEquals(CueUtil.GB, proc.memoryReserved); + assertEquals(2, proc.gpusReserved); + assertEquals(CueUtil.GB2, proc.gpuMemoryReserved); + assertFalse(proc.unbooked); + } +} diff --git a/docs/_docs/user-guides/stranded-cores.md b/docs/_docs/user-guides/stranded-cores.md new file mode 100644 index 000000000..270b140b9 --- /dev/null +++ b/docs/_docs/user-guides/stranded-cores.md @@ -0,0 +1,102 @@ +--- +title: "Stranded Cores in Cuebot" +nav_order: 49 +parent: User Guides +layout: default +linkTitle: "Stranded Cores in Cuebot" +date: 2025-05-11 +description: > + Handling Stranded cores on Cuebot +--- + +# Stranded Cores in Cuebot + +## The Problem + +A render host has two resources that frames consume in pairs: **CPU cores** and **memory**. The scheduler reserves both per frame based on what the layer asks for. But these resources don't deplete in lockstep — a memory-hungry frame can leave plenty of cores idle while exhausting RAM. + +When idle memory drops below ~1.5 GB (`Dispatcher.java:53` — `MEM_STRANDED_THRESHHOLD = 1 GB + 512 MB`), no realistic frame can be booked on that host even if 8+ cores are still free. Those cores are **stranded** — physically idle but functionally unusable. On a render farm with thousands of hosts, this silently bleeds capacity. + +## The Solution: Detect, Re-attach, Recover + +Cuebot doesn't try to *prevent* stranding through perfect bin-packing — instead it **detects stranding after the fact and donates the orphaned cores to the next frame** booked on that host. + +### 1. Detect at frame completion +When a frame finishes, `FrameCompleteHandler.java:535-547` checks: did this proc leave the host memory-starved with idle cores? The check (`HostDaoJdbc.java:575-584`) queries idle cores on hosts where `int_mem_idle <= MEM_STRANDED_THRESHHOLD`, rounded down to whole cores (100 units each). + +Recovery is only attempted when: +- The proc isn't a local (desktop) dispatch +- The job's layer is threadable (can absorb extra cores) +- The job is still bookable +- At least 1 whole core is stranded + +### 2. Mark the host +`DispatchSupportService.strandCores()` (`DispatchSupportService.java:112-120`) records the stranded count on `DispatchHost.strandedCores`, stashes it in a `ConcurrentHashMap` with a **5-second TTL** (`StrandedCores.java`) so stale entries can't accumulate, and forces the host's `threadMode` to `ALL` to encourage aggressive packing. + +### 3. Donate to the next frame +When the next `VirtualProc` is built for that host (`VirtualProc.java:113-115`), the stranded cores are silently added to `coresReserved`: +```java +if (host.strandedCores > 0) { + proc.coresReserved = proc.coresReserved + host.strandedCores; +} +``` +After dispatching that one frame, `CoreUnitDispatcher.java:300-302` calls `pickupStrandedCores()` to clear the marker and `break`s out of the dispatch loop — one frame "absorbs the debt," and the host returns to normal scheduling. + +## Memory-Aware Prevention (the proactive half) + +Two pieces of code try to *avoid* stranding in the first place: + +- **`VirtualProc.getCoreSpan()`** (`VirtualProc.java:266-284`) computes `memPerCore = host.idleMemory / totalCores` and reserves `ceil(minMemory / memPerCore)` cores — sizing the booking to the host's actual memory-per-core ratio rather than the layer's hardcoded core request. +- **Pre-emptive whole-host booking** (`VirtualProc.java:148-150`): if booking the requested memory *would* leave less than `MEM_STRANDED_THRESHHOLD` free, the proc grabs *all* remaining whole cores immediately. Better to over-allocate cores to one frame than leave them stranded. + +A parallel system handles GPUs (`DispatchHost.removeGpu()` reserves 4 GB RAM + 100 cores on GPU hosts so a CPU job can't strand the GPU dispatch path). + +## Observability + +`DispatchSupport.java:119-134` exposes counters — `strandedCoresCount`, `pickedUpCoresCount`, and GPU equivalents — through `CueStatic.cueGetSystemStats()` so operators can see how often stranding occurs vs. is recovered. + +## TL;DR + +Memory exhausts before cores do, so cores get stuck. Cuebot's answer is a small feedback loop: notice stranding when a frame completes, attach the orphaned cores to the next frame booked on that host (within 5 seconds), and use memory-per-core math to size bookings sensibly in the first place. The recovery is opportunistic rather than perfect — the trade-off is simple, low-overhead code instead of a globally optimal bin-packing solver. + +## A working example + +Consider a 16-core / 64 GB host. Its **natural ratio** is 4 GB/core — if every frame consumed resources in that ratio, nothing would ever strand. + +But layers don't request resources in that ratio. Suppose frames on this host keep asking for 8 GB and 1 core each (a memory-heavy workload): + +| Step | Idle cores | Idle mem | Notes | +|---|---|---|---| +| Start | 16 | 64 GB | | +| Book 7× (8 GB / 1 core) | 9 | 8 GB | | +| Book 1× (8 GB / 1 core) | **8** | **0 GB** | 8 cores stranded | + +Eight cores sit idle and **nothing can be booked here** — every layer requesting more than 0 GB is rejected. The host is functionally dead until a frame completes. + +## What "donating" the stranded cores actually accomplishes + +Now one of those 8 GB frames finishes: + +**Without recovery**: The completed frame returns 8 GB + 1 core → host has 9 idle cores, 8 GB free. The scheduler books another 8 GB / 1 core frame. Back to 8 cores idle, 0 GB. **The strand re-forms immediately.** This repeats forever; the 8 stranded cores never do work. + +**With recovery**: When the frame completes, cuebot notices the host *was* in a stranded state and marks `host.strandedCores = 8`. The next 8 GB / 1 core frame is booked, but `VirtualProc.java:113` quietly inflates its reservation to **1 + 8 = 9 cores** for the same 8 GB memory. The threadable layer now runs 9-way parallel on those cores. + +The crucial effect: **that one frame is now consuming resources in a 0.9 GB/core ratio** — wildly core-heavy. It absorbs the imbalance the previous memory-heavy frames created. When it finishes, it returns 9 cores + 8 GB, and the host is back near its natural 4 GB/core ratio. Future bookings start from a balanced state instead of a stranded one. + +## Why this helps the host + +- The 8 stranded cores **actually run code** instead of idling. Effective host utilization goes from ~50% to ~100% for the duration of that frame. +- Future bookings on the host don't re-strand because the ratio has been restored. + +## Why this helps the farm + +- **Throughput**: those 8 cores are now contributing render work. Multiply across thousands of hosts and a busy farm with chronically memory-heavy shows recovers a meaningful chunk of capacity. +- **Faster job completion**: the recipient layer is threadable, so 9× parallelism makes that frame finish ~9× faster (modulo Amdahl). The job's overall wallclock drops. +- **Scheduler honesty**: stranded cores show up in `host.idleCores` but are unbookable. Without recovery, the scheduler keeps "trying" this host on every report and getting rejected — wasted dispatch cycles. Recovery either uses the cores or clears the marker after 5 s, so the scheduler's view of capacity matches reality. +- **No global coordination required**: the fix is purely local to the host that completed a frame. No farm-wide repacking, no preemption, no migration. The cost is one extra field on `DispatchHost` and a check in `FrameCompleteHandler`. + +## Why "give to one frame" specifically + +You might expect "give the stranded cores to the next *several* frames." But that would just split the same problem across multiple frames — each would still book its own memory, and you'd re-strand. Giving **all the stranded cores to one threadable frame** concentrates the imbalance into a single booking that consumes lots of cores per GB. That single frame is the "balancer." Then `break` exits the dispatch loop (`CoreUnitDispatcher.java:300-302`) — the debt is paid, the host returns to normal scheduling on the next report. + +It's basically a one-shot rebalancing trick: let a memory-heavy workload drift the host into a bad ratio, then use one fat threadable booking to drift it back.