Skip to content

Commit 13bdb9f

Browse files
committed
[scheduler/cuebot] Bulk resource accounting
This change shifts resource accounting (subscription, layer_resource, job_resource, folder_resource, point tables) from incremental delta updates at dispatch/release time to periodic bulk recomputation from the proc table. This affects both the Java cuebot and the Rust scheduler. Key changes: 1. Java (cuebot): Wraps existing incremental resource updates behind a dispatcher.scheduler_manages_resources feature flag 2. Rust (scheduler): Replaces the delta-accumulate-and-flush pattern with periodic recompute_all_from_proc() and recalculate_subs() 3. New ResourceAccountingService: Periodic loop recomputing layer/job/folder/point resource tables 4. Simplified AllocationService: Removes pending_deltas mutex, DeltaKey/DeltaValue types, retry logic, and delta re-application after cache refresh
1 parent 690d3be commit 13bdb9f

15 files changed

Lines changed: 346 additions & 303 deletions

File tree

cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java

Lines changed: 74 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -539,39 +539,44 @@ private void procDestroyed(VirtualProc proc) {
539539
proc.coresReserved, proc.memoryReserved, proc.gpusReserved, proc.gpuMemoryReserved,
540540
proc.getHostId());
541541

542-
if (!proc.isLocalDispatch) {
543-
getJdbcTemplate().update(
544-
"UPDATE " + "subscription " + "SET " + "int_cores = int_cores - ?,"
545-
+ "int_gpus = int_gpus - ? " + "WHERE " + "pk_show = ? " + "AND "
546-
+ "pk_alloc = ?",
547-
proc.coresReserved, proc.gpusReserved, proc.getShowId(),
548-
proc.getAllocationId());
549-
}
550-
551-
getJdbcTemplate().update(
552-
"UPDATE " + "layer_resource " + "SET " + "int_cores = int_cores - ?,"
553-
+ "int_gpus = int_gpus - ? " + "WHERE " + "pk_layer = ?",
554-
proc.coresReserved, proc.gpusReserved, proc.getLayerId());
555-
556-
if (!proc.isLocalDispatch) {
557-
558-
getJdbcTemplate().update(
559-
"UPDATE " + "job_resource " + "SET " + "int_cores = int_cores - ?,"
560-
+ "int_gpus = int_gpus - ? " + "WHERE " + "pk_job = ?",
561-
proc.coresReserved, proc.gpusReserved, proc.getJobId());
562-
563-
getJdbcTemplate().update(
564-
"UPDATE " + "folder_resource " + "SET " + "int_cores = int_cores - ?,"
565-
+ "int_gpus = int_gpus - ? " + "WHERE " + "pk_folder = "
566-
+ "(SELECT pk_folder FROM job WHERE pk_job=?)",
567-
proc.coresReserved, proc.gpusReserved, proc.getJobId());
542+
boolean schedulerManagesResources =
543+
env.getProperty("dispatcher.scheduler_manages_resources", Boolean.class, false);
544+
545+
if (!schedulerManagesResources) {
546+
if (!proc.isLocalDispatch) {
547+
getJdbcTemplate().update(
548+
"UPDATE " + "subscription " + "SET " + "int_cores = int_cores - ?,"
549+
+ "int_gpus = int_gpus - ? " + "WHERE " + "pk_show = ? " + "AND "
550+
+ "pk_alloc = ?",
551+
proc.coresReserved, proc.gpusReserved, proc.getShowId(),
552+
proc.getAllocationId());
553+
}
568554

569555
getJdbcTemplate().update(
570-
"UPDATE " + "point " + "SET " + "int_cores = int_cores - ?, "
571-
+ "int_gpus = int_gpus - ? " + "WHERE " + "pk_dept = "
572-
+ "(SELECT pk_dept FROM job WHERE pk_job=?) " + "AND " + "pk_show = "
573-
+ "(SELECT pk_show FROM job WHERE pk_job=?) ",
574-
proc.coresReserved, proc.gpusReserved, proc.getJobId(), proc.getJobId());
556+
"UPDATE " + "layer_resource " + "SET " + "int_cores = int_cores - ?,"
557+
+ "int_gpus = int_gpus - ? " + "WHERE " + "pk_layer = ?",
558+
proc.coresReserved, proc.gpusReserved, proc.getLayerId());
559+
560+
if (!proc.isLocalDispatch) {
561+
562+
getJdbcTemplate().update(
563+
"UPDATE " + "job_resource " + "SET " + "int_cores = int_cores - ?,"
564+
+ "int_gpus = int_gpus - ? " + "WHERE " + "pk_job = ?",
565+
proc.coresReserved, proc.gpusReserved, proc.getJobId());
566+
567+
getJdbcTemplate().update(
568+
"UPDATE " + "folder_resource " + "SET " + "int_cores = int_cores - ?,"
569+
+ "int_gpus = int_gpus - ? " + "WHERE " + "pk_folder = "
570+
+ "(SELECT pk_folder FROM job WHERE pk_job=?)",
571+
proc.coresReserved, proc.gpusReserved, proc.getJobId());
572+
573+
getJdbcTemplate().update(
574+
"UPDATE " + "point " + "SET " + "int_cores = int_cores - ?, "
575+
+ "int_gpus = int_gpus - ? " + "WHERE " + "pk_dept = "
576+
+ "(SELECT pk_dept FROM job WHERE pk_job=?) " + "AND "
577+
+ "pk_show = " + "(SELECT pk_show FROM job WHERE pk_job=?) ",
578+
proc.coresReserved, proc.gpusReserved, proc.getJobId(), proc.getJobId());
579+
}
575580
}
576581

577582
if (proc.isLocalDispatch) {
@@ -607,43 +612,48 @@ private void procCreated(VirtualProc proc) {
607612
proc.coresReserved, proc.memoryReserved, proc.gpusReserved, proc.gpuMemoryReserved,
608613
proc.getHostId());
609614

610-
/**
611-
* Not keeping track of local cores this way.
612-
*/
615+
boolean schedulerManagesResources =
616+
env.getProperty("dispatcher.scheduler_manages_resources", Boolean.class, false);
613617

614-
if (!proc.isLocalDispatch) {
615-
getJdbcTemplate().update(
616-
"UPDATE " + "subscription " + "SET " + "int_cores = int_cores + ?,"
617-
+ "int_gpus = int_gpus + ? " + "WHERE " + "pk_show = ? " + "AND "
618-
+ "pk_alloc = ?",
619-
proc.coresReserved, proc.gpusReserved, proc.getShowId(),
620-
proc.getAllocationId());
621-
}
622-
623-
getJdbcTemplate().update(
624-
"UPDATE " + "layer_resource " + "SET " + "int_cores = int_cores + ?,"
625-
+ "int_gpus = int_gpus + ? " + "WHERE " + "pk_layer = ?",
626-
proc.coresReserved, proc.gpusReserved, proc.getLayerId());
627-
628-
if (!proc.isLocalDispatch) {
629-
630-
getJdbcTemplate().update(
631-
"UPDATE " + "job_resource " + "SET " + "int_cores = int_cores + ?,"
632-
+ "int_gpus = int_gpus + ? " + "WHERE " + "pk_job = ?",
633-
proc.coresReserved, proc.gpusReserved, proc.getJobId());
618+
if (!schedulerManagesResources) {
619+
/**
620+
* Not keeping track of local cores this way.
621+
*/
634622

635-
getJdbcTemplate().update(
636-
"UPDATE " + "folder_resource " + "SET " + "int_cores = int_cores + ?,"
637-
+ "int_gpus = int_gpus + ? " + "WHERE " + "pk_folder = "
638-
+ "(SELECT pk_folder FROM job WHERE pk_job=?)",
639-
proc.coresReserved, proc.gpusReserved, proc.getJobId());
623+
if (!proc.isLocalDispatch) {
624+
getJdbcTemplate().update(
625+
"UPDATE " + "subscription " + "SET " + "int_cores = int_cores + ?,"
626+
+ "int_gpus = int_gpus + ? " + "WHERE " + "pk_show = ? " + "AND "
627+
+ "pk_alloc = ?",
628+
proc.coresReserved, proc.gpusReserved, proc.getShowId(),
629+
proc.getAllocationId());
630+
}
640631

641632
getJdbcTemplate().update(
642-
"UPDATE " + "point " + "SET " + "int_cores = int_cores + ?,"
643-
+ "int_gpus = int_gpus + ? " + "WHERE " + "pk_dept = "
644-
+ "(SELECT pk_dept FROM job WHERE pk_job=?) " + "AND " + "pk_show = "
645-
+ "(SELECT pk_show FROM job WHERE pk_job=?) ",
646-
proc.coresReserved, proc.gpusReserved, proc.getJobId(), proc.getJobId());
633+
"UPDATE " + "layer_resource " + "SET " + "int_cores = int_cores + ?,"
634+
+ "int_gpus = int_gpus + ? " + "WHERE " + "pk_layer = ?",
635+
proc.coresReserved, proc.gpusReserved, proc.getLayerId());
636+
637+
if (!proc.isLocalDispatch) {
638+
639+
getJdbcTemplate().update(
640+
"UPDATE " + "job_resource " + "SET " + "int_cores = int_cores + ?,"
641+
+ "int_gpus = int_gpus + ? " + "WHERE " + "pk_job = ?",
642+
proc.coresReserved, proc.gpusReserved, proc.getJobId());
643+
644+
getJdbcTemplate().update(
645+
"UPDATE " + "folder_resource " + "SET " + "int_cores = int_cores + ?,"
646+
+ "int_gpus = int_gpus + ? " + "WHERE " + "pk_folder = "
647+
+ "(SELECT pk_folder FROM job WHERE pk_job=?)",
648+
proc.coresReserved, proc.gpusReserved, proc.getJobId());
649+
650+
getJdbcTemplate().update(
651+
"UPDATE " + "point " + "SET " + "int_cores = int_cores + ?,"
652+
+ "int_gpus = int_gpus + ? " + "WHERE " + "pk_dept = "
653+
+ "(SELECT pk_dept FROM job WHERE pk_job=?) " + "AND "
654+
+ "pk_show = " + "(SELECT pk_show FROM job WHERE pk_job=?) ",
655+
proc.coresReserved, proc.gpusReserved, proc.getJobId(), proc.getJobId());
656+
}
647657
}
648658

649659
if (proc.isLocalDispatch) {

cuebot/src/main/resources/opencue.properties

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ log.frame-log-root.default_os=${CUE_FRAME_LOG_DIR:/shots}
9090
# it will not use the Loki backend for logging
9191
log.loki.url=
9292

93+
# When true, cuebot skips updating subscription, layer_resource, job_resource,
94+
# folder_resource, and point tables during dispatch/release. The scheduler
95+
# manages these via periodic recalculation from the proc table.
96+
dispatcher.scheduler_manages_resources=false
97+
9398
# Maximum number of jobs to query.
9499
dispatcher.job_query_max=20
95100
# Number of seconds before waiting to book the same job from a different host.

rust/config/scheduler.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,13 @@ queue:
109109
# Default: 250MiB
110110
# mem_reserved_min: 250MiB
111111

112-
# Interval to refresh allocation data
112+
# Interval to recalculate subscription table from proc
113113
# Default: 3s
114-
# allocation_refresh_interval: 3s
114+
# subscription_recalculation_interval: 3s
115+
116+
# Interval to recalculate resource tables (layer/job/folder/point) from proc
117+
# Default: 10s
118+
# resource_recalculation_interval: 10s
115119

116120
# List of services that are selfish (require exclusive host access)
117121
# Default: []

0 commit comments

Comments
 (0)