Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
8143764
Add int_concurrent_frames_limit to host
DiegoTavares Dec 9, 2025
b30c886
[scheduler/cuebot] Introduce booking by slot to scheduler
DiegoTavares Dec 17, 2025
cfe2366
Add Layer Slots Required Field
DiegoTavares Dec 11, 2025
769d8ef
Add slots_required field to layer in OpenCue DTD and PyOutline layer
DiegoTavares Dec 17, 2025
b3b90cf
[rework] Handle hardcoded values
DiegoTavares Dec 12, 2025
4cee5d0
Clean up warning from rust modules
DiegoTavares Dec 17, 2025
02b1f20
[pycue/cuegui/cuebot] Add concurrent_procs_limit
DiegoTavares Dec 17, 2025
ed316e1
Rename concurrent_procs to concurrent slots
DiegoTavares Dec 18, 2025
73212de
Merge branch 'master' into slot-based-scheduling
DiegoTavares Jan 7, 2026
33acb5c
Minor fixes
DiegoTavares Dec 18, 2025
2ad42b4
Revert re-formatting
DiegoTavares Jan 7, 2026
db53152
Fix formatting
DiegoTavares Jan 7, 2026
de7b26c
Fix formatting
DiegoTavares Jan 7, 2026
d6888ee
Add slots_required to rqd.RunFrame
DiegoTavares Jan 7, 2026
19791d0
Implement slot based booking on rqd
DiegoTavares Jan 9, 2026
fb4172a
spotless Apply
DiegoTavares Jan 9, 2026
e7bc337
Add slots_required to LayerMonitorTree
DiegoTavares Jan 9, 2026
9729954
Add slots required configuration to LayerDialog
DiegoTavares Jan 9, 2026
3bbd8e9
Merge branch 'master' into slot-based-scheduling
DiegoTavares Feb 3, 2026
cb61fd7
Update documentation
DiegoTavares Feb 3, 2026
14fc0d1
Version Up
DiegoTavares Feb 3, 2026
e984bef
Fix migration id
DiegoTavares Feb 3, 2026
74ebcad
Fix python linting
DiegoTavares Feb 3, 2026
192e2bf
Fix review comments
DiegoTavares Feb 11, 2026
38bdeda
Improve required_slots accounting
DiegoTavares Feb 12, 2026
091a98b
Add runningSlots tracking to Host and display available slots in GUI
DiegoTavares Feb 12, 2026
45bc449
Fix cicd
DiegoTavares Feb 12, 2026
ea5a80e
Fix tests and spotless errors
DiegoTavares Feb 12, 2026
49ea65d
Merge branch 'master' into slot-based-scheduling
DiegoTavares Mar 2, 2026
6b97a04
Fix issue identified on the last review pass
DiegoTavares Mar 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ sandbox/kafka-data
sandbox/zookeeper-data
sandbox/zookeeper-logs
sandbox/rqd/shots/
sandbox/pgadmin-data
docs/_data/version.yml
target/*

Expand Down
2 changes: 1 addition & 1 deletion VERSION.in
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.17
1.18
1 change: 1 addition & 0 deletions cuebot/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ compileTestJava {
options.compilerArgs << "-Xlint:all,-serial" << "-Werror"
}


protobuf {
protoc {
// The protoc compiler
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ public class DispatchFrame extends FrameEntity implements FrameInterface {
public int minGpus;
public int maxGpus;
public long minGpuMemory;
public int slotsRequired;

// A comma separated list of services
public String services;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ public class DispatchHost extends Entity
public long idleGpuMemory;
public String tags;
private String os;
public int runningSlots;

public boolean isNimby;
public boolean isLocalDispatch = false;
Expand Down
4 changes: 4 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/HostEntity.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ public class HostEntity extends Entity implements HostInterface {
public int idleGpus;
public long gpuMemory;
public long idleGpuMemory;
public int concurrentSlotsLimit;
public int runningSlots;

public boolean unlockAtBoot;

Expand All @@ -61,6 +63,8 @@ public HostEntity(Host grpcHost) {
this.idleGpus = (int) grpcHost.getIdleGpus();
this.gpuMemory = grpcHost.getGpuMemory();
this.idleGpuMemory = grpcHost.getIdleGpuMemory();
this.concurrentSlotsLimit = grpcHost.getConcurrentSlotsLimit();
this.runningSlots = grpcHost.getRunningSlots();
}

public String getHostId() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public class LayerDetail extends LayerEntity implements LayerInterface {
public int timeout_llu;
public int dispatchOrder;
public int totalFrameCount;
public int slotsRequired;

public Set<String> tags = new LinkedHashSet<String>();
public Set<String> services = new LinkedHashSet<String>();
Expand Down
4 changes: 4 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ public class VirtualProc extends FrameEntity implements ProcInterface {
public long gpuMemoryUsed;
public long gpuMemoryMax;

public int slotsRequired;

public boolean unbooked;
public boolean usageRecorded = false;
public boolean isLocalDispatch = false;
Expand Down Expand Up @@ -101,6 +103,7 @@ public static final VirtualProc build(DispatchHost host, DispatchFrame frame,
proc.memoryReserved = frame.getMinMemory();
proc.gpusReserved = frame.minGpus;
proc.gpuMemoryReserved = frame.minGpuMemory;
proc.slotsRequired = frame.slotsRequired;

/*
* Frames that are announcing cores less than 100 are not multi-threaded so there is no
Expand Down Expand Up @@ -237,6 +240,7 @@ public static final VirtualProc build(DispatchHost host, DispatchFrame frame,
proc.memoryReserved = frame.getMinMemory();
proc.gpusReserved = frame.minGpus;
proc.gpuMemoryReserved = frame.minGpuMemory;
proc.slotsRequired = frame.slotsRequired;

int wholeCores = (int) (Math.floor(host.idleCores / 100.0));
if (wholeCores == 0) {
Expand Down
19 changes: 18 additions & 1 deletion cuebot/src/main/java/com/imageworks/spcue/dao/HostDao.java
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,22 @@ public interface HostDao {
*/
void updateThreadMode(HostInterface host, ThreadMode mode);

/**
* Update the host's concurrent procs limit.
*
* @param host HostInterface
* @param limit int (0 for no limit)
*/
void updateConcurrentSlotsLimit(HostInterface host, int limit);

/**
* Get the host's concurrent slots limit by hostname.
*
* @param hostname String
* @return int the concurrent slots limit
*/
int getHostConcurrentSlotsLimit(String hostname);

/**
* Update the specified host's hardware information.
*
Expand All @@ -257,10 +273,11 @@ public interface HostDao {
* @param freeGpuMemory long
* @param load int
* @param os String
* @param runningSlots int
*/
void updateHostStats(HostInterface host, long totalMemory, long freeMemory, long totalSwap,
long freeSwap, long totalMcp, long freeMcp, long totalGpuMemory, long freeGpuMemory,
int load, Timestamp bootTime, String os);
int load, Timestamp bootTime, String os, int runningSlots);

/**
* Return true if the HardwareState is Up, false if it is anything else.
Expand Down
8 changes: 8 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/dao/LayerDao.java
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,14 @@ public interface LayerDao {
*/
void updateTimeoutLLU(LayerInterface layer, int timeout_llu);

/**
* Updates the slots required for a layer.
*
* @param layer the layer to update
* @param slots the number of slots required (<0 means the host is not slot-based)
*/
void updateLayerSlotsRequired(LayerInterface layer, int slots);

/**
* Lowers the minimum memory on a layer if the layer is using less memory and the currnet min
* memory is the dispatcher default.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,7 @@ private static final String replaceQueryForFifo(String query) {
"int_gpus_min, " +
"int_gpus_max, " +
"int_gpu_mem_min, " +
"int_slots_required, " +
"str_cmd, " +
"str_range, " +
"int_chunk_size, " +
Expand Down Expand Up @@ -588,6 +589,7 @@ private static final String replaceQueryForFifo(String query) {
"layer.int_gpus_min, " +
"layer.int_gpus_max, " +
"layer.int_gpu_mem_min, " +
"layer.int_slots_required, " +
"layer.str_cmd, " +
"layer.str_range, " +
"layer.int_chunk_size, " +
Expand Down Expand Up @@ -676,6 +678,7 @@ private static final String replaceQueryForFifo(String query) {
"layer.b_threadable, " +
"layer.int_mem_min, " +
"layer.int_gpu_mem_min, " +
"layer.int_slots_required, " +
"layer.str_cmd, " +
"layer.str_range, " +
"layer.int_chunk_size, " +
Expand Down Expand Up @@ -765,6 +768,7 @@ private static final String replaceQueryForFifo(String query) {
"layer.int_gpus_min, " +
"layer.int_gpus_max, " +
"layer.int_gpu_mem_min, " +
"layer.int_slots_required, " +
"layer.str_cmd, " +
"layer.str_range, " +
"layer.int_chunk_size, " +
Expand Down Expand Up @@ -847,6 +851,7 @@ private static final String replaceQueryForFifo(String query) {
"layer.int_gpus_min, " +
"layer.int_gpus_max, " +
"layer.int_gpu_mem_min, " +
"layer.int_slots_required, " +
"layer.str_cmd, " +
"layer.str_range, " +
"layer.int_chunk_size, " +
Expand Down Expand Up @@ -932,6 +937,7 @@ private static final String replaceQueryForFifo(String query) {
"layer.int_gpus_min, " +
"layer.int_gpus_max, " +
"layer.int_gpu_mem_min, " +
"layer.int_slots_required, " +
"layer.str_cmd, " +
"layer.str_range, " +
"layer.int_chunk_size, " +
Expand Down Expand Up @@ -1020,6 +1026,7 @@ private static final String replaceQueryForFifo(String query) {
"layer.int_gpus_min, " +
"layer.int_gpus_max, " +
"layer.int_gpu_mem_min, " +
"layer.int_slots_required, " +
"layer.str_cmd, " +
"layer.str_range, " +
"layer.int_chunk_size, " +
Expand Down Expand Up @@ -1108,6 +1115,7 @@ private static final String replaceQueryForFifo(String query) {
"layer.int_gpus_min, " +
"layer.int_gpus_max, " +
"layer.int_gpu_mem_min, " +
"layer.int_slots_required, " +
"layer.int_cores_max, " +
"layer.str_cmd, " +
"layer.str_range, " +
Expand Down Expand Up @@ -1191,6 +1199,7 @@ private static final String replaceQueryForFifo(String query) {
"layer.int_gpus_min, " +
"layer.int_gpus_max, " +
"layer.int_gpu_mem_min, " +
"layer.int_slots_required, " +
"layer.str_cmd, " +
"layer.str_range, " +
"layer.int_chunk_size, " +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ public DispatchFrame mapRow(ResultSet rs, int rowNum) throws SQLException {
frame.minGpus = rs.getInt("int_gpus_min");
frame.maxGpus = rs.getInt("int_gpus_max");
frame.minGpuMemory = rs.getLong("int_gpu_mem_min");
frame.slotsRequired = rs.getInt("int_slots_required");
frame.version = rs.getInt("int_version");
frame.services = rs.getString("str_services");
frame.os = rs.getString("str_os");
Expand All @@ -252,8 +253,8 @@ public DispatchFrame mapRow(ResultSet rs, int rowNum) throws SQLException {
+ "layer.str_type AS layer_type, " + "layer.str_cmd, " + "layer.int_cores_min,"
+ "layer.int_cores_max," + "layer.b_threadable," + "layer.int_mem_min, "
+ "layer.int_gpus_min," + "layer.int_gpus_max," + "layer.int_gpu_mem_min, "
+ "layer.str_range, " + "layer.int_chunk_size, " + "layer.str_services " + "FROM "
+ "layer, " + "job, " + "show, "
+ "layer.int_slots_required, " + "layer.str_range, " + "layer.int_chunk_size, "
+ "layer.str_services " + "FROM " + "layer, " + "job, " + "show, "
+ "frame LEFT JOIN proc ON (proc.pk_frame = frame.pk_frame) " + "WHERE "
+ "job.pk_show = show.pk_show " + "AND " + "frame.pk_job = job.pk_job " + "AND "
+ "frame.pk_layer = layer.pk_layer " + "AND " + "frame.pk_frame = ?";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ public HostEntity mapRow(ResultSet rs, int rowNum) throws SQLException {
host.idleGpus = rs.getInt("int_gpus_idle");
host.gpuMemory = rs.getLong("int_gpu_mem");
host.idleGpuMemory = rs.getLong("int_gpu_mem_idle");
host.concurrentSlotsLimit = rs.getInt("int_concurrent_slots_limit");
host.runningSlots = rs.getInt("int_running_slots");
host.dateBooted = rs.getDate("ts_booted");
host.dateCreated = rs.getDate("ts_created");
host.datePinged = rs.getDate("ts_ping");
Expand Down Expand Up @@ -131,11 +133,13 @@ public String getFacilityId() {
+ " host.int_gpus_idle, "
+ " host.int_gpu_mem, "
+ " host.int_gpu_mem_idle, "
+ " host.int_concurrent_slots_limit, "
+ " host.ts_created, "
+ " host.str_name, "
+ " host_stat.str_state, "
+ " host_stat.ts_ping, "
+ " host_stat.ts_booted, "
+ " host_stat.int_running_slots, "
+ " alloc.pk_facility "
+ "FROM "
+ " host, "
Expand Down Expand Up @@ -229,6 +233,7 @@ public DispatchHost mapRow(ResultSet rs, int rowNum) throws SQLException {
host.tags = rs.getString("str_tags");
host.setOs(rs.getString("str_os"));
host.hardwareState = HardwareState.valueOf(rs.getString("str_state"));
host.runningSlots = rs.getInt("int_running_slots");
return host;
}
};
Expand All @@ -253,6 +258,7 @@ public DispatchHost mapRow(ResultSet rs, int rowNum) throws SQLException {
+ " host.str_tags, "
+ " host_stat.str_os, "
+ " host_stat.str_state, "
+ " host_stat.int_running_slots, "
+ " alloc.pk_facility "
+ "FROM "
+ " host "
Expand Down Expand Up @@ -395,22 +401,23 @@ public CallableStatement createCallableStatement(Connection con) throws SQLExcep
+ " int_load = ?, "
+ " ts_booted = ?, "
+ " ts_ping = current_timestamp, "
+ " str_os = ? "
+ " str_os = ?, "
+ " int_running_slots = ? "
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Race condition on slot overbooking

The runningSlots value is set from host reports (periodic). Concurrent dispatcher threads can read the same stale count and both book a frame, exceeding the limit. This needs database-level locking (SELECT ... FOR UPDATE) or atomic compare-and-update.

+ "WHERE "
+ " pk_host = ?";

@Override
public void updateHostStats(HostInterface host, long totalMemory, long freeMemory,
long totalSwap, long freeSwap, long totalMcp, long freeMcp, long totalGpuMemory,
long freeGpuMemory, int load, Timestamp bootTime, String os) {
long freeGpuMemory, int load, Timestamp bootTime, String os, int runningSlots) {

if (os == null) {
os = Dispatcher.OS_DEFAULT;
}

getJdbcTemplate().update(UPDATE_RENDER_HOST, totalMemory, freeMemory, totalSwap, freeSwap,
totalMcp, freeMcp, totalGpuMemory, freeGpuMemory, load, bootTime, os,
host.getHostId());
runningSlots, host.getHostId());
}

@Override
Expand Down Expand Up @@ -562,6 +569,23 @@ public void updateThreadMode(HostInterface host, ThreadMode mode) {
mode.getNumber(), host.getHostId());
}

@Override
public void updateConcurrentSlotsLimit(HostInterface host, int limit) {
getJdbcTemplate().update("UPDATE host SET int_concurrent_slots_limit=? WHERE pk_host=?",
limit, host.getHostId());
}

@Override
public int getHostConcurrentSlotsLimit(String hostname) {
try {
return getJdbcTemplate().queryForObject(
"SELECT int_concurrent_slots_limit FROM host WHERE str_name = ?",
Integer.class, hostname);
} catch (EmptyResultDataAccessException e) {
return 0;
}
}

@Override
public void updateHostOs(HostInterface host, String os) {
getJdbcTemplate().update("UPDATE host_stat SET str_os=? WHERE pk_host=?", os,
Expand Down Expand Up @@ -631,7 +655,7 @@ public boolean isNimbyHost(HostInterface h) {
/**
* Checks if the passed in name looks like a fully qualified domain name. If so, returns the
* hostname without the domain. Otherwise returns the passed in name unchanged.
*
*
* @param fqdn - String
* @return String - hostname
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ public LayerDetail mapRow(ResultSet rs, int rowNum) throws SQLException {
layer.services.addAll(Lists.newArrayList(rs.getString("str_services").split(",")));
layer.timeout = rs.getInt("int_timeout");
layer.timeout_llu = rs.getInt("int_timeout_llu");
layer.slotsRequired = rs.getInt("int_slots_required");
return layer;
}
};
Expand Down Expand Up @@ -241,7 +242,8 @@ public LayerInterface getLayer(String id) {
+ "int_dispatch_order, " + "str_tags, " + "str_type," + "int_cores_min, "
+ "int_cores_max, " + "b_threadable, " + "int_mem_min, " + "int_gpus_min, "
+ "int_gpus_max, " + "int_gpu_mem_min, " + "str_services, " + "int_timeout,"
+ "int_timeout_llu " + ") " + "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";
+ "int_timeout_llu, " + "int_slots_required " + ") "
+ "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";

@Override
public void insertLayerDetail(LayerDetail l) {
Expand All @@ -250,7 +252,7 @@ public void insertLayerDetail(LayerDetail l) {
l.chunkSize, l.dispatchOrder, StringUtils.join(l.tags, " | "), l.type.toString(),
l.minimumCores, l.maximumCores, l.isThreadable, l.minimumMemory, l.minimumGpus,
l.maximumGpus, l.minimumGpuMemory, StringUtils.join(l.services, ","), l.timeout,
l.timeout_llu);
l.timeout_llu, l.slotsRequired);
}

@Override
Expand Down Expand Up @@ -571,6 +573,15 @@ public void updateTimeoutLLU(LayerInterface layer, int timeout_llu) {
layer.getLayerId());
}

@Override
public void updateLayerSlotsRequired(LayerInterface layer, int slots) {
// Avoid negative numbers as they have the same meaning as zero
slots = Math.max(slots, 0);

getJdbcTemplate().update("UPDATE layer SET int_slots_required=? WHERE pk_layer=?", slots,
layer.getLayerId());
}

@Override
public void enableMemoryOptimizer(LayerInterface layer, boolean value) {
getJdbcTemplate().update("UPDATE layer SET b_optimize=? WHERE pk_layer=?", value,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -283,9 +283,11 @@ private static final NestedJob mapResultSetToJob(ResultSet rs) throws SQLExcepti
+ "host.int_gpus, " + "host.int_gpus_idle, " + "host.int_gpu_mem, "
+ "host.int_gpu_mem_idle, " + "host.int_mem, " + "host.int_mem_idle, "
+ "host.str_lock_state, " + "host.str_tags, " + "host.b_comment, "
+ "host.int_thread_mode, " + "host_stat.str_os, " + "host_stat.int_mem_total, "
+ "host_stat.int_mem_free, " + "host_stat.int_swap_total, "
+ "host_stat.int_swap_free, " + "host_stat.int_mcp_total, " + "host_stat.int_mcp_free, "
+ "host.int_thread_mode, " + "host_stat.int_running_slots, "
+ "host.int_concurrent_slots_limit, " + "host_stat.str_os, "
+ "host_stat.int_mem_total, " + "host_stat.int_mem_free, "
+ "host_stat.int_swap_total, " + "host_stat.int_swap_free, "
+ "host_stat.int_mcp_total, " + "host_stat.int_mcp_free, "
+ "host_stat.int_gpu_mem_total, " + "host_stat.int_gpu_mem_free, "
+ "host_stat.int_load, " + "proc.pk_proc, " + "proc.int_cores_reserved AS proc_cores, "
+ "proc.int_gpus_reserved AS proc_gpus, " + "proc.int_mem_reserved AS proc_memory, "
Expand Down
Loading
Loading