Skip to content

Commit

Permalink
feat: Add multiple GPU support
Browse files Browse the repository at this point in the history
Add support for managing multiple GPUs, similar to CPUs.

- Rename gpu to gpu memory across the board to make way for gpu min and
max values.
- Rename mem to memory to be more descriptive.
- Make gpu memory a proper value in the host reports rather than an
additional attribute.
- Add setting and updating GPU and GPU memory counts via the API.
- GPU list is given to frames in RQD using the CUE_GPU_CORES env
variable.

Missing from the MR.

1) For simplicity I modified the initial migration to incorporate all the changes needed in the tables, functions and triggers.
To keep backward compatibility for users, these changes will need to be moved into a separate migration.

2) Our cuegui and rqd have diverged too much for an easy merge. I've ported what I can, but it will likely be missing elements.

3) We don't use Windows, and the GPU side of RQD calls nvidia-smi directly. We will want to find an OS-agnostic method.

4) Tests. We will definitely need to write some tests.
  • Loading branch information
Lars van der Bijl committed Aug 16, 2020
1 parent e29edf3 commit babeb19
Show file tree
Hide file tree
Showing 150 changed files with 3,157 additions and 1,026 deletions.
4 changes: 2 additions & 2 deletions cueadmin/cueadmin/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def promptYesNo(prompt, force=False):
@param prompt: The question that the user can see
@type force: boolean
@param force: (Optional) If true, skips the prompt and returns true
@rtype: bool
:rtype: bool
@return: The users response"""
try:
result = force or input("%s [y/n] " % prompt) in ("y", "Y")
Expand All @@ -68,7 +68,7 @@ def waitOnJobName(jobName, maxWaitForLaunch=None):
@type maxWaitForLaunch: int
@param maxWaitForLaunch: (Optional) The maximum number of seconds to wait
for the job to launch.
@rtype: bool
:rtype: bool
@return: Returns True if the job was found and is now Finished, False if
the job was not found before maxWaitForLaunch was reached"""
isLocated = False
Expand Down
17 changes: 17 additions & 0 deletions cuebot/.project
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>cuebot</name>
<comment>Project cuebot created by Buildship.</comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.buildship.core.gradleprojectbuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.buildship.core.gradleprojectnature</nature>
</natures>
</projectDescription>
2 changes: 1 addition & 1 deletion cuebot/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ COPY --chown=gradle:gradle ./proto /home/gradle/proto/

WORKDIR /home/gradle/cuebot

RUN gradle build --stacktrace
RUN gradle build --stacktrace --info

COPY --chown=gradle:gradle VERSION.in VERSIO[N] ./
RUN test -e VERSION || echo "$(cat VERSION.in)-custom" | tee VERSION
Expand Down
4 changes: 3 additions & 1 deletion cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,11 @@ public class DispatchFrame extends FrameEntity implements FrameInterface {

public int minCores;
public int maxCores;
public int minGpu;
public int maxGpu;
public boolean threadable;
public long minMemory;
public long minGpu;
public long minGpuMemory;

public String services;
}
Expand Down
36 changes: 23 additions & 13 deletions cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,16 @@ public class DispatchHost extends Entity
public int cores;
public int idleCores;

public int gpu;
public int idleGpu;

// Basically 0 = auto, 1 = all.
public int threadMode;

public long memory;
public long idleMemory;
public long gpu;
public long idleGpu;
public long gpuMemory;
public long idleGpuMemory;
public String tags;
public String os;

Expand All @@ -53,11 +56,13 @@ public class DispatchHost extends Entity
* booked to this host.
*/
public int strandedCores = 0;
public int strandedGpu = 0;

// To reserve resources for future gpu job
long idleMemoryOrig = 0;
int idleCoresOrig = 0;
long idleGpuOrig = 0;
long idleMemoryOrig = 0;
int idleGpuOrig = 0;
long idleGpuMemoryOrig = 0;

public String getHostId() {
return id;
Expand All @@ -72,41 +77,44 @@ public String getFacilityId() {
}

@Override
public boolean hasAdditionalResources(int minCores, long minMemory, long minGpu) {
public boolean hasAdditionalResources(int minCores, int minGpu, long minMemory, long minGpuMemory) {

if (idleCores < minCores) {
return false;
}
else if (idleMemory < minMemory) {
} else if (idleGpu < minGpu) {
return false;
}
else if (idleGpu < minGpu) {
} else if (idleMemory < minMemory) {
return false;
} else if (idleGpuMemory < minGpuMemory) {
return false;
}

return true;
}

@Override
public void useResources(int coreUnits, long memory, long gpu) {
public void useResources(int coreUnits, int gpu, long memory, long gpuMemory) {
idleCores = idleCores - coreUnits;
idleMemory = idleMemory - memory;
idleGpu = idleGpu - gpu;
idleGpuMemory = idleGpuMemory - gpuMemory;
}

/**
* If host has idle gpu, remove enough resources to book a gpu frame later.
*
*/
public void removeGpu() {
if (idleGpu > 0 && idleGpuOrig == 0) {
if (idleGpuMemory > 0 && idleGpuMemoryOrig == 0) {
idleMemoryOrig = idleMemory;
idleCoresOrig = idleCores;
idleGpuOrig = idleGpu;
idleGpuMemoryOrig = idleGpuMemory;

idleMemory = (long) idleMemory - Math.min(CueUtil.GB4, idleMemory);
idleCores = (int) idleCores - Math.min(100, idleCores);
idleGpu = 0;
idleGpu = (int) idleGpu - idleGpu;
idleGpuMemory = 0;
}
}

Expand All @@ -115,14 +123,16 @@ public void removeGpu() {
*
*/
public void restoreGpu() {
if (idleGpuOrig > 0) {
if (idleGpuMemoryOrig > 0) {
idleMemory = idleMemoryOrig;
idleCores = idleCoresOrig;
idleGpu = idleGpuOrig;
idleGpuMemory = idleGpuMemoryOrig;

idleMemoryOrig = 0;
idleCoresOrig = 0;
idleGpuOrig = 0;
idleGpuMemoryOrig = 0;
}
}
}
Expand Down
27 changes: 27 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/ExecutionSummary.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ public class ExecutionSummary {
public long coreTime;
public long coreTimeSuccess;
public long coreTimeFail;
public long gpuTime;
public long gpuTimeSuccess;
public long gpuTimeFail;
public long highMemoryKb;

public long getHighMemoryKb() {
Expand Down Expand Up @@ -69,5 +72,29 @@ public long getCoreTimeFail() {
public void setCoreTimeFail(long coreTimeFail) {
this.coreTimeFail = coreTimeFail;
}

/**
 * Returns the value currently stored in the {@code gpuTime} field.
 *
 * @return the {@code gpuTime} value (units are not visible in this view)
 */
public long getGpuTime() {
return gpuTime;
}

/**
 * Stores the given value in the {@code gpuTime} field.
 *
 * @param gpuTime the new {@code gpuTime} value; no validation is performed
 */
public void setGpuTime(long gpuTime) {
this.gpuTime = gpuTime;
}

/**
 * Returns the value currently stored in the {@code gpuTimeSuccess} field.
 *
 * @return the {@code gpuTimeSuccess} value (units are not visible in this view)
 */
public long getGpuTimeSuccess() {
return gpuTimeSuccess;
}

/**
 * Stores the given value in the {@code gpuTimeSuccess} field.
 *
 * @param gpuTimeSuccess the new {@code gpuTimeSuccess} value; no validation is performed
 */
public void setGpuTimeSuccess(long gpuTimeSuccess) {
this.gpuTimeSuccess = gpuTimeSuccess;
}

/**
 * Returns the value currently stored in the {@code gpuTimeFail} field.
 *
 * @return the {@code gpuTimeFail} value (units are not visible in this view)
 */
public long getGpuTimeFail() {
return gpuTimeFail;
}

/**
 * Stores the given value in the {@code gpuTimeFail} field.
 *
 * @param gpuTimeFail the new {@code gpuTimeFail} value; no validation is performed
 */
public void setGpuTimeFail(long gpuTimeFail) {
this.gpuTimeFail = gpuTimeFail;
}
}

5 changes: 5 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/GroupDetail.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,16 @@ public class GroupDetail extends Entity implements GroupInterface, DepartmentInt

public int jobMinCores = -1;
public int jobMaxCores = -1;
public int jobMinGpu = -1;
public int jobMaxGpu = -1;
public int jobPriority = -1;

public int minCores = -1;
public int maxCores = -1;

public int minGpu = -1;
public int maxGpu = -1;

public String parentId = null;
public String showId;
public String deptId;
Expand Down
12 changes: 8 additions & 4 deletions cuebot/src/main/java/com/imageworks/spcue/HostEntity.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@ public class HostEntity extends Entity implements HostInterface {

public int procs;
public int cores;
public int gpu;
public int idleCores;
public int idleGpu;
public int memory;
public int idleMemory;
public int gpu;
public int idleGpu;
public int gpuMemory;
public int idleGpuMemory;

public boolean unlockAtBoot;

Expand All @@ -57,10 +59,12 @@ public HostEntity(Host grpcHost) {
this.nimbyEnabled = grpcHost.getNimbyEnabled();
this.cores = (int) grpcHost.getCores();
this.idleCores = (int) grpcHost.getIdleCores();
this.memory = (int) grpcHost.getMemory();
this.idleMemory = (int) grpcHost.getIdleMemory();
this.gpu = (int) grpcHost.getGpu();
this.idleGpu = (int) grpcHost.getIdleGpu();
this.memory = (int) grpcHost.getMemory();
this.idleMemory = (int) grpcHost.getIdleMemory();
this.gpuMemory = (int) grpcHost.getGpuMemory();
this.idleGpuMemory = (int) grpcHost.getIdleGpuMemory();
}

public String getHostId() {
Expand Down
2 changes: 2 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/Inherit.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ public enum Inherit {
Priority,
MinCores,
MaxCores,
MinGpu,
MaxGpu,
All
}

5 changes: 4 additions & 1 deletion cuebot/src/main/java/com/imageworks/spcue/JobDetail.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,15 @@ public class JobDetail extends JobEntity implements JobInterface, DepartmentInte
public int priority = 1;
public int minCoreUnits = 100;
public int maxCoreUnits = 200000;
public int minGpuUnits = 0;
public int maxGpuUnits = 100;
public boolean isLocal = false;
public String localHostName;
public int localMaxCores;
public int localMaxGpu;
public int localMaxMemory;
public int localThreadNumber;
public int localMaxGpu;
public int localMaxGpuMemory;

public String getDepartmentId() {
return deptId;
Expand Down
20 changes: 15 additions & 5 deletions cuebot/src/main/java/com/imageworks/spcue/LayerDetail.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ public class LayerDetail extends LayerEntity implements LayerInterface {
public LayerType type;
public int minimumCores;
public int maximumCores;
public int minimumGpu;
public int maximumGpu;
public boolean isThreadable;
public long minimumMemory;
public long minimumGpu;
public long minimumGpuMemory;
public int chunkSize;
public int dispatchOrder;
public int totalFrameCount;
Expand Down Expand Up @@ -82,6 +84,14 @@ public void setMinimumCores(int minimumCores) {
this.minimumCores = minimumCores;
}

/**
 * Returns the value currently stored in the {@code minimumGpu} field.
 *
 * @return the {@code minimumGpu} value
 */
public int getMinimumGpu() {
return minimumGpu;
}

/**
 * Stores the given value in the {@code minimumGpu} field.
 *
 * @param minimumGpu the new {@code minimumGpu} value; no validation is performed
 */
public void setMinimumGpu(int minimumGpu) {
this.minimumGpu = minimumGpu;
}

public boolean isThreadable() {
return isThreadable;
}
Expand All @@ -98,12 +108,12 @@ public void setMinimumMemory(long minimumMemory) {
this.minimumMemory = minimumMemory;
}

public long getMinimumGpu() {
return minimumGpu;
public long getMinimumGpuMemory() {
return minimumGpuMemory;
}

public void setMinimumGpu(long minimumGpu) {
this.minimumGpu = minimumGpu;
public void setMinimumGpuMemory(long minimumGpuMemory) {
this.minimumGpuMemory = minimumGpuMemory;
}

public int getChunkSize() {
Expand Down
Loading

0 comments on commit babeb19

Please sign in to comment.