Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add multiple GPU support #760 #924

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION.in
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.12
0.13
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ public class DispatchFrame extends FrameEntity implements FrameInterface {
public int maxCores;
public boolean threadable;
public long minMemory;
public long minGpu;
public int minGpus;
public int maxGpus;
public long minGpuMemory;

public String services;
}
Expand Down
39 changes: 26 additions & 13 deletions cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,16 @@ public class DispatchHost extends Entity
public int cores;
public int idleCores;

public int gpus;
public int idleGpus;

// Basically 0 = auto, 1 = all.
public int threadMode;

public long memory;
public long idleMemory;
public long gpu;
public long idleGpu;
public long gpuMemory;
public long idleGpuMemory;
public String tags;
public String os;

Expand All @@ -53,11 +56,13 @@ public class DispatchHost extends Entity
* booked to this host.
*/
public int strandedCores = 0;
public int strandedGpus = 0;

// To reserve resources for future gpu job
long idleMemoryOrig = 0;
int idleCoresOrig = 0;
long idleGpuOrig = 0;
long idleGpuMemoryOrig = 0;
int idleGpusOrig = 0;

public String getHostId() {
return id;
Expand All @@ -72,41 +77,47 @@ public String getFacilityId() {
}

@Override
public boolean hasAdditionalResources(int minCores, long minMemory, long minGpu) {
public boolean hasAdditionalResources(int minCores, long minMemory, int minGpus, long minGpuMemory) {

if (idleCores < minCores) {
return false;
}
else if (idleMemory < minMemory) {
return false;
}
else if (idleGpu < minGpu) {
else if (idleGpus < minGpus) {
return false;
}
else if (idleGpuMemory < minGpuMemory) {
return false;
}

return true;
}

@Override
public void useResources(int coreUnits, long memory, long gpu) {
public void useResources(int coreUnits, long memory, int gpuUnits, long gpuMemory) {
idleCores = idleCores - coreUnits;
idleMemory = idleMemory - memory;
idleGpu = idleGpu - gpu;
idleGpus = idleGpus - gpuUnits;
idleGpuMemory = idleGpuMemory - gpuMemory;
}

/**
* If host has idle gpu, remove enough resources to book a gpu frame later.
*
*/
public void removeGpu() {
if (idleGpu > 0 && idleGpuOrig == 0) {
if (idleGpuMemory > 0 && idleGpuMemoryOrig == 0) {
idleMemoryOrig = idleMemory;
idleCoresOrig = idleCores;
idleGpuOrig = idleGpu;
idleGpuMemoryOrig = idleGpuMemory;
idleGpusOrig = idleGpus;

idleMemory = idleMemory - Math.min(CueUtil.GB4, idleMemory);
idleCores = idleCores - Math.min(100, idleCores);
idleGpu = 0;
idleGpuMemory = idleGpuMemory - Math.min(CueUtil.GB4, idleGpuMemory);
idleGpus = idleGpus - Math.min(1, idleGpus);
}
}

Expand All @@ -115,14 +126,16 @@ public void removeGpu() {
*
*/
public void restoreGpu() {
if (idleGpuOrig > 0) {
if (idleGpuMemoryOrig > 0) {
idleMemory = idleMemoryOrig;
idleCores = idleCoresOrig;
idleGpu = idleGpuOrig;
idleGpuMemory = idleGpuMemoryOrig;
idleGpus = idleGpusOrig;

idleMemoryOrig = 0;
idleCoresOrig = 0;
idleGpuOrig = 0;
idleGpuMemoryOrig = 0;
idleGpusOrig = 0;
}
}
}
Expand Down
27 changes: 27 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/ExecutionSummary.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ public class ExecutionSummary {
public long coreTime;
public long coreTimeSuccess;
public long coreTimeFail;
public long gpuTime;
public long gpuTimeSuccess;
public long gpuTimeFail;
public long highMemoryKb;

public long getHighMemoryKb() {
Expand Down Expand Up @@ -69,5 +72,29 @@ public long getCoreTimeFail() {
public void setCoreTimeFail(long coreTimeFail) {
this.coreTimeFail = coreTimeFail;
}

/**
 * Reads the {@code gpuTime} field of this summary.
 *
 * @return the current value of {@code gpuTime}
 */
public long getGpuTime() {
    return this.gpuTime;
}

/**
 * Stores a new value in the {@code gpuTime} field of this summary.
 *
 * @param gpuTime the value to assign to {@code gpuTime}
 */
public void setGpuTime(long gpuTime) {
this.gpuTime = gpuTime;
}

/**
 * Reads the {@code gpuTimeSuccess} field of this summary.
 *
 * @return the current value of {@code gpuTimeSuccess}
 */
public long getGpuTimeSuccess() {
    return this.gpuTimeSuccess;
}

/**
 * Stores a new value in the {@code gpuTimeSuccess} field of this summary.
 *
 * @param gpuTimeSuccess the value to assign to {@code gpuTimeSuccess}
 */
public void setGpuTimeSuccess(long gpuTimeSuccess) {
this.gpuTimeSuccess = gpuTimeSuccess;
}

/**
 * Reads the {@code gpuTimeFail} field of this summary.
 *
 * @return the current value of {@code gpuTimeFail}
 */
public long getGpuTimeFail() {
    return this.gpuTimeFail;
}

/**
 * Stores a new value in the {@code gpuTimeFail} field of this summary.
 *
 * @param gpuTimeFail the value to assign to {@code gpuTimeFail}
 */
public void setGpuTimeFail(long gpuTimeFail) {
this.gpuTimeFail = gpuTimeFail;
}
}

5 changes: 5 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/GroupDetail.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,16 @@ public class GroupDetail extends Entity implements GroupInterface, DepartmentInt

public int jobMinCores = -1;
public int jobMaxCores = -1;
public int jobMinGpus = -1;
public int jobMaxGpus = -1;
public int jobPriority = -1;

public int minCores = -1;
public int maxCores = -1;

public int minGpus = -1;
public int maxGpus = -1;

public String parentId = null;
public String showId;
public String deptId;
Expand Down
20 changes: 12 additions & 8 deletions cuebot/src/main/java/com/imageworks/spcue/HostEntity.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,12 @@ public class HostEntity extends Entity implements HostInterface {
public int procs;
public int cores;
public int idleCores;
public int memory;
public int idleMemory;
public int gpu;
public int idleGpu;
public long memory;
public long idleMemory;
public int gpus;
public int idleGpus;
public long gpuMemory;
public long idleGpuMemory;

public boolean unlockAtBoot;

Expand All @@ -57,10 +59,12 @@ public HostEntity(Host grpcHost) {
this.nimbyEnabled = grpcHost.getNimbyEnabled();
this.cores = (int) grpcHost.getCores();
this.idleCores = (int) grpcHost.getIdleCores();
this.memory = (int) grpcHost.getMemory();
this.idleMemory = (int) grpcHost.getIdleMemory();
this.gpu = (int) grpcHost.getGpu();
this.idleGpu = (int) grpcHost.getIdleGpu();
this.memory = grpcHost.getMemory();
this.idleMemory = grpcHost.getIdleMemory();
this.gpus = (int) grpcHost.getGpus();
this.idleGpus = (int) grpcHost.getIdleGpus();
this.gpuMemory = grpcHost.getGpuMemory();
this.idleGpuMemory = grpcHost.getIdleGpuMemory();
}

public String getHostId() {
Expand Down
2 changes: 2 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/Inherit.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ public enum Inherit {
Priority,
MinCores,
MaxCores,
MinGpus,
MaxGpus,
All
}

7 changes: 5 additions & 2 deletions cuebot/src/main/java/com/imageworks/spcue/JobDetail.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,15 @@ public class JobDetail extends JobEntity implements JobInterface, DepartmentInte
public int priority = 1;
public int minCoreUnits = 100;
public int maxCoreUnits = 200000;
public int minGpuUnits = 0;
public int maxGpuUnits = 1000;
public boolean isLocal = false;
public String localHostName;
public int localMaxCores;
public int localMaxMemory;
public long localMaxMemory;
public int localThreadNumber;
public int localMaxGpu;
public int localMaxGpus;
public long localMaxGpuMemory;

public String getDepartmentId() {
return deptId;
Expand Down
20 changes: 15 additions & 5 deletions cuebot/src/main/java/com/imageworks/spcue/LayerDetail.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ public class LayerDetail extends LayerEntity implements LayerInterface {
public LayerType type;
public int minimumCores;
public int maximumCores;
public int minimumGpus;
public int maximumGpus;
public boolean isThreadable;
public long minimumMemory;
public long minimumGpu;
public long minimumGpuMemory;
public int chunkSize;
public int timeout;
public int timeout_llu;
Expand Down Expand Up @@ -116,12 +118,20 @@ public void setMinimumMemory(long minimumMemory) {
this.minimumMemory = minimumMemory;
}

public long getMinimumGpu() {
return minimumGpu;
public int getMinimumGpus() {
return minimumGpus;
}

public void setMinimumGpu(long minimumGpu) {
this.minimumGpu = minimumGpu;
public void setMinimumGpus(int minimumGpus) {
this.minimumGpus = minimumGpus;
}

/**
 * Reads the {@code minimumGpuMemory} field of this layer.
 *
 * @return the current value of {@code minimumGpuMemory}
 */
public long getMinimumGpuMemory() {
    return this.minimumGpuMemory;
}

/**
 * Stores a new value in the {@code minimumGpuMemory} field of this layer.
 *
 * @param minimumGpuMemory the value to assign to {@code minimumGpuMemory}
 */
public void setMinimumGpuMemory(long minimumGpuMemory) {
this.minimumGpuMemory = minimumGpuMemory;
}

public int getChunkSize() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@ public class LocalHostAssignment extends Entity

private int idleCoreUnits;
private long idleMemory;
private long idleGpu;
private int idleGpuUnits;
private long idleGpuMemory;

private long maxMemory;
private long maxGpu;
private long maxGpuMemory;
private int maxCoreUnits;
private int maxGpuUnits;

private int threads;

Expand All @@ -52,34 +54,39 @@ public class LocalHostAssignment extends Entity

/**
 * No-argument constructor; it assigns no fields itself.
 */
public LocalHostAssignment() {
}

public LocalHostAssignment(int maxCores, int threads, long maxMemory, long maxGpu) {
public LocalHostAssignment(int maxCores, int threads, long maxMemory, int maxGpus, long maxGpuMemory) {
this.maxCoreUnits = maxCores;
this.threads = threads;
this.maxMemory = maxMemory;
this.maxGpu = maxGpu;
this.maxGpuUnits = maxGpus;
this.maxGpuMemory = maxGpuMemory;
}

@Override
public boolean hasAdditionalResources(int minCores, long minMemory, long minGpu) {
public boolean hasAdditionalResources(int minCores, long minMemory, int minGpus, long minGpuMemory) {

if (idleCoreUnits < minCores) {
return false;
}
else if (idleMemory < minMemory) {
return false;
}
else if (idleGpu < minGpu) {
else if (idleGpuUnits < minGpus) {
return false;
}
else if (idleGpuMemory < minGpuMemory) {
return false;
}

return true;
}

@Override
public void useResources(int coreUnits, long memory, long gpu) {
public void useResources(int coreUnits, long memory, int gpuUnits, long gpuMemory) {
idleCoreUnits = idleCoreUnits - coreUnits;
idleMemory = idleMemory - memory;
idleGpu = idleGpu - gpu;
idleGpuUnits = idleGpuUnits - gpuUnits;
idleGpuMemory = idleGpuMemory - gpuMemory;
}

public int getThreads() {
Expand Down Expand Up @@ -110,16 +117,24 @@ public long getIdleMemory() {
return this.idleMemory;
}

public long getMaxGpu() {
return maxGpu;
public int getMaxGpuUnits() {
return maxGpuUnits;
}

/**
 * Stores a new value in the {@code maxGpuUnits} field of this assignment.
 *
 * @param maxGpuUnits the value to assign to {@code maxGpuUnits}
 */
public void setMaxGpuUnits(int maxGpuUnits) {
this.maxGpuUnits = maxGpuUnits;
}

/**
 * Reads the {@code maxGpuMemory} field of this assignment.
 *
 * @return the current value of {@code maxGpuMemory}
 */
public long getMaxGpuMemory() {
    return this.maxGpuMemory;
}

public void setMaxGpu(long maxGpu) {
this.maxGpu = maxGpu;
public void setMaxGpuMemory(long maxGpuMemory) {
this.maxGpuMemory = maxGpuMemory;
}

public long getIdleGpu() {
return this.idleGpu;
public long getIdleGpuMemory() {
return this.idleGpuMemory;
}

public int getIdleCoreUnits() {
Expand All @@ -134,8 +149,16 @@ public void setIdleMemory(long idleMemory) {
this.idleMemory = idleMemory;
}

public void setIdleGpu(long idleGpu) {
this.idleGpu = idleGpu;
public int getIdleGpuUnits() {
return this.idleGpuUnits;
}

/**
 * Stores a new value in the {@code idleGpuUnits} field of this assignment.
 *
 * @param idleGpuUnits the value to assign to {@code idleGpuUnits}
 */
public void setIdleGpuUnits(int idleGpuUnits) {
this.idleGpuUnits = idleGpuUnits;
}

/**
 * Stores a new value in the {@code idleGpuMemory} field of this assignment.
 *
 * @param idleGpuMemory the value to assign to {@code idleGpuMemory}
 */
public void setIdleGpuMemory(long idleGpuMemory) {
this.idleGpuMemory = idleGpuMemory;
}

public String getHostId() {
Expand Down
Loading