diff --git a/cuegui/cuegui/CueJobMonitorTree.py b/cuegui/cuegui/CueJobMonitorTree.py index 8d1307771..3c8fe9633 100644 --- a/cuegui/cuegui/CueJobMonitorTree.py +++ b/cuegui/cuegui/CueJobMonitorTree.py @@ -98,53 +98,71 @@ def __init__(self, parent): data=lambda job: "%.02f" % job.data.job_stats.reserved_cores, sort=lambda job: job.data.job_stats.reserved_cores, tip="The number of reserved cores.") - self.addColumn("Wait", 45, id=6, + self.addColumn("Gpus", 55, id=6, + data=lambda job: "%d" % job.data.job_stats.reserved_gpus, + sort=lambda job: job.data.job_stats.reserved_gpus, + tip="The number of reserved gpus.") + self.addColumn("Wait", 45, id=7, data=lambda job: job.data.job_stats.waiting_frames, sort=lambda job: job.data.job_stats.waiting_frames, tip="The number of waiting frames.") - self.addColumn("Depend", 55, id=7, + self.addColumn("Depend", 55, id=8, data=lambda job: job.data.job_stats.depend_frames, sort=lambda job: job.data.job_stats.depend_frames, tip="The number of dependent frames.") - self.addColumn("Total", 50, id=8, + self.addColumn("Total", 50, id=9, data=lambda job: job.data.job_stats.total_frames, sort=lambda job: job.data.job_stats.total_frames, tip="The total number of frames.") - self.addColumn("_Booking Bar", 150, id=9, + self.addColumn("_Booking Bar", 150, id=10, delegate=cuegui.ItemDelegate.JobBookingBarDelegate) - self.addColumn("Min", 38, id=10, + self.addColumn("Min", 38, id=11, data=lambda job: "%.0f" % job.data.min_cores, sort=lambda job: job.data.min_cores, tip="The minimum number of running cores that the cuebot\n" "will try to maintain.") - self.addColumn("Max", 38, id=11, + self.addColumn("Max", 38, id=12, data=lambda job: "%.0f" % job.data.max_cores, sort=lambda job: job.data.max_cores, tip="The maximum number of running cores that the cuebot\n" "will allow.") + self.addColumn("Min Gpus", 38, id=13, + data=lambda job: "%d" % job.data.min_gpus, + sort=lambda job: job.data.min_gpus, + tip="The minimum number of running gpus that the 
cuebot\n" + "will try to maintain.") + self.addColumn("Max Gpus", 38, id=14, + data=lambda job: "%d" % job.data.max_gpus, + sort=lambda job: job.data.max_gpus, + tip="The maximum number of running gpus that the cuebot\n" + "will allow.") self.addColumn( - "Age", 50, id=12, + "Age", 50, id=15, data=lambda job: cuegui.Utils.secondsToHHHMM(self.currtime - job.data.start_time), sort=lambda job: self.currtime - job.data.start_time, tip="The HOURS:MINUTES since the job was launched.") - self.addColumn("Pri", 30, id=13, + self.addColumn("Pri", 30, id=16, data=lambda job: job.data.priority, sort=lambda job: job.data.priority, tip="The job priority. The cuebot uses this as a suggestion\n" "to determine what job needs the next available matching\n" "resource.") - self.addColumn("ETA", 65, id=14, + self.addColumn("ETA", 65, id=17, data=lambda job: "", tip="(Inacurate and disabled until a better solution exists)\n" "A very rough estimate of the number of HOURS:MINUTES\n" "it will be before the entire job is done.") - self.addColumn("MaxRss", 60, id=15, + self.addColumn("MaxRss", 60, id=18, data=lambda job: cuegui.Utils.memoryToString(job.data.job_stats.max_rss), sort=lambda job: job.data.job_stats.max_rss, tip="The most memory used at one time by any single frame.") - self.addColumn("_Blank", 20, id=16, + self.addColumn("MaxGpuMem", 60, id=19, + data=lambda job: cuegui.Utils.memoryToString(job.data.job_stats.max_gpu_mem), + sort=lambda job: job.data.job_stats.max_gpu_mem, + tip="The most gpu memory used at one time by any single frame.") + self.addColumn("_Blank", 20, id=20, tip="Spacer") - self.addColumn("Progress", 0, id=17, + self.addColumn("Progress", 0, id=21, delegate=cuegui.ItemDelegate.JobThinProgressBarDelegate, tip="A visual overview of the job progress.\n" "Green \t is succeeded\n" @@ -164,23 +182,31 @@ def __init__(self, parent): self.addColumn("", 0, id=5, data=lambda group: "%.2f" % group.data.stats.reserved_cores) self.addColumn("", 0, id=6, + data=lambda group: 
"%d" % group.data.stats.reserved_gpus) + self.addColumn("", 0, id=7, data=lambda group: group.data.stats.waiting_frames) - self.addColumn("", 0, id=7) self.addColumn("", 0, id=8) - self.addColumn("", 0, id=9, - data=lambda group: (group.data.min_cores or "")) + self.addColumn("", 0, id=9) self.addColumn("", 0, id=10, + data=lambda group: (group.data.min_cores or "")) + self.addColumn("", 0, id=11, data=lambda group: ( group.data.max_cores > 0 and group.data.max_cores or "")) - self.addColumn("", 0, id=11) - self.addColumn("", 0, id=12) - self.addColumn("", 0, id=13) + self.addColumn("", 0, id=12, + data=lambda group: (group.data.min_gpus or "")) + self.addColumn("", 0, id=13, + data=lambda group: ( + group.data.max_gpus > 0 and group.data.max_gpus or "")) self.addColumn("", 0, id=14) self.addColumn("", 0, id=15) - self.addColumn("", 0, id=16, + self.addColumn("", 0, id=16) + self.addColumn("", 0, id=17) + self.addColumn("", 0, id=18) + self.addColumn("", 0, id=19) + self.addColumn("", 0, id=20, data=lambda group: (group.data.department != "Unknown" and group.data.department or "")) - self.addColumn("", 0, id=17) + self.addColumn("", 0, id=21) cuegui.AbstractTreeWidget.AbstractTreeWidget.__init__(self, parent) @@ -528,6 +554,8 @@ def contextMenuEvent(self, e): menu.addSeparator() self.__menuActions.jobs().addAction(menu, "setMinCores") self.__menuActions.jobs().addAction(menu, "setMaxCores") + self.__menuActions.jobs().addAction(menu, "setMinGpu") + self.__menuActions.jobs().addAction(menu, "setMaxGpu") self.__menuActions.jobs().addAction(menu, "setPriority") self.__menuActions.jobs().addAction(menu, "setMaxRetries") if counts["job"] == 1: diff --git a/cuegui/cuegui/FrameMonitorTree.py b/cuegui/cuegui/FrameMonitorTree.py index 33ccccad9..7e6e1a92d 100644 --- a/cuegui/cuegui/FrameMonitorTree.py +++ b/cuegui/cuegui/FrameMonitorTree.py @@ -100,25 +100,29 @@ def __init__(self, parent): data=lambda job, frame: (self.getCores(frame, format_as_string=True) or ""), 
sort=lambda job, frame: (self.getCores(frame)), tip="The number of cores a frame is using") - self.addColumn("Host", 120, id=6, + self.addColumn("GPUs", 55, id=6, + data=lambda job, frame: (self.getGpus(frame, format_as_string=True) or ""), + sort=lambda job, frame: (self.getGpus(frame)), + tip="The number of gpus a frame is using") + self.addColumn("Host", 120, id=7, data=lambda job, frame: frame.data.last_resource, sort=lambda job, frame: frame.data.last_resource, tip="The last or current resource that the frame used or is using.") - self.addColumn("Retries", 55, id=7, + self.addColumn("Retries", 55, id=8, data=lambda job, frame: frame.data.retry_count, sort=lambda job, frame: frame.data.retry_count, tip="The number of times that each frame has had to retry.") - self.addColumn("_CheckpointEnabled", 20, id=8, + self.addColumn("_CheckpointEnabled", 20, id=9, data=lambda job, frame: "", sort=lambda job, frame: ( frame.data.checkpoint_state == opencue.api.job_pb2.ENABLED), tip="A green check mark here indicates the frame has written out at least " "1 checkpoint segment.") - self.addColumn("CheckP", 55, id=9, + self.addColumn("CheckP", 55, id=10, data=lambda job, frame: frame.data.checkpoint_count, sort=lambda job, frame: frame.data.checkpoint_count, tip="The number of times a frame has been checkpointed.") - self.addColumn("Runtime", 70, id=10, + self.addColumn("Runtime", 70, id=11, data=lambda job, frame: (cuegui.Utils.secondsToHMMSS( frame.data.start_time and frame.data.stop_time and @@ -138,7 +142,7 @@ def __init__(self, parent): tip="The amount of HOURS:MINUTES:SECONDS that the frame\n" "has run for or last ran for.\n") - self.addColumn("LLU", 70, id=11, + self.addColumn("LLU", 70, id=12, data=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and self.frameLogDataBuffer.getLastLineData( job, frame)[FrameLogDataBuffer.LLU] or ""), @@ -150,7 +154,7 @@ def __init__(self, parent): "time without an update is an indication of a stuck\n" "frame for 
most types of jobs") - self.addColumn("Memory", 60, id=12, + self.addColumn("Memory", 60, id=13, data=lambda job, frame: ( frame.data.state == opencue.api.job_pb2.RUNNING and cuegui.Utils.memoryToString(frame.data.used_memory) or @@ -162,7 +166,20 @@ def __init__(self, parent): "If a frame is not running:\n" "\t The most memory this frame has used at one time.") - self.addColumn("Remain", 70, id=13, + self.addColumn("GPU Memory", 60, id=14, + data=lambda job, frame: ( + frame.data.state == opencue.api.job_pb2.RUNNING and + cuegui.Utils.memoryToString(frame.data.used_gpu_memory) or + cuegui.Utils.memoryToString(frame.data.max_gpu_memory)), + sort=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and + frame.data.used_gpu_memory or + frame.data.max_gpu_memory), + tip="If a frame is running:\n" + "\t The amount of GPU memory currently used by the frame.\n" + "If a frame is not running:\n" + "\t The most GPU memory this frame has used at one time.") + + self.addColumn("Remain", 70, id=15, data=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and self.frameEtaDataBuffer.getEtaFormatted(job, frame) or ""), @@ -170,16 +187,16 @@ def __init__(self, parent): self.frameEtaDataBuffer.getEta(job, frame) or -1), tip="Hours:Minutes:Seconds remaining.") - self.addColumn("Start Time", 100, id=14, + self.addColumn("Start Time", 100, id=16, data=lambda job, frame: (self.getTimeString(frame.data.start_time) or ""), sort=lambda job, frame: (self.getTimeString(frame.data.start_time) or ""), tip="The time the frame was started or retried.") - self.addColumn("Stop Time", 100, id=15, + self.addColumn("Stop Time", 100, id=17, data=lambda job, frame: (self.getTimeString(frame.data.stop_time) or ""), sort=lambda job, frame: (self.getTimeString(frame.data.stop_time) or ""), tip="The time that the frame finished or died.") - self.addColumn("Last Line", 0, id=16, + self.addColumn("Last Line", 0, id=18, data=lambda job, frame: (frame.data.state == 
opencue.api.job_pb2.RUNNING and self.frameLogDataBuffer.getLastLineData( job, frame)[FrameLogDataBuffer.LASTLINE] or ""), @@ -240,7 +257,7 @@ def getCores(frame, format_as_string=False): """Gets the number of cores a frame is using.""" cores = None - m = re.search(r".*\/(\d+\.?\d*)", frame.data.last_resource) + m = re.search(r".*\/(\d+\.?\d*)\/.*", frame.data.last_resource) if m: cores = float(m.group(1)) @@ -249,6 +266,20 @@ def getCores(frame, format_as_string=False): return cores + @staticmethod + def getGpus(frame, format_as_string=False): + """Gets the number of gpus a frame is using, or None if it cannot be determined.""" + gpus = None + + m = re.search(r".*\/.*\/(\d+)", frame.data.last_resource) + if m: + gpus = m.group(1) + + if gpus is not None and not format_as_string: + gpus = int(gpus) + + return gpus + @staticmethod def getTimeString(timestamp): """Gets a timestamp formatted as a string.""" diff --git a/cuegui/cuegui/GroupDialog.py b/cuegui/cuegui/GroupDialog.py index 2c59f405c..2a38d906a 100644 --- a/cuegui/cuegui/GroupDialog.py +++ b/cuegui/cuegui/GroupDialog.py @@ -56,6 +56,11 @@ def __init__(self, parentGroup, modifyGroup, defaults, parent): __minCores = defaults["minCores"] __maxCores = defaults["maxCores"] + __defaultJobMinGpus = defaults["defaultJobMinGpus"] + __defaultJobMaxGpus = defaults["defaultJobMaxGpus"] + __minGpus = defaults["minGpus"] + __maxGpus = defaults["maxGpus"] + self.setWindowTitle(__title) layout.addWidget(QtWidgets.QLabel(__message, self), 0, 1, 1, 3) @@ -90,8 +95,25 @@ def __init__(self, parentGroup, modifyGroup, defaults, parent): __modify and __maxCores != -1.0, __maxCores, 1) + (self._defaultJobMinGpusCheck, self._defaultJobMinGpusValue) = \ + self.__createToggleSpinBox("Job Default Minimum Gpus", 8, + __modify and __defaultJobMinGpus != -1, + __defaultJobMinGpus, 1) + (self._defaultJobMaxGpusCheck, self._defaultJobMaxGpusValue) = \ + self.__createToggleSpinBox("Job Default Maximum Gpus", 9, + __modify and __defaultJobMaxGpus != -1, + __defaultJobMaxGpus, 1) + 
(self._minGpusCheck, self._minGpusValue) = \ + self.__createToggleSpinBox("Group Minimum Gpus", 10, + __modify and __minGpus != 0, + __minGpus) + (self._maxGpusCheck, self._maxGpusValue) = \ + self.__createToggleSpinBox("Group Maximum Gpus", 11, + __modify and __maxGpus != -1, + __maxGpus, 1) + self.__createButtons( - QtWidgets.QDialogButtonBox.Save | QtWidgets.QDialogButtonBox.Cancel, 8, 3) + QtWidgets.QDialogButtonBox.Save | QtWidgets.QDialogButtonBox.Cancel, 12, 3) def __createToggleDoubleSpinBox( self, text, row, startEnabled = False, currentValue = 0, minValue = 0): @@ -169,6 +191,26 @@ def accept(self): float(self._maxCoresValue.value()), __group.data.max_cores, float(-1)) + self.__setValue(self._defaultJobMinGpusCheck, + __group.setDefaultJobMinGpus, + float(self._defaultJobMinGpusValue.value()), + __group.data.default_job_min_gpus, -1) + + self.__setValue(self._defaultJobMaxGpusCheck, + __group.setDefaultJobMaxGpus, + float(self._defaultJobMaxGpusValue.value()), + __group.data.default_job_max_gpus, -1) + + self.__setValue(self._minGpusCheck, + __group.setMinGpus, + float(self._minGpusValue.value()), + __group.data.min_gpus, 0) + + self.__setValue(self._maxGpusCheck, + __group.setMaxGpus, + float(self._maxGpusValue.value()), + __group.data.max_gpus, -1) + self.close() @staticmethod @@ -195,7 +237,11 @@ def __init__(self, modifyGroup, parent=None): "defaultJobMinCores": modifyGroup.data.default_job_min_cores, "defaultJobMaxCores": modifyGroup.data.default_job_max_cores, "minCores": modifyGroup.data.min_cores, - "maxCores": modifyGroup.data.max_cores} + "maxCores": modifyGroup.data.max_cores, + "defaultJobMinGpus": modifyGroup.data.default_job_min_gpus, + "defaultJobMaxGpus": modifyGroup.data.default_job_max_gpus, + "minGpus": modifyGroup.data.min_gpus, + "maxGpus": modifyGroup.data.max_gpus} GroupDialog.__init__(self, None, modifyGroup, defaults, parent) @@ -212,5 +258,9 @@ def __init__(self, parentGroup, parent=None): "defaultJobMinCores": 1.0, 
"defaultJobMaxCores": 1.0, "minCores": 0.0, - "maxCores": 1.0} + "maxCores": 1.0, + "defaultJobMinGpus": 0, + "defaultJobMaxGpus": 0, + "minGpus": 0, + "maxGpus": 0} GroupDialog.__init__(self, parentGroup, None, defaults, parent) diff --git a/cuegui/cuegui/HostMonitorTree.py b/cuegui/cuegui/HostMonitorTree.py index f9e5a7c90..30db0d850 100644 --- a/cuegui/cuegui/HostMonitorTree.py +++ b/cuegui/cuegui/HostMonitorTree.py @@ -78,9 +78,9 @@ def __init__(self, parent): data=lambda host: cuegui.Utils.memoryToString(host.data.free_memory), sort=lambda host: host.data.free_memory, tip="The amount of used memory (red) vs available gpu memory (green)") - self.addColumn("GPU", 60, id=6, - data=lambda host: cuegui.Utils.memoryToString(host.data.free_gpu), - sort=lambda host: host.data.free_gpu, + self.addColumn("GPU Memory", 60, id=6, + data=lambda host: cuegui.Utils.memoryToString(host.data.free_gpu_memory), + sort=lambda host: host.data.free_gpu_memory, delegate=cuegui.ItemDelegate.HostGpuBarDelegate, tip="The amount of used gpu memory (red) vs available gpu memory (green)") self.addColumn("freeMcp", 60, id=7, @@ -105,27 +105,40 @@ def __init__(self, parent): data=lambda host: cuegui.Utils.memoryToString(host.data.idle_memory), sort=lambda host: host.data.idle_memory, tip="The amount of unreserved memory.") - self.addColumn("GPU", 50, id=12, - data=lambda host: cuegui.Utils.memoryToString(host.data.gpu), - sort=lambda host: host.data.gpu, + self.addColumn("GPUs", 50, id=12, + data=lambda host: "%d" % host.data.gpus, + sort=lambda host: host.data.gpus, + tip="The total number of gpus.\n\n" + "On a frame it is the number of gpus reserved.") + self.addColumn("Idle GPUs", 40, id=13, + data=lambda host: "%d" % host.data.idle_gpus, + sort=lambda host: host.data.idle_gpus, + tip="The number of gpus that are not reserved.") + self.addColumn("GPU Mem", 50, id=14, + data=lambda host: cuegui.Utils.memoryToString(host.data.gpu_memory), + sort=lambda host: host.data.gpu_memory, tip="The 
total amount of reservable gpu memory.\n\n" "On a frame it is the amount of gpu memory reserved.") - self.addColumn("Idle", 50, id=13, + self.addColumn("Gpu Mem Idle", 50, id=15, + data=lambda host: cuegui.Utils.memoryToString(host.data.idle_gpu_memory), + sort=lambda host: host.data.idle_gpu_memory, + tip="The amount of unreserved gpu memory.") + self.addColumn("Idle", 50, id=16, data=lambda host: cuegui.Utils.memoryToString(host.data.idle_gpu), sort=lambda host: host.data.idle_gpu, tip="The amount of unreserved gpu memory.") - self.addColumn("Ping", 50, id=14, + self.addColumn("Ping", 50, id=17, data=lambda host: int(time.time() - host.data.ping_time), sort=lambda host: host.data.ping_time, tip="The number of seconds since the cuebot last received\n" "a report from the host. A host is configured to report\n" "in every 60 seconds so a number larger than this\n" "indicates a problem") - self.addColumn("Hardware", 70, id=15, + self.addColumn("Hardware", 70, id=18, data=lambda host: HardwareState.Name(host.data.state), tip="The state of the hardware as Up or Down.\n\n" "On a frame it is the amount of memory used.") - self.addColumn("Locked", 90, id=16, + self.addColumn("Locked", 90, id=19, data=lambda host: LockState.Name(host.data.lock_state), tip="A host can be:\n" "Locked \t\t It was manually locked to prevent booking\n" @@ -133,12 +146,12 @@ def __init__(self, parent): "NimbyLocked \t It is a desktop machine and there is\n" "\t\t someone actively using it or not enough \n" "\t\t resources are available on a desktop.") - self.addColumn("ThreadMode", 80, id=17, + self.addColumn("ThreadMode", 80, id=20, data=lambda host: ThreadMode.Name(host.data.thread_mode), tip="A frame that runs on this host will:\n" "All: Use all cores.\n" "Auto: Use the number of cores as decided by the cuebot.\n") - self.addColumn("Tags/Job", 50, id=18, + self.addColumn("Tags/Job", 50, id=21, data=lambda host: ",".join(host.data.tags), tip="The tags applied to the host.\n\n" "On a frame it is 
the name of the job.") @@ -340,7 +353,8 @@ def data(self, col, role): self.rpcObject.data.total_memory] if role == QtCore.Qt.UserRole + 3: - return [self.rpcObject.data.total_gpu - self.rpcObject.data.free_gpu, - self.rpcObject.data.total_gpu] + return [self.rpcObject.data.total_gpu_memory - + self.rpcObject.data.free_gpu_memory, + self.rpcObject.data.total_gpu_memory] return cuegui.Constants.QVARIANT_NULL diff --git a/cuegui/cuegui/LayerDialog.py b/cuegui/cuegui/LayerDialog.py index c337dda80..4c79b805f 100644 --- a/cuegui/cuegui/LayerDialog.py +++ b/cuegui/cuegui/LayerDialog.py @@ -117,12 +117,12 @@ def __init__(self, layers, parent=None): self.mem_max_kb = int(self.mem_max_gb * 1024 * 1024) self.mem_min_kb = int(self.mem_min_gb * 1024 * 1024) - self.gpu_max_kb = 2 * 1024 * 1024 - self.gpu_min_kb = 0 - self.gpu_tick_kb = 256 * 1024 - self.gpu_max_gb = 2.0 - self.gpu_min_gb = 0.0 - self.gpu_tick_gb = .25 + self.gpu_mem_max_kb = 256 * 1024 * 1024 + self.gpu_mem_min_kb = 0 + self.gpu_mem_tick_kb = 256 * 1024 + self.gpu_mem_max_gb = 256.0 + self.gpu_mem_min_gb = 0.0 + self.gpu_mem_tick_gb = .25 self.__group = QtWidgets.QGroupBox("Resource Options", self) @@ -180,16 +180,28 @@ def __init__(self, layers, parent=None): # Limits self.__limits = LayerLimitsWidget(self.__layers, self) + # Min gpus + self.__min_gpus = QtWidgets.QSpinBox(self) + self.__min_gpus.setValue(0) + self.__min_gpus.setRange(0, int(self._cfg().get('max_gpus', 16))) + self.__min_gpus.setSingleStep(1) + + # Max gpus + self.__max_gpus = QtWidgets.QSpinBox(self) + self.__max_gpus.setRange(0, int(self._cfg().get('max_gpus', 16))) + self.__max_gpus.setSingleStep(1) + # GPU Memory - self.__gpu = SlideSpinner(self) - self.__gpu.slider.setMinimumWidth(200) - self.__gpu.slider.setRange(self.gpu_min_kb, self.gpu_max_kb // self.gpu_tick_kb) - self.__gpu.slider.setTickInterval(1) - self.__gpu.slider.setSingleStep(1) - self.__gpu.slider.setPageStep(1) - self.__gpu.spinner.setSuffix(' GB') - 
self.__gpu.spinner.setRange(self.gpu_min_gb, self.gpu_max_gb) - self.__gpu.spinner.setSingleStep(self.gpu_tick_gb) + self.__gpu_mem = SlideSpinner(self) + self.__gpu_mem.slider.setMinimumWidth(200) + self.__gpu_mem.slider.setRange(self.gpu_mem_min_kb, + self.gpu_mem_max_kb // self.gpu_mem_tick_kb) + self.__gpu_mem.slider.setTickInterval(1) + self.__gpu_mem.slider.setSingleStep(1) + self.__gpu_mem.slider.setPageStep(1) + self.__gpu_mem.spinner.setSuffix(' GB') + self.__gpu_mem.spinner.setRange(self.gpu_mem_min_gb, self.gpu_mem_max_gb) + self.__gpu_mem.spinner.setSingleStep(self.gpu_mem_tick_gb) # Our dialog buttons. self.__buttons = QtWidgets.QDialogButtonBox(QtWidgets.QDialogButtonBox.Save | @@ -200,16 +212,18 @@ def __init__(self, layers, parent=None): # Setup signals self.__mem.slider.valueChanged.connect(self.__translateToMemSpinbox) self.__mem.spinner.valueChanged.connect(self.__translateToMemSlider) - self.__gpu.slider.valueChanged.connect(self.__translateToGpuSpinbox) - self.__gpu.spinner.valueChanged.connect(self.__translateToGpuSlider) + self.__gpu_mem.slider.valueChanged.connect(self.__translateToGpuMemSpinbox) + self.__gpu_mem.spinner.valueChanged.connect(self.__translateToGpuMemSlider) self.__buttons.accepted.connect(self.verify) self.__buttons.rejected.connect(self.reject) # Set actual values once signals are setup self.__mem.slider.setValue(self.getMaxMemory()) - self.__gpu.slider.setValue(self.getMaxGpu()) + self.__gpu_mem.slider.setValue(self.getMaxGpuMemory()) self.__core.setValue(self.getMinCores()) self.__max_cores.setValue(self.getMaxCores()) + self.__min_gpus.setValue(self.getMinGpus()) + self.__max_gpus.setValue(self.getMaxGpus()) self.__timeout.setValue(self.getTimeout()) self.__timeout_llu.setValue(self.getTimeoutLLU()) @@ -236,8 +250,16 @@ def __init__(self, layers, parent=None): self.__thread, True), multiSelect)) + layout.addWidget(EnableableItem(LayerPropertiesItem("Min GPUs:", + self.__min_gpus, + False), + multiSelect)) + 
layout.addWidget(EnableableItem(LayerPropertiesItem("Max GPUs:", + self.__max_gpus, + False), + multiSelect)) layout.addWidget(EnableableItem(LayerPropertiesItem("Minimum Gpu Memory:", - self.__gpu, + self.__gpu_mem, False), multiSelect)) layout.addWidget(EnableableItem(LayerPropertiesItem("Timeout:", @@ -280,8 +302,8 @@ def verify(self): if mem_value < self.mem_min_kb or mem_value > self.mem_max_kb: warning("The memory setting is too high.") return False - gpu_value = self.__gpu.slider.value() - if gpu_value < self.gpu_min_kb or gpu_value > self.gpu_max_kb: + gpu_mem_value = self.__gpu_mem.slider.value() + if gpu_mem_value < self.gpu_mem_min_kb or gpu_mem_value > self.gpu_mem_max_kb: warning("The gpu memory setting is too high.") return False @@ -302,8 +324,8 @@ def apply(self): layer.setMaxCores(self.__max_cores.value() * 100.0) if self.__thread.isEnabled(): layer.setThreadable(self.__thread.isChecked()) - if self.__gpu.isEnabled(): - layer.setMinGpu(self.__gpu.slider.value() * self.gpu_tick_kb) + if self.__gpu_mem.isEnabled(): + layer.setMinGpuMemory(self.__gpu_mem.slider.value() * self.gpu_mem_tick_kb) if self.__timeout.isEnabled(): layer.setTimeout(self.__timeout.value()) if self.__timeout_llu.isEnabled(): @@ -322,9 +344,9 @@ def getMaxMemory(self): result = layer.data.min_memory return result - def getMaxGpu(self): - """Gets the layer max GPU.""" - return max([layer.data.min_gpu // self.gpu_tick_kb for layer in self.__layers]) + def getMaxGpuMemory(self): + """Gets the layer max GPU memory.""" + return max([layer.data.min_gpu_memory // self.gpu_mem_tick_kb for layer in self.__layers]) def getMinCores(self): """Gets the layer min cores.""" @@ -342,6 +364,22 @@ def getMaxCores(self): result = layer.data.max_cores return result + def getMinGpus(self): + """Gets the layer min gpus.""" + result = 0 + for layer in self.__layers: + if layer.data.min_gpus > result: + result = layer.data.min_gpus + return result + + def getMaxGpus(self): + """Gets the layer max 
gpus.""" + result = 0 + for layer in self.__layers: + if layer.data.max_gpus > result: + result = layer.data.max_gpus + return result + def getThreading(self): """Gets whether the layer is threadable.""" result = False @@ -382,12 +420,11 @@ def __translateToMemSpinbox(self, value): def __translateToMemSlider(self, value): self.__mem.slider.setValue(int(value * 1048576.0)) - def __translateToGpuSpinbox(self, value): - self.__gpu.spinner.setValue(float(value * self.gpu_tick_kb) / 1024.0 / 1024.0) - - def __translateToGpuSlider(self, value): - self.__gpu.slider.setValue(int(value * 1024.0 * 1024.0) // self.gpu_tick_kb) + def __translateToGpuMemSpinbox(self, value): + self.__gpu_mem.spinner.setValue(float(value * self.gpu_mem_tick_kb) / 1024.0 / 1024.0) + def __translateToGpuMemSlider(self, value): + self.__gpu_mem.slider.setValue(int(value * 1024.0 * 1024.0) // self.gpu_mem_tick_kb) class LayerTagsWidget(QtWidgets.QWidget): """ diff --git a/cuegui/cuegui/LayerMonitorTree.py b/cuegui/cuegui/LayerMonitorTree.py index fdf0c249e..5b15450b6 100644 --- a/cuegui/cuegui/LayerMonitorTree.py +++ b/cuegui/cuegui/LayerMonitorTree.py @@ -74,65 +74,70 @@ def __init__(self, parent): "will reserve for its use. If the frame begins to use\n" "more memory than this, the cuebot will increase this\n" "number.") - self.addColumn("Gpu", 40, id=8, - data=lambda layer: cuegui.Utils.memoryToString(layer.data.min_gpu), - sort=lambda layer: layer.data.min_gpu, + self.addColumn("Gpus", 45, id=8, + data=lambda layer: "%d" % layer.data.min_gpus, + sort=lambda layer: layer.data.min_gpus, + tip="The number of gpus that the frames in this layer\n" + "will reserve as a minimum.") + self.addColumn("Gpu Memory", 40, id=9, + data=lambda layer: cuegui.Utils.memoryToString(layer.data.min_gpu_memory), + sort=lambda layer: layer.data.min_gpu_memory, tip="The amount of gpu memory each frame in this layer\n" "will reserve for its use. 
Note that we may not have\n" "machines as much gpu memory as you request.") self.addColumn( - "MaxRss", 60, id=9, + "MaxRss", 60, id=10, data=lambda layer: cuegui.Utils.memoryToString(layer.data.layer_stats.max_rss), sort=lambda layer: layer.data.layer_stats.max_rss, tip="Maximum amount of memory used by any frame in\n" "this layer at any time since the job was launched.") - self.addColumn("Total", 40, id=10, + self.addColumn("Total", 40, id=11, data=lambda layer: layer.data.layer_stats.total_frames, sort=lambda layer: layer.data.layer_stats.total_frames, tip="Total number of frames in this layer.") - self.addColumn("Done", 40, id=11, + self.addColumn("Done", 40, id=12, data=lambda layer: layer.data.layer_stats.succeeded_frames, sort=lambda layer: layer.data.layer_stats.succeeded_frames, tip="Total number of done frames in this layer.") - self.addColumn("Run", 40, id=12, + self.addColumn("Run", 40, id=13, data=lambda layer: layer.data.layer_stats.running_frames, sort=lambda layer: layer.data.layer_stats.running_frames, tip="Total number or running frames in this layer.") - self.addColumn("Depend", 53, id=13, + self.addColumn("Depend", 53, id=14, data=lambda layer: layer.data.layer_stats.depend_frames, sort=lambda layer: layer.data.layer_stats.depend_frames, tip="Total number of dependent frames in this layer.") - self.addColumn("Wait", 40, id=14, + self.addColumn("Wait", 40, id=15, data=lambda layer: layer.data.layer_stats.waiting_frames, sort=lambda layer: layer.data.layer_stats.waiting_frames, tip="Total number of waiting frames in this layer.") - self.addColumn("Eaten", 40, id=15, + self.addColumn("Eaten", 40, id=16, data=lambda layer: layer.data.layer_stats.eaten_frames, sort=lambda layer: layer.data.layer_stats.eaten_frames, tip="Total number of eaten frames in this layer.") - self.addColumn("Dead", 40, id=16, + self.addColumn("Dead", 40, id=17, data=lambda layer: layer.data.layer_stats.dead_frames, sort=lambda layer: layer.data.layer_stats.dead_frames, 
tip="Total number of dead frames in this layer.") self.addColumn( - "Avg", 65, id=17, + "Avg", 65, id=18, data=lambda layer: cuegui.Utils.secondsToHHMMSS(layer.data.layer_stats.avg_frame_sec), sort=lambda layer: layer.data.layer_stats.avg_frame_sec, tip="Average number of HOURS:MINUTES:SECONDS per frame\nin this layer.") - self.addColumn("Tags", 100, id=18, + self.addColumn("Tags", 100, id=19, data=lambda layer: " | ".join(layer.data.tags), tip="The tags define what resources may be booked on\n" "frames in this layer.") - self.addColumn("Progress", 100, id=19, + self.addColumn("Progress", 100, id=20, delegate=cuegui.ItemDelegate.ProgressDelegate, data=lambda layer: layer.percentCompleted(), sort=lambda layer: layer.percentCompleted(), tip="Progress for the Layer") - self.addColumn("Timeout", 45, id=20, + self.addColumn("Timeout", 45, id=21, data=lambda layer: cuegui.Utils.secondsToHHHMM(layer.data.timeout*60), sort=lambda layer: layer.data.timeout, tip="Timeout for the frames, Hours:Minutes") - self.addColumn("Timeout LLU", 45, id=21, + self.addColumn("Timeout LLU", 45, id=22, data=lambda layer: cuegui.Utils.secondsToHHHMM(layer.data.timeout_llu*60), sort=lambda layer: layer.data.timeout_llu, tip="Timeout for a frames\' LLU, Hours:Minutes") diff --git a/cuegui/cuegui/MenuActions.py b/cuegui/cuegui/MenuActions.py index 5c9b63139..cb7a8ae0a 100644 --- a/cuegui/cuegui/MenuActions.py +++ b/cuegui/cuegui/MenuActions.py @@ -269,6 +269,40 @@ def setMaxCores(self, rpcObjects=None): job.setMaxCores(float(value)) self._update() + setMinGpu_info = ["Set Minimum Gpu...", "Set Job(s) Minimum Gpu", "configure"] + def setMinGpu(self, rpcObjects=None): + """Prompts for a new minimum gpu value and applies it to the selected jobs.""" + jobs = self._getOnlyJobObjects(rpcObjects) + if jobs: + current = max([job.data.min_gpus for job in jobs]) + title = "Set Minimum Gpu" + body = "Please enter the new minimum gpu value:" + (value, choice) = QtWidgets.QInputDialog.getDouble(self._caller, + title, body, + current, + 0, 50000, 0) + if choice: + for job in jobs: + 
job.setMinGpu(float(value)) + self._update() + + setMaxGpu_info = ["Set Maximum Gpu...", "Set Job(s) Maximum Gpu", "configure"] + def setMaxGpu(self, rpcObjects=None): + """Prompts for a new maximum gpu value and applies it to the selected jobs.""" + jobs = self._getOnlyJobObjects(rpcObjects) + if jobs: + current = max([job.data.max_gpus for job in jobs]) + title = "Set Maximum Gpu" + body = "Please enter the new maximum gpu value:" + (value, choice) = QtWidgets.QInputDialog.getDouble(self._caller, + title, body, + current, + 0, 50000, 0) + if choice: + for job in jobs: + job.setMaxGpu(float(value)) + self._update() + setPriority_info = ["Set Priority...", None, "configure"] def setPriority(self, rpcObjects=None): @@ -1461,6 +1495,24 @@ def clearRepair(self, rpcObjects=None): host.setHardwareState(down) self._update() + setThreadModeAuto_info = ["Thread Mode Auto", None, "configure"] + def setThreadModeAuto(self, rpcObjects=None): + for host in self._getOnlyHostObjects(rpcObjects): + host.setThreadMode("AUTO") + self._update() + + setThreadModeAll_info = ["Thread Mode All", None, "configure"] + def setThreadModeAll(self, rpcObjects=None): + for host in self._getOnlyHostObjects(rpcObjects): + host.setThreadMode("ALL") + self._update() + + setThreadModeVariable_info = ["Thread Mode Variable", None, "configure"] + def setThreadModeVariable(self, rpcObjects=None): + for host in self._getOnlyHostObjects(rpcObjects): + host.setThreadMode("VARIABLE") + self._update() + class ProcActions(AbstractActions): """Actions for procs.""" diff --git a/cuegui/cuegui/config/cue_resources.yaml b/cuegui/cuegui/config/cue_resources.yaml index d54cfcbfc..501b6aff4 100644 --- a/cuegui/cuegui/config/cue_resources.yaml +++ b/cuegui/cuegui/config/cue_resources.yaml @@ -10,6 +10,9 @@ max_cores: 32 max_memory: 128 +max_gpus: 8 +max_gpu_memory: 128 + # Redirect Plugin maximum allowed core-hour cutoff. 
# Users will not be able to search for procs with frames that have been diff --git a/cuegui/tests/FrameMonitorTree_tests.py b/cuegui/tests/FrameMonitorTree_tests.py index e28e8229f..c3c2b4963 100644 --- a/cuegui/tests/FrameMonitorTree_tests.py +++ b/cuegui/tests/FrameMonitorTree_tests.py @@ -120,7 +120,7 @@ def test_tickFullUpdate(self, getFramesMock, getUpdatedFramesMock): def test_getCores(self): frame = opencue.wrappers.frame.Frame( - opencue.compiled_proto.job_pb2.Frame(last_resource='foo/125.82723')) + opencue.compiled_proto.job_pb2.Frame(last_resource='foo/125.82723/0')) self.assertEqual(125.82723, self.frameMonitorTree.getCores(frame)) self.assertEqual('125.83', self.frameMonitorTree.getCores(frame, format_as_string=True)) diff --git a/cuegui/tests/LayerDialog_tests.py b/cuegui/tests/LayerDialog_tests.py index 1f9624d59..5e515775d 100644 --- a/cuegui/tests/LayerDialog_tests.py +++ b/cuegui/tests/LayerDialog_tests.py @@ -55,13 +55,15 @@ def setUp(self, get_stub_mock, get_layer_mock, get_limits_mock): 'layer1Id': opencue.wrappers.layer.Layer( opencue.compiled_proto.job_pb2.Layer( id='layer1Id', name='layer1Name', range='1-5', tags=['tag1', 'tag2'], - min_cores=1, max_cores=3, is_threadable=False, min_memory=2097152, min_gpu=1, + min_cores=1, max_cores=3, is_threadable=False, + min_memory=2097152, min_gpu_memory=1, chunk_size=1, timeout=30, timeout_llu=1, memory_optimizer_enabled=True, limits=['limit1Name', 'limit2Name'])), 'layer2Id': opencue.wrappers.layer.Layer( opencue.compiled_proto.job_pb2.Layer( id='layer2Id', name='layer2Name', range='2-22', tags=['tag2', 'tag3'], - min_cores=2, max_cores=2, is_threadable=True, min_memory=6291456, min_gpu=2, + min_cores=2, max_cores=2, is_threadable=True, + min_memory=6291456, min_gpu_memory=2, chunk_size=5, timeout=60, timeout_llu=5, memory_optimizer_enabled=False, limits=['limit2Name', 'limit3Name'])), } @@ -124,12 +126,12 @@ def test__should_display_current_values(self): 
self.assertTrue(self.layer_properties_dialog._LayerPropertiesDialog__thread.isChecked()) self.assertEqual( - int(self.layer_properties_dialog.gpu_min_gb * 1024 * 1024), - self.layer_properties_dialog._LayerPropertiesDialog__gpu.slider.minimum()) + int(self.layer_properties_dialog.gpu_mem_min_gb * 1024 * 1024), + self.layer_properties_dialog._LayerPropertiesDialog__gpu_mem.slider.minimum()) self.assertEqual( - int(self.layer_properties_dialog.gpu_max_gb * 1024 * 1024) // - int(self.layer_properties_dialog.gpu_tick_gb * 1024 * 1024), - self.layer_properties_dialog._LayerPropertiesDialog__gpu.slider.maximum()) + int(self.layer_properties_dialog.gpu_mem_max_gb * 1024 * 1024) // + int(self.layer_properties_dialog.gpu_mem_tick_gb * 1024 * 1024), + self.layer_properties_dialog._LayerPropertiesDialog__gpu_mem.slider.maximum()) # Layer with the highest timeout determines the initial value. self.assertEqual(60, self.layer_properties_dialog._LayerPropertiesDialog__timeout.value()) @@ -163,13 +165,13 @@ def test__should_fail_on_memory_too_low(self): self.assertFalse(self.layer_properties_dialog.verify()) def test__should_fail_on_gpu_too_high(self): - self.layer_properties_dialog._LayerPropertiesDialog__gpu.slider.setValue( - self.layer_properties_dialog.gpu_max_kb * 2) + self.layer_properties_dialog._LayerPropertiesDialog__gpu_mem.slider.setValue( + self.layer_properties_dialog.gpu_mem_max_kb * 2) self.assertFalse(self.layer_properties_dialog.verify()) def test__should_fail_on_gpu_too_low(self): - self.layer_properties_dialog._LayerPropertiesDialog__gpu.slider.setValue( - self.layer_properties_dialog.gpu_min_kb / 3) + self.layer_properties_dialog._LayerPropertiesDialog__gpu_mem.slider.setValue( + self.layer_properties_dialog.gpu_mem_min_kb / 3) self.assertFalse(self.layer_properties_dialog.verify()) def test__should_apply_new_settings(self): @@ -206,9 +208,10 @@ def test__should_apply_new_settings(self): 
self.layer_properties_dialog._LayerPropertiesDialog__thread.parent().parent().enable(True) self.layer_properties_dialog._LayerPropertiesDialog__thread.setChecked(new_is_threadable) - new_min_gpu = 6 - self.layer_properties_dialog._LayerPropertiesDialog__gpu.parent().parent().enable(True) - self.layer_properties_dialog._LayerPropertiesDialog__gpu.slider.setValue(new_min_gpu) + new_min_gpu_memory = 6 + self.layer_properties_dialog._LayerPropertiesDialog__gpu_mem.parent().parent().enable(True) + self.layer_properties_dialog._LayerPropertiesDialog__gpu_mem.slider.setValue( + new_min_gpu_memory) new_timeout = 20 self.layer_properties_dialog._LayerPropertiesDialog__timeout.parent().parent().enable(True) @@ -239,10 +242,10 @@ def test__should_apply_new_settings(self): layer2_mock.setMaxCores.assert_called_with(100 * new_max_cores) layer1_mock.setThreadable.assert_called_with(new_is_threadable) layer2_mock.setThreadable.assert_called_with(new_is_threadable) - layer1_mock.setMinGpu.assert_called_with( - new_min_gpu * self.layer_properties_dialog.gpu_tick_kb) - layer2_mock.setMinGpu.assert_called_with( - new_min_gpu * self.layer_properties_dialog.gpu_tick_kb) + layer1_mock.setMinGpuMemory.assert_called_with( + new_min_gpu_memory * self.layer_properties_dialog.gpu_mem_tick_kb) + layer2_mock.setMinGpuMemory.assert_called_with( + new_min_gpu_memory * self.layer_properties_dialog.gpu_mem_tick_kb) layer1_mock.setTimeout.assert_called_with(new_timeout) layer2_mock.setTimeout.assert_called_with(new_timeout) layer1_mock.setTimeoutLLU.assert_called_with(new_timeout_llu)