Skip to content

Commit

Permalink
[cuegui] Sync with proto changes
Browse files Browse the repository at this point in the history
Co-authored-by: Lars van der Bijl <[email protected]>
  • Loading branch information
splhack and larsbijl committed Mar 24, 2021
1 parent 574f29e commit d2a82dc
Show file tree
Hide file tree
Showing 11 changed files with 345 additions and 124 deletions.
68 changes: 48 additions & 20 deletions cuegui/cuegui/CueJobMonitorTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,53 +98,71 @@ def __init__(self, parent):
data=lambda job: "%.02f" % job.data.job_stats.reserved_cores,
sort=lambda job: job.data.job_stats.reserved_cores,
tip="The number of reserved cores.")
self.addColumn("Wait", 45, id=6,
self.addColumn("Gpus", 55, id=6,
data=lambda job: "%d" % job.data.job_stats.reserved_gpus,
sort=lambda job: job.data.job_stats.reserved_gpus,
tip="The number of reserved gpus.")
self.addColumn("Wait", 45, id=7,
data=lambda job: job.data.job_stats.waiting_frames,
sort=lambda job: job.data.job_stats.waiting_frames,
tip="The number of waiting frames.")
self.addColumn("Depend", 55, id=7,
self.addColumn("Depend", 55, id=8,
data=lambda job: job.data.job_stats.depend_frames,
sort=lambda job: job.data.job_stats.depend_frames,
tip="The number of dependent frames.")
self.addColumn("Total", 50, id=8,
self.addColumn("Total", 50, id=9,
data=lambda job: job.data.job_stats.total_frames,
sort=lambda job: job.data.job_stats.total_frames,
tip="The total number of frames.")
self.addColumn("_Booking Bar", 150, id=9,
self.addColumn("_Booking Bar", 150, id=10,
delegate=cuegui.ItemDelegate.JobBookingBarDelegate)
self.addColumn("Min", 38, id=10,
self.addColumn("Min", 38, id=11,
data=lambda job: "%.0f" % job.data.min_cores,
sort=lambda job: job.data.min_cores,
tip="The minimum number of running cores that the cuebot\n"
"will try to maintain.")
self.addColumn("Max", 38, id=11,
self.addColumn("Max", 38, id=12,
data=lambda job: "%.0f" % job.data.max_cores,
sort=lambda job: job.data.max_cores,
tip="The maximum number of running cores that the cuebot\n"
"will allow.")
self.addColumn("Min Gpus", 38, id=13,
data=lambda job: "%d" % job.data.min_gpus,
sort=lambda job: job.data.min_gpus,
tip="The minimum number of running gpus that the cuebot\n"
"will try to maintain.")
self.addColumn("Max Gpus", 38, id=14,
data=lambda job: "%d" % job.data.max_gpus,
sort=lambda job: job.data.max_gpus,
tip="The maximum number of running gpus that the cuebot\n"
"will allow.")
self.addColumn(
"Age", 50, id=12,
"Age", 50, id=15,
data=lambda job: cuegui.Utils.secondsToHHHMM(self.currtime - job.data.start_time),
sort=lambda job: self.currtime - job.data.start_time,
tip="The HOURS:MINUTES since the job was launched.")
self.addColumn("Pri", 30, id=13,
self.addColumn("Pri", 30, id=16,
data=lambda job: job.data.priority,
sort=lambda job: job.data.priority,
tip="The job priority. The cuebot uses this as a suggestion\n"
"to determine what job needs the next available matching\n"
"resource.")
self.addColumn("ETA", 65, id=14,
self.addColumn("ETA", 65, id=17,
data=lambda job: "",
tip="(Inacurate and disabled until a better solution exists)\n"
"A very rough estimate of the number of HOURS:MINUTES\n"
"it will be before the entire job is done.")
self.addColumn("MaxRss", 60, id=15,
self.addColumn("MaxRss", 60, id=18,
data=lambda job: cuegui.Utils.memoryToString(job.data.job_stats.max_rss),
sort=lambda job: job.data.job_stats.max_rss,
tip="The most memory used at one time by any single frame.")
self.addColumn("_Blank", 20, id=16,
self.addColumn("MaxGpuMem", 60, id=19,
data=lambda job: cuegui.Utils.memoryToString(job.data.job_stats.max_gpu_mem),
sort=lambda job: job.data.job_stats.max_gpu_mem,
tip="The most gpu memory used at one time by any single frame.")
self.addColumn("_Blank", 20, id=20,
tip="Spacer")
self.addColumn("Progress", 0, id=17,
self.addColumn("Progress", 0, id=21,
delegate=cuegui.ItemDelegate.JobThinProgressBarDelegate,
tip="A visual overview of the job progress.\n"
"Green \t is succeeded\n"
Expand All @@ -164,23 +182,31 @@ def __init__(self, parent):
self.addColumn("", 0, id=5,
data=lambda group: "%.2f" % group.data.stats.reserved_cores)
self.addColumn("", 0, id=6,
data=lambda group: "%d" % group.data.stats.reserved_gpus)
self.addColumn("", 0, id=7,
data=lambda group: group.data.stats.waiting_frames)
self.addColumn("", 0, id=7)
self.addColumn("", 0, id=8)
self.addColumn("", 0, id=9,
data=lambda group: (group.data.min_cores or ""))
self.addColumn("", 0, id=9)
self.addColumn("", 0, id=10,
data=lambda group: (group.data.min_cores or ""))
self.addColumn("", 0, id=11,
data=lambda group: (
group.data.max_cores > 0 and group.data.max_cores or ""))
self.addColumn("", 0, id=11)
self.addColumn("", 0, id=12)
self.addColumn("", 0, id=13)
self.addColumn("", 0, id=12,
data=lambda group: (group.data.min_gpus or ""))
self.addColumn("", 0, id=13,
data=lambda group: (
group.data.max_gpus > 0 and group.data.max_gpus or ""))
self.addColumn("", 0, id=14)
self.addColumn("", 0, id=15)
self.addColumn("", 0, id=16,
self.addColumn("", 0, id=16)
self.addColumn("", 0, id=17)
self.addColumn("", 0, id=18)
self.addColumn("", 0, id=19)
self.addColumn("", 0, id=20,
data=lambda group: (group.data.department != "Unknown" and
group.data.department or ""))
self.addColumn("", 0, id=17)
self.addColumn("", 0, id=21)

cuegui.AbstractTreeWidget.AbstractTreeWidget.__init__(self, parent)

Expand Down Expand Up @@ -528,6 +554,8 @@ def contextMenuEvent(self, e):
menu.addSeparator()
self.__menuActions.jobs().addAction(menu, "setMinCores")
self.__menuActions.jobs().addAction(menu, "setMaxCores")
self.__menuActions.jobs().addAction(menu, "setMinGpu")
self.__menuActions.jobs().addAction(menu, "setMaxGpu")
self.__menuActions.jobs().addAction(menu, "setPriority")
self.__menuActions.jobs().addAction(menu, "setMaxRetries")
if counts["job"] == 1:
Expand Down
55 changes: 43 additions & 12 deletions cuegui/cuegui/FrameMonitorTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,25 +100,29 @@ def __init__(self, parent):
data=lambda job, frame: (self.getCores(frame, format_as_string=True) or ""),
sort=lambda job, frame: (self.getCores(frame)),
tip="The number of cores a frame is using")
self.addColumn("Host", 120, id=6,
self.addColumn("GPUs", 55, id=6,
data=lambda job, frame: (self.getGpus(frame, format_as_string=True) or ""),
sort=lambda job, frame: (self.getGpus(frame)),
tip="The number of gpus a frame is using")
self.addColumn("Host", 120, id=7,
data=lambda job, frame: frame.data.last_resource,
sort=lambda job, frame: frame.data.last_resource,
tip="The last or current resource that the frame used or is using.")
self.addColumn("Retries", 55, id=7,
self.addColumn("Retries", 55, id=8,
data=lambda job, frame: frame.data.retry_count,
sort=lambda job, frame: frame.data.retry_count,
tip="The number of times that each frame has had to retry.")
self.addColumn("_CheckpointEnabled", 20, id=8,
self.addColumn("_CheckpointEnabled", 20, id=9,
data=lambda job, frame: "",
sort=lambda job, frame: (
frame.data.checkpoint_state == opencue.api.job_pb2.ENABLED),
tip="A green check mark here indicates the frame has written out at least "
"1 checkpoint segment.")
self.addColumn("CheckP", 55, id=9,
self.addColumn("CheckP", 55, id=10,
data=lambda job, frame: frame.data.checkpoint_count,
sort=lambda job, frame: frame.data.checkpoint_count,
tip="The number of times a frame has been checkpointed.")
self.addColumn("Runtime", 70, id=10,
self.addColumn("Runtime", 70, id=11,
data=lambda job, frame: (cuegui.Utils.secondsToHMMSS(
frame.data.start_time and
frame.data.stop_time and
Expand All @@ -138,7 +142,7 @@ def __init__(self, parent):
tip="The amount of HOURS:MINUTES:SECONDS that the frame\n"
"has run for or last ran for.\n")

self.addColumn("LLU", 70, id=11,
self.addColumn("LLU", 70, id=12,
data=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and
self.frameLogDataBuffer.getLastLineData(
job, frame)[FrameLogDataBuffer.LLU] or ""),
Expand All @@ -150,7 +154,7 @@ def __init__(self, parent):
"time without an update is an indication of a stuck\n"
"frame for most types of jobs")

self.addColumn("Memory", 60, id=12,
self.addColumn("Memory", 60, id=13,
data=lambda job, frame: (
frame.data.state == opencue.api.job_pb2.RUNNING and
cuegui.Utils.memoryToString(frame.data.used_memory) or
Expand All @@ -162,24 +166,37 @@ def __init__(self, parent):
"If a frame is not running:\n"
"\t The most memory this frame has used at one time.")

self.addColumn("Remain", 70, id=13,
self.addColumn("GPU Memory", 60, id=14,
data=lambda job, frame: (
frame.data.state == opencue.api.job_pb2.RUNNING and
cuegui.Utils.memoryToString(frame.data.used_gpu_memory) or
cuegui.Utils.memoryToString(frame.data.max_gpu_memory)),
sort=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and
frame.data.used_gpu_memory or
frame.data.max_gpu_memory),
tip="If a frame is running:\n"
"\t The amount of GPU memory currently used by the frame.\n"
"If a frame is not running:\n"
"\t The most GPU memory this frame has used at one time.")

self.addColumn("Remain", 70, id=15,
data=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and
self.frameEtaDataBuffer.getEtaFormatted(job, frame)
or ""),
sort=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and
self.frameEtaDataBuffer.getEta(job, frame) or -1),
tip="Hours:Minutes:Seconds remaining.")

self.addColumn("Start Time", 100, id=14,
self.addColumn("Start Time", 100, id=16,
data=lambda job, frame: (self.getTimeString(frame.data.start_time) or ""),
sort=lambda job, frame: (self.getTimeString(frame.data.start_time) or ""),
tip="The time the frame was started or retried.")
self.addColumn("Stop Time", 100, id=15,
self.addColumn("Stop Time", 100, id=17,
data=lambda job, frame: (self.getTimeString(frame.data.stop_time) or ""),
sort=lambda job, frame: (self.getTimeString(frame.data.stop_time) or ""),
tip="The time that the frame finished or died.")

self.addColumn("Last Line", 0, id=16,
self.addColumn("Last Line", 0, id=18,
data=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and
self.frameLogDataBuffer.getLastLineData(
job, frame)[FrameLogDataBuffer.LASTLINE] or ""),
Expand Down Expand Up @@ -240,7 +257,7 @@ def getCores(frame, format_as_string=False):
"""Gets the number of cores a frame is using."""
cores = None

m = re.search(r".*\/(\d+\.?\d*)", frame.data.last_resource)
m = re.search(r".*\/(\d+\.?\d*)\/.*", frame.data.last_resource)
if m:
cores = float(m.group(1))

Expand All @@ -249,6 +266,20 @@ def getCores(frame, format_as_string=False):

return cores

@staticmethod
def getGpus(frame, format_as_string=False):
    """Gets the number of gpus a frame is using.

    The value is parsed from ``frame.data.last_resource``: the pattern needs
    two ``/`` separators followed by a run of digits, and those digits are
    taken as the gpu count.
    NOTE(review): presumably the resource string is laid out as
    ``<host>/<cores>/<gpus>`` - confirm against whatever populates it.

    :param frame: object exposing ``data.last_resource`` as a string
    :param format_as_string: when True the matched digits are returned
                             unchanged as a string; otherwise they are
                             converted to an int
    :return: the gpu count as an int or str, or None when
             ``last_resource`` carries no gpu segment (for example when it
             is empty or only has a single ``/``)
    """
    m = re.search(r".*\/.*\/(\d+)", frame.data.last_resource)
    if not m:
        # No gpu segment present. Return early so int() is never applied to
        # None, which would raise TypeError on the numeric (sort) path.
        return None

    gpus = m.group(1)
    if format_as_string:
        return gpus
    return int(gpus)

@staticmethod
def getTimeString(timestamp):
"""Gets a timestamp formatted as a string."""
Expand Down
56 changes: 53 additions & 3 deletions cuegui/cuegui/GroupDialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ def __init__(self, parentGroup, modifyGroup, defaults, parent):
__minCores = defaults["minCores"]
__maxCores = defaults["maxCores"]

__defaultJobMinGpus = defaults["defaultJobMinGpus"]
__defaultJobMaxGpus = defaults["defaultJobMaxGpus"]
__minGpus = defaults["minGpus"]
__maxGpus = defaults["maxGpus"]

self.setWindowTitle(__title)
layout.addWidget(QtWidgets.QLabel(__message, self), 0, 1, 1, 3)

Expand Down Expand Up @@ -90,8 +95,25 @@ def __init__(self, parentGroup, modifyGroup, defaults, parent):
__modify and __maxCores != -1.0,
__maxCores, 1)

(self._defaultJobMinGpusCheck, self._defaultJobMinGpusValue) = \
self.__createToggleSpinBox("Job Default Minimum Gpus", 8,
__modify and __defaultJobMinGpus != -1,
__defaultJobMinGpus, 1)
(self._defaultJobMaxGpusCheck, self._defaultJobMaxGpusValue) = \
self.__createToggleSpinBox("Job Default Maximum Gpus", 9,
__modify and __defaultJobMaxGpus != -1,
__defaultJobMaxGpus, 1)
(self._minGpusCheck, self._minGpusValue) = \
self.__createToggleSpinBox("Group Minimum Gpus", 10,
__modify and __minGpus != 0,
__minGpus)
(self._maxGpusCheck, self._maxGpusValue) = \
self.__createToggleSpinBox("Group Maximum Gpus", 11,
__modify and __maxGpus != -1,
__maxGpus, 1)

self.__createButtons(
QtWidgets.QDialogButtonBox.Save | QtWidgets.QDialogButtonBox.Cancel, 8, 3)
QtWidgets.QDialogButtonBox.Save | QtWidgets.QDialogButtonBox.Cancel, 12, 3)

def __createToggleDoubleSpinBox(
self, text, row, startEnabled = False, currentValue = 0, minValue = 0):
Expand Down Expand Up @@ -169,6 +191,26 @@ def accept(self):
float(self._maxCoresValue.value()),
__group.data.max_cores, float(-1))

self.__setValue(self._defaultJobMinGpusCheck,
__group.setDefaultJobMinGpus,
float(self._defaultJobMinGpusValue.value()),
__group.data.default_job_min_gpus, -1)

self.__setValue(self._defaultJobMaxGpusCheck,
__group.setDefaultJobMaxGpus,
float(self._defaultJobMaxGpusValue.value()),
__group.data.default_job_max_gpus, -1)

self.__setValue(self._minGpusCheck,
__group.setMinGpus,
float(self._minGpusValue.value()),
__group.data.min_gpus, 0)

self.__setValue(self._maxGpusCheck,
__group.setMaxGpus,
float(self._maxGpusValue.value()),
__group.data.max_gpus, -1)

self.close()

@staticmethod
Expand All @@ -195,7 +237,11 @@ def __init__(self, modifyGroup, parent=None):
"defaultJobMinCores": modifyGroup.data.default_job_min_cores,
"defaultJobMaxCores": modifyGroup.data.default_job_max_cores,
"minCores": modifyGroup.data.min_cores,
"maxCores": modifyGroup.data.max_cores}
"maxCores": modifyGroup.data.max_cores,
"defaultJobMinGpus": modifyGroup.data.default_job_min_gpus,
"defaultJobMaxGpus": modifyGroup.data.default_job_max_gpus,
"minGpus": modifyGroup.data.min_gpus,
"maxGpus": modifyGroup.data.max_gpus}
GroupDialog.__init__(self, None, modifyGroup, defaults, parent)


Expand All @@ -212,5 +258,9 @@ def __init__(self, parentGroup, parent=None):
"defaultJobMinCores": 1.0,
"defaultJobMaxCores": 1.0,
"minCores": 0.0,
"maxCores": 1.0}
"maxCores": 1.0,
"defaultJobMinGpus": 0,
"defaultJobMaxGpus": 0,
"minGpus": 0,
"maxGpus": 0}
GroupDialog.__init__(self, parentGroup, None, defaults, parent)
Loading

0 comments on commit d2a82dc

Please sign in to comment.