[cuegui] Sync with proto changes

Co-authored-by: Lars van der Bijl <[email protected]>
AcademySoftwareFoundation · Mar 8, 2021 · a958c3a · a958c3a
1 parent 8f11fe6
commit a958c3a
Show file tree

Hide file tree

Showing 11 changed files with 345 additions and 124 deletions.
diff --git a/cuegui/cuegui/CueJobMonitorTree.py b/cuegui/cuegui/CueJobMonitorTree.py
@@ -98,53 +98,71 @@ def __init__(self, parent):
                        data=lambda job: "%.02f" % job.data.job_stats.reserved_cores,
                        sort=lambda job: job.data.job_stats.reserved_cores,
                        tip="The number of reserved cores.")
-        self.addColumn("Wait", 45, id=6,
+        self.addColumn("Gpus", 55, id=6,
+                       data=lambda job: "%d" % job.data.job_stats.reserved_gpus,
+                       sort=lambda job: job.data.job_stats.reserved_gpus,
+                       tip="The number of reserved gpus.")
+        self.addColumn("Wait", 45, id=7,
                        data=lambda job: job.data.job_stats.waiting_frames,
                        sort=lambda job: job.data.job_stats.waiting_frames,
                        tip="The number of waiting frames.")
-        self.addColumn("Depend", 55, id=7,
+        self.addColumn("Depend", 55, id=8,
                        data=lambda job: job.data.job_stats.depend_frames,
                        sort=lambda job: job.data.job_stats.depend_frames,
                        tip="The number of dependent frames.")
-        self.addColumn("Total", 50, id=8,
+        self.addColumn("Total", 50, id=9,
                        data=lambda job: job.data.job_stats.total_frames,
                        sort=lambda job: job.data.job_stats.total_frames,
                        tip="The total number of frames.")
-        self.addColumn("_Booking Bar", 150, id=9,
+        self.addColumn("_Booking Bar", 150, id=10,
                        delegate=cuegui.ItemDelegate.JobBookingBarDelegate)
-        self.addColumn("Min", 38, id=10,
+        self.addColumn("Min", 38, id=11,
                        data=lambda job: "%.0f" % job.data.min_cores,
                        sort=lambda job: job.data.min_cores,
                        tip="The minimum number of running cores that the cuebot\n"
                            "will try to maintain.")
-        self.addColumn("Max", 38, id=11,
+        self.addColumn("Max", 38, id=12,
                        data=lambda job: "%.0f" % job.data.max_cores,
                        sort=lambda job: job.data.max_cores,
                        tip="The maximum number of running cores that the cuebot\n"
                            "will allow.")
+        self.addColumn("Min Gpus", 38, id=13,
+                       data=lambda job: "%d" % job.data.min_gpus,
+                       sort=lambda job: job.data.min_gpus,
+                       tip="The minimum number of running gpus that the cuebot\n"
+                           "will try to maintain.")
+        self.addColumn("Max Gpus", 38, id=14,
+                       data=lambda job: "%d" % job.data.max_gpus,
+                       sort=lambda job: job.data.max_gpus,
+                       tip="The maximum number of running gpus that the cuebot\n"
+                           "will allow.")
         self.addColumn(
-            "Age", 50, id=12,
+            "Age", 50, id=15,
             data=lambda job: cuegui.Utils.secondsToHHHMM(self.currtime - job.data.start_time),
             sort=lambda job: self.currtime - job.data.start_time,
             tip="The HOURS:MINUTES since the job was launched.")
-        self.addColumn("Pri", 30, id=13,
+        self.addColumn("Pri", 30, id=16,
                        data=lambda job: job.data.priority,
                        sort=lambda job: job.data.priority,
                        tip="The job priority. The cuebot uses this as a suggestion\n"
                            "to determine what job needs the next available matching\n"
                            "resource.")
-        self.addColumn("ETA", 65, id=14,
+        self.addColumn("ETA", 65, id=17,
                        data=lambda job: "",
                        tip="(Inacurate and disabled until a better solution exists)\n"
                            "A very rough estimate of the number of HOURS:MINUTES\n"
                            "it will be before the entire job is done.")
-        self.addColumn("MaxRss", 60, id=15,
+        self.addColumn("MaxRss", 60, id=18,
                        data=lambda job: cuegui.Utils.memoryToString(job.data.job_stats.max_rss),
                        sort=lambda job: job.data.job_stats.max_rss,
                        tip="The most memory used at one time by any single frame.")
-        self.addColumn("_Blank", 20, id=16,
+        self.addColumn("MaxGpuMem", 60, id=19,
+                       data=lambda job: cuegui.Utils.memoryToString(job.data.job_stats.max_gpu_mem),
+                       sort=lambda job: job.data.job_stats.max_gpu_mem,
+                       tip="The most gpu memory used at one time by any single frame.")
+        self.addColumn("_Blank", 20, id=20,
                        tip="Spacer")
-        self.addColumn("Progress", 0, id=17,
+        self.addColumn("Progress", 0, id=21,
                        delegate=cuegui.ItemDelegate.JobThinProgressBarDelegate,
                        tip="A visual overview of the job progress.\n"
                            "Green \t is succeeded\n"
@@ -164,23 +182,31 @@ def __init__(self, parent):
             self.addColumn("", 0, id=5,
                            data=lambda group: "%.2f" % group.data.stats.reserved_cores)
             self.addColumn("", 0, id=6,
+                           data=lambda group: "%d" % group.data.stats.reserved_gpus)
+            self.addColumn("", 0, id=7,
                            data=lambda group: group.data.stats.waiting_frames)
-            self.addColumn("", 0, id=7)
             self.addColumn("", 0, id=8)
-            self.addColumn("", 0, id=9,
-                           data=lambda group: (group.data.min_cores or ""))
+            self.addColumn("", 0, id=9)
             self.addColumn("", 0, id=10,
+                           data=lambda group: (group.data.min_cores or ""))
+            self.addColumn("", 0, id=11,
                            data=lambda group: (
                                    group.data.max_cores > 0 and group.data.max_cores or ""))
-            self.addColumn("", 0, id=11)
-            self.addColumn("", 0, id=12)
-            self.addColumn("", 0, id=13)
+            self.addColumn("", 0, id=12,
+                           data=lambda group: (group.data.min_gpus or ""))
+            self.addColumn("", 0, id=13,
+                           data=lambda group: (
+                                   group.data.max_gpus > 0 and group.data.max_gpus or ""))
             self.addColumn("", 0, id=14)
             self.addColumn("", 0, id=15)
-            self.addColumn("", 0, id=16,
+            self.addColumn("", 0, id=16)
+            self.addColumn("", 0, id=17)
+            self.addColumn("", 0, id=18)
+            self.addColumn("", 0, id=19)
+            self.addColumn("", 0, id=20,
                            data=lambda group: (group.data.department != "Unknown" and
                                                group.data.department or ""))
-            self.addColumn("", 0, id=17)
+            self.addColumn("", 0, id=21)
 
         cuegui.AbstractTreeWidget.AbstractTreeWidget.__init__(self, parent)
 
@@ -528,6 +554,8 @@ def contextMenuEvent(self, e):
             menu.addSeparator()
             self.__menuActions.jobs().addAction(menu, "setMinCores")
             self.__menuActions.jobs().addAction(menu, "setMaxCores")
+            self.__menuActions.jobs().addAction(menu, "setMinGpu")
+            self.__menuActions.jobs().addAction(menu, "setMaxGpu")
             self.__menuActions.jobs().addAction(menu, "setPriority")
             self.__menuActions.jobs().addAction(menu, "setMaxRetries")
             if counts["job"] == 1:

diff --git a/cuegui/cuegui/FrameMonitorTree.py b/cuegui/cuegui/FrameMonitorTree.py
@@ -100,25 +100,29 @@ def __init__(self, parent):
                        data=lambda job, frame: (self.getCores(frame, format_as_string=True) or ""),
                        sort=lambda job, frame: (self.getCores(frame)),
                        tip="The number of cores a frame is using")
-        self.addColumn("Host", 120, id=6,
+        self.addColumn("GPUs", 55, id=6,
+                       data=lambda job, frame: (self.getGpus(frame, format_as_string=True) or ""),
+                       sort=lambda job, frame: (self.getGpus(frame)),
+                       tip="The number of gpus a frame is using")
+        self.addColumn("Host", 120, id=7,
                        data=lambda job, frame: frame.data.last_resource,
                        sort=lambda job, frame: frame.data.last_resource,
                        tip="The last or current resource that the frame used or is using.")
-        self.addColumn("Retries", 55, id=7,
+        self.addColumn("Retries", 55, id=8,
                        data=lambda job, frame: frame.data.retry_count,
                        sort=lambda job, frame: frame.data.retry_count,
                        tip="The number of times that each frame has had to retry.")
-        self.addColumn("_CheckpointEnabled", 20, id=8,
+        self.addColumn("_CheckpointEnabled", 20, id=9,
                        data=lambda job, frame: "",
                        sort=lambda job, frame: (
                                frame.data.checkpoint_state == opencue.api.job_pb2.ENABLED),
                        tip="A green check mark here indicates the frame has written out at least "
                            "1 checkpoint segment.")
-        self.addColumn("CheckP", 55, id=9,
+        self.addColumn("CheckP", 55, id=10,
                        data=lambda job, frame: frame.data.checkpoint_count,
                        sort=lambda job, frame: frame.data.checkpoint_count,
                        tip="The number of times a frame has been checkpointed.")
-        self.addColumn("Runtime", 70, id=10,
+        self.addColumn("Runtime", 70, id=11,
                        data=lambda job, frame: (cuegui.Utils.secondsToHMMSS(
                            frame.data.start_time and
                            frame.data.stop_time and
@@ -138,7 +142,7 @@ def __init__(self, parent):
                        tip="The amount of HOURS:MINUTES:SECONDS that the frame\n"
                            "has run for or last ran for.\n")
 
-        self.addColumn("LLU", 70, id=11,
+        self.addColumn("LLU", 70, id=12,
                        data=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and
                                                 self.frameLogDataBuffer.getLastLineData(
                                                     job, frame)[FrameLogDataBuffer.LLU] or ""),
@@ -150,7 +154,7 @@ def __init__(self, parent):
                            "time without an update is an indication of a stuck\n"
                            "frame for most types of jobs")
 
-        self.addColumn("Memory", 60, id=12,
+        self.addColumn("Memory", 60, id=13,
                        data=lambda job, frame: (
                                frame.data.state == opencue.api.job_pb2.RUNNING and
                                cuegui.Utils.memoryToString(frame.data.used_memory) or
@@ -162,24 +166,37 @@ def __init__(self, parent):
                            "If a frame is not running:\n"
                            "\t The most memory this frame has used at one time.")
 
-        self.addColumn("Remain", 70, id=13,
+        self.addColumn("GPU Memory", 60, id=14,
+                       data=lambda job, frame: (
+                               frame.data.state == opencue.api.job_pb2.RUNNING and
+                               cuegui.Utils.memoryToString(frame.data.used_gpu_memory) or
+                               cuegui.Utils.memoryToString(frame.data.max_gpu_memory)),
+                       sort=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and
+                                                frame.data.used_gpu_memory or
+                                                frame.data.max_gpu_memory),
+                       tip="If a frame is running:\n"
+                           "\t The amount of GPU memory currently used by the frame.\n"
+                           "If a frame is not running:\n"
+                           "\t The most GPU memory this frame has used at one time.")
+
+        self.addColumn("Remain", 70, id=15,
                        data=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and
                                                 self.frameEtaDataBuffer.getEtaFormatted(job, frame)
                                                 or ""),
                        sort=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and
                                                 self.frameEtaDataBuffer.getEta(job, frame) or -1),
                        tip="Hours:Minutes:Seconds remaining.")
 
-        self.addColumn("Start Time", 100, id=14,
+        self.addColumn("Start Time", 100, id=16,
                        data=lambda job, frame: (self.getTimeString(frame.data.start_time) or ""),
                        sort=lambda job, frame: (self.getTimeString(frame.data.start_time) or ""),
                        tip="The time the frame was started or retried.")
-        self.addColumn("Stop Time", 100, id=15,
+        self.addColumn("Stop Time", 100, id=17,
                        data=lambda job, frame: (self.getTimeString(frame.data.stop_time) or ""),
                        sort=lambda job, frame: (self.getTimeString(frame.data.stop_time) or ""),
                        tip="The time that the frame finished or died.")
 
-        self.addColumn("Last Line", 0, id=16,
+        self.addColumn("Last Line", 0, id=18,
                        data=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and
                                                 self.frameLogDataBuffer.getLastLineData(
                                                     job, frame)[FrameLogDataBuffer.LASTLINE] or ""),
@@ -240,7 +257,7 @@ def getCores(frame, format_as_string=False):
         """Gets the number of cores a frame is using."""
         cores = None
 
-        m = re.search(r".*\/(\d+\.?\d*)", frame.data.last_resource)
+        m = re.search(r".*\/(\d+\.?\d*)\/.*", frame.data.last_resource)
         if m:
             cores = float(m.group(1))
 
@@ -249,6 +266,20 @@ def getCores(frame, format_as_string=False):
 
         return cores
 
+    @staticmethod
+    def getGpus(frame, format_as_string=False):
+        """Gets the number of gpus a frame is using."""
+        gpus = None  # returned as-is when the pattern below does not match
+
+        m = re.search(r".*\/.*\/(\d+)", frame.data.last_resource)
+        if m:  # matches only when digits directly follow a "/" that has an earlier "/"
+            gpus = m.group(1)  # regex capture, still a str at this point
+
+            if not format_as_string:
+                gpus = int(gpus)  # numeric form, as requested by the GPUs column sort callback
+
+        return gpus
+
     @staticmethod
     def getTimeString(timestamp):
         """Gets a timestamp formatted as a string."""

diff --git a/cuegui/cuegui/GroupDialog.py b/cuegui/cuegui/GroupDialog.py
@@ -56,6 +56,11 @@ def __init__(self, parentGroup, modifyGroup, defaults, parent):
         __minCores = defaults["minCores"]
         __maxCores = defaults["maxCores"]
 
+        __defaultJobMinGpus = defaults["defaultJobMinGpus"]
+        __defaultJobMaxGpus = defaults["defaultJobMaxGpus"]
+        __minGpus = defaults["minGpus"]
+        __maxGpus = defaults["maxGpus"]
+
         self.setWindowTitle(__title)
         layout.addWidget(QtWidgets.QLabel(__message, self), 0, 1, 1, 3)
 
@@ -90,8 +95,25 @@ def __init__(self, parentGroup, modifyGroup, defaults, parent):
                                              __modify and __maxCores != -1.0,
                                              __maxCores, 1)
 
+        (self._defaultJobMinGpusCheck, self._defaultJobMinGpusValue) = \
+            self.__createToggleSpinBox("Job Default Minimum Gpus", 8,
+                                             __modify and __defaultJobMinGpus != -1,
+                                             __defaultJobMinGpus, 1)
+        (self._defaultJobMaxGpusCheck, self._defaultJobMaxGpusValue) = \
+            self.__createToggleSpinBox("Job Default Maximum Gpus", 9,
+                                             __modify and __defaultJobMaxGpus != -1,
+                                             __defaultJobMaxGpus, 1)
+        (self._minGpusCheck, self._minGpusValue) = \
+            self.__createToggleSpinBox("Group Minimum Gpus", 10,
+                                             __modify and __minGpus != 0,
+                                             __minGpus)
+        (self._maxGpusCheck, self._maxGpusValue) = \
+            self.__createToggleSpinBox("Group Maximum Gpus", 11,
+                                             __modify and __maxGpus != -1,
+                                             __maxGpus, 1)
+
         self.__createButtons(
-            QtWidgets.QDialogButtonBox.Save | QtWidgets.QDialogButtonBox.Cancel, 8, 3)
+            QtWidgets.QDialogButtonBox.Save | QtWidgets.QDialogButtonBox.Cancel, 12, 3)
 
     def __createToggleDoubleSpinBox(
             self, text, row, startEnabled = False, currentValue = 0, minValue = 0):
@@ -169,6 +191,26 @@ def accept(self):
                         float(self._maxCoresValue.value()),
                         __group.data.max_cores, float(-1))
 
+        self.__setValue(self._defaultJobMinGpusCheck,
+                        __group.setDefaultJobMinGpus,
+                        float(self._defaultJobMinGpusValue.value()),
+                        __group.data.default_job_min_gpus, -1)
+
+        self.__setValue(self._defaultJobMaxGpusCheck,
+                        __group.setDefaultJobMaxGpus,
+                        float(self._defaultJobMaxGpusValue.value()),
+                        __group.data.default_job_max_gpus, -1)
+
+        self.__setValue(self._minGpusCheck,
+                        __group.setMinGpus,
+                        float(self._minGpusValue.value()),
+                        __group.data.min_gpus, 0)
+
+        self.__setValue(self._maxGpusCheck,
+                        __group.setMaxGpus,
+                        float(self._maxGpusValue.value()),
+                        __group.data.max_gpus, -1)
+
         self.close()
 
     @staticmethod
@@ -195,7 +237,11 @@ def __init__(self, modifyGroup, parent=None):
             "defaultJobMinCores": modifyGroup.data.default_job_min_cores,
             "defaultJobMaxCores": modifyGroup.data.default_job_max_cores,
             "minCores": modifyGroup.data.min_cores,
-            "maxCores": modifyGroup.data.max_cores}
+            "maxCores": modifyGroup.data.max_cores,
+            "defaultJobMinGpus": modifyGroup.data.default_job_min_gpus,
+            "defaultJobMaxGpus": modifyGroup.data.default_job_max_gpus,
+            "minGpus": modifyGroup.data.min_gpus,
+            "maxGpus": modifyGroup.data.max_gpus}
         GroupDialog.__init__(self, None, modifyGroup, defaults, parent)
 
 
@@ -212,5 +258,9 @@ def __init__(self, parentGroup, parent=None):
             "defaultJobMinCores": 1.0,
             "defaultJobMaxCores": 1.0,
             "minCores": 0.0,
-            "maxCores": 1.0}
+            "maxCores": 1.0,
+            "defaultJobMinGpus": 0,
+            "defaultJobMaxGpus": 0,
+            "minGpus": 0,
+            "maxGpus": 0}
         GroupDialog.__init__(self, parentGroup, None, defaults, parent)