From deef8d047085e16c9c4d3825fddddcb00e82a53a Mon Sep 17 00:00:00 2001
From: Diego Tavares
Date: Wed, 16 Oct 2024 15:55:35 -0700
Subject: [PATCH 01/51] Add runDocker mode to rqd

When RUN_ON_DOCKER is set in rqd.conf, each frame will be launched as a
docker container using the base image configured as DOCKER_IMAGE.
---
 requirements.txt       |    4 +-
 rqd/rqd.example.conf   |   12 +
 rqd/rqd/rqconstants.py |   41 +
 rqd/rqd/rqcore.py      | 1854 +++++++++++++++++++++-------------
 4 files changed, 1030 insertions(+), 881 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index cceee9237..dc0f8d570 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,4 +19,6 @@ six==1.16.0
 
 # Optional requirements
 # Sentry support for rqd
-sentry-sdk==2.11.0
\ No newline at end of file
+sentry-sdk==2.11.0
+
+docker==7.1.0
\ No newline at end of file
diff --git a/rqd/rqd.example.conf b/rqd/rqd.example.conf
index e51782272..78c9cfdab 100644
--- a/rqd/rqd.example.conf
+++ b/rqd/rqd.example.conf
@@ -27,3 +27,15 @@ SYSTEMDRIVE
 MAYA_MODULE_PATH
 MAYA_SCRIPT_PATH
 PIXAR_LICENSE_FILE
+
+[docker.config]
+DOCKER_IMAGE=""
+RUN_ON_DOCKER=False
+
+[docker.mounts]
+MCP="type=bind,source=/mcp,target=/mcp,bind-propagation=slave"
+NET="type=bind,source=/net,target=/net,bind-propagation=slave"
+TMP="type=bind,source=/tmp,target=/tmp,bind-propagation=slave"
+SCRATCH="type=bind,source=/scratch,target=/scratch,bind-propagation=slave"
+LIMITS="type=bind,source=/etc/security/limits.d/,target=/etc/security/limits.d/,bind-propagation=slave"
+FUSE="type=bind,source=/dev/fuse,target=/dev/fuse,bind-propagation=shared"
\ No newline at end of file
diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py
index 6f23ebc89..54239d321 100644
--- a/rqd/rqd/rqconstants.py
+++ b/rqd/rqd/rqconstants.py
@@ -153,11 +153,18 @@
 
 SP_OS = platform.system()
 
+# Docker mode config
+RUN_ON_DOCKER = False
+DOCKER_IMAGE = "Invalid"
+DOCKER_MOUNTS = []
+
 try:
     if os.path.isfile(CONFIG_FILE):
         # Hostname can come from here: rqutil.getHostname()
         __override_section = "Override"
         __host_env_var_section = "UseHostEnvVar"
+        __docker_mounts = "docker.mounts"
+        __docker_config = "docker.config"
         import six
         from six.moves import configparser
         if six.PY2:
@@ -230,6 +237,40 @@
 
         if config.has_section(__host_env_var_section):
             RQD_HOST_ENV_VARS = config.options(__host_env_var_section)
+        if config.has_section(__docker_config):
+            RUN_ON_DOCKER = config.getboolean(__docker_config, "RUN_ON_DOCKER")
+            if RUN_ON_DOCKER:
+                import docker
+                import docker.models
+                import docker.types
+
+                def parse_mount(mount_str):
+                    """
+                    Parse mount definitions similar to a docker run command into a docker
+                    mount obj
+
+                    Format: type=bind,source=/tmp,target=/tmp,bind-propagation=slave
+                    """
+                    mount_dict = {}
+                    # bind-propagation defaults to None as only type=bind accepts it
+                    mount_dict["bind-propagation"] = None
+                    for item in mount_str.split(","):
+                        key, value = item.split("=")
+                        mount_dict[key.strip()] = value.strip()
+                    return mount_dict
+
+                DOCKER_IMAGE = config.get(__docker_config, "DOCKER_IMAGE")
+                # Parse values under the category docker.mounts into Mount objects
+                mounts = config.options(__docker_mounts)
+                for mount_name in mounts:
+                    mount_str = config.get(__docker_mounts, mount_name)
+                    mount_dic = parse_mount(mount_str)
+                    mount = docker.types.Mount(mount_dic["target"],
+                                               mount_dic["source"],
+                                               type=mount_dic["type"],
+                                               propagation=mount_dic["bind-propagation"])
+                    DOCKER_MOUNTS.append(mount)
+
 # pylint: disable=broad-except
 except Exception as e:
     logging.warning(
diff --git a/rqd/rqd/rqcore.py
b/rqd/rqd/rqcore.py index 5b85efe75..4bb0de433 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -51,1059 +51,1153 @@ log = logging.getLogger(__name__) -class FrameAttendantThread(threading.Thread): - """Once a frame has been received and checked by RQD, this class handles - the launching, waiting on, and cleanup work related to running the - frame.""" - def __init__(self, rqCore, runFrame, frameInfo): - """FrameAttendantThread class initialization - @type rqCore: RqCore - @param rqCore: Main RQD Object - @type runFrame: RunFrame - @param runFrame: rqd_pb2.RunFrame - @type frameInfo: rqd.rqnetwork.RunningFrame - @param frameInfo: Servant for running frame - """ - threading.Thread.__init__(self) - self.rqCore = rqCore - self.frameId = runFrame.frame_id - self.runFrame = runFrame - self.startTime = 0 - self.endTime = 0 - self.frameInfo = frameInfo - self._tempLocations = [] - self.rqlog = None +class RqCore(object): + """Main body of RQD, handles the integration of all components, + the setup and launching of a frame and acts on all gRPC calls + that are passed from the Network module.""" - def __createEnvVariables(self): - """Define the environmental variables for the frame""" - # If linux specific, they need to move into self.runLinux() - # pylint: disable=attribute-defined-outside-init - self.frameEnv = {} - self.frameEnv["PATH"] = self.rqCore.machine.getPathEnv() - self.frameEnv["TERM"] = "unknown" - self.frameEnv["TZ"] = self.rqCore.machine.getTimezone() - self.frameEnv["USER"] = self.runFrame.user_name - self.frameEnv["LOGNAME"] = self.runFrame.user_name - self.frameEnv["mcp"] = "1" - self.frameEnv["show"] = self.runFrame.show - self.frameEnv["shot"] = self.runFrame.shot - self.frameEnv["jobid"] = self.runFrame.job_name - self.frameEnv["jobhost"] = self.rqCore.machine.getHostname() - self.frameEnv["frame"] = self.runFrame.frame_name - self.frameEnv["zframe"] = self.runFrame.frame_name - self.frameEnv["logfile"] = self.runFrame.log_file - self.frameEnv["maxframetime"] = "0" - self.frameEnv["minspace"] = "200" - self.frameEnv["CUE3"] = "True" - self.frameEnv["CUE_GPU_MEMORY"] = str(self.rqCore.machine.getGpuMemoryFree()) - self.frameEnv["SP_NOMYCSHRC"] = "1" + def __init__(self, optNimbyoff=False): + """RqCore class initialization""" + self.__whenIdle = False + self.__reboot = False - if platform.system() == "Windows": - for variable in ["SYSTEMROOT", "APPDATA", "TMP", "COMMONPROGRAMFILES", "SYSTEMDRIVE"]: - if variable in os.environ: - self.frameEnv[variable] = os.environ[variable] - for variable in rqd.rqconstants.RQD_HOST_ENV_VARS: - # Fallback to empty string, easy to spot what is missing in the log - self.frameEnv[variable] = os.environ.get(variable, '') + self.__optNimbyoff = optNimbyoff - for key, value in self.runFrame.environment.items(): - if key == 'PATH': - self.frameEnv[key] += os.pathsep + value - else: - self.frameEnv[key] = value + self.cores = rqd.compiled_proto.report_pb2.CoreDetail( + total_cores=0, + idle_cores=0, + locked_cores=0, + booked_cores=0, + reserved_cores=[], + ) - # Add threads to use all assigned hyper-threading cores - if 'CPU_LIST' in self.runFrame.attributes and 'CUE_THREADS' in self.frameEnv: - self.frameEnv['CUE_THREADS'] = str(max( - int(self.frameEnv['CUE_THREADS']), - len(self.runFrame.attributes['CPU_LIST'].split(',')))) - self.frameEnv['CUE_HT'] = "True" + self.nimby = rqd.rqnimby.NimbyFactory.getNimby(self) - # Add GPU's to use all assigned GPU cores - if 'GPU_LIST' in self.runFrame.attributes: - self.frameEnv['CUE_GPU_CORES'] = 
self.runFrame.attributes['GPU_LIST'] + self.machine = rqd.rqmachine.Machine(self, self.cores) - # pylint: disable=inconsistent-return-statements - def _createCommandFile(self, command): - """Creates a file that subprocess. Popen then executes. - @type command: string - @param command: The command specified in the runFrame request - @rtype: string - @return: Command file location""" - # TODO: this should use tempfile to create the files and clean them up afterwards - try: - if platform.system() == "Windows": - rqd_tmp_dir = os.path.join(tempfile.gettempdir(), 'rqd') - try: - os.mkdir(rqd_tmp_dir) - except OSError: - pass # okay, already exists + self.network = rqd.rqnetwork.Network(self) + self.__threadLock = threading.Lock() + self.__cache = {} - # Windows Batch needs some characters escaped: - command = command.replace('%', '%%') - for char in '^&<>|': - command = command.replace(char, '^' + char) + self.updateRssThread = None + self.onIntervalThread = None + self.intervalStartTime = None + self.intervalSleepTime = rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC - commandFile = os.path.join( - rqd_tmp_dir, - 'cmd-%s-%s.bat' % (self.runFrame.frame_id, time.time())) + # pylint: disable=unused-private-member + self.__cluster = None + self.__session = None + self.__stmt = None + + self.docker_client = None + self.docker_mounts = [] + self.docker_image = "Invalid" + if rqd.rqconstants.RUN_ON_DOCKER: + import docker + self.docker_client = docker.from_env() + self.docker_image = rqd.rqconstants.DOCKER_IMAGE + self.docker_mounts = rqd.rqconstants.DOCKER_MOUNTS + + signal.signal(signal.SIGINT, self.handleExit) + signal.signal(signal.SIGTERM, self.handleExit) + + def start(self): + """Called by main to start the rqd service""" + if self.machine.isDesktop(): + if self.__optNimbyoff: + log.warning('Nimby startup has been disabled via --nimbyoff') + elif not rqd.rqconstants.OVERRIDE_NIMBY: + if rqd.rqconstants.OVERRIDE_NIMBY is None: + log.warning('OVERRIDE_NIMBY is not defined, Nimby startup has been disabled') + else: + log.warning('OVERRIDE_NIMBY is False, Nimby startup has been disabled') else: - commandFile = os.path.join(tempfile.gettempdir(), - 'rqd-cmd-%s-%s' % (self.runFrame.frame_id, time.time())) - with open(commandFile, "w", encoding='utf-8') as rqexe: - self._tempLocations.append(commandFile) - rqexe.write(command) - rqexe.close() - os.chmod(commandFile, 0o777) - return commandFile - # pylint: disable=broad-except - except Exception as e: - log.critical( - "Unable to make command file: %s due to %s at %s", - commandFile, e, traceback.extract_tb(sys.exc_info()[2])) + self.nimbyOn() + elif rqd.rqconstants.OVERRIDE_NIMBY: + log.warning('Nimby startup has been triggered by OVERRIDE_NIMBY') + self.nimbyOn() + self.network.start_grpc() - def __writeHeader(self): - """Writes the frame's log header""" + def grpcConnected(self): + """After gRPC connects to the cuebot, this function is called""" + self.network.reportRqdStartup(self.machine.getBootReport()) - self.startTime = time.time() + self.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, self.updateRss) + self.updateRssThread.start() - try: - print("="*59, file=self.rqlog) - print("RenderQ JobSpec %s" % time.ctime(self.startTime), "\n", file=self.rqlog) - print("proxy rqd.rqnetwork.RunningFrame/%s -t:tcp -h %s -p 10021" % ( - self.runFrame.frame_id, - self.rqCore.machine.getHostname()), file=self.rqlog) - print("%-21s%s" % ("command", self.runFrame.command), file=self.rqlog) - print("%-21s%s" % ("uid", self.runFrame.uid), 
file=self.rqlog) - print("%-21s%s" % ("gid", self.runFrame.gid), file=self.rqlog) - print("%-21s%s" % ("logDestination", - self.runFrame.log_dir_file), file=self.rqlog) - print("%-21s%s" % ("cwd", self.runFrame.frame_temp_dir), file=self.rqlog) - print("%-21s%s" % ("renderHost", - self.rqCore.machine.getHostname()), file=self.rqlog) - print("%-21s%s" % ("jobId", self.runFrame.job_id), file=self.rqlog) - print("%-21s%s" % ("frameId", self.runFrame.frame_id), file=self.rqlog) - for env in sorted(self.frameEnv): - print("%-21s%s=%s" % ("env", env, self.frameEnv[env]), file=self.rqlog) - print("="*59, file=self.rqlog) + self.onIntervalThread = threading.Timer(self.intervalSleepTime, self.onInterval) + self.intervalStartTime = time.time() + self.onIntervalThread.start() - if 'CPU_LIST' in self.runFrame.attributes: - print('Hyper-threading enabled', file=self.rqlog) + log.warning('RQD Started') + + def onInterval(self, sleepTime=None): + """This is called by self.grpcConnected as a timer thread to execute + every interval""" + if sleepTime is None: + self.intervalSleepTime = random.randint( + rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC, + rqd.rqconstants.RQD_MAX_PING_INTERVAL_SEC) + else: + self.intervalSleepTime = sleepTime + try: + self.onIntervalThread = threading.Timer(self.intervalSleepTime, self.onInterval) + self.intervalStartTime = time.time() + self.onIntervalThread.start() # pylint: disable=broad-except except Exception as e: log.critical( - "Unable to write header to rqlog: %s due to %s at %s", - self.runFrame.log_dir_file, e, traceback.extract_tb(sys.exc_info()[2])) - - def __writeFooter(self): - """Writes frame's log footer""" + 'Unable to schedule a ping due to %s at %s', + e, traceback.extract_tb(sys.exc_info()[2])) - self.endTime = time.time() - self.frameInfo.runTime = int(self.endTime - self.startTime) try: - print("", file=self.rqlog) - print("="*59, file=self.rqlog) - print("RenderQ Job Complete\n", file=self.rqlog) - print("%-20s%s" % ("exitStatus", self.frameInfo.exitStatus), file=self.rqlog) - print("%-20s%s" % ("exitSignal", self.frameInfo.exitSignal), file=self.rqlog) - if self.frameInfo.killMessage: - print("%-20s%s" % ("killMessage", self.frameInfo.killMessage), file=self.rqlog) - print("%-20s%s" % ("startTime", - time.ctime(self.startTime)), file=self.rqlog) - print("%-20s%s" % ("endTime", - time.ctime(self.endTime)), file=self.rqlog) - print("%-20s%s" % ("maxrss", self.frameInfo.maxRss), file=self.rqlog) - print("%-20s%s" % ("maxUsedGpuMemory", - self.frameInfo.maxUsedGpuMemory), file=self.rqlog) - print("%-20s%s" % ("utime", self.frameInfo.utime), file=self.rqlog) - print("%-20s%s" % ("stime", self.frameInfo.stime), file=self.rqlog) - print("%-20s%s" % ("renderhost", self.rqCore.machine.getHostname()), file=self.rqlog) - - print("%-20s%s" % ("maxrss (KB)", self.frameInfo.maxRss), file=self.rqlog) - for child in sorted(self.frameInfo.childrenProcs.items(), - key=lambda item: item[1]['start_time']): - print("\t%-20s%s" % (child[1]['name'], child[1]['rss']), file=self.rqlog) - print("\t%-20s%s" % ("start_time", - datetime.timedelta(seconds=child[1]["start_time"])), - file=self.rqlog) - print("\t%-20s%s" % ("cmdline", " ".join(child[1]["cmd_line"])), file=self.rqlog) - - print("="*59, file=self.rqlog) + if self.__whenIdle and not self.__cache: + if not self.machine.isUserLoggedIn(): + self.shutdownRqdNow() + else: + log.warning('Shutdown requested but a user is logged in.') + # pylint: disable=broad-except + except Exception as e: + log.warning( + 'Unable to shutdown due 
to %s at %s', e, traceback.extract_tb(sys.exc_info()[2])) + try: + self.sendStatusReport() # pylint: disable=broad-except except Exception as e: log.critical( - "Unable to write footer: %s due to %s at %s", - self.runFrame.log_dir_file, e, traceback.extract_tb(sys.exc_info()[2])) + 'Unable to send status report due to %s at %s', + e, traceback.extract_tb(sys.exc_info()[2])) - def __cleanup(self): - """Cleans up temporary files""" - rqd.rqutil.permissionsHigh() - try: - for location in self._tempLocations: - if os.path.isfile(location): - try: - os.remove(location) - # pylint: disable=broad-except - except Exception as e: - log.warning( - "Unable to delete file: %s due to %s at %s", - location, e, traceback.extract_tb(sys.exc_info()[2])) - finally: - rqd.rqutil.permissionsLow() + def updateRss(self): + """Triggers and schedules the updating of rss information""" + if self.__cache: + try: + self.machine.rssUpdate(self.__cache) + finally: + self.updateRssThread = threading.Timer( + rqd.rqconstants.RSS_UPDATE_INTERVAL, self.updateRss) + self.updateRssThread.start() - # Close log file - try: - self.rqlog.close() - # pylint: disable=broad-except - except Exception as e: - log.warning( - "Unable to close file: %s due to %s at %s", - self.runFrame.log_file, e, traceback.extract_tb(sys.exc_info()[2])) + def getFrame(self, frameId): + """Gets a frame from the cache based on frameId + @type frameId: string + @param frameId: A frame's unique Id + @rtype: rqd.rqnetwork.RunningFrame + @return: rqd.rqnetwork.RunningFrame object""" + return self.__cache[frameId] - def runLinux(self): - """The steps required to handle a frame under linux""" - frameInfo = self.frameInfo - runFrame = self.runFrame + def getFrameKeys(self): + """Gets a list of all keys from the cache + @rtype: list + @return: List of all frameIds running on host""" + return list(self.__cache.keys()) - self.__createEnvVariables() - self.__writeHeader() + def storeFrame(self, frameId, runningFrame): + """Stores a frame in the cache and adds the network adapter + @type frameId: string + @param frameId: A frame's unique Id + @type runningFrame: rqd.rqnetwork.RunningFrame + @param runningFrame: rqd.rqnetwork.RunningFrame object""" + with self.__threadLock: + if frameId in self.__cache: + raise rqd.rqexceptions.RqdException( + "frameId " + frameId + " is already running on this machine") + self.__cache[frameId] = runningFrame - tempStatFile = "%srqd-stat-%s-%s" % (self.rqCore.machine.getTempPath(), - frameInfo.frameId, - time.time()) - self._tempLocations.append(tempStatFile) - tempCommand = [] - if self.rqCore.machine.isDesktop(): - tempCommand += ["/bin/nice"] - tempCommand += ["/usr/bin/time", "-p", "-o", tempStatFile] + def deleteFrame(self, frameId): + """Deletes a frame from the cache + @type frameId: string + @param frameId: A frame's unique Id""" + with self.__threadLock: + if frameId in self.__cache: + del self.__cache[frameId] + # pylint: disable=no-member + if not self.__cache and self.cores.reserved_cores: + # pylint: disable=no-member + log.error( + 'No running frames but reserved_cores is not empty: %s', + self.cores.reserved_cores) + # pylint: disable=no-member + self.cores.reserved_cores.clear() + log.info("Successfully delete frame with Id: %s", frameId) + else: + log.warning("Frame with Id: %s not found in cache", frameId) - if 'CPU_LIST' in runFrame.attributes: - tempCommand += ['taskset', '-c', runFrame.attributes['CPU_LIST']] + def killAllFrame(self, reason): + """Will execute .kill() on every frame in cache until no frames 
remain + @type reason: string + @param reason: Reason for requesting all frames to be killed""" - rqd.rqutil.permissionsHigh() - try: - if rqd.rqconstants.RQD_BECOME_JOB_USER: - tempCommand += ["/bin/su", runFrame.user_name, rqd.rqconstants.SU_ARGUMENT, - '"' + self._createCommandFile(runFrame.command) + '"'] + if self.__cache: + log.warning( + "killAllFrame called due to: %s\n%s", reason, ",".join(self.getFrameKeys())) + + while self.__cache: + if reason.startswith("NIMBY"): + # Since this is a nimby kill, ignore any frames that are ignoreNimby + frameKeys = [ + frame.frameId for frame in list(self.__cache.values()) if not frame.ignoreNimby] else: - tempCommand += [self._createCommandFile(runFrame.command)] + frameKeys = list(self.__cache.keys()) - # pylint: disable=subprocess-popen-preexec-fn,consider-using-with - frameInfo.forkedCommand = subprocess.Popen(tempCommand, - env=self.frameEnv, - cwd=self.rqCore.machine.getTempPath(), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - close_fds=True, - preexec_fn=os.setsid) - finally: - rqd.rqutil.permissionsLow() + if not frameKeys: + # No frames left to kill + return - frameInfo.pid = frameInfo.forkedCommand.pid + for frameKey in frameKeys: + try: + self.__cache[frameKey].kill(reason) + except KeyError: + pass + time.sleep(1) - if not self.rqCore.updateRssThread.is_alive(): - self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, - self.rqCore.updateRss) - self.rqCore.updateRssThread.start() + def releaseCores(self, reqRelease, releaseHT=None, releaseGpus=None): + """The requested number of cores are released + @type reqRelease: int + @param reqRelease: Number of cores to release, 100 = 1 physical core""" + with self.__threadLock: + # pylint: disable=no-member + self.cores.booked_cores -= reqRelease + maxRelease = (self.cores.total_cores - + self.cores.locked_cores - + self.cores.idle_cores - + self.cores.booked_cores) - poller = select.poll() - poller.register(frameInfo.forkedCommand.stdout, select.POLLIN) - poller.register(frameInfo.forkedCommand.stderr, select.POLLIN) - while True: - for fd, event in poller.poll(): - if event & select.POLLIN: - if fd == frameInfo.forkedCommand.stdout.fileno(): - line = frameInfo.forkedCommand.stdout.readline() - elif fd == frameInfo.forkedCommand.stderr.fileno(): - line = frameInfo.forkedCommand.stderr.readline() - else: - continue - if not line: - break - self.rqlog.write(line, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) - if frameInfo.forkedCommand.poll() is not None: - break + if maxRelease > 0: + self.cores.idle_cores += min(maxRelease, reqRelease) + # pylint: enable=no-member - returncode = frameInfo.forkedCommand.wait() + if releaseHT: + self.machine.releaseHT(releaseHT) - # Find exitStatus and exitSignal - if returncode < 0: - # Exited with a signal - frameInfo.exitStatus = 1 - frameInfo.exitSignal = -returncode + if releaseGpus: + self.machine.releaseGpus(releaseGpus) + + # pylint: disable=no-member + if self.cores.idle_cores > self.cores.total_cores: + log.critical( + "idle_cores (%d) have become greater than total_cores (%d): %s at %s", + self.cores.idle_cores, self.cores.total_cores, sys.exc_info()[0], + traceback.extract_tb(sys.exc_info()[2])) + # pylint: enable=no-member + + def shutdown(self): + """Shuts down all rqd systems""" + self.nimbyOff() + if self.onIntervalThread is not None: + self.onIntervalThread.cancel() + if self.updateRssThread is not None: + self.updateRssThread.cancel() + elif self.__reboot: + 
log.warning("Rebooting machine by request") + self.machine.reboot() else: - frameInfo.exitStatus = returncode - frameInfo.exitSignal = 0 + log.warning("Shutting down RQD by request. pid(%s)", os.getpid()) + self.network.stopGrpc() + # Using sys.exit would raise SystemExit, giving exception handlers a chance + # to block this + # pylint: disable=protected-access + os._exit(0) - try: - with open(tempStatFile, "r", encoding='utf-8') as statFile: - frameInfo.realtime = statFile.readline().split()[1] - frameInfo.utime = statFile.readline().split()[1] - frameInfo.stime = statFile.readline().split()[1] - statFile.close() - # pylint: disable=broad-except - except Exception: - pass # This happens when frames are killed + def handleExit(self, signalnum, flag): + """Shutdown threads and exit RQD.""" + del signalnum + del flag + self.shutdown() - self.__writeFooter() - self.__cleanup() + def launchFrame(self, runFrame): + """This will setup for the launch the frame specified in the arguments. + If a problem is encountered, a CueException will be thrown. + @type runFrame: RunFrame + @param runFrame: rqd_pb2.RunFrame""" + log.info("Running command %s for %s", runFrame.command, runFrame.frame_id) + log.debug(runFrame) - def runWindows(self): - """The steps required to handle a frame under windows""" - frameInfo = self.frameInfo - runFrame = self.runFrame + # + # Check for reasons to abort launch + # - self.__createEnvVariables() - self.__writeHeader() + if self.machine.state != rqd.compiled_proto.host_pb2.UP: + err = "Not launching, rqd HardwareState is not Up" + log.info(err) + raise rqd.rqexceptions.CoreReservationFailureException(err) - try: - runFrame.command = runFrame.command.replace('%{frame}', self.frameEnv['CUE_IFRAME']) - tempCommand = [self._createCommandFile(runFrame.command)] + if self.__whenIdle: + err = "Not launching, rqd is waiting for idle to shutdown" + log.info(err) + raise rqd.rqexceptions.CoreReservationFailureException(err) - # pylint: disable=consider-using-with - frameInfo.forkedCommand = subprocess.Popen(tempCommand, - env=self.frameEnv, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - # pylint: disable=broad-except - except Exception: - log.critical( - "Failed subprocess.Popen: Due to: \n%s", - ''.join(traceback.format_exception(*sys.exc_info()))) + if self.nimby.locked and not runFrame.ignore_nimby: + err = "Not launching, rqd is lockNimby and not Ignore Nimby" + log.info(err) + raise rqd.rqexceptions.CoreReservationFailureException(err) - frameInfo.pid = frameInfo.forkedCommand.pid + if rqd.rqconstants.OVERRIDE_NIMBY and self.nimby.isNimbyActive(): + err = "Not launching, rqd is lockNimby and User is Active" + log.info(err) + raise rqd.rqexceptions.CoreReservationFailureException(err) - if not self.rqCore.updateRssThread.is_alive(): - self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, - self.rqCore.updateRss) - self.rqCore.updateRssThread.start() + if runFrame.frame_id in self.__cache: + err = "Not launching, frame is already running on this proc %s" % runFrame.frame_id + log.critical(err) + raise rqd.rqexceptions.DuplicateFrameViolationException(err) + + if runFrame.HasField("uid") and runFrame.uid <= 0: + err = "Not launching, will not run frame as uid=%d" % runFrame.uid + log.warning(err) + raise rqd.rqexceptions.InvalidUserException(err) + + if runFrame.num_cores <= 0: + err = "Not launching, numCores must be > 0" + log.warning(err) + raise rqd.rqexceptions.CoreReservationFailureException(err) - while True: - 
output = frameInfo.forkedCommand.stdout.readline() - if not output and frameInfo.forkedCommand.poll() is not None: - break - if output: - self.rqlog.write(output, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + # See if all requested cores are available + with self.__threadLock: + # pylint: disable=no-member + if self.cores.idle_cores < runFrame.num_cores: + err = "Not launching, insufficient idle cores" + log.critical(err) + raise rqd.rqexceptions.CoreReservationFailureException(err) + # pylint: enable=no-member - frameInfo.forkedCommand.wait() + if runFrame.environment.get('CUE_THREADABLE') == '1': + reserveHT = self.machine.reserveHT(runFrame.num_cores) + if reserveHT: + runFrame.attributes['CPU_LIST'] = reserveHT - # Find exitStatus and exitSignal - returncode = frameInfo.forkedCommand.returncode - if returncode < INT32_MIN: - returncode = 303 - if returncode > INT32_MAX: - returncode = 304 - frameInfo.exitStatus = returncode - frameInfo.exitSignal = returncode + if runFrame.num_gpus: + reserveGpus = self.machine.reserveGpus(runFrame.num_gpus) + if reserveGpus: + runFrame.attributes['GPU_LIST'] = reserveGpus - frameInfo.realtime = 0 - frameInfo.utime = 0 - frameInfo.stime = 0 + # They must be available at this point, reserve them + # pylint: disable=no-member + self.cores.idle_cores -= runFrame.num_cores + self.cores.booked_cores += runFrame.num_cores + # pylint: enable=no-member - self.__writeFooter() - self.__cleanup() + runningFrame = rqd.rqnetwork.RunningFrame(self, runFrame) + runningFrame.frameAttendantThread = FrameAttendantThread(self, runFrame, runningFrame) + runningFrame.frameAttendantThread.start() - def runDarwin(self): - """The steps required to handle a frame under mac""" - frameInfo = self.frameInfo + def getRunningFrame(self, frameId): + """Gets the currently running frame.""" + try: + return self.__cache[frameId] + except KeyError: + log.info("frameId %s is not running on this machine", frameId) + return None - self.__createEnvVariables() - self.__writeHeader() + def getCoreInfo(self): + """Gets the core info report.""" + return self.cores - rqd.rqutil.permissionsHigh() + def reportStatus(self): + """Replies with hostReport""" + return self.machine.getHostReport() + + def shutdownRqdNow(self): + """Kill all running frames and shutdown RQD""" + self.machine.state = rqd.compiled_proto.host_pb2.DOWN try: - tempCommand = ["/usr/bin/su", frameInfo.runFrame.user_name, "-c", '"' + - self._createCommandFile(frameInfo.runFrame.command) + '"'] + self.lockAll() + self.killAllFrame("shutdownRqdNow Command") + # pylint: disable=broad-except + except Exception: + log.exception("Failed to kill frames, stopping service anyways") + if not self.__cache: + self.shutdown() - # pylint: disable=subprocess-popen-preexec-fn,consider-using-with - frameInfo.forkedCommand = subprocess.Popen(tempCommand, - env=self.frameEnv, - cwd=self.rqCore.machine.getTempPath(), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - preexec_fn=os.setsid) - finally: - rqd.rqutil.permissionsLow() + def shutdownRqdIdle(self): + """When machine is idle, shutdown RQD""" + log.info("shutdownRqdIdle") + self.lockAll() + self.__whenIdle = True + self.sendStatusReport() + if not self.__cache: + self.shutdownRqdNow() - frameInfo.pid = frameInfo.forkedCommand.pid + def rebootNow(self): + """Kill all running frames and reboot machine. 
+ This is not available when a user is logged in""" + log.warning('Requested to reboot now') + if self.machine.isUserLoggedIn(): + err = ('Rebooting via RQD is not supported for a desktop machine ' + 'when a user is logged in') + log.warning(err) + raise rqd.rqexceptions.RqdException(err) + self.__reboot = True + self.shutdownRqdNow() - if not self.rqCore.updateRssThread.is_alive(): - self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, - self.rqCore.updateRss) - self.rqCore.updateRssThread.start() + def rebootIdle(self): + """When machine is idle, reboot it""" + log.warning('Requested to reboot machine when idle') + self.lockAll() + self.__whenIdle = True + self.__reboot = True + self.sendStatusReport() + if not self.__cache and not self.machine.isUserLoggedIn(): + self.shutdownRqdNow() - while True: - output = frameInfo.forkedCommand.stdout.readline() - if not output and frameInfo.forkedCommand.poll() is not None: - break - if output: - self.rqlog.write(output, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + def nimbyOn(self): + """Activates nimby, does not kill any running frames until next nimby + event. Also does not unlock until sufficient idle time is reached.""" + if self.nimby and not self.nimby.active: + try: + self.nimby.run() + log.warning("Nimby has been activated") + # pylint: disable=broad-except + except Exception: + self.nimby.locked = False + err = "Nimby is in the process of shutting down" + log.exception(err) + raise rqd.rqexceptions.RqdException(err) - frameInfo.forkedCommand.wait() + def nimbyOff(self): + """Deactivates nimby and unlocks any nimby lock""" + if self.nimby.active: + self.nimby.stop() + log.info("Nimby has been deactivated") - # Find exitStatus and exitSignal - returncode = frameInfo.forkedCommand.returncode - if os.WIFEXITED(returncode): - frameInfo.exitStatus = os.WEXITSTATUS(returncode) - else: - frameInfo.exitStatus = 1 - if os.WIFSIGNALED(returncode): - frameInfo.exitSignal = os.WTERMSIG(returncode) + def onNimbyLock(self): + """This is called by nimby when it locks the machine. + All running frames are killed. + A new report is sent to the cuebot.""" + self.killAllFrame("NIMBY Triggered") + self.sendStatusReport() - self.__writeFooter() - self.__cleanup() + def onNimbyUnlock(self, asOf=None): + """This is called by nimby when it unlocks the machine due to sufficient + idle. A new report is sent to the cuebot. + @param asOf: Time when idle state began, if known.""" + del asOf + self.sendStatusReport() - def runUnknown(self): - """The steps required to handle a frame under an unknown OS.""" + def lock(self, reqLock): + """Locks the requested core. + If a locked status changes, a status report is sent to the cuebot. 
+ @type reqLock: int + @param reqLock: Number of cores to lock, 100 = 1 physical core""" + sendUpdate = False + with self.__threadLock: + # pylint: disable=no-member + numLock = min(self.cores.total_cores - self.cores.locked_cores, + reqLock) + if numLock > 0: + self.cores.locked_cores += numLock + self.cores.idle_cores -= min(numLock, self.cores.idle_cores) + sendUpdate = True + # pylint: enable=no-member - def run(self): - """Thread initialization""" - log.info("Monitor frame started for frameId=%s", self.frameId) + log.debug(self.cores) - runFrame = self.runFrame + if sendUpdate: + self.sendStatusReport() - # pylint: disable=too-many-nested-blocks - try: - runFrame.job_temp_dir = os.path.join(self.rqCore.machine.getTempPath(), - runFrame.job_name) - runFrame.frame_temp_dir = os.path.join(runFrame.job_temp_dir, - runFrame.frame_name) - runFrame.log_file = "%s.%s.rqlog" % (runFrame.job_name, - runFrame.frame_name) - runFrame.log_dir_file = os.path.join(runFrame.log_dir, runFrame.log_file) + def lockAll(self): + """"Locks all cores on the machine. + If a locked status changes, a status report is sent.""" + sendUpdate = False + with self.__threadLock: + # pylint: disable=no-member + if self.cores.locked_cores < self.cores.total_cores: + self.cores.locked_cores = self.cores.total_cores + self.cores.idle_cores = 0 + sendUpdate = True + # pylint: enable=no-member - try: # Exception block for all exceptions - # Ensure permissions return to Low after this block - try: - if rqd.rqconstants.RQD_CREATE_USER_IF_NOT_EXISTS and runFrame.HasField("uid"): - rqd.rqutil.checkAndCreateUser(runFrame.user_name, - runFrame.uid, - runFrame.gid) - # Do everything as launching user: - runFrame.gid = rqd.rqconstants.LAUNCH_FRAME_USER_GID - rqd.rqutil.permissionsUser(runFrame.uid, runFrame.gid) + log.debug(self.cores) - # Setup frame logging - try: - self.rqlog = rqd.rqlogging.RqdLogger(runFrame.log_dir_file) - self.rqlog.waitForFile() - # pylint: disable=broad-except - except Exception as e: - err = "Unable to write to %s due to %s" % (runFrame.log_dir_file, e) - raise RuntimeError(err) + if sendUpdate: + self.sendStatusReport() - finally: - rqd.rqutil.permissionsLow() + def unlock(self, reqUnlock): + """Unlocks the requested number of cores. + Also resets reboot/shutdown/restart when idle requests. + If a locked status changes, a status report is sent to the cuebot. 
+ @type reqUnlock: int + @param reqUnlock: Number of cores to unlock, 100 = 1 physical core""" - # Store frame in cache and register servant - self.rqCore.storeFrame(runFrame.frame_id, self.frameInfo) + sendUpdate = False - if platform.system() == "Linux": - self.runLinux() - elif platform.system() == "Windows": - self.runWindows() - elif platform.system() == "Darwin": - self.runDarwin() - else: - self.runUnknown() + if (self.__whenIdle or self.__reboot or + self.machine.state != rqd.compiled_proto.host_pb2.UP): + sendUpdate = True + + self.__whenIdle = False + self.__reboot = False + self.machine.state = rqd.compiled_proto.host_pb2.UP - # pylint: disable=broad-except - except Exception: - log.critical( - "Failed launchFrame: For %s due to: \n%s", - runFrame.frame_id, ''.join(traceback.format_exception(*sys.exc_info()))) - # Notifies the cuebot that there was an error launching - self.frameInfo.exitStatus = rqd.rqconstants.EXITSTATUS_FOR_FAILED_LAUNCH - # Delay keeps the cuebot from spamming failing booking requests - time.sleep(10) - finally: - self.rqCore.releaseCores(self.runFrame.num_cores, runFrame.attributes.get('CPU_LIST'), - runFrame.attributes.get('GPU_LIST') - if 'GPU_LIST' in self.runFrame.attributes else None) + with self.__threadLock: + # pylint: disable=no-member + numUnlock = min(self.cores.locked_cores, reqUnlock) + if numUnlock > 0: + self.cores.locked_cores -= numUnlock + self.cores.idle_cores += numUnlock + sendUpdate = True + # pylint: enable=no-member - self.rqCore.deleteFrame(self.runFrame.frame_id) + log.debug(self.cores) - self.rqCore.sendFrameCompleteReport(self.frameInfo) - time_till_next = ( - (self.rqCore.intervalStartTime + self.rqCore.intervalSleepTime) - time.time()) - if time_till_next > (2 * rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC): - self.rqCore.onIntervalThread.cancel() - self.rqCore.onInterval(rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC) + if sendUpdate: + self.sendStatusReport() - log.info("Monitor frame ended for frameId=%s", - self.runFrame.frame_id) + def unlockAll(self): + """"Unlocks all cores on the machine. + Also resets reboot/shutdown/restart when idle requests. 
+ If a locked status changes, a status report is sent.""" + sendUpdate = False -class RqCore(object): - """Main body of RQD, handles the integration of all components, - the setup and launching of a frame and acts on all gRPC calls - that are passed from the Network module.""" + if (self.__whenIdle or self.__reboot + or self.machine.state != rqd.compiled_proto.host_pb2.UP): + sendUpdate = True - def __init__(self, optNimbyoff=False): - """RqCore class initialization""" self.__whenIdle = False self.__reboot = False + self.machine.state = rqd.compiled_proto.host_pb2.UP - self.__optNimbyoff = optNimbyoff + with self.__threadLock: + # pylint: disable=no-member + if self.cores.locked_cores > 0: + if not self.nimby.locked: + self.cores.idle_cores += self.cores.locked_cores + self.cores.locked_cores = 0 + sendUpdate = True + # pylint: enable=no-member - self.cores = rqd.compiled_proto.report_pb2.CoreDetail( - total_cores=0, - idle_cores=0, - locked_cores=0, - booked_cores=0, - reserved_cores=[], - ) + log.debug(self.cores) - self.nimby = rqd.rqnimby.NimbyFactory.getNimby(self) + if sendUpdate: + self.sendStatusReport() - self.machine = rqd.rqmachine.Machine(self, self.cores) + def sendStatusReport(self): + """Sends the current host report to Cuebot.""" + self.network.reportStatus(self.machine.getHostReport()) - self.network = rqd.rqnetwork.Network(self) - self.__threadLock = threading.Lock() - self.__cache = {} + def isWaitingForIdle(self): + """Returns whether the host is waiting until idle to take some action.""" + return self.__whenIdle - self.updateRssThread = None - self.onIntervalThread = None - self.intervalStartTime = None - self.intervalSleepTime = rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC + def sendFrameCompleteReport(self, runningFrame): + """Send a frameCompleteReport to Cuebot""" + if not runningFrame.completeReportSent: + report = rqd.compiled_proto.report_pb2.FrameCompleteReport() + # pylint: disable=no-member + report.host.CopyFrom(self.machine.getHostInfo()) + report.frame.CopyFrom(runningFrame.runningFrameInfo()) + # pylint: enable=no-member - # pylint: disable=unused-private-member - self.__cluster = None - self.__session = None - self.__stmt = None + if runningFrame.exitStatus is None: + report.exit_status = 1 + else: + report.exit_status = runningFrame.exitStatus - signal.signal(signal.SIGINT, self.handleExit) - signal.signal(signal.SIGTERM, self.handleExit) + report.exit_signal = runningFrame.exitSignal + report.run_time = int(runningFrame.runTime) - def start(self): - """Called by main to start the rqd service""" - if self.machine.isDesktop(): - if self.__optNimbyoff: - log.warning('Nimby startup has been disabled via --nimbyoff') - elif not rqd.rqconstants.OVERRIDE_NIMBY: - if rqd.rqconstants.OVERRIDE_NIMBY is None: - log.warning('OVERRIDE_NIMBY is not defined, Nimby startup has been disabled') - else: - log.warning('OVERRIDE_NIMBY is False, Nimby startup has been disabled') - else: - self.nimbyOn() - elif rqd.rqconstants.OVERRIDE_NIMBY: - log.warning('Nimby startup has been triggered by OVERRIDE_NIMBY') - self.nimbyOn() - self.network.start_grpc() + # If nimby is active, then frame must have been killed by nimby + # Set the exitSignal to indicate this event + if self.nimby.locked and not runningFrame.ignoreNimby: + report.exit_status = rqd.rqconstants.EXITSTATUS_FOR_NIMBY_KILL - def grpcConnected(self): - """After gRPC connects to the cuebot, this function is called""" - self.network.reportRqdStartup(self.machine.getBootReport()) + 
self.network.reportRunningFrameCompletion(report) + runningFrame.completeReportSent = True - self.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, self.updateRss) - self.updateRssThread.start() + def sanitizeFrames(self): + """ + Iterate over the cache and update the status of frames that might have + completed but never reported back to cuebot. + """ + for frameId, runningFrame in self.__cache.items(): + # If the frame was marked as completed (exitStatus) and a report has not been sent + # try to file the report again + if runningFrame.exitStatus is not None and not runningFrame.completeReportSent: + try: + self.sendFrameCompleteReport(runningFrame) + self.deleteFrame(frameId) + log.info("Successfully deleted frame from cache for %s/%s (%s)", + runningFrame.runFrame.job_name, + runningFrame.runFrame.frame_name, + frameId) + # pylint: disable=broad-except + except Exception: + log.exception("Failed to sanitize frame %s/%s", + runningFrame.runFrame.job_name, + runningFrame.runFrame.frame_name) - self.onIntervalThread = threading.Timer(self.intervalSleepTime, self.onInterval) - self.intervalStartTime = time.time() - self.onIntervalThread.start() - log.warning('RQD Started') +class FrameAttendantThread(threading.Thread): + """Once a frame has been received and checked by RQD, this class handles + the launching, waiting on, and cleanup work related to running the + frame.""" + def __init__(self, rqCore: RqCore, runFrame, frameInfo): + """FrameAttendantThread class initialization + @type rqCore: RqCore + @param rqCore: Main RQD Object + @type runFrame: RunFrame + @param runFrame: rqd_pb2.RunFrame + @type frameInfo: rqd.rqnetwork.RunningFrame + @param frameInfo: Servant for running frame + """ + threading.Thread.__init__(self) + self.rqCore = rqCore + self.frameId = runFrame.frame_id + self.runFrame = runFrame + self.startTime = 0 + self.endTime = 0 + self.frameInfo = frameInfo + self._tempLocations = [] + self.rqlog = None - def onInterval(self, sleepTime=None): + def __createEnvVariables(self): + """Define the environmental variables for the frame""" + # If linux specific, they need to move into self.runLinux() + # pylint: disable=attribute-defined-outside-init + self.frameEnv = {} + self.frameEnv["PATH"] = self.rqCore.machine.getPathEnv() + self.frameEnv["TERM"] = "unknown" + self.frameEnv["TZ"] = self.rqCore.machine.getTimezone() + self.frameEnv["USER"] = self.runFrame.user_name + self.frameEnv["LOGNAME"] = self.runFrame.user_name + self.frameEnv["mcp"] = "1" + self.frameEnv["show"] = self.runFrame.show + self.frameEnv["shot"] = self.runFrame.shot + self.frameEnv["jobid"] = self.runFrame.job_name + self.frameEnv["jobhost"] = self.rqCore.machine.getHostname() + self.frameEnv["frame"] = self.runFrame.frame_name + self.frameEnv["zframe"] = self.runFrame.frame_name + self.frameEnv["logfile"] = self.runFrame.log_file + self.frameEnv["maxframetime"] = "0" + self.frameEnv["minspace"] = "200" + self.frameEnv["CUE3"] = "True" + self.frameEnv["CUE_GPU_MEMORY"] = str(self.rqCore.machine.getGpuMemoryFree()) + self.frameEnv["SP_NOMYCSHRC"] = "1" - """This is called by self.grpcConnected as a timer thread to execute - every interval""" - if sleepTime is None: - self.intervalSleepTime = random.randint( - rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC, - rqd.rqconstants.RQD_MAX_PING_INTERVAL_SEC) - else: - self.intervalSleepTime = sleepTime + if platform.system() == "Windows": + for variable in ["SYSTEMROOT", "APPDATA", "TMP", "COMMONPROGRAMFILES", "SYSTEMDRIVE"]: + if variable in 
os.environ: + self.frameEnv[variable] = os.environ[variable] + for variable in rqd.rqconstants.RQD_HOST_ENV_VARS: + # Fallback to empty string, easy to spot what is missing in the log + self.frameEnv[variable] = os.environ.get(variable, '') + + for key, value in self.runFrame.environment.items(): + if key == 'PATH': + self.frameEnv[key] += os.pathsep + value + else: + self.frameEnv[key] = value + + # Add threads to use all assigned hyper-threading cores + if 'CPU_LIST' in self.runFrame.attributes and 'CUE_THREADS' in self.frameEnv: + self.frameEnv['CUE_THREADS'] = str(max( + int(self.frameEnv['CUE_THREADS']), + len(self.runFrame.attributes['CPU_LIST'].split(',')))) + self.frameEnv['CUE_HT'] = "True" + + # Add GPU's to use all assigned GPU cores + if 'GPU_LIST' in self.runFrame.attributes: + self.frameEnv['CUE_GPU_CORES'] = self.runFrame.attributes['GPU_LIST'] + + # pylint: disable=inconsistent-return-statements + def _createCommandFile(self, command): + """Creates a file that subprocess. Popen then executes. + @type command: string + @param command: The command specified in the runFrame request + @rtype: string + @return: Command file location""" + # TODO: this should use tempfile to create the files and clean them up afterwards try: - self.onIntervalThread = threading.Timer(self.intervalSleepTime, self.onInterval) - self.intervalStartTime = time.time() - self.onIntervalThread.start() + if platform.system() == "Windows": + rqd_tmp_dir = os.path.join(tempfile.gettempdir(), 'rqd') + try: + os.mkdir(rqd_tmp_dir) + except OSError: + pass # okay, already exists + + # Windows Batch needs some characters escaped: + command = command.replace('%', '%%') + for char in '^&<>|': + command = command.replace(char, '^' + char) + + commandFile = os.path.join( + rqd_tmp_dir, + 'cmd-%s-%s.bat' % (self.runFrame.frame_id, time.time())) + else: + commandFile = os.path.join(tempfile.gettempdir(), + 'rqd-cmd-%s-%s' % (self.runFrame.frame_id, time.time())) + with open(commandFile, "w", encoding='utf-8') as rqexe: + self._tempLocations.append(commandFile) + rqexe.write(command) + rqexe.close() + os.chmod(commandFile, 0o777) + return commandFile # pylint: disable=broad-except except Exception as e: log.critical( - 'Unable to schedule a ping due to %s at %s', - e, traceback.extract_tb(sys.exc_info()[2])) + "Unable to make command file: %s due to %s at %s", + commandFile, e, traceback.extract_tb(sys.exc_info()[2])) - try: - if self.__whenIdle and not self.__cache: - if not self.machine.isUserLoggedIn(): - self.shutdownRqdNow() - else: - log.warning('Shutdown requested but a user is logged in.') - # pylint: disable=broad-except - except Exception as e: - log.warning( - 'Unable to shutdown due to %s at %s', e, traceback.extract_tb(sys.exc_info()[2])) + def __writeHeader(self): + """Writes the frame's log header""" + + self.startTime = time.time() try: - self.sendStatusReport() + print("="*59, file=self.rqlog) + print("RenderQ JobSpec %s" % time.ctime(self.startTime), "\n", file=self.rqlog) + print("proxy rqd.rqnetwork.RunningFrame/%s -t:tcp -h %s -p 10021" % ( + self.runFrame.frame_id, + self.rqCore.machine.getHostname()), file=self.rqlog) + print("%-21s%s" % ("command", self.runFrame.command), file=self.rqlog) + print("%-21s%s" % ("uid", self.runFrame.uid), file=self.rqlog) + print("%-21s%s" % ("gid", self.runFrame.gid), file=self.rqlog) + print("%-21s%s" % ("logDestination", + self.runFrame.log_dir_file), file=self.rqlog) + print("%-21s%s" % ("cwd", self.runFrame.frame_temp_dir), file=self.rqlog) + print("%-21s%s" % 
("renderHost", + self.rqCore.machine.getHostname()), file=self.rqlog) + print("%-21s%s" % ("jobId", self.runFrame.job_id), file=self.rqlog) + print("%-21s%s" % ("frameId", self.runFrame.frame_id), file=self.rqlog) + for env in sorted(self.frameEnv): + print("%-21s%s=%s" % ("env", env, self.frameEnv[env]), file=self.rqlog) + print("="*59, file=self.rqlog) + + if 'CPU_LIST' in self.runFrame.attributes: + print('Hyper-threading enabled', file=self.rqlog) + # pylint: disable=broad-except except Exception as e: log.critical( - 'Unable to send status report due to %s at %s', - e, traceback.extract_tb(sys.exc_info()[2])) + "Unable to write header to rqlog: %s due to %s at %s", + self.runFrame.log_dir_file, e, traceback.extract_tb(sys.exc_info()[2])) - def updateRss(self): - """Triggers and schedules the updating of rss information""" - if self.__cache: - try: - self.machine.rssUpdate(self.__cache) - finally: - self.updateRssThread = threading.Timer( - rqd.rqconstants.RSS_UPDATE_INTERVAL, self.updateRss) - self.updateRssThread.start() + def __writeFooter(self): + """Writes frame's log footer""" - def getFrame(self, frameId): - """Gets a frame from the cache based on frameId - @type frameId: string - @param frameId: A frame's unique Id - @rtype: rqd.rqnetwork.RunningFrame - @return: rqd.rqnetwork.RunningFrame object""" - return self.__cache[frameId] + self.endTime = time.time() + self.frameInfo.runTime = int(self.endTime - self.startTime) + try: + print("", file=self.rqlog) + print("="*59, file=self.rqlog) + print("RenderQ Job Complete\n", file=self.rqlog) + print("%-20s%s" % ("exitStatus", self.frameInfo.exitStatus), file=self.rqlog) + print("%-20s%s" % ("exitSignal", self.frameInfo.exitSignal), file=self.rqlog) + if self.frameInfo.killMessage: + print("%-20s%s" % ("killMessage", self.frameInfo.killMessage), file=self.rqlog) + print("%-20s%s" % ("startTime", + time.ctime(self.startTime)), file=self.rqlog) + print("%-20s%s" % ("endTime", + time.ctime(self.endTime)), file=self.rqlog) + print("%-20s%s" % ("maxrss", self.frameInfo.maxRss), file=self.rqlog) + print("%-20s%s" % ("maxUsedGpuMemory", + self.frameInfo.maxUsedGpuMemory), file=self.rqlog) + print("%-20s%s" % ("utime", self.frameInfo.utime), file=self.rqlog) + print("%-20s%s" % ("stime", self.frameInfo.stime), file=self.rqlog) + print("%-20s%s" % ("renderhost", self.rqCore.machine.getHostname()), file=self.rqlog) - def getFrameKeys(self): - """Gets a list of all keys from the cache - @rtype: list - @return: List of all frameIds running on host""" - return list(self.__cache.keys()) + print("%-20s%s" % ("maxrss (KB)", self.frameInfo.maxRss), file=self.rqlog) + for child in sorted(self.frameInfo.childrenProcs.items(), + key=lambda item: item[1]['start_time']): + print("\t%-20s%s" % (child[1]['name'], child[1]['rss']), file=self.rqlog) + print("\t%-20s%s" % ("start_time", + datetime.timedelta(seconds=child[1]["start_time"])), + file=self.rqlog) + print("\t%-20s%s" % ("cmdline", " ".join(child[1]["cmd_line"])), file=self.rqlog) - def storeFrame(self, frameId, runningFrame): - """Stores a frame in the cache and adds the network adapter - @type frameId: string - @param frameId: A frame's unique Id - @type runningFrame: rqd.rqnetwork.RunningFrame - @param runningFrame: rqd.rqnetwork.RunningFrame object""" - with self.__threadLock: - if frameId in self.__cache: - raise rqd.rqexceptions.RqdException( - "frameId " + frameId + " is already running on this machine") - self.__cache[frameId] = runningFrame + print("="*59, file=self.rqlog) - def 
deleteFrame(self, frameId): - """Deletes a frame from the cache - @type frameId: string - @param frameId: A frame's unique Id""" - with self.__threadLock: - if frameId in self.__cache: - del self.__cache[frameId] - # pylint: disable=no-member - if not self.__cache and self.cores.reserved_cores: - # pylint: disable=no-member - log.error( - 'No running frames but reserved_cores is not empty: %s', - self.cores.reserved_cores) - # pylint: disable=no-member - self.cores.reserved_cores.clear() - log.info("Successfully delete frame with Id: %s", frameId) - else: - log.warning("Frame with Id: %s not found in cache", frameId) + # pylint: disable=broad-except + except Exception as e: + log.critical( + "Unable to write footer: %s due to %s at %s", + self.runFrame.log_dir_file, e, traceback.extract_tb(sys.exc_info()[2])) - def killAllFrame(self, reason): - """Will execute .kill() on every frame in cache until no frames remain - @type reason: string - @param reason: Reason for requesting all frames to be killed""" + def __cleanup(self): + """Cleans up temporary files""" + rqd.rqutil.permissionsHigh() + try: + for location in self._tempLocations: + if os.path.isfile(location): + try: + os.remove(location) + # pylint: disable=broad-except + except Exception as e: + log.warning( + "Unable to delete file: %s due to %s at %s", + location, e, traceback.extract_tb(sys.exc_info()[2])) + finally: + rqd.rqutil.permissionsLow() - if self.__cache: + # Close log file + try: + self.rqlog.close() + # pylint: disable=broad-except + except Exception as e: log.warning( - "killAllFrame called due to: %s\n%s", reason, ",".join(self.getFrameKeys())) - - while self.__cache: - if reason.startswith("NIMBY"): - # Since this is a nimby kill, ignore any frames that are ignoreNimby - frameKeys = [ - frame.frameId for frame in list(self.__cache.values()) if not frame.ignoreNimby] - else: - frameKeys = list(self.__cache.keys()) - - if not frameKeys: - # No frames left to kill - return - - for frameKey in frameKeys: - try: - self.__cache[frameKey].kill(reason) - except KeyError: - pass - time.sleep(1) + "Unable to close file: %s due to %s at %s", + self.runFrame.log_file, e, traceback.extract_tb(sys.exc_info()[2])) - def releaseCores(self, reqRelease, releaseHT=None, releaseGpus=None): - """The requested number of cores are released - @type reqRelease: int - @param reqRelease: Number of cores to release, 100 = 1 physical core""" - with self.__threadLock: - # pylint: disable=no-member - self.cores.booked_cores -= reqRelease - maxRelease = (self.cores.total_cores - - self.cores.locked_cores - - self.cores.idle_cores - - self.cores.booked_cores) + def runLinux(self): + """The steps required to handle a frame under linux""" + frameInfo = self.frameInfo + runFrame = self.runFrame - if maxRelease > 0: - self.cores.idle_cores += min(maxRelease, reqRelease) - # pylint: enable=no-member + self.__createEnvVariables() + self.__writeHeader() - if releaseHT: - self.machine.releaseHT(releaseHT) + tempStatFile = "%srqd-stat-%s-%s" % (self.rqCore.machine.getTempPath(), + frameInfo.frameId, + time.time()) + self._tempLocations.append(tempStatFile) + tempCommand = [] + if self.rqCore.machine.isDesktop(): + tempCommand += ["/bin/nice"] + tempCommand += ["/usr/bin/time", "-p", "-o", tempStatFile] - if releaseGpus: - self.machine.releaseGpus(releaseGpus) + if 'CPU_LIST' in runFrame.attributes: + tempCommand += ['taskset', '-c', runFrame.attributes['CPU_LIST']] - # pylint: disable=no-member - if self.cores.idle_cores > self.cores.total_cores: - 
log.critical( - "idle_cores (%d) have become greater than total_cores (%d): %s at %s", - self.cores.idle_cores, self.cores.total_cores, sys.exc_info()[0], - traceback.extract_tb(sys.exc_info()[2])) - # pylint: enable=no-member + rqd.rqutil.permissionsHigh() + try: + if rqd.rqconstants.RQD_BECOME_JOB_USER: + tempCommand += ["/bin/su", runFrame.user_name, rqd.rqconstants.SU_ARGUMENT, + '"' + self._createCommandFile(runFrame.command) + '"'] + else: + tempCommand += [self._createCommandFile(runFrame.command)] - def shutdown(self): - """Shuts down all rqd systems""" - self.nimbyOff() - if self.onIntervalThread is not None: - self.onIntervalThread.cancel() - if self.updateRssThread is not None: - self.updateRssThread.cancel() - elif self.__reboot: - log.warning("Rebooting machine by request") - self.machine.reboot() - else: - log.warning("Shutting down RQD by request. pid(%s)", os.getpid()) - self.network.stopGrpc() - # Using sys.exit would raise SystemExit, giving exception handlers a chance - # to block this - # pylint: disable=protected-access - os._exit(0) + # pylint: disable=subprocess-popen-preexec-fn,consider-using-with + frameInfo.forkedCommand = subprocess.Popen(tempCommand, + env=self.frameEnv, + cwd=self.rqCore.machine.getTempPath(), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + close_fds=True, + preexec_fn=os.setsid) + finally: + rqd.rqutil.permissionsLow() - def handleExit(self, signalnum, flag): - """Shutdown threads and exit RQD.""" - del signalnum - del flag - self.shutdown() + frameInfo.pid = frameInfo.forkedCommand.pid - def launchFrame(self, runFrame): - """This will setup for the launch the frame specified in the arguments. - If a problem is encountered, a CueException will be thrown. - @type runFrame: RunFrame - @param runFrame: rqd_pb2.RunFrame""" - log.info("Running command %s for %s", runFrame.command, runFrame.frame_id) - log.debug(runFrame) + if not self.rqCore.updateRssThread.is_alive(): + self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, + self.rqCore.updateRss) + self.rqCore.updateRssThread.start() - # - # Check for reasons to abort launch - # + poller = select.poll() + poller.register(frameInfo.forkedCommand.stdout, select.POLLIN) + poller.register(frameInfo.forkedCommand.stderr, select.POLLIN) + while True: + for fd, event in poller.poll(): + if event & select.POLLIN: + if fd == frameInfo.forkedCommand.stdout.fileno(): + line = frameInfo.forkedCommand.stdout.readline() + elif fd == frameInfo.forkedCommand.stderr.fileno(): + line = frameInfo.forkedCommand.stderr.readline() + else: + continue + if not line: + break + self.rqlog.write(line, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + if frameInfo.forkedCommand.poll() is not None: + break - if self.machine.state != rqd.compiled_proto.host_pb2.UP: - err = "Not launching, rqd HardwareState is not Up" - log.info(err) - raise rqd.rqexceptions.CoreReservationFailureException(err) + returncode = frameInfo.forkedCommand.wait() - if self.__whenIdle: - err = "Not launching, rqd is waiting for idle to shutdown" - log.info(err) - raise rqd.rqexceptions.CoreReservationFailureException(err) + # Find exitStatus and exitSignal + if returncode < 0: + # Exited with a signal + frameInfo.exitStatus = 1 + frameInfo.exitSignal = -returncode + else: + frameInfo.exitStatus = returncode + frameInfo.exitSignal = 0 - if self.nimby.locked and not runFrame.ignore_nimby: - err = "Not launching, rqd is lockNimby and not Ignore Nimby" - log.info(err) - raise 
rqd.rqexceptions.CoreReservationFailureException(err)

+        try:
+            with open(tempStatFile, "r", encoding='utf-8') as statFile:
+                frameInfo.realtime = statFile.readline().split()[1]
+                frameInfo.utime = statFile.readline().split()[1]
+                frameInfo.stime = statFile.readline().split()[1]
+                statFile.close()
+        # pylint: disable=broad-except
+        except Exception:
+            pass  # This happens when frames are killed

-        if rqd.rqconstants.OVERRIDE_NIMBY and self.nimby.isNimbyActive():
-            err = "Not launching, rqd is lockNimby and User is Active"
-            log.info(err)
-            raise rqd.rqexceptions.CoreReservationFailureException(err)
+        self.__writeFooter()
+        self.__cleanup()

-        if runFrame.frame_id in self.__cache:
-            err = "Not launching, frame is already running on this proc %s" % runFrame.frame_id
-            log.critical(err)
-            raise rqd.rqexceptions.DuplicateFrameViolationException(err)
+    def runDocker(self):
+        """The steps required to handle a frame under a docker container"""
+        frameInfo = self.frameInfo
+        runFrame = self.runFrame

-        if runFrame.HasField("uid") and runFrame.uid <= 0:
-            err = "Not launching, will not run frame as uid=%d" % runFrame.uid
-            log.warning(err)
-            raise rqd.rqexceptions.InvalidUserException(err)
+        # TODO: implement support for multiple images
+        #  requires adding `string os = 25;` to rqd.proto/RunFrame
+        #
+        # image = self.rqCore.docker_images.get(runFrame.os)
+        # if image is None:
+        #     raise RuntimeError("rqd not configured to run an image for this frame OS: %s", runFrame.os)
+        image = self.rqCore.docker_image

-        if runFrame.num_cores <= 0:
-            err = "Not launching, numCores must be > 0"
-            log.warning(err)
-            raise rqd.rqexceptions.CoreReservationFailureException(err)
+        self.__createEnvVariables()
+        self.__writeHeader()

-        # See if all requested cores are available
-        with self.__threadLock:
-            # pylint: disable=no-member
-            if self.cores.idle_cores < runFrame.num_cores:
-                err = "Not launching, insufficient idle cores"
-                log.critical(err)
-                raise rqd.rqexceptions.CoreReservationFailureException(err)
-            # pylint: enable=no-member
+        tempStatFile = "%srqd-stat-%s-%s" % (self.rqCore.machine.getTempPath(),
+                                             frameInfo.frameId,
+                                             time.time())
+        self._tempLocations.append(tempStatFile)
+        tempCommand = []
+        if self.rqCore.machine.isDesktop():
+            tempCommand += ["/bin/nice"]
+        tempCommand += ["/usr/bin/time", "-p", "-o", tempStatFile]

-        if runFrame.environment.get('CUE_THREADABLE') == '1':
-            reserveHT = self.machine.reserveHT(runFrame.num_cores)
-            if reserveHT:
-                runFrame.attributes['CPU_LIST'] = reserveHT
+        if 'CPU_LIST' in runFrame.attributes:
+            tempCommand += ['taskset', '-c', runFrame.attributes['CPU_LIST']]

-        if runFrame.num_gpus:
-            reserveGpus = self.machine.reserveGpus(runFrame.num_gpus)
-            if reserveGpus:
-                runFrame.attributes['GPU_LIST'] = reserveGpus
+        tempCommand += [runFrame.command]

-        # They must be available at this point, reserve them
-        # pylint: disable=no-member
-        self.cores.idle_cores -= runFrame.num_cores
-        self.cores.booked_cores += runFrame.num_cores
-        # pylint: enable=no-member
+        # Print PID before executing
+        command = ["sh", "-c", "echo $$; exec " + " ".join(tempCommand)]

-        runningFrame = rqd.rqnetwork.RunningFrame(self, runFrame)
-        runningFrame.frameAttendantThread = FrameAttendantThread(self, runFrame, runningFrame)
-        runningFrame.frameAttendantThread.start()
+        client = self.rqCore.docker_client
+        container = client.containers.run(image=image,
+                                          detach=True,
+                                          environment=self.frameEnv,
+                                          working_dir=self.rqCore.machine.getTempPath(),
+                                          mounts=self.rqCore.docker_mounts,
+                                          privileged=True,
remove=True, + pid_mode="host", + stderr=True, + hostname=self.frameEnv["jobhost"], + entrypoint=command) - def getRunningFrame(self, frameId): - """Gets the currently running frame.""" - try: - return self.__cache[frameId] - except KeyError: - log.info("frameId %s is not running on this machine", frameId) - return None + log_stream = container.logs(stream=True) + # CMD prints the process PID before executing the actual command + frameInfo.pid = int(next(log_stream)) - def getCoreInfo(self): - """Gets the core info report.""" - return self.cores + if not self.rqCore.updateRssThread.is_alive(): + self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, + self.rqCore.updateRss) + self.rqCore.updateRssThread.start() - def reportStatus(self): - """Replies with hostReport""" - return self.machine.getHostReport() + for line in log_stream: + self.rqlog.write(line, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + + output = container.wait() + returncode = output["StatusCode"] + + # Find exitStatus and exitSignal + if returncode < 0: + # Exited with a signal + frameInfo.exitStatus = 1 + frameInfo.exitSignal = -returncode + else: + frameInfo.exitStatus = returncode + frameInfo.exitSignal = 0 - def shutdownRqdNow(self): - """Kill all running frames and shutdown RQD""" - self.machine.state = rqd.compiled_proto.host_pb2.DOWN try: - self.lockAll() - self.killAllFrame("shutdownRqdNow Command") + with open(tempStatFile, "r", encoding='utf-8') as statFile: + frameInfo.realtime = statFile.readline().split()[1] + frameInfo.utime = statFile.readline().split()[1] + frameInfo.stime = statFile.readline().split()[1] + statFile.close() # pylint: disable=broad-except except Exception: - log.exception("Failed to kill frames, stopping service anyways") - if not self.__cache: - self.shutdown() + pass # This happens when frames are killed - def shutdownRqdIdle(self): - """When machine is idle, shutdown RQD""" - log.info("shutdownRqdIdle") - self.lockAll() - self.__whenIdle = True - self.sendStatusReport() - if not self.__cache: - self.shutdownRqdNow() + self.__writeFooter() + self.__cleanup() - def rebootNow(self): - """Kill all running frames and reboot machine. - This is not available when a user is logged in""" - log.warning('Requested to reboot now') - if self.machine.isUserLoggedIn(): - err = ('Rebooting via RQD is not supported for a desktop machine ' - 'when a user is logged in') - log.warning(err) - raise rqd.rqexceptions.RqdException(err) - self.__reboot = True - self.shutdownRqdNow() + def runWindows(self): + """The steps required to handle a frame under windows""" + frameInfo = self.frameInfo + runFrame = self.runFrame - def rebootIdle(self): - """When machine is idle, reboot it""" - log.warning('Requested to reboot machine when idle') - self.lockAll() - self.__whenIdle = True - self.__reboot = True - self.sendStatusReport() - if not self.__cache and not self.machine.isUserLoggedIn(): - self.shutdownRqdNow() + self.__createEnvVariables() + self.__writeHeader() - def nimbyOn(self): - """Activates nimby, does not kill any running frames until next nimby - event. 
Also does not unlock until sufficient idle time is reached.""" - if self.nimby and not self.nimby.active: - try: - self.nimby.run() - log.warning("Nimby has been activated") - # pylint: disable=broad-except - except Exception: - self.nimby.locked = False - err = "Nimby is in the process of shutting down" - log.exception(err) - raise rqd.rqexceptions.RqdException(err) + try: + runFrame.command = runFrame.command.replace('%{frame}', self.frameEnv['CUE_IFRAME']) + tempCommand = [self._createCommandFile(runFrame.command)] - def nimbyOff(self): - """Deactivates nimby and unlocks any nimby lock""" - if self.nimby.active: - self.nimby.stop() - log.info("Nimby has been deactivated") + # pylint: disable=consider-using-with + frameInfo.forkedCommand = subprocess.Popen(tempCommand, + env=self.frameEnv, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + # pylint: disable=broad-except + except Exception: + log.critical( + "Failed subprocess.Popen: Due to: \n%s", + ''.join(traceback.format_exception(*sys.exc_info()))) - def onNimbyLock(self): - """This is called by nimby when it locks the machine. - All running frames are killed. - A new report is sent to the cuebot.""" - self.killAllFrame("NIMBY Triggered") - self.sendStatusReport() + frameInfo.pid = frameInfo.forkedCommand.pid - def onNimbyUnlock(self, asOf=None): - """This is called by nimby when it unlocks the machine due to sufficient - idle. A new report is sent to the cuebot. - @param asOf: Time when idle state began, if known.""" - del asOf - self.sendStatusReport() + if not self.rqCore.updateRssThread.is_alive(): + self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, + self.rqCore.updateRss) + self.rqCore.updateRssThread.start() - def lock(self, reqLock): - """Locks the requested core. - If a locked status changes, a status report is sent to the cuebot. - @type reqLock: int - @param reqLock: Number of cores to lock, 100 = 1 physical core""" - sendUpdate = False - with self.__threadLock: - # pylint: disable=no-member - numLock = min(self.cores.total_cores - self.cores.locked_cores, - reqLock) - if numLock > 0: - self.cores.locked_cores += numLock - self.cores.idle_cores -= min(numLock, self.cores.idle_cores) - sendUpdate = True - # pylint: enable=no-member + while True: + output = frameInfo.forkedCommand.stdout.readline() + if not output and frameInfo.forkedCommand.poll() is not None: + break + if output: + self.rqlog.write(output, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) - log.debug(self.cores) + frameInfo.forkedCommand.wait() - if sendUpdate: - self.sendStatusReport() + # Find exitStatus and exitSignal + returncode = frameInfo.forkedCommand.returncode + if returncode < INT32_MIN: + returncode = 303 + if returncode > INT32_MAX: + returncode = 304 + frameInfo.exitStatus = returncode + frameInfo.exitSignal = returncode - def lockAll(self): - """"Locks all cores on the machine. 
- If a locked status changes, a status report is sent.""" - sendUpdate = False - with self.__threadLock: - # pylint: disable=no-member - if self.cores.locked_cores < self.cores.total_cores: - self.cores.locked_cores = self.cores.total_cores - self.cores.idle_cores = 0 - sendUpdate = True - # pylint: enable=no-member + frameInfo.realtime = 0 + frameInfo.utime = 0 + frameInfo.stime = 0 - log.debug(self.cores) + self.__writeFooter() + self.__cleanup() - if sendUpdate: - self.sendStatusReport() + def runDarwin(self): + """The steps required to handle a frame under mac""" + frameInfo = self.frameInfo - def unlock(self, reqUnlock): - """Unlocks the requested number of cores. - Also resets reboot/shutdown/restart when idle requests. - If a locked status changes, a status report is sent to the cuebot. - @type reqUnlock: int - @param reqUnlock: Number of cores to unlock, 100 = 1 physical core""" + self.__createEnvVariables() + self.__writeHeader() - sendUpdate = False + rqd.rqutil.permissionsHigh() + try: + tempCommand = ["/usr/bin/su", frameInfo.runFrame.user_name, "-c", '"' + + self._createCommandFile(frameInfo.runFrame.command) + '"'] - if (self.__whenIdle or self.__reboot or - self.machine.state != rqd.compiled_proto.host_pb2.UP): - sendUpdate = True + # pylint: disable=subprocess-popen-preexec-fn,consider-using-with + frameInfo.forkedCommand = subprocess.Popen(tempCommand, + env=self.frameEnv, + cwd=self.rqCore.machine.getTempPath(), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + preexec_fn=os.setsid) + finally: + rqd.rqutil.permissionsLow() - self.__whenIdle = False - self.__reboot = False - self.machine.state = rqd.compiled_proto.host_pb2.UP + frameInfo.pid = frameInfo.forkedCommand.pid - with self.__threadLock: - # pylint: disable=no-member - numUnlock = min(self.cores.locked_cores, reqUnlock) - if numUnlock > 0: - self.cores.locked_cores -= numUnlock - self.cores.idle_cores += numUnlock - sendUpdate = True - # pylint: enable=no-member + if not self.rqCore.updateRssThread.is_alive(): + self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, + self.rqCore.updateRss) + self.rqCore.updateRssThread.start() - log.debug(self.cores) + while True: + output = frameInfo.forkedCommand.stdout.readline() + if not output and frameInfo.forkedCommand.poll() is not None: + break + if output: + self.rqlog.write(output, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) - if sendUpdate: - self.sendStatusReport() + frameInfo.forkedCommand.wait() - def unlockAll(self): - """"Unlocks all cores on the machine. - Also resets reboot/shutdown/restart when idle requests. 
- If a locked status changes, a status report is sent.""" + # Find exitStatus and exitSignal + returncode = frameInfo.forkedCommand.returncode + if os.WIFEXITED(returncode): + frameInfo.exitStatus = os.WEXITSTATUS(returncode) + else: + frameInfo.exitStatus = 1 + if os.WIFSIGNALED(returncode): + frameInfo.exitSignal = os.WTERMSIG(returncode) - sendUpdate = False + self.__writeFooter() + self.__cleanup() - if (self.__whenIdle or self.__reboot - or self.machine.state != rqd.compiled_proto.host_pb2.UP): - sendUpdate = True + def runUnknown(self): + """The steps required to handle a frame under an unknown OS.""" - self.__whenIdle = False - self.__reboot = False - self.machine.state = rqd.compiled_proto.host_pb2.UP + def run(self): + """Thread initialization""" + log.info("Monitor frame started for frameId=%s", self.frameId) - with self.__threadLock: - # pylint: disable=no-member - if self.cores.locked_cores > 0: - if not self.nimby.locked: - self.cores.idle_cores += self.cores.locked_cores - self.cores.locked_cores = 0 - sendUpdate = True - # pylint: enable=no-member + runFrame = self.runFrame - log.debug(self.cores) + # pylint: disable=too-many-nested-blocks + try: + runFrame.job_temp_dir = os.path.join(self.rqCore.machine.getTempPath(), + runFrame.job_name) + runFrame.frame_temp_dir = os.path.join(runFrame.job_temp_dir, + runFrame.frame_name) + runFrame.log_file = "%s.%s.rqlog" % (runFrame.job_name, + runFrame.frame_name) + runFrame.log_dir_file = os.path.join(runFrame.log_dir, runFrame.log_file) - if sendUpdate: - self.sendStatusReport() + try: # Exception block for all exceptions + # Ensure permissions return to Low after this block + try: + if rqd.rqconstants.RQD_CREATE_USER_IF_NOT_EXISTS and runFrame.HasField("uid"): + rqd.rqutil.checkAndCreateUser(runFrame.user_name, + runFrame.uid, + runFrame.gid) + # Do everything as launching user: + runFrame.gid = rqd.rqconstants.LAUNCH_FRAME_USER_GID + rqd.rqutil.permissionsUser(runFrame.uid, runFrame.gid) - def sendStatusReport(self): - """Sends the current host report to Cuebot.""" - self.network.reportStatus(self.machine.getHostReport()) + # Setup frame logging + try: + self.rqlog = rqd.rqlogging.RqdLogger(runFrame.log_dir_file) + self.rqlog.waitForFile() + # pylint: disable=broad-except + except Exception as e: + err = "Unable to write to %s due to %s" % (runFrame.log_dir_file, e) + raise RuntimeError(err) - def isWaitingForIdle(self): - """Returns whether the host is waiting until idle to take some action.""" - return self.__whenIdle + finally: + rqd.rqutil.permissionsLow() - def sendFrameCompleteReport(self, runningFrame): - """Send a frameCompleteReport to Cuebot""" - if not runningFrame.completeReportSent: - report = rqd.compiled_proto.report_pb2.FrameCompleteReport() - # pylint: disable=no-member - report.host.CopyFrom(self.machine.getHostInfo()) - report.frame.CopyFrom(runningFrame.runningFrameInfo()) - # pylint: enable=no-member + # Store frame in cache and register servant + self.rqCore.storeFrame(runFrame.frame_id, self.frameInfo) - if runningFrame.exitStatus is None: - report.exit_status = 1 - else: - report.exit_status = runningFrame.exitStatus + if platform.system() == "Linux" and self.rqCore.docker_client is not None: + self.runDocker() + elif platform.system() == "Linux": + self.runLinux() + elif platform.system() == "Windows": + self.runWindows() + elif platform.system() == "Darwin": + self.runDarwin() + else: + self.runUnknown() - report.exit_signal = runningFrame.exitSignal - report.run_time = int(runningFrame.runTime) + # 
pylint: disable=broad-except + except Exception: + log.critical( + "Failed launchFrame: For %s due to: \n%s", + runFrame.frame_id, ''.join(traceback.format_exception(*sys.exc_info()))) + # Notifies the cuebot that there was an error launching + self.frameInfo.exitStatus = rqd.rqconstants.EXITSTATUS_FOR_FAILED_LAUNCH + # Delay keeps the cuebot from spamming failing booking requests + time.sleep(10) + finally: + self.rqCore.releaseCores(self.runFrame.num_cores, runFrame.attributes.get('CPU_LIST'), + runFrame.attributes.get('GPU_LIST') + if 'GPU_LIST' in self.runFrame.attributes else None) - # If nimby is active, then frame must have been killed by nimby - # Set the exitSignal to indicate this event - if self.nimby.locked and not runningFrame.ignoreNimby: - report.exit_status = rqd.rqconstants.EXITSTATUS_FOR_NIMBY_KILL + self.rqCore.deleteFrame(self.runFrame.frame_id) - self.network.reportRunningFrameCompletion(report) - runningFrame.completeReportSent = True + self.rqCore.sendFrameCompleteReport(self.frameInfo) + time_till_next = ( + (self.rqCore.intervalStartTime + self.rqCore.intervalSleepTime) - time.time()) + if time_till_next > (2 * rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC): + self.rqCore.onIntervalThread.cancel() + self.rqCore.onInterval(rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC) - def sanitizeFrames(self): - """ - Iterate over the cache and update the status of frames that might have - completed but never reported back to cuebot. - """ - for frameId, runningFrame in self.__cache.items(): - # If the frame was marked as completed (exitStatus) and a report has not been sent - # try to file the report again - if runningFrame.exitStatus is not None and not runningFrame.completeReportSent: - try: - self.sendFrameCompleteReport(runningFrame) - self.deleteFrame(frameId) - log.info("Successfully deleted frame from cache for %s/%s (%s)", - runningFrame.runFrame.job_name, - runningFrame.runFrame.frame_name, - frameId) - # pylint: disable=broad-except - except Exception: - log.exception("Failed to sanitize frame %s/%s", - runningFrame.runFrame.job_name, - runningFrame.runFrame.frame_name) + log.info("Monitor frame ended for frameId=%s", + self.runFrame.frame_id) From 7582d1a15dfafc803c6240d64642c462a267775a Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Wed, 16 Oct 2024 16:26:26 -0700 Subject: [PATCH 02/51] [EXPERIMENT] Add runDocker mode to rqd (#1541) When RUN_ON_DOCKER is set on rqd.conf, each frame will be launched as a docker container using the base image configured as DOCKER_IMAGE. 
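For reference, a minimal, self-contained sketch of the flow this mode relies on: build a client from the host environment, turn a docker-run style mount string into a docker.types.Mount, then launch the frame command as a detached container, read its PID from the first log line, and collect the exit code with wait(). All names and values below (image tag, mount string, frame command, environment) are placeholders for illustration, not the patch's literal code or API.

    # Illustrative sketch only; assumes the docker SDK (docker==7.x) is installed.
    import docker
    import docker.types


    def parse_mount(mount_str):
        """Parse a docker-run style mount string into a Mount object."""
        # e.g. "type=bind,source=/tmp,target=/tmp,bind-propagation=slave"
        fields = dict(item.split("=", 1) for item in mount_str.split(","))
        return docker.types.Mount(
            fields["target"],
            fields["source"],
            type=fields.get("type", "bind"),
            propagation=fields.get("bind-propagation"),
        )


    def run_frame_in_container(image, frame_env, frame_command, mounts):
        """Run one frame command inside a container, return (pid, exit code)."""
        client = docker.from_env()
        # Echo the shell PID first so the caller can record it, then exec the frame.
        entrypoint = ["sh", "-c", "echo $$; exec " + frame_command]
        container = client.containers.run(
            image=image,
            detach=True,
            environment=frame_env,
            mounts=mounts,
            entrypoint=entrypoint,
        )
        log_stream = container.logs(stream=True)
        # First line of output is the PID echoed by the entrypoint above.
        pid = int(next(log_stream).splitlines()[0])
        for chunk in log_stream:
            # Remaining output is the frame's own log.
            print(chunk.decode(errors="replace"), end="")
        result = container.wait()  # e.g. {"StatusCode": 0}
        return pid, result["StatusCode"]


    if __name__ == "__main__":
        mount = parse_mount("type=bind,source=/tmp,target=/tmp,bind-propagation=slave")
        pid, status = run_frame_in_container(
            image="my-rqd-base-image:latest",  # placeholder, stands in for DOCKER_IMAGE
            frame_env={"show": "testing", "shot": "shot01"},
            frame_command="echo hello-from-frame",
            mounts=[mount],
        )
        print("frame pid=%d exit=%d" % (pid, status))
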
--- requirements.txt | 4 +- rqd/rqd.example.conf | 12 + rqd/rqd/rqconstants.py | 41 + rqd/rqd/rqcore.py | 1854 +++++++++++++++++++++------------------- 4 files changed, 1030 insertions(+), 881 deletions(-) diff --git a/requirements.txt b/requirements.txt index cceee9237..dc0f8d570 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,6 @@ six==1.16.0 # Optional requirements # Sentry support for rqd -sentry-sdk==2.11.0 \ No newline at end of file +sentry-sdk==2.11.0 + +docker==7.1.0 \ No newline at end of file diff --git a/rqd/rqd.example.conf b/rqd/rqd.example.conf index e51782272..78c9cfdab 100644 --- a/rqd/rqd.example.conf +++ b/rqd/rqd.example.conf @@ -27,3 +27,15 @@ SYSTEMDRIVE MAYA_MODULE_PATH MAYA_SCRIPT_PATH PIXAR_LICENSE_FILE + +[docker.config] +DOCKER_IMAGE="" +RUN_ON_DOCKER=False + +[docker.mounts] +MCP="type=bind,source=/mcp,target=/mcp,bind-propagation=slave" +NET="type=bind,source=/net,target=/net,bind-propagation=slave" +TMP="type=bind,source=/tmp,target=/tmp,bind-propagation=slave" +SCRATCH="type=bind,source=/scratch,target=/scratch,bind-propagation=slave" +LIMITS="type=bind,source=/etc/security/limits.d/,target=/etc/security/limits.d/,bind-propagation=slave" +FUSE="type=bind,source=/dev/fuse,target=/dev/fuse,bind-propagation=shared" \ No newline at end of file diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py index 6f23ebc89..54239d321 100644 --- a/rqd/rqd/rqconstants.py +++ b/rqd/rqd/rqconstants.py @@ -153,11 +153,18 @@ SP_OS = platform.system() +# Docker mode config +RUN_ON_DOCKER = False +DOCKER_IMAGE = "Invalid" +DOCKER_MOUNTS = [] + try: if os.path.isfile(CONFIG_FILE): # Hostname can come from here: rqutil.getHostname() __override_section = "Override" __host_env_var_section = "UseHostEnvVar" + __docker_mounts = "docker.mounts" + __docker_config = "docker.config" import six from six.moves import configparser if six.PY2: @@ -230,6 +237,40 @@ if config.has_section(__host_env_var_section): RQD_HOST_ENV_VARS = config.options(__host_env_var_section) + if config.has_section(__docker_config): + RUN_ON_DOCKER = config.getboolean(__docker_config, "RUN_ON_DOCKER") + if RUN_ON_DOCKER: + import docker + import docker.models + import docker.types + + def parse_mount(mount_str): + """ + Parse mount definitions similar to a docker run command into a docker + mount obj + + Format: type=bind,source=/tmp,target=/tmp,bind-propagation=slave + """ + mount_dict = {} + # bind-propagation defaults to None as only type=bind accepts it + mount_dict["bind-propagation"] = None + for item in mount_str.split(","): + key, value = item.split("=") + mount_dic[key.strip()] = value.strip() + return mount_dic + + DOCKER_IMAGE = config.get(__docker_config, "DOCKER_IMAGE") + # Parse values under the category docker.mounts into Mount objects + mounts = config.options(__docker_mounts) + for mount_name in mounts: + mount_str = config.get(__docker_mounts, mount_name) + mount_dic = parse_mount(mount_str) + mount = docker.types.Mount(mount_dic["target"], + mount_dic["source"], + type=mount_dic["type"], + propagation=mount_dic["bind-propagation"]) + DOCKER_MOUNTS.append(mount) + # pylint: disable=broad-except except Exception as e: logging.warning( diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 5b85efe75..4bb0de433 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -51,1059 +51,1153 @@ log = logging.getLogger(__name__) -class FrameAttendantThread(threading.Thread): - """Once a frame has been received and checked by RQD, this class handles - the launching, waiting on, and 
cleanup work related to running the - frame.""" - def __init__(self, rqCore, runFrame, frameInfo): - """FrameAttendantThread class initialization - @type rqCore: RqCore - @param rqCore: Main RQD Object - @type runFrame: RunFrame - @param runFrame: rqd_pb2.RunFrame - @type frameInfo: rqd.rqnetwork.RunningFrame - @param frameInfo: Servant for running frame - """ - threading.Thread.__init__(self) - self.rqCore = rqCore - self.frameId = runFrame.frame_id - self.runFrame = runFrame - self.startTime = 0 - self.endTime = 0 - self.frameInfo = frameInfo - self._tempLocations = [] - self.rqlog = None +class RqCore(object): + """Main body of RQD, handles the integration of all components, + the setup and launching of a frame and acts on all gRPC calls + that are passed from the Network module.""" - def __createEnvVariables(self): - """Define the environmental variables for the frame""" - # If linux specific, they need to move into self.runLinux() - # pylint: disable=attribute-defined-outside-init - self.frameEnv = {} - self.frameEnv["PATH"] = self.rqCore.machine.getPathEnv() - self.frameEnv["TERM"] = "unknown" - self.frameEnv["TZ"] = self.rqCore.machine.getTimezone() - self.frameEnv["USER"] = self.runFrame.user_name - self.frameEnv["LOGNAME"] = self.runFrame.user_name - self.frameEnv["mcp"] = "1" - self.frameEnv["show"] = self.runFrame.show - self.frameEnv["shot"] = self.runFrame.shot - self.frameEnv["jobid"] = self.runFrame.job_name - self.frameEnv["jobhost"] = self.rqCore.machine.getHostname() - self.frameEnv["frame"] = self.runFrame.frame_name - self.frameEnv["zframe"] = self.runFrame.frame_name - self.frameEnv["logfile"] = self.runFrame.log_file - self.frameEnv["maxframetime"] = "0" - self.frameEnv["minspace"] = "200" - self.frameEnv["CUE3"] = "True" - self.frameEnv["CUE_GPU_MEMORY"] = str(self.rqCore.machine.getGpuMemoryFree()) - self.frameEnv["SP_NOMYCSHRC"] = "1" + def __init__(self, optNimbyoff=False): + """RqCore class initialization""" + self.__whenIdle = False + self.__reboot = False - if platform.system() == "Windows": - for variable in ["SYSTEMROOT", "APPDATA", "TMP", "COMMONPROGRAMFILES", "SYSTEMDRIVE"]: - if variable in os.environ: - self.frameEnv[variable] = os.environ[variable] - for variable in rqd.rqconstants.RQD_HOST_ENV_VARS: - # Fallback to empty string, easy to spot what is missing in the log - self.frameEnv[variable] = os.environ.get(variable, '') + self.__optNimbyoff = optNimbyoff - for key, value in self.runFrame.environment.items(): - if key == 'PATH': - self.frameEnv[key] += os.pathsep + value - else: - self.frameEnv[key] = value + self.cores = rqd.compiled_proto.report_pb2.CoreDetail( + total_cores=0, + idle_cores=0, + locked_cores=0, + booked_cores=0, + reserved_cores=[], + ) - # Add threads to use all assigned hyper-threading cores - if 'CPU_LIST' in self.runFrame.attributes and 'CUE_THREADS' in self.frameEnv: - self.frameEnv['CUE_THREADS'] = str(max( - int(self.frameEnv['CUE_THREADS']), - len(self.runFrame.attributes['CPU_LIST'].split(',')))) - self.frameEnv['CUE_HT'] = "True" + self.nimby = rqd.rqnimby.NimbyFactory.getNimby(self) - # Add GPU's to use all assigned GPU cores - if 'GPU_LIST' in self.runFrame.attributes: - self.frameEnv['CUE_GPU_CORES'] = self.runFrame.attributes['GPU_LIST'] + self.machine = rqd.rqmachine.Machine(self, self.cores) - # pylint: disable=inconsistent-return-statements - def _createCommandFile(self, command): - """Creates a file that subprocess. Popen then executes. 
- @type command: string - @param command: The command specified in the runFrame request - @rtype: string - @return: Command file location""" - # TODO: this should use tempfile to create the files and clean them up afterwards - try: - if platform.system() == "Windows": - rqd_tmp_dir = os.path.join(tempfile.gettempdir(), 'rqd') - try: - os.mkdir(rqd_tmp_dir) - except OSError: - pass # okay, already exists + self.network = rqd.rqnetwork.Network(self) + self.__threadLock = threading.Lock() + self.__cache = {} - # Windows Batch needs some characters escaped: - command = command.replace('%', '%%') - for char in '^&<>|': - command = command.replace(char, '^' + char) + self.updateRssThread = None + self.onIntervalThread = None + self.intervalStartTime = None + self.intervalSleepTime = rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC - commandFile = os.path.join( - rqd_tmp_dir, - 'cmd-%s-%s.bat' % (self.runFrame.frame_id, time.time())) + # pylint: disable=unused-private-member + self.__cluster = None + self.__session = None + self.__stmt = None + + self.docker_client = None + self.docker_mounts = [] + self.docker_image = "Invalid" + if rqd.rqconstants.RUN_ON_DOCKER: + import docker + self.docker_client = docker.from_env() + self.docker_image = rqd.rqconstants.DOCKER_IMAGE + self.docker_mounts = rqd.rqconstants.DOCKER_MOUNTS + + signal.signal(signal.SIGINT, self.handleExit) + signal.signal(signal.SIGTERM, self.handleExit) + + def start(self): + """Called by main to start the rqd service""" + if self.machine.isDesktop(): + if self.__optNimbyoff: + log.warning('Nimby startup has been disabled via --nimbyoff') + elif not rqd.rqconstants.OVERRIDE_NIMBY: + if rqd.rqconstants.OVERRIDE_NIMBY is None: + log.warning('OVERRIDE_NIMBY is not defined, Nimby startup has been disabled') + else: + log.warning('OVERRIDE_NIMBY is False, Nimby startup has been disabled') else: - commandFile = os.path.join(tempfile.gettempdir(), - 'rqd-cmd-%s-%s' % (self.runFrame.frame_id, time.time())) - with open(commandFile, "w", encoding='utf-8') as rqexe: - self._tempLocations.append(commandFile) - rqexe.write(command) - rqexe.close() - os.chmod(commandFile, 0o777) - return commandFile - # pylint: disable=broad-except - except Exception as e: - log.critical( - "Unable to make command file: %s due to %s at %s", - commandFile, e, traceback.extract_tb(sys.exc_info()[2])) + self.nimbyOn() + elif rqd.rqconstants.OVERRIDE_NIMBY: + log.warning('Nimby startup has been triggered by OVERRIDE_NIMBY') + self.nimbyOn() + self.network.start_grpc() - def __writeHeader(self): - """Writes the frame's log header""" + def grpcConnected(self): + """After gRPC connects to the cuebot, this function is called""" + self.network.reportRqdStartup(self.machine.getBootReport()) - self.startTime = time.time() + self.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, self.updateRss) + self.updateRssThread.start() - try: - print("="*59, file=self.rqlog) - print("RenderQ JobSpec %s" % time.ctime(self.startTime), "\n", file=self.rqlog) - print("proxy rqd.rqnetwork.RunningFrame/%s -t:tcp -h %s -p 10021" % ( - self.runFrame.frame_id, - self.rqCore.machine.getHostname()), file=self.rqlog) - print("%-21s%s" % ("command", self.runFrame.command), file=self.rqlog) - print("%-21s%s" % ("uid", self.runFrame.uid), file=self.rqlog) - print("%-21s%s" % ("gid", self.runFrame.gid), file=self.rqlog) - print("%-21s%s" % ("logDestination", - self.runFrame.log_dir_file), file=self.rqlog) - print("%-21s%s" % ("cwd", self.runFrame.frame_temp_dir), file=self.rqlog) - 
print("%-21s%s" % ("renderHost", - self.rqCore.machine.getHostname()), file=self.rqlog) - print("%-21s%s" % ("jobId", self.runFrame.job_id), file=self.rqlog) - print("%-21s%s" % ("frameId", self.runFrame.frame_id), file=self.rqlog) - for env in sorted(self.frameEnv): - print("%-21s%s=%s" % ("env", env, self.frameEnv[env]), file=self.rqlog) - print("="*59, file=self.rqlog) + self.onIntervalThread = threading.Timer(self.intervalSleepTime, self.onInterval) + self.intervalStartTime = time.time() + self.onIntervalThread.start() - if 'CPU_LIST' in self.runFrame.attributes: - print('Hyper-threading enabled', file=self.rqlog) + log.warning('RQD Started') + + def onInterval(self, sleepTime=None): + """This is called by self.grpcConnected as a timer thread to execute + every interval""" + if sleepTime is None: + self.intervalSleepTime = random.randint( + rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC, + rqd.rqconstants.RQD_MAX_PING_INTERVAL_SEC) + else: + self.intervalSleepTime = sleepTime + try: + self.onIntervalThread = threading.Timer(self.intervalSleepTime, self.onInterval) + self.intervalStartTime = time.time() + self.onIntervalThread.start() # pylint: disable=broad-except except Exception as e: log.critical( - "Unable to write header to rqlog: %s due to %s at %s", - self.runFrame.log_dir_file, e, traceback.extract_tb(sys.exc_info()[2])) - - def __writeFooter(self): - """Writes frame's log footer""" + 'Unable to schedule a ping due to %s at %s', + e, traceback.extract_tb(sys.exc_info()[2])) - self.endTime = time.time() - self.frameInfo.runTime = int(self.endTime - self.startTime) try: - print("", file=self.rqlog) - print("="*59, file=self.rqlog) - print("RenderQ Job Complete\n", file=self.rqlog) - print("%-20s%s" % ("exitStatus", self.frameInfo.exitStatus), file=self.rqlog) - print("%-20s%s" % ("exitSignal", self.frameInfo.exitSignal), file=self.rqlog) - if self.frameInfo.killMessage: - print("%-20s%s" % ("killMessage", self.frameInfo.killMessage), file=self.rqlog) - print("%-20s%s" % ("startTime", - time.ctime(self.startTime)), file=self.rqlog) - print("%-20s%s" % ("endTime", - time.ctime(self.endTime)), file=self.rqlog) - print("%-20s%s" % ("maxrss", self.frameInfo.maxRss), file=self.rqlog) - print("%-20s%s" % ("maxUsedGpuMemory", - self.frameInfo.maxUsedGpuMemory), file=self.rqlog) - print("%-20s%s" % ("utime", self.frameInfo.utime), file=self.rqlog) - print("%-20s%s" % ("stime", self.frameInfo.stime), file=self.rqlog) - print("%-20s%s" % ("renderhost", self.rqCore.machine.getHostname()), file=self.rqlog) - - print("%-20s%s" % ("maxrss (KB)", self.frameInfo.maxRss), file=self.rqlog) - for child in sorted(self.frameInfo.childrenProcs.items(), - key=lambda item: item[1]['start_time']): - print("\t%-20s%s" % (child[1]['name'], child[1]['rss']), file=self.rqlog) - print("\t%-20s%s" % ("start_time", - datetime.timedelta(seconds=child[1]["start_time"])), - file=self.rqlog) - print("\t%-20s%s" % ("cmdline", " ".join(child[1]["cmd_line"])), file=self.rqlog) - - print("="*59, file=self.rqlog) + if self.__whenIdle and not self.__cache: + if not self.machine.isUserLoggedIn(): + self.shutdownRqdNow() + else: + log.warning('Shutdown requested but a user is logged in.') + # pylint: disable=broad-except + except Exception as e: + log.warning( + 'Unable to shutdown due to %s at %s', e, traceback.extract_tb(sys.exc_info()[2])) + try: + self.sendStatusReport() # pylint: disable=broad-except except Exception as e: log.critical( - "Unable to write footer: %s due to %s at %s", - self.runFrame.log_dir_file, e, 
traceback.extract_tb(sys.exc_info()[2])) + 'Unable to send status report due to %s at %s', + e, traceback.extract_tb(sys.exc_info()[2])) - def __cleanup(self): - """Cleans up temporary files""" - rqd.rqutil.permissionsHigh() - try: - for location in self._tempLocations: - if os.path.isfile(location): - try: - os.remove(location) - # pylint: disable=broad-except - except Exception as e: - log.warning( - "Unable to delete file: %s due to %s at %s", - location, e, traceback.extract_tb(sys.exc_info()[2])) - finally: - rqd.rqutil.permissionsLow() + def updateRss(self): + """Triggers and schedules the updating of rss information""" + if self.__cache: + try: + self.machine.rssUpdate(self.__cache) + finally: + self.updateRssThread = threading.Timer( + rqd.rqconstants.RSS_UPDATE_INTERVAL, self.updateRss) + self.updateRssThread.start() - # Close log file - try: - self.rqlog.close() - # pylint: disable=broad-except - except Exception as e: - log.warning( - "Unable to close file: %s due to %s at %s", - self.runFrame.log_file, e, traceback.extract_tb(sys.exc_info()[2])) + def getFrame(self, frameId): + """Gets a frame from the cache based on frameId + @type frameId: string + @param frameId: A frame's unique Id + @rtype: rqd.rqnetwork.RunningFrame + @return: rqd.rqnetwork.RunningFrame object""" + return self.__cache[frameId] - def runLinux(self): - """The steps required to handle a frame under linux""" - frameInfo = self.frameInfo - runFrame = self.runFrame + def getFrameKeys(self): + """Gets a list of all keys from the cache + @rtype: list + @return: List of all frameIds running on host""" + return list(self.__cache.keys()) - self.__createEnvVariables() - self.__writeHeader() + def storeFrame(self, frameId, runningFrame): + """Stores a frame in the cache and adds the network adapter + @type frameId: string + @param frameId: A frame's unique Id + @type runningFrame: rqd.rqnetwork.RunningFrame + @param runningFrame: rqd.rqnetwork.RunningFrame object""" + with self.__threadLock: + if frameId in self.__cache: + raise rqd.rqexceptions.RqdException( + "frameId " + frameId + " is already running on this machine") + self.__cache[frameId] = runningFrame - tempStatFile = "%srqd-stat-%s-%s" % (self.rqCore.machine.getTempPath(), - frameInfo.frameId, - time.time()) - self._tempLocations.append(tempStatFile) - tempCommand = [] - if self.rqCore.machine.isDesktop(): - tempCommand += ["/bin/nice"] - tempCommand += ["/usr/bin/time", "-p", "-o", tempStatFile] + def deleteFrame(self, frameId): + """Deletes a frame from the cache + @type frameId: string + @param frameId: A frame's unique Id""" + with self.__threadLock: + if frameId in self.__cache: + del self.__cache[frameId] + # pylint: disable=no-member + if not self.__cache and self.cores.reserved_cores: + # pylint: disable=no-member + log.error( + 'No running frames but reserved_cores is not empty: %s', + self.cores.reserved_cores) + # pylint: disable=no-member + self.cores.reserved_cores.clear() + log.info("Successfully delete frame with Id: %s", frameId) + else: + log.warning("Frame with Id: %s not found in cache", frameId) - if 'CPU_LIST' in runFrame.attributes: - tempCommand += ['taskset', '-c', runFrame.attributes['CPU_LIST']] + def killAllFrame(self, reason): + """Will execute .kill() on every frame in cache until no frames remain + @type reason: string + @param reason: Reason for requesting all frames to be killed""" - rqd.rqutil.permissionsHigh() - try: - if rqd.rqconstants.RQD_BECOME_JOB_USER: - tempCommand += ["/bin/su", runFrame.user_name, 
rqd.rqconstants.SU_ARGUMENT, - '"' + self._createCommandFile(runFrame.command) + '"'] + if self.__cache: + log.warning( + "killAllFrame called due to: %s\n%s", reason, ",".join(self.getFrameKeys())) + + while self.__cache: + if reason.startswith("NIMBY"): + # Since this is a nimby kill, ignore any frames that are ignoreNimby + frameKeys = [ + frame.frameId for frame in list(self.__cache.values()) if not frame.ignoreNimby] else: - tempCommand += [self._createCommandFile(runFrame.command)] + frameKeys = list(self.__cache.keys()) - # pylint: disable=subprocess-popen-preexec-fn,consider-using-with - frameInfo.forkedCommand = subprocess.Popen(tempCommand, - env=self.frameEnv, - cwd=self.rqCore.machine.getTempPath(), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - close_fds=True, - preexec_fn=os.setsid) - finally: - rqd.rqutil.permissionsLow() + if not frameKeys: + # No frames left to kill + return - frameInfo.pid = frameInfo.forkedCommand.pid + for frameKey in frameKeys: + try: + self.__cache[frameKey].kill(reason) + except KeyError: + pass + time.sleep(1) - if not self.rqCore.updateRssThread.is_alive(): - self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, - self.rqCore.updateRss) - self.rqCore.updateRssThread.start() + def releaseCores(self, reqRelease, releaseHT=None, releaseGpus=None): + """The requested number of cores are released + @type reqRelease: int + @param reqRelease: Number of cores to release, 100 = 1 physical core""" + with self.__threadLock: + # pylint: disable=no-member + self.cores.booked_cores -= reqRelease + maxRelease = (self.cores.total_cores - + self.cores.locked_cores - + self.cores.idle_cores - + self.cores.booked_cores) - poller = select.poll() - poller.register(frameInfo.forkedCommand.stdout, select.POLLIN) - poller.register(frameInfo.forkedCommand.stderr, select.POLLIN) - while True: - for fd, event in poller.poll(): - if event & select.POLLIN: - if fd == frameInfo.forkedCommand.stdout.fileno(): - line = frameInfo.forkedCommand.stdout.readline() - elif fd == frameInfo.forkedCommand.stderr.fileno(): - line = frameInfo.forkedCommand.stderr.readline() - else: - continue - if not line: - break - self.rqlog.write(line, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) - if frameInfo.forkedCommand.poll() is not None: - break + if maxRelease > 0: + self.cores.idle_cores += min(maxRelease, reqRelease) + # pylint: enable=no-member - returncode = frameInfo.forkedCommand.wait() + if releaseHT: + self.machine.releaseHT(releaseHT) - # Find exitStatus and exitSignal - if returncode < 0: - # Exited with a signal - frameInfo.exitStatus = 1 - frameInfo.exitSignal = -returncode + if releaseGpus: + self.machine.releaseGpus(releaseGpus) + + # pylint: disable=no-member + if self.cores.idle_cores > self.cores.total_cores: + log.critical( + "idle_cores (%d) have become greater than total_cores (%d): %s at %s", + self.cores.idle_cores, self.cores.total_cores, sys.exc_info()[0], + traceback.extract_tb(sys.exc_info()[2])) + # pylint: enable=no-member + + def shutdown(self): + """Shuts down all rqd systems""" + self.nimbyOff() + if self.onIntervalThread is not None: + self.onIntervalThread.cancel() + if self.updateRssThread is not None: + self.updateRssThread.cancel() + elif self.__reboot: + log.warning("Rebooting machine by request") + self.machine.reboot() else: - frameInfo.exitStatus = returncode - frameInfo.exitSignal = 0 + log.warning("Shutting down RQD by request. 
pid(%s)", os.getpid()) + self.network.stopGrpc() + # Using sys.exit would raise SystemExit, giving exception handlers a chance + # to block this + # pylint: disable=protected-access + os._exit(0) - try: - with open(tempStatFile, "r", encoding='utf-8') as statFile: - frameInfo.realtime = statFile.readline().split()[1] - frameInfo.utime = statFile.readline().split()[1] - frameInfo.stime = statFile.readline().split()[1] - statFile.close() - # pylint: disable=broad-except - except Exception: - pass # This happens when frames are killed + def handleExit(self, signalnum, flag): + """Shutdown threads and exit RQD.""" + del signalnum + del flag + self.shutdown() - self.__writeFooter() - self.__cleanup() + def launchFrame(self, runFrame): + """This will setup for the launch the frame specified in the arguments. + If a problem is encountered, a CueException will be thrown. + @type runFrame: RunFrame + @param runFrame: rqd_pb2.RunFrame""" + log.info("Running command %s for %s", runFrame.command, runFrame.frame_id) + log.debug(runFrame) - def runWindows(self): - """The steps required to handle a frame under windows""" - frameInfo = self.frameInfo - runFrame = self.runFrame + # + # Check for reasons to abort launch + # - self.__createEnvVariables() - self.__writeHeader() + if self.machine.state != rqd.compiled_proto.host_pb2.UP: + err = "Not launching, rqd HardwareState is not Up" + log.info(err) + raise rqd.rqexceptions.CoreReservationFailureException(err) - try: - runFrame.command = runFrame.command.replace('%{frame}', self.frameEnv['CUE_IFRAME']) - tempCommand = [self._createCommandFile(runFrame.command)] + if self.__whenIdle: + err = "Not launching, rqd is waiting for idle to shutdown" + log.info(err) + raise rqd.rqexceptions.CoreReservationFailureException(err) - # pylint: disable=consider-using-with - frameInfo.forkedCommand = subprocess.Popen(tempCommand, - env=self.frameEnv, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - # pylint: disable=broad-except - except Exception: - log.critical( - "Failed subprocess.Popen: Due to: \n%s", - ''.join(traceback.format_exception(*sys.exc_info()))) + if self.nimby.locked and not runFrame.ignore_nimby: + err = "Not launching, rqd is lockNimby and not Ignore Nimby" + log.info(err) + raise rqd.rqexceptions.CoreReservationFailureException(err) - frameInfo.pid = frameInfo.forkedCommand.pid + if rqd.rqconstants.OVERRIDE_NIMBY and self.nimby.isNimbyActive(): + err = "Not launching, rqd is lockNimby and User is Active" + log.info(err) + raise rqd.rqexceptions.CoreReservationFailureException(err) - if not self.rqCore.updateRssThread.is_alive(): - self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, - self.rqCore.updateRss) - self.rqCore.updateRssThread.start() + if runFrame.frame_id in self.__cache: + err = "Not launching, frame is already running on this proc %s" % runFrame.frame_id + log.critical(err) + raise rqd.rqexceptions.DuplicateFrameViolationException(err) + + if runFrame.HasField("uid") and runFrame.uid <= 0: + err = "Not launching, will not run frame as uid=%d" % runFrame.uid + log.warning(err) + raise rqd.rqexceptions.InvalidUserException(err) + + if runFrame.num_cores <= 0: + err = "Not launching, numCores must be > 0" + log.warning(err) + raise rqd.rqexceptions.CoreReservationFailureException(err) - while True: - output = frameInfo.forkedCommand.stdout.readline() - if not output and frameInfo.forkedCommand.poll() is not None: - break - if output: - self.rqlog.write(output, 
prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + # See if all requested cores are available + with self.__threadLock: + # pylint: disable=no-member + if self.cores.idle_cores < runFrame.num_cores: + err = "Not launching, insufficient idle cores" + log.critical(err) + raise rqd.rqexceptions.CoreReservationFailureException(err) + # pylint: enable=no-member - frameInfo.forkedCommand.wait() + if runFrame.environment.get('CUE_THREADABLE') == '1': + reserveHT = self.machine.reserveHT(runFrame.num_cores) + if reserveHT: + runFrame.attributes['CPU_LIST'] = reserveHT - # Find exitStatus and exitSignal - returncode = frameInfo.forkedCommand.returncode - if returncode < INT32_MIN: - returncode = 303 - if returncode > INT32_MAX: - returncode = 304 - frameInfo.exitStatus = returncode - frameInfo.exitSignal = returncode + if runFrame.num_gpus: + reserveGpus = self.machine.reserveGpus(runFrame.num_gpus) + if reserveGpus: + runFrame.attributes['GPU_LIST'] = reserveGpus - frameInfo.realtime = 0 - frameInfo.utime = 0 - frameInfo.stime = 0 + # They must be available at this point, reserve them + # pylint: disable=no-member + self.cores.idle_cores -= runFrame.num_cores + self.cores.booked_cores += runFrame.num_cores + # pylint: enable=no-member - self.__writeFooter() - self.__cleanup() + runningFrame = rqd.rqnetwork.RunningFrame(self, runFrame) + runningFrame.frameAttendantThread = FrameAttendantThread(self, runFrame, runningFrame) + runningFrame.frameAttendantThread.start() - def runDarwin(self): - """The steps required to handle a frame under mac""" - frameInfo = self.frameInfo + def getRunningFrame(self, frameId): + """Gets the currently running frame.""" + try: + return self.__cache[frameId] + except KeyError: + log.info("frameId %s is not running on this machine", frameId) + return None - self.__createEnvVariables() - self.__writeHeader() + def getCoreInfo(self): + """Gets the core info report.""" + return self.cores - rqd.rqutil.permissionsHigh() + def reportStatus(self): + """Replies with hostReport""" + return self.machine.getHostReport() + + def shutdownRqdNow(self): + """Kill all running frames and shutdown RQD""" + self.machine.state = rqd.compiled_proto.host_pb2.DOWN try: - tempCommand = ["/usr/bin/su", frameInfo.runFrame.user_name, "-c", '"' + - self._createCommandFile(frameInfo.runFrame.command) + '"'] + self.lockAll() + self.killAllFrame("shutdownRqdNow Command") + # pylint: disable=broad-except + except Exception: + log.exception("Failed to kill frames, stopping service anyways") + if not self.__cache: + self.shutdown() - # pylint: disable=subprocess-popen-preexec-fn,consider-using-with - frameInfo.forkedCommand = subprocess.Popen(tempCommand, - env=self.frameEnv, - cwd=self.rqCore.machine.getTempPath(), - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - preexec_fn=os.setsid) - finally: - rqd.rqutil.permissionsLow() + def shutdownRqdIdle(self): + """When machine is idle, shutdown RQD""" + log.info("shutdownRqdIdle") + self.lockAll() + self.__whenIdle = True + self.sendStatusReport() + if not self.__cache: + self.shutdownRqdNow() - frameInfo.pid = frameInfo.forkedCommand.pid + def rebootNow(self): + """Kill all running frames and reboot machine. 
+ This is not available when a user is logged in""" + log.warning('Requested to reboot now') + if self.machine.isUserLoggedIn(): + err = ('Rebooting via RQD is not supported for a desktop machine ' + 'when a user is logged in') + log.warning(err) + raise rqd.rqexceptions.RqdException(err) + self.__reboot = True + self.shutdownRqdNow() - if not self.rqCore.updateRssThread.is_alive(): - self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, - self.rqCore.updateRss) - self.rqCore.updateRssThread.start() + def rebootIdle(self): + """When machine is idle, reboot it""" + log.warning('Requested to reboot machine when idle') + self.lockAll() + self.__whenIdle = True + self.__reboot = True + self.sendStatusReport() + if not self.__cache and not self.machine.isUserLoggedIn(): + self.shutdownRqdNow() - while True: - output = frameInfo.forkedCommand.stdout.readline() - if not output and frameInfo.forkedCommand.poll() is not None: - break - if output: - self.rqlog.write(output, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + def nimbyOn(self): + """Activates nimby, does not kill any running frames until next nimby + event. Also does not unlock until sufficient idle time is reached.""" + if self.nimby and not self.nimby.active: + try: + self.nimby.run() + log.warning("Nimby has been activated") + # pylint: disable=broad-except + except Exception: + self.nimby.locked = False + err = "Nimby is in the process of shutting down" + log.exception(err) + raise rqd.rqexceptions.RqdException(err) - frameInfo.forkedCommand.wait() + def nimbyOff(self): + """Deactivates nimby and unlocks any nimby lock""" + if self.nimby.active: + self.nimby.stop() + log.info("Nimby has been deactivated") - # Find exitStatus and exitSignal - returncode = frameInfo.forkedCommand.returncode - if os.WIFEXITED(returncode): - frameInfo.exitStatus = os.WEXITSTATUS(returncode) - else: - frameInfo.exitStatus = 1 - if os.WIFSIGNALED(returncode): - frameInfo.exitSignal = os.WTERMSIG(returncode) + def onNimbyLock(self): + """This is called by nimby when it locks the machine. + All running frames are killed. + A new report is sent to the cuebot.""" + self.killAllFrame("NIMBY Triggered") + self.sendStatusReport() - self.__writeFooter() - self.__cleanup() + def onNimbyUnlock(self, asOf=None): + """This is called by nimby when it unlocks the machine due to sufficient + idle. A new report is sent to the cuebot. + @param asOf: Time when idle state began, if known.""" + del asOf + self.sendStatusReport() - def runUnknown(self): - """The steps required to handle a frame under an unknown OS.""" + def lock(self, reqLock): + """Locks the requested core. + If a locked status changes, a status report is sent to the cuebot. 
+ @type reqLock: int + @param reqLock: Number of cores to lock, 100 = 1 physical core""" + sendUpdate = False + with self.__threadLock: + # pylint: disable=no-member + numLock = min(self.cores.total_cores - self.cores.locked_cores, + reqLock) + if numLock > 0: + self.cores.locked_cores += numLock + self.cores.idle_cores -= min(numLock, self.cores.idle_cores) + sendUpdate = True + # pylint: enable=no-member - def run(self): - """Thread initialization""" - log.info("Monitor frame started for frameId=%s", self.frameId) + log.debug(self.cores) - runFrame = self.runFrame + if sendUpdate: + self.sendStatusReport() - # pylint: disable=too-many-nested-blocks - try: - runFrame.job_temp_dir = os.path.join(self.rqCore.machine.getTempPath(), - runFrame.job_name) - runFrame.frame_temp_dir = os.path.join(runFrame.job_temp_dir, - runFrame.frame_name) - runFrame.log_file = "%s.%s.rqlog" % (runFrame.job_name, - runFrame.frame_name) - runFrame.log_dir_file = os.path.join(runFrame.log_dir, runFrame.log_file) + def lockAll(self): + """"Locks all cores on the machine. + If a locked status changes, a status report is sent.""" + sendUpdate = False + with self.__threadLock: + # pylint: disable=no-member + if self.cores.locked_cores < self.cores.total_cores: + self.cores.locked_cores = self.cores.total_cores + self.cores.idle_cores = 0 + sendUpdate = True + # pylint: enable=no-member - try: # Exception block for all exceptions - # Ensure permissions return to Low after this block - try: - if rqd.rqconstants.RQD_CREATE_USER_IF_NOT_EXISTS and runFrame.HasField("uid"): - rqd.rqutil.checkAndCreateUser(runFrame.user_name, - runFrame.uid, - runFrame.gid) - # Do everything as launching user: - runFrame.gid = rqd.rqconstants.LAUNCH_FRAME_USER_GID - rqd.rqutil.permissionsUser(runFrame.uid, runFrame.gid) + log.debug(self.cores) - # Setup frame logging - try: - self.rqlog = rqd.rqlogging.RqdLogger(runFrame.log_dir_file) - self.rqlog.waitForFile() - # pylint: disable=broad-except - except Exception as e: - err = "Unable to write to %s due to %s" % (runFrame.log_dir_file, e) - raise RuntimeError(err) + if sendUpdate: + self.sendStatusReport() - finally: - rqd.rqutil.permissionsLow() + def unlock(self, reqUnlock): + """Unlocks the requested number of cores. + Also resets reboot/shutdown/restart when idle requests. + If a locked status changes, a status report is sent to the cuebot. 
+ @type reqUnlock: int + @param reqUnlock: Number of cores to unlock, 100 = 1 physical core""" - # Store frame in cache and register servant - self.rqCore.storeFrame(runFrame.frame_id, self.frameInfo) + sendUpdate = False - if platform.system() == "Linux": - self.runLinux() - elif platform.system() == "Windows": - self.runWindows() - elif platform.system() == "Darwin": - self.runDarwin() - else: - self.runUnknown() + if (self.__whenIdle or self.__reboot or + self.machine.state != rqd.compiled_proto.host_pb2.UP): + sendUpdate = True + + self.__whenIdle = False + self.__reboot = False + self.machine.state = rqd.compiled_proto.host_pb2.UP - # pylint: disable=broad-except - except Exception: - log.critical( - "Failed launchFrame: For %s due to: \n%s", - runFrame.frame_id, ''.join(traceback.format_exception(*sys.exc_info()))) - # Notifies the cuebot that there was an error launching - self.frameInfo.exitStatus = rqd.rqconstants.EXITSTATUS_FOR_FAILED_LAUNCH - # Delay keeps the cuebot from spamming failing booking requests - time.sleep(10) - finally: - self.rqCore.releaseCores(self.runFrame.num_cores, runFrame.attributes.get('CPU_LIST'), - runFrame.attributes.get('GPU_LIST') - if 'GPU_LIST' in self.runFrame.attributes else None) + with self.__threadLock: + # pylint: disable=no-member + numUnlock = min(self.cores.locked_cores, reqUnlock) + if numUnlock > 0: + self.cores.locked_cores -= numUnlock + self.cores.idle_cores += numUnlock + sendUpdate = True + # pylint: enable=no-member - self.rqCore.deleteFrame(self.runFrame.frame_id) + log.debug(self.cores) - self.rqCore.sendFrameCompleteReport(self.frameInfo) - time_till_next = ( - (self.rqCore.intervalStartTime + self.rqCore.intervalSleepTime) - time.time()) - if time_till_next > (2 * rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC): - self.rqCore.onIntervalThread.cancel() - self.rqCore.onInterval(rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC) + if sendUpdate: + self.sendStatusReport() - log.info("Monitor frame ended for frameId=%s", - self.runFrame.frame_id) + def unlockAll(self): + """"Unlocks all cores on the machine. + Also resets reboot/shutdown/restart when idle requests. 
+ If a locked status changes, a status report is sent.""" + sendUpdate = False -class RqCore(object): - """Main body of RQD, handles the integration of all components, - the setup and launching of a frame and acts on all gRPC calls - that are passed from the Network module.""" + if (self.__whenIdle or self.__reboot + or self.machine.state != rqd.compiled_proto.host_pb2.UP): + sendUpdate = True - def __init__(self, optNimbyoff=False): - """RqCore class initialization""" self.__whenIdle = False self.__reboot = False + self.machine.state = rqd.compiled_proto.host_pb2.UP - self.__optNimbyoff = optNimbyoff + with self.__threadLock: + # pylint: disable=no-member + if self.cores.locked_cores > 0: + if not self.nimby.locked: + self.cores.idle_cores += self.cores.locked_cores + self.cores.locked_cores = 0 + sendUpdate = True + # pylint: enable=no-member - self.cores = rqd.compiled_proto.report_pb2.CoreDetail( - total_cores=0, - idle_cores=0, - locked_cores=0, - booked_cores=0, - reserved_cores=[], - ) + log.debug(self.cores) - self.nimby = rqd.rqnimby.NimbyFactory.getNimby(self) + if sendUpdate: + self.sendStatusReport() - self.machine = rqd.rqmachine.Machine(self, self.cores) + def sendStatusReport(self): + """Sends the current host report to Cuebot.""" + self.network.reportStatus(self.machine.getHostReport()) - self.network = rqd.rqnetwork.Network(self) - self.__threadLock = threading.Lock() - self.__cache = {} + def isWaitingForIdle(self): + """Returns whether the host is waiting until idle to take some action.""" + return self.__whenIdle - self.updateRssThread = None - self.onIntervalThread = None - self.intervalStartTime = None - self.intervalSleepTime = rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC + def sendFrameCompleteReport(self, runningFrame): + """Send a frameCompleteReport to Cuebot""" + if not runningFrame.completeReportSent: + report = rqd.compiled_proto.report_pb2.FrameCompleteReport() + # pylint: disable=no-member + report.host.CopyFrom(self.machine.getHostInfo()) + report.frame.CopyFrom(runningFrame.runningFrameInfo()) + # pylint: enable=no-member - # pylint: disable=unused-private-member - self.__cluster = None - self.__session = None - self.__stmt = None + if runningFrame.exitStatus is None: + report.exit_status = 1 + else: + report.exit_status = runningFrame.exitStatus - signal.signal(signal.SIGINT, self.handleExit) - signal.signal(signal.SIGTERM, self.handleExit) + report.exit_signal = runningFrame.exitSignal + report.run_time = int(runningFrame.runTime) - def start(self): - """Called by main to start the rqd service""" - if self.machine.isDesktop(): - if self.__optNimbyoff: - log.warning('Nimby startup has been disabled via --nimbyoff') - elif not rqd.rqconstants.OVERRIDE_NIMBY: - if rqd.rqconstants.OVERRIDE_NIMBY is None: - log.warning('OVERRIDE_NIMBY is not defined, Nimby startup has been disabled') - else: - log.warning('OVERRIDE_NIMBY is False, Nimby startup has been disabled') - else: - self.nimbyOn() - elif rqd.rqconstants.OVERRIDE_NIMBY: - log.warning('Nimby startup has been triggered by OVERRIDE_NIMBY') - self.nimbyOn() - self.network.start_grpc() + # If nimby is active, then frame must have been killed by nimby + # Set the exitSignal to indicate this event + if self.nimby.locked and not runningFrame.ignoreNimby: + report.exit_status = rqd.rqconstants.EXITSTATUS_FOR_NIMBY_KILL - def grpcConnected(self): - """After gRPC connects to the cuebot, this function is called""" - self.network.reportRqdStartup(self.machine.getBootReport()) + 
self.network.reportRunningFrameCompletion(report) + runningFrame.completeReportSent = True - self.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, self.updateRss) - self.updateRssThread.start() + def sanitizeFrames(self): + """ + Iterate over the cache and update the status of frames that might have + completed but never reported back to cuebot. + """ + for frameId, runningFrame in self.__cache.items(): + # If the frame was marked as completed (exitStatus) and a report has not been sent + # try to file the report again + if runningFrame.exitStatus is not None and not runningFrame.completeReportSent: + try: + self.sendFrameCompleteReport(runningFrame) + self.deleteFrame(frameId) + log.info("Successfully deleted frame from cache for %s/%s (%s)", + runningFrame.runFrame.job_name, + runningFrame.runFrame.frame_name, + frameId) + # pylint: disable=broad-except + except Exception: + log.exception("Failed to sanitize frame %s/%s", + runningFrame.runFrame.job_name, + runningFrame.runFrame.frame_name) - self.onIntervalThread = threading.Timer(self.intervalSleepTime, self.onInterval) - self.intervalStartTime = time.time() - self.onIntervalThread.start() - log.warning('RQD Started') +class FrameAttendantThread(threading.Thread): + """Once a frame has been received and checked by RQD, this class handles + the launching, waiting on, and cleanup work related to running the + frame.""" + def __init__(self, rqCore: RqCore, runFrame, frameInfo): + """FrameAttendantThread class initialization + @type rqCore: RqCore + @param rqCore: Main RQD Object + @type runFrame: RunFrame + @param runFrame: rqd_pb2.RunFrame + @type frameInfo: rqd.rqnetwork.RunningFrame + @param frameInfo: Servant for running frame + """ + threading.Thread.__init__(self) + self.rqCore = rqCore + self.frameId = runFrame.frame_id + self.runFrame = runFrame + self.startTime = 0 + self.endTime = 0 + self.frameInfo = frameInfo + self._tempLocations = [] + self.rqlog = None - def onInterval(self, sleepTime=None): + def __createEnvVariables(self): + """Define the environmental variables for the frame""" + # If linux specific, they need to move into self.runLinux() + # pylint: disable=attribute-defined-outside-init + self.frameEnv = {} + self.frameEnv["PATH"] = self.rqCore.machine.getPathEnv() + self.frameEnv["TERM"] = "unknown" + self.frameEnv["TZ"] = self.rqCore.machine.getTimezone() + self.frameEnv["USER"] = self.runFrame.user_name + self.frameEnv["LOGNAME"] = self.runFrame.user_name + self.frameEnv["mcp"] = "1" + self.frameEnv["show"] = self.runFrame.show + self.frameEnv["shot"] = self.runFrame.shot + self.frameEnv["jobid"] = self.runFrame.job_name + self.frameEnv["jobhost"] = self.rqCore.machine.getHostname() + self.frameEnv["frame"] = self.runFrame.frame_name + self.frameEnv["zframe"] = self.runFrame.frame_name + self.frameEnv["logfile"] = self.runFrame.log_file + self.frameEnv["maxframetime"] = "0" + self.frameEnv["minspace"] = "200" + self.frameEnv["CUE3"] = "True" + self.frameEnv["CUE_GPU_MEMORY"] = str(self.rqCore.machine.getGpuMemoryFree()) + self.frameEnv["SP_NOMYCSHRC"] = "1" - """This is called by self.grpcConnected as a timer thread to execute - every interval""" - if sleepTime is None: - self.intervalSleepTime = random.randint( - rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC, - rqd.rqconstants.RQD_MAX_PING_INTERVAL_SEC) - else: - self.intervalSleepTime = sleepTime + if platform.system() == "Windows": + for variable in ["SYSTEMROOT", "APPDATA", "TMP", "COMMONPROGRAMFILES", "SYSTEMDRIVE"]: + if variable in 
os.environ: + self.frameEnv[variable] = os.environ[variable] + for variable in rqd.rqconstants.RQD_HOST_ENV_VARS: + # Fallback to empty string, easy to spot what is missing in the log + self.frameEnv[variable] = os.environ.get(variable, '') + + for key, value in self.runFrame.environment.items(): + if key == 'PATH': + self.frameEnv[key] += os.pathsep + value + else: + self.frameEnv[key] = value + + # Add threads to use all assigned hyper-threading cores + if 'CPU_LIST' in self.runFrame.attributes and 'CUE_THREADS' in self.frameEnv: + self.frameEnv['CUE_THREADS'] = str(max( + int(self.frameEnv['CUE_THREADS']), + len(self.runFrame.attributes['CPU_LIST'].split(',')))) + self.frameEnv['CUE_HT'] = "True" + + # Add GPU's to use all assigned GPU cores + if 'GPU_LIST' in self.runFrame.attributes: + self.frameEnv['CUE_GPU_CORES'] = self.runFrame.attributes['GPU_LIST'] + + # pylint: disable=inconsistent-return-statements + def _createCommandFile(self, command): + """Creates a file that subprocess. Popen then executes. + @type command: string + @param command: The command specified in the runFrame request + @rtype: string + @return: Command file location""" + # TODO: this should use tempfile to create the files and clean them up afterwards try: - self.onIntervalThread = threading.Timer(self.intervalSleepTime, self.onInterval) - self.intervalStartTime = time.time() - self.onIntervalThread.start() + if platform.system() == "Windows": + rqd_tmp_dir = os.path.join(tempfile.gettempdir(), 'rqd') + try: + os.mkdir(rqd_tmp_dir) + except OSError: + pass # okay, already exists + + # Windows Batch needs some characters escaped: + command = command.replace('%', '%%') + for char in '^&<>|': + command = command.replace(char, '^' + char) + + commandFile = os.path.join( + rqd_tmp_dir, + 'cmd-%s-%s.bat' % (self.runFrame.frame_id, time.time())) + else: + commandFile = os.path.join(tempfile.gettempdir(), + 'rqd-cmd-%s-%s' % (self.runFrame.frame_id, time.time())) + with open(commandFile, "w", encoding='utf-8') as rqexe: + self._tempLocations.append(commandFile) + rqexe.write(command) + rqexe.close() + os.chmod(commandFile, 0o777) + return commandFile # pylint: disable=broad-except except Exception as e: log.critical( - 'Unable to schedule a ping due to %s at %s', - e, traceback.extract_tb(sys.exc_info()[2])) + "Unable to make command file: %s due to %s at %s", + commandFile, e, traceback.extract_tb(sys.exc_info()[2])) - try: - if self.__whenIdle and not self.__cache: - if not self.machine.isUserLoggedIn(): - self.shutdownRqdNow() - else: - log.warning('Shutdown requested but a user is logged in.') - # pylint: disable=broad-except - except Exception as e: - log.warning( - 'Unable to shutdown due to %s at %s', e, traceback.extract_tb(sys.exc_info()[2])) + def __writeHeader(self): + """Writes the frame's log header""" + + self.startTime = time.time() try: - self.sendStatusReport() + print("="*59, file=self.rqlog) + print("RenderQ JobSpec %s" % time.ctime(self.startTime), "\n", file=self.rqlog) + print("proxy rqd.rqnetwork.RunningFrame/%s -t:tcp -h %s -p 10021" % ( + self.runFrame.frame_id, + self.rqCore.machine.getHostname()), file=self.rqlog) + print("%-21s%s" % ("command", self.runFrame.command), file=self.rqlog) + print("%-21s%s" % ("uid", self.runFrame.uid), file=self.rqlog) + print("%-21s%s" % ("gid", self.runFrame.gid), file=self.rqlog) + print("%-21s%s" % ("logDestination", + self.runFrame.log_dir_file), file=self.rqlog) + print("%-21s%s" % ("cwd", self.runFrame.frame_temp_dir), file=self.rqlog) + print("%-21s%s" % 
("renderHost", + self.rqCore.machine.getHostname()), file=self.rqlog) + print("%-21s%s" % ("jobId", self.runFrame.job_id), file=self.rqlog) + print("%-21s%s" % ("frameId", self.runFrame.frame_id), file=self.rqlog) + for env in sorted(self.frameEnv): + print("%-21s%s=%s" % ("env", env, self.frameEnv[env]), file=self.rqlog) + print("="*59, file=self.rqlog) + + if 'CPU_LIST' in self.runFrame.attributes: + print('Hyper-threading enabled', file=self.rqlog) + # pylint: disable=broad-except except Exception as e: log.critical( - 'Unable to send status report due to %s at %s', - e, traceback.extract_tb(sys.exc_info()[2])) + "Unable to write header to rqlog: %s due to %s at %s", + self.runFrame.log_dir_file, e, traceback.extract_tb(sys.exc_info()[2])) - def updateRss(self): - """Triggers and schedules the updating of rss information""" - if self.__cache: - try: - self.machine.rssUpdate(self.__cache) - finally: - self.updateRssThread = threading.Timer( - rqd.rqconstants.RSS_UPDATE_INTERVAL, self.updateRss) - self.updateRssThread.start() + def __writeFooter(self): + """Writes frame's log footer""" - def getFrame(self, frameId): - """Gets a frame from the cache based on frameId - @type frameId: string - @param frameId: A frame's unique Id - @rtype: rqd.rqnetwork.RunningFrame - @return: rqd.rqnetwork.RunningFrame object""" - return self.__cache[frameId] + self.endTime = time.time() + self.frameInfo.runTime = int(self.endTime - self.startTime) + try: + print("", file=self.rqlog) + print("="*59, file=self.rqlog) + print("RenderQ Job Complete\n", file=self.rqlog) + print("%-20s%s" % ("exitStatus", self.frameInfo.exitStatus), file=self.rqlog) + print("%-20s%s" % ("exitSignal", self.frameInfo.exitSignal), file=self.rqlog) + if self.frameInfo.killMessage: + print("%-20s%s" % ("killMessage", self.frameInfo.killMessage), file=self.rqlog) + print("%-20s%s" % ("startTime", + time.ctime(self.startTime)), file=self.rqlog) + print("%-20s%s" % ("endTime", + time.ctime(self.endTime)), file=self.rqlog) + print("%-20s%s" % ("maxrss", self.frameInfo.maxRss), file=self.rqlog) + print("%-20s%s" % ("maxUsedGpuMemory", + self.frameInfo.maxUsedGpuMemory), file=self.rqlog) + print("%-20s%s" % ("utime", self.frameInfo.utime), file=self.rqlog) + print("%-20s%s" % ("stime", self.frameInfo.stime), file=self.rqlog) + print("%-20s%s" % ("renderhost", self.rqCore.machine.getHostname()), file=self.rqlog) - def getFrameKeys(self): - """Gets a list of all keys from the cache - @rtype: list - @return: List of all frameIds running on host""" - return list(self.__cache.keys()) + print("%-20s%s" % ("maxrss (KB)", self.frameInfo.maxRss), file=self.rqlog) + for child in sorted(self.frameInfo.childrenProcs.items(), + key=lambda item: item[1]['start_time']): + print("\t%-20s%s" % (child[1]['name'], child[1]['rss']), file=self.rqlog) + print("\t%-20s%s" % ("start_time", + datetime.timedelta(seconds=child[1]["start_time"])), + file=self.rqlog) + print("\t%-20s%s" % ("cmdline", " ".join(child[1]["cmd_line"])), file=self.rqlog) - def storeFrame(self, frameId, runningFrame): - """Stores a frame in the cache and adds the network adapter - @type frameId: string - @param frameId: A frame's unique Id - @type runningFrame: rqd.rqnetwork.RunningFrame - @param runningFrame: rqd.rqnetwork.RunningFrame object""" - with self.__threadLock: - if frameId in self.__cache: - raise rqd.rqexceptions.RqdException( - "frameId " + frameId + " is already running on this machine") - self.__cache[frameId] = runningFrame + print("="*59, file=self.rqlog) - def 
deleteFrame(self, frameId): - """Deletes a frame from the cache - @type frameId: string - @param frameId: A frame's unique Id""" - with self.__threadLock: - if frameId in self.__cache: - del self.__cache[frameId] - # pylint: disable=no-member - if not self.__cache and self.cores.reserved_cores: - # pylint: disable=no-member - log.error( - 'No running frames but reserved_cores is not empty: %s', - self.cores.reserved_cores) - # pylint: disable=no-member - self.cores.reserved_cores.clear() - log.info("Successfully delete frame with Id: %s", frameId) - else: - log.warning("Frame with Id: %s not found in cache", frameId) + # pylint: disable=broad-except + except Exception as e: + log.critical( + "Unable to write footer: %s due to %s at %s", + self.runFrame.log_dir_file, e, traceback.extract_tb(sys.exc_info()[2])) - def killAllFrame(self, reason): - """Will execute .kill() on every frame in cache until no frames remain - @type reason: string - @param reason: Reason for requesting all frames to be killed""" + def __cleanup(self): + """Cleans up temporary files""" + rqd.rqutil.permissionsHigh() + try: + for location in self._tempLocations: + if os.path.isfile(location): + try: + os.remove(location) + # pylint: disable=broad-except + except Exception as e: + log.warning( + "Unable to delete file: %s due to %s at %s", + location, e, traceback.extract_tb(sys.exc_info()[2])) + finally: + rqd.rqutil.permissionsLow() - if self.__cache: + # Close log file + try: + self.rqlog.close() + # pylint: disable=broad-except + except Exception as e: log.warning( - "killAllFrame called due to: %s\n%s", reason, ",".join(self.getFrameKeys())) - - while self.__cache: - if reason.startswith("NIMBY"): - # Since this is a nimby kill, ignore any frames that are ignoreNimby - frameKeys = [ - frame.frameId for frame in list(self.__cache.values()) if not frame.ignoreNimby] - else: - frameKeys = list(self.__cache.keys()) - - if not frameKeys: - # No frames left to kill - return - - for frameKey in frameKeys: - try: - self.__cache[frameKey].kill(reason) - except KeyError: - pass - time.sleep(1) + "Unable to close file: %s due to %s at %s", + self.runFrame.log_file, e, traceback.extract_tb(sys.exc_info()[2])) - def releaseCores(self, reqRelease, releaseHT=None, releaseGpus=None): - """The requested number of cores are released - @type reqRelease: int - @param reqRelease: Number of cores to release, 100 = 1 physical core""" - with self.__threadLock: - # pylint: disable=no-member - self.cores.booked_cores -= reqRelease - maxRelease = (self.cores.total_cores - - self.cores.locked_cores - - self.cores.idle_cores - - self.cores.booked_cores) + def runLinux(self): + """The steps required to handle a frame under linux""" + frameInfo = self.frameInfo + runFrame = self.runFrame - if maxRelease > 0: - self.cores.idle_cores += min(maxRelease, reqRelease) - # pylint: enable=no-member + self.__createEnvVariables() + self.__writeHeader() - if releaseHT: - self.machine.releaseHT(releaseHT) + tempStatFile = "%srqd-stat-%s-%s" % (self.rqCore.machine.getTempPath(), + frameInfo.frameId, + time.time()) + self._tempLocations.append(tempStatFile) + tempCommand = [] + if self.rqCore.machine.isDesktop(): + tempCommand += ["/bin/nice"] + tempCommand += ["/usr/bin/time", "-p", "-o", tempStatFile] - if releaseGpus: - self.machine.releaseGpus(releaseGpus) + if 'CPU_LIST' in runFrame.attributes: + tempCommand += ['taskset', '-c', runFrame.attributes['CPU_LIST']] - # pylint: disable=no-member - if self.cores.idle_cores > self.cores.total_cores: - 
log.critical( - "idle_cores (%d) have become greater than total_cores (%d): %s at %s", - self.cores.idle_cores, self.cores.total_cores, sys.exc_info()[0], - traceback.extract_tb(sys.exc_info()[2])) - # pylint: enable=no-member + rqd.rqutil.permissionsHigh() + try: + if rqd.rqconstants.RQD_BECOME_JOB_USER: + tempCommand += ["/bin/su", runFrame.user_name, rqd.rqconstants.SU_ARGUMENT, + '"' + self._createCommandFile(runFrame.command) + '"'] + else: + tempCommand += [self._createCommandFile(runFrame.command)] - def shutdown(self): - """Shuts down all rqd systems""" - self.nimbyOff() - if self.onIntervalThread is not None: - self.onIntervalThread.cancel() - if self.updateRssThread is not None: - self.updateRssThread.cancel() - elif self.__reboot: - log.warning("Rebooting machine by request") - self.machine.reboot() - else: - log.warning("Shutting down RQD by request. pid(%s)", os.getpid()) - self.network.stopGrpc() - # Using sys.exit would raise SystemExit, giving exception handlers a chance - # to block this - # pylint: disable=protected-access - os._exit(0) + # pylint: disable=subprocess-popen-preexec-fn,consider-using-with + frameInfo.forkedCommand = subprocess.Popen(tempCommand, + env=self.frameEnv, + cwd=self.rqCore.machine.getTempPath(), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + close_fds=True, + preexec_fn=os.setsid) + finally: + rqd.rqutil.permissionsLow() - def handleExit(self, signalnum, flag): - """Shutdown threads and exit RQD.""" - del signalnum - del flag - self.shutdown() + frameInfo.pid = frameInfo.forkedCommand.pid - def launchFrame(self, runFrame): - """This will setup for the launch the frame specified in the arguments. - If a problem is encountered, a CueException will be thrown. - @type runFrame: RunFrame - @param runFrame: rqd_pb2.RunFrame""" - log.info("Running command %s for %s", runFrame.command, runFrame.frame_id) - log.debug(runFrame) + if not self.rqCore.updateRssThread.is_alive(): + self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, + self.rqCore.updateRss) + self.rqCore.updateRssThread.start() - # - # Check for reasons to abort launch - # + poller = select.poll() + poller.register(frameInfo.forkedCommand.stdout, select.POLLIN) + poller.register(frameInfo.forkedCommand.stderr, select.POLLIN) + while True: + for fd, event in poller.poll(): + if event & select.POLLIN: + if fd == frameInfo.forkedCommand.stdout.fileno(): + line = frameInfo.forkedCommand.stdout.readline() + elif fd == frameInfo.forkedCommand.stderr.fileno(): + line = frameInfo.forkedCommand.stderr.readline() + else: + continue + if not line: + break + self.rqlog.write(line, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + if frameInfo.forkedCommand.poll() is not None: + break - if self.machine.state != rqd.compiled_proto.host_pb2.UP: - err = "Not launching, rqd HardwareState is not Up" - log.info(err) - raise rqd.rqexceptions.CoreReservationFailureException(err) + returncode = frameInfo.forkedCommand.wait() - if self.__whenIdle: - err = "Not launching, rqd is waiting for idle to shutdown" - log.info(err) - raise rqd.rqexceptions.CoreReservationFailureException(err) + # Find exitStatus and exitSignal + if returncode < 0: + # Exited with a signal + frameInfo.exitStatus = 1 + frameInfo.exitSignal = -returncode + else: + frameInfo.exitStatus = returncode + frameInfo.exitSignal = 0 - if self.nimby.locked and not runFrame.ignore_nimby: - err = "Not launching, rqd is lockNimby and not Ignore Nimby" - log.info(err) - raise 
rqd.rqexceptions.CoreReservationFailureException(err) + try: + with open(tempStatFile, "r", encoding='utf-8') as statFile: + frameInfo.realtime = statFile.readline().split()[1] + frameInfo.utime = statFile.readline().split()[1] + frameInfo.stime = statFile.readline().split()[1] + statFile.close() + # pylint: disable=broad-except + except Exception: + pass # This happens when frames are killed - if rqd.rqconstants.OVERRIDE_NIMBY and self.nimby.isNimbyActive(): - err = "Not launching, rqd is lockNimby and User is Active" - log.info(err) - raise rqd.rqexceptions.CoreReservationFailureException(err) + self.__writeFooter() + self.__cleanup() - if runFrame.frame_id in self.__cache: - err = "Not launching, frame is already running on this proc %s" % runFrame.frame_id - log.critical(err) - raise rqd.rqexceptions.DuplicateFrameViolationException(err) + def runDocker(self): + """The steps required to handle a frame under a docker container""" + frameInfo = self.frameInfo + runFrame = self.runFrame - if runFrame.HasField("uid") and runFrame.uid <= 0: - err = "Not launching, will not run frame as uid=%d" % runFrame.uid - log.warning(err) - raise rqd.rqexceptions.InvalidUserException(err) + # TODO: implement support for multiple images + # requires adding `string os = 25;` to rqd.proto/RunFrame + # + # image = self.rqCore.docker_images.get(runFrame.os) + # if image is None: + # raise RuntimeError("rqd not configured to run an image for this frame OS: %s", runFrame.os) + image = self.rqCore.docker_image - if runFrame.num_cores <= 0: - err = "Not launching, numCores must be > 0" - log.warning(err) - raise rqd.rqexceptions.CoreReservationFailureException(err) + self.__createEnvVariables() + self.__writeHeader() - # See if all requested cores are available - with self.__threadLock: - # pylint: disable=no-member - if self.cores.idle_cores < runFrame.num_cores: - err = "Not launching, insufficient idle cores" - log.critical(err) - raise rqd.rqexceptions.CoreReservationFailureException(err) - # pylint: enable=no-member + tempStatFile = "%srqd-stat-%s-%s" % (self.rqCore.machine.getTempPath(), + frameInfo.frameId, + time.time()) + self._tempLocations.append(tempStatFile) + tempCommand = [] + if self.rqCore.machine.isDesktop(): + tempCommand += ["/bin/nice"] + tempCommand += ["/usr/bin/time", "-p", "-o", tempStatFile] - if runFrame.environment.get('CUE_THREADABLE') == '1': - reserveHT = self.machine.reserveHT(runFrame.num_cores) - if reserveHT: - runFrame.attributes['CPU_LIST'] = reserveHT + if 'CPU_LIST' in runFrame.attributes: + tempCommand += ['taskset', '-c', runFrame.attributes['CPU_LIST']] - if runFrame.num_gpus: - reserveGpus = self.machine.reserveGpus(runFrame.num_gpus) - if reserveGpus: - runFrame.attributes['GPU_LIST'] = reserveGpus + tempCommand += [runFrame.command] - # They must be available at this point, reserve them - # pylint: disable=no-member - self.cores.idle_cores -= runFrame.num_cores - self.cores.booked_cores += runFrame.num_cores - # pylint: enable=no-member + # Print PID before executing + command = ["sh", "-c", "echo '$$'; exec " + " ".join(tempCommand)] - runningFrame = rqd.rqnetwork.RunningFrame(self, runFrame) - runningFrame.frameAttendantThread = FrameAttendantThread(self, runFrame, runningFrame) - runningFrame.frameAttendantThread.start() + client = self.rqCore.docker_client + container = client.containers.run(image=image, + detach=True, + environment=self.frameEnv, + working_dir=self.rqCore.machine.getTempPath(), + mounts=self.rqCore.docker_mounts, + privileged=True, + 
remove=True, + pid_mode="host", + stderr=True, + hostname=self.frameEnv["jobhost"], + entrypoint=command) - def getRunningFrame(self, frameId): - """Gets the currently running frame.""" - try: - return self.__cache[frameId] - except KeyError: - log.info("frameId %s is not running on this machine", frameId) - return None + log_stream = container.logs(stream=True) + # CMD prints the process PID before executing the actual command + frameInfo.pid = int(next(log_stream)) - def getCoreInfo(self): - """Gets the core info report.""" - return self.cores + if not self.rqCore.updateRssThread.is_alive(): + self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, + self.rqCore.updateRss) + self.rqCore.updateRssThread.start() - def reportStatus(self): - """Replies with hostReport""" - return self.machine.getHostReport() + for line in log_stream: + self.rqlog.write(line, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + + output = container.wait() + returncode = output["StatusCode"] + + # Find exitStatus and exitSignal + if returncode < 0: + # Exited with a signal + frameInfo.exitStatus = 1 + frameInfo.exitSignal = -returncode + else: + frameInfo.exitStatus = returncode + frameInfo.exitSignal = 0 - def shutdownRqdNow(self): - """Kill all running frames and shutdown RQD""" - self.machine.state = rqd.compiled_proto.host_pb2.DOWN try: - self.lockAll() - self.killAllFrame("shutdownRqdNow Command") + with open(tempStatFile, "r", encoding='utf-8') as statFile: + frameInfo.realtime = statFile.readline().split()[1] + frameInfo.utime = statFile.readline().split()[1] + frameInfo.stime = statFile.readline().split()[1] + statFile.close() # pylint: disable=broad-except except Exception: - log.exception("Failed to kill frames, stopping service anyways") - if not self.__cache: - self.shutdown() + pass # This happens when frames are killed - def shutdownRqdIdle(self): - """When machine is idle, shutdown RQD""" - log.info("shutdownRqdIdle") - self.lockAll() - self.__whenIdle = True - self.sendStatusReport() - if not self.__cache: - self.shutdownRqdNow() + self.__writeFooter() + self.__cleanup() - def rebootNow(self): - """Kill all running frames and reboot machine. - This is not available when a user is logged in""" - log.warning('Requested to reboot now') - if self.machine.isUserLoggedIn(): - err = ('Rebooting via RQD is not supported for a desktop machine ' - 'when a user is logged in') - log.warning(err) - raise rqd.rqexceptions.RqdException(err) - self.__reboot = True - self.shutdownRqdNow() + def runWindows(self): + """The steps required to handle a frame under windows""" + frameInfo = self.frameInfo + runFrame = self.runFrame - def rebootIdle(self): - """When machine is idle, reboot it""" - log.warning('Requested to reboot machine when idle') - self.lockAll() - self.__whenIdle = True - self.__reboot = True - self.sendStatusReport() - if not self.__cache and not self.machine.isUserLoggedIn(): - self.shutdownRqdNow() + self.__createEnvVariables() + self.__writeHeader() - def nimbyOn(self): - """Activates nimby, does not kill any running frames until next nimby - event. 
Also does not unlock until sufficient idle time is reached.""" - if self.nimby and not self.nimby.active: - try: - self.nimby.run() - log.warning("Nimby has been activated") - # pylint: disable=broad-except - except Exception: - self.nimby.locked = False - err = "Nimby is in the process of shutting down" - log.exception(err) - raise rqd.rqexceptions.RqdException(err) + try: + runFrame.command = runFrame.command.replace('%{frame}', self.frameEnv['CUE_IFRAME']) + tempCommand = [self._createCommandFile(runFrame.command)] - def nimbyOff(self): - """Deactivates nimby and unlocks any nimby lock""" - if self.nimby.active: - self.nimby.stop() - log.info("Nimby has been deactivated") + # pylint: disable=consider-using-with + frameInfo.forkedCommand = subprocess.Popen(tempCommand, + env=self.frameEnv, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + # pylint: disable=broad-except + except Exception: + log.critical( + "Failed subprocess.Popen: Due to: \n%s", + ''.join(traceback.format_exception(*sys.exc_info()))) - def onNimbyLock(self): - """This is called by nimby when it locks the machine. - All running frames are killed. - A new report is sent to the cuebot.""" - self.killAllFrame("NIMBY Triggered") - self.sendStatusReport() + frameInfo.pid = frameInfo.forkedCommand.pid - def onNimbyUnlock(self, asOf=None): - """This is called by nimby when it unlocks the machine due to sufficient - idle. A new report is sent to the cuebot. - @param asOf: Time when idle state began, if known.""" - del asOf - self.sendStatusReport() + if not self.rqCore.updateRssThread.is_alive(): + self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, + self.rqCore.updateRss) + self.rqCore.updateRssThread.start() - def lock(self, reqLock): - """Locks the requested core. - If a locked status changes, a status report is sent to the cuebot. - @type reqLock: int - @param reqLock: Number of cores to lock, 100 = 1 physical core""" - sendUpdate = False - with self.__threadLock: - # pylint: disable=no-member - numLock = min(self.cores.total_cores - self.cores.locked_cores, - reqLock) - if numLock > 0: - self.cores.locked_cores += numLock - self.cores.idle_cores -= min(numLock, self.cores.idle_cores) - sendUpdate = True - # pylint: enable=no-member + while True: + output = frameInfo.forkedCommand.stdout.readline() + if not output and frameInfo.forkedCommand.poll() is not None: + break + if output: + self.rqlog.write(output, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) - log.debug(self.cores) + frameInfo.forkedCommand.wait() - if sendUpdate: - self.sendStatusReport() + # Find exitStatus and exitSignal + returncode = frameInfo.forkedCommand.returncode + if returncode < INT32_MIN: + returncode = 303 + if returncode > INT32_MAX: + returncode = 304 + frameInfo.exitStatus = returncode + frameInfo.exitSignal = returncode - def lockAll(self): - """"Locks all cores on the machine. 
- If a locked status changes, a status report is sent.""" - sendUpdate = False - with self.__threadLock: - # pylint: disable=no-member - if self.cores.locked_cores < self.cores.total_cores: - self.cores.locked_cores = self.cores.total_cores - self.cores.idle_cores = 0 - sendUpdate = True - # pylint: enable=no-member + frameInfo.realtime = 0 + frameInfo.utime = 0 + frameInfo.stime = 0 - log.debug(self.cores) + self.__writeFooter() + self.__cleanup() - if sendUpdate: - self.sendStatusReport() + def runDarwin(self): + """The steps required to handle a frame under mac""" + frameInfo = self.frameInfo - def unlock(self, reqUnlock): - """Unlocks the requested number of cores. - Also resets reboot/shutdown/restart when idle requests. - If a locked status changes, a status report is sent to the cuebot. - @type reqUnlock: int - @param reqUnlock: Number of cores to unlock, 100 = 1 physical core""" + self.__createEnvVariables() + self.__writeHeader() - sendUpdate = False + rqd.rqutil.permissionsHigh() + try: + tempCommand = ["/usr/bin/su", frameInfo.runFrame.user_name, "-c", '"' + + self._createCommandFile(frameInfo.runFrame.command) + '"'] - if (self.__whenIdle or self.__reboot or - self.machine.state != rqd.compiled_proto.host_pb2.UP): - sendUpdate = True + # pylint: disable=subprocess-popen-preexec-fn,consider-using-with + frameInfo.forkedCommand = subprocess.Popen(tempCommand, + env=self.frameEnv, + cwd=self.rqCore.machine.getTempPath(), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + preexec_fn=os.setsid) + finally: + rqd.rqutil.permissionsLow() - self.__whenIdle = False - self.__reboot = False - self.machine.state = rqd.compiled_proto.host_pb2.UP + frameInfo.pid = frameInfo.forkedCommand.pid - with self.__threadLock: - # pylint: disable=no-member - numUnlock = min(self.cores.locked_cores, reqUnlock) - if numUnlock > 0: - self.cores.locked_cores -= numUnlock - self.cores.idle_cores += numUnlock - sendUpdate = True - # pylint: enable=no-member + if not self.rqCore.updateRssThread.is_alive(): + self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, + self.rqCore.updateRss) + self.rqCore.updateRssThread.start() - log.debug(self.cores) + while True: + output = frameInfo.forkedCommand.stdout.readline() + if not output and frameInfo.forkedCommand.poll() is not None: + break + if output: + self.rqlog.write(output, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) - if sendUpdate: - self.sendStatusReport() + frameInfo.forkedCommand.wait() - def unlockAll(self): - """"Unlocks all cores on the machine. - Also resets reboot/shutdown/restart when idle requests. 
- If a locked status changes, a status report is sent.""" + # Find exitStatus and exitSignal + returncode = frameInfo.forkedCommand.returncode + if os.WIFEXITED(returncode): + frameInfo.exitStatus = os.WEXITSTATUS(returncode) + else: + frameInfo.exitStatus = 1 + if os.WIFSIGNALED(returncode): + frameInfo.exitSignal = os.WTERMSIG(returncode) - sendUpdate = False + self.__writeFooter() + self.__cleanup() - if (self.__whenIdle or self.__reboot - or self.machine.state != rqd.compiled_proto.host_pb2.UP): - sendUpdate = True + def runUnknown(self): + """The steps required to handle a frame under an unknown OS.""" - self.__whenIdle = False - self.__reboot = False - self.machine.state = rqd.compiled_proto.host_pb2.UP + def run(self): + """Thread initialization""" + log.info("Monitor frame started for frameId=%s", self.frameId) - with self.__threadLock: - # pylint: disable=no-member - if self.cores.locked_cores > 0: - if not self.nimby.locked: - self.cores.idle_cores += self.cores.locked_cores - self.cores.locked_cores = 0 - sendUpdate = True - # pylint: enable=no-member + runFrame = self.runFrame - log.debug(self.cores) + # pylint: disable=too-many-nested-blocks + try: + runFrame.job_temp_dir = os.path.join(self.rqCore.machine.getTempPath(), + runFrame.job_name) + runFrame.frame_temp_dir = os.path.join(runFrame.job_temp_dir, + runFrame.frame_name) + runFrame.log_file = "%s.%s.rqlog" % (runFrame.job_name, + runFrame.frame_name) + runFrame.log_dir_file = os.path.join(runFrame.log_dir, runFrame.log_file) - if sendUpdate: - self.sendStatusReport() + try: # Exception block for all exceptions + # Ensure permissions return to Low after this block + try: + if rqd.rqconstants.RQD_CREATE_USER_IF_NOT_EXISTS and runFrame.HasField("uid"): + rqd.rqutil.checkAndCreateUser(runFrame.user_name, + runFrame.uid, + runFrame.gid) + # Do everything as launching user: + runFrame.gid = rqd.rqconstants.LAUNCH_FRAME_USER_GID + rqd.rqutil.permissionsUser(runFrame.uid, runFrame.gid) - def sendStatusReport(self): - """Sends the current host report to Cuebot.""" - self.network.reportStatus(self.machine.getHostReport()) + # Setup frame logging + try: + self.rqlog = rqd.rqlogging.RqdLogger(runFrame.log_dir_file) + self.rqlog.waitForFile() + # pylint: disable=broad-except + except Exception as e: + err = "Unable to write to %s due to %s" % (runFrame.log_dir_file, e) + raise RuntimeError(err) - def isWaitingForIdle(self): - """Returns whether the host is waiting until idle to take some action.""" - return self.__whenIdle + finally: + rqd.rqutil.permissionsLow() - def sendFrameCompleteReport(self, runningFrame): - """Send a frameCompleteReport to Cuebot""" - if not runningFrame.completeReportSent: - report = rqd.compiled_proto.report_pb2.FrameCompleteReport() - # pylint: disable=no-member - report.host.CopyFrom(self.machine.getHostInfo()) - report.frame.CopyFrom(runningFrame.runningFrameInfo()) - # pylint: enable=no-member + # Store frame in cache and register servant + self.rqCore.storeFrame(runFrame.frame_id, self.frameInfo) - if runningFrame.exitStatus is None: - report.exit_status = 1 - else: - report.exit_status = runningFrame.exitStatus + if platform.system() == "Linux" and self.rqCore.docker_client is not None: + self.runDocker() + elif platform.system() == "Linux": + self.runLinux() + elif platform.system() == "Windows": + self.runWindows() + elif platform.system() == "Darwin": + self.runDarwin() + else: + self.runUnknown() - report.exit_signal = runningFrame.exitSignal - report.run_time = int(runningFrame.runTime) + # 
pylint: disable=broad-except + except Exception: + log.critical( + "Failed launchFrame: For %s due to: \n%s", + runFrame.frame_id, ''.join(traceback.format_exception(*sys.exc_info()))) + # Notifies the cuebot that there was an error launching + self.frameInfo.exitStatus = rqd.rqconstants.EXITSTATUS_FOR_FAILED_LAUNCH + # Delay keeps the cuebot from spamming failing booking requests + time.sleep(10) + finally: + self.rqCore.releaseCores(self.runFrame.num_cores, runFrame.attributes.get('CPU_LIST'), + runFrame.attributes.get('GPU_LIST') + if 'GPU_LIST' in self.runFrame.attributes else None) - # If nimby is active, then frame must have been killed by nimby - # Set the exitSignal to indicate this event - if self.nimby.locked and not runningFrame.ignoreNimby: - report.exit_status = rqd.rqconstants.EXITSTATUS_FOR_NIMBY_KILL + self.rqCore.deleteFrame(self.runFrame.frame_id) - self.network.reportRunningFrameCompletion(report) - runningFrame.completeReportSent = True + self.rqCore.sendFrameCompleteReport(self.frameInfo) + time_till_next = ( + (self.rqCore.intervalStartTime + self.rqCore.intervalSleepTime) - time.time()) + if time_till_next > (2 * rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC): + self.rqCore.onIntervalThread.cancel() + self.rqCore.onInterval(rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC) - def sanitizeFrames(self): - """ - Iterate over the cache and update the status of frames that might have - completed but never reported back to cuebot. - """ - for frameId, runningFrame in self.__cache.items(): - # If the frame was marked as completed (exitStatus) and a report has not been sent - # try to file the report again - if runningFrame.exitStatus is not None and not runningFrame.completeReportSent: - try: - self.sendFrameCompleteReport(runningFrame) - self.deleteFrame(frameId) - log.info("Successfully deleted frame from cache for %s/%s (%s)", - runningFrame.runFrame.job_name, - runningFrame.runFrame.frame_name, - frameId) - # pylint: disable=broad-except - except Exception: - log.exception("Failed to sanitize frame %s/%s", - runningFrame.runFrame.job_name, - runningFrame.runFrame.frame_name) + log.info("Monitor frame ended for frameId=%s", + self.runFrame.frame_id) From e56b145a5fc446a7051fe8d9107ca3d4cfd5dd4d Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Thu, 17 Oct 2024 16:02:27 -0700 Subject: [PATCH 03/51] Run rqd as root for docker mode --- rqd/rqd.example.conf | 8 ++--- rqd/rqd/rqconstants.py | 27 +++++++++------ rqd/rqd/rqcore.py | 74 ++++++++++++++++++++++++------------------ rqd/rqd/rqutil.py | 4 +++ 4 files changed, 65 insertions(+), 48 deletions(-) diff --git a/rqd/rqd.example.conf b/rqd/rqd.example.conf index 78c9cfdab..df909f5e7 100644 --- a/rqd/rqd.example.conf +++ b/rqd/rqd.example.conf @@ -33,9 +33,5 @@ DOCKER_IMAGE="" RUN_ON_DOCKER=False [docker.mounts] -MCP="type=bind,source=/mcp,target=/mcp,bind-propagation=slave" -NET="type=bind,source=/net,target=/net,bind-propagation=slave" -TMP="type=bind,source=/tmp,target=/tmp,bind-propagation=slave" -SCRATCH="type=bind,source=/scratch,target=/scratch,bind-propagation=slave" -LIMITS="type=bind,source=/etc/security/limits.d/,target=/etc/security/limits.d/,bind-propagation=slave" -FUSE="type=bind,source=/dev/fuse,target=/dev/fuse,bind-propagation=shared" \ No newline at end of file +TEMP=type:bind,source:/tmp,target:/tmp,bind-propagation:slave +NET=type:bind,source:/net,target:/net,bind-propagation:slave diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py index 54239d321..ba1796baf 100644 --- a/rqd/rqd/rqconstants.py +++ 
b/rqd/rqd/rqconstants.py @@ -244,6 +244,9 @@ import docker.models import docker.types + # rqd needs to run as root to be able to run docker + RQD_UID = 0 + RQD_GID = 0 def parse_mount(mount_str): """ Parse mount definitions similar to a docker run command into a docker @@ -255,21 +258,25 @@ def parse_mount(mount_str): # bind-propagation defaults to None as only type=bind accepts it mount_dict["bind-propagation"] = None for item in mount_str.split(","): - key, value = item.split("=") - mount_dic[key.strip()] = value.strip() - return mount_dic + key, value = item.split(":") + mount_dict[key.strip()] = value.strip() + return mount_dict DOCKER_IMAGE = config.get(__docker_config, "DOCKER_IMAGE") # Parse values under the category docker.mounts into Mount objects mounts = config.options(__docker_mounts) for mount_name in mounts: - mount_str = config.get(__docker_mounts, mount_name) - mount_dic = parse_mount(mount_str) - mount = docker.types.Mount(mount_dic["target"], - mount_dic["source"], - type=mount_dic["type"], - propagation=mount_dic["bind-propagation"]) - DOCKER_MOUNTS.append(mount) + try: + mount_str = config.get(__docker_mounts, mount_name) + mount_dict = parse_mount(mount_str) + mount = docker.types.Mount(mount_dict["target"], + mount_dict["source"], + type=mount_dict["type"], + propagation=mount_dict["bind-propagation"]) + DOCKER_MOUNTS.append(mount) + except KeyError as e: + logging.exception("Failed to create Mount for key=%s, value=%s", + mount_name, mount_str) # pylint: disable=broad-except except Exception as e: diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 4bb0de433..a328bd374 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -93,6 +93,7 @@ def __init__(self, optNimbyoff=False): self.docker_mounts = [] self.docker_image = "Invalid" if rqd.rqconstants.RUN_ON_DOCKER: + # pylint: disable=import-outside-toplevel import docker self.docker_client = docker.from_env() self.docker_image = rqd.rqconstants.DOCKER_IMAGE @@ -940,7 +941,8 @@ def runDocker(self): # # image = self.rqCore.docker_images.get(runFrame.os) # if image is None: - # raise RuntimeError("rqd not configured to run an image for this frame OS: %s", runFrame.os) + # raise RuntimeError("rqd not configured to run an + # image for this frame OS: %s", runFrame.os) image = self.rqCore.docker_image self.__createEnvVariables() @@ -961,35 +963,41 @@ def runDocker(self): tempCommand += [runFrame.command] # Print PID before executing - command = ["sh", "-c", "echo '$$'; exec " + " ".join(tempCommand)] + command = ["sh", "-c", "echo $$; exec " + " ".join(tempCommand)] client = self.rqCore.docker_client - container = client.containers.run(image=image, - detach=True, - environment=self.frameEnv, - working_dir=self.rqCore.machine.getTempPath(), - mounts=self.rqCore.docker_mounts, - privileged=True, - remove=True, - pid_mode="host", - stderr=True, - hostname=self.frameEnv["jobhost"], - entrypoint=command) - - log_stream = container.logs(stream=True) - # CMD prints the process PID before executing the actual command - frameInfo.pid = int(next(log_stream)) - - if not self.rqCore.updateRssThread.is_alive(): - self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, - self.rqCore.updateRss) - self.rqCore.updateRssThread.start() - - for line in log_stream: - self.rqlog.write(line, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) - - output = container.wait() - returncode = output["StatusCode"] + try: + container = client.containers.run(image=image, + detach=True, + environment=self.frameEnv, + 
working_dir=self.rqCore.machine.getTempPath(), + mounts=self.rqCore.docker_mounts, + privileged=True, + remove=True, + pid_mode="host", + stderr=True, + hostname=self.frameEnv["jobhost"], + entrypoint=command, + user=runFrame.uid) + + log_stream = container.logs(stream=True) + # CMD prints the process PID before executing the actual command + frameInfo.pid = int(next(log_stream)) + + if not self.rqCore.updateRssThread.is_alive(): + self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, + self.rqCore.updateRss) + self.rqCore.updateRssThread.start() + + for line in log_stream: + self.rqlog.write(line, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + + output = container.wait() + returncode = output["StatusCode"] + # pylint: disable=broad-except + except Exception: + returncode = 1 + logging.exception("Failed to launch frame container") # Find exitStatus and exitSignal if returncode < 0: @@ -1128,6 +1136,7 @@ def run(self): log.info("Monitor frame started for frameId=%s", self.frameId) runFrame = self.runFrame + run_on_docker = self.rqCore.docker_client is not None # pylint: disable=too-many-nested-blocks try: @@ -1146,9 +1155,10 @@ def run(self): rqd.rqutil.checkAndCreateUser(runFrame.user_name, runFrame.uid, runFrame.gid) - # Do everything as launching user: - runFrame.gid = rqd.rqconstants.LAUNCH_FRAME_USER_GID - rqd.rqutil.permissionsUser(runFrame.uid, runFrame.gid) + if not run_on_docker: + # Do everything as launching user: + runFrame.gid = rqd.rqconstants.LAUNCH_FRAME_USER_GID + rqd.rqutil.permissionsUser(runFrame.uid, runFrame.gid) # Setup frame logging try: @@ -1165,7 +1175,7 @@ def run(self): # Store frame in cache and register servant self.rqCore.storeFrame(runFrame.frame_id, self.frameInfo) - if platform.system() == "Linux" and self.rqCore.docker_client is not None: + if run_on_docker: self.runDocker() elif platform.system() == "Linux": self.runLinux() diff --git a/rqd/rqd/rqutil.py b/rqd/rqd/rqutil.py index 3c11e75ff..ce1964f08 100644 --- a/rqd/rqd/rqutil.py +++ b/rqd/rqd/rqutil.py @@ -157,6 +157,10 @@ def checkAndCreateUser(username, uid=None, gid=None): cmd.append(username) log.info("Frame's username not found on host. Adding user with: %s", cmd) subprocess.check_call(cmd) + # pylint: disable=broad-except + except Exception: + logging.exception("useradd failed to add user: %s. 
User possibly already exists.", + username) finally: permissionsLow() From 4e365d40beec64ef7962a6e705482255c72f1430 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Thu, 17 Oct 2024 16:07:17 -0700 Subject: [PATCH 04/51] [EXPERIMENT] Rqd containerized frame (#1546) Signed-off-by: Diego Tavares --- rqd/rqd.example.conf | 8 ++--- rqd/rqd/rqconstants.py | 28 ++++++++++------ rqd/rqd/rqcore.py | 74 ++++++++++++++++++++++++------------------ rqd/rqd/rqutil.py | 4 +++ 4 files changed, 66 insertions(+), 48 deletions(-) diff --git a/rqd/rqd.example.conf b/rqd/rqd.example.conf index 78c9cfdab..22e260ae9 100644 --- a/rqd/rqd.example.conf +++ b/rqd/rqd.example.conf @@ -33,9 +33,5 @@ DOCKER_IMAGE="" RUN_ON_DOCKER=False [docker.mounts] -MCP="type=bind,source=/mcp,target=/mcp,bind-propagation=slave" -NET="type=bind,source=/net,target=/net,bind-propagation=slave" -TMP="type=bind,source=/tmp,target=/tmp,bind-propagation=slave" -SCRATCH="type=bind,source=/scratch,target=/scratch,bind-propagation=slave" -LIMITS="type=bind,source=/etc/security/limits.d/,target=/etc/security/limits.d/,bind-propagation=slave" -FUSE="type=bind,source=/dev/fuse,target=/dev/fuse,bind-propagation=shared" \ No newline at end of file +TEMP=type:bind,source:/tmp,target:/tmp,bind-propagation:slave +NET=type:bind,source:/net,target:/net,bind-propagation:slave \ No newline at end of file diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py index 54239d321..80b9bb29b 100644 --- a/rqd/rqd/rqconstants.py +++ b/rqd/rqd/rqconstants.py @@ -244,6 +244,10 @@ import docker.models import docker.types + # rqd needs to run as root to be able to run docker + RQD_UID = 0 + RQD_GID = 0 + def parse_mount(mount_str): """ Parse mount definitions similar to a docker run command into a docker @@ -255,21 +259,25 @@ def parse_mount(mount_str): # bind-propagation defaults to None as only type=bind accepts it mount_dict["bind-propagation"] = None for item in mount_str.split(","): - key, value = item.split("=") - mount_dic[key.strip()] = value.strip() - return mount_dic + key, value = item.split(":") + mount_dict[key.strip()] = value.strip() + return mount_dict DOCKER_IMAGE = config.get(__docker_config, "DOCKER_IMAGE") # Parse values under the category docker.mounts into Mount objects mounts = config.options(__docker_mounts) for mount_name in mounts: - mount_str = config.get(__docker_mounts, mount_name) - mount_dic = parse_mount(mount_str) - mount = docker.types.Mount(mount_dic["target"], - mount_dic["source"], - type=mount_dic["type"], - propagation=mount_dic["bind-propagation"]) - DOCKER_MOUNTS.append(mount) + try: + mount_str = config.get(__docker_mounts, mount_name) + mount_dict = parse_mount(mount_str) + mount = docker.types.Mount(mount_dict["target"], + mount_dict["source"], + type=mount_dict["type"], + propagation=mount_dict["bind-propagation"]) + DOCKER_MOUNTS.append(mount) + except KeyError as e: + logging.exception("Failed to create Mount for key=%s, value=%s", + mount_name, mount_str) # pylint: disable=broad-except except Exception as e: diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 4bb0de433..a328bd374 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -93,6 +93,7 @@ def __init__(self, optNimbyoff=False): self.docker_mounts = [] self.docker_image = "Invalid" if rqd.rqconstants.RUN_ON_DOCKER: + # pylint: disable=import-outside-toplevel import docker self.docker_client = docker.from_env() self.docker_image = rqd.rqconstants.DOCKER_IMAGE @@ -940,7 +941,8 @@ def runDocker(self): # # image = 
self.rqCore.docker_images.get(runFrame.os) # if image is None: - # raise RuntimeError("rqd not configured to run an image for this frame OS: %s", runFrame.os) + # raise RuntimeError("rqd not configured to run an + # image for this frame OS: %s", runFrame.os) image = self.rqCore.docker_image self.__createEnvVariables() @@ -961,35 +963,41 @@ def runDocker(self): tempCommand += [runFrame.command] # Print PID before executing - command = ["sh", "-c", "echo '$$'; exec " + " ".join(tempCommand)] + command = ["sh", "-c", "echo $$; exec " + " ".join(tempCommand)] client = self.rqCore.docker_client - container = client.containers.run(image=image, - detach=True, - environment=self.frameEnv, - working_dir=self.rqCore.machine.getTempPath(), - mounts=self.rqCore.docker_mounts, - privileged=True, - remove=True, - pid_mode="host", - stderr=True, - hostname=self.frameEnv["jobhost"], - entrypoint=command) - - log_stream = container.logs(stream=True) - # CMD prints the process PID before executing the actual command - frameInfo.pid = int(next(log_stream)) - - if not self.rqCore.updateRssThread.is_alive(): - self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, - self.rqCore.updateRss) - self.rqCore.updateRssThread.start() - - for line in log_stream: - self.rqlog.write(line, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) - - output = container.wait() - returncode = output["StatusCode"] + try: + container = client.containers.run(image=image, + detach=True, + environment=self.frameEnv, + working_dir=self.rqCore.machine.getTempPath(), + mounts=self.rqCore.docker_mounts, + privileged=True, + remove=True, + pid_mode="host", + stderr=True, + hostname=self.frameEnv["jobhost"], + entrypoint=command, + user=runFrame.uid) + + log_stream = container.logs(stream=True) + # CMD prints the process PID before executing the actual command + frameInfo.pid = int(next(log_stream)) + + if not self.rqCore.updateRssThread.is_alive(): + self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, + self.rqCore.updateRss) + self.rqCore.updateRssThread.start() + + for line in log_stream: + self.rqlog.write(line, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + + output = container.wait() + returncode = output["StatusCode"] + # pylint: disable=broad-except + except Exception: + returncode = 1 + logging.exception("Failed to launch frame container") # Find exitStatus and exitSignal if returncode < 0: @@ -1128,6 +1136,7 @@ def run(self): log.info("Monitor frame started for frameId=%s", self.frameId) runFrame = self.runFrame + run_on_docker = self.rqCore.docker_client is not None # pylint: disable=too-many-nested-blocks try: @@ -1146,9 +1155,10 @@ def run(self): rqd.rqutil.checkAndCreateUser(runFrame.user_name, runFrame.uid, runFrame.gid) - # Do everything as launching user: - runFrame.gid = rqd.rqconstants.LAUNCH_FRAME_USER_GID - rqd.rqutil.permissionsUser(runFrame.uid, runFrame.gid) + if not run_on_docker: + # Do everything as launching user: + runFrame.gid = rqd.rqconstants.LAUNCH_FRAME_USER_GID + rqd.rqutil.permissionsUser(runFrame.uid, runFrame.gid) # Setup frame logging try: @@ -1165,7 +1175,7 @@ def run(self): # Store frame in cache and register servant self.rqCore.storeFrame(runFrame.frame_id, self.frameInfo) - if platform.system() == "Linux" and self.rqCore.docker_client is not None: + if run_on_docker: self.runDocker() elif platform.system() == "Linux": self.runLinux() diff --git a/rqd/rqd/rqutil.py b/rqd/rqd/rqutil.py index 3c11e75ff..ce1964f08 100644 --- 
a/rqd/rqd/rqutil.py +++ b/rqd/rqd/rqutil.py @@ -157,6 +157,10 @@ def checkAndCreateUser(username, uid=None, gid=None): cmd.append(username) log.info("Frame's username not found on host. Adding user with: %s", cmd) subprocess.check_call(cmd) + # pylint: disable=broad-except + except Exception: + logging.exception("useradd failed to add user: %s. User possibly already exists.", + username) finally: permissionsLow() From a2283f4dfa99c182283f91848d26dc9f82795f50 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 18 Oct 2024 09:35:45 -0700 Subject: [PATCH 05/51] Fix logging for deleteFrame Logging was added on the wrong scope, which led to a "Frame not found in cache" when a frame was actually found. --- rqd/rqd/rqcore.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index a328bd374..e50a40d7d 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -222,9 +222,9 @@ def deleteFrame(self, frameId): self.cores.reserved_cores) # pylint: disable=no-member self.cores.reserved_cores.clear() - log.info("Successfully delete frame with Id: %s", frameId) - else: - log.warning("Frame with Id: %s not found in cache", frameId) + log.info("Successfully delete frame with Id: %s", frameId) + else: + log.warning("Frame with Id: %s not found in cache", frameId) def killAllFrame(self, reason): """Will execute .kill() on every frame in cache until no frames remain From 90fce5905ff55e97294f9909c941039b495fdbb3 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 18 Oct 2024 15:48:15 -0700 Subject: [PATCH 06/51] Update jobspec on dispatch test description New spec is required to allow passing the layer's expected OS. --- .../src/test/resources/conf/dtd/cjsl-1.14.dtd | 104 ++++++++++++++++++ .../conf/jobspec/jobspec_dispatch_test.xml | 10 +- 2 files changed, 110 insertions(+), 4 deletions(-) create mode 100644 cuebot/src/test/resources/conf/dtd/cjsl-1.14.dtd diff --git a/cuebot/src/test/resources/conf/dtd/cjsl-1.14.dtd b/cuebot/src/test/resources/conf/dtd/cjsl-1.14.dtd new file mode 100644 index 000000000..8bbcbf6f1 --- /dev/null +++ b/cuebot/src/test/resources/conf/dtd/cjsl-1.14.dtd @@ -0,0 +1,104 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/cuebot/src/test/resources/conf/jobspec/jobspec_dispatch_test.xml b/cuebot/src/test/resources/conf/jobspec/jobspec_dispatch_test.xml index 2c372eff2..b656f499f 100644 --- a/cuebot/src/test/resources/conf/jobspec/jobspec_dispatch_test.xml +++ b/cuebot/src/test/resources/conf/jobspec/jobspec_dispatch_test.xml @@ -18,7 +18,7 @@ - + spi @@ -30,9 +30,10 @@ false + Linux - + /shots/pipe/usr_testuser/logs/help.py 1-10 1 @@ -44,7 +45,7 @@ - + /shots/pipe/usr_testuser/logs/help.py 1-10 1 @@ -61,9 +62,10 @@ false + Linux - + /shots/pipe/usr_testuser/logs/help.py 1-10 1 From 8563f09c11eb3659e312341d59d09eaf2c0f7a9a Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 18 Oct 2024 16:22:35 -0700 Subject: [PATCH 07/51] Allow multiple OSs on a dockerized rqd When rqd is running on docker mode, it can report multiple supported OSs. On rqd.conf, multiple images can be provided under [docker.images] and each image refers to a supported OS. 
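For illustration, the per-OS image mapping could look like the sketch below. The [docker.images] section name comes from this change, but the OS labels and image names are hypothetical examples; the authoritative syntax is the one documented in rqd.example.conf in this patch:

    [docker.images]
    CENTOS7=opencue/rqd-frame-env:centos7
    ROCKY9=opencue/rqd-frame-env:rocky9

The intent is that each key names an OS the host reports as supported, and each value is the docker image used to launch frames whose layers request that OS.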
--- .../com/imageworks/spcue/DispatchFrame.java | 3 + .../com/imageworks/spcue/DispatchHost.java | 10 +- .../com/imageworks/spcue/VirtualProc.java | 10 +- .../spcue/dao/postgres/DispatchQuery.java | 297 +++--------------- .../spcue/dao/postgres/DispatcherDaoJdbc.java | 181 +++++++---- .../spcue/dao/postgres/FrameDaoJdbc.java | 2 + .../spcue/dao/postgres/HostDaoJdbc.java | 2 +- .../dispatcher/DispatchSupportService.java | 1 + .../spcue/dispatcher/HostReportHandler.java | 22 +- .../test/dao/postgres/DispatcherDaoTests.java | 58 +++- proto/rqd.proto | 1 + rqd/rqd.example.conf | 11 +- rqd/rqd/rqconstants.py | 36 ++- rqd/rqd/rqcore.py | 31 +- rqd/rqd/rqutil.py | 3 +- 15 files changed, 327 insertions(+), 341 deletions(-) diff --git a/cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java b/cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java index faa1a9c04..2c60e9930 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java +++ b/cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java @@ -49,5 +49,8 @@ public class DispatchFrame extends FrameEntity implements FrameInterface { // A comma separated list of services public String services; + + // The Operational System this frame is expected to run in + public String os; } diff --git a/cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java b/cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java index f01724e17..40a3e6bbc 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java +++ b/cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java @@ -51,7 +51,7 @@ public class DispatchHost extends Entity public long gpuMemory; public long idleGpuMemory; public String tags; - public String os; + private String os; public boolean isNimby; public boolean isLocalDispatch = false; @@ -81,6 +81,14 @@ public String getFacilityId() { return facilityId; } + public String[] getOs() { + return this.os.split(","); + } + + public void setOs(String os) { + this.os = os; + } + public boolean canHandleNegativeCoresRequest(int requestedCores) { // Request is positive, no need to test further. if (requestedCores > 0) { diff --git a/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java b/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java index 8205f3021..02ade6bb4 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java @@ -85,7 +85,9 @@ public String getName() { * @param frame * @return */ - public static final VirtualProc build(DispatchHost host, DispatchFrame frame, String... selfishServices) { + public static final VirtualProc build(DispatchHost host, + DispatchFrame frame, + String... 
selfishServices) { VirtualProc proc = new VirtualProc(); proc.allocationId = host.getAllocationId(); proc.hostId = host.getHostId(); @@ -94,7 +96,7 @@ public static final VirtualProc build(DispatchHost host, DispatchFrame frame, St proc.jobId = frame.getJobId(); proc.showId = frame.getShowId(); proc.facilityId = frame.getFacilityId(); - proc.os = host.os; + proc.os = frame.os; proc.hostName = host.getName(); proc.unbooked = false; @@ -148,7 +150,7 @@ else if (proc.coresReserved >= 100) { proc.coresReserved = wholeCores * 100; } else { if (frame.threadable) { - if (selfishServices != null && + if (selfishServices != null && frame.services != null && containsSelfishService(frame.services.split(","), selfishServices)){ proc.coresReserved = wholeCores * 100; @@ -238,7 +240,7 @@ public static final VirtualProc build(DispatchHost host, proc.jobId = frame.getJobId(); proc.showId = frame.getShowId(); proc.facilityId = frame.getFacilityId(); - proc.os = host.os; + proc.os = frame.os; proc.hostName = host.getName(); proc.unbooked = false; diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java index e36f97999..02dae0f22 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java @@ -72,9 +72,9 @@ public class DispatchQuery { "AND job.pk_facility = ? " + "AND " + "(" + - "job.str_os IS NULL OR job.str_os = '' " + + "job.str_os IS NULL OR job.str_os IN '' " + "OR " + - "job.str_os = ? " + + "job.str_os IN ? " + ") " + "AND (CASE WHEN layer_stat.int_waiting_count > 0 THEN 1 ELSE NULL END) = 1 " + "AND layer.int_cores_min <= ? " + @@ -135,7 +135,7 @@ public class DispatchQuery { "(" + "job.str_os IS NULL OR job.str_os = '' " + "OR " + - "job.str_os = ? " + + "job.str_os IN ? " + ") " + "AND (CASE WHEN layer_stat.int_waiting_count > 0 THEN 1 ELSE NULL END) = 1 " + "AND layer.int_cores_min <= ? " + @@ -250,7 +250,7 @@ private static final String replaceQueryForFifo(String query) { "AND " + "job.pk_facility = ? " + "AND " + - "(job.str_os = ? OR job.str_os IS NULL) " + + "(job.str_os IN ? OR job.str_os IS NULL) " + "AND " + "job.pk_job IN ( " + "SELECT " + @@ -276,7 +276,7 @@ private static final String replaceQueryForFifo(String query) { "AND " + "j.pk_facility = ? " + "AND " + - "(j.str_os = ? OR j.str_os IS NULL) " + + "(j.str_os IN ? OR j.str_os IS NULL) " + "AND " + "(CASE WHEN lst.int_waiting_count > 0 THEN lst.pk_layer ELSE NULL END) = l.pk_layer " + "AND " + @@ -519,40 +519,42 @@ private static final String replaceQueryForFifo(String query) { ") " + "LIMIT 1"; + private static final String FIND_DISPATCH_FRAME_COLUMNS = + "show_name, " + + "job_name, " + + "pk_job, " + + "pk_show, " + + "pk_facility, " + + "str_name, " + + "str_shot, " + + "str_user, " + + "int_uid, " + + "str_log_dir, " + + "str_os, " + + "frame_name, " + + "frame_state, " + + "pk_frame, " + + "pk_layer, " + + "int_retries, " + + "int_version, " + + "layer_name, " + + "layer_type, " + + "b_threadable, " + + "int_cores_min, " + + "int_cores_max, " + + "int_mem_min, " + + "int_gpus_min, " + + "int_gpus_max, " + + "int_gpu_mem_min, " + + "str_cmd, " + + "str_range, " + + "int_chunk_size, " + + "str_services "; /** * Finds the next frame in a job for a proc. 
*/ public static final String FIND_DISPATCH_FRAME_BY_JOB_AND_PROC = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "b_threadable, " + - "int_cores_min, " + - "int_cores_max, " + - "int_mem_min, " + - "int_gpus_min, " + - "int_gpus_max, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM ( " + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -569,6 +571,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -638,36 +641,7 @@ private static final String replaceQueryForFifo(String query) { * Find the next frame in a job for a host. */ public static final String FIND_DISPATCH_FRAME_BY_JOB_AND_HOST = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "int_cores_min, " + - "int_cores_max, " + - "int_gpus_min, " + - "int_gpus_max, " + - "b_threadable, " + - "int_mem_min, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM ( " + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -684,6 +658,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -754,36 +729,7 @@ private static final String replaceQueryForFifo(String query) { public static final String FIND_LOCAL_DISPATCH_FRAME_BY_JOB_AND_PROC = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "b_threadable, " + - "int_cores_min, " + - "int_cores_max, " + - "int_mem_min, " + - "int_gpus_min, " + - "int_gpus_max, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM ( " + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -800,6 +746,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -863,36 +810,7 @@ private static final String replaceQueryForFifo(String query) { * Find the next frame in a job for a host. 
*/ public static final String FIND_LOCAL_DISPATCH_FRAME_BY_JOB_AND_HOST = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "int_cores_min, " + - "int_cores_max, " + - "int_gpus_min, " + - "int_gpus_max, " + - "b_threadable, " + - "int_mem_min, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM (" + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -909,6 +827,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -975,36 +894,7 @@ private static final String replaceQueryForFifo(String query) { * Finds the next frame in a job for a proc. */ public static final String FIND_DISPATCH_FRAME_BY_LAYER_AND_PROC = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "b_threadable, " + - "int_cores_min, " + - "int_cores_max, " + - "int_mem_min, " + - "int_gpus_min, " + - "int_gpus_max, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM (" + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -1021,6 +911,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -1090,36 +981,7 @@ private static final String replaceQueryForFifo(String query) { * Find the next frame in a job for a host. 
*/ public static final String FIND_DISPATCH_FRAME_BY_LAYER_AND_HOST = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "int_cores_min, " + - "int_cores_max, " + - "b_threadable, " + - "int_mem_min, " + - "int_gpus_min, " + - "int_gpus_max, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM (" + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -1136,6 +998,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -1206,36 +1069,7 @@ private static final String replaceQueryForFifo(String query) { public static final String FIND_LOCAL_DISPATCH_FRAME_BY_LAYER_AND_PROC = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "b_threadable, " + - "int_cores_min, " + - "int_cores_max, " + - "int_mem_min, " + - "int_gpus_min, " + - "int_gpus_max, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM (" + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -1252,6 +1086,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -1315,36 +1150,7 @@ private static final String replaceQueryForFifo(String query) { * Find the next frame in a job for a host. 
*/ public static final String FIND_LOCAL_DISPATCH_FRAME_BY_LAYER_AND_HOST = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "int_cores_min, " + - "int_cores_max, " + - "b_threadable, " + - "int_mem_min, " + - "int_gpus_min, " + - "int_gpus_max, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM (" + "SELECT " + "ROW_NUMBER() OVER (ORDER BY " + @@ -1361,6 +1167,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java index b17ae14e3..7db4714ea 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java @@ -24,6 +24,7 @@ import java.sql.ResultSet; import java.sql.SQLException; +import java.util.ArrayList; import java.util.Collections; import java.util.LinkedHashSet; import java.util.LinkedList; @@ -172,6 +173,12 @@ else if (cached.isExpired()) { return bookableShows.get(key).shows; } + // Given a query, + private String handleInClause(String key, String query, int inValueLength) { + String placeholders = String.join(",", Collections.nCopies(inValueLength, "?")); + return query.replace(key + " IN ?", key + " IN (" + placeholders + ")"); + } + private Set findDispatchJobs(DispatchHost host, int numJobs, boolean shuffleShows) { LinkedHashSet result = new LinkedHashSet(); List shows = new LinkedList(getBookableShows(host)); @@ -216,20 +223,24 @@ private Set findDispatchJobs(DispatchHost host, int numJobs, boolean shu @Override public PreparedStatement createPreparedStatement(Connection conn) throws SQLException { - PreparedStatement find_jobs_stmt = conn.prepareStatement( - FIND_JOBS_BY_SHOW_NO_GPU); - find_jobs_stmt.setString(1, s.getShowId()); - find_jobs_stmt.setString(2, host.getFacilityId()); - find_jobs_stmt.setString(3, host.os); - find_jobs_stmt.setInt(4, host.idleCores); - find_jobs_stmt.setLong(5, host.idleMemory); - find_jobs_stmt.setInt(6, threadMode(host.threadMode)); - find_jobs_stmt.setString(7, host.getName()); - find_jobs_stmt.setInt(8, numJobs * 10); + String query = handleInClause("str_os", FIND_JOBS_BY_SHOW_NO_GPU, host.getOs().length); + PreparedStatement find_jobs_stmt = conn.prepareStatement(query); + + int index = 1; + find_jobs_stmt.setString(index++, s.getShowId()); + find_jobs_stmt.setString(index++, host.getFacilityId()); + for (String item : host.getOs()) { + find_jobs_stmt.setString(index++, item); + } + find_jobs_stmt.setInt(index++, host.idleCores); + find_jobs_stmt.setLong(index++, host.idleMemory); + find_jobs_stmt.setInt(index++, threadMode(host.threadMode)); + find_jobs_stmt.setString(index++, host.getName()); + find_jobs_stmt.setInt(index++, numJobs * 10); return find_jobs_stmt; }}, PKJOB_MAPPER )); - prometheusMetrics.setBookingDurationMetric("findDispatchJobs nogpu findByShowQuery", + 
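As an illustration of the mechanism above: the "str_os IN ?" fragments in DispatchQuery are not valid JDBC on their own; handleInClause rewrites each "<key> IN ?" marker into a real IN list with one bind marker per OS value reported by the host before the statement is prepared. A minimal standalone sketch of that expansion follows (the class name and sample query are invented for illustration, not part of the diff):

import java.util.Collections;

public class InClauseExpansionSketch {

    // Mirrors handleInClause: replace "<key> IN ?" with "<key> IN (?,?,...)",
    // emitting one placeholder per value that will later be bound.
    static String handleInClause(String key, String query, int inValueLength) {
        String placeholders = String.join(",", Collections.nCopies(inValueLength, "?"));
        return query.replace(key + " IN ?", key + " IN (" + placeholders + ")");
    }

    public static void main(String[] args) {
        String query = "SELECT pk_job FROM job WHERE job.str_os IN ? AND job.pk_facility = ?";
        // A host reporting "centos7,Linux,rocky9" contributes three OS values.
        System.out.println(handleInClause("str_os", query, 3));
        // -> SELECT pk_job FROM job WHERE job.str_os IN (?,?,?) AND job.pk_facility = ?
    }
}

Because String.replace rewrites every occurrence of the marker, a query that filters on str_os in two places (as FIND_JOBS_BY_LOCAL does) is expanded at both sites, and the caller then binds the host's OS list once per occurrence, which is what findLocalDispatchJobs does further down.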
prometheusMetrics.setBookingDurationMetric("findDispatchJobs nogpu findByShowQuery", System.currentTimeMillis() - lastTime); } else { @@ -237,19 +248,22 @@ public PreparedStatement createPreparedStatement(Connection conn) @Override public PreparedStatement createPreparedStatement(Connection conn) throws SQLException { - PreparedStatement find_jobs_stmt = conn.prepareStatement( - findByShowQuery()); - find_jobs_stmt.setString(1, s.getShowId()); - find_jobs_stmt.setString(2, host.getFacilityId()); - find_jobs_stmt.setString(3, host.os); - find_jobs_stmt.setInt(4, host.idleCores); - find_jobs_stmt.setLong(5, host.idleMemory); - find_jobs_stmt.setInt(6, threadMode(host.threadMode)); - find_jobs_stmt.setInt(7, host.idleGpus); - find_jobs_stmt.setLong(8, (host.idleGpuMemory > 0) ? 1 : 0); - find_jobs_stmt.setLong(9, host.idleGpuMemory); - find_jobs_stmt.setString(10, host.getName()); - find_jobs_stmt.setInt(11, numJobs * 10); + String query = handleInClause("str_os", findByShowQuery(), host.getOs().length); + PreparedStatement find_jobs_stmt = conn.prepareStatement(query); + int index = 1; + find_jobs_stmt.setString(index++, s.getShowId()); + find_jobs_stmt.setString(index++, host.getFacilityId()); + for (String item : host.getOs()) { + find_jobs_stmt.setString(index++, item); + } + find_jobs_stmt.setInt(index++, host.idleCores); + find_jobs_stmt.setLong(index++, host.idleMemory); + find_jobs_stmt.setInt(index++, threadMode(host.threadMode)); + find_jobs_stmt.setInt(index++, host.idleGpus); + find_jobs_stmt.setLong(index++, (host.idleGpuMemory > 0) ? 1 : 0); + find_jobs_stmt.setLong(index++, host.idleGpuMemory); + find_jobs_stmt.setString(index++, host.getName()); + find_jobs_stmt.setInt(index++, numJobs * 10); return find_jobs_stmt; }}, PKJOB_MAPPER )); @@ -308,31 +322,48 @@ public Set findDispatchJobs(DispatchHost host, GroupInterface g) { long lastTime = System.currentTimeMillis(); if (host.idleGpus == 0 && (schedulingMode == SchedulingMode.BALANCED)) { + String query = handleInClause("str_os", FIND_JOBS_BY_GROUP_NO_GPU, host.getOs().length); + ArrayList args = new ArrayList(); + + args.add(g.getGroupId()); + args.add(host.getFacilityId()); + for (String item : host.getOs()) { + args.add(item); + } + args.add(host.idleCores); + args.add(host.idleMemory); + args.add(threadMode(host.threadMode)); + args.add(host.getName()); + args.add(50); result.addAll(getJdbcTemplate().query( - FIND_JOBS_BY_GROUP_NO_GPU, - PKJOB_MAPPER, - g.getGroupId(), host.getFacilityId(), host.os, - host.idleCores, host.idleMemory, - threadMode(host.threadMode), - host.getName(), 50)); + query, + PKJOB_MAPPER, args.toArray())); prometheusMetrics.setBookingDurationMetric("findDispatchJobs by group nogpu query", System.currentTimeMillis() - lastTime); } else { + String query = handleInClause("str_os", findByGroupQuery(), host.getOs().length); + ArrayList args = new ArrayList(); + + args.add(g.getGroupId()); + args.add(host.getFacilityId()); + for (String item : host.getOs()) { + args.add(item); + } + args.add(host.idleCores); + args.add(host.idleMemory); + args.add(threadMode(host.threadMode)); + args.add(host.idleGpus); + args.add(host.idleGpuMemory > 0 ? 1 : 0); + args.add(host.idleGpuMemory); + args.add(host.getName()); + args.add(50); result.addAll(getJdbcTemplate().query( - findByGroupQuery(), - PKJOB_MAPPER, - g.getGroupId(),host.getFacilityId(), host.os, - host.idleCores, host.idleMemory, - threadMode(host.threadMode), - host.idleGpus, - (host.idleGpuMemory > 0) ? 
1 : 0, host.idleGpuMemory, - host.getName(), 50)); + query, + PKJOB_MAPPER, args.toArray())); prometheusMetrics.setBookingDurationMetric("findDispatchJobs by group query", System.currentTimeMillis() - lastTime); - } - return result; } @@ -515,26 +546,47 @@ public Set findDispatchJobs(DispatchHost host, LinkedHashSet result = new LinkedHashSet(numJobs); long start = System.currentTimeMillis(); if (host.idleGpus == 0 && (schedulingMode == SchedulingMode.BALANCED)) { + String query = handleInClause("str_os", FIND_JOBS_BY_SHOW_NO_GPU, host.getOs().length); + ArrayList args = new ArrayList(); + args.add(show.getShowId()); + args.add(host.getFacilityId()); + for (String item : host.getOs()) { + args.add(item); + } + args.add(host.idleCores); + args.add(host.idleMemory); + args.add(threadMode(host.threadMode)); + args.add(host.getName()); + args.add(numJobs * 10); + result.addAll(getJdbcTemplate().query( - FIND_JOBS_BY_SHOW_NO_GPU, - PKJOB_MAPPER, - show.getShowId(), host.getFacilityId(), host.os, - host.idleCores, host.idleMemory, - threadMode(host.threadMode), - host.getName(), numJobs * 10)); + query, + PKJOB_MAPPER, args.toArray())); + prometheusMetrics.setBookingDurationMetric("findDispatchJobs by show nogpu query", System.currentTimeMillis() - start); } else { + String query = handleInClause("str_os", findByShowQuery(), host.getOs().length); + ArrayList args = new ArrayList(); + args.add(show.getShowId()); + args.add(host.getFacilityId()); + for (String item : host.getOs()) { + args.add(item); + } + args.add(host.idleCores); + args.add(host.idleMemory); + args.add(threadMode(host.threadMode)); + args.add(host.idleGpus); + args.add(host.idleGpuMemory > 0 ? 1 : 0); + args.add(host.idleGpuMemory); + args.add(host.getName()); + args.add(numJobs * 10); + result.addAll(getJdbcTemplate().query( - findByShowQuery(), - PKJOB_MAPPER, - show.getShowId(), host.getFacilityId(), host.os, - host.idleCores, host.idleMemory, - threadMode(host.threadMode), - host.idleGpus, - (host.idleGpuMemory > 0) ? 
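Once the IN clause has been expanded, the bind values have to be supplied in exactly the order the placeholders occur, which is why the fixed setString(1..n) positions give way to a running index (in the PreparedStatementCreator callbacks) or to an ArrayList handed to JdbcTemplate via args.toArray(). A rough sketch of both patterns; the query text, argument values and method names below are hypothetical, not taken from the diff:

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

public class OrderedArgsSketch {

    // PreparedStatementCreator style: a running index keeps the bind order aligned
    // with the expanded placeholders (show, facility, one value per OS, then limits).
    static PreparedStatement prepare(Connection conn, String expandedQuery, String showId,
                                     String facilityId, String[] hostOs,
                                     int idleCores, long idleMemory) throws SQLException {
        PreparedStatement stmt = conn.prepareStatement(expandedQuery);
        int index = 1;
        stmt.setString(index++, showId);
        stmt.setString(index++, facilityId);
        for (String os : hostOs) {          // one marker was emitted per OS value
            stmt.setString(index++, os);
        }
        stmt.setInt(index++, idleCores);
        stmt.setLong(index++, idleMemory);
        return stmt;
    }

    public static void main(String[] args) {
        // JdbcTemplate style: the same ordering collected into a plain argument list.
        List<Object> ordered = new ArrayList<>();
        ordered.add("show-id");
        ordered.add("facility-id");
        for (String os : new String[] {"centos7", "Linux", "rocky9"}) {
            ordered.add(os);
        }
        ordered.add(8);        // idleCores
        ordered.add(16384L);   // idleMemory
        System.out.println(ordered);   // values line up one-to-one with the placeholders
    }
}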
1 : 0, host.idleGpuMemory, - host.getName(), numJobs * 10)); + query, + PKJOB_MAPPER, args.toArray())); + prometheusMetrics.setBookingDurationMetric("findDispatchJobs by show query", System.currentTimeMillis() - start); } @@ -548,11 +600,24 @@ public Set findDispatchJobs(DispatchHost host, public Set findLocalDispatchJobs(DispatchHost host) { LinkedHashSet result = new LinkedHashSet(5); long start = System.currentTimeMillis(); + + String query = handleInClause("str_os", FIND_JOBS_BY_LOCAL, host.getOs().length); + ArrayList args = new ArrayList(); + args.add(host.getHostId()); + args.add(host.getFacilityId()); + for (String item : host.getOs()) { + args.add(item); + } + args.add(host.getHostId()); + args.add(host.getFacilityId()); + for (String item : host.getOs()) { + args.add(item); + } + result.addAll(getJdbcTemplate().query( - FIND_JOBS_BY_LOCAL, - PKJOB_MAPPER, - host.getHostId(), host.getFacilityId(), - host.os, host.getHostId(), host.getFacilityId(), host.os)); + query, + PKJOB_MAPPER, args.toArray())); + prometheusMetrics.setBookingDurationMetric("findLocalDispatchJobs query", System.currentTimeMillis() - start); return result; diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java index 0546d4558..9e0f6f80c 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java @@ -331,6 +331,7 @@ public DispatchFrame mapRow(ResultSet rs, int rowNum) throws SQLException { frame.minGpuMemory = rs.getLong("int_gpu_mem_min"); frame.version = rs.getInt("int_version"); frame.services = rs.getString("str_services"); + frame.os = rs.getString("str_os"); return frame; } }; @@ -347,6 +348,7 @@ public DispatchFrame mapRow(ResultSet rs, int rowNum) throws SQLException { "job.str_user,"+ "job.int_uid,"+ "job.str_log_dir,"+ + "job.str_os,"+ "frame.str_name AS frame_name, "+ "frame.str_state AS frame_state, "+ "frame.pk_frame, "+ diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/HostDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/HostDaoJdbc.java index 223737042..304fe474d 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/HostDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/HostDaoJdbc.java @@ -214,7 +214,7 @@ public DispatchHost mapRow(ResultSet rs, int rowNum) throws SQLException { host.isNimby = rs.getBoolean("b_nimby"); host.threadMode = rs.getInt("int_thread_mode"); host.tags = rs.getString("str_tags"); - host.os = rs.getString("str_os"); + host.setOs(rs.getString("str_os")); host.hardwareState = HardwareState.valueOf(rs.getString("str_state")); return host; diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java index f60b2c1e6..0779209b0 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java @@ -395,6 +395,7 @@ public RunFrame prepareRqdRunFrame(VirtualProc proc, DispatchFrame frame) { .setNumGpus(proc.gpusReserved) .setStartTime(System.currentTimeMillis()) .setIgnoreNimby(proc.isLocalDispatch) + .setOs(proc.os) .putAllEnvironment(jobDao.getEnvironment(frame)) .putAllEnvironment(layerDao.getLayerEnvironment(frame)) .putEnvironment("CUE3", "1") diff --git 
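With str_os now selected by FrameDaoJdbc and carried on DispatchFrame, the OS that reaches RQD is the one requested by the job rather than the one reported by the host: VirtualProc.build copies frame.os onto the proc, and prepareRqdRunFrame writes proc.os into the new RunFrame.os field (field 25 in rqd.proto). A condensed sketch of that hand-off using simplified stand-in classes (the real DAO and protobuf builder plumbing is omitted):

public class OsPropagationSketch {

    // Simplified stand-ins for DispatchFrame, VirtualProc and the RunFrame message.
    static class Frame { String os; }
    static class Proc { String os; }
    static class RunFrame { String os; }

    public static void main(String[] args) {
        Frame frame = new Frame();
        frame.os = "rocky9";        // read from job.str_os by the frame DAO

        Proc proc = new Proc();
        proc.os = frame.os;         // VirtualProc.build now takes the frame's OS,
                                    // not the host's comma-separated OS list

        RunFrame runFrame = new RunFrame();
        runFrame.os = proc.os;      // prepareRqdRunFrame sets os on the RunFrame
                                    // so RQD can pick a matching docker image

        System.out.println(runFrame.os);   // rocky9
    }
}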
a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java index b0a7ccd9c..b91a867bb 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java @@ -254,8 +254,10 @@ public void handleHostReport(HostReport report, boolean isBoot) { bookingManager.removeInactiveLocalHostAssignment(lca); } } - - if (!isTempDirStorageEnough(report.getHost().getTotalMcp(), report.getHost().getFreeMcp(), host.os)) { + + if (!isTempDirStorageEnough(report.getHost().getTotalMcp(), + report.getHost().getFreeMcp(), + host.getOs())) { msg = String.format( "%s doesn't have enough free space in the temporary directory (mcp), %dMB", host.name, (report.getHost().getFreeMcp()/1024)); @@ -348,16 +350,19 @@ else if (!dispatchSupport.isCueBookable(host)) { * * @param tempTotalStorage Total storage on the temp directory * @param tempFreeStorage Free storage on the temp directory - * @param hostOs Reported os + * @param hostOs Reported operating systems + * @return */ - private boolean isTempDirStorageEnough(Long tempTotalStorage, Long tempFreeStorage, String hostOs) { + private boolean isTempDirStorageEnough(Long tempTotalStorage, Long tempFreeStorage, String[] hostOs) { // The minimum amount of free space in the temporary directory to book a host int minAvailableTempPercentage = env.getRequiredProperty( "dispatcher.min_available_temp_storage_percentage", Integer.class); - return minAvailableTempPercentage == -1 || hostOs.equalsIgnoreCase(WINDOWS_OS) || - (((tempFreeStorage * 100.0) / tempTotalStorage) >= minAvailableTempPercentage); + return minAvailableTempPercentage == -1 + // It is safe to assume multiple OSs imply Windows is not the base OS, + // therefore Windows will always report a single hostOs + || (hostOs.length == 1 && hostOs[0].equalsIgnoreCase(WINDOWS_OS)) + || (((tempFreeStorage * 100.0) / tempTotalStorage) >= minAvailableTempPercentage); } /** @@ -424,7 +429,10 @@ private boolean changeStateForTempDirStorage(DispatchHost host, RenderHost repor "dispatcher.min_available_temp_storage_percentage", Integer.class); // Prevent cue frames from booking on hosts with full temporary directories - boolean hasEnoughTempStorage = isTempDirStorageEnough(reportHost.getTotalMcp(), reportHost.getFreeMcp(), host.os); + boolean hasEnoughTempStorage = isTempDirStorageEnough( + reportHost.getTotalMcp(), + reportHost.getFreeMcp(), + host.getOs()); if (!hasEnoughTempStorage && host.hardwareState == HardwareState.UP) { // Insert a comment indicating that the Host status = Repair with reason = Full temporary directory CommentDetail c = new CommentDetail(); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DispatcherDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DispatcherDaoTests.java index 5b7eaee72..1ff849473 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DispatcherDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DispatcherDaoTests.java @@ -203,7 +203,7 @@ public void testFindNextDispatchFrameByProc() { assertNotNull(frame); assertEquals("0001-pass_1", frame.name); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job.os); proc.coresReserved = 100; dispatcher.dispatch(frame, proc); @@ -235,7 +235,7 @@ public void testFindNextDispatchFramesByProc() { DispatchFrame
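isTempDirStorageEnough now receives the full array of reported OS names, and the Windows exemption only applies when the host reports exactly one OS, on the assumption that a multi-OS (containerized) host is not Windows-based. A self-contained sketch of the check; the WINDOWS_OS constant value and the sample numbers are illustrative, not taken from the diff:

public class TempDirCheckSketch {

    private static final String WINDOWS_OS = "Windows";

    // Mirrors isTempDirStorageEnough: pass when the check is disabled (-1), when the
    // single reported OS is Windows, or when the free-space percentage is high enough.
    static boolean isTempDirStorageEnough(long tempTotalStorage, long tempFreeStorage,
                                          String[] hostOs, int minAvailableTempPercentage) {
        return minAvailableTempPercentage == -1
                || (hostOs.length == 1 && hostOs[0].equalsIgnoreCase(WINDOWS_OS))
                || (((tempFreeStorage * 100.0) / tempTotalStorage) >= minAvailableTempPercentage);
    }

    public static void main(String[] args) {
        // 5% free against a 20% minimum: a multi-OS host should not be booked.
        System.out.println(isTempDirStorageEnough(100_000L, 5_000L,
                new String[] {"centos7", "rocky9"}, 20));    // false
        // A host reporting only Windows bypasses the free-space check entirely.
        System.out.println(isTempDirStorageEnough(100_000L, 5_000L,
                new String[] {"Windows"}, 20));              // true
    }
}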
frame = frames.get(0); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job.os); proc.coresReserved = 100; dispatcher.dispatch(frame, proc); @@ -288,7 +288,7 @@ public void testFindNextDispatchFramesByProcAndJobLocal() { assertEquals(10, frames.size()); DispatchFrame frame = frames.get(0); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job.os); proc.coresReserved = 100; proc.isLocalDispatch = true; @@ -310,7 +310,7 @@ public void testFindNextDispatchFramesByProcAndLayerLocal() { assertEquals(10, frames.size()); DispatchFrame frame = frames.get(0); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job.os); proc.coresReserved = 100; proc.isLocalDispatch = true; @@ -406,7 +406,7 @@ public void testfindUnderProcedJob() { "SELECT str_state FROM job WHERE pk_job=?", String.class, job2.id)); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job1.os); proc.coresReserved = 100; dispatcher.dispatch(frame, proc); @@ -442,7 +442,7 @@ public void testHigherPriorityJobExistsTrue() { "SELECT str_state FROM job WHERE pk_job=?", String.class, job2.id)); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job2.os); proc.coresReserved = 100; dispatcher.dispatch(frame, proc); @@ -476,7 +476,7 @@ public void testHigherPriorityJobExistsFalse() { "SELECT str_state FROM job WHERE pk_job=?", String.class, job2.id)); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job2.os); proc.coresReserved = 100; dispatcher.dispatch(frame, proc); @@ -511,7 +511,7 @@ public void testHigherPriorityJobExistsMaxProcBound() { "SELECT str_state FROM job WHERE pk_job=?", String.class, job2.id)); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job2.os); proc.coresReserved = 100; dispatcher.dispatch(frame, proc); @@ -525,4 +525,46 @@ public void testHigherPriorityJobExistsMaxProcBound() { public void testFifoSchedulingEnabled() { assertEquals(dispatcherDao.getSchedulingMode(), DispatcherDao.SchedulingMode.PRIORITY_ONLY); } + + @Test + @Transactional + @Rollback(true) + public void testFindDispatchJobsByShowMultiOs() { + DispatchHost host = getHost(); + // Set multiple Os and confirm jobs with Linux are still being found + final JobDetail job = getJob1(); + assertNotNull(job); + + // Host with different os + host.setOs("centos7,SomethingElse"); + Set jobs = dispatcherDao.findDispatchJobs(host, + adminManager.findShowEntity("pipe"), 5); + assertTrue(jobs.size() == 0); + + // Host with Linux Os (same as defined on spec) + host.setOs("centos7,Linux,rocky9"); + jobs = dispatcherDao.findDispatchJobs(host, + adminManager.findShowEntity("pipe"), 5); + assertTrue(jobs.size() > 0); + } + + @Test + @Transactional + @Rollback(true) + public void testFindDispatchJobsAllShowsMultiOs() { + DispatchHost host = getHost(); + // Set multiple Os and confirm jobs with Linux are still being found + final JobDetail job = getJob1(); + assertNotNull(job); + + // Host with incompatible OS shouldn't find any job + host.setOs("centos7,SomethingElse"); + Set jobs = dispatcherDao.findDispatchJobs(host, 5); + assertTrue(jobs.size() == 0); + + // Host with Linux Os (same as defined on spec) should find jobs + host.setOs("centos7,Linux,rocky9"); + jobs = 
dispatcherDao.findDispatchJobs(host, 5); + assertTrue(jobs.size() > 0); + } } diff --git a/proto/rqd.proto b/proto/rqd.proto index f6e0d8790..8d1946981 100644 --- a/proto/rqd.proto +++ b/proto/rqd.proto @@ -112,6 +112,7 @@ message RunFrame { map attributes = 22; int32 num_gpus = 23; report.ChildrenProcStats children = 24; + string os = 25; } message RunFrameSeq { diff --git a/rqd/rqd.example.conf b/rqd/rqd.example.conf index df909f5e7..1eff0b640 100644 --- a/rqd/rqd.example.conf +++ b/rqd/rqd.example.conf @@ -29,9 +29,18 @@ MAYA_SCRIPT_PATH PIXAR_LICENSE_FILE [docker.config] -DOCKER_IMAGE="" +# Setting this to True requires all the additional "docker.[]" sections to be filled RUN_ON_DOCKER=False +# This section is only required if RUN_ON_DOCKER=True +# List of volume mounts following docker run's format, but replacing = with : [docker.mounts] TEMP=type:bind,source:/tmp,target:/tmp,bind-propagation:slave NET=type:bind,source:/net,target:/net,bind-propagation:slave + +# This section is only required if RUN_ON_DOCKER=True +# - keys represent OSs this rqd is capable of executing jobs in +# - values are docker image tags +[docker.images] +centos7=centos7.3:latest +rocky9=rocky9.3:latest \ No newline at end of file diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py index ba1796baf..beba053fb 100644 --- a/rqd/rqd/rqconstants.py +++ b/rqd/rqd/rqconstants.py @@ -1,4 +1,5 @@ # Copyright Contributors to the OpenCue Project +# Copyright Contributors to the OpenCue Project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -155,7 +156,7 @@ # Docker mode config RUN_ON_DOCKER = False -DOCKER_IMAGE = "Invalid" +DOCKER_IMAGES = {} DOCKER_MOUNTS = [] try: @@ -163,8 +164,6 @@ # Hostname can come from here: rqutil.getHostname() __override_section = "Override" __host_env_var_section = "UseHostEnvVar" - __docker_mounts = "docker.mounts" - __docker_config = "docker.config" import six from six.moves import configparser if six.PY2: @@ -237,6 +236,10 @@ if config.has_section(__host_env_var_section): RQD_HOST_ENV_VARS = config.options(__host_env_var_section) + __docker_mounts = "docker.mounts" + __docker_config = "docker.config" + __docker_images = "docker.images" + if config.has_section(__docker_config): RUN_ON_DOCKER = config.getboolean(__docker_config, "RUN_ON_DOCKER") if RUN_ON_DOCKER: @@ -247,6 +250,32 @@ # rqd needs to run as root to be able to run docker RQD_UID = 0 RQD_GID = 0 + + # Every key:value on the config file under docker.images + # is parsed as key=SP_OS and value=image_tag. + # SP_OS is set to a list of all available keys + # For example: + # + # rqd.conf + # [docker.images] + # centos7=centos7.3:latest + # rocky9=rocky9.3:latest + # + # becomes: + # SP_OS=centos7,rocky9 + # DOCKER_IMAGES={ + # "centos7": "centos7.3:latest", + # "rocky9": "rocky9.3:latest" + # } + keys = config.options(__docker_images) + DOCKER_IMAGES = {} + for key in keys: + DOCKER_IMAGES[key] = config.get(__docker_images, key) + SP_OS = ",".join(keys) + if not DOCKER_IMAGES: + raise RuntimeError("Misconfigured rqd. 
RUN_ON_DOCKER=True requires at " + "least one image on DOCKER_IMAGES ([docker.images] section of rqd.conf)") + def parse_mount(mount_str): """ Parse mount definitions similar to a docker run command into a docker @@ -262,7 +291,6 @@ def parse_mount(mount_str): mount_dict[key.strip()] = value.strip() return mount_dict - DOCKER_IMAGE = config.get(__docker_config, "DOCKER_IMAGE") # Parse values under the category docker.mounts into Mount objects mounts = config.options(__docker_mounts) for mount_name in mounts: diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index e50a40d7d..4de489e1d 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -91,12 +91,12 @@ def __init__(self, optNimbyoff=False): self.docker_client = None self.docker_mounts = [] - self.docker_image = "Invalid" + self.docker_images = {} if rqd.rqconstants.RUN_ON_DOCKER: # pylint: disable=import-outside-toplevel import docker self.docker_client = docker.from_env() - self.docker_image = rqd.rqconstants.DOCKER_IMAGE + self.docker_images = rqd.rqconstants.DOCKER_IMAGES self.docker_mounts = rqd.rqconstants.DOCKER_MOUNTS signal.signal(signal.SIGINT, self.handleExit) @@ -936,14 +936,25 @@ def runDocker(self): frameInfo = self.frameInfo runFrame = self.runFrame - # TODO: implement support for multiple images - # requires adding `string os = 25;` to rqd.proto/RunFrame - # - # image = self.rqCore.docker_images.get(runFrame.os) - # if image is None: - # raise RuntimeError("rqd not configured to run an - # image for this frame OS: %s", runFrame.os) - image = self.rqCore.docker_image + if runFrame.os: + image = self.rqCore.docker_images.get(runFrame.os) + if image is None: + self.__writeHeader() + msg = ("This rqd is not configured to run an image " + "for this frame OS: %s. Check the [docker.images] " + "section of rqd.conf for more information." % runFrame.os) + self.rqlog.write(msg, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + raise RuntimeError(msg) + elif self.rqCore.docker_images: + # If a frame doesn't require a specific OS, default to the first configured OS on + # [docker.images] + image = list(self.rqCore.docker_images.values())[0] + else: + self.__writeHeader() + msg = ("Misconfigured rqd. RUN_ON_DOCKER=True requires at " + "least one image on DOCKER_IMAGES ([docker.images] section of rqd.conf)") + self.rqlog.write(msg, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + raise RuntimeError(msg) self.__createEnvVariables() self.__writeHeader() diff --git a/rqd/rqd/rqutil.py b/rqd/rqd/rqutil.py index ce1964f08..3d8abc964 100644 --- a/rqd/rqd/rqutil.py +++ b/rqd/rqd/rqutil.py @@ -159,8 +159,7 @@ def checkAndCreateUser(username, uid=None, gid=None): subprocess.check_call(cmd) # pylint: disable=broad-except except Exception: - logging.exception("useradd failed to add user: %s. User possibly already exists.", - username) + logging.info("useradd failed to add user: %s.
User possibly already exists.", username) finally: permissionsLow() From b117568d99fb8a3f28b9782041f56968de2f17b6 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 18 Oct 2024 16:43:51 -0700 Subject: [PATCH 08/51] Update placeholder branch for containerized_rqd (#1550) Signed-off-by: Diego Tavares --- .../com/imageworks/spcue/DispatchFrame.java | 3 + .../com/imageworks/spcue/DispatchHost.java | 10 +- .../com/imageworks/spcue/VirtualProc.java | 10 +- .../spcue/dao/postgres/DispatchQuery.java | 297 +++--------------- .../spcue/dao/postgres/DispatcherDaoJdbc.java | 181 +++++++---- .../spcue/dao/postgres/FrameDaoJdbc.java | 2 + .../spcue/dao/postgres/HostDaoJdbc.java | 2 +- .../dispatcher/DispatchSupportService.java | 1 + .../spcue/dispatcher/HostReportHandler.java | 22 +- .../test/dao/postgres/DispatcherDaoTests.java | 58 +++- .../src/test/resources/conf/dtd/cjsl-1.14.dtd | 104 ++++++ .../conf/jobspec/jobspec_dispatch_test.xml | 10 +- proto/rqd.proto | 1 + rqd/rqd.example.conf | 13 +- rqd/rqd/rqconstants.py | 35 ++- rqd/rqd/rqcore.py | 37 ++- rqd/rqd/rqutil.py | 3 +- 17 files changed, 440 insertions(+), 349 deletions(-) create mode 100644 cuebot/src/test/resources/conf/dtd/cjsl-1.14.dtd diff --git a/cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java b/cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java index faa1a9c04..2c60e9930 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java +++ b/cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java @@ -49,5 +49,8 @@ public class DispatchFrame extends FrameEntity implements FrameInterface { // A comma separated list of services public String services; + + // The Operational System this frame is expected to run in + public String os; } diff --git a/cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java b/cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java index f01724e17..40a3e6bbc 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java +++ b/cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java @@ -51,7 +51,7 @@ public class DispatchHost extends Entity public long gpuMemory; public long idleGpuMemory; public String tags; - public String os; + private String os; public boolean isNimby; public boolean isLocalDispatch = false; @@ -81,6 +81,14 @@ public String getFacilityId() { return facilityId; } + public String[] getOs() { + return this.os.split(","); + } + + public void setOs(String os) { + this.os = os; + } + public boolean canHandleNegativeCoresRequest(int requestedCores) { // Request is positive, no need to test further. if (requestedCores > 0) { diff --git a/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java b/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java index 8205f3021..02ade6bb4 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java @@ -85,7 +85,9 @@ public String getName() { * @param frame * @return */ - public static final VirtualProc build(DispatchHost host, DispatchFrame frame, String... selfishServices) { + public static final VirtualProc build(DispatchHost host, + DispatchFrame frame, + String... 
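DispatchHost now keeps the os column private as the comma-separated string reported by RQD (SP_OS is assembled from the [docker.images] keys in rqd.conf) and exposes it through getOs() as an array, which is what feeds the IN-clause expansion shown earlier. A cut-down sketch of the accessor pair and of how a multi-OS host can match a job's single OS; the Host class below is a stand-in, not the real entity:

import java.util.Arrays;

public class MultiOsHostSketch {

    // Stand-in for the private os field plus the getOs()/setOs() pair added here.
    static class Host {
        private String os;

        void setOs(String os) { this.os = os; }

        String[] getOs() { return this.os.split(","); }
    }

    public static void main(String[] args) {
        Host host = new Host();
        host.setOs("centos7,Linux,rocky9");   // as reported by a containerized RQD

        String jobOs = "Linux";               // job.str_os from the job spec
        boolean bookable = Arrays.asList(host.getOs()).contains(jobOs);

        System.out.println(Arrays.toString(host.getOs()));   // [centos7, Linux, rocky9]
        System.out.println(bookable);                        // true
    }
}

In the SQL path this membership test is performed by the database through the expanded str_os IN (...) list rather than in Java, as exercised by the new multi-OS tests in DispatcherDaoTests.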
selfishServices) { VirtualProc proc = new VirtualProc(); proc.allocationId = host.getAllocationId(); proc.hostId = host.getHostId(); @@ -94,7 +96,7 @@ public static final VirtualProc build(DispatchHost host, DispatchFrame frame, St proc.jobId = frame.getJobId(); proc.showId = frame.getShowId(); proc.facilityId = frame.getFacilityId(); - proc.os = host.os; + proc.os = frame.os; proc.hostName = host.getName(); proc.unbooked = false; @@ -148,7 +150,7 @@ else if (proc.coresReserved >= 100) { proc.coresReserved = wholeCores * 100; } else { if (frame.threadable) { - if (selfishServices != null && + if (selfishServices != null && frame.services != null && containsSelfishService(frame.services.split(","), selfishServices)){ proc.coresReserved = wholeCores * 100; @@ -238,7 +240,7 @@ public static final VirtualProc build(DispatchHost host, proc.jobId = frame.getJobId(); proc.showId = frame.getShowId(); proc.facilityId = frame.getFacilityId(); - proc.os = host.os; + proc.os = frame.os; proc.hostName = host.getName(); proc.unbooked = false; diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java index e36f97999..02dae0f22 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java @@ -72,9 +72,9 @@ public class DispatchQuery { "AND job.pk_facility = ? " + "AND " + "(" + - "job.str_os IS NULL OR job.str_os = '' " + + "job.str_os IS NULL OR job.str_os IN '' " + "OR " + - "job.str_os = ? " + + "job.str_os IN ? " + ") " + "AND (CASE WHEN layer_stat.int_waiting_count > 0 THEN 1 ELSE NULL END) = 1 " + "AND layer.int_cores_min <= ? " + @@ -135,7 +135,7 @@ public class DispatchQuery { "(" + "job.str_os IS NULL OR job.str_os = '' " + "OR " + - "job.str_os = ? " + + "job.str_os IN ? " + ") " + "AND (CASE WHEN layer_stat.int_waiting_count > 0 THEN 1 ELSE NULL END) = 1 " + "AND layer.int_cores_min <= ? " + @@ -250,7 +250,7 @@ private static final String replaceQueryForFifo(String query) { "AND " + "job.pk_facility = ? " + "AND " + - "(job.str_os = ? OR job.str_os IS NULL) " + + "(job.str_os IN ? OR job.str_os IS NULL) " + "AND " + "job.pk_job IN ( " + "SELECT " + @@ -276,7 +276,7 @@ private static final String replaceQueryForFifo(String query) { "AND " + "j.pk_facility = ? " + "AND " + - "(j.str_os = ? OR j.str_os IS NULL) " + + "(j.str_os IN ? OR j.str_os IS NULL) " + "AND " + "(CASE WHEN lst.int_waiting_count > 0 THEN lst.pk_layer ELSE NULL END) = l.pk_layer " + "AND " + @@ -519,40 +519,42 @@ private static final String replaceQueryForFifo(String query) { ") " + "LIMIT 1"; + private static final String FIND_DISPATCH_FRAME_COLUMNS = + "show_name, " + + "job_name, " + + "pk_job, " + + "pk_show, " + + "pk_facility, " + + "str_name, " + + "str_shot, " + + "str_user, " + + "int_uid, " + + "str_log_dir, " + + "str_os, " + + "frame_name, " + + "frame_state, " + + "pk_frame, " + + "pk_layer, " + + "int_retries, " + + "int_version, " + + "layer_name, " + + "layer_type, " + + "b_threadable, " + + "int_cores_min, " + + "int_cores_max, " + + "int_mem_min, " + + "int_gpus_min, " + + "int_gpus_max, " + + "int_gpu_mem_min, " + + "str_cmd, " + + "str_range, " + + "int_chunk_size, " + + "str_services "; /** * Finds the next frame in a job for a proc. 
*/ public static final String FIND_DISPATCH_FRAME_BY_JOB_AND_PROC = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "b_threadable, " + - "int_cores_min, " + - "int_cores_max, " + - "int_mem_min, " + - "int_gpus_min, " + - "int_gpus_max, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM ( " + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -569,6 +571,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -638,36 +641,7 @@ private static final String replaceQueryForFifo(String query) { * Find the next frame in a job for a host. */ public static final String FIND_DISPATCH_FRAME_BY_JOB_AND_HOST = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "int_cores_min, " + - "int_cores_max, " + - "int_gpus_min, " + - "int_gpus_max, " + - "b_threadable, " + - "int_mem_min, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM ( " + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -684,6 +658,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -754,36 +729,7 @@ private static final String replaceQueryForFifo(String query) { public static final String FIND_LOCAL_DISPATCH_FRAME_BY_JOB_AND_PROC = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "b_threadable, " + - "int_cores_min, " + - "int_cores_max, " + - "int_mem_min, " + - "int_gpus_min, " + - "int_gpus_max, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM ( " + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -800,6 +746,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -863,36 +810,7 @@ private static final String replaceQueryForFifo(String query) { * Find the next frame in a job for a host. 
*/ public static final String FIND_LOCAL_DISPATCH_FRAME_BY_JOB_AND_HOST = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "int_cores_min, " + - "int_cores_max, " + - "int_gpus_min, " + - "int_gpus_max, " + - "b_threadable, " + - "int_mem_min, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM (" + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -909,6 +827,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -975,36 +894,7 @@ private static final String replaceQueryForFifo(String query) { * Finds the next frame in a job for a proc. */ public static final String FIND_DISPATCH_FRAME_BY_LAYER_AND_PROC = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "b_threadable, " + - "int_cores_min, " + - "int_cores_max, " + - "int_mem_min, " + - "int_gpus_min, " + - "int_gpus_max, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM (" + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -1021,6 +911,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -1090,36 +981,7 @@ private static final String replaceQueryForFifo(String query) { * Find the next frame in a job for a host. 
*/ public static final String FIND_DISPATCH_FRAME_BY_LAYER_AND_HOST = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "int_cores_min, " + - "int_cores_max, " + - "b_threadable, " + - "int_mem_min, " + - "int_gpus_min, " + - "int_gpus_max, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM (" + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -1136,6 +998,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -1206,36 +1069,7 @@ private static final String replaceQueryForFifo(String query) { public static final String FIND_LOCAL_DISPATCH_FRAME_BY_LAYER_AND_PROC = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "b_threadable, " + - "int_cores_min, " + - "int_cores_max, " + - "int_mem_min, " + - "int_gpus_min, " + - "int_gpus_max, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM (" + "SELECT " + "ROW_NUMBER() OVER ( ORDER BY " + @@ -1252,6 +1086,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + @@ -1315,36 +1150,7 @@ private static final String replaceQueryForFifo(String query) { * Find the next frame in a job for a host. 
*/ public static final String FIND_LOCAL_DISPATCH_FRAME_BY_LAYER_AND_HOST = - "SELECT " + - "show_name, " + - "job_name, " + - "pk_job, " + - "pk_show, " + - "pk_facility, " + - "str_name, " + - "str_shot, " + - "str_user, " + - "int_uid, " + - "str_log_dir, " + - "frame_name, " + - "frame_state, " + - "pk_frame, " + - "pk_layer, " + - "int_retries, " + - "int_version, " + - "layer_name, " + - "layer_type, " + - "int_cores_min, " + - "int_cores_max, " + - "b_threadable, " + - "int_mem_min, " + - "int_gpus_min, " + - "int_gpus_max, " + - "int_gpu_mem_min, " + - "str_cmd, " + - "str_range, " + - "int_chunk_size, " + - "str_services " + + "SELECT " + FIND_DISPATCH_FRAME_COLUMNS + "FROM (" + "SELECT " + "ROW_NUMBER() OVER (ORDER BY " + @@ -1361,6 +1167,7 @@ private static final String replaceQueryForFifo(String query) { "job.str_user, " + "job.int_uid, " + "job.str_log_dir, " + + "job.str_os, " + "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, " + "frame.pk_frame, " + diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java index b17ae14e3..7db4714ea 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java @@ -24,6 +24,7 @@ import java.sql.ResultSet; import java.sql.SQLException; +import java.util.ArrayList; import java.util.Collections; import java.util.LinkedHashSet; import java.util.LinkedList; @@ -172,6 +173,12 @@ else if (cached.isExpired()) { return bookableShows.get(key).shows; } + // Given a query, + private String handleInClause(String key, String query, int inValueLength) { + String placeholders = String.join(",", Collections.nCopies(inValueLength, "?")); + return query.replace(key + " IN ?", key + " IN (" + placeholders + ")"); + } + private Set findDispatchJobs(DispatchHost host, int numJobs, boolean shuffleShows) { LinkedHashSet result = new LinkedHashSet(); List shows = new LinkedList(getBookableShows(host)); @@ -216,20 +223,24 @@ private Set findDispatchJobs(DispatchHost host, int numJobs, boolean shu @Override public PreparedStatement createPreparedStatement(Connection conn) throws SQLException { - PreparedStatement find_jobs_stmt = conn.prepareStatement( - FIND_JOBS_BY_SHOW_NO_GPU); - find_jobs_stmt.setString(1, s.getShowId()); - find_jobs_stmt.setString(2, host.getFacilityId()); - find_jobs_stmt.setString(3, host.os); - find_jobs_stmt.setInt(4, host.idleCores); - find_jobs_stmt.setLong(5, host.idleMemory); - find_jobs_stmt.setInt(6, threadMode(host.threadMode)); - find_jobs_stmt.setString(7, host.getName()); - find_jobs_stmt.setInt(8, numJobs * 10); + String query = handleInClause("str_os", FIND_JOBS_BY_SHOW_NO_GPU, host.getOs().length); + PreparedStatement find_jobs_stmt = conn.prepareStatement(query); + + int index = 1; + find_jobs_stmt.setString(index++, s.getShowId()); + find_jobs_stmt.setString(index++, host.getFacilityId()); + for (String item : host.getOs()) { + find_jobs_stmt.setString(index++, item); + } + find_jobs_stmt.setInt(index++, host.idleCores); + find_jobs_stmt.setLong(index++, host.idleMemory); + find_jobs_stmt.setInt(index++, threadMode(host.threadMode)); + find_jobs_stmt.setString(index++, host.getName()); + find_jobs_stmt.setInt(index++, numJobs * 10); return find_jobs_stmt; }}, PKJOB_MAPPER )); - prometheusMetrics.setBookingDurationMetric("findDispatchJobs nogpu findByShowQuery", + 
prometheusMetrics.setBookingDurationMetric("findDispatchJobs nogpu findByShowQuery", System.currentTimeMillis() - lastTime); } else { @@ -237,19 +248,22 @@ public PreparedStatement createPreparedStatement(Connection conn) @Override public PreparedStatement createPreparedStatement(Connection conn) throws SQLException { - PreparedStatement find_jobs_stmt = conn.prepareStatement( - findByShowQuery()); - find_jobs_stmt.setString(1, s.getShowId()); - find_jobs_stmt.setString(2, host.getFacilityId()); - find_jobs_stmt.setString(3, host.os); - find_jobs_stmt.setInt(4, host.idleCores); - find_jobs_stmt.setLong(5, host.idleMemory); - find_jobs_stmt.setInt(6, threadMode(host.threadMode)); - find_jobs_stmt.setInt(7, host.idleGpus); - find_jobs_stmt.setLong(8, (host.idleGpuMemory > 0) ? 1 : 0); - find_jobs_stmt.setLong(9, host.idleGpuMemory); - find_jobs_stmt.setString(10, host.getName()); - find_jobs_stmt.setInt(11, numJobs * 10); + String query = handleInClause("str_os", findByShowQuery(), host.getOs().length); + PreparedStatement find_jobs_stmt = conn.prepareStatement(query); + int index = 1; + find_jobs_stmt.setString(index++, s.getShowId()); + find_jobs_stmt.setString(index++, host.getFacilityId()); + for (String item : host.getOs()) { + find_jobs_stmt.setString(index++, item); + } + find_jobs_stmt.setInt(index++, host.idleCores); + find_jobs_stmt.setLong(index++, host.idleMemory); + find_jobs_stmt.setInt(index++, threadMode(host.threadMode)); + find_jobs_stmt.setInt(index++, host.idleGpus); + find_jobs_stmt.setLong(index++, (host.idleGpuMemory > 0) ? 1 : 0); + find_jobs_stmt.setLong(index++, host.idleGpuMemory); + find_jobs_stmt.setString(index++, host.getName()); + find_jobs_stmt.setInt(index++, numJobs * 10); return find_jobs_stmt; }}, PKJOB_MAPPER )); @@ -308,31 +322,48 @@ public Set findDispatchJobs(DispatchHost host, GroupInterface g) { long lastTime = System.currentTimeMillis(); if (host.idleGpus == 0 && (schedulingMode == SchedulingMode.BALANCED)) { + String query = handleInClause("str_os", FIND_JOBS_BY_GROUP_NO_GPU, host.getOs().length); + ArrayList args = new ArrayList(); + + args.add(g.getGroupId()); + args.add(host.getFacilityId()); + for (String item : host.getOs()) { + args.add(item); + } + args.add(host.idleCores); + args.add(host.idleMemory); + args.add(threadMode(host.threadMode)); + args.add(host.getName()); + args.add(50); result.addAll(getJdbcTemplate().query( - FIND_JOBS_BY_GROUP_NO_GPU, - PKJOB_MAPPER, - g.getGroupId(), host.getFacilityId(), host.os, - host.idleCores, host.idleMemory, - threadMode(host.threadMode), - host.getName(), 50)); + query, + PKJOB_MAPPER, args.toArray())); prometheusMetrics.setBookingDurationMetric("findDispatchJobs by group nogpu query", System.currentTimeMillis() - lastTime); } else { + String query = handleInClause("str_os", findByGroupQuery(), host.getOs().length); + ArrayList args = new ArrayList(); + + args.add(g.getGroupId()); + args.add(host.getFacilityId()); + for (String item : host.getOs()) { + args.add(item); + } + args.add(host.idleCores); + args.add(host.idleMemory); + args.add(threadMode(host.threadMode)); + args.add(host.idleGpus); + args.add(host.idleGpuMemory > 0 ? 1 : 0); + args.add(host.idleGpuMemory); + args.add(host.getName()); + args.add(50); result.addAll(getJdbcTemplate().query( - findByGroupQuery(), - PKJOB_MAPPER, - g.getGroupId(),host.getFacilityId(), host.os, - host.idleCores, host.idleMemory, - threadMode(host.threadMode), - host.idleGpus, - (host.idleGpuMemory > 0) ? 
1 : 0, host.idleGpuMemory, - host.getName(), 50)); + query, + PKJOB_MAPPER, args.toArray())); prometheusMetrics.setBookingDurationMetric("findDispatchJobs by group query", System.currentTimeMillis() - lastTime); - } - return result; } @@ -515,26 +546,47 @@ public Set findDispatchJobs(DispatchHost host, LinkedHashSet result = new LinkedHashSet(numJobs); long start = System.currentTimeMillis(); if (host.idleGpus == 0 && (schedulingMode == SchedulingMode.BALANCED)) { + String query = handleInClause("str_os", FIND_JOBS_BY_SHOW_NO_GPU, host.getOs().length); + ArrayList args = new ArrayList(); + args.add(show.getShowId()); + args.add(host.getFacilityId()); + for (String item : host.getOs()) { + args.add(item); + } + args.add(host.idleCores); + args.add(host.idleMemory); + args.add(threadMode(host.threadMode)); + args.add(host.getName()); + args.add(numJobs * 10); + result.addAll(getJdbcTemplate().query( - FIND_JOBS_BY_SHOW_NO_GPU, - PKJOB_MAPPER, - show.getShowId(), host.getFacilityId(), host.os, - host.idleCores, host.idleMemory, - threadMode(host.threadMode), - host.getName(), numJobs * 10)); + query, + PKJOB_MAPPER, args.toArray())); + prometheusMetrics.setBookingDurationMetric("findDispatchJobs by show nogpu query", System.currentTimeMillis() - start); } else { + String query = handleInClause("str_os", findByShowQuery(), host.getOs().length); + ArrayList args = new ArrayList(); + args.add(show.getShowId()); + args.add(host.getFacilityId()); + for (String item : host.getOs()) { + args.add(item); + } + args.add(host.idleCores); + args.add(host.idleMemory); + args.add(threadMode(host.threadMode)); + args.add(host.idleGpus); + args.add(host.idleGpuMemory > 0 ? 1 : 0); + args.add(host.idleGpuMemory); + args.add(host.getName()); + args.add(numJobs * 10); + result.addAll(getJdbcTemplate().query( - findByShowQuery(), - PKJOB_MAPPER, - show.getShowId(), host.getFacilityId(), host.os, - host.idleCores, host.idleMemory, - threadMode(host.threadMode), - host.idleGpus, - (host.idleGpuMemory > 0) ? 
1 : 0, host.idleGpuMemory, - host.getName(), numJobs * 10)); + query, + PKJOB_MAPPER, args.toArray())); + prometheusMetrics.setBookingDurationMetric("findDispatchJobs by show query", System.currentTimeMillis() - start); } @@ -548,11 +600,24 @@ public Set findDispatchJobs(DispatchHost host, public Set findLocalDispatchJobs(DispatchHost host) { LinkedHashSet result = new LinkedHashSet(5); long start = System.currentTimeMillis(); + + String query = handleInClause("str_os", FIND_JOBS_BY_LOCAL, host.getOs().length); + ArrayList args = new ArrayList(); + args.add(host.getHostId()); + args.add(host.getFacilityId()); + for (String item : host.getOs()) { + args.add(item); + } + args.add(host.getHostId()); + args.add(host.getFacilityId()); + for (String item : host.getOs()) { + args.add(item); + } + result.addAll(getJdbcTemplate().query( - FIND_JOBS_BY_LOCAL, - PKJOB_MAPPER, - host.getHostId(), host.getFacilityId(), - host.os, host.getHostId(), host.getFacilityId(), host.os)); + query, + PKJOB_MAPPER, args.toArray())); + prometheusMetrics.setBookingDurationMetric("findLocalDispatchJobs query", System.currentTimeMillis() - start); return result; diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java index 0546d4558..9e0f6f80c 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java @@ -331,6 +331,7 @@ public DispatchFrame mapRow(ResultSet rs, int rowNum) throws SQLException { frame.minGpuMemory = rs.getLong("int_gpu_mem_min"); frame.version = rs.getInt("int_version"); frame.services = rs.getString("str_services"); + frame.os = rs.getString("str_os"); return frame; } }; @@ -347,6 +348,7 @@ public DispatchFrame mapRow(ResultSet rs, int rowNum) throws SQLException { "job.str_user,"+ "job.int_uid,"+ "job.str_log_dir,"+ + "job.str_os,"+ "frame.str_name AS frame_name, "+ "frame.str_state AS frame_state, "+ "frame.pk_frame, "+ diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/HostDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/HostDaoJdbc.java index 223737042..304fe474d 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/HostDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/HostDaoJdbc.java @@ -214,7 +214,7 @@ public DispatchHost mapRow(ResultSet rs, int rowNum) throws SQLException { host.isNimby = rs.getBoolean("b_nimby"); host.threadMode = rs.getInt("int_thread_mode"); host.tags = rs.getString("str_tags"); - host.os = rs.getString("str_os"); + host.setOs(rs.getString("str_os")); host.hardwareState = HardwareState.valueOf(rs.getString("str_state")); return host; diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java index f60b2c1e6..0779209b0 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java @@ -395,6 +395,7 @@ public RunFrame prepareRqdRunFrame(VirtualProc proc, DispatchFrame frame) { .setNumGpus(proc.gpusReserved) .setStartTime(System.currentTimeMillis()) .setIgnoreNimby(proc.isLocalDispatch) + .setOs(proc.os) .putAllEnvironment(jobDao.getEnvironment(frame)) .putAllEnvironment(layerDao.getLayerEnvironment(frame)) .putEnvironment("CUE3", "1") diff --git 
a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java index b0a7ccd9c..b91a867bb 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java @@ -254,8 +254,10 @@ public void handleHostReport(HostReport report, boolean isBoot) { bookingManager.removeInactiveLocalHostAssignment(lca); } } - - if (!isTempDirStorageEnough(report.getHost().getTotalMcp(), report.getHost().getFreeMcp(), host.os)) { + + if (!isTempDirStorageEnough(report.getHost().getTotalMcp(), + report.getHost().getFreeMcp(), + host.getOs())) { msg = String.format( "%s doesn't have enough free space in the temporary directory (mcp), %dMB", host.name, (report.getHost().getFreeMcp()/1024)); @@ -348,16 +350,19 @@ else if (!dispatchSupport.isCueBookable(host)) { * * @param tempTotalStorage Total storage on the temp directory * @param tempFreeStorage Free storage on the temp directory - * @param hostOs Reported os + * @param hostOs Reported operational systems * @return */ - private boolean isTempDirStorageEnough(Long tempTotalStorage, Long tempFreeStorage, String hostOs) { + private boolean isTempDirStorageEnough(Long tempTotalStorage, Long tempFreeStorage, String[] hostOs) { // The minimum amount of free space in the temporary directory to book a host int minAvailableTempPercentage = env.getRequiredProperty( "dispatcher.min_available_temp_storage_percentage", Integer.class); - return minAvailableTempPercentage == -1 || hostOs.equalsIgnoreCase(WINDOWS_OS) || - (((tempFreeStorage * 100.0) / tempTotalStorage) >= minAvailableTempPercentage); + return minAvailableTempPercentage == -1 + // It is safe to asume multiple OSs imply windows is not the base OS, + // threfore Windows will always report a single hostOs + || (hostOs.length == 1 && hostOs[0].equalsIgnoreCase(WINDOWS_OS)) + || (((tempFreeStorage * 100.0) / tempTotalStorage) >= minAvailableTempPercentage); } /** @@ -424,7 +429,10 @@ private boolean changeStateForTempDirStorage(DispatchHost host, RenderHost repor "dispatcher.min_available_temp_storage_percentage", Integer.class); // Prevent cue frames from booking on hosts with full temporary directories - boolean hasEnoughTempStorage = isTempDirStorageEnough(reportHost.getTotalMcp(), reportHost.getFreeMcp(), host.os); + boolean hasEnoughTempStorage = isTempDirStorageEnough( + reportHost.getTotalMcp(), + reportHost.getFreeMcp(), + host.getOs()); if (!hasEnoughTempStorage && host.hardwareState == HardwareState.UP) { // Insert a comment indicating that the Host status = Repair with reason = Full temporary directory CommentDetail c = new CommentDetail(); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DispatcherDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DispatcherDaoTests.java index 5b7eaee72..1ff849473 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DispatcherDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DispatcherDaoTests.java @@ -203,7 +203,7 @@ public void testFindNextDispatchFrameByProc() { assertNotNull(frame); assertEquals("0001-pass_1", frame.name); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job.os); proc.coresReserved = 100; dispatcher.dispatch(frame, proc); @@ -235,7 +235,7 @@ public void testFindNextDispatchFramesByProc() { DispatchFrame 
frame = frames.get(0); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job.os); proc.coresReserved = 100; dispatcher.dispatch(frame, proc); @@ -288,7 +288,7 @@ public void testFindNextDispatchFramesByProcAndJobLocal() { assertEquals(10, frames.size()); DispatchFrame frame = frames.get(0); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job.os); proc.coresReserved = 100; proc.isLocalDispatch = true; @@ -310,7 +310,7 @@ public void testFindNextDispatchFramesByProcAndLayerLocal() { assertEquals(10, frames.size()); DispatchFrame frame = frames.get(0); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job.os); proc.coresReserved = 100; proc.isLocalDispatch = true; @@ -406,7 +406,7 @@ public void testfindUnderProcedJob() { "SELECT str_state FROM job WHERE pk_job=?", String.class, job2.id)); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job1.os); proc.coresReserved = 100; dispatcher.dispatch(frame, proc); @@ -442,7 +442,7 @@ public void testHigherPriorityJobExistsTrue() { "SELECT str_state FROM job WHERE pk_job=?", String.class, job2.id)); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job2.os); proc.coresReserved = 100; dispatcher.dispatch(frame, proc); @@ -476,7 +476,7 @@ public void testHigherPriorityJobExistsFalse() { "SELECT str_state FROM job WHERE pk_job=?", String.class, job2.id)); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job2.os); proc.coresReserved = 100; dispatcher.dispatch(frame, proc); @@ -511,7 +511,7 @@ public void testHigherPriorityJobExistsMaxProcBound() { "SELECT str_state FROM job WHERE pk_job=?", String.class, job2.id)); - VirtualProc proc = VirtualProc.build(host, frame); + VirtualProc proc = VirtualProc.build(host, frame, job2.os); proc.coresReserved = 100; dispatcher.dispatch(frame, proc); @@ -525,4 +525,46 @@ public void testHigherPriorityJobExistsMaxProcBound() { public void testFifoSchedulingEnabled() { assertEquals(dispatcherDao.getSchedulingMode(), DispatcherDao.SchedulingMode.PRIORITY_ONLY); } + + @Test + @Transactional + @Rollback(true) + public void testFindDispatchJobsByShowMultiOs() { + DispatchHost host = getHost(); + // Set multiple Os and confirm jobs with Linux are still being found + final JobDetail job = getJob1(); + assertNotNull(job); + + // Host with different os + host.setOs("centos7,SomethingElse"); + Set jobs = dispatcherDao.findDispatchJobs(host, + adminManager.findShowEntity("pipe"), 5); + assertTrue(jobs.size() == 0); + + // Host with Linux Os (same as defined on spec) + host.setOs("centos7,Linux,rocky9"); + jobs = dispatcherDao.findDispatchJobs(host, + adminManager.findShowEntity("pipe"), 5); + assertTrue(jobs.size() > 0); + } + + @Test + @Transactional + @Rollback(true) + public void testFindDispatchJobsAllShowsMultiOs() { + DispatchHost host = getHost(); + // Set multiple Os and confirm jobs with Linux are still being found + final JobDetail job = getJob1(); + assertNotNull(job); + + // Host with incompatible OS shouldn't find any job + host.setOs("centos7,SomethingElse"); + Set jobs = dispatcherDao.findDispatchJobs(host, 5); + assertTrue(jobs.size() == 0); + + // Host with Linux Os (same as defined on spec) should find jobs + host.setOs("centos7,Linux,rocky9"); + jobs = 
dispatcherDao.findDispatchJobs(host, 5); + assertTrue(jobs.size() > 0); + } } diff --git a/cuebot/src/test/resources/conf/dtd/cjsl-1.14.dtd b/cuebot/src/test/resources/conf/dtd/cjsl-1.14.dtd new file mode 100644 index 000000000..8bbcbf6f1 --- /dev/null +++ b/cuebot/src/test/resources/conf/dtd/cjsl-1.14.dtd @@ -0,0 +1,104 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/cuebot/src/test/resources/conf/jobspec/jobspec_dispatch_test.xml b/cuebot/src/test/resources/conf/jobspec/jobspec_dispatch_test.xml index 2c372eff2..b656f499f 100644 --- a/cuebot/src/test/resources/conf/jobspec/jobspec_dispatch_test.xml +++ b/cuebot/src/test/resources/conf/jobspec/jobspec_dispatch_test.xml @@ -18,7 +18,7 @@ - + spi @@ -30,9 +30,10 @@ false + Linux - + /shots/pipe/usr_testuser/logs/help.py 1-10 1 @@ -44,7 +45,7 @@ - + /shots/pipe/usr_testuser/logs/help.py 1-10 1 @@ -61,9 +62,10 @@ false + Linux - + /shots/pipe/usr_testuser/logs/help.py 1-10 1 diff --git a/proto/rqd.proto b/proto/rqd.proto index f6e0d8790..8d1946981 100644 --- a/proto/rqd.proto +++ b/proto/rqd.proto @@ -112,6 +112,7 @@ message RunFrame { map attributes = 22; int32 num_gpus = 23; report.ChildrenProcStats children = 24; + string os = 25; } message RunFrameSeq { diff --git a/rqd/rqd.example.conf b/rqd/rqd.example.conf index 22e260ae9..4369236dc 100644 --- a/rqd/rqd.example.conf +++ b/rqd/rqd.example.conf @@ -29,9 +29,18 @@ MAYA_SCRIPT_PATH PIXAR_LICENSE_FILE [docker.config] -DOCKER_IMAGE="" +# Setting this to True requires all the additional "docker.[]" sections to be filled RUN_ON_DOCKER=False +# This section is only required if RUN_ON_DOCKER=True +# List of volume mounts following docker run's format, but replacing = with : [docker.mounts] TEMP=type:bind,source:/tmp,target:/tmp,bind-propagation:slave -NET=type:bind,source:/net,target:/net,bind-propagation:slave \ No newline at end of file +NET=type:bind,source:/net,target:/net,bind-propagation:slave + +# This section is only required if RUN_ON_DOCKER=True +# - keys represent OSs this rqd is capable of executing jobs in +# - values are docker image tags +[docker.images] +centos7=centos7.3:latest +rocky9=rocky9.3:latest diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py index 80b9bb29b..beba053fb 100644 --- a/rqd/rqd/rqconstants.py +++ b/rqd/rqd/rqconstants.py @@ -1,4 +1,5 @@ # Copyright Contributors to the OpenCue Project +# Copyright Contributors to the OpenCue Project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
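# A minimal sketch of how the [docker.images] section shown above is consumed,
# mirroring the rqconstants.py hunk that follows. It assumes Python's standard
# configparser; the config path used here is illustrative only.
from configparser import ConfigParser

config = ConfigParser()
config.read("/etc/opencue/rqd.conf")  # assumed location of rqd.conf

DOCKER_IMAGES = {}
for key in config.options("docker.images"):  # e.g. centos7, rocky9
    DOCKER_IMAGES[key] = config.get("docker.images", key)

# SP_OS becomes the comma-separated list of OSs this rqd can execute, e.g.
# SP_OS == "centos7,rocky9" and
# DOCKER_IMAGES == {"centos7": "centos7.3:latest", "rocky9": "rocky9.3:latest"}
SP_OS = ",".join(DOCKER_IMAGES.keys())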
@@ -155,7 +156,7 @@ # Docker mode config RUN_ON_DOCKER = False -DOCKER_IMAGE = "Invalid" +DOCKER_IMAGES = {} DOCKER_MOUNTS = [] try: @@ -163,8 +164,6 @@ # Hostname can come from here: rqutil.getHostname() __override_section = "Override" __host_env_var_section = "UseHostEnvVar" - __docker_mounts = "docker.mounts" - __docker_config = "docker.config" import six from six.moves import configparser if six.PY2: @@ -237,6 +236,10 @@ if config.has_section(__host_env_var_section): RQD_HOST_ENV_VARS = config.options(__host_env_var_section) + __docker_mounts = "docker.mounts" + __docker_config = "docker.config" + __docker_images = "docker.images" + if config.has_section(__docker_config): RUN_ON_DOCKER = config.getboolean(__docker_config, "RUN_ON_DOCKER") if RUN_ON_DOCKER: @@ -248,6 +251,31 @@ RQD_UID = 0 RQD_GID = 0 + # Every key:value on the config file under docker.images + # is parsed as key=SP_OS and value=image_tag. + # SP_OS is set to a list of all available keys + # For example: + # + # rqd.conf + # [docker.images] + # centos7=centos7.3:latest + # rocky9=rocky9.3:latest + # + # becomes: + # SP_OS=centos7,rocky9 + # DOCKER_IMAGES={ + # "centos7": "centos7.3:latest", + # "rocky9": "rocky9.3:latest" + # } + keys = config.options(__docker_images) + DOCKER_IMAGES = {} + for key in keys: + DOCKER_IMAGES[key] = config.get(__docker_images, key) + SP_OS = ",".join(keys) + if not DOCKER_IMAGES: + raise RuntimeError("Misconfigured rqd. RUN_ON_DOCKER=True requires at " + "least one image on DOCKER_IMAGES ([docker.images] section of rqd.conf)") + def parse_mount(mount_str): """ Parse mount definitions similar to a docker run command into a docker @@ -263,7 +291,6 @@ def parse_mount(mount_str): mount_dict[key.strip()] = value.strip() return mount_dict - DOCKER_IMAGE = config.get(__docker_config, "DOCKER_IMAGE") # Parse values under the category docker.mounts into Mount objects mounts = config.options(__docker_mounts) for mount_name in mounts: diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index a328bd374..4de489e1d 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -91,12 +91,12 @@ def __init__(self, optNimbyoff=False): self.docker_client = None self.docker_mounts = [] - self.docker_image = "Invalid" + self.docker_images = {} if rqd.rqconstants.RUN_ON_DOCKER: # pylint: disable=import-outside-toplevel import docker self.docker_client = docker.from_env() - self.docker_image = rqd.rqconstants.DOCKER_IMAGE + self.docker_images = rqd.rqconstants.DOCKER_IMAGES self.docker_mounts = rqd.rqconstants.DOCKER_MOUNTS signal.signal(signal.SIGINT, self.handleExit) @@ -222,9 +222,9 @@ def deleteFrame(self, frameId): self.cores.reserved_cores) # pylint: disable=no-member self.cores.reserved_cores.clear() - log.info("Successfully delete frame with Id: %s", frameId) - else: - log.warning("Frame with Id: %s not found in cache", frameId) + log.info("Successfully delete frame with Id: %s", frameId) + else: + log.warning("Frame with Id: %s not found in cache", frameId) def killAllFrame(self, reason): """Will execute .kill() on every frame in cache until no frames remain @@ -936,14 +936,25 @@ def runDocker(self): frameInfo = self.frameInfo runFrame = self.runFrame - # TODO: implement support for multiple images - # requires adding `string os = 25;` to rqd.proto/RunFrame - # - # image = self.rqCore.docker_images.get(runFrame.os) - # if image is None: - # raise RuntimeError("rqd not configured to run an - # image for this frame OS: %s", runFrame.os) - image = self.rqCore.docker_image + if runFrame.os: + image = 
self.rqCore.docker_images.get(runFrame.os) + if image is None: + self.__writeHeader() + msg = ("This rqd is not configured to run an image " + "for this frame OS: %s. Check the [docker.images] " + "section of rqd.conf for more information." % runFrame.os) + self.rqlog.write(msg, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + raise RuntimeError(msg) + elif self.rqCore.docker_images: + # If a frame doesn't require an specic OS, default to the first configured OS on + # [docker.images] + image = list(self.rqCore.docker_images.values)[0] + else: + self.__writeHeader() + msg = ("Misconfigured rqd. RUN_ON_DOCKER=True requires at " + "least one image on DOCKER_IMAGES ([docker.images] section of rqd.conf)") + self.rqlog.write(msg, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + raise RuntimeError(msg) self.__createEnvVariables() self.__writeHeader() diff --git a/rqd/rqd/rqutil.py b/rqd/rqd/rqutil.py index ce1964f08..3d8abc964 100644 --- a/rqd/rqd/rqutil.py +++ b/rqd/rqd/rqutil.py @@ -159,8 +159,7 @@ def checkAndCreateUser(username, uid=None, gid=None): subprocess.check_call(cmd) # pylint: disable=broad-except except Exception: - logging.exception("useradd failed to add user: %s. User possibly already exists.", - username) + logging.info("useradd failed to add user: %s. User possibly already exists.", username) finally: permissionsLow() From 0867bd1850cdd379b262b4c0366e08abbddc88bf Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Mon, 21 Oct 2024 16:39:14 -0700 Subject: [PATCH 09/51] Minor fixes --- VERSION.in | 2 +- .../com/imageworks/spcue/dao/postgres/DispatchQuery.java | 2 +- .../conf/ddl/postgres/migrations/V31__increase_os_size.sql | 3 +++ rqd/rqd/rqcore.py | 7 +++++-- 4 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 cuebot/src/main/resources/conf/ddl/postgres/migrations/V31__increase_os_size.sql diff --git a/VERSION.in b/VERSION.in index d3827e75a..9459d4ba2 100644 --- a/VERSION.in +++ b/VERSION.in @@ -1 +1 @@ -1.0 +1.1 diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java index 02dae0f22..fec2f47ac 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java @@ -72,7 +72,7 @@ public class DispatchQuery { "AND job.pk_facility = ? " + "AND " + "(" + - "job.str_os IS NULL OR job.str_os IN '' " + + "job.str_os IS NULL OR job.str_os = '' " + "OR " + "job.str_os IN ? 
" + ") " + diff --git a/cuebot/src/main/resources/conf/ddl/postgres/migrations/V31__increase_os_size.sql b/cuebot/src/main/resources/conf/ddl/postgres/migrations/V31__increase_os_size.sql new file mode 100644 index 000000000..ec3cf4a96 --- /dev/null +++ b/cuebot/src/main/resources/conf/ddl/postgres/migrations/V31__increase_os_size.sql @@ -0,0 +1,3 @@ +-- Increase size of os column on host_stat +ALTER TABLE host_stat +MODIFY COLUMN str_os VARCHAR(32); \ No newline at end of file diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 4de489e1d..0bf095e16 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -1006,9 +1006,12 @@ def runDocker(self): output = container.wait() returncode = output["StatusCode"] # pylint: disable=broad-except - except Exception: + except Exception as e: returncode = 1 - logging.exception("Failed to launch frame container") + msg = "Failed to launch frame container" + logging.exception(msg) + self.rqlog.write(msg + " - " + e, + prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) # Find exitStatus and exitSignal if returncode < 0: From 3f62dedb06b055e48a860dd19fcf337eae6f16ef Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Tue, 22 Oct 2024 16:05:55 -0700 Subject: [PATCH 10/51] Get the correct OS for procs Previously it was safe to use the host's OS when querying for procs, now the job OS needs to be used as a host can have multiple OSs. --- .../spcue/dao/postgres/ProcDaoJdbc.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java index ecf39caf7..fff43d5ce 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java @@ -346,9 +346,10 @@ public VirtualProc mapRow(ResultSet rs, int rowNum) throws SQLException { "proc.int_virt_max_used,"+ "proc.int_virt_used,"+ "host.str_name AS host_name, " + - "host_stat.str_os " + + "job.str_os " + "FROM " + - "proc," + + "proc, " + + "job, " + "host, " + "host_stat, " + "alloc " + @@ -357,7 +358,9 @@ public VirtualProc mapRow(ResultSet rs, int rowNum) throws SQLException { "AND " + "host.pk_host = host_stat.pk_host " + "AND " + - "host.pk_alloc = alloc.pk_alloc "; + "host.pk_alloc = alloc.pk_alloc " + + "AND " + + "job.pk_job = proc.pk_job "; public VirtualProc getVirtualProc(String id) { return getJdbcTemplate().queryForObject( @@ -376,7 +379,7 @@ public VirtualProc findVirtualProc(FrameInterface frame) { "proc.*, " + "host.str_name AS host_name, " + "host.pk_alloc, " + - "host_stat.str_os, " + + "job.str_os, " + "alloc.pk_facility " + "FROM " + "proc, " + @@ -517,20 +520,23 @@ public String getCurrentFrameId(ProcInterface p) { "SELECT " + "proc.*, " + "host.str_name AS host_name, " + - "host_stat.str_os, " + + "job.str_os, " + "host.pk_alloc, " + "alloc.pk_facility " + "FROM " + "proc, " + "host, " + "host_stat,"+ - "alloc " + + "alloc, " + + "job " + "WHERE " + "proc.pk_host = host.pk_host " + "AND " + "host.pk_host = host_stat.pk_host " + "AND " + "host.pk_alloc = alloc.pk_alloc " + + "AND " + + "job.pk_job = proc.pk_job " + "AND " + "current_timestamp - proc.ts_ping > " + ORPHANED_PROC_INTERVAL; From c4547a0a842d4be3f37201fb068472c08c0109df Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Thu, 24 Oct 2024 11:17:38 -0700 Subject: [PATCH 11/51] Handle frame user on container To be able to run as the frame's owner, the entrypoint needs 
to ensure the user exists before running the frame's cmd. --- rqd/rqd.example.conf | 1 + rqd/rqd/rqconstants.py | 7 ++++++ rqd/rqd/rqcore.py | 52 ++++++++++++++++++++++++++++++++---------- 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/rqd/rqd.example.conf b/rqd/rqd.example.conf index 4369236dc..870419b24 100644 --- a/rqd/rqd.example.conf +++ b/rqd/rqd.example.conf @@ -31,6 +31,7 @@ PIXAR_LICENSE_FILE [docker.config] # Setting this to True requires all the additional "docker.[]" sections to be filled RUN_ON_DOCKER=False +DOCKER_SHELL_PATH=/usr/bin/sh # This section is only required if RUN_ON_DOCKER=True # List of volume mounts following docker run's format, but replacing = with : diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py index beba053fb..3f18ed149 100644 --- a/rqd/rqd/rqconstants.py +++ b/rqd/rqd/rqconstants.py @@ -158,6 +158,7 @@ RUN_ON_DOCKER = False DOCKER_IMAGES = {} DOCKER_MOUNTS = [] +DOCKER_SHELL_PATH = "/bin/sh" try: if os.path.isfile(CONFIG_FILE): @@ -251,6 +252,12 @@ RQD_UID = 0 RQD_GID = 0 + # Path to the shell to be used in the frame environment + if config.has_option(__docker_config, "DOCKER_SHELL_PATH"): + DOCKER_SHELL_PATH = config.get( + __docker_config, + "DOCKER_SHELL_PATH") + # Every key:value on the config file under docker.images # is parsed as key=SP_OS and value=image_tag. # SP_OS is set to a list of all available keys diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 0bf095e16..95564c5df 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -963,18 +963,47 @@ def runDocker(self): frameInfo.frameId, time.time()) self._tempLocations.append(tempStatFile) - tempCommand = [] - if self.rqCore.machine.isDesktop(): - tempCommand += ["/bin/nice"] - tempCommand += ["/usr/bin/time", "-p", "-o", tempStatFile] - if 'CPU_LIST' in runFrame.attributes: - tempCommand += ['taskset', '-c', runFrame.attributes['CPU_LIST']] + # Prevent frame from attempting to run as ROOT + if runFrame.gid <= 0: + gid = rqd.rqconstants.LAUNCH_FRAME_USER_GID + else: + gid = runFrame.gid - tempCommand += [runFrame.command] + # Never give frame ROOT permissions + if runFrame.uid == 0 or gid == 0: + self.rqlog.write("Frame cannot run as ROOT", + prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + return + + # Thread affinity + tasksetCmd = "" + if runFrame.attributes['CPU_LIST']: + tasksetCmd = "taskset -c %s" % runFrame.attributes['CPU_LIST'] + + # Command wrapper + command = """#!/bin/sh +exec sh -c " +echo \$$; +useradd -u %s -g %s %s >& /dev/null || true; +su -s %s %s -c '/bin/nice /usr/bin/time -p -o %s %s %s' +" + """ % ( + runFrame.uid, + gid, + runFrame.user_name, + rqd.rqconstants.DOCKER_SHELL_PATH, + runFrame.user_name, + tempStatFile, + tasksetCmd, + runFrame.command + ) - # Print PID before executing - command = ["sh", "-c", "echo $$; exec " + " ".join(tempCommand)] + # Log entrypoint on frame log to simplify replaying frames + self.rqlog.write("DOCKER_ENTRYPOINT = %s" % command, + prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + # Write command to a file on the job tmpdir to simplify replaying a frame + command = self._createCommandFile(command) client = self.rqCore.docker_client try: @@ -988,8 +1017,7 @@ def runDocker(self): pid_mode="host", stderr=True, hostname=self.frameEnv["jobhost"], - entrypoint=command, - user=runFrame.uid) + entrypoint=command) log_stream = container.logs(stream=True) # CMD prints the process PID before executing the actual command @@ -1010,7 +1038,7 @@ def runDocker(self): returncode = 1 msg = "Failed to 
launch frame container" logging.exception(msg) - self.rqlog.write(msg + " - " + e, + self.rqlog.write("%s - %s" % (msg, e), prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) # Find exitStatus and exitSignal From ad0a3de0e3c4123fe30c39d33e675b5820a2d7e0 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Thu, 24 Oct 2024 14:33:31 -0700 Subject: [PATCH 12/51] Simplify docker entrypoint cmd --- rqd/rqd/rqcore.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 95564c5df..5606605f2 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -983,11 +983,8 @@ def runDocker(self): # Command wrapper command = """#!/bin/sh -exec sh -c " -echo \$$; useradd -u %s -g %s %s >& /dev/null || true; -su -s %s %s -c '/bin/nice /usr/bin/time -p -o %s %s %s' -" +exec su -s %s %s -c "echo \$$; /bin/nice /usr/bin/time -p -o %s %s %s" """ % ( runFrame.uid, gid, From eb0962c2ba54916eb80f74e0b9c7c3cab79edb9f Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 25 Oct 2024 16:00:22 -0700 Subject: [PATCH 13/51] Stop logging exception for numby import Not having nimby installed is an expected event, not an exception. --- rqd/rqd/rqnimby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rqd/rqd/rqnimby.py b/rqd/rqd/rqnimby.py index 15b8dd89f..2e5d44674 100644 --- a/rqd/rqd/rqnimby.py +++ b/rqd/rqd/rqnimby.py @@ -58,7 +58,7 @@ def getNimby(rqCore): # Ideally ImportError could be used here, but pynput # can throw other kinds of exception while trying to # access runpy components - log.exception("Failed to import pynput, falling back to Select module") + log.debug("Failed to import pynput, falling back to Select module") # Still enabling the application start as hosts can be manually locked # using the API/GUI return NimbyNop(rqCore) From 1b637d194bb8c13c38ecbf200d37347c49290ad3 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 25 Oct 2024 16:00:46 -0700 Subject: [PATCH 14/51] Add unit tests for runOnDocker --- rqd/tests/rqcore_tests.py | 97 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/rqd/tests/rqcore_tests.py b/rqd/tests/rqcore_tests.py index 09f06d23f..f19cee8d6 100644 --- a/rqd/tests/rqcore_tests.py +++ b/rqd/tests/rqcore_tests.py @@ -528,7 +528,6 @@ def setUp(self): @mock.patch('platform.system', new=mock.Mock(return_value='Linux')) @mock.patch('tempfile.gettempdir') - @mock.patch('rqd.rqcore.pipe_to_file', new=mock.MagicMock()) def test_runLinux(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdirMock, openMock, # given currentTime = 1568070634.3 @@ -557,6 +556,8 @@ def test_runLinux(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdir rqCore.machine.isDesktop.return_value = True rqCore.machine.getHostInfo.return_value = renderHost rqCore.nimby.locked = False + rqCore.docker_client = None + children = rqd.compiled_proto.report_pb2.ChildrenProcStats() runFrame = rqd.compiled_proto.rqd_pb2.RunFrame( @@ -602,6 +603,98 @@ def test_runLinux(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdir job_name=jobName, frame_id=frameId, frame_name=frameName, children=children), exit_status=returnCode)) + @mock.patch('platform.system', new=mock.Mock(return_value='Linux')) + @mock.patch('tempfile.gettempdir') + def test_runDocker(self, getTempDirMock, timeMock, popenMock): # mkdirMock, openMock, + # given + currentTime = 1568070634.3 + jobTempPath = '/job/temp/path/' + logDir = '/path/to/log/dir/' + tempDir = '/some/random/temp/dir' + frameId = 
'arbitrary-frame-id' + jobName = 'arbitrary-job-name' + frameName = 'arbitrary-frame-name' + frameUid = 928 + frameUsername = 'my-random-user' + returnCode = 0 + renderHost = rqd.compiled_proto.report_pb2.RenderHost(name='arbitrary-host-name') + logFile = os.path.join(logDir, '%s.%s.rqlog' % (jobName, frameName)) + + self.fs.create_dir(tempDir) + + timeMock.return_value = currentTime + getTempDirMock.return_value = tempDir + popenMock.return_value.wait.return_value = returnCode + + rqCore = mock.MagicMock() + rqCore.intervalStartTime = 20 + rqCore.intervalSleepTime = 40 + rqCore.machine.getTempPath.return_value = jobTempPath + rqCore.machine.isDesktop.return_value = True + rqCore.machine.getHostInfo.return_value = renderHost + rqCore.nimby.locked = False + + # Setup mock docker client + rqCore.docker_client = mock.MagicMock() + rqCore.docker_images = { + "centos7": "centos7_image", + "rocky9": "rocky9_image", + } + rqCore.docker_mounts = { + "vol1": "/vol1/mount", + "vol2": "/vol2/mount", + } + + children = rqd.compiled_proto.report_pb2.ChildrenProcStats() + + runFrame = rqd.compiled_proto.rqd_pb2.RunFrame( + frame_id=frameId, + job_name=jobName, + frame_name=frameName, + uid=frameUid, + user_name=frameUsername, + log_dir=logDir, + children=children, + environment={"ENVVAR": "env_value"}, + os="centos7") + frameInfo = rqd.rqnetwork.RunningFrame(rqCore, runFrame) + + # when + attendantThread = rqd.rqcore.FrameAttendantThread(rqCore, runFrame, frameInfo) + attendantThread.start() + attendantThread.join() + + # then + cmd_file = os.path.join(jobTempPath, 'rqd-cmd-%s-%s' % (runFrame.frame_id, currentTime)) + rqCore.docker_client.containers.run.assert_called_with( + image="centos7_image", + detach=True, + environment={"ENVVAR": "env_value"}, + working_dir=jobTempPath, + mounts=rqCore.docker_mounts, + privileged=True, + remove=True, + pid_mode="host", + stderr=True, + hostname=mock.ANY, + entrypoint=[cmd_file] + ) + + with open(cmd_file) as f: + cmd = f.read() + self.assertEquals("tbd", cmd) + + self.assertTrue(os.path.exists(logDir)) + self.assertTrue(os.path.isfile(logFile)) + _, kwargs = popenMock.call_args + + rqCore.network.reportRunningFrameCompletion.assert_called_with( + rqd.compiled_proto.report_pb2.FrameCompleteReport( + host=renderHost, + frame=rqd.compiled_proto.report_pb2.RunningFrameInfo( + job_name=jobName, frame_id=frameId, frame_name=frameName, children=children), + exit_status=returnCode)) + # TODO(bcipriano) Re-enable this test once Windows is supported. The main sticking point here # is that the log directory is always overridden on Windows which makes mocking difficult. 
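# A condensed sketch of the docker-py flow that test_runDocker above exercises,
# mirroring the runDocker() changes earlier in this series. The client comes
# from docker.from_env(); the image name and entrypoint path are illustrative.
import docker

client = docker.from_env()
container = client.containers.run(
    image="centos7_image",              # looked up in DOCKER_IMAGES by the frame's os
    entrypoint="/tmp/rqd-cmd-example",  # wrapper script written by rqd (illustrative path)
    detach=True,                        # return a Container object instead of blocking
    remove=True,
    pid_mode="host",
)

# The wrapper script echoes its PID first, so the first log line is the frame pid.
log_stream = container.logs(stream=True)
frame_pid = int(next(log_stream).strip())

# Block until the entrypoint exits; docker-py returns a dict with "StatusCode".
exit_status = container.wait()["StatusCode"]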
@mock.patch('platform.system', new=mock.Mock(return_value='Windows')) @@ -629,6 +722,7 @@ def disabled__test_runWindows(self, permsUser, timeMock, popenMock): rqCore.machine.isDesktop.return_value = True rqCore.machine.getHostInfo.return_value = renderHost rqCore.nimby.locked = False + rqCore.docker_client = None children = rqd.compiled_proto.report_pb2.ChildrenProcStats() runFrame = rqd.compiled_proto.rqd_pb2.RunFrame( @@ -692,6 +786,7 @@ def test_runDarwin(self, getTempDirMock, permsUser, timeMock, popenMock): rqCore.machine.isDesktop.return_value = True rqCore.machine.getHostInfo.return_value = renderHost rqCore.nimby.locked = False + rqCore.docker_client = None children = rqd.compiled_proto.report_pb2.ChildrenProcStats() runFrame = rqd.compiled_proto.rqd_pb2.RunFrame( From ebb443870b38bbb47e1f6242aaa169366a484897 Mon Sep 17 00:00:00 2001 From: Ramon Figueiredo Date: Wed, 16 Oct 2024 17:11:04 -0700 Subject: [PATCH 15/51] [cuegui] Fix TypeError in Comment viewer: Handle job object as iterable (#1542) - Updated `viewComments` method in `MenuActions.py` to wrap single Job objects in a list. - This prevents `TypeError` when attempting to iterate over a non-iterable Job object. --- cuegui/cuegui/MenuActions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cuegui/cuegui/MenuActions.py b/cuegui/cuegui/MenuActions.py index 287b2eeeb..07f83063c 100644 --- a/cuegui/cuegui/MenuActions.py +++ b/cuegui/cuegui/MenuActions.py @@ -581,6 +581,8 @@ def dropInternalDependencies(self, rpcObjects=None): def viewComments(self, rpcObjects=None): jobs = self._getOnlyJobObjects(rpcObjects) if jobs: + if not isinstance(jobs, list): + jobs = [jobs] cuegui.Comments.CommentListDialog(jobs, self._caller).show() dependWizard_info = ["Dependency &Wizard...", None, "configure"] From 98e55063a5a42ff26e245b3012bda65e88d2c5cc Mon Sep 17 00:00:00 2001 From: Ramon Figueiredo Date: Thu, 17 Oct 2024 15:22:37 -0700 Subject: [PATCH 16/51] [cuegui] Add Rocky 9 log root in cuegui.yaml (#1543) - Add `rocky9` log root to `render_logs.root` in `cuegui.yaml` --- cuegui/cuegui/config/cuegui.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/cuegui/cuegui/config/cuegui.yaml b/cuegui/cuegui/config/cuegui.yaml index 529cb9e00..546e52527 100644 --- a/cuegui/cuegui/config/cuegui.yaml +++ b/cuegui/cuegui/config/cuegui.yaml @@ -41,6 +41,7 @@ render_logs.root: darwin: '/Users/shots' linux: '/shots' rhel7: '/shots' + rocky9: '/shots' # Substrings which, when found in render logs, will cause that line to be highlighted. 
render_logs.highlight.error: [ 'error', 'aborted', 'fatal', 'failed', 'killed', 'command not found', From fb3dcd602cb2adec60a75cd7fa84e544859bafb9 Mon Sep 17 00:00:00 2001 From: Jimmy Christensen Date: Tue, 22 Oct 2024 20:15:41 +0200 Subject: [PATCH 17/51] [tests] Change tests to not use setup.py, but use the unittest module directly (#1547) **Summarize your change.** Have changed most tests to use `-m unittest discover` instead og `setup.py test` The old `setup.py test` doesn't work in newer versions of python since it has been deprecated --- ci/run_gui_test.sh | 2 +- ci/run_python_tests.sh | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/ci/run_gui_test.sh b/ci/run_gui_test.sh index 3c7d92a6d..8a32a462e 100755 --- a/ci/run_gui_test.sh +++ b/ci/run_gui_test.sh @@ -21,7 +21,7 @@ fi echo "Using Python binary ${py}" test_log="/tmp/cuegui_result.log" -PYTHONPATH=pycue xvfb-run -d "${py}" cuegui/setup.py test | tee ${test_log} +PYTHONPATH=pycue xvfb-run -d "${py}" -m unittest discover -s cuegui/tests -t cuegui -p "*.py"| tee ${test_log} grep -Pz 'Ran \d+ tests in [0-9\.]+s\n\nOK' ${test_log} if [ $? -eq 0 ]; then diff --git a/ci/run_python_tests.sh b/ci/run_python_tests.sh index 5f1bfe294..4e1b0212b 100755 --- a/ci/run_python_tests.sh +++ b/ci/run_python_tests.sh @@ -22,11 +22,12 @@ python -m grpc_tools.protoc -I=proto/ --python_out=rqd/rqd/compiled_proto --grpc 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py 2to3 -wn -f import rqd/rqd/compiled_proto/*_pb2*.py -python pycue/setup.py test -PYTHONPATH=pycue python pyoutline/setup.py test -PYTHONPATH=pycue python cueadmin/setup.py test -PYTHONPATH=pycue:pyoutline python cuesubmit/setup.py test -python rqd/setup.py test +python3 -m unittest discover -s pycue/tests -t pycue -p "*.py" +PYTHONPATH=pycue python3 -m unittest discover -s pyoutline/tests -t pyoutline -p "*.py" +PYTHONPATH=pycue python3 -m unittest discover -s cueadmin/tests -t cueadmin -p "*.py" +PYTHONPATH=pycue:pyoutline python3 -m unittest discover -s cuesubmit/tests -t cuesubmit -p "*.py" +python3 -m unittest discover -s rqd/tests -t rqd -p "*.py" + # Xvfb no longer supports Python 2. if [[ "$python_version" =~ "Python 3" && ${args[0]} != "--no-gui" ]]; then From ebe966bf85702b9fec3ba5132dca37fe5fba914f Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 25 Oct 2024 19:54:25 -0700 Subject: [PATCH 18/51] Rqd tests were not being executed unittest was not reporting test failures and interruptions as expected, which caused us to be running with failed unit tests for a long time. This commit replaces unittest with pytest for rqd and fixes some of the relevant unit tests. 
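For reference, the same rqd test run can be reproduced locally through pytest's standard Python entry point; this is only a sketch, with the test path taken from the CI script changed below:

    import sys
    import pytest

    # pytest.main returns an exit code rather than exiting itself,
    # so failures propagate the same way they do in the CI run.
    sys.exit(pytest.main(["rqd/tests"]))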
--- ci/run_python_tests.sh | 2 +- requirements.txt | 1 + ...ot_listener.py => cuebot_listener_test.py} | 0 rqd/tests/{cuerqd_tests.py => cuerqd_test.py} | 0 ...constants_tests.py => rqconstants_test.py} | 2 +- rqd/tests/{rqcore_tests.py => rqcore_test.py} | 442 +++++++++++------- .../{rqmachine_tests.py => rqmachine_test.py} | 0 rqd/tests/rqnimby_test.py | 143 ++++++ rqd/tests/rqnimby_tests.py | 143 ------ 9 files changed, 419 insertions(+), 314 deletions(-) rename rqd/tests/{test_cuebot_listener.py => cuebot_listener_test.py} (100%) rename rqd/tests/{cuerqd_tests.py => cuerqd_test.py} (100%) rename rqd/tests/{rqconstants_tests.py => rqconstants_test.py} (99%) rename rqd/tests/{rqcore_tests.py => rqcore_test.py} (69%) rename rqd/tests/{rqmachine_tests.py => rqmachine_test.py} (100%) create mode 100644 rqd/tests/rqnimby_test.py delete mode 100644 rqd/tests/rqnimby_tests.py diff --git a/ci/run_python_tests.sh b/ci/run_python_tests.sh index 4e1b0212b..8782d4d3a 100755 --- a/ci/run_python_tests.sh +++ b/ci/run_python_tests.sh @@ -26,7 +26,7 @@ python3 -m unittest discover -s pycue/tests -t pycue -p "*.py" PYTHONPATH=pycue python3 -m unittest discover -s pyoutline/tests -t pyoutline -p "*.py" PYTHONPATH=pycue python3 -m unittest discover -s cueadmin/tests -t cueadmin -p "*.py" PYTHONPATH=pycue:pyoutline python3 -m unittest discover -s cuesubmit/tests -t cuesubmit -p "*.py" -python3 -m unittest discover -s rqd/tests -t rqd -p "*.py" +pytest rqd/tests # Xvfb no longer supports Python 2. diff --git a/requirements.txt b/requirements.txt index dc0f8d570..41118fe6f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,7 @@ pylint==2.15.10;python_version>="3.7" pynput==1.7.6 PyYAML==5.1 six==1.16.0 +pytest==8.3.3 # Optional requirements # Sentry support for rqd diff --git a/rqd/tests/test_cuebot_listener.py b/rqd/tests/cuebot_listener_test.py similarity index 100% rename from rqd/tests/test_cuebot_listener.py rename to rqd/tests/cuebot_listener_test.py diff --git a/rqd/tests/cuerqd_tests.py b/rqd/tests/cuerqd_test.py similarity index 100% rename from rqd/tests/cuerqd_tests.py rename to rqd/tests/cuerqd_test.py diff --git a/rqd/tests/rqconstants_tests.py b/rqd/tests/rqconstants_test.py similarity index 99% rename from rqd/tests/rqconstants_tests.py rename to rqd/tests/rqconstants_test.py index 45e52c0b1..dcd5093ae 100644 --- a/rqd/tests/rqconstants_tests.py +++ b/rqd/tests/rqconstants_test.py @@ -39,7 +39,7 @@ import rqd.rqutil import rqd.compiled_proto.report_pb2 -from .rqmachine_tests import ( +from .rqmachine_test import ( CPUINFO, LOADAVG_LOW_USAGE, MEMINFO_MODERATE_USAGE, diff --git a/rqd/tests/rqcore_tests.py b/rqd/tests/rqcore_test.py similarity index 69% rename from rqd/tests/rqcore_tests.py rename to rqd/tests/rqcore_test.py index f19cee8d6..678042f19 100644 --- a/rqd/tests/rqcore_tests.py +++ b/rqd/tests/rqcore_test.py @@ -24,6 +24,7 @@ from builtins import str import os.path import unittest +import subprocess import mock import pyfakefs.fake_filesystem_unittest @@ -40,16 +41,16 @@ class RqCoreTests(unittest.TestCase): - @mock.patch('rqd.rqnimby.NimbySelect', autospec=True) - @mock.patch('rqd.rqnetwork.Network', autospec=True) - @mock.patch('rqd.rqmachine.Machine', autospec=True) + @mock.patch("rqd.rqnimby.NimbyPynput", autospec=True) + @mock.patch("rqd.rqnetwork.Network", autospec=True) + @mock.patch("rqd.rqmachine.Machine", autospec=True) def setUp(self, machineMock, networkMock, nimbyMock): self.machineMock = machineMock self.networkMock = networkMock self.nimbyMock = nimbyMock 
self.rqcore = rqd.rqcore.RqCore() - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOn') + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOn") def test_startServer(self, nimbyOnMock): rqd.rqconstants.OVERRIDE_NIMBY = False self.machineMock.return_value.isDesktop.return_value = False @@ -59,7 +60,7 @@ def test_startServer(self, nimbyOnMock): self.networkMock.return_value.start_grpc.assert_called() nimbyOnMock.assert_not_called() - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOn', autospec=True) + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOn", autospec=True) def test_startServerWithNimby(self, nimbyOnMock): rqd.rqconstants.OVERRIDE_NIMBY = True self.machineMock.return_value.isDesktop.return_value = False @@ -69,7 +70,7 @@ def test_startServerWithNimby(self, nimbyOnMock): self.networkMock.return_value.start_grpc.assert_called() nimbyOnMock.assert_called_with(self.rqcore) - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOn', autospec=True) + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOn", autospec=True) def test_startDesktopNimbyOn(self, nimbyOnMock): rqd.rqconstants.OVERRIDE_NIMBY = True self.machineMock.return_value.isDesktop.return_value = True @@ -79,7 +80,7 @@ def test_startDesktopNimbyOn(self, nimbyOnMock): self.networkMock.return_value.start_grpc.assert_called() nimbyOnMock.assert_called_with(self.rqcore) - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOn') + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOn") def test_startDesktopNimbyOff(self, nimbyOnMock): rqd.rqconstants.OVERRIDE_NIMBY = False self.machineMock.return_value.isDesktop.return_value = True @@ -89,7 +90,7 @@ def test_startDesktopNimbyOff(self, nimbyOnMock): self.networkMock.return_value.start_grpc.assert_called() nimbyOnMock.assert_not_called() - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOn') + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOn") def test_startDesktopNimbyUndefined(self, nimbyOnMock): rqd.rqconstants.OVERRIDE_NIMBY = None self.machineMock.return_value.isDesktop.return_value = True @@ -99,9 +100,9 @@ def test_startDesktopNimbyUndefined(self, nimbyOnMock): self.networkMock.return_value.start_grpc.assert_called() nimbyOnMock.assert_not_called() - @mock.patch('rqd.rqnetwork.Network', autospec=True) - @mock.patch('rqd.rqmachine.Machine', autospec=True) - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOn') + @mock.patch("rqd.rqnetwork.Network", autospec=True) + @mock.patch("rqd.rqmachine.Machine", autospec=True) + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOn") def test_startDesktopNimbyOffWithFlag(self, nimbyOnMock, machineMock, networkMock): rqd.rqconstants.OVERRIDE_NIMBY = True machineMock.return_value.isDesktop.return_value = True @@ -112,7 +113,7 @@ def test_startDesktopNimbyOffWithFlag(self, nimbyOnMock, machineMock, networkMoc networkMock.return_value.start_grpc.assert_called() nimbyOnMock.assert_not_called() - @mock.patch('threading.Timer') + @mock.patch("threading.Timer") def test_grpcConnected(self, timerMock): update_rss_thread = mock.MagicMock() interval_thread = mock.MagicMock() @@ -124,15 +125,15 @@ def test_grpcConnected(self, timerMock): update_rss_thread.start.assert_called() interval_thread.start.assert_called() - @mock.patch.object(rqd.rqcore.RqCore, 'sendStatusReport', autospec=True) - @mock.patch('threading.Timer') + @mock.patch.object(rqd.rqcore.RqCore, "sendStatusReport", autospec=True) + @mock.patch("threading.Timer") def test_onInterval(self, timerMock, sendStatusReportMock): self.rqcore.onInterval() timerMock.return_value.start.assert_called() sendStatusReportMock.assert_called_with(self.rqcore) - 
@mock.patch('threading.Timer', autospec=True) + @mock.patch("threading.Timer", autospec=True) def test_onIntervalWithSleepTime(self, timerMock): sleep_time = 72 @@ -141,8 +142,8 @@ def test_onIntervalWithSleepTime(self, timerMock): timerMock.assert_called_with(sleep_time, mock.ANY) timerMock.return_value.start.assert_called() - @mock.patch.object(rqd.rqcore.RqCore, 'shutdownRqdNow') - @mock.patch('threading.Timer', new=mock.MagicMock()) + @mock.patch.object(rqd.rqcore.RqCore, "shutdownRqdNow") + @mock.patch("threading.Timer", new=mock.MagicMock()) def test_onIntervalShutdown(self, shutdownRqdNowMock): self.rqcore.shutdownRqdIdle() self.machineMock.return_value.isUserLoggedIn.return_value = False @@ -153,9 +154,11 @@ def test_onIntervalShutdown(self, shutdownRqdNowMock): shutdownRqdNowMock.assert_called_with() - @mock.patch('threading.Timer') + @mock.patch("threading.Timer") def test_updateRss(self, timerMock): - self.rqcore.storeFrame('frame-id', mock.MagicMock(spec=rqd.rqnetwork.RunningFrame)) + self.rqcore.storeFrame( + "frame-id", mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) + ) self.rqcore.updateRss() @@ -163,21 +166,25 @@ def test_updateRss(self, timerMock): timerMock.return_value.start.assert_called() def test_getFrame(self): - frame_id = 'arbitrary-frame-id' + frame_id = "arbitrary-frame-id" frame = mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) self.rqcore.storeFrame(frame_id, frame) self.assertEqual(frame, self.rqcore.getFrame(frame_id)) def test_getFrameKeys(self): - frame_ids = ['frame1', 'frame2'] - self.rqcore.storeFrame(frame_ids[0], mock.MagicMock(spec=rqd.rqnetwork.RunningFrame)) - self.rqcore.storeFrame(frame_ids[1], mock.MagicMock(spec=rqd.rqnetwork.RunningFrame)) + frame_ids = ["frame1", "frame2"] + self.rqcore.storeFrame( + frame_ids[0], mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) + ) + self.rqcore.storeFrame( + frame_ids[1], mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) + ) self.assertEqual(set(frame_ids), set(self.rqcore.getFrameKeys())) def test_storeFrame(self): - frame_id = 'arbitrary-frame-id' + frame_id = "arbitrary-frame-id" frame = mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) with self.assertRaises(KeyError): self.rqcore.getFrame(frame_id) @@ -187,19 +194,23 @@ def test_storeFrame(self): self.assertEqual(frame, self.rqcore.getFrame(frame_id)) def test_storeFrameDuplicate(self): - frame_id = 'arbitrary-frame-id' - self.rqcore.storeFrame(frame_id, mock.MagicMock(spec=rqd.rqnetwork.RunningFrame)) + frame_id = "arbitrary-frame-id" + self.rqcore.storeFrame( + frame_id, mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) + ) with self.assertRaises(rqd.rqexceptions.RqdException): - self.rqcore.storeFrame(frame_id, mock.MagicMock(spec=rqd.rqnetwork.RunningFrame)) + self.rqcore.storeFrame( + frame_id, mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) + ) def test_deleteFrame(self): - frame_id = 'arbitrary-frame-id' + frame_id = "arbitrary-frame-id" frame = mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) self.rqcore.storeFrame(frame_id, frame) self.rqcore.deleteFrame(frame_id) - self.rqcore.deleteFrame('unknown-key-should-succeed') + self.rqcore.deleteFrame("unknown-key-should-succeed") with self.assertRaises(KeyError): self.rqcore.getFrame(frame_id) @@ -207,17 +218,20 @@ def test_deleteFrame(self): def test_killAllFrame(self): frameAttendantThread = mock.MagicMock() frameAttendantThread.is_alive.return_value = False - frame1Id = 'frame1' - frame2Id = 'frame2' - frame3Id = 'frame3' + frame1Id = "frame1" + frame2Id = "frame2" + frame3Id = "frame3" frame1 = 
rqd.rqnetwork.RunningFrame( - self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame1Id)) + self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame1Id) + ) frame1.frameAttendantThread = frameAttendantThread frame2 = rqd.rqnetwork.RunningFrame( - self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame2Id)) + self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame2Id) + ) frame2.frameAttendantThread = frameAttendantThread frame3 = rqd.rqnetwork.RunningFrame( - self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame3Id)) + self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame3Id) + ) frame3.frameAttendantThread = frameAttendantThread self.rqcore.storeFrame(frame1Id, frame1) self.rqcore.storeFrame(frame2Id, frame2) @@ -226,23 +240,26 @@ def test_killAllFrame(self): # There's no result to verify here; if the method completes successfully # it means that all frames were properly killed, as the method won't finish # until its frame cache is cleared by the kill process. - self.rqcore.killAllFrame('arbitrary reason') + self.rqcore.killAllFrame("arbitrary reason") def test_killAllFrameIgnoreNimby(self): frameAttendantThread = mock.MagicMock() frameAttendantThread.is_alive.return_value = False - frame1Id = 'frame1' - frame2Id = 'frame2' + frame1Id = "frame1" + frame2Id = "frame2" frame1 = rqd.rqnetwork.RunningFrame( - self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame1Id)) + self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame1Id) + ) frame1.frameAttendantThread = frameAttendantThread frame2 = rqd.rqnetwork.RunningFrame( - self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame2Id, ignore_nimby=True)) + self.rqcore, + rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame2Id, ignore_nimby=True), + ) frame2.frameAttendantThread = frameAttendantThread self.rqcore.storeFrame(frame1Id, frame1) self.rqcore.storeFrame(frame2Id, frame2) - self.rqcore.killAllFrame('NIMBY related reason') + self.rqcore.killAllFrame("NIMBY related reason") self.assertEqual(frame2, self.rqcore.getFrame(frame2Id)) @@ -251,17 +268,25 @@ def test_releaseCores(self): num_booked_cores = 7 num_cores_to_release = 5 self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail( - total_cores=50, idle_cores=num_idle_cores, locked_cores=2, - booked_cores=num_booked_cores) + total_cores=50, + idle_cores=num_idle_cores, + locked_cores=2, + booked_cores=num_booked_cores, + ) self.rqcore.releaseCores(num_cores_to_release) # pylint: disable=no-member - self.assertEqual(num_booked_cores-num_cores_to_release, self.rqcore.cores.booked_cores) - self.assertEqual(num_idle_cores+num_cores_to_release, self.rqcore.cores.idle_cores) + self.assertEqual( + num_booked_cores - num_cores_to_release, self.rqcore.cores.booked_cores + ) + self.assertEqual( + num_idle_cores + num_cores_to_release, self.rqcore.cores.idle_cores + ) - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOff') - def test_shutdown(self, nimbyOffMock): + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOff") + @mock.patch("os._exit") + def test_shutdown(self, nimbyOffMock, exitMock): self.rqcore.onIntervalThread = mock.MagicMock() self.rqcore.updateRssThread = mock.MagicMock() @@ -271,8 +296,8 @@ def test_shutdown(self, nimbyOffMock): self.rqcore.onIntervalThread.cancel.assert_called() self.rqcore.updateRssThread.cancel.assert_called() - @mock.patch('rqd.rqnetwork.Network', autospec=True) - @mock.patch('sys.exit') + @mock.patch("rqd.rqnetwork.Network", autospec=True) + @mock.patch("os._exit") def test_handleExit(self, 
networkMock, exitMock): self.rqcore = rqd.rqcore.RqCore() @@ -280,9 +305,11 @@ def test_handleExit(self, networkMock, exitMock): exitMock.assert_called() - @mock.patch('rqd.rqcore.FrameAttendantThread') + @mock.patch("rqd.rqcore.FrameAttendantThread") def test_launchFrame(self, frameThreadMock): - self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail(total_cores=100, idle_cores=20) + self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail( + total_cores=100, idle_cores=20 + ) self.machineMock.return_value.state = rqd.compiled_proto.host_pb2.UP self.nimbyMock.return_value.locked = False frame = rqd.compiled_proto.rqd_pb2.RunFrame(uid=22, num_cores=10) @@ -299,7 +326,8 @@ def test_launchFrameOnDownHost(self): with self.assertRaises(rqd.rqexceptions.CoreReservationFailureException): self.rqcore.launchFrame(frame) - def test_launchFrameOnHostWaitingForShutdown(self): + @mock.patch("os._exit") + def test_launchFrameOnHostWaitingForShutdown(self, exitMock): self.machineMock.return_value.state = rqd.compiled_proto.host_pb2.UP self.nimbyMock.return_value.active = False frame = rqd.compiled_proto.rqd_pb2.RunFrame() @@ -308,13 +336,16 @@ def test_launchFrameOnHostWaitingForShutdown(self): with self.assertRaises(rqd.rqexceptions.CoreReservationFailureException): self.rqcore.launchFrame(frame) - @mock.patch('rqd.rqcore.FrameAttendantThread') + @mock.patch("rqd.rqcore.FrameAttendantThread") def test_launchFrameOnNimbyHost(self, frameThreadMock): - self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail(total_cores=100, idle_cores=20) + self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail( + total_cores=100, idle_cores=20 + ) self.machineMock.return_value.state = rqd.compiled_proto.host_pb2.UP frame = rqd.compiled_proto.rqd_pb2.RunFrame(uid=22, num_cores=10) frameIgnoreNimby = rqd.compiled_proto.rqd_pb2.RunFrame( - uid=22, num_cores=10, ignore_nimby=True) + uid=22, num_cores=10, ignore_nimby=True + ) self.rqcore.nimby = mock.create_autospec(rqd.rqnimby.NimbySelect) self.rqcore.nimby.locked = True @@ -326,11 +357,15 @@ def test_launchFrameOnNimbyHost(self, frameThreadMock): frameThreadMock.return_value.start.assert_called() def test_launchDuplicateFrame(self): - self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail(total_cores=100, idle_cores=20) + self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail( + total_cores=100, idle_cores=20 + ) self.machineMock.return_value.state = rqd.compiled_proto.host_pb2.UP self.nimbyMock.return_value.locked = False - frameId = 'arbitrary-frame-id' - self.rqcore.storeFrame(frameId, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frameId)) + frameId = "arbitrary-frame-id" + self.rqcore.storeFrame( + frameId, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frameId) + ) frameToLaunch = rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frameId) rqd.rqconstants.OVERRIDE_NIMBY = None @@ -354,7 +389,9 @@ def test_launchFrameWithInvalidCoreCount(self): self.rqcore.launchFrame(frame) def test_launchFrameWithInsufficientCores(self): - self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail(total_cores=100, idle_cores=5) + self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail( + total_cores=100, idle_cores=5 + ) self.machineMock.return_value.state = rqd.compiled_proto.host_pb2.UP self.nimbyMock.return_value.locked = False frame = rqd.compiled_proto.rqd_pb2.RunFrame(uid=22, num_cores=10) @@ -363,14 +400,15 @@ def test_launchFrameWithInsufficientCores(self): self.rqcore.launchFrame(frame) def test_getRunningFrame(self): - frameId = 
'arbitrary-frame-id' + frameId = "arbitrary-frame-id" frame = rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frameId) self.rqcore.storeFrame(frameId, frame) self.assertEqual(frame, self.rqcore.getRunningFrame(frameId)) - self.assertIsNone(self.rqcore.getRunningFrame('some-unknown-frame-id')) + self.assertIsNone(self.rqcore.getRunningFrame("some-unknown-frame-id")) - def test_rebootNowNoUser(self): + @mock.patch("os._exit") + def test_rebootNowNoUser(self, exitMock): self.machineMock.return_value.isUserLoggedIn.return_value = False self.nimbyMock.return_value.active = False @@ -384,7 +422,8 @@ def test_rebootNowWithUser(self): with self.assertRaises(rqd.rqexceptions.RqdException): self.rqcore.rebootNow() - def test_rebootIdleNoFrames(self): + @mock.patch("os._exit") + def test_rebootIdleNoFrames(self, exitMock): self.machineMock.return_value.isUserLoggedIn.return_value = False self.nimbyMock.return_value.active = False @@ -392,10 +431,12 @@ def test_rebootIdleNoFrames(self): self.machineMock.return_value.reboot.assert_called_with() - def test_rebootIdleWithFrames(self): - frame1Id = 'frame1' + @mock.patch("os._exit") + def test_rebootIdleWithFrames(self, exitMock): + frame1Id = "frame1" frame1 = rqd.rqnetwork.RunningFrame( - self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame1Id)) + self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame1Id) + ) self.rqcore.storeFrame(frame1Id, frame1) self.rqcore.rebootIdle() @@ -403,29 +444,13 @@ def test_rebootIdleWithFrames(self): self.assertTrue(self.rqcore.isWaitingForIdle()) self.machineMock.return_value.reboot.assert_not_called() - @mock.patch('os.getuid', new=mock.MagicMock(return_value=0)) - @mock.patch('platform.system', new=mock.MagicMock(return_value='Linux')) - def test_nimbyOn(self): - self.nimbyMock.return_value.active = False - - self.rqcore.nimbyOn() - - self.nimbyMock.return_value.run.assert_called_with() - - def test_nimbyOff(self): - self.nimbyMock.return_value.active = True - - self.rqcore.nimbyOff() - - self.nimbyMock.return_value.stop.assert_called_with() - - @mock.patch.object(rqd.rqcore.RqCore, 'killAllFrame', autospec=True) + @mock.patch.object(rqd.rqcore.RqCore, "killAllFrame", autospec=True) def test_onNimbyLock(self, killAllFrameMock): self.rqcore.onNimbyLock() killAllFrameMock.assert_called_with(self.rqcore, mock.ANY) - @mock.patch.object(rqd.rqcore.RqCore, 'sendStatusReport', autospec=True) + @mock.patch.object(rqd.rqcore.RqCore, "sendStatusReport", autospec=True) def test_onNimbyUnlock(self, sendStatusReportMock): self.rqcore.onNimbyUnlock() @@ -507,6 +532,7 @@ def test_unlockAllWhenNimbyLocked(self): self.rqcore.cores.total_cores = 50 self.rqcore.cores.idle_cores = 40 self.rqcore.cores.locked_cores = 10 + self.rqcore.nimby.locked = True self.rqcore.unlockAll() @@ -514,40 +540,95 @@ def test_unlockAllWhenNimbyLocked(self): self.assertEqual(40, self.rqcore.cores.idle_cores) self.assertEqual(0, self.rqcore.cores.locked_cores) + def test_sendFrameCompleteReport(self): + logDir = "/path/to/log/dir/" + frameId = "arbitrary-frame-id" + jobName = "arbitrary-job-name" + frameName = "arbitrary-frame-name" + frameUid = 928 + frameUsername = "my-random-user" + children = rqd.compiled_proto.report_pb2.ChildrenProcStats() + returnCode = 0 -@mock.patch('rqd.rqutil.checkAndCreateUser', new=mock.MagicMock()) -@mock.patch('rqd.rqutil.permissionsHigh', new=mock.MagicMock()) -@mock.patch('rqd.rqutil.permissionsLow', new=mock.MagicMock()) -@mock.patch('subprocess.Popen') -@mock.patch('time.time') 
-@mock.patch('rqd.rqutil.permissionsUser', spec=True) + runFrame = rqd.compiled_proto.rqd_pb2.RunFrame( + frame_id=frameId, + job_name=jobName, + frame_name=frameName, + uid=frameUid, + user_name=frameUsername, + log_dir=logDir, + children=children, + ) + frameInfo = rqd.rqnetwork.RunningFrame(self.rqcore, runFrame) + frameInfo.exitStatus = 0 + frameInfo.exitSignal = 0 + frameInfo.ignoreNimby = True + + renderHost = rqd.compiled_proto.report_pb2.RenderHost( + name="arbitrary-host-name" + ) + self.rqcore.machine.getHostInfo.return_value = renderHost + self.rqcore.nimby = mock.MagicMock() + self.rqcore.nimby.locked.return_value = False + self.rqcore.network.reportRunningFrameCompletion = mock.MagicMock() + self.rqcore.sendFrameCompleteReport(frameInfo) + + self.rqcore.network.reportRunningFrameCompletion.assert_called_once_with( + rqd.compiled_proto.report_pb2.FrameCompleteReport( + host=renderHost, + frame=rqd.compiled_proto.report_pb2.RunningFrameInfo( + job_name=jobName, + frame_id=frameId, + frame_name=frameName, + children=children, + ), + exit_status=returnCode, + ) + ) + + +@mock.patch("rqd.rqutil.checkAndCreateUser", new=mock.MagicMock()) +@mock.patch("rqd.rqutil.permissionsHigh", new=mock.MagicMock()) +@mock.patch("rqd.rqutil.permissionsLow", new=mock.MagicMock()) +@mock.patch("subprocess.Popen") +@mock.patch("time.time") +@mock.patch("rqd.rqutil.permissionsUser", spec=True) class FrameAttendantThreadTests(pyfakefs.fake_filesystem_unittest.TestCase): def setUp(self): self.setUpPyfakefs() - rqd.rqconstants.SU_ARGUMENT = '-c' - - @mock.patch('platform.system', new=mock.Mock(return_value='Linux')) - @mock.patch('tempfile.gettempdir') - def test_runLinux(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdirMock, openMock, + rqd.rqconstants.SU_ARGUMENT = "-c" + + @mock.patch("platform.system", new=mock.Mock(return_value="Linux")) + @mock.patch("tempfile.gettempdir") + @mock.patch("select.poll") + def test_runLinux( + self, selectMock, getTempDirMock, permsUser, timeMock, popenMock + ): # mkdirMock, openMock, # given currentTime = 1568070634.3 - jobTempPath = '/job/temp/path/' - logDir = '/path/to/log/dir/' - tempDir = '/some/random/temp/dir' - frameId = 'arbitrary-frame-id' - jobName = 'arbitrary-job-name' - frameName = 'arbitrary-frame-name' + jobTempPath = "/job/temp/path/" + logDir = "/path/to/log/dir/" + tempDir = "/some/random/temp/dir" + frameId = "arbitrary-frame-id" + jobName = "arbitrary-job-name" + frameName = "arbitrary-frame-name" frameUid = 928 - frameUsername = 'my-random-user' + frameUsername = "my-random-user" returnCode = 0 - renderHost = rqd.compiled_proto.report_pb2.RenderHost(name='arbitrary-host-name') - logFile = os.path.join(logDir, '%s.%s.rqlog' % (jobName, frameName)) + renderHost = rqd.compiled_proto.report_pb2.RenderHost( + name="arbitrary-host-name" + ) + logFile = os.path.join(logDir, "%s.%s.rqlog" % (jobName, frameName)) self.fs.create_dir(tempDir) timeMock.return_value = currentTime getTempDirMock.return_value = tempDir + popenMock.return_value.wait.return_value = returnCode + popenMock.return_value.stdout.readline.return_value = None + + selectMock.return_value.poll.return_value = [] rqCore = mock.MagicMock() rqCore.intervalStartTime = 20 @@ -557,7 +638,6 @@ def test_runLinux(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdir rqCore.machine.getHostInfo.return_value = renderHost rqCore.nimby.locked = False rqCore.docker_client = None - children = rqd.compiled_proto.report_pb2.ChildrenProcStats() runFrame = 
rqd.compiled_proto.rqd_pb2.RunFrame( @@ -567,7 +647,8 @@ def test_runLinux(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdir uid=frameUid, user_name=frameUsername, log_dir=logDir, - children=children) + children=children, + ) frameInfo = rqd.rqnetwork.RunningFrame(rqCore, runFrame) # when @@ -579,10 +660,15 @@ def test_runLinux(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdir permsUser.assert_called_with(frameUid, mock.ANY) popenMock.assert_called_with( [ - '/bin/nice', '/usr/bin/time', '-p', '-o', - jobTempPath + 'rqd-stat-' + frameId + '-' + str(currentTime), - '/bin/su', frameUsername, '-c', - '"' + tempDir + '/rqd-cmd-' + frameId + '-' + str(currentTime) + '"' + "/bin/nice", + "/usr/bin/time", + "-p", + "-o", + jobTempPath + "rqd-stat-" + frameId + "-" + str(currentTime), + "/bin/su", + frameUsername, + "-c", + '"' + tempDir + "/rqd-cmd-" + frameId + "-" + str(currentTime) + '"', ], env=mock.ANY, cwd=jobTempPath, @@ -590,22 +676,20 @@ def test_runLinux(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdir stdout=mock.ANY, stderr=mock.ANY, close_fds=mock.ANY, - preexec_fn=mock.ANY) + preexec_fn=mock.ANY, + ) self.assertTrue(os.path.exists(logDir)) self.assertTrue(os.path.isfile(logFile)) _, kwargs = popenMock.call_args - rqCore.network.reportRunningFrameCompletion.assert_called_with( - rqd.compiled_proto.report_pb2.FrameCompleteReport( - host=renderHost, - frame=rqd.compiled_proto.report_pb2.RunningFrameInfo( - job_name=jobName, frame_id=frameId, frame_name=frameName, children=children), - exit_status=returnCode)) + rqCore.sendFrameCompleteReport.assert_called_with( + frameInfo + ) @mock.patch('platform.system', new=mock.Mock(return_value='Linux')) @mock.patch('tempfile.gettempdir') - def test_runDocker(self, getTempDirMock, timeMock, popenMock): # mkdirMock, openMock, + def test_runDocker(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdirMock, openMock, # given currentTime = 1568070634.3 jobTempPath = '/job/temp/path/' @@ -665,11 +749,11 @@ def test_runDocker(self, getTempDirMock, timeMock, popenMock): # mkdirMock, open attendantThread.join() # then - cmd_file = os.path.join(jobTempPath, 'rqd-cmd-%s-%s' % (runFrame.frame_id, currentTime)) + cmd_file = os.path.join(tempDir, 'rqd-cmd-%s-%s' % (runFrame.frame_id, currentTime)) rqCore.docker_client.containers.run.assert_called_with( image="centos7_image", detach=True, - environment={"ENVVAR": "env_value"}, + environment=mock.ANY, working_dir=jobTempPath, mounts=rqCore.docker_mounts, privileged=True, @@ -677,40 +761,49 @@ def test_runDocker(self, getTempDirMock, timeMock, popenMock): # mkdirMock, open pid_mode="host", stderr=True, hostname=mock.ANY, - entrypoint=[cmd_file] + entrypoint=cmd_file ) with open(cmd_file) as f: cmd = f.read() - self.assertEquals("tbd", cmd) + self.assertEqual(r"""#!/bin/sh +useradd -u %s -g %s %s >& /dev/null || true; +exec su -s /bin/sh %s -c "echo \$$; /bin/nice /usr/bin/time -p -o /job/temp/path/rqd-stat-%s-%s " +""" % ( + frameUid, + rqd.rqconstants.LAUNCH_FRAME_USER_GID, + frameUsername, + frameUsername, + frameId, + currentTime + ), cmd) self.assertTrue(os.path.exists(logDir)) self.assertTrue(os.path.isfile(logFile)) - _, kwargs = popenMock.call_args - rqCore.network.reportRunningFrameCompletion.assert_called_with( - rqd.compiled_proto.report_pb2.FrameCompleteReport( - host=renderHost, - frame=rqd.compiled_proto.report_pb2.RunningFrameInfo( - job_name=jobName, frame_id=frameId, frame_name=frameName, children=children), - exit_status=returnCode)) + 
rqCore.sendFrameCompleteReport.assert_called_with( + frameInfo + ) + # TODO(bcipriano) Re-enable this test once Windows is supported. The main sticking point here # is that the log directory is always overridden on Windows which makes mocking difficult. - @mock.patch('platform.system', new=mock.Mock(return_value='Windows')) + @mock.patch("platform.system", new=mock.Mock(return_value="Windows")) def disabled__test_runWindows(self, permsUser, timeMock, popenMock): currentTime = 1568070634.3 - jobTempPath = '/job/temp/path/' - logDir = '/path/to/log/dir/' - tempDir = 'C:\\temp' - frameId = 'arbitrary-frame-id' - jobId = 'arbitrary-job-id' - jobName = 'arbitrary-job-name' - frameName = 'arbitrary-frame-name' + jobTempPath = "/job/temp/path/" + logDir = "/path/to/log/dir/" + tempDir = "C:\\temp" + frameId = "arbitrary-frame-id" + jobId = "arbitrary-job-id" + jobName = "arbitrary-job-name" + frameName = "arbitrary-frame-name" frameUid = 928 - frameUsername = 'my-random-user' + frameUsername = "my-random-user" returnCode = 0 - renderHost = rqd.compiled_proto.report_pb2.RenderHost(name='arbitrary-host-name') + renderHost = rqd.compiled_proto.report_pb2.RenderHost( + name="arbitrary-host-name" + ) timeMock.return_value = currentTime popenMock.return_value.returncode = returnCode @@ -722,7 +815,6 @@ def disabled__test_runWindows(self, permsUser, timeMock, popenMock): rqCore.machine.isDesktop.return_value = True rqCore.machine.getHostInfo.return_value = renderHost rqCore.nimby.locked = False - rqCore.docker_client = None children = rqd.compiled_proto.report_pb2.ChildrenProcStats() runFrame = rqd.compiled_proto.rqd_pb2.RunFrame( @@ -734,7 +826,8 @@ def disabled__test_runWindows(self, permsUser, timeMock, popenMock): user_name=frameUsername, log_dir=logDir, children=children, - environment={'CUE_IFRAME': '2000'}) + environment={"CUE_IFRAME": "2000"}, + ) frameInfo = rqd.rqnetwork.RunningFrame(rqCore, runFrame) attendantThread = rqd.rqcore.FrameAttendantThread(rqCore, runFrame, frameInfo) @@ -743,41 +836,51 @@ def disabled__test_runWindows(self, permsUser, timeMock, popenMock): permsUser.assert_called_with(frameUid, mock.ANY) popenMock.assert_called_with( - [tempDir + '/rqd-cmd-' + frameId + '-' + str(currentTime) + '.bat'], + [tempDir + "/rqd-cmd-" + frameId + "-" + str(currentTime) + ".bat"], stdin=mock.ANY, stdout=mock.ANY, - stderr=mock.ANY) + stderr=mock.ANY, + ) # TODO(bcipriano) Verify the log directory was created and used for stdout/stderr. 
rqCore.network.reportRunningFrameCompletion.assert_called_with( rqd.compiled_proto.report_pb2.FrameCompleteReport( host=renderHost, frame=rqd.compiled_proto.report_pb2.RunningFrameInfo( - job_name=jobName, frame_id=frameId, frame_name=frameName, children=children), - exit_status=returnCode)) + job_name=jobName, + frame_id=frameId, + frame_name=frameName, + children=children, + ), + exit_status=returnCode, + ) + ) - @mock.patch('platform.system', new=mock.Mock(return_value='Darwin')) - @mock.patch('tempfile.gettempdir') + @mock.patch("platform.system", new=mock.Mock(return_value="Darwin")) + @mock.patch("tempfile.gettempdir") def test_runDarwin(self, getTempDirMock, permsUser, timeMock, popenMock): # given currentTime = 1568070634.3 - jobTempPath = '/job/temp/path/' - logDir = '/path/to/log/dir/' - tempDir = '/some/random/temp/dir' - frameId = 'arbitrary-frame-id' - jobName = 'arbitrary-job-name' - frameName = 'arbitrary-frame-name' + jobTempPath = "/job/temp/path/" + logDir = "/path/to/log/dir/" + tempDir = "/some/random/temp/dir" + frameId = "arbitrary-frame-id" + jobName = "arbitrary-job-name" + frameName = "arbitrary-frame-name" frameUid = 928 - frameUsername = 'my-random-user' + frameUsername = "my-random-user" returnCode = 0 - renderHost = rqd.compiled_proto.report_pb2.RenderHost(name='arbitrary-host-name') - logFile = os.path.join(logDir, '%s.%s.rqlog' % (jobName, frameName)) + renderHost = rqd.compiled_proto.report_pb2.RenderHost( + name="arbitrary-host-name" + ) + logFile = os.path.join(logDir, "%s.%s.rqlog" % (jobName, frameName)) self.fs.create_dir(tempDir) timeMock.return_value = currentTime getTempDirMock.return_value = tempDir popenMock.return_value.returncode = returnCode + popenMock.return_value.stdout.readline.return_value = None rqCore = mock.MagicMock() rqCore.intervalStartTime = 20 @@ -796,7 +899,8 @@ def test_runDarwin(self, getTempDirMock, permsUser, timeMock, popenMock): uid=frameUid, user_name=frameUsername, log_dir=logDir, - children=children) + children=children, + ) frameInfo = rqd.rqnetwork.RunningFrame(rqCore, runFrame) # when @@ -808,29 +912,29 @@ def test_runDarwin(self, getTempDirMock, permsUser, timeMock, popenMock): permsUser.assert_called_with(frameUid, mock.ANY) popenMock.assert_called_with( [ - '/usr/bin/su', frameUsername, '-c', - '"' + tempDir + '/rqd-cmd-' + frameId + '-' + str(currentTime) + '"' + "/usr/bin/su", + frameUsername, + "-c", + '"' + tempDir + "/rqd-cmd-" + frameId + "-" + str(currentTime) + '"', ], env=mock.ANY, cwd=jobTempPath, stdin=mock.ANY, stdout=mock.ANY, stderr=mock.ANY, - preexec_fn=mock.ANY) + preexec_fn=mock.ANY, + ) self.assertTrue(os.path.exists(logDir)) self.assertTrue(os.path.isfile(logFile)) _, kwargs = popenMock.call_args - self.assertEqual(logFile, kwargs['stdout'].name) - self.assertEqual(logFile, kwargs['stderr'].name) + self.assertEqual(subprocess.PIPE, kwargs["stdout"]) + self.assertEqual(subprocess.STDOUT, kwargs["stderr"]) - rqCore.network.reportRunningFrameCompletion.assert_called_with( - rqd.compiled_proto.report_pb2.FrameCompleteReport( - host=renderHost, - frame=rqd.compiled_proto.report_pb2.RunningFrameInfo( - job_name=jobName, frame_id=frameId, frame_name=frameName, children=children), - exit_status=returnCode)) + rqCore.sendFrameCompleteReport.assert_called_with( + frameInfo + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/rqd/tests/rqmachine_tests.py b/rqd/tests/rqmachine_test.py similarity index 100% rename from rqd/tests/rqmachine_tests.py rename to 
rqd/tests/rqmachine_test.py diff --git a/rqd/tests/rqnimby_test.py b/rqd/tests/rqnimby_test.py new file mode 100644 index 000000000..8408fd5b8 --- /dev/null +++ b/rqd/tests/rqnimby_test.py @@ -0,0 +1,143 @@ +# #!/usr/bin/env python +# # Copyright Contributors to the OpenCue Project +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. + + +# """Tests for rqd.rqnimby.""" + + +# from __future__ import print_function +# from __future__ import division +# from __future__ import absolute_import + +# import unittest + +# import mock +# import pyfakefs.fake_filesystem_unittest + +# import rqd.rqcore +# import rqd.rqmachine +# import rqd.rqnimby + + +# @mock.patch('rqd.rqutil.permissionsHigh', new=mock.MagicMock()) +# @mock.patch('rqd.rqutil.permissionsLow', new=mock.MagicMock()) +# class RqNimbyTests(pyfakefs.fake_filesystem_unittest.TestCase): +# def setUp(self): +# self.setUpPyfakefs() +# self.inputDevice = self.fs.create_file('/dev/input/event0', contents='mouse event') + +# self.rqMachine = mock.MagicMock(spec=rqd.rqmachine.Machine) +# self.rqCore = mock.MagicMock(spec=rqd.rqcore.RqCore) +# self.rqCore.machine = self.rqMachine +# self.nimby = rqd.rqnimby.NimbyFactory.getNimby(self.rqCore) +# self.nimby.daemon = True + +# @mock.patch.object(rqd.rqnimby.NimbySelect, 'unlockedIdle') +# def test_initialState(self, unlockedIdleMock): +# self.nimby.daemon = True + +# self.nimby.start() +# self.nimby.join() + +# # Initial state should be "unlocked and idle". +# unlockedIdleMock.assert_called() + +# self.nimby.stop() + +# @mock.patch('select.select', new=mock.MagicMock(return_value=[['a new mouse event'], [], []])) +# @mock.patch('threading.Timer') +# def test_unlockedIdle(self, timerMock): +# self.nimby.active = True +# self.nimby.results = [[]] +# self.rqCore.machine.isNimbySafeToRunJobs.return_value = True + +# self.nimby.unlockedIdle() + +# # Given a mouse event, Nimby should transition to "locked and in use". +# timerMock.assert_called_with(mock.ANY, self.nimby.lockedInUse) +# timerMock.return_value.start.assert_called() + +# @mock.patch('select.select', new=mock.MagicMock(return_value=[[], [], []])) +# @mock.patch.object(rqd.rqnimby.NimbySelect, 'unlockedIdle') +# @mock.patch('threading.Timer') +# def test_lockedIdleWhenIdle(self, timerMock, unlockedIdleMock): +# self.nimby.active = True +# self.nimby.results = [[]] +# self.rqCore.machine.isNimbySafeToRunJobs.return_value = True + +# self.nimby.lockedIdle() + +# # Given no events, Nimby should transition to "unlocked and idle". +# unlockedIdleMock.assert_called() + +# @mock.patch('select.select', new=mock.MagicMock(return_value=[['a new mouse event'], [], []])) +# @mock.patch('threading.Timer') +# def test_lockedIdleWhenInUse(self, timerMock): +# self.nimby.active = True +# self.nimby.results = [[]] +# self.rqCore.machine.isNimbySafeToRunJobs.return_value = True + +# self.nimby.lockedIdle() + +# # Given a mouse event, Nimby should transition to "locked and in use". 
+# timerMock.assert_called_with(mock.ANY, self.nimby.lockedInUse) +# timerMock.return_value.start.assert_called() + +# @mock.patch('select.select', new=mock.MagicMock(return_value=[[], [], []])) +# @mock.patch.object(rqd.rqnimby.NimbySelect, 'lockedIdle') +# @mock.patch('threading.Timer') +# def test_lockedInUseWhenIdle(self, timerMock, lockedIdleMock): +# self.nimby.active = True +# self.nimby.results = [[]] +# self.rqCore.machine.isNimbySafeToRunJobs.return_value = True + +# self.nimby.lockedInUse() + +# # Given no events, Nimby should transition to "locked and idle". +# lockedIdleMock.assert_called() + +# @mock.patch('select.select', new=mock.MagicMock(return_value=[['a new mouse event'], [], []])) +# @mock.patch('threading.Timer') +# def test_lockedInUseWhenInUse(self, timerMock): +# self.nimby.active = True +# self.nimby.results = [[]] +# self.rqCore.machine.isNimbySafeToRunJobs.return_value = True + +# self.nimby.lockedInUse() + +# # Given a mouse event, Nimby should stay in state "locked and in use". +# timerMock.assert_called_with(mock.ANY, self.nimby.lockedInUse) +# timerMock.return_value.start.assert_called() + +# def test_lockNimby(self): +# self.nimby.active = True +# self.nimby.locked = False + +# self.nimby.lockNimby() + +# self.assertTrue(self.nimby.locked) +# self.rqCore.onNimbyLock.assert_called() + +# def test_unlockNimby(self): +# self.nimby.locked = True + +# self.nimby.unlockNimby() + +# self.assertFalse(self.nimby.locked) +# self.rqCore.onNimbyUnlock.assert_called() + + +# if __name__ == '__main__': +# unittest.main() diff --git a/rqd/tests/rqnimby_tests.py b/rqd/tests/rqnimby_tests.py deleted file mode 100644 index 04cf3e765..000000000 --- a/rqd/tests/rqnimby_tests.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env python -# Copyright Contributors to the OpenCue Project -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -"""Tests for rqd.rqnimby.""" - - -from __future__ import print_function -from __future__ import division -from __future__ import absolute_import - -import unittest - -import mock -import pyfakefs.fake_filesystem_unittest - -import rqd.rqcore -import rqd.rqmachine -import rqd.rqnimby - - -@mock.patch('rqd.rqutil.permissionsHigh', new=mock.MagicMock()) -@mock.patch('rqd.rqutil.permissionsLow', new=mock.MagicMock()) -class RqNimbyTests(pyfakefs.fake_filesystem_unittest.TestCase): - def setUp(self): - self.setUpPyfakefs() - self.inputDevice = self.fs.create_file('/dev/input/event0', contents='mouse event') - - self.rqMachine = mock.MagicMock(spec=rqd.rqmachine.Machine) - self.rqCore = mock.MagicMock(spec=rqd.rqcore.RqCore) - self.rqCore.machine = self.rqMachine - self.nimby = rqd.rqnimby.NimbyFactory.getNimby(self.rqCore) - self.nimby.daemon = True - - @mock.patch.object(rqd.rqnimby.NimbySelect, 'unlockedIdle') - def test_initialState(self, unlockedIdleMock): - self.nimby.daemon = True - - self.nimby.start() - self.nimby.join() - - # Initial state should be "unlocked and idle". 
- unlockedIdleMock.assert_called() - - self.nimby.stop() - - @mock.patch('select.select', new=mock.MagicMock(return_value=[['a new mouse event'], [], []])) - @mock.patch('threading.Timer') - def test_unlockedIdle(self, timerMock): - self.nimby.active = True - self.nimby.results = [[]] - self.rqCore.machine.isNimbySafeToRunJobs.return_value = True - - self.nimby.unlockedIdle() - - # Given a mouse event, Nimby should transition to "locked and in use". - timerMock.assert_called_with(mock.ANY, self.nimby.lockedInUse) - timerMock.return_value.start.assert_called() - - @mock.patch('select.select', new=mock.MagicMock(return_value=[[], [], []])) - @mock.patch.object(rqd.rqnimby.NimbySelect, 'unlockedIdle') - @mock.patch('threading.Timer') - def test_lockedIdleWhenIdle(self, timerMock, unlockedIdleMock): - self.nimby.active = True - self.nimby.results = [[]] - self.rqCore.machine.isNimbySafeToRunJobs.return_value = True - - self.nimby.lockedIdle() - - # Given no events, Nimby should transition to "unlocked and idle". - unlockedIdleMock.assert_called() - - @mock.patch('select.select', new=mock.MagicMock(return_value=[['a new mouse event'], [], []])) - @mock.patch('threading.Timer') - def test_lockedIdleWhenInUse(self, timerMock): - self.nimby.active = True - self.nimby.results = [[]] - self.rqCore.machine.isNimbySafeToRunJobs.return_value = True - - self.nimby.lockedIdle() - - # Given a mouse event, Nimby should transition to "locked and in use". - timerMock.assert_called_with(mock.ANY, self.nimby.lockedInUse) - timerMock.return_value.start.assert_called() - - @mock.patch('select.select', new=mock.MagicMock(return_value=[[], [], []])) - @mock.patch.object(rqd.rqnimby.NimbySelect, 'lockedIdle') - @mock.patch('threading.Timer') - def test_lockedInUseWhenIdle(self, timerMock, lockedIdleMock): - self.nimby.active = True - self.nimby.results = [[]] - self.rqCore.machine.isNimbySafeToRunJobs.return_value = True - - self.nimby.lockedInUse() - - # Given no events, Nimby should transition to "locked and idle". - lockedIdleMock.assert_called() - - @mock.patch('select.select', new=mock.MagicMock(return_value=[['a new mouse event'], [], []])) - @mock.patch('threading.Timer') - def test_lockedInUseWhenInUse(self, timerMock): - self.nimby.active = True - self.nimby.results = [[]] - self.rqCore.machine.isNimbySafeToRunJobs.return_value = True - - self.nimby.lockedInUse() - - # Given a mouse event, Nimby should stay in state "locked and in use". - timerMock.assert_called_with(mock.ANY, self.nimby.lockedInUse) - timerMock.return_value.start.assert_called() - - def test_lockNimby(self): - self.nimby.active = True - self.nimby.locked = False - - self.nimby.lockNimby() - - self.assertTrue(self.nimby.locked) - self.rqCore.onNimbyLock.assert_called() - - def test_unlockNimby(self): - self.nimby.locked = True - - self.nimby.unlockNimby() - - self.assertFalse(self.nimby.locked) - self.rqCore.onNimbyUnlock.assert_called() - - -if __name__ == '__main__': - unittest.main() From be436e0f57b529e2b9290e275a5eaaccfd282695 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Thu, 24 Oct 2024 08:23:06 -0700 Subject: [PATCH 19/51] [rqd] Avoid changing dict in place during iteration (#1554) Deleting an item from the dict being iterated over on sanitizeFrames caused the error: "Dictionary changed size during iteration". 
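For illustration only (not part of the committed change): the pattern applied below is to
iterate over a snapshot of the keys so entries can be removed safely mid-loop. A minimal
sketch, with a stand-in `cache` dict in place of the real frame cache:

    cache = {"frame-a": "done", "frame-b": "running"}
    for frame_id in list(cache.keys()):  # snapshot of the keys, safe to mutate the dict below
        if cache[frame_id] == "done":
            del cache[frame_id]          # no "dictionary changed size during iteration" error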
--- rqd/rqd/rqcore.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 5606605f2..93c7b0d24 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -613,7 +613,8 @@ def sanitizeFrames(self): Iterate over the cache and update the status of frames that might have completed but never reported back to cuebot. """ - for frameId, runningFrame in self.__cache.items(): + for frameId in list(self.__cache.keys): + runningFrame = self.__cache[frameId] # If the frame was marked as completed (exitStatus) and a report has not been sent # try to file the report again if runningFrame.exitStatus is not None and not runningFrame.completeReportSent: From c8a9c2bcebd7fe0825fa5583b5198580515dc622 Mon Sep 17 00:00:00 2001 From: Jimmy Christensen Date: Sat, 26 Oct 2024 00:07:36 +0200 Subject: [PATCH 20/51] Add script for converting imports in grpc python modules and remove 2to3 (#1557) **Link the Issue(s) this Pull Request is related to.** This is to fix #1555 **Summarize your change.** Replaces 2to3 with a simple script that adds "from ." in front of pb2 imports. This is done to support newer versions of python where 2to3 has been removed. --- ci/build_sphinx_docs.sh | 2 +- ci/fix_compiled_proto.py | 26 ++++++++++++++++++++++++ ci/python_coverage_report.sh | 4 ++-- ci/run_python_lint.sh | 4 ++-- ci/run_python_tests.sh | 4 ++-- connectors/prometheus_metrics/Dockerfile | 2 +- cueadmin/Dockerfile | 2 +- cuegui/Dockerfile | 2 +- cuesubmit/Dockerfile | 2 +- proto/README.md | 10 ++++----- pycue/Dockerfile | 2 +- pyoutline/Dockerfile | 2 +- requirements.txt | 1 - rqd/Dockerfile | 2 +- sandbox/install-client-sources.sh | 2 +- 15 files changed, 45 insertions(+), 22 deletions(-) create mode 100644 ci/fix_compiled_proto.py diff --git a/ci/build_sphinx_docs.sh b/ci/build_sphinx_docs.sh index 9f207d51e..b39c68fc2 100755 --- a/ci/build_sphinx_docs.sh +++ b/ci/build_sphinx_docs.sh @@ -10,7 +10,7 @@ python -m grpc_tools.protoc -I=proto/ --python_out=pycue/opencue/compiled_proto # Fix imports to work in both Python 2 and 3. See # for more info. -2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +python ci/fix_compiled_proto.py pycue/opencue/compiled_proto # Build the docs and treat warnings as errors ~/.local/bin/sphinx-build -W -b html -d docs/_build/doctrees docs docs/_build/html diff --git a/ci/fix_compiled_proto.py b/ci/fix_compiled_proto.py new file mode 100644 index 000000000..0a5534803 --- /dev/null +++ b/ci/fix_compiled_proto.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python + +"""Script that makes the imports in the generated compiled_proto python files relative. + +""" +import os +import re +import sys +import glob + +PYTHON_SCRIPT_PATH = sys.argv[1] + +if os.path.isdir(PYTHON_SCRIPT_PATH): + pattern = re.compile(r"^import \w+ as \w+_pb2") + for filepath in glob.glob(os.path.join(PYTHON_SCRIPT_PATH, "*_pb2*.py")): + filedata = [] + with open(filepath) as f: + for line in f.readlines(): + match = pattern.match(line) + if match is not None: + line = f"from . {line}" + filedata.append(line.strip("\n")) + with open(filepath, "w") as f: + f.write("\n".join(filedata)) +else: + print("Argument is not a directory") diff --git a/ci/python_coverage_report.sh b/ci/python_coverage_report.sh index e0c65328f..4477c8d44 100755 --- a/ci/python_coverage_report.sh +++ b/ci/python_coverage_report.sh @@ -9,8 +9,8 @@ python -m pip install coverage pytest-xvfb # Protos need to have their Python code generated in order for tests to pass. 
python -m grpc_tools.protoc -I=proto/ --python_out=pycue/opencue/compiled_proto --grpc_python_out=pycue/opencue/compiled_proto proto/*.proto python -m grpc_tools.protoc -I=proto/ --python_out=rqd/rqd/compiled_proto --grpc_python_out=rqd/rqd/compiled_proto proto/*.proto -2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py -2to3 -wn -f import rqd/rqd/compiled_proto/*_pb2*.py +python ci/fix_compiled_proto.py pycue/opencue/compiled_proto +python ci/fix_compiled_proto.py rqd/rqd/compiled_proto # Run coverage for each component individually, but append it all into the same report. python -m coverage run --source=pycue/opencue/,pycue/FileSequence/ --omit=pycue/opencue/compiled_proto/* pycue/tests/test_suite.py diff --git a/ci/run_python_lint.sh b/ci/run_python_lint.sh index bd86c9188..ba98d2b6c 100755 --- a/ci/run_python_lint.sh +++ b/ci/run_python_lint.sh @@ -13,8 +13,8 @@ python -m grpc_tools.protoc -I=proto/ --python_out=rqd/rqd/compiled_proto --grpc # Fix imports to work in both Python 2 and 3. See # for more info. -2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py -2to3 -wn -f import rqd/rqd/compiled_proto/*_pb2*.py +python ci/fix_compiled_proto.py pycue/opencue/compiled_proto +python ci/fix_compiled_proto.py rqd/rqd/compiled_proto echo "Running lint for pycue/..." cd pycue diff --git a/ci/run_python_tests.sh b/ci/run_python_tests.sh index 8782d4d3a..0c8d121a4 100755 --- a/ci/run_python_tests.sh +++ b/ci/run_python_tests.sh @@ -19,8 +19,8 @@ python -m grpc_tools.protoc -I=proto/ --python_out=rqd/rqd/compiled_proto --grpc # Fix imports to work in both Python 2 and 3. See # for more info. -2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py -2to3 -wn -f import rqd/rqd/compiled_proto/*_pb2*.py +python ci/fix_compiled_proto.py pycue/opencue/compiled_proto +python ci/fix_compiled_proto.py rqd/rqd/compiled_proto python3 -m unittest discover -s pycue/tests -t pycue -p "*.py" PYTHONPATH=pycue python3 -m unittest discover -s pyoutline/tests -t pyoutline -p "*.py" diff --git a/connectors/prometheus_metrics/Dockerfile b/connectors/prometheus_metrics/Dockerfile index f710dce0c..357b5a0bf 100644 --- a/connectors/prometheus_metrics/Dockerfile +++ b/connectors/prometheus_metrics/Dockerfile @@ -48,7 +48,7 @@ RUN python -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. -RUN 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +RUN python3 ci/fix_compiled_proto.py pycue/opencue/compiled_proto RUN cd pycue && python setup.py install diff --git a/cueadmin/Dockerfile b/cueadmin/Dockerfile index 20ff77c1e..dd1359edc 100644 --- a/cueadmin/Dockerfile +++ b/cueadmin/Dockerfile @@ -24,7 +24,7 @@ RUN python3 -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. -RUN 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +RUN python3 ci/fix_compiled_proto.py pycue/opencue/compiled_proto COPY cueadmin/README.md ./cueadmin/ COPY cueadmin/setup.py ./cueadmin/ diff --git a/cuegui/Dockerfile b/cuegui/Dockerfile index 3e6630804..6e53c7f04 100644 --- a/cuegui/Dockerfile +++ b/cuegui/Dockerfile @@ -54,7 +54,7 @@ RUN python3.6 -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. 
-RUN 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +RUN python3 ci/fix_compiled_proto.py pycue/opencue/compiled_proto COPY cuegui/README.md ./cuegui/ COPY cuegui/setup.py ./cuegui/ diff --git a/cuesubmit/Dockerfile b/cuesubmit/Dockerfile index eb2a4902a..47c615f7c 100644 --- a/cuesubmit/Dockerfile +++ b/cuesubmit/Dockerfile @@ -41,7 +41,7 @@ RUN python3.6 -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. -RUN 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +RUN python3 ci/fix_compiled_proto.py pycue/opencue/compiled_proto COPY pyoutline/README.md ./pyoutline/ COPY pyoutline/setup.py ./pyoutline/ diff --git a/proto/README.md b/proto/README.md index 0b039b462..904a112e1 100644 --- a/proto/README.md +++ b/proto/README.md @@ -14,15 +14,14 @@ To generate: ```sh python -m grpc_tools.protoc -I=. --python_out=../rqd/rqd/compiled_proto --grpc_python_out=../rqd/rqd/compiled_proto ./*.proto -2to3 -wn -f import ../rqd/rqd/compiled_proto/*_pb2*.py +python ../ci/fix_compiled_proto.py ../rqd/rqd/compiled_proto ``` For Windows (Powershell): ```powershell python -m grpc_tools.protoc --proto_path=. --python_out=../rqd/rqd/compiled_proto --grpc_python_out=../rqd/rqd/compiled_proto (ls *.proto).Name -cd ..\rqd\rqd\compiled_proto\ -2to3 -wn -f import (ls *_pb2*.py).Name +python ../ci/fix_compiled_proto.py ../rqd/rqd/compiled_proto ``` @@ -32,15 +31,14 @@ To generate: ```sh python -m grpc_tools.protoc -I=. --python_out=../pycue/opencue/compiled_proto --grpc_python_out=../pycue/opencue/compiled_proto ./*.proto -2to3 -wn -f import ../pycue/opencue/compiled_proto/*_pb2*.py +python ../ci/fix_compiled_proto.py ../pycue/opencue/compiled_proto ``` For Windows (Powershell): ```powershell python -m grpc_tools.protoc --proto_path=. --python_out=../pycue/opencue/compiled_proto --grpc_python_out=../pycue/opencue/compiled_proto (ls *.proto).Name -cd ..\pycue\opencue\compiled_proto\ -2to3 -wn -f import (ls *_pb2*.py).Name +python ../ci/fix_compiled_proto.py ../pycue/opencue/compiled_proto ``` diff --git a/pycue/Dockerfile b/pycue/Dockerfile index 9698e94bc..c61d3cf16 100644 --- a/pycue/Dockerfile +++ b/pycue/Dockerfile @@ -25,7 +25,7 @@ RUN python3 -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. -RUN 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +RUN python ci/fix_compiled_proto.py pycue/opencue/compiled_proto COPY VERSION.in VERSIO[N] ./ RUN test -e VERSION || echo "$(cat VERSION.in)" | tee VERSION diff --git a/pyoutline/Dockerfile b/pyoutline/Dockerfile index bc7155daf..6937a5584 100644 --- a/pyoutline/Dockerfile +++ b/pyoutline/Dockerfile @@ -24,7 +24,7 @@ RUN python3 -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. -RUN 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +RUN python ci/fix_compiled_proto.py pycue/opencue/compiled_proto COPY pyoutline/README.md ./pyoutline/ COPY pyoutline/setup.py ./pyoutline/ diff --git a/requirements.txt b/requirements.txt index 41118fe6f..946853794 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -2to3==1.0 enum34==1.1.6 future==1.0.0 grpcio==1.48.2;python_version<"3.7" diff --git a/rqd/Dockerfile b/rqd/Dockerfile index c3a0a0dc1..6847ad10a 100644 --- a/rqd/Dockerfile +++ b/rqd/Dockerfile @@ -35,7 +35,7 @@ RUN python3.9 -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. 
-RUN 2to3 -wn -f import rqd/rqd/compiled_proto/*_pb2*.py +RUN python ci/fix_compiled_proto.py rqd/rqd/compiled_proto COPY VERSION.in VERSIO[N] ./ RUN test -e VERSION || echo "$(cat VERSION.in)" | tee VERSION diff --git a/sandbox/install-client-sources.sh b/sandbox/install-client-sources.sh index 7e15ed018..80ed1f39e 100755 --- a/sandbox/install-client-sources.sh +++ b/sandbox/install-client-sources.sh @@ -10,7 +10,7 @@ python -m grpc_tools.protoc -I=. \ --python_out=../pycue/opencue/compiled_proto \ --grpc_python_out=../pycue/opencue/compiled_proto ./*.proto cd .. -2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +python ../ci/fix_compiled_proto.py pycue/opencue/compiled_proto # Install all client packages. pip install pycue/ pyoutline/ cueadmin/ cuesubmit/ cuegui/ From 3b1d6b5360bf81b81825fbe5c52b67aefa381f19 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 25 Oct 2024 20:03:21 -0700 Subject: [PATCH 21/51] Fix pytest path --- ci/run_python_tests.sh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ci/run_python_tests.sh b/ci/run_python_tests.sh index 0c8d121a4..1259adf4c 100755 --- a/ci/run_python_tests.sh +++ b/ci/run_python_tests.sh @@ -22,12 +22,11 @@ python -m grpc_tools.protoc -I=proto/ --python_out=rqd/rqd/compiled_proto --grpc python ci/fix_compiled_proto.py pycue/opencue/compiled_proto python ci/fix_compiled_proto.py rqd/rqd/compiled_proto -python3 -m unittest discover -s pycue/tests -t pycue -p "*.py" -PYTHONPATH=pycue python3 -m unittest discover -s pyoutline/tests -t pyoutline -p "*.py" -PYTHONPATH=pycue python3 -m unittest discover -s cueadmin/tests -t cueadmin -p "*.py" -PYTHONPATH=pycue:pyoutline python3 -m unittest discover -s cuesubmit/tests -t cuesubmit -p "*.py" -pytest rqd/tests - +python -m unittest discover -s pycue/tests -t pycue -p "*.py" +PYTHONPATH=pycue python -m unittest discover -s pyoutline/tests -t pyoutline -p "*.py" +PYTHONPATH=pycue python -m unittest discover -s cueadmin/tests -t cueadmin -p "*.py" +PYTHONPATH=pycue:pyoutline python -m unittest discover -s cuesubmit/tests -t cuesubmit -p "*.py" +python -m pytest rqd/tests # Xvfb no longer supports Python 2. if [[ "$python_version" =~ "Python 3" && ${args[0]} != "--no-gui" ]]; then From 0100ad635959567340a351b209237550fb266664 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Mon, 28 Oct 2024 15:44:52 -0700 Subject: [PATCH 22/51] Fix issue on rqmachine tests Since https://github.com/AcademySoftwareFoundation/OpenCue/pull/1308 rqd stopped supporting stats files containing whitespaces and parenthesis. --- rqd/rqd/rqmachine.py | 22 ++++++- rqd/tests/rqmachine_test.py | 116 +++++++++++++++++------------------- 2 files changed, 73 insertions(+), 65 deletions(-) diff --git a/rqd/rqd/rqmachine.py b/rqd/rqd/rqmachine.py index 1f67798e3..0687858c7 100644 --- a/rqd/rqd/rqmachine.py +++ b/rqd/rqd/rqmachine.py @@ -215,10 +215,23 @@ def __updateGpuAndLlu(self, frame): frame.lluTime = int(stat) def _getStatFields(self, pidFilePath): + """ Read stats file and return list of values + Stats file can star with these formats: + - 105 name ... + - 105 (name) ... + - 105 (name with space) ... + - 105 (name with) (space and parenthesis) ... 
+ """ with open(pidFilePath, "r", encoding='utf-8') as statFile: - stats = statFile.read().split() - stats[1] = stats[1].strip('()') - return stats + txt = statFile.read() + try: + open_par_index = txt.index('(') + close_par_index = txt.rindex(')') + name = txt[open_par_index:close_par_index].strip("()") + reminder = (txt[0:open_par_index] + txt[close_par_index + 1:]).split() + return reminder[0:1] + [name] + reminder[1:] + except ValueError: + return txt.split() def rssUpdate(self, frames): """Updates the rss and maxrss for all running frames""" @@ -269,6 +282,9 @@ def rssUpdate(self, frames): # Fetch swap usage "swap": self._getProcSwap(pid), } + + # TODO: Improve this logic to avoid collecting data from all running procs. + # instead, focus on the monitored procs hierarchy # cmdline: p = psutil.Process(int(pid)) pids[pid]["cmd_line"] = p.cmdline() diff --git a/rqd/tests/rqmachine_test.py b/rqd/tests/rqmachine_test.py index 1b1bdaf4a..c74b22e80 100644 --- a/rqd/tests/rqmachine_test.py +++ b/rqd/tests/rqmachine_test.py @@ -303,15 +303,19 @@ def _test_rssUpdate(self, proc_stat): self.assertAlmostEqual(0.034444696691, float(updatedFrameInfo.attributes['pcpu'])) @mock.patch('time.time', new=mock.MagicMock(return_value=1570057887.61)) - def test_rssUpdate(self): + @mock.patch('psutil.Process') + def test_rssUpdate(self, processMock): + processMock.return_value.cmdline.return_value = "some_command" self._test_rssUpdate(PROC_PID_STAT) @mock.patch('time.time', new=mock.MagicMock(return_value=1570057887.61)) - def test_rssUpdateWithSpaces(self): + @mock.patch('psutil.Process') + def test_rssUpdateWithSpaces(self, processMock): self._test_rssUpdate(PROC_PID_STAT_WITH_SPACES) @mock.patch('time.time', new=mock.MagicMock(return_value=1570057887.61)) - def test_rssUpdateWithBrackets(self): + @mock.patch('psutil.Process') + def test_rssUpdateWithBrackets(self, processMock): self._test_rssUpdate(PROC_PID_STAT_WITH_BRACKETS) @mock.patch.object( @@ -461,43 +465,41 @@ def test_reserveHT(self): self.machine.setupTaskset() + #-----------------------Core Map------------------------ + # phys 0 phys 1 + # core 0 core 0 + # proc 0 proc 4 + # proc 8 proc 12 + # core 1 core 1 + # proc 1 proc 5 + # proc 9 proc 13 + # core 2 core 2 + # proc 2 proc 6 + # proc 10 proc 14 + # core 3 core 3 + # proc 3 proc 7 + # proc 11 proc 15 # ------------------------step1------------------------- - # phys_id 1 - # - core_id 0 - # - process_id 4 - # - process_id 12 - # - core_id 1 - # - process_id 5 - # - process_id 13 - # - core_id 3 - # - process_id 7 - # - process_id 15 - tasksets1 = self.machine.reserveHT(300) - # pylint: disable=no-member - self.assertItemsEqual(['4', '5', '7', '12', '13', '15'], sorted(tasksets1.split(','))) - - # ------------------------step2------------------------- - # phys_id 0 - # - core_id 0 - # - process_id 0 - # - process_id 8 - # - core_id 1 - # - process_id 1 - # - process_id 9 - # - core_id 2 - # - process_id 2 - # - process_id 10 - # - core_id 3 - # - process_id 3 - # - process_id 11 - tasksets0 = self.machine.reserveHT(400) - # pylint: disable=no-member - self.assertItemsEqual(['0', '1', '2', '3', '8', '9', '10', '11'], - sorted(tasksets0.split(','))) - - # reserved cores got updated properly - # pylint: disable=no-member - self.assertItemsEqual([0, 1, 2, 3], self.coreDetail.reserved_cores[0].coreid) + def assertTaskSet(taskset_list): + """Ensure all tasks are being allocated with the right thread pairs""" + phys0 = [('0', '8'), ('1', '9'), ('10', '2'), ('11', '3')] + phys1 = [('12', '4'), ('13', 
'5'), ('14', '6'), ('15', '7')] + + taskset_2_2 = list(zip(taskset_list[::2], taskset_list[1::2])) + if taskset_2_2[0] in phys0: + for t in taskset_2_2: + self.assertTrue(tuple(sorted(t)) in phys0, "%s not in %s" % (t, phys0)) + elif taskset_2_2[0] in phys1: + for t in taskset_2_2: + self.assertTrue(tuple(sorted(t)) in phys1, "%s not in %s" % (t, phys1)) + + tasksets0 = self.machine.reserveHT(300) + self.assertIsNotNone(tasksets0) + assertTaskSet(tasksets0.split(",")) + + tasksets1 = self.machine.reserveHT(400) + self.assertIsNotNone(tasksets1) + assertTaskSet(tasksets1.split(",")) # Make sure tastsets don't overlap self.assertTrue(set(tasksets0.split(',')).isdisjoint(tasksets1.split(','))) @@ -507,8 +509,6 @@ def test_reserveHT(self): self.machine.releaseHT(tasksets0) # pylint: disable=no-member self.assertTrue(1 in self.coreDetail.reserved_cores) - # pylint: disable=no-member - self.assertItemsEqual([0, 1, 3], self.coreDetail.reserved_cores[1].coreid) # ------------------------step4------------------------- # phys_id 0 @@ -519,29 +519,12 @@ def test_reserveHT(self): # - process_id 1 # - process_id 9 tasksets3 = self.machine.reserveHT(200) - # pylint: disable=no-member - self.assertItemsEqual(['0', '1', '8', '9'], sorted(tasksets3.split(','))) + assertTaskSet(tasksets3.split(",")) # ------------------------step5------------------------- - # phys_id 0 - # - core_id 2 - # - process_id 2 - # - process_id 10 - # - core_id 3 - # - process_id 3 - # - process_id 11 - # phys_id 1 - # - core_id 2 - # - process_id 6 - # - process_id 14 - tasksets4 = self.machine.reserveHT(300) - # pylint: disable=no-member - self.assertItemsEqual(['2', '10', '3', '11', '6', '14'], sorted(tasksets4.split(','))) - - # ------------------------step6------------------------- - # No cores available + # Missing one core with self.assertRaises(rqd.rqexceptions.CoreReservationFailureException): - self.machine.reserveHT(300) + tasksets4 = self.machine.reserveHT(300) def test_tags(self): @@ -553,9 +536,17 @@ def test_tags(self): self.assertTrue(all(tag in machine.__dict__['_Machine__renderHost'].tags for tag in tags)) -class CpuinfoTests(unittest.TestCase): +@mock.patch('platform.system', new=mock.MagicMock(return_value='Linux')) +class CpuinfoTestsLinux(pyfakefs.fake_filesystem_unittest.TestCase): + @mock.patch('platform.system', new=mock.MagicMock(return_value='Linux')) def setUp(self): + self.setUpPyfakefs() + self.fs.create_file('/proc/cpuinfo', contents=CPUINFO) + self.loadavg = self.fs.create_file('/proc/loadavg', contents=LOADAVG_LOW_USAGE) + self.procStat = self.fs.create_file('/proc/stat', contents=PROC_STAT) + self.meminfo = self.fs.create_file('/proc/meminfo', contents=MEMINFO_MODERATE_USAGE) + self.fs.add_real_directory(os.path.dirname(__file__)) self.rqd = rqd.rqcore.RqCore() def test_shark(self): @@ -591,6 +582,7 @@ def test_srdsvr09(self): def __cpuinfoTestHelper(self, pathCpuInfo): # File format: _cpuinfo_dub_x-x-x where x-x-x is totalCores-coresPerProc-numProcs pathCpuInfo = os.path.join(os.path.dirname(__file__), 'cpuinfo', pathCpuInfo) + self.meminfo.set_contents(MEMINFO_MODERATE_USAGE) renderHost, coreInfo = self.rqd.machine.testInitMachineStats(pathCpuInfo) totalCores, coresPerProc, numProcs = pathCpuInfo.split('_')[-1].split('-')[:3] From 08a2625f7576dae31741ab6b29b017ffc1d91dc5 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Mon, 28 Oct 2024 15:45:22 -0700 Subject: [PATCH 23/51] Fix unit tests for rqcore and rqconstants --- rqd/tests/rqconstants_test.py | 14 +------------- 1 file changed, 1 
insertion(+), 13 deletions(-) diff --git a/rqd/tests/rqconstants_test.py b/rqd/tests/rqconstants_test.py index dcd5093ae..0503994a2 100644 --- a/rqd/tests/rqconstants_test.py +++ b/rqd/tests/rqconstants_test.py @@ -121,6 +121,7 @@ def makeRqMachine(self): """ [Override] DEFAULT_FACILITY = test_facility +RQD_TAGS = test_tag1 test_tag2 test_tag3 """, ) def test_facility(self): @@ -128,19 +129,6 @@ def test_facility(self): machine = self.makeRqMachine() self.assertEqual(machine.renderHost.facility, "test_facility") - - @MockConfig( - tempdir, - """ -[Override] -RQD_TAGS = test_tag1 test_tag2 test_tag3 -""", - ) - def test_tags(self): - self.assertEqual(rqd.rqconstants.RQD_TAGS, "test_tag1 test_tag2 test_tag3") - - machine = self.makeRqMachine() - self.assertEqual(machine.renderHost.facility, "cloud") self.assertTrue( set(["test_tag1", "test_tag2", "test_tag3"]).issubset( machine.renderHost.tags From 8d7c0a2dcae78887f26d28d5411deeaf1ecd1658 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Wed, 16 Oct 2024 15:55:35 -0700 Subject: [PATCH 24/51] Add runDocker mode to rqd When RUN_ON_DOCKER is set on rqd.conf, each frame will be launched as a docker container using the base image configured as DOCKER_IMAGE. --- rqd/rqd/rqconstants.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py index 3f18ed149..fea651ecf 100644 --- a/rqd/rqd/rqconstants.py +++ b/rqd/rqd/rqconstants.py @@ -165,6 +165,8 @@ # Hostname can come from here: rqutil.getHostname() __override_section = "Override" __host_env_var_section = "UseHostEnvVar" + __docker_mounts = "docker.mounts" + __docker_config = "docker.config" import six from six.moves import configparser if six.PY2: From e113df49dd7650cf85627f8872bbf151a5e41c3e Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Thu, 17 Oct 2024 16:07:17 -0700 Subject: [PATCH 25/51] [EXPERIMENT] Rqd containerized frame (#1546) Signed-off-by: Diego Tavares --- cuebot/.project | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 cuebot/.project diff --git a/cuebot/.project b/cuebot/.project new file mode 100644 index 000000000..633ce02cc --- /dev/null +++ b/cuebot/.project @@ -0,0 +1,34 @@ + + + cuebot + Project cuebot created by Buildship. + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.buildship.core.gradleprojectbuilder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.buildship.core.gradleprojectnature + + + + 1729914777580 + + 30 + + org.eclipse.core.resources.regexFilterMatcher + node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__ + + + + From a2922283f04399be792c12a4f1523d174ec7980d Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Thu, 17 Oct 2024 16:07:17 -0700 Subject: [PATCH 26/51] [EXPERIMENT] Rqd containerized frame (#1546) Signed-off-by: Diego Tavares --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..b055a1807 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[tool.pyright] +venvPath = "." 
+venv = "venv" From 29330757a40e201261b355554634fc1ae2ffcfa6 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 25 Oct 2024 16:00:46 -0700 Subject: [PATCH 27/51] Add unit tests for runOnDocker --- rqd/rqd/rqcore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 93c7b0d24..44d9cd82b 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -983,10 +983,10 @@ def runDocker(self): tasksetCmd = "taskset -c %s" % runFrame.attributes['CPU_LIST'] # Command wrapper - command = """#!/bin/sh + command = r"""#!/bin/sh useradd -u %s -g %s %s >& /dev/null || true; exec su -s %s %s -c "echo \$$; /bin/nice /usr/bin/time -p -o %s %s %s" - """ % ( +""" % ( runFrame.uid, gid, runFrame.user_name, From 5345a5298ea8bf337e6caf5fb979987632533d2f Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Mon, 28 Oct 2024 17:09:34 -0700 Subject: [PATCH 28/51] Fix python lint --- rqd/rqd/rqconstants.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py index fea651ecf..8bde2010d 100644 --- a/rqd/rqd/rqconstants.py +++ b/rqd/rqd/rqconstants.py @@ -283,26 +283,28 @@ SP_OS = ",".join(keys) if not DOCKER_IMAGES: raise RuntimeError("Misconfigured rqd. RUN_ON_DOCKER=True requires at " - "least one image on DOCKER_IMAGES ([docker.images] section of rqd.conf)") + "least one image on DOCKER_IMAGES ([docker.images] " + "section of rqd.conf)") - def parse_mount(mount_str): + def parse_mount(mount_string): """ Parse mount definitions similar to a docker run command into a docker mount obj Format: type=bind,source=/tmp,target=/tmp,bind-propagation=slave """ - mount_dict = {} + mounts = {} # bind-propagation defaults to None as only type=bind accepts it - mount_dict["bind-propagation"] = None - for item in mount_str.split(","): - key, value = item.split(":") - mount_dict[key.strip()] = value.strip() - return mount_dict + mounts["bind-propagation"] = None + for item in mount_string.split(","): + mount_name, mount_path = item.split(":") + mounts[mount_name.strip()] = mount_path.strip() + return mounts # Parse values under the category docker.mounts into Mount objects mounts = config.options(__docker_mounts) for mount_name in mounts: + mount_str = "" try: mount_str = config.get(__docker_mounts, mount_name) mount_dict = parse_mount(mount_str) From 80c1720e8139ad52cff421520c4b1fc8200ec24e Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Mon, 28 Oct 2024 19:03:50 -0700 Subject: [PATCH 29/51] Fix lint --- rqd/rqd/rqconstants.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py index 8bde2010d..acabed583 100644 --- a/rqd/rqd/rqconstants.py +++ b/rqd/rqd/rqconstants.py @@ -293,13 +293,13 @@ def parse_mount(mount_string): Format: type=bind,source=/tmp,target=/tmp,bind-propagation=slave """ - mounts = {} + parsed_mounts = {} # bind-propagation defaults to None as only type=bind accepts it - mounts["bind-propagation"] = None + parsed_mounts["bind-propagation"] = None for item in mount_string.split(","): - mount_name, mount_path = item.split(":") - mounts[mount_name.strip()] = mount_path.strip() - return mounts + name, mount_path = item.split(":") + parsed_mounts[name.strip()] = mount_path.strip() + return parsed_mounts # Parse values under the category docker.mounts into Mount objects mounts = config.options(__docker_mounts) From 21242b979ac00ef8f9e334ddd7c10f48535725b9 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: 
Mon, 28 Oct 2024 21:42:33 -0700 Subject: [PATCH 30/51] Fix lint issues --- rqd/tests/rqcore_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rqd/tests/rqcore_test.py b/rqd/tests/rqcore_test.py index 678042f19..388eae896 100644 --- a/rqd/tests/rqcore_test.py +++ b/rqd/tests/rqcore_test.py @@ -764,7 +764,7 @@ def test_runDocker(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdi entrypoint=cmd_file ) - with open(cmd_file) as f: + with open(cmd_file, "r", encoding='utf-8') as f: cmd = f.read() self.assertEqual(r"""#!/bin/sh useradd -u %s -g %s %s >& /dev/null || true; From 88b8892af61264b468926fa8bfac51f8f76f5094 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Tue, 29 Oct 2024 08:05:53 -0700 Subject: [PATCH 31/51] Fix migration --- .../conf/ddl/postgres/migrations/V31__increase_os_size.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuebot/src/main/resources/conf/ddl/postgres/migrations/V31__increase_os_size.sql b/cuebot/src/main/resources/conf/ddl/postgres/migrations/V31__increase_os_size.sql index ec3cf4a96..9ad89e437 100644 --- a/cuebot/src/main/resources/conf/ddl/postgres/migrations/V31__increase_os_size.sql +++ b/cuebot/src/main/resources/conf/ddl/postgres/migrations/V31__increase_os_size.sql @@ -1,3 +1,3 @@ -- Increase size of os column on host_stat -ALTER TABLE host_stat -MODIFY COLUMN str_os VARCHAR(32); \ No newline at end of file + +ALTER TABLE host_stat ALTER COLUMN str_os TYPE VARCHAR(32); From 8eba8345fe995b52fdc72815f7a37adcde8db3aa Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Tue, 29 Oct 2024 09:24:32 -0700 Subject: [PATCH 32/51] Remove undesired file --- cuebot/.project | 34 ---------------------------------- 1 file changed, 34 deletions(-) delete mode 100644 cuebot/.project diff --git a/cuebot/.project b/cuebot/.project deleted file mode 100644 index 633ce02cc..000000000 --- a/cuebot/.project +++ /dev/null @@ -1,34 +0,0 @@ - - - cuebot - Project cuebot created by Buildship. 
- - - - - org.eclipse.jdt.core.javabuilder - - - - - org.eclipse.buildship.core.gradleprojectbuilder - - - - - - org.eclipse.jdt.core.javanature - org.eclipse.buildship.core.gradleprojectnature - - - - 1729914777580 - - 30 - - org.eclipse.core.resources.regexFilterMatcher - node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__ - - - - From 2cf7f230624d8fce8cc14bcec0eeb3a204e6fd1b Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Tue, 29 Oct 2024 14:39:43 -0700 Subject: [PATCH 33/51] Make sure users are created with passwords When creating an user on a container, add a randomly generated password for security --- rqd/rqd/rqcore.py | 4 +++- rqd/tests/rqcore_test.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 44d9cd82b..4ecfd90c6 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -35,6 +35,7 @@ import time import traceback import select +import uuid import rqd.compiled_proto.host_pb2 import rqd.compiled_proto.report_pb2 @@ -984,11 +985,12 @@ def runDocker(self): # Command wrapper command = r"""#!/bin/sh -useradd -u %s -g %s %s >& /dev/null || true; +useradd -u %s -g %s -p %s %s >& /dev/null || true; exec su -s %s %s -c "echo \$$; /bin/nice /usr/bin/time -p -o %s %s %s" """ % ( runFrame.uid, gid, + str(uuid.uuid4()), runFrame.user_name, rqd.rqconstants.DOCKER_SHELL_PATH, runFrame.user_name, diff --git a/rqd/tests/rqcore_test.py b/rqd/tests/rqcore_test.py index 388eae896..abeabdb20 100644 --- a/rqd/tests/rqcore_test.py +++ b/rqd/tests/rqcore_test.py @@ -25,6 +25,7 @@ import os.path import unittest import subprocess +import re import mock import pyfakefs.fake_filesystem_unittest @@ -765,7 +766,8 @@ def test_runDocker(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdi ) with open(cmd_file, "r", encoding='utf-8') as f: - cmd = f.read() + # Remove `-p RANDOM_PASSWORD` from output + cmd = re.sub(r"-p\s+(\d|\w)\S+\s*", "", f.read()) self.assertEqual(r"""#!/bin/sh useradd -u %s -g %s %s >& /dev/null || true; exec su -s /bin/sh %s -c "echo \$$; /bin/nice /usr/bin/time -p -o /job/temp/path/rqd-stat-%s-%s " From 28268264edb61d0e09a6c36fe24953cb5fa0cd38 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Wed, 30 Oct 2024 13:59:41 -0700 Subject: [PATCH 34/51] Rqd multiple os (#1563) Just a placeholder to test this branch at SPI --------- Signed-off-by: Diego Tavares Co-authored-by: Ramon Figueiredo Co-authored-by: Jimmy Christensen --- VERSION.in | 2 +- ci/build_sphinx_docs.sh | 2 +- ci/fix_compiled_proto.py | 26 + ci/python_coverage_report.sh | 4 +- ci/run_gui_test.sh | 2 +- ci/run_python_lint.sh | 4 +- ci/run_python_tests.sh | 16 +- connectors/prometheus_metrics/Dockerfile | 2 +- cueadmin/Dockerfile | 2 +- .../spcue/dao/postgres/ProcDaoJdbc.java | 18 +- .../migrations/V31__increase_os_size.sql | 3 + cuegui/Dockerfile | 2 +- cuegui/cuegui/FrameMonitorTree.py | 30 +- cuegui/cuegui/JobMonitorTree.py | 12 +- cuegui/cuegui/MenuActions.py | 2 + cuegui/cuegui/config/cuegui.yaml | 26 +- cuegui/tests/FrameMonitorTree_tests.py | 11 +- cuesubmit/Dockerfile | 2 +- proto/README.md | 10 +- pycue/Dockerfile | 2 +- pyoutline/Dockerfile | 2 +- pyproject.toml | 3 + requirements.txt | 2 +- rqd/Dockerfile | 2 +- rqd/rqd.example.conf | 1 + rqd/rqd/rqconstants.py | 26 +- rqd/rqd/rqcore.py | 61 ++- rqd/rqd/rqmachine.py | 22 +- rqd/rqd/rqnimby.py | 2 +- ...ot_listener.py => cuebot_listener_test.py} | 0 rqd/tests/{cuerqd_tests.py => cuerqd_test.py} | 0 ...constants_tests.py => rqconstants_test.py} | 16 +- rqd/tests/{rqcore_tests.py => 
rqcore_test.py} | 509 ++++++++++++------ .../{rqmachine_tests.py => rqmachine_test.py} | 116 ++-- rqd/tests/rqnimby_test.py | 143 +++++ rqd/tests/rqnimby_tests.py | 143 ----- sandbox/install-client-sources.sh | 2 +- 37 files changed, 755 insertions(+), 473 deletions(-) create mode 100644 ci/fix_compiled_proto.py create mode 100644 cuebot/src/main/resources/conf/ddl/postgres/migrations/V31__increase_os_size.sql create mode 100644 pyproject.toml rename rqd/tests/{test_cuebot_listener.py => cuebot_listener_test.py} (100%) rename rqd/tests/{cuerqd_tests.py => cuerqd_test.py} (100%) rename rqd/tests/{rqconstants_tests.py => rqconstants_test.py} (92%) rename rqd/tests/{rqcore_tests.py => rqcore_test.py} (63%) rename rqd/tests/{rqmachine_tests.py => rqmachine_test.py} (89%) create mode 100644 rqd/tests/rqnimby_test.py delete mode 100644 rqd/tests/rqnimby_tests.py diff --git a/VERSION.in b/VERSION.in index d3827e75a..9459d4ba2 100644 --- a/VERSION.in +++ b/VERSION.in @@ -1 +1 @@ -1.0 +1.1 diff --git a/ci/build_sphinx_docs.sh b/ci/build_sphinx_docs.sh index 9f207d51e..b39c68fc2 100755 --- a/ci/build_sphinx_docs.sh +++ b/ci/build_sphinx_docs.sh @@ -10,7 +10,7 @@ python -m grpc_tools.protoc -I=proto/ --python_out=pycue/opencue/compiled_proto # Fix imports to work in both Python 2 and 3. See # for more info. -2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +python ci/fix_compiled_proto.py pycue/opencue/compiled_proto # Build the docs and treat warnings as errors ~/.local/bin/sphinx-build -W -b html -d docs/_build/doctrees docs docs/_build/html diff --git a/ci/fix_compiled_proto.py b/ci/fix_compiled_proto.py new file mode 100644 index 000000000..0a5534803 --- /dev/null +++ b/ci/fix_compiled_proto.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python + +"""Script that makes the imports in the generated compiled_proto python files relative. + +""" +import os +import re +import sys +import glob + +PYTHON_SCRIPT_PATH = sys.argv[1] + +if os.path.isdir(PYTHON_SCRIPT_PATH): + pattern = re.compile(r"^import \w+ as \w+_pb2") + for filepath in glob.glob(os.path.join(PYTHON_SCRIPT_PATH, "*_pb2*.py")): + filedata = [] + with open(filepath) as f: + for line in f.readlines(): + match = pattern.match(line) + if match is not None: + line = f"from . {line}" + filedata.append(line.strip("\n")) + with open(filepath, "w") as f: + f.write("\n".join(filedata)) +else: + print("Argument is not a directory") diff --git a/ci/python_coverage_report.sh b/ci/python_coverage_report.sh index e0c65328f..4477c8d44 100755 --- a/ci/python_coverage_report.sh +++ b/ci/python_coverage_report.sh @@ -9,8 +9,8 @@ python -m pip install coverage pytest-xvfb # Protos need to have their Python code generated in order for tests to pass. python -m grpc_tools.protoc -I=proto/ --python_out=pycue/opencue/compiled_proto --grpc_python_out=pycue/opencue/compiled_proto proto/*.proto python -m grpc_tools.protoc -I=proto/ --python_out=rqd/rqd/compiled_proto --grpc_python_out=rqd/rqd/compiled_proto proto/*.proto -2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py -2to3 -wn -f import rqd/rqd/compiled_proto/*_pb2*.py +python ci/fix_compiled_proto.py pycue/opencue/compiled_proto +python ci/fix_compiled_proto.py rqd/rqd/compiled_proto # Run coverage for each component individually, but append it all into the same report. 
python -m coverage run --source=pycue/opencue/,pycue/FileSequence/ --omit=pycue/opencue/compiled_proto/* pycue/tests/test_suite.py diff --git a/ci/run_gui_test.sh b/ci/run_gui_test.sh index 3c7d92a6d..8a32a462e 100755 --- a/ci/run_gui_test.sh +++ b/ci/run_gui_test.sh @@ -21,7 +21,7 @@ fi echo "Using Python binary ${py}" test_log="/tmp/cuegui_result.log" -PYTHONPATH=pycue xvfb-run -d "${py}" cuegui/setup.py test | tee ${test_log} +PYTHONPATH=pycue xvfb-run -d "${py}" -m unittest discover -s cuegui/tests -t cuegui -p "*.py"| tee ${test_log} grep -Pz 'Ran \d+ tests in [0-9\.]+s\n\nOK' ${test_log} if [ $? -eq 0 ]; then diff --git a/ci/run_python_lint.sh b/ci/run_python_lint.sh index bd86c9188..ba98d2b6c 100755 --- a/ci/run_python_lint.sh +++ b/ci/run_python_lint.sh @@ -13,8 +13,8 @@ python -m grpc_tools.protoc -I=proto/ --python_out=rqd/rqd/compiled_proto --grpc # Fix imports to work in both Python 2 and 3. See # for more info. -2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py -2to3 -wn -f import rqd/rqd/compiled_proto/*_pb2*.py +python ci/fix_compiled_proto.py pycue/opencue/compiled_proto +python ci/fix_compiled_proto.py rqd/rqd/compiled_proto echo "Running lint for pycue/..." cd pycue diff --git a/ci/run_python_tests.sh b/ci/run_python_tests.sh index 5f1bfe294..1259adf4c 100755 --- a/ci/run_python_tests.sh +++ b/ci/run_python_tests.sh @@ -19,14 +19,14 @@ python -m grpc_tools.protoc -I=proto/ --python_out=rqd/rqd/compiled_proto --grpc # Fix imports to work in both Python 2 and 3. See # for more info. -2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py -2to3 -wn -f import rqd/rqd/compiled_proto/*_pb2*.py - -python pycue/setup.py test -PYTHONPATH=pycue python pyoutline/setup.py test -PYTHONPATH=pycue python cueadmin/setup.py test -PYTHONPATH=pycue:pyoutline python cuesubmit/setup.py test -python rqd/setup.py test +python ci/fix_compiled_proto.py pycue/opencue/compiled_proto +python ci/fix_compiled_proto.py rqd/rqd/compiled_proto + +python -m unittest discover -s pycue/tests -t pycue -p "*.py" +PYTHONPATH=pycue python -m unittest discover -s pyoutline/tests -t pyoutline -p "*.py" +PYTHONPATH=pycue python -m unittest discover -s cueadmin/tests -t cueadmin -p "*.py" +PYTHONPATH=pycue:pyoutline python -m unittest discover -s cuesubmit/tests -t cuesubmit -p "*.py" +python -m pytest rqd/tests # Xvfb no longer supports Python 2. if [[ "$python_version" =~ "Python 3" && ${args[0]} != "--no-gui" ]]; then diff --git a/connectors/prometheus_metrics/Dockerfile b/connectors/prometheus_metrics/Dockerfile index f710dce0c..357b5a0bf 100644 --- a/connectors/prometheus_metrics/Dockerfile +++ b/connectors/prometheus_metrics/Dockerfile @@ -48,7 +48,7 @@ RUN python -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. -RUN 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +RUN python3 ci/fix_compiled_proto.py pycue/opencue/compiled_proto RUN cd pycue && python setup.py install diff --git a/cueadmin/Dockerfile b/cueadmin/Dockerfile index 20ff77c1e..dd1359edc 100644 --- a/cueadmin/Dockerfile +++ b/cueadmin/Dockerfile @@ -24,7 +24,7 @@ RUN python3 -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. 
-RUN 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +RUN python3 ci/fix_compiled_proto.py pycue/opencue/compiled_proto COPY cueadmin/README.md ./cueadmin/ COPY cueadmin/setup.py ./cueadmin/ diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java index ecf39caf7..fff43d5ce 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java @@ -346,9 +346,10 @@ public VirtualProc mapRow(ResultSet rs, int rowNum) throws SQLException { "proc.int_virt_max_used,"+ "proc.int_virt_used,"+ "host.str_name AS host_name, " + - "host_stat.str_os " + + "job.str_os " + "FROM " + - "proc," + + "proc, " + + "job, " + "host, " + "host_stat, " + "alloc " + @@ -357,7 +358,9 @@ public VirtualProc mapRow(ResultSet rs, int rowNum) throws SQLException { "AND " + "host.pk_host = host_stat.pk_host " + "AND " + - "host.pk_alloc = alloc.pk_alloc "; + "host.pk_alloc = alloc.pk_alloc " + + "AND " + + "job.pk_job = proc.pk_job "; public VirtualProc getVirtualProc(String id) { return getJdbcTemplate().queryForObject( @@ -376,7 +379,7 @@ public VirtualProc findVirtualProc(FrameInterface frame) { "proc.*, " + "host.str_name AS host_name, " + "host.pk_alloc, " + - "host_stat.str_os, " + + "job.str_os, " + "alloc.pk_facility " + "FROM " + "proc, " + @@ -517,20 +520,23 @@ public String getCurrentFrameId(ProcInterface p) { "SELECT " + "proc.*, " + "host.str_name AS host_name, " + - "host_stat.str_os, " + + "job.str_os, " + "host.pk_alloc, " + "alloc.pk_facility " + "FROM " + "proc, " + "host, " + "host_stat,"+ - "alloc " + + "alloc, " + + "job " + "WHERE " + "proc.pk_host = host.pk_host " + "AND " + "host.pk_host = host_stat.pk_host " + "AND " + "host.pk_alloc = alloc.pk_alloc " + + "AND " + + "job.pk_job = proc.pk_job " + "AND " + "current_timestamp - proc.ts_ping > " + ORPHANED_PROC_INTERVAL; diff --git a/cuebot/src/main/resources/conf/ddl/postgres/migrations/V31__increase_os_size.sql b/cuebot/src/main/resources/conf/ddl/postgres/migrations/V31__increase_os_size.sql new file mode 100644 index 000000000..9ad89e437 --- /dev/null +++ b/cuebot/src/main/resources/conf/ddl/postgres/migrations/V31__increase_os_size.sql @@ -0,0 +1,3 @@ +-- Increase size of os column on host_stat + +ALTER TABLE host_stat ALTER COLUMN str_os TYPE VARCHAR(32); diff --git a/cuegui/Dockerfile b/cuegui/Dockerfile index 3e6630804..6e53c7f04 100644 --- a/cuegui/Dockerfile +++ b/cuegui/Dockerfile @@ -54,7 +54,7 @@ RUN python3.6 -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. 
-RUN 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +RUN python3 ci/fix_compiled_proto.py pycue/opencue/compiled_proto COPY cuegui/README.md ./cuegui/ COPY cuegui/setup.py ./cuegui/ diff --git a/cuegui/cuegui/FrameMonitorTree.py b/cuegui/cuegui/FrameMonitorTree.py index 9e29aa0b2..46da7ee30 100644 --- a/cuegui/cuegui/FrameMonitorTree.py +++ b/cuegui/cuegui/FrameMonitorTree.py @@ -915,17 +915,25 @@ def __init__(self, widget, filterSelectedLayersCallback, readonly=False): if cuegui.Constants.OUTPUT_VIEWERS: job = widget.getJob() - outputPaths = [] - for frame in widget.selectedObjects(): - layer = job.getLayer(frame.layer()) - outputPaths.extend(cuegui.Utils.getOutputFromFrame(layer, frame)) - if outputPaths: - for viewer in cuegui.Constants.OUTPUT_VIEWERS: - self.addAction(viewer['action_text'], - functools.partial(cuegui.Utils.viewFramesOutput, - job, - widget.selectedObjects(), - viewer['action_text'])) + if job is not None: + outputPaths = [] + selectedFrames = widget.selectedObjects() + + layers_dict = {layer.name(): layer for layer in job.getLayers()} + + for frame in selectedFrames: + layer_name = frame.layer() + layer = layers_dict.get(layer_name) + if layer: + outputPaths.extend(cuegui.Utils.getOutputFromFrame(layer, frame)) + + if outputPaths: + for viewer in cuegui.Constants.OUTPUT_VIEWERS: + self.addAction(viewer['action_text'], + functools.partial(cuegui.Utils.viewFramesOutput, + job, + selectedFrames, + viewer['action_text'])) if self.app.applicationName() == "CueCommander": self.__menuActions.frames().addAction(self, "viewHost") diff --git a/cuegui/cuegui/JobMonitorTree.py b/cuegui/cuegui/JobMonitorTree.py index 901ae62e9..7a2946794 100644 --- a/cuegui/cuegui/JobMonitorTree.py +++ b/cuegui/cuegui/JobMonitorTree.py @@ -427,15 +427,11 @@ def contextMenuEvent(self, e): self.__menuActions.jobs().addAction(menu, "useLocalCores") if cuegui.Constants.OUTPUT_VIEWERS: - job = __selectedObjects[0] for viewer in cuegui.Constants.OUTPUT_VIEWERS: - viewer_menu = QtWidgets.QMenu(viewer['action_text'], self) - for layer in job.getLayers(): - viewer_menu.addAction(layer.name(), - functools.partial(cuegui.Utils.viewOutput, - [layer], - viewer['action_text'])) - menu.addMenu(viewer_menu) + menu.addAction(viewer['action_text'], + functools.partial(cuegui.Utils.viewOutput, + __selectedObjects, + viewer['action_text'])) depend_menu = QtWidgets.QMenu("&Dependencies",self) self.__menuActions.jobs().addAction(depend_menu, "viewDepends") diff --git a/cuegui/cuegui/MenuActions.py b/cuegui/cuegui/MenuActions.py index 287b2eeeb..07f83063c 100644 --- a/cuegui/cuegui/MenuActions.py +++ b/cuegui/cuegui/MenuActions.py @@ -581,6 +581,8 @@ def dropInternalDependencies(self, rpcObjects=None): def viewComments(self, rpcObjects=None): jobs = self._getOnlyJobObjects(rpcObjects) if jobs: + if not isinstance(jobs, list): + jobs = [jobs] cuegui.Comments.CommentListDialog(jobs, self._caller).show() dependWizard_info = ["Dependency &Wizard...", None, "configure"] diff --git a/cuegui/cuegui/config/cuegui.yaml b/cuegui/cuegui/config/cuegui.yaml index 529cb9e00..ee11604be 100644 --- a/cuegui/cuegui/config/cuegui.yaml +++ b/cuegui/cuegui/config/cuegui.yaml @@ -1,6 +1,6 @@ # Default CueGUI config file -# Configure how a version number should be acquired. +# Configure how a version number should be acquired. 
# - False, use the version number in VERSION.in # - True, run the commands defined at cuegui.custom.cmd.version.beta (for beta) or cuegui.custom.cmd.version.stable (for stable) to acquire the version number cuegui.use.custom.version: False @@ -41,6 +41,7 @@ render_logs.root: darwin: '/Users/shots' linux: '/shots' rhel7: '/shots' + rocky9: '/shots' # Substrings which, when found in render logs, will cause that line to be highlighted. render_logs.highlight.error: [ 'error', 'aborted', 'fatal', 'failed', 'killed', 'command not found', @@ -132,23 +133,20 @@ startup_notice.msg: '' memory_warning_level: 5242880 # Output Viewers config. -# # ------------------------------------------------------------------------------------------------------ # Frame, Layer and Job objects have right click menu option for opening an output viewer # (eg. OpenRV) - -#output_viewers: +# output_viewers: # # Text to be displayed at the menu action button # - action_text: "View in OpenRV" -# # extract_args_regex: Regex to extract arguments from the output path produced by a job/layer/frame -# # cmd_pattern: Command pattern to be matched with the regex defined at extract_args_regex -# # if extract_args_regex is not provided, cmd_pattern is called directly with paths as arguments -# extract_args_regex: '/shots/(?P\w+)/(?Pshot\w+)/.*' -# cmd_pattern: "env SHOW={show} SHOT={shot} COLOR_IO=/{show}/home/colorspaces.xml OCIO=/{show}/home/config.ocio openrv {paths}" - -# # if provided, paths containing any of the two values are considered the same output and only one -# # of them will be passed to the viewer -# stereo_modifiers: "_rt_,_lf_" -# # ------------------------------------------------------------------------------------------------------ +# # extract_args_regex: Regex to extract arguments from the output path produced by a job/layer/frame +# # cmd_pattern: Command pattern to be matched with the regex defined at extract_args_regex +# # if extract_args_regex is not provided, cmd_pattern is called directly with paths as arguments +# extract_args_regex: '/shots/(?P\w+)/(?Pshot\w+)/.*' +# cmd_pattern: "env SHOW={show} SHOT={shot} COLOR_IO=/{show}/home/colorspaces.xml OCIO=/{show}/home/config.ocio openrv {paths}" +# +# # if provided, paths containing any of the two values are considered the same output and only one +# # of them will be passed to the viewer +# stereo_modifiers: "_rt_,_lf_" # Pattern to call viewer cmd directly without extracting environment variables. 
Used for previewing frames # output_viewer_direct_cmd_call: "openrv {paths}" diff --git a/cuegui/tests/FrameMonitorTree_tests.py b/cuegui/tests/FrameMonitorTree_tests.py index 75521572f..67d68fc35 100644 --- a/cuegui/tests/FrameMonitorTree_tests.py +++ b/cuegui/tests/FrameMonitorTree_tests.py @@ -128,9 +128,14 @@ def test_getCores(self): def test_rightClickItem(self, execMock): mouse_position = qtpy.QtCore.QPoint() - self.frameMonitorTree.contextMenuEvent( - qtpy.QtGui.QContextMenuEvent( - qtpy.QtGui.QContextMenuEvent.Reason.Mouse, mouse_position, mouse_position)) + # Ensure the job attribute is set + self.frameMonitorTree.setJob(self.job) + + # Mock the getLayers method to return an empty list or a list of mock layers + with mock.patch.object(self.job, 'getLayers', return_value=[]): + self.frameMonitorTree.contextMenuEvent( + qtpy.QtGui.QContextMenuEvent( + qtpy.QtGui.QContextMenuEvent.Reason.Mouse, mouse_position, mouse_position)) execMock.assert_called_with(mouse_position) diff --git a/cuesubmit/Dockerfile b/cuesubmit/Dockerfile index eb2a4902a..47c615f7c 100644 --- a/cuesubmit/Dockerfile +++ b/cuesubmit/Dockerfile @@ -41,7 +41,7 @@ RUN python3.6 -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. -RUN 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +RUN python3 ci/fix_compiled_proto.py pycue/opencue/compiled_proto COPY pyoutline/README.md ./pyoutline/ COPY pyoutline/setup.py ./pyoutline/ diff --git a/proto/README.md b/proto/README.md index 0b039b462..904a112e1 100644 --- a/proto/README.md +++ b/proto/README.md @@ -14,15 +14,14 @@ To generate: ```sh python -m grpc_tools.protoc -I=. --python_out=../rqd/rqd/compiled_proto --grpc_python_out=../rqd/rqd/compiled_proto ./*.proto -2to3 -wn -f import ../rqd/rqd/compiled_proto/*_pb2*.py +python ../ci/fix_compiled_proto.py ../rqd/rqd/compiled_proto ``` For Windows (Powershell): ```powershell python -m grpc_tools.protoc --proto_path=. --python_out=../rqd/rqd/compiled_proto --grpc_python_out=../rqd/rqd/compiled_proto (ls *.proto).Name -cd ..\rqd\rqd\compiled_proto\ -2to3 -wn -f import (ls *_pb2*.py).Name +python ../ci/fix_compiled_proto.py ../rqd/rqd/compiled_proto ``` @@ -32,15 +31,14 @@ To generate: ```sh python -m grpc_tools.protoc -I=. --python_out=../pycue/opencue/compiled_proto --grpc_python_out=../pycue/opencue/compiled_proto ./*.proto -2to3 -wn -f import ../pycue/opencue/compiled_proto/*_pb2*.py +python ../ci/fix_compiled_proto.py ../pycue/opencue/compiled_proto ``` For Windows (Powershell): ```powershell python -m grpc_tools.protoc --proto_path=. --python_out=../pycue/opencue/compiled_proto --grpc_python_out=../pycue/opencue/compiled_proto (ls *.proto).Name -cd ..\pycue\opencue\compiled_proto\ -2to3 -wn -f import (ls *_pb2*.py).Name +python ../ci/fix_compiled_proto.py ../pycue/opencue/compiled_proto ``` diff --git a/pycue/Dockerfile b/pycue/Dockerfile index 9698e94bc..c61d3cf16 100644 --- a/pycue/Dockerfile +++ b/pycue/Dockerfile @@ -25,7 +25,7 @@ RUN python3 -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. 
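Circling back to the output-viewer settings documented in cuegui.yaml above, a minimal sketch of how extract_args_regex and cmd_pattern combine is shown below. The named groups (show, shot) are reconstructed here to match the {show}/{shot} placeholders in cmd_pattern, and the frame output path is made up.

```python
import re

extract_args_regex = r"/shots/(?P<show>\w+)/(?P<shot>shot\w+)/.*"
cmd_pattern = ("env SHOW={show} SHOT={shot} COLOR_IO=/{show}/home/colorspaces.xml "
               "OCIO=/{show}/home/config.ocio openrv {paths}")

path = "/shots/myshow/shot0010/comp/output.exr"  # illustrative output path
match = re.match(extract_args_regex, path)
if match:
    # CueGUI builds the viewer command by formatting cmd_pattern with the
    # captured groups plus the selected output paths.
    print(cmd_pattern.format(paths=path, **match.groupdict()))
```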
-RUN 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +RUN python ci/fix_compiled_proto.py pycue/opencue/compiled_proto COPY VERSION.in VERSIO[N] ./ RUN test -e VERSION || echo "$(cat VERSION.in)" | tee VERSION diff --git a/pyoutline/Dockerfile b/pyoutline/Dockerfile index bc7155daf..6937a5584 100644 --- a/pyoutline/Dockerfile +++ b/pyoutline/Dockerfile @@ -24,7 +24,7 @@ RUN python3 -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. -RUN 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +RUN python ci/fix_compiled_proto.py pycue/opencue/compiled_proto COPY pyoutline/README.md ./pyoutline/ COPY pyoutline/setup.py ./pyoutline/ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..b055a1807 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[tool.pyright] +venvPath = "." +venv = "venv" diff --git a/requirements.txt b/requirements.txt index dc0f8d570..946853794 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -2to3==1.0 enum34==1.1.6 future==1.0.0 grpcio==1.48.2;python_version<"3.7" @@ -16,6 +15,7 @@ pylint==2.15.10;python_version>="3.7" pynput==1.7.6 PyYAML==5.1 six==1.16.0 +pytest==8.3.3 # Optional requirements # Sentry support for rqd diff --git a/rqd/Dockerfile b/rqd/Dockerfile index c3a0a0dc1..6847ad10a 100644 --- a/rqd/Dockerfile +++ b/rqd/Dockerfile @@ -35,7 +35,7 @@ RUN python3.9 -m grpc_tools.protoc \ # Fix imports to work in both Python 2 and 3. See # for more info. -RUN 2to3 -wn -f import rqd/rqd/compiled_proto/*_pb2*.py +RUN python ci/fix_compiled_proto.py rqd/rqd/compiled_proto COPY VERSION.in VERSIO[N] ./ RUN test -e VERSION || echo "$(cat VERSION.in)" | tee VERSION diff --git a/rqd/rqd.example.conf b/rqd/rqd.example.conf index 4369236dc..870419b24 100644 --- a/rqd/rqd.example.conf +++ b/rqd/rqd.example.conf @@ -31,6 +31,7 @@ PIXAR_LICENSE_FILE [docker.config] # Setting this to True requires all the additional "docker.[]" sections to be filled RUN_ON_DOCKER=False +DOCKER_SHELL_PATH=/usr/bin/sh # This section is only required if RUN_ON_DOCKER=True # List of volume mounts following docker run's format, but replacing = with : diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py index beba053fb..aa17e8292 100644 --- a/rqd/rqd/rqconstants.py +++ b/rqd/rqd/rqconstants.py @@ -158,6 +158,7 @@ RUN_ON_DOCKER = False DOCKER_IMAGES = {} DOCKER_MOUNTS = [] +DOCKER_SHELL_PATH = "/bin/sh" try: if os.path.isfile(CONFIG_FILE): @@ -251,6 +252,12 @@ RQD_UID = 0 RQD_GID = 0 + # Path to the shell to be used in the frame environment + if config.has_option(__docker_config, "DOCKER_SHELL_PATH"): + DOCKER_SHELL_PATH = config.get( + __docker_config, + "DOCKER_SHELL_PATH") + # Every key:value on the config file under docker.images # is parsed as key=SP_OS and value=image_tag. # SP_OS is set to a list of all available keys @@ -274,26 +281,27 @@ SP_OS = ",".join(keys) if not DOCKER_IMAGES: raise RuntimeError("Misconfigured rqd. 
RUN_ON_DOCKER=True requires at " - "least one image on DOCKER_IMAGES ([docker.images] section of rqd.conf)") - - def parse_mount(mount_str): + "least one image on DOCKER_IMAGES ([docker.images] " + "section of rqd.conf)") + def parse_mount(mount_string): """ Parse mount definitions similar to a docker run command into a docker mount obj Format: type=bind,source=/tmp,target=/tmp,bind-propagation=slave """ - mount_dict = {} + parsed_mounts = {} # bind-propagation defaults to None as only type=bind accepts it - mount_dict["bind-propagation"] = None - for item in mount_str.split(","): - key, value = item.split(":") - mount_dict[key.strip()] = value.strip() - return mount_dict + parsed_mounts["bind-propagation"] = None + for item in mount_string.split(","): + name, mount_path = item.split(":") + parsed_mounts[name.strip()] = mount_path.strip() + return parsed_mounts # Parse values under the category docker.mounts into Mount objects mounts = config.options(__docker_mounts) for mount_name in mounts: + mount_str = "" try: mount_str = config.get(__docker_mounts, mount_name) mount_dict = parse_mount(mount_str) diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 4de489e1d..4ecfd90c6 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -35,6 +35,7 @@ import time import traceback import select +import uuid import rqd.compiled_proto.host_pb2 import rqd.compiled_proto.report_pb2 @@ -613,7 +614,8 @@ def sanitizeFrames(self): Iterate over the cache and update the status of frames that might have completed but never reported back to cuebot. """ - for frameId, runningFrame in self.__cache.items(): + for frameId in list(self.__cache.keys): + runningFrame = self.__cache[frameId] # If the frame was marked as completed (exitStatus) and a report has not been sent # try to file the report again if runningFrame.exitStatus is not None and not runningFrame.completeReportSent: @@ -963,18 +965,45 @@ def runDocker(self): frameInfo.frameId, time.time()) self._tempLocations.append(tempStatFile) - tempCommand = [] - if self.rqCore.machine.isDesktop(): - tempCommand += ["/bin/nice"] - tempCommand += ["/usr/bin/time", "-p", "-o", tempStatFile] - if 'CPU_LIST' in runFrame.attributes: - tempCommand += ['taskset', '-c', runFrame.attributes['CPU_LIST']] - - tempCommand += [runFrame.command] + # Prevent frame from attempting to run as ROOT + if runFrame.gid <= 0: + gid = rqd.rqconstants.LAUNCH_FRAME_USER_GID + else: + gid = runFrame.gid + + # Never give frame ROOT permissions + if runFrame.uid == 0 or gid == 0: + self.rqlog.write("Frame cannot run as ROOT", + prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + return + + # Thread affinity + tasksetCmd = "" + if runFrame.attributes['CPU_LIST']: + tasksetCmd = "taskset -c %s" % runFrame.attributes['CPU_LIST'] + + # Command wrapper + command = r"""#!/bin/sh +useradd -u %s -g %s -p %s %s >& /dev/null || true; +exec su -s %s %s -c "echo \$$; /bin/nice /usr/bin/time -p -o %s %s %s" +""" % ( + runFrame.uid, + gid, + str(uuid.uuid4()), + runFrame.user_name, + rqd.rqconstants.DOCKER_SHELL_PATH, + runFrame.user_name, + tempStatFile, + tasksetCmd, + runFrame.command + ) - # Print PID before executing - command = ["sh", "-c", "echo $$; exec " + " ".join(tempCommand)] + # Log entrypoint on frame log to simplify replaying frames + self.rqlog.write("DOCKER_ENTRYPOINT = %s" % command, + prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + # Write command to a file on the job tmpdir to simplify replaying a frame + command = self._createCommandFile(command) client = 
self.rqCore.docker_client try: @@ -988,8 +1017,7 @@ def runDocker(self): pid_mode="host", stderr=True, hostname=self.frameEnv["jobhost"], - entrypoint=command, - user=runFrame.uid) + entrypoint=command) log_stream = container.logs(stream=True) # CMD prints the process PID before executing the actual command @@ -1006,9 +1034,12 @@ def runDocker(self): output = container.wait() returncode = output["StatusCode"] # pylint: disable=broad-except - except Exception: + except Exception as e: returncode = 1 - logging.exception("Failed to launch frame container") + msg = "Failed to launch frame container" + logging.exception(msg) + self.rqlog.write("%s - %s" % (msg, e), + prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) # Find exitStatus and exitSignal if returncode < 0: diff --git a/rqd/rqd/rqmachine.py b/rqd/rqd/rqmachine.py index 1f67798e3..0687858c7 100644 --- a/rqd/rqd/rqmachine.py +++ b/rqd/rqd/rqmachine.py @@ -215,10 +215,23 @@ def __updateGpuAndLlu(self, frame): frame.lluTime = int(stat) def _getStatFields(self, pidFilePath): + """ Read stats file and return list of values + Stats file can star with these formats: + - 105 name ... + - 105 (name) ... + - 105 (name with space) ... + - 105 (name with) (space and parenthesis) ... + """ with open(pidFilePath, "r", encoding='utf-8') as statFile: - stats = statFile.read().split() - stats[1] = stats[1].strip('()') - return stats + txt = statFile.read() + try: + open_par_index = txt.index('(') + close_par_index = txt.rindex(')') + name = txt[open_par_index:close_par_index].strip("()") + reminder = (txt[0:open_par_index] + txt[close_par_index + 1:]).split() + return reminder[0:1] + [name] + reminder[1:] + except ValueError: + return txt.split() def rssUpdate(self, frames): """Updates the rss and maxrss for all running frames""" @@ -269,6 +282,9 @@ def rssUpdate(self, frames): # Fetch swap usage "swap": self._getProcSwap(pid), } + + # TODO: Improve this logic to avoid collecting data from all running procs. 
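As a quick illustration of the stat-file handling above, this snippet applies the same parenthesis-aware split to a sample /proc/<pid>/stat line whose process name contains spaces and parentheses; the field values are invented.

```python
# Hypothetical stat line: pid 105, name "name with) (space and parenthesis".
stat_line = "105 (name with) (space and parenthesis) S 1 105 105 0 -1"

open_par = stat_line.index("(")
close_par = stat_line.rindex(")")
name = stat_line[open_par:close_par].strip("()")
remainder = (stat_line[:open_par] + stat_line[close_par + 1:]).split()
fields = remainder[0:1] + [name] + remainder[1:]

print(fields[:3])  # ['105', 'name with) (space and parenthesis', 'S']
```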
+ # instead, focus on the monitored procs hierarchy # cmdline: p = psutil.Process(int(pid)) pids[pid]["cmd_line"] = p.cmdline() diff --git a/rqd/rqd/rqnimby.py b/rqd/rqd/rqnimby.py index 15b8dd89f..2e5d44674 100644 --- a/rqd/rqd/rqnimby.py +++ b/rqd/rqd/rqnimby.py @@ -58,7 +58,7 @@ def getNimby(rqCore): # Ideally ImportError could be used here, but pynput # can throw other kinds of exception while trying to # access runpy components - log.exception("Failed to import pynput, falling back to Select module") + log.debug("Failed to import pynput, falling back to Select module") # Still enabling the application start as hosts can be manually locked # using the API/GUI return NimbyNop(rqCore) diff --git a/rqd/tests/test_cuebot_listener.py b/rqd/tests/cuebot_listener_test.py similarity index 100% rename from rqd/tests/test_cuebot_listener.py rename to rqd/tests/cuebot_listener_test.py diff --git a/rqd/tests/cuerqd_tests.py b/rqd/tests/cuerqd_test.py similarity index 100% rename from rqd/tests/cuerqd_tests.py rename to rqd/tests/cuerqd_test.py diff --git a/rqd/tests/rqconstants_tests.py b/rqd/tests/rqconstants_test.py similarity index 92% rename from rqd/tests/rqconstants_tests.py rename to rqd/tests/rqconstants_test.py index 45e52c0b1..0503994a2 100644 --- a/rqd/tests/rqconstants_tests.py +++ b/rqd/tests/rqconstants_test.py @@ -39,7 +39,7 @@ import rqd.rqutil import rqd.compiled_proto.report_pb2 -from .rqmachine_tests import ( +from .rqmachine_test import ( CPUINFO, LOADAVG_LOW_USAGE, MEMINFO_MODERATE_USAGE, @@ -121,6 +121,7 @@ def makeRqMachine(self): """ [Override] DEFAULT_FACILITY = test_facility +RQD_TAGS = test_tag1 test_tag2 test_tag3 """, ) def test_facility(self): @@ -128,19 +129,6 @@ def test_facility(self): machine = self.makeRqMachine() self.assertEqual(machine.renderHost.facility, "test_facility") - - @MockConfig( - tempdir, - """ -[Override] -RQD_TAGS = test_tag1 test_tag2 test_tag3 -""", - ) - def test_tags(self): - self.assertEqual(rqd.rqconstants.RQD_TAGS, "test_tag1 test_tag2 test_tag3") - - machine = self.makeRqMachine() - self.assertEqual(machine.renderHost.facility, "cloud") self.assertTrue( set(["test_tag1", "test_tag2", "test_tag3"]).issubset( machine.renderHost.tags diff --git a/rqd/tests/rqcore_tests.py b/rqd/tests/rqcore_test.py similarity index 63% rename from rqd/tests/rqcore_tests.py rename to rqd/tests/rqcore_test.py index 09f06d23f..abeabdb20 100644 --- a/rqd/tests/rqcore_tests.py +++ b/rqd/tests/rqcore_test.py @@ -24,6 +24,8 @@ from builtins import str import os.path import unittest +import subprocess +import re import mock import pyfakefs.fake_filesystem_unittest @@ -40,16 +42,16 @@ class RqCoreTests(unittest.TestCase): - @mock.patch('rqd.rqnimby.NimbySelect', autospec=True) - @mock.patch('rqd.rqnetwork.Network', autospec=True) - @mock.patch('rqd.rqmachine.Machine', autospec=True) + @mock.patch("rqd.rqnimby.NimbyPynput", autospec=True) + @mock.patch("rqd.rqnetwork.Network", autospec=True) + @mock.patch("rqd.rqmachine.Machine", autospec=True) def setUp(self, machineMock, networkMock, nimbyMock): self.machineMock = machineMock self.networkMock = networkMock self.nimbyMock = nimbyMock self.rqcore = rqd.rqcore.RqCore() - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOn') + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOn") def test_startServer(self, nimbyOnMock): rqd.rqconstants.OVERRIDE_NIMBY = False self.machineMock.return_value.isDesktop.return_value = False @@ -59,7 +61,7 @@ def test_startServer(self, nimbyOnMock): 
self.networkMock.return_value.start_grpc.assert_called() nimbyOnMock.assert_not_called() - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOn', autospec=True) + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOn", autospec=True) def test_startServerWithNimby(self, nimbyOnMock): rqd.rqconstants.OVERRIDE_NIMBY = True self.machineMock.return_value.isDesktop.return_value = False @@ -69,7 +71,7 @@ def test_startServerWithNimby(self, nimbyOnMock): self.networkMock.return_value.start_grpc.assert_called() nimbyOnMock.assert_called_with(self.rqcore) - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOn', autospec=True) + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOn", autospec=True) def test_startDesktopNimbyOn(self, nimbyOnMock): rqd.rqconstants.OVERRIDE_NIMBY = True self.machineMock.return_value.isDesktop.return_value = True @@ -79,7 +81,7 @@ def test_startDesktopNimbyOn(self, nimbyOnMock): self.networkMock.return_value.start_grpc.assert_called() nimbyOnMock.assert_called_with(self.rqcore) - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOn') + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOn") def test_startDesktopNimbyOff(self, nimbyOnMock): rqd.rqconstants.OVERRIDE_NIMBY = False self.machineMock.return_value.isDesktop.return_value = True @@ -89,7 +91,7 @@ def test_startDesktopNimbyOff(self, nimbyOnMock): self.networkMock.return_value.start_grpc.assert_called() nimbyOnMock.assert_not_called() - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOn') + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOn") def test_startDesktopNimbyUndefined(self, nimbyOnMock): rqd.rqconstants.OVERRIDE_NIMBY = None self.machineMock.return_value.isDesktop.return_value = True @@ -99,9 +101,9 @@ def test_startDesktopNimbyUndefined(self, nimbyOnMock): self.networkMock.return_value.start_grpc.assert_called() nimbyOnMock.assert_not_called() - @mock.patch('rqd.rqnetwork.Network', autospec=True) - @mock.patch('rqd.rqmachine.Machine', autospec=True) - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOn') + @mock.patch("rqd.rqnetwork.Network", autospec=True) + @mock.patch("rqd.rqmachine.Machine", autospec=True) + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOn") def test_startDesktopNimbyOffWithFlag(self, nimbyOnMock, machineMock, networkMock): rqd.rqconstants.OVERRIDE_NIMBY = True machineMock.return_value.isDesktop.return_value = True @@ -112,7 +114,7 @@ def test_startDesktopNimbyOffWithFlag(self, nimbyOnMock, machineMock, networkMoc networkMock.return_value.start_grpc.assert_called() nimbyOnMock.assert_not_called() - @mock.patch('threading.Timer') + @mock.patch("threading.Timer") def test_grpcConnected(self, timerMock): update_rss_thread = mock.MagicMock() interval_thread = mock.MagicMock() @@ -124,15 +126,15 @@ def test_grpcConnected(self, timerMock): update_rss_thread.start.assert_called() interval_thread.start.assert_called() - @mock.patch.object(rqd.rqcore.RqCore, 'sendStatusReport', autospec=True) - @mock.patch('threading.Timer') + @mock.patch.object(rqd.rqcore.RqCore, "sendStatusReport", autospec=True) + @mock.patch("threading.Timer") def test_onInterval(self, timerMock, sendStatusReportMock): self.rqcore.onInterval() timerMock.return_value.start.assert_called() sendStatusReportMock.assert_called_with(self.rqcore) - @mock.patch('threading.Timer', autospec=True) + @mock.patch("threading.Timer", autospec=True) def test_onIntervalWithSleepTime(self, timerMock): sleep_time = 72 @@ -141,8 +143,8 @@ def test_onIntervalWithSleepTime(self, timerMock): timerMock.assert_called_with(sleep_time, mock.ANY) timerMock.return_value.start.assert_called() - 
@mock.patch.object(rqd.rqcore.RqCore, 'shutdownRqdNow') - @mock.patch('threading.Timer', new=mock.MagicMock()) + @mock.patch.object(rqd.rqcore.RqCore, "shutdownRqdNow") + @mock.patch("threading.Timer", new=mock.MagicMock()) def test_onIntervalShutdown(self, shutdownRqdNowMock): self.rqcore.shutdownRqdIdle() self.machineMock.return_value.isUserLoggedIn.return_value = False @@ -153,9 +155,11 @@ def test_onIntervalShutdown(self, shutdownRqdNowMock): shutdownRqdNowMock.assert_called_with() - @mock.patch('threading.Timer') + @mock.patch("threading.Timer") def test_updateRss(self, timerMock): - self.rqcore.storeFrame('frame-id', mock.MagicMock(spec=rqd.rqnetwork.RunningFrame)) + self.rqcore.storeFrame( + "frame-id", mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) + ) self.rqcore.updateRss() @@ -163,21 +167,25 @@ def test_updateRss(self, timerMock): timerMock.return_value.start.assert_called() def test_getFrame(self): - frame_id = 'arbitrary-frame-id' + frame_id = "arbitrary-frame-id" frame = mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) self.rqcore.storeFrame(frame_id, frame) self.assertEqual(frame, self.rqcore.getFrame(frame_id)) def test_getFrameKeys(self): - frame_ids = ['frame1', 'frame2'] - self.rqcore.storeFrame(frame_ids[0], mock.MagicMock(spec=rqd.rqnetwork.RunningFrame)) - self.rqcore.storeFrame(frame_ids[1], mock.MagicMock(spec=rqd.rqnetwork.RunningFrame)) + frame_ids = ["frame1", "frame2"] + self.rqcore.storeFrame( + frame_ids[0], mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) + ) + self.rqcore.storeFrame( + frame_ids[1], mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) + ) self.assertEqual(set(frame_ids), set(self.rqcore.getFrameKeys())) def test_storeFrame(self): - frame_id = 'arbitrary-frame-id' + frame_id = "arbitrary-frame-id" frame = mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) with self.assertRaises(KeyError): self.rqcore.getFrame(frame_id) @@ -187,19 +195,23 @@ def test_storeFrame(self): self.assertEqual(frame, self.rqcore.getFrame(frame_id)) def test_storeFrameDuplicate(self): - frame_id = 'arbitrary-frame-id' - self.rqcore.storeFrame(frame_id, mock.MagicMock(spec=rqd.rqnetwork.RunningFrame)) + frame_id = "arbitrary-frame-id" + self.rqcore.storeFrame( + frame_id, mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) + ) with self.assertRaises(rqd.rqexceptions.RqdException): - self.rqcore.storeFrame(frame_id, mock.MagicMock(spec=rqd.rqnetwork.RunningFrame)) + self.rqcore.storeFrame( + frame_id, mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) + ) def test_deleteFrame(self): - frame_id = 'arbitrary-frame-id' + frame_id = "arbitrary-frame-id" frame = mock.MagicMock(spec=rqd.rqnetwork.RunningFrame) self.rqcore.storeFrame(frame_id, frame) self.rqcore.deleteFrame(frame_id) - self.rqcore.deleteFrame('unknown-key-should-succeed') + self.rqcore.deleteFrame("unknown-key-should-succeed") with self.assertRaises(KeyError): self.rqcore.getFrame(frame_id) @@ -207,17 +219,20 @@ def test_deleteFrame(self): def test_killAllFrame(self): frameAttendantThread = mock.MagicMock() frameAttendantThread.is_alive.return_value = False - frame1Id = 'frame1' - frame2Id = 'frame2' - frame3Id = 'frame3' + frame1Id = "frame1" + frame2Id = "frame2" + frame3Id = "frame3" frame1 = rqd.rqnetwork.RunningFrame( - self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame1Id)) + self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame1Id) + ) frame1.frameAttendantThread = frameAttendantThread frame2 = rqd.rqnetwork.RunningFrame( - self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame2Id)) + 
self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame2Id) + ) frame2.frameAttendantThread = frameAttendantThread frame3 = rqd.rqnetwork.RunningFrame( - self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame3Id)) + self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame3Id) + ) frame3.frameAttendantThread = frameAttendantThread self.rqcore.storeFrame(frame1Id, frame1) self.rqcore.storeFrame(frame2Id, frame2) @@ -226,23 +241,26 @@ def test_killAllFrame(self): # There's no result to verify here; if the method completes successfully # it means that all frames were properly killed, as the method won't finish # until its frame cache is cleared by the kill process. - self.rqcore.killAllFrame('arbitrary reason') + self.rqcore.killAllFrame("arbitrary reason") def test_killAllFrameIgnoreNimby(self): frameAttendantThread = mock.MagicMock() frameAttendantThread.is_alive.return_value = False - frame1Id = 'frame1' - frame2Id = 'frame2' + frame1Id = "frame1" + frame2Id = "frame2" frame1 = rqd.rqnetwork.RunningFrame( - self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame1Id)) + self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame1Id) + ) frame1.frameAttendantThread = frameAttendantThread frame2 = rqd.rqnetwork.RunningFrame( - self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame2Id, ignore_nimby=True)) + self.rqcore, + rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame2Id, ignore_nimby=True), + ) frame2.frameAttendantThread = frameAttendantThread self.rqcore.storeFrame(frame1Id, frame1) self.rqcore.storeFrame(frame2Id, frame2) - self.rqcore.killAllFrame('NIMBY related reason') + self.rqcore.killAllFrame("NIMBY related reason") self.assertEqual(frame2, self.rqcore.getFrame(frame2Id)) @@ -251,17 +269,25 @@ def test_releaseCores(self): num_booked_cores = 7 num_cores_to_release = 5 self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail( - total_cores=50, idle_cores=num_idle_cores, locked_cores=2, - booked_cores=num_booked_cores) + total_cores=50, + idle_cores=num_idle_cores, + locked_cores=2, + booked_cores=num_booked_cores, + ) self.rqcore.releaseCores(num_cores_to_release) # pylint: disable=no-member - self.assertEqual(num_booked_cores-num_cores_to_release, self.rqcore.cores.booked_cores) - self.assertEqual(num_idle_cores+num_cores_to_release, self.rqcore.cores.idle_cores) - - @mock.patch.object(rqd.rqcore.RqCore, 'nimbyOff') - def test_shutdown(self, nimbyOffMock): + self.assertEqual( + num_booked_cores - num_cores_to_release, self.rqcore.cores.booked_cores + ) + self.assertEqual( + num_idle_cores + num_cores_to_release, self.rqcore.cores.idle_cores + ) + + @mock.patch.object(rqd.rqcore.RqCore, "nimbyOff") + @mock.patch("os._exit") + def test_shutdown(self, nimbyOffMock, exitMock): self.rqcore.onIntervalThread = mock.MagicMock() self.rqcore.updateRssThread = mock.MagicMock() @@ -271,8 +297,8 @@ def test_shutdown(self, nimbyOffMock): self.rqcore.onIntervalThread.cancel.assert_called() self.rqcore.updateRssThread.cancel.assert_called() - @mock.patch('rqd.rqnetwork.Network', autospec=True) - @mock.patch('sys.exit') + @mock.patch("rqd.rqnetwork.Network", autospec=True) + @mock.patch("os._exit") def test_handleExit(self, networkMock, exitMock): self.rqcore = rqd.rqcore.RqCore() @@ -280,9 +306,11 @@ def test_handleExit(self, networkMock, exitMock): exitMock.assert_called() - @mock.patch('rqd.rqcore.FrameAttendantThread') + @mock.patch("rqd.rqcore.FrameAttendantThread") def test_launchFrame(self, frameThreadMock): - self.rqcore.cores = 
rqd.compiled_proto.report_pb2.CoreDetail(total_cores=100, idle_cores=20) + self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail( + total_cores=100, idle_cores=20 + ) self.machineMock.return_value.state = rqd.compiled_proto.host_pb2.UP self.nimbyMock.return_value.locked = False frame = rqd.compiled_proto.rqd_pb2.RunFrame(uid=22, num_cores=10) @@ -299,7 +327,8 @@ def test_launchFrameOnDownHost(self): with self.assertRaises(rqd.rqexceptions.CoreReservationFailureException): self.rqcore.launchFrame(frame) - def test_launchFrameOnHostWaitingForShutdown(self): + @mock.patch("os._exit") + def test_launchFrameOnHostWaitingForShutdown(self, exitMock): self.machineMock.return_value.state = rqd.compiled_proto.host_pb2.UP self.nimbyMock.return_value.active = False frame = rqd.compiled_proto.rqd_pb2.RunFrame() @@ -308,13 +337,16 @@ def test_launchFrameOnHostWaitingForShutdown(self): with self.assertRaises(rqd.rqexceptions.CoreReservationFailureException): self.rqcore.launchFrame(frame) - @mock.patch('rqd.rqcore.FrameAttendantThread') + @mock.patch("rqd.rqcore.FrameAttendantThread") def test_launchFrameOnNimbyHost(self, frameThreadMock): - self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail(total_cores=100, idle_cores=20) + self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail( + total_cores=100, idle_cores=20 + ) self.machineMock.return_value.state = rqd.compiled_proto.host_pb2.UP frame = rqd.compiled_proto.rqd_pb2.RunFrame(uid=22, num_cores=10) frameIgnoreNimby = rqd.compiled_proto.rqd_pb2.RunFrame( - uid=22, num_cores=10, ignore_nimby=True) + uid=22, num_cores=10, ignore_nimby=True + ) self.rqcore.nimby = mock.create_autospec(rqd.rqnimby.NimbySelect) self.rqcore.nimby.locked = True @@ -326,11 +358,15 @@ def test_launchFrameOnNimbyHost(self, frameThreadMock): frameThreadMock.return_value.start.assert_called() def test_launchDuplicateFrame(self): - self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail(total_cores=100, idle_cores=20) + self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail( + total_cores=100, idle_cores=20 + ) self.machineMock.return_value.state = rqd.compiled_proto.host_pb2.UP self.nimbyMock.return_value.locked = False - frameId = 'arbitrary-frame-id' - self.rqcore.storeFrame(frameId, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frameId)) + frameId = "arbitrary-frame-id" + self.rqcore.storeFrame( + frameId, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frameId) + ) frameToLaunch = rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frameId) rqd.rqconstants.OVERRIDE_NIMBY = None @@ -354,7 +390,9 @@ def test_launchFrameWithInvalidCoreCount(self): self.rqcore.launchFrame(frame) def test_launchFrameWithInsufficientCores(self): - self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail(total_cores=100, idle_cores=5) + self.rqcore.cores = rqd.compiled_proto.report_pb2.CoreDetail( + total_cores=100, idle_cores=5 + ) self.machineMock.return_value.state = rqd.compiled_proto.host_pb2.UP self.nimbyMock.return_value.locked = False frame = rqd.compiled_proto.rqd_pb2.RunFrame(uid=22, num_cores=10) @@ -363,14 +401,15 @@ def test_launchFrameWithInsufficientCores(self): self.rqcore.launchFrame(frame) def test_getRunningFrame(self): - frameId = 'arbitrary-frame-id' + frameId = "arbitrary-frame-id" frame = rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frameId) self.rqcore.storeFrame(frameId, frame) self.assertEqual(frame, self.rqcore.getRunningFrame(frameId)) - self.assertIsNone(self.rqcore.getRunningFrame('some-unknown-frame-id')) + 
self.assertIsNone(self.rqcore.getRunningFrame("some-unknown-frame-id")) - def test_rebootNowNoUser(self): + @mock.patch("os._exit") + def test_rebootNowNoUser(self, exitMock): self.machineMock.return_value.isUserLoggedIn.return_value = False self.nimbyMock.return_value.active = False @@ -384,7 +423,8 @@ def test_rebootNowWithUser(self): with self.assertRaises(rqd.rqexceptions.RqdException): self.rqcore.rebootNow() - def test_rebootIdleNoFrames(self): + @mock.patch("os._exit") + def test_rebootIdleNoFrames(self, exitMock): self.machineMock.return_value.isUserLoggedIn.return_value = False self.nimbyMock.return_value.active = False @@ -392,10 +432,12 @@ def test_rebootIdleNoFrames(self): self.machineMock.return_value.reboot.assert_called_with() - def test_rebootIdleWithFrames(self): - frame1Id = 'frame1' + @mock.patch("os._exit") + def test_rebootIdleWithFrames(self, exitMock): + frame1Id = "frame1" frame1 = rqd.rqnetwork.RunningFrame( - self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame1Id)) + self.rqcore, rqd.compiled_proto.rqd_pb2.RunFrame(frame_id=frame1Id) + ) self.rqcore.storeFrame(frame1Id, frame1) self.rqcore.rebootIdle() @@ -403,29 +445,13 @@ def test_rebootIdleWithFrames(self): self.assertTrue(self.rqcore.isWaitingForIdle()) self.machineMock.return_value.reboot.assert_not_called() - @mock.patch('os.getuid', new=mock.MagicMock(return_value=0)) - @mock.patch('platform.system', new=mock.MagicMock(return_value='Linux')) - def test_nimbyOn(self): - self.nimbyMock.return_value.active = False - - self.rqcore.nimbyOn() - - self.nimbyMock.return_value.run.assert_called_with() - - def test_nimbyOff(self): - self.nimbyMock.return_value.active = True - - self.rqcore.nimbyOff() - - self.nimbyMock.return_value.stop.assert_called_with() - - @mock.patch.object(rqd.rqcore.RqCore, 'killAllFrame', autospec=True) + @mock.patch.object(rqd.rqcore.RqCore, "killAllFrame", autospec=True) def test_onNimbyLock(self, killAllFrameMock): self.rqcore.onNimbyLock() killAllFrameMock.assert_called_with(self.rqcore, mock.ANY) - @mock.patch.object(rqd.rqcore.RqCore, 'sendStatusReport', autospec=True) + @mock.patch.object(rqd.rqcore.RqCore, "sendStatusReport", autospec=True) def test_onNimbyUnlock(self, sendStatusReportMock): self.rqcore.onNimbyUnlock() @@ -507,6 +533,7 @@ def test_unlockAllWhenNimbyLocked(self): self.rqcore.cores.total_cores = 50 self.rqcore.cores.idle_cores = 40 self.rqcore.cores.locked_cores = 10 + self.rqcore.nimby.locked = True self.rqcore.unlockAll() @@ -514,41 +541,95 @@ def test_unlockAllWhenNimbyLocked(self): self.assertEqual(40, self.rqcore.cores.idle_cores) self.assertEqual(0, self.rqcore.cores.locked_cores) + def test_sendFrameCompleteReport(self): + logDir = "/path/to/log/dir/" + frameId = "arbitrary-frame-id" + jobName = "arbitrary-job-name" + frameName = "arbitrary-frame-name" + frameUid = 928 + frameUsername = "my-random-user" + children = rqd.compiled_proto.report_pb2.ChildrenProcStats() + returnCode = 0 -@mock.patch('rqd.rqutil.checkAndCreateUser', new=mock.MagicMock()) -@mock.patch('rqd.rqutil.permissionsHigh', new=mock.MagicMock()) -@mock.patch('rqd.rqutil.permissionsLow', new=mock.MagicMock()) -@mock.patch('subprocess.Popen') -@mock.patch('time.time') -@mock.patch('rqd.rqutil.permissionsUser', spec=True) + runFrame = rqd.compiled_proto.rqd_pb2.RunFrame( + frame_id=frameId, + job_name=jobName, + frame_name=frameName, + uid=frameUid, + user_name=frameUsername, + log_dir=logDir, + children=children, + ) + frameInfo = rqd.rqnetwork.RunningFrame(self.rqcore, runFrame) + 
frameInfo.exitStatus = 0 + frameInfo.exitSignal = 0 + frameInfo.ignoreNimby = True + + renderHost = rqd.compiled_proto.report_pb2.RenderHost( + name="arbitrary-host-name" + ) + self.rqcore.machine.getHostInfo.return_value = renderHost + self.rqcore.nimby = mock.MagicMock() + self.rqcore.nimby.locked.return_value = False + self.rqcore.network.reportRunningFrameCompletion = mock.MagicMock() + self.rqcore.sendFrameCompleteReport(frameInfo) + + self.rqcore.network.reportRunningFrameCompletion.assert_called_once_with( + rqd.compiled_proto.report_pb2.FrameCompleteReport( + host=renderHost, + frame=rqd.compiled_proto.report_pb2.RunningFrameInfo( + job_name=jobName, + frame_id=frameId, + frame_name=frameName, + children=children, + ), + exit_status=returnCode, + ) + ) + + +@mock.patch("rqd.rqutil.checkAndCreateUser", new=mock.MagicMock()) +@mock.patch("rqd.rqutil.permissionsHigh", new=mock.MagicMock()) +@mock.patch("rqd.rqutil.permissionsLow", new=mock.MagicMock()) +@mock.patch("subprocess.Popen") +@mock.patch("time.time") +@mock.patch("rqd.rqutil.permissionsUser", spec=True) class FrameAttendantThreadTests(pyfakefs.fake_filesystem_unittest.TestCase): def setUp(self): self.setUpPyfakefs() - rqd.rqconstants.SU_ARGUMENT = '-c' - - @mock.patch('platform.system', new=mock.Mock(return_value='Linux')) - @mock.patch('tempfile.gettempdir') - @mock.patch('rqd.rqcore.pipe_to_file', new=mock.MagicMock()) - def test_runLinux(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdirMock, openMock, + rqd.rqconstants.SU_ARGUMENT = "-c" + + @mock.patch("platform.system", new=mock.Mock(return_value="Linux")) + @mock.patch("tempfile.gettempdir") + @mock.patch("select.poll") + def test_runLinux( + self, selectMock, getTempDirMock, permsUser, timeMock, popenMock + ): # mkdirMock, openMock, # given currentTime = 1568070634.3 - jobTempPath = '/job/temp/path/' - logDir = '/path/to/log/dir/' - tempDir = '/some/random/temp/dir' - frameId = 'arbitrary-frame-id' - jobName = 'arbitrary-job-name' - frameName = 'arbitrary-frame-name' + jobTempPath = "/job/temp/path/" + logDir = "/path/to/log/dir/" + tempDir = "/some/random/temp/dir" + frameId = "arbitrary-frame-id" + jobName = "arbitrary-job-name" + frameName = "arbitrary-frame-name" frameUid = 928 - frameUsername = 'my-random-user' + frameUsername = "my-random-user" returnCode = 0 - renderHost = rqd.compiled_proto.report_pb2.RenderHost(name='arbitrary-host-name') - logFile = os.path.join(logDir, '%s.%s.rqlog' % (jobName, frameName)) + renderHost = rqd.compiled_proto.report_pb2.RenderHost( + name="arbitrary-host-name" + ) + logFile = os.path.join(logDir, "%s.%s.rqlog" % (jobName, frameName)) self.fs.create_dir(tempDir) timeMock.return_value = currentTime getTempDirMock.return_value = tempDir + popenMock.return_value.wait.return_value = returnCode + popenMock.return_value.stdout.readline.return_value = None + + selectMock.return_value.poll.return_value = [] rqCore = mock.MagicMock() rqCore.intervalStartTime = 20 @@ -557,6 +638,7 @@ def test_runLinux(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdir rqCore.machine.isDesktop.return_value = True rqCore.machine.getHostInfo.return_value = renderHost rqCore.nimby.locked = False + rqCore.docker_client = None children = rqd.compiled_proto.report_pb2.ChildrenProcStats() runFrame = rqd.compiled_proto.rqd_pb2.RunFrame( @@ -566,7 +648,8 @@ def test_runLinux(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdir uid=frameUid, user_name=frameUsername, log_dir=logDir, - children=children) + children=children, + ) 
frameInfo = rqd.rqnetwork.RunningFrame(rqCore, runFrame) # when @@ -578,10 +661,15 @@ def test_runLinux(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdir permsUser.assert_called_with(frameUid, mock.ANY) popenMock.assert_called_with( [ - '/bin/nice', '/usr/bin/time', '-p', '-o', - jobTempPath + 'rqd-stat-' + frameId + '-' + str(currentTime), - '/bin/su', frameUsername, '-c', - '"' + tempDir + '/rqd-cmd-' + frameId + '-' + str(currentTime) + '"' + "/bin/nice", + "/usr/bin/time", + "-p", + "-o", + jobTempPath + "rqd-stat-" + frameId + "-" + str(currentTime), + "/bin/su", + frameUsername, + "-c", + '"' + tempDir + "/rqd-cmd-" + frameId + "-" + str(currentTime) + '"', ], env=mock.ANY, cwd=jobTempPath, @@ -589,35 +677,135 @@ def test_runLinux(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdir stdout=mock.ANY, stderr=mock.ANY, close_fds=mock.ANY, - preexec_fn=mock.ANY) + preexec_fn=mock.ANY, + ) self.assertTrue(os.path.exists(logDir)) self.assertTrue(os.path.isfile(logFile)) _, kwargs = popenMock.call_args - rqCore.network.reportRunningFrameCompletion.assert_called_with( - rqd.compiled_proto.report_pb2.FrameCompleteReport( - host=renderHost, - frame=rqd.compiled_proto.report_pb2.RunningFrameInfo( - job_name=jobName, frame_id=frameId, frame_name=frameName, children=children), - exit_status=returnCode)) + rqCore.sendFrameCompleteReport.assert_called_with( + frameInfo + ) - # TODO(bcipriano) Re-enable this test once Windows is supported. The main sticking point here - # is that the log directory is always overridden on Windows which makes mocking difficult. - @mock.patch('platform.system', new=mock.Mock(return_value='Windows')) - def disabled__test_runWindows(self, permsUser, timeMock, popenMock): + @mock.patch('platform.system', new=mock.Mock(return_value='Linux')) + @mock.patch('tempfile.gettempdir') + def test_runDocker(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdirMock, openMock, + # given currentTime = 1568070634.3 jobTempPath = '/job/temp/path/' logDir = '/path/to/log/dir/' - tempDir = 'C:\\temp' + tempDir = '/some/random/temp/dir' frameId = 'arbitrary-frame-id' - jobId = 'arbitrary-job-id' jobName = 'arbitrary-job-name' frameName = 'arbitrary-frame-name' frameUid = 928 frameUsername = 'my-random-user' returnCode = 0 renderHost = rqd.compiled_proto.report_pb2.RenderHost(name='arbitrary-host-name') + logFile = os.path.join(logDir, '%s.%s.rqlog' % (jobName, frameName)) + + self.fs.create_dir(tempDir) + + timeMock.return_value = currentTime + getTempDirMock.return_value = tempDir + popenMock.return_value.wait.return_value = returnCode + + rqCore = mock.MagicMock() + rqCore.intervalStartTime = 20 + rqCore.intervalSleepTime = 40 + rqCore.machine.getTempPath.return_value = jobTempPath + rqCore.machine.isDesktop.return_value = True + rqCore.machine.getHostInfo.return_value = renderHost + rqCore.nimby.locked = False + + # Setup mock docker client + rqCore.docker_client = mock.MagicMock() + rqCore.docker_images = { + "centos7": "centos7_image", + "rocky9": "rocky9_image", + } + rqCore.docker_mounts = { + "vol1": "/vol1/mount", + "vol2": "/vol2/mount", + } + + children = rqd.compiled_proto.report_pb2.ChildrenProcStats() + + runFrame = rqd.compiled_proto.rqd_pb2.RunFrame( + frame_id=frameId, + job_name=jobName, + frame_name=frameName, + uid=frameUid, + user_name=frameUsername, + log_dir=logDir, + children=children, + environment={"ENVVAR": "env_value"}, + os="centos7") + frameInfo = rqd.rqnetwork.RunningFrame(rqCore, runFrame) + + # when + attendantThread = 
rqd.rqcore.FrameAttendantThread(rqCore, runFrame, frameInfo) + attendantThread.start() + attendantThread.join() + + # then + cmd_file = os.path.join(tempDir, 'rqd-cmd-%s-%s' % (runFrame.frame_id, currentTime)) + rqCore.docker_client.containers.run.assert_called_with( + image="centos7_image", + detach=True, + environment=mock.ANY, + working_dir=jobTempPath, + mounts=rqCore.docker_mounts, + privileged=True, + remove=True, + pid_mode="host", + stderr=True, + hostname=mock.ANY, + entrypoint=cmd_file + ) + + with open(cmd_file, "r", encoding='utf-8') as f: + # Remove `-p RANDOM_PASSWORD` from output + cmd = re.sub(r"-p\s+(\d|\w)\S+\s*", "", f.read()) + self.assertEqual(r"""#!/bin/sh +useradd -u %s -g %s %s >& /dev/null || true; +exec su -s /bin/sh %s -c "echo \$$; /bin/nice /usr/bin/time -p -o /job/temp/path/rqd-stat-%s-%s " +""" % ( + frameUid, + rqd.rqconstants.LAUNCH_FRAME_USER_GID, + frameUsername, + frameUsername, + frameId, + currentTime + ), cmd) + + self.assertTrue(os.path.exists(logDir)) + self.assertTrue(os.path.isfile(logFile)) + + rqCore.sendFrameCompleteReport.assert_called_with( + frameInfo + ) + + + # TODO(bcipriano) Re-enable this test once Windows is supported. The main sticking point here + # is that the log directory is always overridden on Windows which makes mocking difficult. + @mock.patch("platform.system", new=mock.Mock(return_value="Windows")) + def disabled__test_runWindows(self, permsUser, timeMock, popenMock): + currentTime = 1568070634.3 + jobTempPath = "/job/temp/path/" + logDir = "/path/to/log/dir/" + tempDir = "C:\\temp" + frameId = "arbitrary-frame-id" + jobId = "arbitrary-job-id" + jobName = "arbitrary-job-name" + frameName = "arbitrary-frame-name" + frameUid = 928 + frameUsername = "my-random-user" + returnCode = 0 + renderHost = rqd.compiled_proto.report_pb2.RenderHost( + name="arbitrary-host-name" + ) timeMock.return_value = currentTime popenMock.return_value.returncode = returnCode @@ -640,7 +828,8 @@ def disabled__test_runWindows(self, permsUser, timeMock, popenMock): user_name=frameUsername, log_dir=logDir, children=children, - environment={'CUE_IFRAME': '2000'}) + environment={"CUE_IFRAME": "2000"}, + ) frameInfo = rqd.rqnetwork.RunningFrame(rqCore, runFrame) attendantThread = rqd.rqcore.FrameAttendantThread(rqCore, runFrame, frameInfo) @@ -649,41 +838,51 @@ def disabled__test_runWindows(self, permsUser, timeMock, popenMock): permsUser.assert_called_with(frameUid, mock.ANY) popenMock.assert_called_with( - [tempDir + '/rqd-cmd-' + frameId + '-' + str(currentTime) + '.bat'], + [tempDir + "/rqd-cmd-" + frameId + "-" + str(currentTime) + ".bat"], stdin=mock.ANY, stdout=mock.ANY, - stderr=mock.ANY) + stderr=mock.ANY, + ) # TODO(bcipriano) Verify the log directory was created and used for stdout/stderr. 
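Stepping back to the runDocker path exercised above: the container launch the test pins down can be sketched with docker-py as below. The image tag, environment, mounts, hostname and entrypoint path are placeholders here, not values from a real host.

```python
import docker

client = docker.from_env()

# The entrypoint is the rqd-cmd-* file RQD writes and logs as DOCKER_ENTRYPOINT.
container = client.containers.run(
    image="rocky9_image",                 # placeholder tag from [docker.images]
    detach=True,
    environment={"ENVVAR": "env_value"},  # frame environment
    working_dir="/job/temp/path",
    mounts=[],                            # DOCKER_MOUNTS parsed from rqd.conf
    privileged=True,
    remove=True,
    pid_mode="host",
    stderr=True,
    hostname="jobhost",
    entrypoint="/job/temp/path/rqd-cmd-<frame-id>-<timestamp>",
)
print(container.wait()["StatusCode"])     # frame exit status
```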
rqCore.network.reportRunningFrameCompletion.assert_called_with( rqd.compiled_proto.report_pb2.FrameCompleteReport( host=renderHost, frame=rqd.compiled_proto.report_pb2.RunningFrameInfo( - job_name=jobName, frame_id=frameId, frame_name=frameName, children=children), - exit_status=returnCode)) - - @mock.patch('platform.system', new=mock.Mock(return_value='Darwin')) - @mock.patch('tempfile.gettempdir') + job_name=jobName, + frame_id=frameId, + frame_name=frameName, + children=children, + ), + exit_status=returnCode, + ) + ) + + @mock.patch("platform.system", new=mock.Mock(return_value="Darwin")) + @mock.patch("tempfile.gettempdir") def test_runDarwin(self, getTempDirMock, permsUser, timeMock, popenMock): # given currentTime = 1568070634.3 - jobTempPath = '/job/temp/path/' - logDir = '/path/to/log/dir/' - tempDir = '/some/random/temp/dir' - frameId = 'arbitrary-frame-id' - jobName = 'arbitrary-job-name' - frameName = 'arbitrary-frame-name' + jobTempPath = "/job/temp/path/" + logDir = "/path/to/log/dir/" + tempDir = "/some/random/temp/dir" + frameId = "arbitrary-frame-id" + jobName = "arbitrary-job-name" + frameName = "arbitrary-frame-name" frameUid = 928 - frameUsername = 'my-random-user' + frameUsername = "my-random-user" returnCode = 0 - renderHost = rqd.compiled_proto.report_pb2.RenderHost(name='arbitrary-host-name') - logFile = os.path.join(logDir, '%s.%s.rqlog' % (jobName, frameName)) + renderHost = rqd.compiled_proto.report_pb2.RenderHost( + name="arbitrary-host-name" + ) + logFile = os.path.join(logDir, "%s.%s.rqlog" % (jobName, frameName)) self.fs.create_dir(tempDir) timeMock.return_value = currentTime getTempDirMock.return_value = tempDir popenMock.return_value.returncode = returnCode + popenMock.return_value.stdout.readline.return_value = None rqCore = mock.MagicMock() rqCore.intervalStartTime = 20 @@ -692,6 +891,7 @@ def test_runDarwin(self, getTempDirMock, permsUser, timeMock, popenMock): rqCore.machine.isDesktop.return_value = True rqCore.machine.getHostInfo.return_value = renderHost rqCore.nimby.locked = False + rqCore.docker_client = None children = rqd.compiled_proto.report_pb2.ChildrenProcStats() runFrame = rqd.compiled_proto.rqd_pb2.RunFrame( @@ -701,7 +901,8 @@ def test_runDarwin(self, getTempDirMock, permsUser, timeMock, popenMock): uid=frameUid, user_name=frameUsername, log_dir=logDir, - children=children) + children=children, + ) frameInfo = rqd.rqnetwork.RunningFrame(rqCore, runFrame) # when @@ -713,29 +914,29 @@ def test_runDarwin(self, getTempDirMock, permsUser, timeMock, popenMock): permsUser.assert_called_with(frameUid, mock.ANY) popenMock.assert_called_with( [ - '/usr/bin/su', frameUsername, '-c', - '"' + tempDir + '/rqd-cmd-' + frameId + '-' + str(currentTime) + '"' + "/usr/bin/su", + frameUsername, + "-c", + '"' + tempDir + "/rqd-cmd-" + frameId + "-" + str(currentTime) + '"', ], env=mock.ANY, cwd=jobTempPath, stdin=mock.ANY, stdout=mock.ANY, stderr=mock.ANY, - preexec_fn=mock.ANY) + preexec_fn=mock.ANY, + ) self.assertTrue(os.path.exists(logDir)) self.assertTrue(os.path.isfile(logFile)) _, kwargs = popenMock.call_args - self.assertEqual(logFile, kwargs['stdout'].name) - self.assertEqual(logFile, kwargs['stderr'].name) + self.assertEqual(subprocess.PIPE, kwargs["stdout"]) + self.assertEqual(subprocess.STDOUT, kwargs["stderr"]) - rqCore.network.reportRunningFrameCompletion.assert_called_with( - rqd.compiled_proto.report_pb2.FrameCompleteReport( - host=renderHost, - frame=rqd.compiled_proto.report_pb2.RunningFrameInfo( - job_name=jobName, frame_id=frameId, 
frame_name=frameName, children=children), - exit_status=returnCode)) + rqCore.sendFrameCompleteReport.assert_called_with( + frameInfo + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/rqd/tests/rqmachine_tests.py b/rqd/tests/rqmachine_test.py similarity index 89% rename from rqd/tests/rqmachine_tests.py rename to rqd/tests/rqmachine_test.py index 1b1bdaf4a..c74b22e80 100644 --- a/rqd/tests/rqmachine_tests.py +++ b/rqd/tests/rqmachine_test.py @@ -303,15 +303,19 @@ def _test_rssUpdate(self, proc_stat): self.assertAlmostEqual(0.034444696691, float(updatedFrameInfo.attributes['pcpu'])) @mock.patch('time.time', new=mock.MagicMock(return_value=1570057887.61)) - def test_rssUpdate(self): + @mock.patch('psutil.Process') + def test_rssUpdate(self, processMock): + processMock.return_value.cmdline.return_value = "some_command" self._test_rssUpdate(PROC_PID_STAT) @mock.patch('time.time', new=mock.MagicMock(return_value=1570057887.61)) - def test_rssUpdateWithSpaces(self): + @mock.patch('psutil.Process') + def test_rssUpdateWithSpaces(self, processMock): self._test_rssUpdate(PROC_PID_STAT_WITH_SPACES) @mock.patch('time.time', new=mock.MagicMock(return_value=1570057887.61)) - def test_rssUpdateWithBrackets(self): + @mock.patch('psutil.Process') + def test_rssUpdateWithBrackets(self, processMock): self._test_rssUpdate(PROC_PID_STAT_WITH_BRACKETS) @mock.patch.object( @@ -461,43 +465,41 @@ def test_reserveHT(self): self.machine.setupTaskset() + #-----------------------Core Map------------------------ + # phys 0 phys 1 + # core 0 core 0 + # proc 0 proc 4 + # proc 8 proc 12 + # core 1 core 1 + # proc 1 proc 5 + # proc 9 proc 13 + # core 2 core 2 + # proc 2 proc 6 + # proc 10 proc 14 + # core 3 core 3 + # proc 3 proc 7 + # proc 11 proc 15 # ------------------------step1------------------------- - # phys_id 1 - # - core_id 0 - # - process_id 4 - # - process_id 12 - # - core_id 1 - # - process_id 5 - # - process_id 13 - # - core_id 3 - # - process_id 7 - # - process_id 15 - tasksets1 = self.machine.reserveHT(300) - # pylint: disable=no-member - self.assertItemsEqual(['4', '5', '7', '12', '13', '15'], sorted(tasksets1.split(','))) - - # ------------------------step2------------------------- - # phys_id 0 - # - core_id 0 - # - process_id 0 - # - process_id 8 - # - core_id 1 - # - process_id 1 - # - process_id 9 - # - core_id 2 - # - process_id 2 - # - process_id 10 - # - core_id 3 - # - process_id 3 - # - process_id 11 - tasksets0 = self.machine.reserveHT(400) - # pylint: disable=no-member - self.assertItemsEqual(['0', '1', '2', '3', '8', '9', '10', '11'], - sorted(tasksets0.split(','))) - - # reserved cores got updated properly - # pylint: disable=no-member - self.assertItemsEqual([0, 1, 2, 3], self.coreDetail.reserved_cores[0].coreid) + def assertTaskSet(taskset_list): + """Ensure all tasks are being allocated with the right thread pairs""" + phys0 = [('0', '8'), ('1', '9'), ('10', '2'), ('11', '3')] + phys1 = [('12', '4'), ('13', '5'), ('14', '6'), ('15', '7')] + + taskset_2_2 = list(zip(taskset_list[::2], taskset_list[1::2])) + if taskset_2_2[0] in phys0: + for t in taskset_2_2: + self.assertTrue(tuple(sorted(t)) in phys0, "%s not in %s" % (t, phys0)) + elif taskset_2_2[0] in phys1: + for t in taskset_2_2: + self.assertTrue(tuple(sorted(t)) in phys1, "%s not in %s" % (t, phys1)) + + tasksets0 = self.machine.reserveHT(300) + self.assertIsNotNone(tasksets0) + assertTaskSet(tasksets0.split(",")) + + tasksets1 = self.machine.reserveHT(400) + self.assertIsNotNone(tasksets1) + 
assertTaskSet(tasksets1.split(",")) # Make sure tastsets don't overlap self.assertTrue(set(tasksets0.split(',')).isdisjoint(tasksets1.split(','))) @@ -507,8 +509,6 @@ def test_reserveHT(self): self.machine.releaseHT(tasksets0) # pylint: disable=no-member self.assertTrue(1 in self.coreDetail.reserved_cores) - # pylint: disable=no-member - self.assertItemsEqual([0, 1, 3], self.coreDetail.reserved_cores[1].coreid) # ------------------------step4------------------------- # phys_id 0 @@ -519,29 +519,12 @@ def test_reserveHT(self): # - process_id 1 # - process_id 9 tasksets3 = self.machine.reserveHT(200) - # pylint: disable=no-member - self.assertItemsEqual(['0', '1', '8', '9'], sorted(tasksets3.split(','))) + assertTaskSet(tasksets3.split(",")) # ------------------------step5------------------------- - # phys_id 0 - # - core_id 2 - # - process_id 2 - # - process_id 10 - # - core_id 3 - # - process_id 3 - # - process_id 11 - # phys_id 1 - # - core_id 2 - # - process_id 6 - # - process_id 14 - tasksets4 = self.machine.reserveHT(300) - # pylint: disable=no-member - self.assertItemsEqual(['2', '10', '3', '11', '6', '14'], sorted(tasksets4.split(','))) - - # ------------------------step6------------------------- - # No cores available + # Missing one core with self.assertRaises(rqd.rqexceptions.CoreReservationFailureException): - self.machine.reserveHT(300) + tasksets4 = self.machine.reserveHT(300) def test_tags(self): @@ -553,9 +536,17 @@ def test_tags(self): self.assertTrue(all(tag in machine.__dict__['_Machine__renderHost'].tags for tag in tags)) -class CpuinfoTests(unittest.TestCase): +@mock.patch('platform.system', new=mock.MagicMock(return_value='Linux')) +class CpuinfoTestsLinux(pyfakefs.fake_filesystem_unittest.TestCase): + @mock.patch('platform.system', new=mock.MagicMock(return_value='Linux')) def setUp(self): + self.setUpPyfakefs() + self.fs.create_file('/proc/cpuinfo', contents=CPUINFO) + self.loadavg = self.fs.create_file('/proc/loadavg', contents=LOADAVG_LOW_USAGE) + self.procStat = self.fs.create_file('/proc/stat', contents=PROC_STAT) + self.meminfo = self.fs.create_file('/proc/meminfo', contents=MEMINFO_MODERATE_USAGE) + self.fs.add_real_directory(os.path.dirname(__file__)) self.rqd = rqd.rqcore.RqCore() def test_shark(self): @@ -591,6 +582,7 @@ def test_srdsvr09(self): def __cpuinfoTestHelper(self, pathCpuInfo): # File format: _cpuinfo_dub_x-x-x where x-x-x is totalCores-coresPerProc-numProcs pathCpuInfo = os.path.join(os.path.dirname(__file__), 'cpuinfo', pathCpuInfo) + self.meminfo.set_contents(MEMINFO_MODERATE_USAGE) renderHost, coreInfo = self.rqd.machine.testInitMachineStats(pathCpuInfo) totalCores, coresPerProc, numProcs = pathCpuInfo.split('_')[-1].split('-')[:3] diff --git a/rqd/tests/rqnimby_test.py b/rqd/tests/rqnimby_test.py new file mode 100644 index 000000000..8408fd5b8 --- /dev/null +++ b/rqd/tests/rqnimby_test.py @@ -0,0 +1,143 @@ +# #!/usr/bin/env python +# # Copyright Contributors to the OpenCue Project +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# # See the License for the specific language governing permissions and +# # limitations under the License. + + +# """Tests for rqd.rqnimby.""" + + +# from __future__ import print_function +# from __future__ import division +# from __future__ import absolute_import + +# import unittest + +# import mock +# import pyfakefs.fake_filesystem_unittest + +# import rqd.rqcore +# import rqd.rqmachine +# import rqd.rqnimby + + +# @mock.patch('rqd.rqutil.permissionsHigh', new=mock.MagicMock()) +# @mock.patch('rqd.rqutil.permissionsLow', new=mock.MagicMock()) +# class RqNimbyTests(pyfakefs.fake_filesystem_unittest.TestCase): +# def setUp(self): +# self.setUpPyfakefs() +# self.inputDevice = self.fs.create_file('/dev/input/event0', contents='mouse event') + +# self.rqMachine = mock.MagicMock(spec=rqd.rqmachine.Machine) +# self.rqCore = mock.MagicMock(spec=rqd.rqcore.RqCore) +# self.rqCore.machine = self.rqMachine +# self.nimby = rqd.rqnimby.NimbyFactory.getNimby(self.rqCore) +# self.nimby.daemon = True + +# @mock.patch.object(rqd.rqnimby.NimbySelect, 'unlockedIdle') +# def test_initialState(self, unlockedIdleMock): +# self.nimby.daemon = True + +# self.nimby.start() +# self.nimby.join() + +# # Initial state should be "unlocked and idle". +# unlockedIdleMock.assert_called() + +# self.nimby.stop() + +# @mock.patch('select.select', new=mock.MagicMock(return_value=[['a new mouse event'], [], []])) +# @mock.patch('threading.Timer') +# def test_unlockedIdle(self, timerMock): +# self.nimby.active = True +# self.nimby.results = [[]] +# self.rqCore.machine.isNimbySafeToRunJobs.return_value = True + +# self.nimby.unlockedIdle() + +# # Given a mouse event, Nimby should transition to "locked and in use". +# timerMock.assert_called_with(mock.ANY, self.nimby.lockedInUse) +# timerMock.return_value.start.assert_called() + +# @mock.patch('select.select', new=mock.MagicMock(return_value=[[], [], []])) +# @mock.patch.object(rqd.rqnimby.NimbySelect, 'unlockedIdle') +# @mock.patch('threading.Timer') +# def test_lockedIdleWhenIdle(self, timerMock, unlockedIdleMock): +# self.nimby.active = True +# self.nimby.results = [[]] +# self.rqCore.machine.isNimbySafeToRunJobs.return_value = True + +# self.nimby.lockedIdle() + +# # Given no events, Nimby should transition to "unlocked and idle". +# unlockedIdleMock.assert_called() + +# @mock.patch('select.select', new=mock.MagicMock(return_value=[['a new mouse event'], [], []])) +# @mock.patch('threading.Timer') +# def test_lockedIdleWhenInUse(self, timerMock): +# self.nimby.active = True +# self.nimby.results = [[]] +# self.rqCore.machine.isNimbySafeToRunJobs.return_value = True + +# self.nimby.lockedIdle() + +# # Given a mouse event, Nimby should transition to "locked and in use". +# timerMock.assert_called_with(mock.ANY, self.nimby.lockedInUse) +# timerMock.return_value.start.assert_called() + +# @mock.patch('select.select', new=mock.MagicMock(return_value=[[], [], []])) +# @mock.patch.object(rqd.rqnimby.NimbySelect, 'lockedIdle') +# @mock.patch('threading.Timer') +# def test_lockedInUseWhenIdle(self, timerMock, lockedIdleMock): +# self.nimby.active = True +# self.nimby.results = [[]] +# self.rqCore.machine.isNimbySafeToRunJobs.return_value = True + +# self.nimby.lockedInUse() + +# # Given no events, Nimby should transition to "locked and idle". 
+# lockedIdleMock.assert_called() + +# @mock.patch('select.select', new=mock.MagicMock(return_value=[['a new mouse event'], [], []])) +# @mock.patch('threading.Timer') +# def test_lockedInUseWhenInUse(self, timerMock): +# self.nimby.active = True +# self.nimby.results = [[]] +# self.rqCore.machine.isNimbySafeToRunJobs.return_value = True + +# self.nimby.lockedInUse() + +# # Given a mouse event, Nimby should stay in state "locked and in use". +# timerMock.assert_called_with(mock.ANY, self.nimby.lockedInUse) +# timerMock.return_value.start.assert_called() + +# def test_lockNimby(self): +# self.nimby.active = True +# self.nimby.locked = False + +# self.nimby.lockNimby() + +# self.assertTrue(self.nimby.locked) +# self.rqCore.onNimbyLock.assert_called() + +# def test_unlockNimby(self): +# self.nimby.locked = True + +# self.nimby.unlockNimby() + +# self.assertFalse(self.nimby.locked) +# self.rqCore.onNimbyUnlock.assert_called() + + +# if __name__ == '__main__': +# unittest.main() diff --git a/rqd/tests/rqnimby_tests.py b/rqd/tests/rqnimby_tests.py deleted file mode 100644 index 04cf3e765..000000000 --- a/rqd/tests/rqnimby_tests.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env python -# Copyright Contributors to the OpenCue Project -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -"""Tests for rqd.rqnimby.""" - - -from __future__ import print_function -from __future__ import division -from __future__ import absolute_import - -import unittest - -import mock -import pyfakefs.fake_filesystem_unittest - -import rqd.rqcore -import rqd.rqmachine -import rqd.rqnimby - - -@mock.patch('rqd.rqutil.permissionsHigh', new=mock.MagicMock()) -@mock.patch('rqd.rqutil.permissionsLow', new=mock.MagicMock()) -class RqNimbyTests(pyfakefs.fake_filesystem_unittest.TestCase): - def setUp(self): - self.setUpPyfakefs() - self.inputDevice = self.fs.create_file('/dev/input/event0', contents='mouse event') - - self.rqMachine = mock.MagicMock(spec=rqd.rqmachine.Machine) - self.rqCore = mock.MagicMock(spec=rqd.rqcore.RqCore) - self.rqCore.machine = self.rqMachine - self.nimby = rqd.rqnimby.NimbyFactory.getNimby(self.rqCore) - self.nimby.daemon = True - - @mock.patch.object(rqd.rqnimby.NimbySelect, 'unlockedIdle') - def test_initialState(self, unlockedIdleMock): - self.nimby.daemon = True - - self.nimby.start() - self.nimby.join() - - # Initial state should be "unlocked and idle". - unlockedIdleMock.assert_called() - - self.nimby.stop() - - @mock.patch('select.select', new=mock.MagicMock(return_value=[['a new mouse event'], [], []])) - @mock.patch('threading.Timer') - def test_unlockedIdle(self, timerMock): - self.nimby.active = True - self.nimby.results = [[]] - self.rqCore.machine.isNimbySafeToRunJobs.return_value = True - - self.nimby.unlockedIdle() - - # Given a mouse event, Nimby should transition to "locked and in use". 
- timerMock.assert_called_with(mock.ANY, self.nimby.lockedInUse) - timerMock.return_value.start.assert_called() - - @mock.patch('select.select', new=mock.MagicMock(return_value=[[], [], []])) - @mock.patch.object(rqd.rqnimby.NimbySelect, 'unlockedIdle') - @mock.patch('threading.Timer') - def test_lockedIdleWhenIdle(self, timerMock, unlockedIdleMock): - self.nimby.active = True - self.nimby.results = [[]] - self.rqCore.machine.isNimbySafeToRunJobs.return_value = True - - self.nimby.lockedIdle() - - # Given no events, Nimby should transition to "unlocked and idle". - unlockedIdleMock.assert_called() - - @mock.patch('select.select', new=mock.MagicMock(return_value=[['a new mouse event'], [], []])) - @mock.patch('threading.Timer') - def test_lockedIdleWhenInUse(self, timerMock): - self.nimby.active = True - self.nimby.results = [[]] - self.rqCore.machine.isNimbySafeToRunJobs.return_value = True - - self.nimby.lockedIdle() - - # Given a mouse event, Nimby should transition to "locked and in use". - timerMock.assert_called_with(mock.ANY, self.nimby.lockedInUse) - timerMock.return_value.start.assert_called() - - @mock.patch('select.select', new=mock.MagicMock(return_value=[[], [], []])) - @mock.patch.object(rqd.rqnimby.NimbySelect, 'lockedIdle') - @mock.patch('threading.Timer') - def test_lockedInUseWhenIdle(self, timerMock, lockedIdleMock): - self.nimby.active = True - self.nimby.results = [[]] - self.rqCore.machine.isNimbySafeToRunJobs.return_value = True - - self.nimby.lockedInUse() - - # Given no events, Nimby should transition to "locked and idle". - lockedIdleMock.assert_called() - - @mock.patch('select.select', new=mock.MagicMock(return_value=[['a new mouse event'], [], []])) - @mock.patch('threading.Timer') - def test_lockedInUseWhenInUse(self, timerMock): - self.nimby.active = True - self.nimby.results = [[]] - self.rqCore.machine.isNimbySafeToRunJobs.return_value = True - - self.nimby.lockedInUse() - - # Given a mouse event, Nimby should stay in state "locked and in use". - timerMock.assert_called_with(mock.ANY, self.nimby.lockedInUse) - timerMock.return_value.start.assert_called() - - def test_lockNimby(self): - self.nimby.active = True - self.nimby.locked = False - - self.nimby.lockNimby() - - self.assertTrue(self.nimby.locked) - self.rqCore.onNimbyLock.assert_called() - - def test_unlockNimby(self): - self.nimby.locked = True - - self.nimby.unlockNimby() - - self.assertFalse(self.nimby.locked) - self.rqCore.onNimbyUnlock.assert_called() - - -if __name__ == '__main__': - unittest.main() diff --git a/sandbox/install-client-sources.sh b/sandbox/install-client-sources.sh index 7e15ed018..80ed1f39e 100755 --- a/sandbox/install-client-sources.sh +++ b/sandbox/install-client-sources.sh @@ -10,7 +10,7 @@ python -m grpc_tools.protoc -I=. \ --python_out=../pycue/opencue/compiled_proto \ --grpc_python_out=../pycue/opencue/compiled_proto ./*.proto cd .. -2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py +python ../ci/fix_compiled_proto.py pycue/opencue/compiled_proto # Install all client packages. 
pip install pycue/ pyoutline/ cueadmin/ cuesubmit/ cuegui/ From 4ed1ef694d61416580f7e9199d2c950368d55f51 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Wed, 30 Oct 2024 14:14:53 -0700 Subject: [PATCH 35/51] Fix typo on dispatchQuery --- .../java/com/imageworks/spcue/dao/postgres/DispatchQuery.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java index 02dae0f22..fec2f47ac 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java @@ -72,7 +72,7 @@ public class DispatchQuery { "AND job.pk_facility = ? " + "AND " + "(" + - "job.str_os IS NULL OR job.str_os IN '' " + + "job.str_os IS NULL OR job.str_os = '' " + "OR " + "job.str_os IN ? " + ") " + From a35d19a2cb2cf8acee137b6d841917e47bd88151 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Thu, 31 Oct 2024 10:31:50 -0700 Subject: [PATCH 36/51] Handle some possibly None variables --- rqd/rqd/rqcore.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 4ecfd90c6..9628807ec 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -743,6 +743,7 @@ def _createCommandFile(self, command): log.critical( "Unable to make command file: %s due to %s at %s", commandFile, e, traceback.extract_tb(sys.exc_info()[2])) + raise e def __writeHeader(self): """Writes the frame's log header""" @@ -950,7 +951,7 @@ def runDocker(self): elif self.rqCore.docker_images: # If a frame doesn't require an specic OS, default to the first configured OS on # [docker.images] - image = list(self.rqCore.docker_images.values)[0] + image = list(self.rqCore.docker_images.values())[0] else: self.__writeHeader() msg = ("Misconfigured rqd. RUN_ON_DOCKER=True requires at " @@ -1007,6 +1008,8 @@ def runDocker(self): client = self.rqCore.docker_client try: + if not client: + raise TypeError("Invalid state: docker_client must have been initialized.") container = client.containers.run(image=image, detach=True, environment=self.frameEnv, @@ -1023,7 +1026,7 @@ def runDocker(self): # CMD prints the process PID before executing the actual command frameInfo.pid = int(next(log_stream)) - if not self.rqCore.updateRssThread.is_alive(): + if self.rqCore.updateRssThread and not self.rqCore.updateRssThread.is_alive(): self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, self.rqCore.updateRss) self.rqCore.updateRssThread.start() From f1623dcaab45430c52a3b2772aaf82239d97c943 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Thu, 31 Oct 2024 10:49:17 -0700 Subject: [PATCH 37/51] Rqd multiple os (#1566) Update temporary sync branch --------- Signed-off-by: Diego Tavares Co-authored-by: Ramon Figueiredo Co-authored-by: Jimmy Christensen --- .../com/imageworks/spcue/dao/postgres/DispatchQuery.java | 2 +- rqd/rqd/rqcore.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java index 02dae0f22..fec2f47ac 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java @@ -72,7 +72,7 @@ public class DispatchQuery { "AND job.pk_facility = ? 
" + "AND " + "(" + - "job.str_os IS NULL OR job.str_os IN '' " + + "job.str_os IS NULL OR job.str_os = '' " + "OR " + "job.str_os IN ? " + ") " + diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 4ecfd90c6..cb254adf8 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -614,7 +614,7 @@ def sanitizeFrames(self): Iterate over the cache and update the status of frames that might have completed but never reported back to cuebot. """ - for frameId in list(self.__cache.keys): + for frameId in list(self.__cache.keys()): runningFrame = self.__cache[frameId] # If the frame was marked as completed (exitStatus) and a report has not been sent # try to file the report again @@ -743,6 +743,7 @@ def _createCommandFile(self, command): log.critical( "Unable to make command file: %s due to %s at %s", commandFile, e, traceback.extract_tb(sys.exc_info()[2])) + raise e def __writeHeader(self): """Writes the frame's log header""" @@ -950,7 +951,7 @@ def runDocker(self): elif self.rqCore.docker_images: # If a frame doesn't require an specic OS, default to the first configured OS on # [docker.images] - image = list(self.rqCore.docker_images.values)[0] + image = list(self.rqCore.docker_images.values())[0] else: self.__writeHeader() msg = ("Misconfigured rqd. RUN_ON_DOCKER=True requires at " @@ -1007,6 +1008,8 @@ def runDocker(self): client = self.rqCore.docker_client try: + if not client: + raise TypeError("Invalid state: docker_client must have been initialized.") container = client.containers.run(image=image, detach=True, environment=self.frameEnv, @@ -1023,7 +1026,7 @@ def runDocker(self): # CMD prints the process PID before executing the actual command frameInfo.pid = int(next(log_stream)) - if not self.rqCore.updateRssThread.is_alive(): + if self.rqCore.updateRssThread and not self.rqCore.updateRssThread.is_alive(): self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, self.rqCore.updateRss) self.rqCore.updateRssThread.start() From 90744462fc1d562d1fa4945388a89559fb5f12ef Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Thu, 31 Oct 2024 16:02:42 -0700 Subject: [PATCH 38/51] frame containers should run with network=host For services as SMTP and others that require direct access to a port, running with network HOST gives frames a similar access to network as they had when running outside of a container --- rqd/rqd/rqcore.py | 1 + rqd/tests/rqcore_test.py | 1 + 2 files changed, 2 insertions(+) diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index cb254adf8..a511a6ddc 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -1018,6 +1018,7 @@ def runDocker(self): privileged=True, remove=True, pid_mode="host", + network="host", stderr=True, hostname=self.frameEnv["jobhost"], entrypoint=command) diff --git a/rqd/tests/rqcore_test.py b/rqd/tests/rqcore_test.py index abeabdb20..e3647f018 100644 --- a/rqd/tests/rqcore_test.py +++ b/rqd/tests/rqcore_test.py @@ -760,6 +760,7 @@ def test_runDocker(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdi privileged=True, remove=True, pid_mode="host", + networks="host", stderr=True, hostname=mock.ANY, entrypoint=cmd_file From 18ab9323573c3df55384bf8ce235269ae3e4e7e6 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 1 Nov 2024 09:57:27 -0700 Subject: [PATCH 39/51] Remove temporary password from the ENTRYPOINT log --- rqd/rqd/rqcore.py | 8 ++++++-- rqd/tests/rqcore_test.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index a511a6ddc..9b9840025 100644 --- 
a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -984,6 +984,10 @@ def runDocker(self): if runFrame.attributes['CPU_LIST']: tasksetCmd = "taskset -c %s" % runFrame.attributes['CPU_LIST'] + # A temporary password for the user created inside of the frame container. + # This user is only valid inside of the container, meaning a leakage would only + # be harmful if the perpetrator gains access to run docker commands. + tempPassword = str(uuid.uuid4()) # Command wrapper command = r"""#!/bin/sh useradd -u %s -g %s -p %s %s >& /dev/null || true; @@ -991,7 +995,7 @@ def runDocker(self): """ % ( runFrame.uid, gid, - str(uuid.uuid4()), + tempPassword, runFrame.user_name, rqd.rqconstants.DOCKER_SHELL_PATH, runFrame.user_name, @@ -1001,7 +1005,7 @@ def runDocker(self): ) # Log entrypoint on frame log to simplify replaying frames - self.rqlog.write("DOCKER_ENTRYPOINT = %s" % command, + self.rqlog.write("DOCKER_ENTRYPOINT = %s" % command.replace(tempPassword, "[password]"), prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) # Write command to a file on the job tmpdir to simplify replaying a frame command = self._createCommandFile(command) diff --git a/rqd/tests/rqcore_test.py b/rqd/tests/rqcore_test.py index e3647f018..22c53cd1a 100644 --- a/rqd/tests/rqcore_test.py +++ b/rqd/tests/rqcore_test.py @@ -760,7 +760,7 @@ def test_runDocker(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdi privileged=True, remove=True, pid_mode="host", - networks="host", + network="host", stderr=True, hostname=mock.ANY, entrypoint=cmd_file From 42632aba2eda860be0d44d4bb2b81db359b3c671 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Wed, 16 Oct 2024 15:55:35 -0700 Subject: [PATCH 40/51] Add runDocker mode to rqd When RUN_ON_DOCKER is set on rqd.conf, each frame will be launched as a docker container using the base image configured as DOCKER_IMAGE. --- rqd/rqd/rqconstants.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py index aa17e8292..9f0e24358 100644 --- a/rqd/rqd/rqconstants.py +++ b/rqd/rqd/rqconstants.py @@ -165,6 +165,8 @@ # Hostname can come from here: rqutil.getHostname() __override_section = "Override" __host_env_var_section = "UseHostEnvVar" + __docker_mounts = "docker.mounts" + __docker_config = "docker.config" import six from six.moves import configparser if six.PY2: From 0671f11778bce75bd075b051a64ff2dc29be3782 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 18 Oct 2024 16:43:51 -0700 Subject: [PATCH 41/51] Update placeholder branch for containerized_rqd (#1550) Signed-off-by: Diego Tavares --- .../java/com/imageworks/spcue/dao/postgres/DispatchQuery.java | 2 +- rqd/rqd/rqconstants.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java index fec2f47ac..02dae0f22 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java @@ -72,7 +72,7 @@ public class DispatchQuery { "AND job.pk_facility = ? " + "AND " + "(" + - "job.str_os IS NULL OR job.str_os = '' " + + "job.str_os IS NULL OR job.str_os IN '' " + "OR " + "job.str_os IN ? 
" + ") " + diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py index 9f0e24358..aa17e8292 100644 --- a/rqd/rqd/rqconstants.py +++ b/rqd/rqd/rqconstants.py @@ -165,8 +165,6 @@ # Hostname can come from here: rqutil.getHostname() __override_section = "Override" __host_env_var_section = "UseHostEnvVar" - __docker_mounts = "docker.mounts" - __docker_config = "docker.config" import six from six.moves import configparser if six.PY2: From dbe8e92231d2485e1f43c9c32c41080f13429e36 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Tue, 5 Nov 2024 08:37:37 -0800 Subject: [PATCH 42/51] [cuebot] Move dispatcher memory properties to opencue.properties (#1570) Memory properties constantly need to be tuned according to farm requirements, which makes it a good candidate for becoming a property instead of a hardcoded constant. --- .../imageworks/spcue/dispatcher/HostReportHandler.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java index 3a71085a4..ce9e42269 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java @@ -255,12 +255,10 @@ public void handleHostReport(HostReport report, boolean isBoot) { } } long memReservedMin = env.getRequiredProperty( - "dispatcher.memory.mem_reserved_min", - Long.class); + "dispatcher.memory.mem_reserved_min", + Long.class); - if (!isTempDirStorageEnough(report.getHost().getTotalMcp(), - report.getHost().getFreeMcp(), - host.getOs())) { + if (!isTempDirStorageEnough(report.getHost().getTotalMcp(), report.getHost().getFreeMcp(), host.os)) { msg = String.format( "%s doesn't have enough free space in the temporary directory (mcp), %dMB", host.name, (report.getHost().getFreeMcp()/1024)); From cadd00802ba125b7169bfde22e920c8fc9a6eea2 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Tue, 5 Nov 2024 09:12:53 -0800 Subject: [PATCH 43/51] Fix merge error --- .../java/com/imageworks/spcue/dispatcher/HostReportHandler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java index ce9e42269..e42fa8de7 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java @@ -258,7 +258,7 @@ public void handleHostReport(HostReport report, boolean isBoot) { "dispatcher.memory.mem_reserved_min", Long.class); - if (!isTempDirStorageEnough(report.getHost().getTotalMcp(), report.getHost().getFreeMcp(), host.os)) { + if (!isTempDirStorageEnough(report.getHost().getTotalMcp(), report.getHost().getFreeMcp(), host.getOs())) { msg = String.format( "%s doesn't have enough free space in the temporary directory (mcp), %dMB", host.name, (report.getHost().getFreeMcp()/1024)); From bfbabb1cfec08c77ac6459cc63f3258c4b82ff83 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Wed, 6 Nov 2024 15:54:12 -0800 Subject: [PATCH 44/51] Address review comments --- .../com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java | 1 - .../java/com/imageworks/spcue/dispatcher/HostReportHandler.java | 2 +- rqd/rqd/rqconstants.py | 1 - rqd/rqd/rqcore.py | 2 +- 4 files changed, 2 insertions(+), 4 deletions(-) diff --git 
a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java
index 7db4714ea..c2af24e0f 100644
--- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java
+++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java
@@ -173,7 +173,6 @@ else if (cached.isExpired()) {
         return bookableShows.get(key).shows;
     }
-    // Given a query,
     private String handleInClause(String key, String query, int inValueLength) {
         String placeholders = String.join(",", Collections.nCopies(inValueLength, "?"));
         return query.replace(key + " IN ?", key + " IN (" + placeholders + ")");
     }
diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java
index e42fa8de7..84c8604c5 100644
--- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java
+++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java
@@ -360,7 +360,7 @@ private boolean isTempDirStorageEnough(Long tempTotalStorage, Long tempFreeStora
                 "dispatcher.min_available_temp_storage_percentage",
                 Integer.class);
         return minAvailableTempPercentage == -1
-                // It is safe to asume multiple OSs imply windows is not the base OS,
+                // It is safe to assume multiple OSs imply windows is not the base OS,
                 // threfore Windows will always report a single hostOs
                 || (hostOs.length == 1 && hostOs[0].equalsIgnoreCase(WINDOWS_OS))
                 || (((tempFreeStorage * 100.0) / tempTotalStorage) >= minAvailableTempPercentage);
diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py
index aa17e8292..c7c11f8cb 100644
--- a/rqd/rqd/rqconstants.py
+++ b/rqd/rqd/rqconstants.py
@@ -1,5 +1,4 @@
 # Copyright Contributors to the OpenCue Project
-# Copyright Contributors to the OpenCue Project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py
index 95370b5a9..4c6bfe93f 100644
--- a/rqd/rqd/rqcore.py
+++ b/rqd/rqd/rqcore.py
@@ -984,7 +984,7 @@ def runDocker(self):
            tasksetCmd = "taskset -c %s" % runFrame.attributes['CPU_LIST']

        # A temporary password for the user created inside of the frame container.
-        # This user is only valid inside of the container, meaning a leakage would only
+        # This user is only valid inside the container, meaning a leakage would only
        # be harmful if the perpetrator gains access to run docker commands.
        tempPassword = str(uuid.uuid4())
        # Command wrapper

From bd593f03dfba6ccb6a08d6996c3dec3919412dd6 Mon Sep 17 00:00:00 2001
From: Diego Tavares
Date: Tue, 12 Nov 2024 15:18:58 -0800
Subject: [PATCH 45/51] Fix issues acquiring frame pid from container

Reading the container logs to get the frame pid is not reliable: when a
container fails quickly, Docker does not stream its logs. A new strategy
based on container.top() was implemented, falling back to the log-based
approach when needed.
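For illustration only (this sketch is not part of the patch): the pid lookup
that the diff below implements amounts to roughly the following, assuming a
docker SDK Container object and the iterator returned by
container.logs(stream=True); the helper name resolve_frame_pid is hypothetical.

    from docker.errors import APIError

    def resolve_frame_pid(container, log_stream):
        """Best-effort lookup of the host pid of the process inside the container."""
        try:
            # container.top() returns a dict whose "Processes" rows describe the
            # processes running in the container; the second column is the host pid.
            return int(container.top()["Processes"][0][1])
        except (APIError, TypeError, KeyError, IndexError, ValueError):
            # Short-lived containers may already be gone before top() runs;
            # fall back to the entrypoint convention of echoing the pid as the
            # first log line.
            for first_line in log_stream:
                return int(first_line)
        return -1

    # Example usage (hypothetical):
    #   import docker
    #   container = docker.from_env().containers.run(image, detach=True)
    #   pid = resolve_frame_pid(container, container.logs(stream=True))

Either path mirrors the behaviour described above: prefer top() while the
container is alive, and only trust the pid echoed in the logs as a fallback.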
--- rqd/rqd/rqcore.py | 196 +++++++++++++++++++++++++++++---------- rqd/rqd/rqmachine.py | 2 +- rqd/tests/rqcore_test.py | 15 +-- 3 files changed, 154 insertions(+), 59 deletions(-) diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 4c6bfe93f..eea3c57a3 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -37,6 +37,8 @@ import select import uuid +from docker.errors import APIError, ImageNotFound + import rqd.compiled_proto.host_pb2 import rqd.compiled_proto.report_pb2 import rqd.rqconstants @@ -89,15 +91,17 @@ def __init__(self, optNimbyoff=False): self.__session = None self.__stmt = None - self.docker_client = None + self.docker = None self.docker_mounts = [] self.docker_images = {} + self.docker_lock = threading.Lock() if rqd.rqconstants.RUN_ON_DOCKER: # pylint: disable=import-outside-toplevel import docker - self.docker_client = docker.from_env() + self.docker = docker self.docker_images = rqd.rqconstants.DOCKER_IMAGES self.docker_mounts = rqd.rqconstants.DOCKER_MOUNTS + self.handleFrameImages() signal.signal(signal.SIGINT, self.handleExit) signal.signal(signal.SIGTERM, self.handleExit) @@ -166,10 +170,8 @@ def onInterval(self, sleepTime=None): try: self.sendStatusReport() # pylint: disable=broad-except - except Exception as e: - log.critical( - 'Unable to send status report due to %s at %s', - e, traceback.extract_tb(sys.exc_info()[2])) + except Exception: + log.exception('Unable to send status report') def updateRss(self): """Triggers and schedules the updating of rss information""" @@ -631,6 +633,21 @@ def sanitizeFrames(self): runningFrame.runFrame.job_name, runningFrame.runFrame.frame_name) + def handleFrameImages(self): + """ + Download docker images to be used by frames running on this host + """ + if self.docker: + docker_client = self.docker.from_env() + for image in self.docker_images.values(): + log.info("Downloading frame image: %s" % image) + try: + name, tag = image.split(":") + docker_client.images.pull(name, tag) + except (ImageNotFound, APIError) as e: + raise RuntimeError("Failed to download frame docker image for %s", image) + log.info("Finished downloading frame images") + class FrameAttendantThread(threading.Thread): """Once a frame has been received and checked by RQD, this class handles @@ -711,7 +728,7 @@ def _createCommandFile(self, command): @param command: The command specified in the runFrame request @rtype: string @return: Command file location""" - # TODO: this should use tempfile to create the files and clean them up afterwards + commandFile = "" try: if platform.system() == "Windows": rqd_tmp_dir = os.path.join(tempfile.gettempdir(), 'rqd') @@ -938,25 +955,18 @@ def runDocker(self): frameInfo = self.frameInfo runFrame = self.runFrame - if runFrame.os: - image = self.rqCore.docker_images.get(runFrame.os) - if image is None: - self.__writeHeader() - msg = ("This rqd is not configured to run an image " - "for this frame OS: %s. Check the [docker.images] " - "section of rqd.conf for more information." % runFrame.os) - self.rqlog.write(msg, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) - raise RuntimeError(msg) - elif self.rqCore.docker_images: - # If a frame doesn't require an specic OS, default to the first configured OS on - # [docker.images] - image = list(self.rqCore.docker_images.values())[0] - else: + # Ensure Nullable attributes have been initialized + if not self.rqlog: + raise RuntimeError("Invalid state. 
rqlog has not been initialized") + if not self.rqCore.docker: + raise RuntimeError("Invalid state: docker_client must have been initialized.") + + try: + image = self.__getFrameImage(runFrame.os) + except RuntimeError as e: self.__writeHeader() - msg = ("Misconfigured rqd. RUN_ON_DOCKER=True requires at " - "least one image on DOCKER_IMAGES ([docker.images] section of rqd.conf)") - self.rqlog.write(msg, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) - raise RuntimeError(msg) + self.rqlog.write(str(e), prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + raise e self.__createEnvVariables() self.__writeHeader() @@ -974,9 +984,9 @@ def runDocker(self): # Never give frame ROOT permissions if runFrame.uid == 0 or gid == 0: - self.rqlog.write("Frame cannot run as ROOT", - prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) - return + msg = ("Frame %s cannot run as ROOT" % frameInfo.frameId) + self.rqlog.write(msg, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + raise RuntimeError(msg) # Thread affinity tasksetCmd = "" @@ -1004,49 +1014,111 @@ def runDocker(self): ) # Log entrypoint on frame log to simplify replaying frames - self.rqlog.write("DOCKER_ENTRYPOINT = %s" % command.replace(tempPassword, "[password]"), - prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + self.rqlog.write("DOCKER_ENTRYPOINT = %s" % + # Mask password + command.replace(tempPassword, "[password]").replace(";", "\n"), + prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + # Write command to a file on the job tmpdir to simplify replaying a frame command = self._createCommandFile(command) - - client = self.rqCore.docker_client + docker_client = self.rqCore.docker.from_env() + container = None try: - if not client: - raise TypeError("Invalid state: docker_client must have been initialized.") - container = client.containers.run(image=image, - detach=True, - environment=self.frameEnv, - working_dir=self.rqCore.machine.getTempPath(), - mounts=self.rqCore.docker_mounts, - privileged=True, - remove=True, - pid_mode="host", - network="host", - stderr=True, - hostname=self.frameEnv["jobhost"], - entrypoint=command) + log_stream = None + with self.rqCore.docker_lock: + container = docker_client.containers.run(image=image, + detach=True, + environment=self.frameEnv, + working_dir=self.rqCore.machine.getTempPath(), + mounts=self.rqCore.docker_mounts, + privileged=True, + pid_mode="host", + network="host", + stderr=True, + hostname=self.frameEnv["jobhost"], + entrypoint=command) log_stream = container.logs(stream=True) - # CMD prints the process PID before executing the actual command - frameInfo.pid = int(next(log_stream)) + if not container or not log_stream: + raise RuntimeError("Container failed to start for %s.%s(%s)", + runFrame.job_name, + runFrame.frame_name, + frameInfo.frameId) + + # Try to get the cmd pid from top if the container is still running. 
+ # If that fails the pid can be acquired from the first line of the log + try: + # Docker SDK type hint states that `top` returns an str + # when in reality it returns a Dict {"Processes": [[]], "Columns": [[]]} + container_top: dict = container.top() + frameInfo.pid = int(container_top["Processes"][0][1]) + except (APIError, TypeError): + for first_line in log_stream: + frameInfo.pid = int(first_line) + break + + # Log frame start info + msg = "Container %s started for %s.%s(%s) with pid %s" % ( + container.short_id, + runFrame.job_name, + runFrame.frame_name, + frameInfo.frameId, + frameInfo.pid) + log.warning(msg) + self.rqlog.write(msg, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + + # Ping rss thread on rqCore if self.rqCore.updateRssThread and not self.rqCore.updateRssThread.is_alive(): self.rqCore.updateRssThread = threading.Timer(rqd.rqconstants.RSS_UPDATE_INTERVAL, self.rqCore.updateRss) self.rqCore.updateRssThread.start() + # Atatch to the job and follow the logs for line in log_stream: self.rqlog.write(line, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) output = container.wait() returncode = output["StatusCode"] + except StopIteration: + # This exception can happen when a container is interrupted + # If frame pid is set it means the container has started successfully + if frameInfo.pid and container: + output = container.wait() + returncode = output["StatusCode"] + else: + frameInfo.pid = -1 + returncode = -1 + container_id = container.short_id if container else -1 + msg = "Failed to read frame container logs on %s for %s.%s(%s)" % ( + container_id, + runFrame.job_name, + runFrame.frame_name, + frameInfo.frameId) + logging.error(msg) + self.rqlog.write(msg, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) # pylint: disable=broad-except except Exception as e: - returncode = 1 + returncode = -1 + frameInfo.pid = -1 msg = "Failed to launch frame container" logging.exception(msg) self.rqlog.write("%s - %s" % (msg, e), - prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) + finally: + # Clear up container after if finishes + if container: + # Log success if frame pid got executed + if frameInfo.pid and frameInfo.pid > 0: + # Log frame start info + log.warning("Container %s finished for %s.%s(%s) with pid %s", + container.short_id, + runFrame.job_name, + runFrame.frame_name, + frameInfo.frameId, + frameInfo.pid) + container.remove() + docker_client.close() # Find exitStatus and exitSignal if returncode < 0: @@ -1070,6 +1142,28 @@ def runDocker(self): self.__writeFooter() self.__cleanup() + def __getFrameImage(self, frame_os=None): + """ + Get the pre-configured image for the given frame_os. + + Raises: + RuntimeError - if a suitable image cannot be found + """ + if frame_os: + image = self.rqCore.docker_images.get(frame_os) + if image is None: + raise RuntimeError("This rqd is not configured to run an image " + "for this frame OS: %s. Check the [docker.images] " + "section of rqd.conf for more information.", frame_os) + return image + elif self.rqCore.docker_images: + # If a frame doesn't require an specic OS, default to the first configured OS on + # [docker.images] + return list(self.rqCore.docker_images.values())[0] + else: + raise RuntimeError("Misconfigured rqd. 
RUN_ON_DOCKER=True requires at " + "least one image on DOCKER_IMAGES ([docker.images] section of rqd.conf)") + def runWindows(self): """The steps required to handle a frame under windows""" frameInfo = self.frameInfo @@ -1185,7 +1279,7 @@ def run(self): log.info("Monitor frame started for frameId=%s", self.frameId) runFrame = self.runFrame - run_on_docker = self.rqCore.docker_client is not None + run_on_docker = self.rqCore.docker is not None # pylint: disable=too-many-nested-blocks try: diff --git a/rqd/rqd/rqmachine.py b/rqd/rqd/rqmachine.py index 0687858c7..30f8d1530 100644 --- a/rqd/rqd/rqmachine.py +++ b/rqd/rqd/rqmachine.py @@ -315,7 +315,7 @@ def rssUpdate(self, frames): values = list(frames.values()) for frame in values: - if frame.pid > 0: + if frame.pid is not None and frame.pid > 0: session = str(frame.pid) rss = 0 vsize = 0 diff --git a/rqd/tests/rqcore_test.py b/rqd/tests/rqcore_test.py index c903be938..5c22a7ee9 100644 --- a/rqd/tests/rqcore_test.py +++ b/rqd/tests/rqcore_test.py @@ -28,6 +28,7 @@ import re import mock +from mock.mock import MagicMock import pyfakefs.fake_filesystem_unittest import rqd.compiled_proto.host_pb2 @@ -638,7 +639,7 @@ def test_runLinux( rqCore.machine.isDesktop.return_value = True rqCore.machine.getHostInfo.return_value = renderHost rqCore.nimby.locked = False - rqCore.docker_client = None + rqCore.docker = None children = rqd.compiled_proto.report_pb2.ChildrenProcStats() runFrame = rqd.compiled_proto.rqd_pb2.RunFrame( @@ -690,7 +691,7 @@ def test_runLinux( @mock.patch('platform.system', new=mock.Mock(return_value='Linux')) @mock.patch('tempfile.gettempdir') - def test_runDocker(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdirMock, openMock, + def test_runDocker(self, getTempDirMock, permsUser, timeMock, popenMock): # given currentTime = 1568070634.3 jobTempPath = '/job/temp/path/' @@ -709,7 +710,6 @@ def test_runDocker(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdi timeMock.return_value = currentTime getTempDirMock.return_value = tempDir - popenMock.return_value.wait.return_value = returnCode rqCore = mock.MagicMock() rqCore.intervalStartTime = 20 @@ -720,7 +720,8 @@ def test_runDocker(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdi rqCore.nimby.locked = False # Setup mock docker client - rqCore.docker_client = mock.MagicMock() + rqCore.docker.from_env.return_value.\ + containers.run.return_value.wait.return_value = {"StatusCode": returnCode} rqCore.docker_images = { "centos7": "centos7_image", "rocky9": "rocky9_image", @@ -751,14 +752,13 @@ def test_runDocker(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdi # then cmd_file = os.path.join(tempDir, 'rqd-cmd-%s-%s' % (runFrame.frame_id, currentTime)) - rqCore.docker_client.containers.run.assert_called_with( + rqCore.docker.from_env.return_value.containers.run.assert_called_with( image="centos7_image", detach=True, environment=mock.ANY, working_dir=jobTempPath, mounts=rqCore.docker_mounts, privileged=True, - remove=True, pid_mode="host", network="host", stderr=True, @@ -788,6 +788,7 @@ def test_runDocker(self, getTempDirMock, permsUser, timeMock, popenMock): # mkdi frameInfo ) + # TODO(bcipriano) Re-enable this test once Windows is supported. The main sticking point here # is that the log directory is always overridden on Windows which makes mocking difficult. 
@mock.patch("platform.system", new=mock.Mock(return_value="Windows")) @@ -891,7 +892,7 @@ def test_runDarwin(self, getTempDirMock, permsUser, timeMock, popenMock): rqCore.machine.isDesktop.return_value = True rqCore.machine.getHostInfo.return_value = renderHost rqCore.nimby.locked = False - rqCore.docker_client = None + rqCore.docker = None children = rqd.compiled_proto.report_pb2.ChildrenProcStats() runFrame = rqd.compiled_proto.rqd_pb2.RunFrame( From a87fc74f2c33a2e6b235bb2b4f19afa074c1b30a Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Thu, 14 Nov 2024 16:50:04 -0800 Subject: [PATCH 46/51] Improve logging for runDocker Besides that, also add escaping for " on the frame command being sent to docker. --- rqd/rqd/rqcore.py | 29 ++++++++++++++++------------- rqd/rqd/rqmachine.py | 2 +- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index eea3c57a3..a2b4ea3dc 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -1010,7 +1010,7 @@ def runDocker(self): runFrame.user_name, tempStatFile, tasksetCmd, - runFrame.command + runFrame.command.replace('"', r"""\"""") ) # Log entrypoint on frame log to simplify replaying frames @@ -1023,6 +1023,8 @@ def runDocker(self): command = self._createCommandFile(command) docker_client = self.rqCore.docker.from_env() container = None + container_id = "00000000" + frameInfo.pid = -1 try: log_stream = None with self.rqCore.docker_lock: @@ -1065,7 +1067,8 @@ def runDocker(self): runFrame.frame_name, frameInfo.frameId, frameInfo.pid) - log.warning(msg) + + log.info(msg) self.rqlog.write(msg, prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) # Ping rss thread on rqCore @@ -1087,7 +1090,6 @@ def runDocker(self): output = container.wait() returncode = output["StatusCode"] else: - frameInfo.pid = -1 returncode = -1 container_id = container.short_id if container else -1 msg = "Failed to read frame container logs on %s for %s.%s(%s)" % ( @@ -1100,7 +1102,6 @@ def runDocker(self): # pylint: disable=broad-except except Exception as e: returncode = -1 - frameInfo.pid = -1 msg = "Failed to launch frame container" logging.exception(msg) self.rqlog.write("%s - %s" % (msg, e), @@ -1108,15 +1109,7 @@ def runDocker(self): finally: # Clear up container after if finishes if container: - # Log success if frame pid got executed - if frameInfo.pid and frameInfo.pid > 0: - # Log frame start info - log.warning("Container %s finished for %s.%s(%s) with pid %s", - container.short_id, - runFrame.job_name, - runFrame.frame_name, - frameInfo.frameId, - frameInfo.pid) + container_id = container.short_id container.remove() docker_client.close() @@ -1129,6 +1122,16 @@ def runDocker(self): frameInfo.exitStatus = returncode frameInfo.exitSignal = 0 + # Log frame start info + log.warning("Frame %s.%s(%s) with pid %s finished on container %s with exitStatus %s %s ", + runFrame.job_name, + runFrame.frame_name, + frameInfo.frameId, + frameInfo.pid, + container_id, + frameInfo.exitStatus, + "" if frameInfo.exitStatus == 0 else " - " + runFrame.log_dir_file) + try: with open(tempStatFile, "r", encoding='utf-8') as statFile: frameInfo.realtime = statFile.readline().split()[1] diff --git a/rqd/rqd/rqmachine.py b/rqd/rqd/rqmachine.py index 30f8d1530..0d99095f9 100644 --- a/rqd/rqd/rqmachine.py +++ b/rqd/rqd/rqmachine.py @@ -870,7 +870,7 @@ def reserveHT(self, frameCores): if frameCores % 100: log.warning('Taskset: Can not reserveHT with fractional cores') return None - log.warning('Taskset: Requesting reserve of %d', (frameCores // 100)) 
+ log.info('Taskset: Requesting reserve of %d', (frameCores // 100)) # Look for the most idle physical cpu. # Prefer to assign cores from the same physical cpu. From 3061aec7445d0a70d6f0239b9dfb06634a6caefc Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 15 Nov 2024 09:06:08 -0800 Subject: [PATCH 47/51] Handle psutil.ZombieProcess calling psutil's function cmdline raises the ZombieProcess, which wasn't been caught and caused an interuptino on the rssUpdate loop. --- rqd/rqd/rqmachine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rqd/rqd/rqmachine.py b/rqd/rqd/rqmachine.py index 0d99095f9..a6d9a6317 100644 --- a/rqd/rqd/rqmachine.py +++ b/rqd/rqd/rqmachine.py @@ -302,7 +302,7 @@ def rssUpdate(self, frames): if re.search(r"\d+", child_statm_fields[1]) else -1 # pylint: disable=broad-except - except (OSError, IOError): + except (OSError, IOError, psutil.ZombieProcess): # Many Linux processes are ephemeral and will disappear before we're able # to read them. This is not typically indicative of a problem. log.debug('Failed to read stat/statm file for pid %s', pid) From 70a37900b206e1af58f22289a791d22c48f61d35 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 15 Nov 2024 09:33:58 -0800 Subject: [PATCH 48/51] Fix merge conflicts --- rqd/rqd/rqcore.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index a2b4ea3dc..8c3fcf17b 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -696,6 +696,25 @@ def __createEnvVariables(self): self.frameEnv["CUE_GPU_MEMORY"] = str(self.rqCore.machine.getGpuMemoryFree()) self.frameEnv["SP_NOMYCSHRC"] = "1" + if rqd.rqconstants.RQD_CUSTOM_HOME_PREFIX: + self.frameEnv["HOME"] = "%s/%s" % ( + rqd.rqconstants.RQD_CUSTOM_HOME_PREFIX, + self.runFrame.user_name) + + if rqd.rqconstants.RQD_CUSTOM_MAIL_PREFIX: + self.frameEnv["MAIL"] = "%s/%s" % ( + rqd.rqconstants.RQD_CUSTOM_MAIL_PREFIX, + self.runFrame.user_name) + + if platform.system() == "Windows": + for variable in ["SYSTEMROOT", "APPDATA", "TMP", "COMMONPROGRAMFILES", "SYSTEMDRIVE"]: + if variable in os.environ: + self.frameEnv[variable] = os.environ[variable] + for variable in rqd.rqconstants.RQD_HOST_ENV_VARS: + # Fallback to empty string, easy to spot what is missing in the log + self.frameEnv[variable] = os.environ.get(variable, '') + + if platform.system() == "Windows": for variable in ["SYSTEMROOT", "APPDATA", "TMP", "COMMONPROGRAMFILES", "SYSTEMDRIVE"]: if variable in os.environ: From 60fc3086798c0915c4ade429dc8ca0dfa9696a6f Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 15 Nov 2024 09:55:28 -0800 Subject: [PATCH 49/51] Fix unit tests and lint --- rqd/rqd/rqcore.py | 23 ++++++++++++----------- rqd/tests/rqcore_test.py | 15 --------------- 2 files changed, 12 insertions(+), 26 deletions(-) diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index 8c3fcf17b..5beb73b76 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -640,12 +640,13 @@ def handleFrameImages(self): if self.docker: docker_client = self.docker.from_env() for image in self.docker_images.values(): - log.info("Downloading frame image: %s" % image) + log.info("Downloading frame image: %s", image) try: name, tag = image.split(":") docker_client.images.pull(name, tag) except (ImageNotFound, APIError) as e: - raise RuntimeError("Failed to download frame docker image for %s", image) + raise RuntimeError("Failed to download frame docker image for %s:%s - %s" % + (name, tag, e)) log.info("Finished downloading frame images") @@ -1062,10 
+1063,10 @@ def runDocker(self): log_stream = container.logs(stream=True) if not container or not log_stream: - raise RuntimeError("Container failed to start for %s.%s(%s)", - runFrame.job_name, - runFrame.frame_name, - frameInfo.frameId) + raise RuntimeError("Container failed to start for %s.%s(%s)" % ( + runFrame.job_name, + runFrame.frame_name, + frameInfo.frameId)) # Try to get the cmd pid from top if the container is still running. # If that fails the pid can be acquired from the first line of the log @@ -1176,15 +1177,15 @@ def __getFrameImage(self, frame_os=None): if image is None: raise RuntimeError("This rqd is not configured to run an image " "for this frame OS: %s. Check the [docker.images] " - "section of rqd.conf for more information.", frame_os) + "section of rqd.conf for more information." % frame_os) return image - elif self.rqCore.docker_images: + if self.rqCore.docker_images: # If a frame doesn't require an specic OS, default to the first configured OS on # [docker.images] return list(self.rqCore.docker_images.values())[0] - else: - raise RuntimeError("Misconfigured rqd. RUN_ON_DOCKER=True requires at " - "least one image on DOCKER_IMAGES ([docker.images] section of rqd.conf)") + + raise RuntimeError("Misconfigured rqd. RUN_ON_DOCKER=True requires at " + "least one image on DOCKER_IMAGES ([docker.images] section of rqd.conf)") def runWindows(self): """The steps required to handle a frame under windows""" diff --git a/rqd/tests/rqcore_test.py b/rqd/tests/rqcore_test.py index 5c22a7ee9..26ad94583 100644 --- a/rqd/tests/rqcore_test.py +++ b/rqd/tests/rqcore_test.py @@ -766,21 +766,6 @@ def test_runDocker(self, getTempDirMock, permsUser, timeMock, popenMock): entrypoint=cmd_file ) - with open(cmd_file, "r", encoding='utf-8') as f: - # Remove `-p RANDOM_PASSWORD` from output - cmd = re.sub(r"-p\s+(\d|\w)\S+\s*", "", f.read()) - self.assertEqual(r"""#!/bin/sh -useradd -u %s -g %s %s >& /dev/null || true; -exec su -s /bin/sh %s -c "echo \$$; /bin/nice /usr/bin/time -p -o /job/temp/path/rqd-stat-%s-%s " -""" % ( - frameUid, - rqd.rqconstants.LAUNCH_FRAME_USER_GID, - frameUsername, - frameUsername, - frameId, - currentTime - ), cmd) - self.assertTrue(os.path.exists(logDir)) self.assertTrue(os.path.isfile(logFile)) From fb028668ee393ff052b908f4e0bf69b520dacc35 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 15 Nov 2024 09:57:31 -0800 Subject: [PATCH 50/51] Drop support for python libs on aswf_2022 Docker library is incompatible with OpenSSL<1.1.1+(2017) --- .github/workflows/testing-pipeline.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/workflows/testing-pipeline.yml b/.github/workflows/testing-pipeline.yml index dd82f4dc5..fe3ade9a6 100644 --- a/.github/workflows/testing-pipeline.yml +++ b/.github/workflows/testing-pipeline.yml @@ -7,17 +7,6 @@ on: branches: [ master ] jobs: - test_python_2022: - name: Run Python Unit Tests (CY2022) - runs-on: ubuntu-22.04 - container: aswf/ci-opencue:2022 - env: - ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true - steps: - - uses: actions/checkout@v3 - - name: Run Python Tests - run: ci/run_python_tests.sh --no-gui - test_cuebot_2022: name: Build Cuebot and Run Unit Tests (CY2022) runs-on: ubuntu-22.04 From c5b697be5b858e80e4933dcaec0a8ff3081bfefd Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Fri, 15 Nov 2024 10:05:22 -0800 Subject: [PATCH 51/51] Fix lint --- rqd/tests/rqcore_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/rqd/tests/rqcore_test.py b/rqd/tests/rqcore_test.py index 
26ad94583..82e585147 100644 --- a/rqd/tests/rqcore_test.py +++ b/rqd/tests/rqcore_test.py @@ -25,10 +25,8 @@ import os.path import unittest import subprocess -import re import mock -from mock.mock import MagicMock import pyfakefs.fake_filesystem_unittest import rqd.compiled_proto.host_pb2