From 75506df29ff6e05a51962fc32ae3d644ccbbeee0 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Wed, 4 Oct 2023 11:18:45 -0700 Subject: [PATCH] Limit the number of kill retries There's an error condition on rqd where a frame that cannot be killed will end up preventing the host from picking up new jobs. This logic limits the number of repeated killRequests to give host a chance to pick up new jobs. At the same time, blocked frames are logged to spcue.log to be handled manually. (cherry picked from commit aea4864ef66aca494fb455a7c103e4a832b63d41) --- .../com/imageworks/spcue/FrameInterface.java | 1 - .../spcue/dispatcher/DispatchSupport.java | 2 +- .../dispatcher/DispatchSupportService.java | 11 +- .../spcue/dispatcher/Dispatcher.java | 9 +- .../spcue/dispatcher/HostReportHandler.java | 309 ++++++++++-------- .../commands/DispatchRqdKillFrameMemory.java | 8 +- cuebot/src/main/resources/opencue.properties | 16 +- .../dispatcher/HostReportHandlerTests.java | 10 +- cuebot/src/test/resources/opencue.properties | 11 +- rqd/rqd/rqdservicers.py | 2 + rqd/rqd/rqnetwork.py | 1 + 11 files changed, 211 insertions(+), 169 deletions(-) diff --git a/cuebot/src/main/java/com/imageworks/spcue/FrameInterface.java b/cuebot/src/main/java/com/imageworks/spcue/FrameInterface.java index eff22768f..945685444 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/FrameInterface.java +++ b/cuebot/src/main/java/com/imageworks/spcue/FrameInterface.java @@ -1,4 +1,3 @@ - /* * Copyright Contributors to the OpenCue Project * diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java index eca3939ed..6d142e89d 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java @@ -420,7 +420,7 @@ List findNextDispatchFrames(LayerInterface layer, VirtualProc pro * * @param frame */ - void updateFrameMemoryError(FrameInterface frame); + boolean updateFrameMemoryError(FrameInterface frame); /** * Update Memory usage data and LLU time for the given frame. diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java index 1d2fead26..713f6c86c 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java @@ -42,6 +42,7 @@ import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.LogManager; import org.springframework.dao.EmptyResultDataAccessException; +import org.springframework.dao.DataAccessException; import org.springframework.transaction.annotation.Propagation; import org.springframework.transaction.annotation.Transactional; @@ -184,7 +185,11 @@ public boolean increaseReservedMemory(ProcInterface p, long value) { @Override public boolean clearVirtualProcAssignement(ProcInterface proc) { - return procDao.clearVirtualProcAssignment(proc); + try { + return procDao.clearVirtualProcAssignment(proc); + } catch (DataAccessException e) { + return false; + } } @Transactional(propagation = Propagation.REQUIRED) @@ -345,8 +350,8 @@ public void clearFrame(DispatchFrame frame) { @Override @Transactional(propagation = Propagation.REQUIRED) - public void updateFrameMemoryError(FrameInterface frame) { - frameDao.updateFrameMemoryError(frame); + public boolean updateFrameMemoryError(FrameInterface frame) { + return frameDao.updateFrameMemoryError(frame); } @Transactional(propagation = Propagation.SUPPORTS) diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/Dispatcher.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/Dispatcher.java index 2e83d1b6b..072b04113 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/Dispatcher.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/Dispatcher.java @@ -107,13 +107,8 @@ public interface Dispatcher { // without being penalized for it. public static final long VIRTUAL_MEM_THRESHHOLD = CueUtil.GB2; - // Percentage of used memory to consider a risk for triggering oom-killer - public static final double OOM_MAX_SAFE_USED_MEMORY_THRESHOLD = 0.95; - - // How much can a frame exceed its reserved memory. - // - 0.5 means 50% above reserve - // - -1.0 makes the feature inactive - public static final double OOM_FRAME_OVERBOARD_ALLOWED_THRESHOLD = 0.6; + // How long to keep track of a frame kill request + public static final int FRAME_KILL_CACHE_EXPIRE_AFTER_WRITE_MINUTES = 3; // A higher number gets more deep booking but less spread on the cue. public static final int DEFAULT_MAX_FRAMES_PER_PASS = 4; diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java index 4a7a756d4..a1dc007af 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java @@ -24,8 +24,12 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.ExecutionException; import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; +import com.google.common.cache.Cache; +import com.google.common.cache.CacheBuilder; import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.LogManager; import org.springframework.beans.factory.annotation.Autowired; @@ -93,6 +97,10 @@ public class HostReportHandler { "space on the temporary directory (mcp)"; private static final String CUEBOT_COMMENT_USER = "cuebot"; + Cache killRequestCounterCache = CacheBuilder.newBuilder() + .expireAfterWrite(FRAME_KILL_CACHE_EXPIRE_AFTER_WRITE_MINUTES, TimeUnit.MINUTES) + .build(); + /** * Boolean to toggle if this class is accepting data or not. */ @@ -466,6 +474,10 @@ private void changeLockState(DispatchHost host, CoreDetail coreInfo) { * @param report */ private void handleMemoryUsage(final DispatchHost host, final HostReport report) { + final double OOM_MAX_SAFE_USED_MEMORY_THRESHOLD = + env.getRequiredProperty("dispatcher.oom_max_safe_used_memory_threshold", Double.class); + final double OOM_FRAME_OVERBOARD_ALLOWED_THRESHOLD = + env.getRequiredProperty("dispatcher.oom_frame_overboard_allowed_threshold", Double.class); RenderHost renderHost = report.getHost(); List runningFrames = report.getFramesList(); @@ -474,21 +486,17 @@ private void handleMemoryUsage(final DispatchHost host, final HostReport report) (1.0 - OOM_MAX_SAFE_USED_MEMORY_THRESHOLD)); if (memoryWarning) { - VirtualProc killedProc = killWorstMemoryOffender(host); long memoryAvailable = renderHost.getFreeMem(); long minSafeMemoryAvailable = (long)(renderHost.getTotalMem() * (1.0 - OOM_MAX_SAFE_USED_MEMORY_THRESHOLD)); - // Some extra protection for this possibly unbound loop - int unboundProtectionLimit = 10; - while (killedProc != null && unboundProtectionLimit > 0) { - memoryAvailable = memoryAvailable + killedProc.memoryUsed; - - // If killing this proc solved the memory issue, stop the attack - if (memoryAvailable >= minSafeMemoryAvailable) { - break; + // Only allow killing up to 10 frames at a time + int killAttemptsRemaining = 10; + do { + VirtualProc killedProc = killWorstMemoryOffender(host); + killAttemptsRemaining -= 1; + if (killedProc != null) { + memoryAvailable = memoryAvailable + killedProc.memoryUsed; } - killedProc = killWorstMemoryOffender(host); - unboundProtectionLimit -= 1; - } + } while (killAttemptsRemaining > 0 && memoryAvailable < minSafeMemoryAvailable); } else { // When no mass cleaning was required, check for frames going overboard // if frames didn't go overboard, manage its reservations trying to increase @@ -506,6 +514,23 @@ private void handleMemoryUsage(final DispatchHost host, final HostReport report) } } + public enum KillCause { + FrameOverboard("This frame is using way more than it had reserved."), + HostUnderOom("Frame killed by host under Oom pressure"), + FrameTimedOut("Frame timed out"), + FrameLluTimedOut("Frame LLU timed out"), + FrameVerificationFailure("Frame failed to be verified on the database"); + private final String message; + + private KillCause(String message) { + this.message = message; + } + @Override + public String toString() { + return message; + } + } + private boolean killFrameOverusingMemory(RunningFrameInfo frame, String hostname) { try { VirtualProc proc = hostManager.getVirtualProc(frame.getResourceId()); @@ -517,32 +542,87 @@ private boolean killFrameOverusingMemory(RunningFrameInfo frame, String hostname logger.info("Killing frame on " + frame.getJobName() + "." + frame.getFrameName() + ", using too much memory."); - return killProcForMemory(proc, hostname, "This frame is using way more than it had reserved."); + return killProcForMemory(proc, hostname, KillCause.FrameOverboard); } catch (EmptyResultDataAccessException e) { return false; } } - private boolean killProcForMemory(VirtualProc proc, String hostname, String reason) { + private boolean getKillClearance(String hostname, String frameId) { + String cacheKey = hostname + "-" + frameId; + final int FRAME_KILL_RETRY_LIMIT = + env.getRequiredProperty("dispatcher.frame_kill_retry_limit", Integer.class); + + // Cache frame+host receiving a killRequest and count how many times the request is being retried + // meaning rqd is probably failing at attempting to kill the related proc + long cachedCount; try { - FrameInterface frame = jobManager.getFrame(proc.frameId); + cachedCount = 1 + killRequestCounterCache.get(cacheKey, () -> 0L); + } catch (ExecutionException e) { + return false; + } + killRequestCounterCache.put(cacheKey, cachedCount); + if (cachedCount > FRAME_KILL_RETRY_LIMIT) { + // If the kill retry limit has been reached, notify prometheus of the issue and give up + if (!dispatcher.isTestMode()) { + FrameInterface frame = jobManager.getFrame(frameId); + JobInterface job = jobManager.getJob(frame.getJobId()); + prometheusMetrics.incrementFrameKillFailureCounter( + hostname, + job.getName(), + frame.getName(), + frameId); + } + return false; + } + return true; + } - if (dispatcher.isTestMode()) { - // For testing, don't run on a different threadpool, as different threads don't share - // the same database state - (new DispatchRqdKillFrameMemory(proc.hostName, frame, reason, rqdClient, - dispatchSupport, dispatcher.isTestMode())).run(); - } else { - killQueue.execute(new DispatchRqdKillFrameMemory(proc.hostName, frame, reason, rqdClient, + private boolean killProcForMemory(VirtualProc proc, String hostname, KillCause killCause) { + if (!getKillClearance(hostname, proc.frameId)) { + return false; + } + + FrameInterface frame = jobManager.getFrame(proc.frameId); + if (dispatcher.isTestMode()) { + // Different threads don't share the same database state on the test environment + (new DispatchRqdKillFrameMemory(hostname, frame, killCause.toString(), rqdClient, + dispatchSupport, dispatcher.isTestMode())).run(); + } else { + try { + killQueue.execute(new DispatchRqdKillFrameMemory(hostname, frame, killCause.toString(), rqdClient, dispatchSupport, dispatcher.isTestMode())); - prometheusMetrics.incrementFrameOomKilledCounter(hostname); + prometheusMetrics.incrementFrameKilledCounter(hostname, killCause); + } catch (TaskRejectedException e) { + logger.warn("Unable to add a DispatchRqdKillFrame request, task rejected, " + e); + return false; } - DispatchSupport.killedOffenderProcs.incrementAndGet(); - return true; - } catch (TaskRejectedException e) { - logger.warn("Unable to queue RQD kill, task rejected, " + e); + } + DispatchSupport.killedOffenderProcs.incrementAndGet(); + return true; + } + + private boolean killFrame(String frameId, String hostname, KillCause killCause) { + if (!getKillClearance(hostname, frameId)) { return false; } + + if (dispatcher.isTestMode()) { + // Different threads don't share the same database state on the test environment + (new DispatchRqdKillFrame(hostname, frameId, killCause.toString(), rqdClient)).run(); + } else { + try { + killQueue.execute(new DispatchRqdKillFrame(hostname, + frameId, + killCause.toString(), + rqdClient)); + prometheusMetrics.incrementFrameKilledCounter(hostname, killCause); + } catch (TaskRejectedException e) { + logger.warn("Unable to add a DispatchRqdKillFrame request, task rejected, " + e); + } + } + DispatchSupport.killedOffenderProcs.incrementAndGet(); + return true; } /** @@ -556,8 +636,7 @@ private VirtualProc killWorstMemoryOffender(final DispatchHost host) { VirtualProc proc = hostManager.getWorstMemoryOffender(host); logger.info("Killing frame on " + proc.getName() + ", host is under stress."); - if (!killProcForMemory(proc, host.getName(), "The host was dangerously low on memory and swapping.")) { - // Returning null will prevent the caller from overflowing the kill queue with more messages + if (!killProcForMemory(proc, host.getName(), KillCause.HostUnderOom)) { proc = null; } return proc; @@ -576,6 +655,8 @@ private VirtualProc killWorstMemoryOffender(final DispatchHost host) { private boolean isFrameOverboard(final RunningFrameInfo frame) { double rss = (double)frame.getRss(); double maxRss = (double)frame.getMaxRss(); + final double OOM_FRAME_OVERBOARD_ALLOWED_THRESHOLD = + env.getRequiredProperty("dispatcher.oom_frame_overboard_allowed_threshold", Double.class); final double MAX_RSS_OVERBOARD_THRESHOLD = OOM_FRAME_OVERBOARD_ALLOWED_THRESHOLD * 2; final double RSS_AVAILABLE_FOR_MAX_RSS_TRIGGER = 0.1; @@ -659,7 +740,6 @@ private void handleMemoryReservations(final RunningFrameInfo frame) { * @param rFrames */ private void killTimedOutFrames(HostReport report) { - final Map layers = new HashMap(5); for (RunningFrameInfo frame: report.getFramesList()) { @@ -667,36 +747,16 @@ private void killTimedOutFrames(HostReport report) { LayerDetail layer = layerDao.getLayerDetail(layerId); long runtimeMinutes = ((System.currentTimeMillis() - frame.getStartTime()) / 1000l) / 60; - if (layer.timeout != 0 && runtimeMinutes > layer.timeout){ - try { - killQueue.execute(new DispatchRqdKillFrame(report.getHost().getName(), - frame.getFrameId(), - "This frame has reached it timeout.", - rqdClient)); - } catch (TaskRejectedException e) { - logger.warn("Unable to queue RQD kill, task rejected, " + e); - } - } - - if (layer.timeout_llu == 0){ - continue; - } - - if (frame.getLluTime() == 0){ - continue; - } + String hostname = report.getHost().getName(); - long r = System.currentTimeMillis() / 1000; - long lastUpdate = (r - frame.getLluTime()) / 60; + if (layer.timeout != 0 && runtimeMinutes > layer.timeout){ + killFrame(frame.getFrameId(), hostname, KillCause.FrameTimedOut); + } else if (layer.timeout_llu != 0 && frame.getLluTime() != 0) { + long r = System.currentTimeMillis() / 1000; + long lastUpdate = (r - frame.getLluTime()) / 60; - if (layer.timeout_llu != 0 && lastUpdate > (layer.timeout_llu -1)){ - try { - killQueue.execute(new DispatchRqdKillFrame(report.getHost().getName(), - frame.getFrameId(), - "This frame has reached it LLU timeout.", - rqdClient)); - } catch (TaskRejectedException e) { - logger.warn("Unable to queue RQD kill, task rejected, " + e); + if (layer.timeout_llu != 0 && lastUpdate > (layer.timeout_llu - 1)){ + killFrame(frame.getFrameId(), hostname, KillCause.FrameLluTimedOut); } } } @@ -822,98 +882,59 @@ public void verifyRunningFrameInfo(HostReport report) { continue; } + if (hostManager.verifyRunningProc(runningFrame.getResourceId(), runningFrame.getFrameId())) { + runningFrames.add(runningFrame); + continue; + } - if (!hostManager.verifyRunningProc(runningFrame.getResourceId(), - runningFrame.getFrameId())) { - - /* - * The frame this proc is running is no longer - * assigned to this proc. Don't ever touch - * the frame record. If we make it here that means - * the proc has been running for over 2 min. - */ - - String msg; - VirtualProc proc = null; + /* + * The frame this proc is running is no longer + * assigned to this proc. Don't ever touch + * the frame record. If we make it here that means + * the proc has been running for over 2 min. + */ + String msg; + VirtualProc proc = null; - try { - proc = hostManager.getVirtualProc(runningFrame.getResourceId()); - msg = "Virutal proc " + proc.getProcId() + + try { + proc = hostManager.getVirtualProc(runningFrame.getResourceId()); + msg = "Virtual proc " + proc.getProcId() + "is assigned to " + proc.getFrameId() + " not " + runningFrame.getFrameId(); - } - catch (Exception e) { - /* - * This will happen if the host goes off line and then - * comes back. In this case, we don't touch the frame - * since it might already be running somewhere else. We - * do however kill the proc. - */ - msg = "Virtual proc did not exist."; - } - - logger.info("warning, the proc " + - runningFrame.getResourceId() + " on host " + - report.getHost().getName() + " was running for " + - (runtimeSeconds / 60.0f) + " minutes " + - runningFrame.getJobName() + "/" + runningFrame.getFrameName() + - "but the DB did not " + - "reflect this " + - msg); - - DispatchSupport.accountingErrors.incrementAndGet(); - - try { - /* - * If the proc did exist unbook it if we can't - * verify its running something. - */ - boolean rqd_kill = false; - if (proc != null) { - - /* - * Check to see if the proc is an orphan. - */ - if (hostManager.isOprhan(proc)) { - dispatchSupport.clearVirtualProcAssignement(proc); - dispatchSupport.unbookProc(proc); - rqd_kill = true; - } - } - else { - /* Proc doesn't exist so a kill won't hurt */ - rqd_kill = true; - } - - if (rqd_kill) { - try { - killQueue.execute(new DispatchRqdKillFrame(report.getHost().getName(), - runningFrame.getFrameId(), - "OpenCue could not verify this frame.", - rqdClient)); - } catch (TaskRejectedException e) { - logger.warn("Unable to queue RQD kill, task rejected, " + e); - } - } + } + catch (Exception e) { + /* + * This will happen if the host goes offline and then + * comes back. In this case, we don't touch the frame + * since it might already be running somewhere else. We + * do however kill the proc. + */ + msg = "Virtual proc did not exist."; + } - } catch (RqdClientException rqde) { - logger.warn("failed to kill " + - runningFrame.getJobName() + "/" + - runningFrame.getFrameName() + - " when trying to clear a failed " + - " frame verification, " + rqde); - - } catch (Exception e) { - CueExceptionUtil.logStackTrace("failed", e); - logger.warn("failed to verify " + - runningFrame.getJobName() + "/" + - runningFrame.getFrameName() + - " was running but the frame was " + - " unable to be killed, " + e); - } + DispatchSupport.accountingErrors.incrementAndGet(); + if (proc != null && hostManager.isOprhan(proc)) { + dispatchSupport.clearVirtualProcAssignement(proc); + dispatchSupport.unbookProc(proc); + proc = null; } - else { - runningFrames.add(runningFrame); + if (proc == null) { + if (killFrame(runningFrame.getFrameId(), + report.getHost().getName(), + KillCause.FrameVerificationFailure)) { + logger.info("FrameVerificationError, the proc " + + runningFrame.getResourceId() + " on host " + + report.getHost().getName() + " was running for " + + (runtimeSeconds / 60.0f) + " minutes " + + runningFrame.getJobName() + "/" + runningFrame.getFrameName() + + "but the DB did not " + + "reflect this. " + + msg); + } else { + logger.warn("FrameStuckWarning: frameId=" + runningFrame.getFrameId() + + " render_node=" + report.getHost().getName() + " - " + + runningFrame.getJobName() + "/" + runningFrame.getFrameName()); + } } } } diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchRqdKillFrameMemory.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchRqdKillFrameMemory.java index 83d5f188c..53aff120e 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchRqdKillFrameMemory.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchRqdKillFrameMemory.java @@ -53,12 +53,14 @@ public DispatchRqdKillFrameMemory(String hostname, FrameInterface frame, String public void run() { long startTime = System.currentTimeMillis(); try { - if (!isTestMode) { + if (dispatchSupport.updateFrameMemoryError(frame) && !isTestMode) { rqdClient.killFrame(hostname, frame.getFrameId(), message); + } else { + logger.warn("Could not update frame " + frame.getFrameId() + + " status to EXIT_STATUS_MEMORY_FAILURE. Canceling kill request!"); } - dispatchSupport.updateFrameMemoryError(frame); } catch (RqdClientException e) { - logger.info("Failed to contact host " + hostname + ", " + e); + logger.warn("Failed to contact host " + hostname + ", " + e); } finally { long elapsedTime = System.currentTimeMillis() - startTime; logger.info("RQD communication with " + hostname + diff --git a/cuebot/src/main/resources/opencue.properties b/cuebot/src/main/resources/opencue.properties index 6b2875899..b7f2a23ff 100644 --- a/cuebot/src/main/resources/opencue.properties +++ b/cuebot/src/main/resources/opencue.properties @@ -130,7 +130,21 @@ dispatcher.booking_queue.max_pool_size=6 # Queue capacity for booking. dispatcher.booking_queue.queue_capacity=1000 -# Whether or not to satisfy dependents (*_ON_FRAME and *_ON_LAYER) only on Frame success +# Percentage of used memory to consider a risk for triggering oom-killer +dispatcher.oom_max_safe_used_memory_threshold=0.95 + +# How much can a frame exceed its reserved memory. +# - 0.5 means 50% above reserve +# - -1.0 makes the feature inactive +# This feature is being kept inactive for now as we work on improving the +# frame retry logic (See commit comment for more details). +dispatcher.oom_frame_overboard_allowed_threshold=-1.0 + +# How many times should cuebot send a kill request for the same frame-host before reporting +# the frame as stuck +dispatcher.frame_kill_retry_limit=3 + +# Whether to satisfy dependents (*_ON_FRAME and *_ON_LAYER) only on Frame success depend.satisfy_only_on_frame_success=true # Jobs will be archived to the history tables after being completed for this long. diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java index 9538aa4fb..25499406d 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java @@ -458,6 +458,8 @@ public void testMemoryAndLlu() { @Rollback(true) public void testMemoryAggressionRss() { jobLauncher.testMode = true; + dispatcher.setTestMode(true); + jobLauncher.launch(new File("src/test/resources/conf/jobspec/jobspec_simple.xml")); DispatchHost host = getHost(hostname); @@ -465,8 +467,8 @@ public void testMemoryAggressionRss() { assertEquals(1, procs.size()); VirtualProc proc = procs.get(0); - long memoryOverboard = (long) Math.ceil((double) proc.memoryReserved * - (1.0 + Dispatcher.OOM_FRAME_OVERBOARD_ALLOWED_THRESHOLD)); + // 1.6 = 1 + dispatcher.oom_frame_overboard_allowed_threshold + long memoryOverboard = (long) Math.ceil((double) proc.memoryReserved * 1.6); // Test rss overboard RunningFrameInfo info = RunningFrameInfo.newBuilder() @@ -493,6 +495,7 @@ public void testMemoryAggressionRss() { @Rollback(true) public void testMemoryAggressionMaxRss() { jobLauncher.testMode = true; + dispatcher.setTestMode(true); jobLauncher.launch(new File("src/test/resources/conf/jobspec/jobspec_simple.xml")); DispatchHost host = getHost(hostname); @@ -500,8 +503,9 @@ public void testMemoryAggressionMaxRss() { assertEquals(1, procs.size()); VirtualProc proc = procs.get(0); + // 0.6 = dispatcher.oom_frame_overboard_allowed_threshold long memoryOverboard = (long) Math.ceil((double) proc.memoryReserved * - (1.0 + (2 * Dispatcher.OOM_FRAME_OVERBOARD_ALLOWED_THRESHOLD))); + (1.0 + (2 * 0.6))); // Test rss>90% and maxRss overboard RunningFrameInfo info = RunningFrameInfo.newBuilder() diff --git a/cuebot/src/test/resources/opencue.properties b/cuebot/src/test/resources/opencue.properties index fed3f8e99..10516f881 100644 --- a/cuebot/src/test/resources/opencue.properties +++ b/cuebot/src/test/resources/opencue.properties @@ -64,9 +64,8 @@ dispatcher.kill_queue.queue_capacity=1000 dispatcher.booking_queue.core_pool_size=6 dispatcher.booking_queue.max_pool_size=6 dispatcher.booking_queue.queue_capacity=1000 - -# The minimum amount of free space in the temporary directory (mcp) to book a host. -# E.g: 1G = 1048576 kB => dispatcher.min_bookable_free_temp_dir_kb=1048576 -# Default = 1G = 1048576 kB -# If equals to -1, it means the feature is turned off -dispatcher.min_bookable_free_temp_dir_kb=1048576 \ No newline at end of file +dispatcher.min_bookable_free_temp_dir_kb=1048576 +dispatcher.min_bookable_free_mcp_kb=1048576 +dispatcher.oom_max_safe_used_memory_threshold=0.95 +dispatcher.oom_frame_overboard_allowed_threshold=0.6 +dispatcher.frame_kill_retry_limit=3 \ No newline at end of file diff --git a/rqd/rqd/rqdservicers.py b/rqd/rqd/rqdservicers.py index 98ab358ac..b736ef43b 100644 --- a/rqd/rqd/rqdservicers.py +++ b/rqd/rqd/rqdservicers.py @@ -67,6 +67,8 @@ def KillRunningFrame(self, request, context): frame = self.rqCore.getRunningFrame(request.frame_id) if frame: frame.kill(message=request.message) + else: + log.warning("Wasn't able to find frame(%s) to kill", request.frame_id) return rqd.compiled_proto.rqd_pb2.RqdStaticKillRunningFrameResponse() def ShutdownRqdNow(self, request, context): diff --git a/rqd/rqd/rqnetwork.py b/rqd/rqd/rqnetwork.py index 9f33e64a2..9d3591fdd 100644 --- a/rqd/rqd/rqnetwork.py +++ b/rqd/rqd/rqnetwork.py @@ -162,6 +162,7 @@ def kill(self, message=""): else: os.killpg(self.pid, rqd.rqconstants.KILL_SIGNAL) finally: + log.warning("kill() successfully killed frameId=%s pid=%s", self.frameId, self.pid) rqd.rqutil.permissionsLow() except OSError as e: log.warning(