From a4c832dfd4e258de73bffa34d8e5b95246722724 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Mon, 18 Sep 2023 16:53:08 -0700 Subject: [PATCH] Frames killed for OOM should be retried (cherry picked from commit b88f7bcb1ad43f83fb8357576c33483dc2bf4952) --- .../com/imageworks/spcue/dao/FrameDao.java | 7 ++ .../spcue/dao/postgres/FrameDaoJdbc.java | 18 +++++ .../spcue/dispatcher/DispatchSupport.java | 7 ++ .../dispatcher/DispatchSupportService.java | 6 ++ .../dispatcher/FrameCompleteHandler.java | 72 +++++++++---------- .../spcue/dispatcher/HostReportHandler.java | 60 ++++++++-------- .../commands/DispatchRqdKillFrame.java | 20 +----- .../commands/DispatchRqdKillFrameMemory.java | 69 ++++++++++++++++++ .../spring/applicationContext-service.xml | 1 - .../dispatcher/FrameCompleteHandlerTests.java | 6 +- .../dispatcher/HostReportHandlerTests.java | 43 ++++++++++- 11 files changed, 222 insertions(+), 87 deletions(-) create mode 100644 cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchRqdKillFrameMemory.java diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/FrameDao.java b/cuebot/src/main/java/com/imageworks/spcue/dao/FrameDao.java index 7c2e3c050..8b68f970e 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/FrameDao.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/FrameDao.java @@ -202,6 +202,13 @@ boolean updateFrameStopped(FrameInterface frame, FrameState state, int exitStatu * @return */ boolean updateFrameCleared(FrameInterface frame); + /** + * Sets a frame exitStatus to EXIT_STATUS_MEMORY_FAILURE + * + * @param frame + * @return + */ + boolean updateFrameMemoryError(FrameInterface frame); /** * Sets a frame to an unreserved waiting state. diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java index 3c752ad96..21c197a3b 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java @@ -155,6 +155,24 @@ public boolean updateFrameCleared(FrameInterface frame) { return updateFrame(frame, Dispatcher.EXIT_STATUS_FRAME_CLEARED) > 0; } + private static final String UPDATE_FRAME_MEMORY_ERROR = + "UPDATE "+ + "frame "+ + "SET " + + "int_exit_status = ?, " + + "int_version = int_version + 1 " + + "WHERE " + + "frame.pk_frame = ? "; + @Override + public boolean updateFrameMemoryError(FrameInterface frame) { + int result = getJdbcTemplate().update( + UPDATE_FRAME_MEMORY_ERROR, + Dispatcher.EXIT_STATUS_MEMORY_FAILURE, + frame.getFrameId()); + + return result > 0; + } + private static final String UPDATE_FRAME_STARTED = "UPDATE " + "frame " + diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java index 4accd0f8d..eca3939ed 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java @@ -415,6 +415,13 @@ List findNextDispatchFrames(LayerInterface layer, VirtualProc pro */ void clearFrame(DispatchFrame frame); + /** + * Sets the frame state exitStatus to EXIT_STATUS_MEMORY_FAILURE + * + * @param frame + */ + void updateFrameMemoryError(FrameInterface frame); + /** * Update Memory usage data and LLU time for the given frame. * diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java index 887d0c29e..1d2fead26 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java @@ -343,6 +343,12 @@ public void clearFrame(DispatchFrame frame) { frameDao.updateFrameCleared(frame); } + @Override + @Transactional(propagation = Propagation.REQUIRED) + public void updateFrameMemoryError(FrameInterface frame) { + frameDao.updateFrameMemoryError(frame); + } + @Transactional(propagation = Propagation.SUPPORTS) public RunFrame prepareRqdRunFrame(VirtualProc proc, DispatchFrame frame) { int threads = proc.coresReserved / 100; diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java index 55336aaf4..6e2653ac8 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java @@ -143,49 +143,35 @@ public void handleFrameCompleteReport(final FrameCompleteReport report) { } try { - - final VirtualProc proc; - - try { - - proc = hostManager.getVirtualProc( - report.getFrame().getResourceId()); - } - catch (EmptyResultDataAccessException e) { - /* - * Do not propagate this exception to RQD. This - * usually means the cue lost connectivity to - * the host and cleared out the record of the proc. - * If this is propagated back to RQD, RQD will - * keep retrying the operation forever. - */ - logger.info("failed to acquire data needed to " + - "process completed frame: " + - report.getFrame().getFrameName() + " in job " + - report.getFrame().getJobName() + "," + e); - return; - } - + final VirtualProc proc = hostManager.getVirtualProc(report.getFrame().getResourceId()); final DispatchJob job = jobManager.getDispatchJob(proc.getJobId()); final LayerDetail layer = jobManager.getLayerDetail(report.getFrame().getLayerId()); + final FrameDetail frameDetail = jobManager.getFrameDetail(report.getFrame().getFrameId()); final DispatchFrame frame = jobManager.getDispatchFrame(report.getFrame().getFrameId()); final FrameState newFrameState = determineFrameState(job, layer, frame, report); final String key = proc.getJobId() + "_" + report.getFrame().getLayerId() + "_" + report.getFrame().getFrameId(); + if (dispatchSupport.stopFrame(frame, newFrameState, report.getExitStatus(), report.getFrame().getMaxRss())) { - dispatchQueue.execute(new KeyRunnable(key) { - @Override - public void run() { - try { - handlePostFrameCompleteOperations(proc, report, job, frame, - newFrameState); - } catch (Exception e) { - logger.warn("Exception during handlePostFrameCompleteOperations " + - "in handleFrameCompleteReport" + CueExceptionUtil.getStackTrace(e)); + if (dispatcher.isTestMode()) { + // Database modifications on a threadpool cannot be captured by the test thread + handlePostFrameCompleteOperations(proc, report, job, frame, + newFrameState, frameDetail); + } else { + dispatchQueue.execute(new KeyRunnable(key) { + @Override + public void run() { + try { + handlePostFrameCompleteOperations(proc, report, job, frame, + newFrameState, frameDetail); + } catch (Exception e) { + logger.warn("Exception during handlePostFrameCompleteOperations " + + "in handleFrameCompleteReport" + CueExceptionUtil.getStackTrace(e)); + } } - } - }); + }); + } } else { /* @@ -222,6 +208,19 @@ public void run() { } } } + catch (EmptyResultDataAccessException e) { + /* + * Do not propagate this exception to RQD. This + * usually means the cue lost connectivity to + * the host and cleared out the record of the proc. + * If this is propagated back to RQD, RQD will + * keep retrying the operation forever. + */ + logger.info("failed to acquire data needed to " + + "process completed frame: " + + report.getFrame().getFrameName() + " in job " + + report.getFrame().getJobName() + "," + e); + } catch (Exception e) { /* @@ -259,7 +258,7 @@ public void run() { */ public void handlePostFrameCompleteOperations(VirtualProc proc, FrameCompleteReport report, DispatchJob job, DispatchFrame frame, - FrameState newFrameState) { + FrameState newFrameState, FrameDetail frameDetail) { try { /* @@ -313,7 +312,8 @@ public void handlePostFrameCompleteOperations(VirtualProc proc, * specified in the show's service override, service or 2GB. */ if (report.getExitStatus() == Dispatcher.EXIT_STATUS_MEMORY_FAILURE - || report.getExitSignal() == Dispatcher.EXIT_STATUS_MEMORY_FAILURE) { + || report.getExitSignal() == Dispatcher.EXIT_STATUS_MEMORY_FAILURE + || frameDetail.exitStatus == Dispatcher.EXIT_STATUS_MEMORY_FAILURE) { long increase = CueUtil.GB2; // since there can be multiple services, just going for the diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java index 6cec1925d..69a1ced54 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java @@ -63,7 +63,6 @@ import com.imageworks.spcue.service.CommentManager; import com.imageworks.spcue.service.HostManager; import com.imageworks.spcue.service.JobManager; -import com.imageworks.spcue.service.JobManagerSupport; import com.imageworks.spcue.util.CueExceptionUtil; import com.imageworks.spcue.util.CueUtil; @@ -83,7 +82,6 @@ public class HostReportHandler { private Dispatcher localDispatcher; private RqdClient rqdClient; private JobManager jobManager; - private JobManagerSupport jobManagerSupport; private JobDao jobDao; private LayerDao layerDao; @Autowired @@ -497,7 +495,7 @@ private void handleMemoryUsage(final DispatchHost host, final HostReport report) // them accordingly for (final RunningFrameInfo frame: runningFrames) { if (isFrameOverboard(frame)) { - if (!killFrame(frame, host.getName())) { + if (!killFrameOverusingMemory(frame, host.getName())) { logger.warn("Frame " + frame.getJobName() + "." + frame.getFrameName() + " is overboard but could not be killed"); } @@ -508,7 +506,7 @@ private void handleMemoryUsage(final DispatchHost host, final HostReport report) } } - private boolean killFrame(RunningFrameInfo frame, String hostName) { + private boolean killFrameOverusingMemory(RunningFrameInfo frame, String hostname) { try { VirtualProc proc = hostManager.getVirtualProc(frame.getResourceId()); @@ -519,13 +517,30 @@ private boolean killFrame(RunningFrameInfo frame, String hostName) { logger.info("Killing frame on " + frame.getJobName() + "." + frame.getFrameName() + ", using too much memory."); - DispatchSupport.killedOffenderProcs.incrementAndGet(); + return killProcForMemory(proc, hostname, "This frame is using way more than it had reserved."); + } catch (EmptyResultDataAccessException e) { + return false; + } + } - if (!dispatcher.isTestMode()) { - jobManagerSupport.kill(proc, new Source("This frame is using way more than it had reserved.")); + private boolean killProcForMemory(VirtualProc proc, String hostname, String reason) { + try { + FrameInterface frame = jobManager.getFrame(proc.frameId); + + if (dispatcher.isTestMode()) { + // For testing, don't run on a different threadpool, as different threads don't share + // the same database state + (new DispatchRqdKillFrameMemory(proc.hostName, frame, reason, rqdClient, + dispatchSupport, dispatcher.isTestMode())).run(); + } else { + killQueue.execute(new DispatchRqdKillFrameMemory(proc.hostName, frame, reason, rqdClient, + dispatchSupport, dispatcher.isTestMode())); + prometheusMetrics.incrementFrameOomKilledCounter(hostname); } + DispatchSupport.killedOffenderProcs.incrementAndGet(); return true; - } catch (EmptyResultDataAccessException e) { + } catch (TaskRejectedException e) { + logger.warn("Unable to queue RQD kill, task rejected, " + e); return false; } } @@ -534,26 +549,23 @@ private boolean killFrame(RunningFrameInfo frame, String hostName) { * Kill proc with the worst user/reserved memory ratio. * * @param host - * @return killed proc, or null if none could be found + * @return killed proc, or null if none could be found or failed to be killed */ private VirtualProc killWorstMemoryOffender(final DispatchHost host) { - VirtualProc proc; try { - proc = hostManager.getWorstMemoryOffender(host); + VirtualProc proc = hostManager.getWorstMemoryOffender(host); + logger.info("Killing frame on " + proc.getName() + ", host is under stress."); + + if (!killProcForMemory(proc, host.getName(), "The host was dangerously low on memory and swapping.")) { + // Returning null will prevent the caller from overflowing the kill queue with more messages + proc = null; + } + return proc; } catch (EmptyResultDataAccessException e) { logger.error(host.name + " is under OOM and no proc is running on it."); return null; } - - logger.info("Killing frame on " + proc.getName() + ", host is under stress."); - DispatchSupport.killedOffenderProcs.incrementAndGet(); - - if (!dispatcher.isTestMode()) { - jobManagerSupport.kill(proc, new Source("The host was dangerously low on memory and swapping.")); - } - - return proc; } /** @@ -962,14 +974,6 @@ public void setJobManager(JobManager jobManager) { this.jobManager = jobManager; } - public JobManagerSupport getJobManagerSupport() { - return jobManagerSupport; - } - - public void setJobManagerSupport(JobManagerSupport jobManagerSupport) { - this.jobManagerSupport = jobManagerSupport; - } - public JobDao getJobDao() { return jobDao; } diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchRqdKillFrame.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchRqdKillFrame.java index 61258a824..fe9bde60e 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchRqdKillFrame.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchRqdKillFrame.java @@ -31,9 +31,7 @@ public class DispatchRqdKillFrame extends KeyRunnable { private static final Logger logger = LogManager.getLogger(DispatchRqdKillFrame.class); - private VirtualProc proc = null; private String message; - private String hostname; private String frameId; @@ -47,28 +45,14 @@ public DispatchRqdKillFrame(String hostname, String frameId, String message, Rqd this.rqdClient = rqdClient; } - public DispatchRqdKillFrame(VirtualProc proc, String message, RqdClient rqdClient) { - super("disp_rqd_kill_frame_" + proc.getProcId() + "_" + rqdClient.toString()); - this.proc = proc; - this.hostname = proc.hostName; - this.message = message; - this.rqdClient = rqdClient; - } - @Override public void run() { long startTime = System.currentTimeMillis(); try { - if (proc != null) { - rqdClient.killFrame(proc, message); - } - else { - rqdClient.killFrame(hostname, frameId, message); - } + rqdClient.killFrame(hostname, frameId, message); } catch (RqdClientException e) { logger.info("Failed to contact host " + hostname + ", " + e); - } - finally { + } finally { long elapsedTime = System.currentTimeMillis() - startTime; logger.info("RQD communication with " + hostname + " took " + elapsedTime + "ms"); diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchRqdKillFrameMemory.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchRqdKillFrameMemory.java new file mode 100644 index 000000000..83d5f188c --- /dev/null +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchRqdKillFrameMemory.java @@ -0,0 +1,69 @@ + +/* + * Copyright Contributors to the OpenCue Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +package com.imageworks.spcue.dispatcher.commands; + +import com.imageworks.spcue.FrameInterface; +import com.imageworks.spcue.VirtualProc; +import com.imageworks.spcue.dispatcher.DispatchSupport; +import com.imageworks.spcue.rqd.RqdClient; +import com.imageworks.spcue.rqd.RqdClientException; +import org.apache.log4j.Logger; + +public class DispatchRqdKillFrameMemory extends KeyRunnable { + + private static final Logger logger = Logger.getLogger(DispatchRqdKillFrameMemory.class); + + private String message; + private String hostname; + private DispatchSupport dispatchSupport; + private final RqdClient rqdClient; + private final boolean isTestMode; + + private FrameInterface frame; + + public DispatchRqdKillFrameMemory(String hostname, FrameInterface frame, String message, RqdClient rqdClient, + DispatchSupport dispatchSupport, boolean isTestMode) { + super("disp_rqd_kill_frame_" + frame.getFrameId() + "_" + rqdClient.toString()); + this.frame = frame; + this.hostname = hostname; + this.message = message; + this.rqdClient = rqdClient; + this.dispatchSupport = dispatchSupport; + this.isTestMode = isTestMode; + } + + @Override + public void run() { + long startTime = System.currentTimeMillis(); + try { + if (!isTestMode) { + rqdClient.killFrame(hostname, frame.getFrameId(), message); + } + dispatchSupport.updateFrameMemoryError(frame); + } catch (RqdClientException e) { + logger.info("Failed to contact host " + hostname + ", " + e); + } finally { + long elapsedTime = System.currentTimeMillis() - startTime; + logger.info("RQD communication with " + hostname + + " took " + elapsedTime + "ms"); + } + } +} + diff --git a/cuebot/src/main/resources/conf/spring/applicationContext-service.xml b/cuebot/src/main/resources/conf/spring/applicationContext-service.xml index 35bd5cbb8..5aedc91b3 100644 --- a/cuebot/src/main/resources/conf/spring/applicationContext-service.xml +++ b/cuebot/src/main/resources/conf/spring/applicationContext-service.xml @@ -384,7 +384,6 @@ - diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/FrameCompleteHandlerTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/FrameCompleteHandlerTests.java index d313c5293..1f452e92a 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/FrameCompleteHandlerTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/FrameCompleteHandlerTests.java @@ -310,10 +310,11 @@ private void executeDepend( DispatchJob dispatchJob = jobManager.getDispatchJob(proc.getJobId()); DispatchFrame dispatchFrame = jobManager.getDispatchFrame(report.getFrame().getFrameId()); + FrameDetail frameDetail = jobManager.getFrameDetail(report.getFrame().getFrameId()); dispatchSupport.stopFrame(dispatchFrame, frameState, report.getExitStatus(), report.getFrame().getMaxRss()); frameCompleteHandler.handlePostFrameCompleteOperations(proc, - report, dispatchJob, dispatchFrame, frameState); + report, dispatchJob, dispatchFrame, frameState, frameDetail); assertTrue(jobManager.isLayerComplete(layerFirst)); assertFalse(jobManager.isLayerComplete(layerSecond)); @@ -401,10 +402,11 @@ private void executeMinMemIncrease(int expected, boolean override) { DispatchJob dispatchJob = jobManager.getDispatchJob(proc.getJobId()); DispatchFrame dispatchFrame = jobManager.getDispatchFrame(report.getFrame().getFrameId()); + FrameDetail frameDetail = jobManager.getFrameDetail(report.getFrame().getFrameId()); dispatchSupport.stopFrame(dispatchFrame, FrameState.DEAD, report.getExitStatus(), report.getFrame().getMaxRss()); frameCompleteHandler.handlePostFrameCompleteOperations(proc, - report, dispatchJob, dispatchFrame, FrameState.WAITING); + report, dispatchJob, dispatchFrame, FrameState.WAITING, frameDetail); assertFalse(jobManager.isLayerComplete(layer)); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java index ae9ae6e7c..9538aa4fb 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java @@ -28,6 +28,8 @@ import com.imageworks.spcue.dispatcher.DispatchSupport; import com.imageworks.spcue.dispatcher.HostReportQueue; +import com.imageworks.spcue.dispatcher.FrameCompleteHandler; +import com.imageworks.spcue.grpc.job.FrameState; import org.junit.Before; import org.junit.Test; import org.springframework.test.annotation.Rollback; @@ -47,6 +49,7 @@ import com.imageworks.spcue.grpc.report.HostReport; import com.imageworks.spcue.grpc.report.RenderHost; import com.imageworks.spcue.grpc.report.RunningFrameInfo; +import com.imageworks.spcue.grpc.report.FrameCompleteReport; import com.imageworks.spcue.service.AdminManager; import com.imageworks.spcue.service.CommentManager; import com.imageworks.spcue.service.HostManager; @@ -55,6 +58,7 @@ import com.imageworks.spcue.test.TransactionalTest; import com.imageworks.spcue.util.CueUtil; import com.imageworks.spcue.VirtualProc; +import com.imageworks.spcue.LayerDetail; import java.util.UUID; @@ -73,6 +77,9 @@ public class HostReportHandlerTests extends TransactionalTest { @Resource HostReportHandler hostReportHandler; + @Resource + FrameCompleteHandler frameCompleteHandler; + @Resource Dispatcher dispatcher; @@ -552,13 +559,14 @@ public void testMemoryAggressionMemoryWarning() { .build(); // Overboard Rss + long memoryUsedProc3 = CueUtil.GB8; RunningFrameInfo info3 = RunningFrameInfo.newBuilder() .setJobId(proc3.getJobId()) .setLayerId(proc3.getLayerId()) .setFrameId(proc3.getFrameId()) .setResourceId(proc3.getProcId()) - .setRss(CueUtil.GB4) - .setMaxRss(CueUtil.GB4) + .setRss(memoryUsedProc3) + .setMaxRss(memoryUsedProc3) .build(); RenderHost hostAfterUpdate = getRenderHostBuilder(hostname).setFreeMem(0).build(); @@ -569,10 +577,41 @@ public void testMemoryAggressionMemoryWarning() { .addAllFrames(Arrays.asList(info1, info2, info3)) .build(); + // Get layer state before report gets sent + LayerDetail layerBeforeIncrease = jobManager.getLayerDetail(proc3.getLayerId()); + // In this case, killing one job should be enough to ge the machine to a safe state long killCount = DispatchSupport.killedOffenderProcs.get(); hostReportHandler.handleHostReport(report, false, System.currentTimeMillis()); assertEquals(killCount + 1, DispatchSupport.killedOffenderProcs.get()); + + // Confirm the frame will be set to retry after it's completion has been processed + + RunningFrameInfo runningFrame = RunningFrameInfo.newBuilder() + .setFrameId(proc3.getFrameId()) + .setFrameName("frame_name") + .setLayerId(proc3.getLayerId()) + .setRss(memoryUsedProc3) + .setMaxRss(memoryUsedProc3) + .setResourceId(proc3.id) + .build(); + FrameCompleteReport completeReport = FrameCompleteReport.newBuilder() + .setHost(hostAfterUpdate) + .setFrame(runningFrame) + .setExitSignal(9) + .setRunTime(1) + .setExitStatus(1) + .build(); + + frameCompleteHandler.handleFrameCompleteReport(completeReport); + FrameDetail killedFrame = jobManager.getFrameDetail(proc3.getFrameId()); + LayerDetail layer = jobManager.getLayerDetail(proc3.getLayerId()); + assertEquals(FrameState.WAITING, killedFrame.state); + // Memory increases are processed in two different places one will set the new value to proc.reserved + 2GB + // and the other will set to the maximum reported proc.maxRss the end value will be whoever is higher. + // In this case, proc.maxRss + assertEquals(Math.max(memoryUsedProc3, layerBeforeIncrease.getMinimumMemory() + CueUtil.GB2), + layer.getMinimumMemory()); } }