From b3e433b78757b76b462c769d867dfa888b7d72e3 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Thu, 20 Jun 2024 10:06:57 -0700 Subject: [PATCH] Make oom_kill logic less aggressive (#1388) The logic was impacting more jobs than it needed to when trying to protect a host from reaching OOM state. This change filters the list of jobs to only target jobs that are using more than they had initially reserved. --- .../java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java | 2 ++ .../com/imageworks/spcue/dispatcher/HostReportHandler.java | 2 +- .../main/java/com/imageworks/spcue/service/HostManager.java | 2 +- cuebot/src/main/resources/opencue.properties | 2 +- .../com/imageworks/spcue/test/dao/postgres/ProcDaoTests.java | 4 +++- .../spcue/test/dispatcher/HostReportHandlerTests.java | 3 +++ 6 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java index 5ae4d5a31..586d1f1df 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java @@ -600,6 +600,8 @@ public boolean increaseReservedMemory(ProcInterface p, long value) { "host.pk_host = ? " + "AND " + "proc.int_mem_reserved != 0 " + + "AND " + + "proc.int_virt_used >= proc.int_mem_pre_reserved " + "ORDER BY " + "proc.int_virt_used / proc.int_mem_pre_reserved DESC " + ") AS t1 LIMIT 1"; diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java index b7d39f8dd..1c042f110 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java @@ -649,7 +649,7 @@ private VirtualProc killWorstMemoryOffender(final DispatchHost host) { return proc; } catch (EmptyResultDataAccessException e) { - logger.error(host.name + " is under OOM and no proc is running on it."); + logger.error(host.name + " is under OOM and no proc is memory overboard."); return null; } } diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/HostManager.java b/cuebot/src/main/java/com/imageworks/spcue/service/HostManager.java index e62d8647b..ce5f861f8 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/HostManager.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/HostManager.java @@ -173,7 +173,7 @@ void setHostStatistics(HostInterface host, void unbookProc(ProcInterface proc); /** - * Returns the proc who is most deliquent on memory allocation + * For a given host, return the proc using more memory above what it had initially reserved * @param h * @return */ diff --git a/cuebot/src/main/resources/opencue.properties b/cuebot/src/main/resources/opencue.properties index b7f2a23ff..4dd691e07 100644 --- a/cuebot/src/main/resources/opencue.properties +++ b/cuebot/src/main/resources/opencue.properties @@ -131,7 +131,7 @@ dispatcher.booking_queue.max_pool_size=6 dispatcher.booking_queue.queue_capacity=1000 # Percentage of used memory to consider a risk for triggering oom-killer -dispatcher.oom_max_safe_used_memory_threshold=0.95 +dispatcher.oom_max_safe_used_memory_threshold=0.98 # How much can a frame exceed its reserved memory. # - 0.5 means 50% above reserve diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ProcDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ProcDaoTests.java index 935e311ca..7504fa751 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ProcDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ProcDaoTests.java @@ -596,7 +596,9 @@ public void testFindReservedMemoryOffender() { // Increase the memory usage as frames are added procDao.updateProcMemoryUsage(frame, - 1000*i, 1000*i, 1000*i, 1000*i, 0, 0, children); + 1000*i, 1000*i, + Dispatcher.MEM_RESERVED_DEFAULT*i, Dispatcher.MEM_RESERVED_DEFAULT*i, + 0, 0, children); i++; } diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java index 971df8d14..b610ff11c 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java @@ -548,6 +548,7 @@ public void testMemoryAggressionMemoryWarning() { .setLayerId(proc1.getLayerId()) .setFrameId(proc1.getFrameId()) .setResourceId(proc1.getProcId()) + .setVsize(CueUtil.GB2) .setRss(CueUtil.GB2) .setMaxRss(CueUtil.GB2) .build(); @@ -558,6 +559,7 @@ public void testMemoryAggressionMemoryWarning() { .setLayerId(proc2.getLayerId()) .setFrameId(proc2.getFrameId()) .setResourceId(proc2.getProcId()) + .setVsize(CueUtil.GB4) .setRss(CueUtil.GB4) .setMaxRss(CueUtil.GB4) .build(); @@ -569,6 +571,7 @@ public void testMemoryAggressionMemoryWarning() { .setLayerId(proc3.getLayerId()) .setFrameId(proc3.getFrameId()) .setResourceId(proc3.getProcId()) + .setVsize(memoryUsedProc3) .setRss(memoryUsedProc3) .setMaxRss(memoryUsedProc3) .build();