[ML] Improve hard_limit audit message (elastic#486)
Add two fields to the model size stats: the current model memory
limit and the number of bytes by which it was exceeded at the point
of the last allocation failure. These will be used to construct a
(hopefully) more informative hard_limit audit message.

The reported memory usage is also scaled to take into account the byte
limit margin, which applies during the initial period of a job's
lifetime and temporarily scales down the high memory limit. This should
give a more accurate picture of how close the memory usage is to the
high limit.

relates elastic/elasticsearch#42086

closes elastic/elasticsearch#38034
edsavage committed May 18, 2019
1 parent 610c114 commit 7e1b5b0
Showing 6 changed files with 78 additions and 35 deletions.
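
To make the reporting change concrete, here is a minimal, self-contained sketch
of the scaling described in the commit message. It is illustrative only: the
helper name reportedModelBytes and the 0.7 margin are assumptions for the
example, not part of this commit; the actual logic lives in
CResourceMonitor::adjustedUsage further down.

#include <cstddef>
#include <iostream>

// Hypothetical helper mirroring the adjustment described above: double the
// measured usage to allow for background persistence copying the models, then
// divide by the byte limit margin that temporarily lowers the effective memory
// limit early in a job's lifetime.
std::size_t reportedModelBytes(std::size_t measuredUsage, double byteLimitMargin) {
    return static_cast<std::size_t>(2 * measuredUsage / byteLimitMargin);
}

int main() {
    const std::size_t measured = 350 * 1024 * 1024; // 350 MiB actually in use (example figure)
    const double margin = 0.7;                      // assumed early-lifetime margin
    // Reports roughly 1000 MiB for 350 MiB of measured usage: 2 * 350 / 0.7.
    std::cout << reportedModelBytes(measured, margin) << " bytes reported\n";
    return 0;
}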
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -44,6 +44,8 @@ to the model. (See {ml-pull}214[#214].)
* Stop linking to libcrypt on Linux. (See {ml-pull}480[#480].)
* Improvements to hard_limit audit message. (See {ml-pull}486[#486].)
=== Bug Fixes
* Handle NaNs when detrending seasonal components. {ml-pull}408[#408]
15 changes: 11 additions & 4 deletions include/model/CResourceMonitor.h
@@ -34,12 +34,15 @@ class MODEL_EXPORT CResourceMonitor {
public:
struct MODEL_EXPORT SResults {
std::size_t s_Usage;
std::size_t s_AdjustedUsage;
std::size_t s_ByFields;
std::size_t s_PartitionFields;
std::size_t s_OverFields;
std::size_t s_AllocationFailures;
model_t::EMemoryStatus s_MemoryStatus;
core_t::TTime s_BucketStartTime;
std::size_t s_BytesExceeded;
std::size_t s_BytesMemoryLimit;
};

public:
@@ -65,10 +68,6 @@ class MODEL_EXPORT CResourceMonitor {
//! taking up too much memory and further allocations should be banned
bool areAllocationsAllowed() const;

//! Query the resource monitor to found out if it's Ok to
//! create structures of a certain size
bool areAllocationsAllowed(std::size_t size) const;

//! Return the amount of remaining space for allocations
std::size_t allocationLimit() const;

@@ -164,6 +163,11 @@ class MODEL_EXPORT CResourceMonitor {
//! Returns the sum of used memory plus any extra memory
std::size_t totalMemory() const;

//! Adjusts the amount of memory reported to take into
//! account the current value of the byte limit margin and the effects
//! of background persistence.
std::size_t adjustedUsage(std::size_t usage) const;

private:
//! The registered collection of components
TDetectorPtrSizeUMap m_Detectors;
@@ -226,6 +230,9 @@ class MODEL_EXPORT CResourceMonitor {
//! Don't do any sort of memory checking if this is set
bool m_NoLimit;

//! The number of bytes over the high limit for memory usage at the last allocation failure
std::size_t m_CurrentBytesExceeded;

//! Test friends
friend class ::CResourceMonitorTest;
friend class ::CResourceLimitTest;
20 changes: 10 additions & 10 deletions lib/api/CModelSizeStatsJsonWriter.cc
@@ -16,6 +16,8 @@ namespace {
const std::string JOB_ID("job_id");
const std::string MODEL_SIZE_STATS("model_size_stats");
const std::string MODEL_BYTES("model_bytes");
const std::string MODEL_BYTES_EXCEEDED("model_bytes_exceeded");
const std::string MODEL_BYTES_MEMORY_LIMIT("model_bytes_memory_limit");
const std::string TOTAL_BY_FIELD_COUNT("total_by_field_count");
const std::string TOTAL_OVER_FIELD_COUNT("total_over_field_count");
const std::string TOTAL_PARTITION_FIELD_COUNT("total_partition_field_count");
Expand All @@ -33,17 +35,15 @@ void CModelSizeStatsJsonWriter::write(const std::string& jobId,

writer.String(JOB_ID);
writer.String(jobId);

writer.String(MODEL_BYTES);
// Background persist causes the memory size to double due to copying
// the models. On top of that, after the persist is done we may not
// be able to retrieve that memory back. Thus, we report twice the
// memory usage in order to allow for that.
// See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
// Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
// discusses adding an option to perform only foreground persist.
// If that gets implemented, we should only double when background
// persist is configured.
writer.Uint64(results.s_Usage * 2);
writer.Uint64(results.s_AdjustedUsage);

writer.String(MODEL_BYTES_EXCEEDED);
writer.Uint64(results.s_BytesExceeded);

writer.String(MODEL_BYTES_MEMORY_LIMIT);
writer.Uint64(results.s_BytesMemoryLimit);

writer.String(TOTAL_BY_FIELD_COUNT);
writer.Uint64(results.s_ByFields);
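
With this change a model_size_stats document carries the two new fields
alongside model_bytes. A hypothetical example of the resulting output shape
(all values made up for illustration):

{"model_size_stats":{"job_id":"my_job","model_bytes":1048576000,"model_bytes_exceeded":14680064,"model_bytes_memory_limit":1073741824,"memory_status":"hard_limit","total_by_field_count":3,"total_partition_field_count":1,"total_over_field_count":0,...}}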
27 changes: 17 additions & 10 deletions lib/api/unittest/CJsonOutputWriterTest.cc
@@ -1555,12 +1555,15 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {

ml::model::CResourceMonitor::SResults resourceUsage;
resourceUsage.s_Usage = 1;
resourceUsage.s_ByFields = 2;
resourceUsage.s_PartitionFields = 3;
resourceUsage.s_OverFields = 4;
resourceUsage.s_AllocationFailures = 5;
resourceUsage.s_AdjustedUsage = 2;
resourceUsage.s_ByFields = 3;
resourceUsage.s_PartitionFields = 4;
resourceUsage.s_OverFields = 5;
resourceUsage.s_AllocationFailures = 6;
resourceUsage.s_MemoryStatus = ml::model_t::E_MemoryStatusHardLimit;
resourceUsage.s_BucketStartTime = 6;
resourceUsage.s_BucketStartTime = 7;
resourceUsage.s_BytesExceeded = 8;
resourceUsage.s_BytesMemoryLimit = 9;

writer.reportMemoryUsage(resourceUsage);
writer.endOutputBatch(false, 1ul);
@@ -1580,22 +1583,26 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes"));
CPPUNIT_ASSERT_EQUAL(2, sizeStats["model_bytes"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_by_field_count"));
CPPUNIT_ASSERT_EQUAL(2, sizeStats["total_by_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_by_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_partition_field_count"));
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_partition_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_partition_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_over_field_count"));
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_over_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(5, sizeStats["total_over_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("bucket_allocation_failures_count"));
CPPUNIT_ASSERT_EQUAL(5, sizeStats["bucket_allocation_failures_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(6, sizeStats["bucket_allocation_failures_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("timestamp"));
CPPUNIT_ASSERT_EQUAL(6000, sizeStats["timestamp"].GetInt());
CPPUNIT_ASSERT_EQUAL(7000, sizeStats["timestamp"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("memory_status"));
CPPUNIT_ASSERT_EQUAL(std::string("hard_limit"),
std::string(sizeStats["memory_status"].GetString()));
CPPUNIT_ASSERT(sizeStats.HasMember("log_time"));
int64_t nowMs = ml::core::CTimeUtils::now() * 1000ll;
CPPUNIT_ASSERT(nowMs >= sizeStats["log_time"].GetInt64());
CPPUNIT_ASSERT(nowMs + 1000ll >= sizeStats["log_time"].GetInt64());
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_exceeded"));
CPPUNIT_ASSERT_EQUAL(8, sizeStats["model_bytes_exceeded"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_memory_limit"));
CPPUNIT_ASSERT_EQUAL(9, sizeStats["model_bytes_memory_limit"].GetInt());
}

void CJsonOutputWriterTest::testWriteScheduledEvent() {
10 changes: 9 additions & 1 deletion lib/api/unittest/CModelSnapshotJsonWriterTest.cc
@@ -33,12 +33,15 @@ void CModelSnapshotJsonWriterTest::testWrite() {
{
model::CResourceMonitor::SResults modelSizeStats{
10000, // bytes used
20000, // bytes used (adjusted)
3, // # by fields
1, // # partition fields
150, // # over fields
4, // # allocation failures
model_t::E_MemoryStatusOk, // memory status
core_t::TTime(1521046309) // bucket start time
core_t::TTime(1521046309), // bucket start time
0, // model bytes exceeded
50000 // model bytes memory limit
};

CModelSnapshotJsonWriter::SModelSnapshotReport report{
@@ -114,6 +117,11 @@ void CModelSnapshotJsonWriterTest::testWrite() {
CPPUNIT_ASSERT(modelSizeStats.HasMember("timestamp"));
CPPUNIT_ASSERT_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64());
CPPUNIT_ASSERT(modelSizeStats.HasMember("log_time"));
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_exceeded"));
CPPUNIT_ASSERT_EQUAL(int64_t(0), modelSizeStats["model_bytes_exceeded"].GetInt64());
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_memory_limit"));
CPPUNIT_ASSERT_EQUAL(int64_t(50000),
modelSizeStats["model_bytes_memory_limit"].GetInt64());

CPPUNIT_ASSERT(snapshot.HasMember("quantiles"));
const rapidjson::Value& quantiles = snapshot["quantiles"];
39 changes: 29 additions & 10 deletions lib/model/CResourceMonitor.cc
@@ -35,7 +35,8 @@ CResourceMonitor::CResourceMonitor(double byteLimitMargin)
m_HasPruningStarted(false), m_PruneThreshold(0), m_LastPruneTime(0),
m_PruneWindow(std::numeric_limits<std::size_t>::max()),
m_PruneWindowMaximum(std::numeric_limits<std::size_t>::max()),
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()), m_NoLimit(false) {
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()),
m_NoLimit(false), m_CurrentBytesExceeded(0) {
this->updateMemoryLimitsAndPruneThreshold(DEFAULT_MEMORY_LIMIT_MB);
}

@@ -108,18 +109,21 @@ void CResourceMonitor::refresh(CAnomalyDetector& detector) {

void CResourceMonitor::forceRefresh(CAnomalyDetector& detector) {
this->memUsage(&detector);
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = this->totalMemory();
LOG_TRACE(<< "Checking allocations: currently at " << this->totalMemory());

this->updateAllowAllocations();
}

void CResourceMonitor::updateAllowAllocations() {
std::size_t total{this->totalMemory()};
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = total;
LOG_TRACE(<< "Checking allocations: currently at " << total);
if (m_AllowAllocations) {
if (total > this->highLimit()) {
LOG_INFO(<< "Over current allocation high limit. " << total
<< " bytes used, the limit is " << this->highLimit());
m_AllowAllocations = false;
std::size_t bytesExceeded{total - this->highLimit()};
m_CurrentBytesExceeded = this->adjustedUsage(bytesExceeded);
}
} else if (total < this->lowLimit()) {
LOG_INFO(<< "Below current allocation low limit. " << total
@@ -204,13 +208,6 @@ bool CResourceMonitor::areAllocationsAllowed() const {
return m_AllowAllocations;
}

bool CResourceMonitor::areAllocationsAllowed(std::size_t size) const {
if (m_AllowAllocations) {
return this->totalMemory() + size < this->highLimit();
}
return false;
}

std::size_t CResourceMonitor::allocationLimit() const {
return this->highLimit() - std::min(this->highLimit(), this->totalMemory());
}
@@ -268,6 +265,9 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
res.s_OverFields = 0;
res.s_PartitionFields = 0;
res.s_Usage = this->totalMemory();
res.s_AdjustedUsage = this->adjustedUsage(res.s_Usage);
res.s_BytesMemoryLimit = 2 * m_ByteLimitHigh;
res.s_BytesExceeded = m_CurrentBytesExceeded;
res.s_AllocationFailures = 0;
res.s_MemoryStatus = m_MemoryStatus;
res.s_BucketStartTime = bucketStartTime;
Expand All @@ -281,6 +281,25 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
return res;
}

std::size_t CResourceMonitor::adjustedUsage(std::size_t usage) const {
// Background persist causes the memory size to double due to copying
// the models. On top of that, after the persist is done we may not
// be able to retrieve that memory back. Thus, we report twice the
// memory usage in order to allow for that.
// See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
// Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
// discusses adding an option to perform only foreground persist.
// If that gets implemented, we should only double when background
// persist is configured.

// We also scale the reported memory usage by the inverse of the byte limit margin.
// This gives the user a fairer indication of how close the job is to hitting
// the model memory limit in a concise manner (as the limit is scaled down by
// the margin during the beginning period of the job's existence).
size_t adjustedUsage = static_cast<std::size_t>(2 * usage / m_ByteLimitMargin);
return adjustedUsage;
}

void CResourceMonitor::acceptAllocationFailureResult(core_t::TTime time) {
m_MemoryStatus = model_t::E_MemoryStatusHardLimit;
++m_AllocationFailures[time];
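
To see how the new numbers fit together (all figures assumed for illustration):
if the margin-scaled high limit is 30 MB when an allocation fails with
totalMemory() at 35 MB, updateAllowAllocations() stores
m_CurrentBytesExceeded = adjustedUsage(5 MB), roughly 2 * 5 MB / 0.7, or about
14 MB. The next model size stats report then carries model_bytes_exceeded of
about 14 MB and model_bytes_memory_limit of 2 * m_ByteLimitHigh, giving the
hard_limit audit message concrete figures to quote.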
