From 7e1b5b0fcbc38afa645c2249b1acbbc5a7148459 Mon Sep 17 00:00:00 2001 From: Ed Savage <32410745+edsavage@users.noreply.github.com> Date: Sat, 18 May 2019 18:39:30 -0400 Subject: [PATCH] [ML] Improve hard_limit audit message (#486) Add the current model memory limit and the number of bytes in excess of that at the point of the last allocation failure to the model size stats. These will be used to construct a (hopefully) more informative hard_limit audit message. The reported memory usage is also scaled to take into account the byte limit margin, which is in play in the initial period of a job's lifetime and is used to scale down the high memory limit. This should give a more accurate representation of how close the memory usage is to the high limit. relates https://github.com/elastic/elasticsearch/pull/42086 closes https://github.com/elastic/elasticsearch/issues/38034 --- docs/CHANGELOG.asciidoc | 2 + include/model/CResourceMonitor.h | 15 +++++-- lib/api/CModelSizeStatsJsonWriter.cc | 20 +++++----- lib/api/unittest/CJsonOutputWriterTest.cc | 27 ++++++++----- .../unittest/CModelSnapshotJsonWriterTest.cc | 10 ++++- lib/model/CResourceMonitor.cc | 39 ++++++++++++++----- 6 files changed, 78 insertions(+), 35 deletions(-) diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index f66937e080..454d67135b 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -44,6 +44,8 @@ to the model. (See {ml-pull}214[#214].) * Stop linking to libcrypt on Linux. (See {ml-pull}480[#480].) +* Improvements to hard_limit audit message. (See {ml-pull}486[#486].) + === Bug Fixes * Handle NaNs when detrending seasonal components. 
{ml-pull}408[#408] diff --git a/include/model/CResourceMonitor.h b/include/model/CResourceMonitor.h index f1c3077d64..b6088d99de 100644 --- a/include/model/CResourceMonitor.h +++ b/include/model/CResourceMonitor.h @@ -34,12 +34,15 @@ class MODEL_EXPORT CResourceMonitor { public: struct MODEL_EXPORT SResults { std::size_t s_Usage; + std::size_t s_AdjustedUsage; std::size_t s_ByFields; std::size_t s_PartitionFields; std::size_t s_OverFields; std::size_t s_AllocationFailures; model_t::EMemoryStatus s_MemoryStatus; core_t::TTime s_BucketStartTime; + std::size_t s_BytesExceeded; + std::size_t s_BytesMemoryLimit; }; public: @@ -65,10 +68,6 @@ class MODEL_EXPORT CResourceMonitor { //! taking up too much memory and further allocations should be banned bool areAllocationsAllowed() const; - //! Query the resource monitor to found out if it's Ok to - //! create structures of a certain size - bool areAllocationsAllowed(std::size_t size) const; - //! Return the amount of remaining space for allocations std::size_t allocationLimit() const; @@ -164,6 +163,11 @@ class MODEL_EXPORT CResourceMonitor { //! Returns the sum of used memory plus any extra memory std::size_t totalMemory() const; + //! Adjusts the amount of memory reported to take into + //! account the current value of the byte limit margin and the effects + //! of background persistence. + std::size_t adjustedUsage(std::size_t usage) const; + private: //! The registered collection of components TDetectorPtrSizeUMap m_Detectors; @@ -226,6 +230,9 @@ class MODEL_EXPORT CResourceMonitor { //! Don't do any sort of memory checking if this is set bool m_NoLimit; + //! The number of bytes over the high limit for memory usage at the last allocation failure + std::size_t m_CurrentBytesExceeded; + //! 
Test friends friend class ::CResourceMonitorTest; friend class ::CResourceLimitTest; diff --git a/lib/api/CModelSizeStatsJsonWriter.cc b/lib/api/CModelSizeStatsJsonWriter.cc index ff2b94286d..0959812912 100644 --- a/lib/api/CModelSizeStatsJsonWriter.cc +++ b/lib/api/CModelSizeStatsJsonWriter.cc @@ -16,6 +16,8 @@ namespace { const std::string JOB_ID("job_id"); const std::string MODEL_SIZE_STATS("model_size_stats"); const std::string MODEL_BYTES("model_bytes"); +const std::string MODEL_BYTES_EXCEEDED("model_bytes_exceeded"); +const std::string MODEL_BYTES_MEMORY_LIMIT("model_bytes_memory_limit"); const std::string TOTAL_BY_FIELD_COUNT("total_by_field_count"); const std::string TOTAL_OVER_FIELD_COUNT("total_over_field_count"); const std::string TOTAL_PARTITION_FIELD_COUNT("total_partition_field_count"); @@ -33,17 +35,15 @@ void CModelSizeStatsJsonWriter::write(const std::string& jobId, writer.String(JOB_ID); writer.String(jobId); + writer.String(MODEL_BYTES); - // Background persist causes the memory size to double due to copying - // the models. On top of that, after the persist is done we may not - // be able to retrieve that memory back. Thus, we report twice the - // memory usage in order to allow for that. - // See https://github.com/elastic/x-pack-elasticsearch/issues/1020. - // Issue https://github.com/elastic/x-pack-elasticsearch/issues/857 - // discusses adding an option to perform only foreground persist. - // If that gets implemented, we should only double when background - // persist is configured. 
- writer.Uint64(results.s_Usage * 2); + writer.Uint64(results.s_AdjustedUsage); + + writer.String(MODEL_BYTES_EXCEEDED); + writer.Uint64(results.s_BytesExceeded); + + writer.String(MODEL_BYTES_MEMORY_LIMIT); + writer.Uint64(results.s_BytesMemoryLimit); writer.String(TOTAL_BY_FIELD_COUNT); writer.Uint64(results.s_ByFields); diff --git a/lib/api/unittest/CJsonOutputWriterTest.cc b/lib/api/unittest/CJsonOutputWriterTest.cc index 135f9267fa..741e45d68a 100644 --- a/lib/api/unittest/CJsonOutputWriterTest.cc +++ b/lib/api/unittest/CJsonOutputWriterTest.cc @@ -1555,12 +1555,15 @@ void CJsonOutputWriterTest::testReportMemoryUsage() { ml::model::CResourceMonitor::SResults resourceUsage; resourceUsage.s_Usage = 1; - resourceUsage.s_ByFields = 2; - resourceUsage.s_PartitionFields = 3; - resourceUsage.s_OverFields = 4; - resourceUsage.s_AllocationFailures = 5; + resourceUsage.s_AdjustedUsage = 2; + resourceUsage.s_ByFields = 3; + resourceUsage.s_PartitionFields = 4; + resourceUsage.s_OverFields = 5; + resourceUsage.s_AllocationFailures = 6; resourceUsage.s_MemoryStatus = ml::model_t::E_MemoryStatusHardLimit; - resourceUsage.s_BucketStartTime = 6; + resourceUsage.s_BucketStartTime = 7; + resourceUsage.s_BytesExceeded = 8; + resourceUsage.s_BytesMemoryLimit = 9; writer.reportMemoryUsage(resourceUsage); writer.endOutputBatch(false, 1ul); @@ -1580,15 +1583,15 @@ void CJsonOutputWriterTest::testReportMemoryUsage() { CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes")); CPPUNIT_ASSERT_EQUAL(2, sizeStats["model_bytes"].GetInt()); CPPUNIT_ASSERT(sizeStats.HasMember("total_by_field_count")); - CPPUNIT_ASSERT_EQUAL(2, sizeStats["total_by_field_count"].GetInt()); + CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_by_field_count"].GetInt()); CPPUNIT_ASSERT(sizeStats.HasMember("total_partition_field_count")); - CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_partition_field_count"].GetInt()); + CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_partition_field_count"].GetInt()); 
CPPUNIT_ASSERT(sizeStats.HasMember("total_over_field_count")); - CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_over_field_count"].GetInt()); + CPPUNIT_ASSERT_EQUAL(5, sizeStats["total_over_field_count"].GetInt()); CPPUNIT_ASSERT(sizeStats.HasMember("bucket_allocation_failures_count")); - CPPUNIT_ASSERT_EQUAL(5, sizeStats["bucket_allocation_failures_count"].GetInt()); + CPPUNIT_ASSERT_EQUAL(6, sizeStats["bucket_allocation_failures_count"].GetInt()); CPPUNIT_ASSERT(sizeStats.HasMember("timestamp")); - CPPUNIT_ASSERT_EQUAL(6000, sizeStats["timestamp"].GetInt()); + CPPUNIT_ASSERT_EQUAL(7000, sizeStats["timestamp"].GetInt()); CPPUNIT_ASSERT(sizeStats.HasMember("memory_status")); CPPUNIT_ASSERT_EQUAL(std::string("hard_limit"), std::string(sizeStats["memory_status"].GetString())); @@ -1596,6 +1599,10 @@ void CJsonOutputWriterTest::testReportMemoryUsage() { int64_t nowMs = ml::core::CTimeUtils::now() * 1000ll; CPPUNIT_ASSERT(nowMs >= sizeStats["log_time"].GetInt64()); CPPUNIT_ASSERT(nowMs + 1000ll >= sizeStats["log_time"].GetInt64()); + CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_exceeded")); + CPPUNIT_ASSERT_EQUAL(8, sizeStats["model_bytes_exceeded"].GetInt()); + CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_memory_limit")); + CPPUNIT_ASSERT_EQUAL(9, sizeStats["model_bytes_memory_limit"].GetInt()); } void CJsonOutputWriterTest::testWriteScheduledEvent() { diff --git a/lib/api/unittest/CModelSnapshotJsonWriterTest.cc b/lib/api/unittest/CModelSnapshotJsonWriterTest.cc index 9d493ea703..d0002aabef 100644 --- a/lib/api/unittest/CModelSnapshotJsonWriterTest.cc +++ b/lib/api/unittest/CModelSnapshotJsonWriterTest.cc @@ -33,12 +33,15 @@ void CModelSnapshotJsonWriterTest::testWrite() { { model::CResourceMonitor::SResults modelSizeStats{ 10000, // bytes used + 20000, // bytes used (adjusted) 3, // # by fields 1, // # partition fields 150, // # over fields 4, // # allocation failures model_t::E_MemoryStatusOk, // memory status - core_t::TTime(1521046309) // bucket start time + 
core_t::TTime(1521046309), // bucket start time + 0, // model bytes exceeded + 50000 // model bytes memory limit }; CModelSnapshotJsonWriter::SModelSnapshotReport report{ @@ -114,6 +117,11 @@ void CModelSnapshotJsonWriterTest::testWrite() { CPPUNIT_ASSERT(modelSizeStats.HasMember("timestamp")); CPPUNIT_ASSERT_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64()); CPPUNIT_ASSERT(modelSizeStats.HasMember("log_time")); + CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_exceeded")); + CPPUNIT_ASSERT_EQUAL(int64_t(0), modelSizeStats["model_bytes_exceeded"].GetInt64()); + CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_memory_limit")); + CPPUNIT_ASSERT_EQUAL(int64_t(50000), + modelSizeStats["model_bytes_memory_limit"].GetInt64()); CPPUNIT_ASSERT(snapshot.HasMember("quantiles")); const rapidjson::Value& quantiles = snapshot["quantiles"]; diff --git a/lib/model/CResourceMonitor.cc b/lib/model/CResourceMonitor.cc index 29ddb502b0..842bd9866c 100644 --- a/lib/model/CResourceMonitor.cc +++ b/lib/model/CResourceMonitor.cc @@ -35,7 +35,8 @@ CResourceMonitor::CResourceMonitor(double byteLimitMargin) m_HasPruningStarted(false), m_PruneThreshold(0), m_LastPruneTime(0), m_PruneWindow(std::numeric_limits<std::size_t>::max()), m_PruneWindowMaximum(std::numeric_limits<std::size_t>::max()), - m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()), m_NoLimit(false) { + m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()), + m_NoLimit(false), m_CurrentBytesExceeded(0) { this->updateMemoryLimitsAndPruneThreshold(DEFAULT_MEMORY_LIMIT_MB); } @@ -108,18 +109,21 @@ void CResourceMonitor::refresh(CAnomalyDetector& detector) { void CResourceMonitor::forceRefresh(CAnomalyDetector& detector) { this->memUsage(&detector); - core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = this->totalMemory(); - LOG_TRACE(<< "Checking allocations: currently at " << this->totalMemory()); + this->updateAllowAllocations(); } void CResourceMonitor::updateAllowAllocations() { std::size_t total{this->totalMemory()}; + 
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = total; + LOG_TRACE(<< "Checking allocations: currently at " << total); if (m_AllowAllocations) { if (total > this->highLimit()) { LOG_INFO(<< "Over current allocation high limit. " << total << " bytes used, the limit is " << this->highLimit()); m_AllowAllocations = false; + std::size_t bytesExceeded{total - this->highLimit()}; + m_CurrentBytesExceeded = this->adjustedUsage(bytesExceeded); } } else if (total < this->lowLimit()) { LOG_INFO(<< "Below current allocation low limit. " << total @@ -204,13 +208,6 @@ bool CResourceMonitor::areAllocationsAllowed() const { return m_AllowAllocations; } -bool CResourceMonitor::areAllocationsAllowed(std::size_t size) const { - if (m_AllowAllocations) { - return this->totalMemory() + size < this->highLimit(); - } - return false; -} - std::size_t CResourceMonitor::allocationLimit() const { return this->highLimit() - std::min(this->highLimit(), this->totalMemory()); } @@ -268,6 +265,9 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi res.s_OverFields = 0; res.s_PartitionFields = 0; res.s_Usage = this->totalMemory(); + res.s_AdjustedUsage = this->adjustedUsage(res.s_Usage); + res.s_BytesMemoryLimit = 2 * m_ByteLimitHigh; + res.s_BytesExceeded = m_CurrentBytesExceeded; res.s_AllocationFailures = 0; res.s_MemoryStatus = m_MemoryStatus; res.s_BucketStartTime = bucketStartTime; @@ -281,6 +281,25 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi return res; } +std::size_t CResourceMonitor::adjustedUsage(std::size_t usage) const { + // Background persist causes the memory size to double due to copying + // the models. On top of that, after the persist is done we may not + // be able to retrieve that memory back. Thus, we report twice the + // memory usage in order to allow for that. + // See https://github.com/elastic/x-pack-elasticsearch/issues/1020. 
+ // Issue https://github.com/elastic/x-pack-elasticsearch/issues/857 + // discusses adding an option to perform only foreground persist. + // If that gets implemented, we should only double when background + // persist is configured. + + // We also scale the reported memory usage by the inverse of the byte limit margin. + // This gives the user a fairer indication of how close the job is to hitting + // the model memory limit in a concise manner (as the limit is scaled down by + // the margin during the beginning period of the job's existence). + size_t adjustedUsage = static_cast<std::size_t>(2 * usage / m_ByteLimitMargin); + return adjustedUsage; +} + void CResourceMonitor::acceptAllocationFailureResult(core_t::TTime time) { m_MemoryStatus = model_t::E_MemoryStatusHardLimit; ++m_AllocationFailures[time];