diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index f66937e080..454d67135b 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -44,6 +44,8 @@ to the model. (See {ml-pull}214[#214].) * Stop linking to libcrypt on Linux. (See {ml-pull}480[#480].) +* Improvements to hard_limit audit message. (See {ml-pull}486[#486].) + === Bug Fixes * Handle NaNs when detrending seasonal components. {ml-pull}408[#408] diff --git a/include/model/CResourceMonitor.h b/include/model/CResourceMonitor.h index f1c3077d64..b6088d99de 100644 --- a/include/model/CResourceMonitor.h +++ b/include/model/CResourceMonitor.h @@ -34,12 +34,15 @@ class MODEL_EXPORT CResourceMonitor { public: struct MODEL_EXPORT SResults { std::size_t s_Usage; + std::size_t s_AdjustedUsage; std::size_t s_ByFields; std::size_t s_PartitionFields; std::size_t s_OverFields; std::size_t s_AllocationFailures; model_t::EMemoryStatus s_MemoryStatus; core_t::TTime s_BucketStartTime; + std::size_t s_BytesExceeded; + std::size_t s_BytesMemoryLimit; }; public: @@ -65,10 +68,6 @@ class MODEL_EXPORT CResourceMonitor { //! taking up too much memory and further allocations should be banned bool areAllocationsAllowed() const; - //! Query the resource monitor to found out if it's Ok to - //! create structures of a certain size - bool areAllocationsAllowed(std::size_t size) const; - //! Return the amount of remaining space for allocations std::size_t allocationLimit() const; @@ -164,6 +163,11 @@ class MODEL_EXPORT CResourceMonitor { //! Returns the sum of used memory plus any extra memory std::size_t totalMemory() const; + //! Adjusts the amount of memory reported to take into + //! account the current value of the byte limit margin and the effects + //! of background persistence. + std::size_t adjustedUsage(std::size_t usage) const; + private: //! The registered collection of components TDetectorPtrSizeUMap m_Detectors; @@ -226,6 +230,9 @@ class MODEL_EXPORT CResourceMonitor { //! Don't do any sort of memory checking if this is set bool m_NoLimit; + //! The number of bytes over the high limit for memory usage at the last allocation failure + std::size_t m_CurrentBytesExceeded; + //! Test friends friend class ::CResourceMonitorTest; friend class ::CResourceLimitTest; diff --git a/lib/api/CModelSizeStatsJsonWriter.cc b/lib/api/CModelSizeStatsJsonWriter.cc index ff2b94286d..0959812912 100644 --- a/lib/api/CModelSizeStatsJsonWriter.cc +++ b/lib/api/CModelSizeStatsJsonWriter.cc @@ -16,6 +16,8 @@ namespace { const std::string JOB_ID("job_id"); const std::string MODEL_SIZE_STATS("model_size_stats"); const std::string MODEL_BYTES("model_bytes"); +const std::string MODEL_BYTES_EXCEEDED("model_bytes_exceeded"); +const std::string MODEL_BYTES_MEMORY_LIMIT("model_bytes_memory_limit"); const std::string TOTAL_BY_FIELD_COUNT("total_by_field_count"); const std::string TOTAL_OVER_FIELD_COUNT("total_over_field_count"); const std::string TOTAL_PARTITION_FIELD_COUNT("total_partition_field_count"); @@ -33,17 +35,15 @@ void CModelSizeStatsJsonWriter::write(const std::string& jobId, writer.String(JOB_ID); writer.String(jobId); + writer.String(MODEL_BYTES); - // Background persist causes the memory size to double due to copying - // the models. On top of that, after the persist is done we may not - // be able to retrieve that memory back. Thus, we report twice the - // memory usage in order to allow for that. - // See https://github.com/elastic/x-pack-elasticsearch/issues/1020. - // Issue https://github.com/elastic/x-pack-elasticsearch/issues/857 - // discusses adding an option to perform only foreground persist. - // If that gets implemented, we should only double when background - // persist is configured. - writer.Uint64(results.s_Usage * 2); + writer.Uint64(results.s_AdjustedUsage); + + writer.String(MODEL_BYTES_EXCEEDED); + writer.Uint64(results.s_BytesExceeded); + + writer.String(MODEL_BYTES_MEMORY_LIMIT); + writer.Uint64(results.s_BytesMemoryLimit); writer.String(TOTAL_BY_FIELD_COUNT); writer.Uint64(results.s_ByFields); diff --git a/lib/api/unittest/CJsonOutputWriterTest.cc b/lib/api/unittest/CJsonOutputWriterTest.cc index 135f9267fa..741e45d68a 100644 --- a/lib/api/unittest/CJsonOutputWriterTest.cc +++ b/lib/api/unittest/CJsonOutputWriterTest.cc @@ -1555,12 +1555,15 @@ void CJsonOutputWriterTest::testReportMemoryUsage() { ml::model::CResourceMonitor::SResults resourceUsage; resourceUsage.s_Usage = 1; - resourceUsage.s_ByFields = 2; - resourceUsage.s_PartitionFields = 3; - resourceUsage.s_OverFields = 4; - resourceUsage.s_AllocationFailures = 5; + resourceUsage.s_AdjustedUsage = 2; + resourceUsage.s_ByFields = 3; + resourceUsage.s_PartitionFields = 4; + resourceUsage.s_OverFields = 5; + resourceUsage.s_AllocationFailures = 6; resourceUsage.s_MemoryStatus = ml::model_t::E_MemoryStatusHardLimit; - resourceUsage.s_BucketStartTime = 6; + resourceUsage.s_BucketStartTime = 7; + resourceUsage.s_BytesExceeded = 8; + resourceUsage.s_BytesMemoryLimit = 9; writer.reportMemoryUsage(resourceUsage); writer.endOutputBatch(false, 1ul); @@ -1580,15 +1583,15 @@ void CJsonOutputWriterTest::testReportMemoryUsage() { CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes")); CPPUNIT_ASSERT_EQUAL(2, sizeStats["model_bytes"].GetInt()); CPPUNIT_ASSERT(sizeStats.HasMember("total_by_field_count")); - CPPUNIT_ASSERT_EQUAL(2, sizeStats["total_by_field_count"].GetInt()); + CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_by_field_count"].GetInt()); CPPUNIT_ASSERT(sizeStats.HasMember("total_partition_field_count")); - CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_partition_field_count"].GetInt()); + CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_partition_field_count"].GetInt()); CPPUNIT_ASSERT(sizeStats.HasMember("total_over_field_count")); - CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_over_field_count"].GetInt()); + CPPUNIT_ASSERT_EQUAL(5, sizeStats["total_over_field_count"].GetInt()); CPPUNIT_ASSERT(sizeStats.HasMember("bucket_allocation_failures_count")); - CPPUNIT_ASSERT_EQUAL(5, sizeStats["bucket_allocation_failures_count"].GetInt()); + CPPUNIT_ASSERT_EQUAL(6, sizeStats["bucket_allocation_failures_count"].GetInt()); CPPUNIT_ASSERT(sizeStats.HasMember("timestamp")); - CPPUNIT_ASSERT_EQUAL(6000, sizeStats["timestamp"].GetInt()); + CPPUNIT_ASSERT_EQUAL(7000, sizeStats["timestamp"].GetInt()); CPPUNIT_ASSERT(sizeStats.HasMember("memory_status")); CPPUNIT_ASSERT_EQUAL(std::string("hard_limit"), std::string(sizeStats["memory_status"].GetString())); @@ -1596,6 +1599,10 @@ void CJsonOutputWriterTest::testReportMemoryUsage() { int64_t nowMs = ml::core::CTimeUtils::now() * 1000ll; CPPUNIT_ASSERT(nowMs >= sizeStats["log_time"].GetInt64()); CPPUNIT_ASSERT(nowMs + 1000ll >= sizeStats["log_time"].GetInt64()); + CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_exceeded")); + CPPUNIT_ASSERT_EQUAL(8, sizeStats["model_bytes_exceeded"].GetInt()); + CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_memory_limit")); + CPPUNIT_ASSERT_EQUAL(9, sizeStats["model_bytes_memory_limit"].GetInt()); } void CJsonOutputWriterTest::testWriteScheduledEvent() { diff --git a/lib/api/unittest/CModelSnapshotJsonWriterTest.cc b/lib/api/unittest/CModelSnapshotJsonWriterTest.cc index 9d493ea703..d0002aabef 100644 --- a/lib/api/unittest/CModelSnapshotJsonWriterTest.cc +++ b/lib/api/unittest/CModelSnapshotJsonWriterTest.cc @@ -33,12 +33,15 @@ void CModelSnapshotJsonWriterTest::testWrite() { { model::CResourceMonitor::SResults modelSizeStats{ 10000, // bytes used + 20000, // bytes used (adjusted) 3, // # by fields 1, // # partition fields 150, // # over fields 4, // # allocation failures model_t::E_MemoryStatusOk, // memory status - core_t::TTime(1521046309) // bucket start time + core_t::TTime(1521046309), // bucket start time + 0, // model bytes exceeded + 50000 // model bytes memory limit }; CModelSnapshotJsonWriter::SModelSnapshotReport report{ @@ -114,6 +117,11 @@ void CModelSnapshotJsonWriterTest::testWrite() { CPPUNIT_ASSERT(modelSizeStats.HasMember("timestamp")); CPPUNIT_ASSERT_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64()); CPPUNIT_ASSERT(modelSizeStats.HasMember("log_time")); + CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_exceeded")); + CPPUNIT_ASSERT_EQUAL(int64_t(0), modelSizeStats["model_bytes_exceeded"].GetInt64()); + CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_memory_limit")); + CPPUNIT_ASSERT_EQUAL(int64_t(50000), + modelSizeStats["model_bytes_memory_limit"].GetInt64()); CPPUNIT_ASSERT(snapshot.HasMember("quantiles")); const rapidjson::Value& quantiles = snapshot["quantiles"]; diff --git a/lib/model/CResourceMonitor.cc b/lib/model/CResourceMonitor.cc index 29ddb502b0..842bd9866c 100644 --- a/lib/model/CResourceMonitor.cc +++ b/lib/model/CResourceMonitor.cc @@ -35,7 +35,8 @@ CResourceMonitor::CResourceMonitor(double byteLimitMargin) m_HasPruningStarted(false), m_PruneThreshold(0), m_LastPruneTime(0), m_PruneWindow(std::numeric_limits::max()), m_PruneWindowMaximum(std::numeric_limits::max()), - m_PruneWindowMinimum(std::numeric_limits::max()), m_NoLimit(false) { + m_PruneWindowMinimum(std::numeric_limits::max()), + m_NoLimit(false), m_CurrentBytesExceeded(0) { this->updateMemoryLimitsAndPruneThreshold(DEFAULT_MEMORY_LIMIT_MB); } @@ -108,18 +109,21 @@ void CResourceMonitor::refresh(CAnomalyDetector& detector) { void CResourceMonitor::forceRefresh(CAnomalyDetector& detector) { this->memUsage(&detector); - core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = this->totalMemory(); - LOG_TRACE(<< "Checking allocations: currently at " << this->totalMemory()); + this->updateAllowAllocations(); } void CResourceMonitor::updateAllowAllocations() { std::size_t total{this->totalMemory()}; + core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = total; + LOG_TRACE(<< "Checking allocations: currently at " << total); if (m_AllowAllocations) { if (total > this->highLimit()) { LOG_INFO(<< "Over current allocation high limit. " << total << " bytes used, the limit is " << this->highLimit()); m_AllowAllocations = false; + std::size_t bytesExceeded{total - this->highLimit()}; + m_CurrentBytesExceeded = this->adjustedUsage(bytesExceeded); } } else if (total < this->lowLimit()) { LOG_INFO(<< "Below current allocation low limit. " << total @@ -204,13 +208,6 @@ bool CResourceMonitor::areAllocationsAllowed() const { return m_AllowAllocations; } -bool CResourceMonitor::areAllocationsAllowed(std::size_t size) const { - if (m_AllowAllocations) { - return this->totalMemory() + size < this->highLimit(); - } - return false; -} - std::size_t CResourceMonitor::allocationLimit() const { return this->highLimit() - std::min(this->highLimit(), this->totalMemory()); } @@ -268,6 +265,9 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi res.s_OverFields = 0; res.s_PartitionFields = 0; res.s_Usage = this->totalMemory(); + res.s_AdjustedUsage = this->adjustedUsage(res.s_Usage); + res.s_BytesMemoryLimit = 2 * m_ByteLimitHigh; + res.s_BytesExceeded = m_CurrentBytesExceeded; res.s_AllocationFailures = 0; res.s_MemoryStatus = m_MemoryStatus; res.s_BucketStartTime = bucketStartTime; @@ -281,6 +281,25 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi return res; } +std::size_t CResourceMonitor::adjustedUsage(std::size_t usage) const { + // Background persist causes the memory size to double due to copying + // the models. On top of that, after the persist is done we may not + // be able to retrieve that memory back. Thus, we report twice the + // memory usage in order to allow for that. + // See https://github.com/elastic/x-pack-elasticsearch/issues/1020. + // Issue https://github.com/elastic/x-pack-elasticsearch/issues/857 + // discusses adding an option to perform only foreground persist. + // If that gets implemented, we should only double when background + // persist is configured. + + // We also scale the reported memory usage by the inverse of the byte limit margin. + // This gives the user a fairer indication of how close the job is to hitting + // the model memory limit in a concise manner (as the limit is scaled down by + // the margin during the beginning period of the job's existence). + size_t adjustedUsage = static_cast(2 * usage / m_ByteLimitMargin); + return adjustedUsage; +} + void CResourceMonitor::acceptAllocationFailureResult(core_t::TTime time) { m_MemoryStatus = model_t::E_MemoryStatusHardLimit; ++m_AllocationFailures[time];