Skip to content

Commit

Permalink
[7.2][ML] Improve hard_limit audit message (#486) (#487)
Browse files Browse the repository at this point in the history
  • Loading branch information
edsavage authored May 18, 2019
1 parent 610c114 commit 7a782a9
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 35 deletions.
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ to the model. (See {ml-pull}214[#214].)
* Stop linking to libcrypt on Linux. (See {ml-pull}480[#480].)
* Improvements to hard_limit audit message. (See {ml-pull}486[#486].)
=== Bug Fixes
* Handle NaNs when detrending seasonal components. {ml-pull}408[#408]
Expand Down
15 changes: 11 additions & 4 deletions include/model/CResourceMonitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,15 @@ class MODEL_EXPORT CResourceMonitor {
public:
struct MODEL_EXPORT SResults {
std::size_t s_Usage;
std::size_t s_AdjustedUsage;
std::size_t s_ByFields;
std::size_t s_PartitionFields;
std::size_t s_OverFields;
std::size_t s_AllocationFailures;
model_t::EMemoryStatus s_MemoryStatus;
core_t::TTime s_BucketStartTime;
std::size_t s_BytesExceeded;
std::size_t s_BytesMemoryLimit;
};

public:
Expand All @@ -65,10 +68,6 @@ class MODEL_EXPORT CResourceMonitor {
//! taking up too much memory and further allocations should be banned
bool areAllocationsAllowed() const;

//! Query the resource monitor to found out if it's Ok to
//! create structures of a certain size
bool areAllocationsAllowed(std::size_t size) const;

//! Return the amount of remaining space for allocations
std::size_t allocationLimit() const;

Expand Down Expand Up @@ -164,6 +163,11 @@ class MODEL_EXPORT CResourceMonitor {
//! Returns the sum of used memory plus any extra memory
std::size_t totalMemory() const;

//! Adjusts the amount of memory reported to take into
//! account the current value of the byte limit margin and the effects
//! of background persistence.
std::size_t adjustedUsage(std::size_t usage) const;

private:
//! The registered collection of components
TDetectorPtrSizeUMap m_Detectors;
Expand Down Expand Up @@ -226,6 +230,9 @@ class MODEL_EXPORT CResourceMonitor {
//! Don't do any sort of memory checking if this is set
bool m_NoLimit;

//! The number of bytes over the high limit for memory usage at the last allocation failure
std::size_t m_CurrentBytesExceeded;

//! Test friends
friend class ::CResourceMonitorTest;
friend class ::CResourceLimitTest;
Expand Down
20 changes: 10 additions & 10 deletions lib/api/CModelSizeStatsJsonWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ namespace {
const std::string JOB_ID("job_id");
const std::string MODEL_SIZE_STATS("model_size_stats");
const std::string MODEL_BYTES("model_bytes");
const std::string MODEL_BYTES_EXCEEDED("model_bytes_exceeded");
const std::string MODEL_BYTES_MEMORY_LIMIT("model_bytes_memory_limit");
const std::string TOTAL_BY_FIELD_COUNT("total_by_field_count");
const std::string TOTAL_OVER_FIELD_COUNT("total_over_field_count");
const std::string TOTAL_PARTITION_FIELD_COUNT("total_partition_field_count");
Expand All @@ -33,17 +35,15 @@ void CModelSizeStatsJsonWriter::write(const std::string& jobId,

writer.String(JOB_ID);
writer.String(jobId);

writer.String(MODEL_BYTES);
// Background persist causes the memory size to double due to copying
// the models. On top of that, after the persist is done we may not
// be able to retrieve that memory back. Thus, we report twice the
// memory usage in order to allow for that.
// See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
// Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
// discusses adding an option to perform only foreground persist.
// If that gets implemented, we should only double when background
// persist is configured.
writer.Uint64(results.s_Usage * 2);
writer.Uint64(results.s_AdjustedUsage);

writer.String(MODEL_BYTES_EXCEEDED);
writer.Uint64(results.s_BytesExceeded);

writer.String(MODEL_BYTES_MEMORY_LIMIT);
writer.Uint64(results.s_BytesMemoryLimit);

writer.String(TOTAL_BY_FIELD_COUNT);
writer.Uint64(results.s_ByFields);
Expand Down
27 changes: 17 additions & 10 deletions lib/api/unittest/CJsonOutputWriterTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1555,12 +1555,15 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {

ml::model::CResourceMonitor::SResults resourceUsage;
resourceUsage.s_Usage = 1;
resourceUsage.s_ByFields = 2;
resourceUsage.s_PartitionFields = 3;
resourceUsage.s_OverFields = 4;
resourceUsage.s_AllocationFailures = 5;
resourceUsage.s_AdjustedUsage = 2;
resourceUsage.s_ByFields = 3;
resourceUsage.s_PartitionFields = 4;
resourceUsage.s_OverFields = 5;
resourceUsage.s_AllocationFailures = 6;
resourceUsage.s_MemoryStatus = ml::model_t::E_MemoryStatusHardLimit;
resourceUsage.s_BucketStartTime = 6;
resourceUsage.s_BucketStartTime = 7;
resourceUsage.s_BytesExceeded = 8;
resourceUsage.s_BytesMemoryLimit = 9;

writer.reportMemoryUsage(resourceUsage);
writer.endOutputBatch(false, 1ul);
Expand All @@ -1580,22 +1583,26 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes"));
CPPUNIT_ASSERT_EQUAL(2, sizeStats["model_bytes"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_by_field_count"));
CPPUNIT_ASSERT_EQUAL(2, sizeStats["total_by_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_by_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_partition_field_count"));
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_partition_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_partition_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_over_field_count"));
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_over_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(5, sizeStats["total_over_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("bucket_allocation_failures_count"));
CPPUNIT_ASSERT_EQUAL(5, sizeStats["bucket_allocation_failures_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(6, sizeStats["bucket_allocation_failures_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("timestamp"));
CPPUNIT_ASSERT_EQUAL(6000, sizeStats["timestamp"].GetInt());
CPPUNIT_ASSERT_EQUAL(7000, sizeStats["timestamp"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("memory_status"));
CPPUNIT_ASSERT_EQUAL(std::string("hard_limit"),
std::string(sizeStats["memory_status"].GetString()));
CPPUNIT_ASSERT(sizeStats.HasMember("log_time"));
int64_t nowMs = ml::core::CTimeUtils::now() * 1000ll;
CPPUNIT_ASSERT(nowMs >= sizeStats["log_time"].GetInt64());
CPPUNIT_ASSERT(nowMs + 1000ll >= sizeStats["log_time"].GetInt64());
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_exceeded"));
CPPUNIT_ASSERT_EQUAL(8, sizeStats["model_bytes_exceeded"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_memory_limit"));
CPPUNIT_ASSERT_EQUAL(9, sizeStats["model_bytes_memory_limit"].GetInt());
}

void CJsonOutputWriterTest::testWriteScheduledEvent() {
Expand Down
10 changes: 9 additions & 1 deletion lib/api/unittest/CModelSnapshotJsonWriterTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,15 @@ void CModelSnapshotJsonWriterTest::testWrite() {
{
model::CResourceMonitor::SResults modelSizeStats{
10000, // bytes used
20000, // bytes used (adjusted)
3, // # by fields
1, // # partition fields
150, // # over fields
4, // # allocation failures
model_t::E_MemoryStatusOk, // memory status
core_t::TTime(1521046309) // bucket start time
core_t::TTime(1521046309), // bucket start time
0, // model bytes exceeded
50000 // model bytes memory limit
};

CModelSnapshotJsonWriter::SModelSnapshotReport report{
Expand Down Expand Up @@ -114,6 +117,11 @@ void CModelSnapshotJsonWriterTest::testWrite() {
CPPUNIT_ASSERT(modelSizeStats.HasMember("timestamp"));
CPPUNIT_ASSERT_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64());
CPPUNIT_ASSERT(modelSizeStats.HasMember("log_time"));
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_exceeded"));
CPPUNIT_ASSERT_EQUAL(int64_t(0), modelSizeStats["model_bytes_exceeded"].GetInt64());
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_memory_limit"));
CPPUNIT_ASSERT_EQUAL(int64_t(50000),
modelSizeStats["model_bytes_memory_limit"].GetInt64());

CPPUNIT_ASSERT(snapshot.HasMember("quantiles"));
const rapidjson::Value& quantiles = snapshot["quantiles"];
Expand Down
39 changes: 29 additions & 10 deletions lib/model/CResourceMonitor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ CResourceMonitor::CResourceMonitor(double byteLimitMargin)
m_HasPruningStarted(false), m_PruneThreshold(0), m_LastPruneTime(0),
m_PruneWindow(std::numeric_limits<std::size_t>::max()),
m_PruneWindowMaximum(std::numeric_limits<std::size_t>::max()),
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()), m_NoLimit(false) {
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()),
m_NoLimit(false), m_CurrentBytesExceeded(0) {
this->updateMemoryLimitsAndPruneThreshold(DEFAULT_MEMORY_LIMIT_MB);
}

Expand Down Expand Up @@ -108,18 +109,21 @@ void CResourceMonitor::refresh(CAnomalyDetector& detector) {

void CResourceMonitor::forceRefresh(CAnomalyDetector& detector) {
this->memUsage(&detector);
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = this->totalMemory();
LOG_TRACE(<< "Checking allocations: currently at " << this->totalMemory());

this->updateAllowAllocations();
}

void CResourceMonitor::updateAllowAllocations() {
std::size_t total{this->totalMemory()};
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = total;
LOG_TRACE(<< "Checking allocations: currently at " << total);
if (m_AllowAllocations) {
if (total > this->highLimit()) {
LOG_INFO(<< "Over current allocation high limit. " << total
<< " bytes used, the limit is " << this->highLimit());
m_AllowAllocations = false;
std::size_t bytesExceeded{total - this->highLimit()};
m_CurrentBytesExceeded = this->adjustedUsage(bytesExceeded);
}
} else if (total < this->lowLimit()) {
LOG_INFO(<< "Below current allocation low limit. " << total
Expand Down Expand Up @@ -204,13 +208,6 @@ bool CResourceMonitor::areAllocationsAllowed() const {
return m_AllowAllocations;
}

bool CResourceMonitor::areAllocationsAllowed(std::size_t size) const {
if (m_AllowAllocations) {
return this->totalMemory() + size < this->highLimit();
}
return false;
}

std::size_t CResourceMonitor::allocationLimit() const {
return this->highLimit() - std::min(this->highLimit(), this->totalMemory());
}
Expand Down Expand Up @@ -268,6 +265,9 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
res.s_OverFields = 0;
res.s_PartitionFields = 0;
res.s_Usage = this->totalMemory();
res.s_AdjustedUsage = this->adjustedUsage(res.s_Usage);
res.s_BytesMemoryLimit = 2 * m_ByteLimitHigh;
res.s_BytesExceeded = m_CurrentBytesExceeded;
res.s_AllocationFailures = 0;
res.s_MemoryStatus = m_MemoryStatus;
res.s_BucketStartTime = bucketStartTime;
Expand All @@ -281,6 +281,25 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
return res;
}

std::size_t CResourceMonitor::adjustedUsage(std::size_t usage) const {
// Background persist causes the memory size to double due to copying
// the models. On top of that, after the persist is done we may not
// be able to retrieve that memory back. Thus, we report twice the
// memory usage in order to allow for that.
// See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
// Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
// discusses adding an option to perform only foreground persist.
// If that gets implemented, we should only double when background
// persist is configured.

// We also scale the reported memory usage by the inverse of the byte limit margin.
// This gives the user a fairer indication of how close the job is to hitting
// the model memory limit in a concise manner (as the limit is scaled down by
// the margin during the beginning period of the job's existence).
size_t adjustedUsage = static_cast<std::size_t>(2 * usage / m_ByteLimitMargin);
return adjustedUsage;
}

void CResourceMonitor::acceptAllocationFailureResult(core_t::TTime time) {
m_MemoryStatus = model_t::E_MemoryStatusHardLimit;
++m_AllocationFailures[time];
Expand Down

0 comments on commit 7a782a9

Please sign in to comment.