[ML] Improve hard_limit audit message (elastic#486)
Add two fields to the model size stats: the current model memory
limit and the number of bytes by which it was exceeded at the point
of the last allocation failure. These will be used to construct a
(hopefully) more informative hard_limit audit message.

The reported memory usage is also scaled to take into account the byte
limit margin, which applies during the initial period of a job's
lifetime and temporarily scales down the high memory limit. This should
give a more accurate picture of how close the memory usage is to the
high limit.

relates elastic/elasticsearch#42086

closes elastic/elasticsearch#38034
edsavage committed May 18, 2019
1 parent 610c114 commit 7e1b5b0
Showing 6 changed files with 78 additions and 35 deletions.
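
To make the reporting change concrete, here is a minimal, self-contained sketch
of the scaling described in the commit message. It is illustrative only: the
helper name reportedModelBytes and the 0.7 margin are assumptions for the
example, not part of this commit; the actual logic lives in
CResourceMonitor::adjustedUsage further down.

#include <cstddef>
#include <iostream>

// Hypothetical helper mirroring the adjustment described above: double the
// measured usage to allow for background persistence copying the models, then
// divide by the byte limit margin that temporarily lowers the effective memory
// limit early in a job's lifetime.
std::size_t reportedModelBytes(std::size_t measuredUsage, double byteLimitMargin) {
    return static_cast<std::size_t>(2 * measuredUsage / byteLimitMargin);
}

int main() {
    const std::size_t measured = 350 * 1024 * 1024; // 350 MiB actually in use (example figure)
    const double margin = 0.7;                      // assumed early-lifetime margin
    // Reports roughly 1000 MiB for 350 MiB of measured usage: 2 * 350 / 0.7.
    std::cout << reportedModelBytes(measured, margin) << " bytes reported\n";
    return 0;
}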
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -44,6 +44,8 @@ to the model. (See {ml-pull}214[#214].)
* Stop linking to libcrypt on Linux. (See {ml-pull}480[#480].)
* Improvements to hard_limit audit message. (See {ml-pull}486[#486].)
=== Bug Fixes
* Handle NaNs when detrending seasonal components. {ml-pull}408[#408]
15 changes: 11 additions & 4 deletions include/model/CResourceMonitor.h
@@ -34,12 +34,15 @@ class MODEL_EXPORT CResourceMonitor {
public:
struct MODEL_EXPORT SResults {
std::size_t s_Usage;
std::size_t s_AdjustedUsage;
std::size_t s_ByFields;
std::size_t s_PartitionFields;
std::size_t s_OverFields;
std::size_t s_AllocationFailures;
model_t::EMemoryStatus s_MemoryStatus;
core_t::TTime s_BucketStartTime;
std::size_t s_BytesExceeded;
std::size_t s_BytesMemoryLimit;
};

public:
@@ -65,10 +68,6 @@ class MODEL_EXPORT CResourceMonitor {
//! taking up too much memory and further allocations should be banned
bool areAllocationsAllowed() const;

//! Query the resource monitor to found out if it's Ok to
//! create structures of a certain size
bool areAllocationsAllowed(std::size_t size) const;

//! Return the amount of remaining space for allocations
std::size_t allocationLimit() const;

@@ -164,6 +163,11 @@ class MODEL_EXPORT CResourceMonitor {
//! Returns the sum of used memory plus any extra memory
std::size_t totalMemory() const;

//! Adjusts the amount of memory reported to take into
//! account the current value of the byte limit margin and the effects
//! of background persistence.
std::size_t adjustedUsage(std::size_t usage) const;

private:
//! The registered collection of components
TDetectorPtrSizeUMap m_Detectors;
@@ -226,6 +230,9 @@ class MODEL_EXPORT CResourceMonitor {
//! Don't do any sort of memory checking if this is set
bool m_NoLimit;

//! The number of bytes over the high limit for memory usage at the last allocation failure
std::size_t m_CurrentBytesExceeded;

//! Test friends
friend class ::CResourceMonitorTest;
friend class ::CResourceLimitTest;
20 changes: 10 additions & 10 deletions lib/api/CModelSizeStatsJsonWriter.cc
@@ -16,6 +16,8 @@ namespace {
const std::string JOB_ID("job_id");
const std::string MODEL_SIZE_STATS("model_size_stats");
const std::string MODEL_BYTES("model_bytes");
const std::string MODEL_BYTES_EXCEEDED("model_bytes_exceeded");
const std::string MODEL_BYTES_MEMORY_LIMIT("model_bytes_memory_limit");
const std::string TOTAL_BY_FIELD_COUNT("total_by_field_count");
const std::string TOTAL_OVER_FIELD_COUNT("total_over_field_count");
const std::string TOTAL_PARTITION_FIELD_COUNT("total_partition_field_count");
Expand All @@ -33,17 +35,15 @@ void CModelSizeStatsJsonWriter::write(const std::string& jobId,

writer.String(JOB_ID);
writer.String(jobId);

writer.String(MODEL_BYTES);
// Background persist causes the memory size to double due to copying
// the models. On top of that, after the persist is done we may not
// be able to retrieve that memory back. Thus, we report twice the
// memory usage in order to allow for that.
// See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
// Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
// discusses adding an option to perform only foreground persist.
// If that gets implemented, we should only double when background
// persist is configured.
writer.Uint64(results.s_Usage * 2);
writer.Uint64(results.s_AdjustedUsage);

writer.String(MODEL_BYTES_EXCEEDED);
writer.Uint64(results.s_BytesExceeded);

writer.String(MODEL_BYTES_MEMORY_LIMIT);
writer.Uint64(results.s_BytesMemoryLimit);

writer.String(TOTAL_BY_FIELD_COUNT);
writer.Uint64(results.s_ByFields);
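
With this change a model_size_stats document carries the two new fields
alongside model_bytes. A hypothetical example of the resulting output shape
(all values made up for illustration):

{"model_size_stats":{"job_id":"my_job","model_bytes":1048576000,"model_bytes_exceeded":14680064,"model_bytes_memory_limit":1073741824,"memory_status":"hard_limit","total_by_field_count":3,"total_partition_field_count":1,"total_over_field_count":0,...}}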
27 changes: 17 additions & 10 deletions lib/api/unittest/CJsonOutputWriterTest.cc
@@ -1555,12 +1555,15 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {

ml::model::CResourceMonitor::SResults resourceUsage;
resourceUsage.s_Usage = 1;
resourceUsage.s_ByFields = 2;
resourceUsage.s_PartitionFields = 3;
resourceUsage.s_OverFields = 4;
resourceUsage.s_AllocationFailures = 5;
resourceUsage.s_AdjustedUsage = 2;
resourceUsage.s_ByFields = 3;
resourceUsage.s_PartitionFields = 4;
resourceUsage.s_OverFields = 5;
resourceUsage.s_AllocationFailures = 6;
resourceUsage.s_MemoryStatus = ml::model_t::E_MemoryStatusHardLimit;
resourceUsage.s_BucketStartTime = 6;
resourceUsage.s_BucketStartTime = 7;
resourceUsage.s_BytesExceeded = 8;
resourceUsage.s_BytesMemoryLimit = 9;

writer.reportMemoryUsage(resourceUsage);
writer.endOutputBatch(false, 1ul);
@@ -1580,22 +1583,26 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes"));
CPPUNIT_ASSERT_EQUAL(2, sizeStats["model_bytes"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_by_field_count"));
CPPUNIT_ASSERT_EQUAL(2, sizeStats["total_by_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_by_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_partition_field_count"));
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_partition_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_partition_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_over_field_count"));
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_over_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(5, sizeStats["total_over_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("bucket_allocation_failures_count"));
CPPUNIT_ASSERT_EQUAL(5, sizeStats["bucket_allocation_failures_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(6, sizeStats["bucket_allocation_failures_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("timestamp"));
CPPUNIT_ASSERT_EQUAL(6000, sizeStats["timestamp"].GetInt());
CPPUNIT_ASSERT_EQUAL(7000, sizeStats["timestamp"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("memory_status"));
CPPUNIT_ASSERT_EQUAL(std::string("hard_limit"),
std::string(sizeStats["memory_status"].GetString()));
CPPUNIT_ASSERT(sizeStats.HasMember("log_time"));
int64_t nowMs = ml::core::CTimeUtils::now() * 1000ll;
CPPUNIT_ASSERT(nowMs >= sizeStats["log_time"].GetInt64());
CPPUNIT_ASSERT(nowMs + 1000ll >= sizeStats["log_time"].GetInt64());
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_exceeded"));
CPPUNIT_ASSERT_EQUAL(8, sizeStats["model_bytes_exceeded"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_memory_limit"));
CPPUNIT_ASSERT_EQUAL(9, sizeStats["model_bytes_memory_limit"].GetInt());
}

void CJsonOutputWriterTest::testWriteScheduledEvent() {
10 changes: 9 additions & 1 deletion lib/api/unittest/CModelSnapshotJsonWriterTest.cc
@@ -33,12 +33,15 @@ void CModelSnapshotJsonWriterTest::testWrite() {
{
model::CResourceMonitor::SResults modelSizeStats{
10000, // bytes used
20000, // bytes used (adjusted)
3, // # by fields
1, // # partition fields
150, // # over fields
4, // # allocation failures
model_t::E_MemoryStatusOk, // memory status
core_t::TTime(1521046309) // bucket start time
core_t::TTime(1521046309), // bucket start time
0, // model bytes exceeded
50000 // model bytes memory limit
};

CModelSnapshotJsonWriter::SModelSnapshotReport report{
@@ -114,6 +117,11 @@ void CModelSnapshotJsonWriterTest::testWrite() {
CPPUNIT_ASSERT(modelSizeStats.HasMember("timestamp"));
CPPUNIT_ASSERT_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64());
CPPUNIT_ASSERT(modelSizeStats.HasMember("log_time"));
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_exceeded"));
CPPUNIT_ASSERT_EQUAL(int64_t(0), modelSizeStats["model_bytes_exceeded"].GetInt64());
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_memory_limit"));
CPPUNIT_ASSERT_EQUAL(int64_t(50000),
modelSizeStats["model_bytes_memory_limit"].GetInt64());

CPPUNIT_ASSERT(snapshot.HasMember("quantiles"));
const rapidjson::Value& quantiles = snapshot["quantiles"];
39 changes: 29 additions & 10 deletions lib/model/CResourceMonitor.cc
@@ -35,7 +35,8 @@ CResourceMonitor::CResourceMonitor(double byteLimitMargin)
m_HasPruningStarted(false), m_PruneThreshold(0), m_LastPruneTime(0),
m_PruneWindow(std::numeric_limits<std::size_t>::max()),
m_PruneWindowMaximum(std::numeric_limits<std::size_t>::max()),
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()), m_NoLimit(false) {
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()),
m_NoLimit(false), m_CurrentBytesExceeded(0) {
this->updateMemoryLimitsAndPruneThreshold(DEFAULT_MEMORY_LIMIT_MB);
}

@@ -108,18 +109,21 @@ void CResourceMonitor::refresh(CAnomalyDetector& detector) {

void CResourceMonitor::forceRefresh(CAnomalyDetector& detector) {
this->memUsage(&detector);
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = this->totalMemory();
LOG_TRACE(<< "Checking allocations: currently at " << this->totalMemory());

this->updateAllowAllocations();
}

void CResourceMonitor::updateAllowAllocations() {
std::size_t total{this->totalMemory()};
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = total;
LOG_TRACE(<< "Checking allocations: currently at " << total);
if (m_AllowAllocations) {
if (total > this->highLimit()) {
LOG_INFO(<< "Over current allocation high limit. " << total
<< " bytes used, the limit is " << this->highLimit());
m_AllowAllocations = false;
std::size_t bytesExceeded{total - this->highLimit()};
m_CurrentBytesExceeded = this->adjustedUsage(bytesExceeded);
}
} else if (total < this->lowLimit()) {
LOG_INFO(<< "Below current allocation low limit. " << total
@@ -204,13 +208,6 @@ bool CResourceMonitor::areAllocationsAllowed() const {
return m_AllowAllocations;
}

bool CResourceMonitor::areAllocationsAllowed(std::size_t size) const {
if (m_AllowAllocations) {
return this->totalMemory() + size < this->highLimit();
}
return false;
}

std::size_t CResourceMonitor::allocationLimit() const {
return this->highLimit() - std::min(this->highLimit(), this->totalMemory());
}
@@ -268,6 +265,9 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
res.s_OverFields = 0;
res.s_PartitionFields = 0;
res.s_Usage = this->totalMemory();
res.s_AdjustedUsage = this->adjustedUsage(res.s_Usage);
res.s_BytesMemoryLimit = 2 * m_ByteLimitHigh;
res.s_BytesExceeded = m_CurrentBytesExceeded;
res.s_AllocationFailures = 0;
res.s_MemoryStatus = m_MemoryStatus;
res.s_BucketStartTime = bucketStartTime;
Expand All @@ -281,6 +281,25 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
return res;
}

std::size_t CResourceMonitor::adjustedUsage(std::size_t usage) const {
// Background persist causes the memory size to double due to copying
// the models. On top of that, after the persist is done we may not
// be able to retrieve that memory back. Thus, we report twice the
// memory usage in order to allow for that.
// See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
// Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
// discusses adding an option to perform only foreground persist.
// If that gets implemented, we should only double when background
// persist is configured.

// We also scale the reported memory usage by the inverse of the byte limit margin.
// This gives the user a fairer indication of how close the job is to hitting
// the model memory limit in a concise manner (as the limit is scaled down by
// the margin during the beginning period of the job's existence).
size_t adjustedUsage = static_cast<std::size_t>(2 * usage / m_ByteLimitMargin);
return adjustedUsage;
}

void CResourceMonitor::acceptAllocationFailureResult(core_t::TTime time) {
m_MemoryStatus = model_t::E_MemoryStatusHardLimit;
++m_AllocationFailures[time];
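
To see how the new numbers fit together (all figures assumed for illustration):
if the margin-scaled high limit is 30 MB when an allocation fails with
totalMemory() at 35 MB, updateAllowAllocations() stores
m_CurrentBytesExceeded = adjustedUsage(5 MB), roughly 2 * 5 MB / 0.7, or about
14 MB. The next model size stats report then carries model_bytes_exceeded of
about 14 MB and model_bytes_memory_limit of 2 * m_ByteLimitHigh, giving the
hard_limit audit message concrete figures to quote.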
