Skip to content

Commit

Permalink
[ML] Improve hard_limit audit message (#42086)
Browse files Browse the repository at this point in the history
Improve the hard_limit memory audit message by reporting how many bytes
over the configured memory limit the job was at the point of the last
allocation failure.

Previously the model memory usage was reported; however, this was
inaccurate and hence of limited use, primarily because the total
memory used by the model can decrease significantly after the model's
status is changed to hard_limit but before the model size stats are
reported from autodetect to ES.

While this PR contains the changes to the format of the hard_limit audit
message, it is dependent on modifications to the ml-cpp backend to
send additional data fields in the model size stats message. These
changes will follow in a subsequent PR. It is worth noting that this PR
must be merged prior to the ml-cpp one, to keep CI tests happy.
  • Loading branch information
edsavage committed May 17, 2019
1 parent f244736 commit a68b04e
Show file tree
Hide file tree
Showing 8 changed files with 171 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ public class ModelSizeStats implements ToXContentObject {
* Field Names
*/
public static final ParseField MODEL_BYTES_FIELD = new ParseField("model_bytes");
public static final ParseField MODEL_BYTES_EXCEEDED_FIELD = new ParseField("model_bytes_exceeded");
public static final ParseField MODEL_BYTES_MEMORY_LIMIT_FIELD = new ParseField("model_bytes_memory_limit");
public static final ParseField TOTAL_BY_FIELD_COUNT_FIELD = new ParseField("total_by_field_count");
public static final ParseField TOTAL_OVER_FIELD_COUNT_FIELD = new ParseField("total_over_field_count");
public static final ParseField TOTAL_PARTITION_FIELD_COUNT_FIELD = new ParseField("total_partition_field_count");
Expand All @@ -61,6 +63,8 @@ public class ModelSizeStats implements ToXContentObject {
static {
PARSER.declareString(ConstructingObjectParser.constructorArg(), Job.ID);
PARSER.declareLong(Builder::setModelBytes, MODEL_BYTES_FIELD);
PARSER.declareLong(Builder::setModelBytesExceeded, MODEL_BYTES_EXCEEDED_FIELD);
PARSER.declareLong(Builder::setModelBytesMemoryLimit, MODEL_BYTES_MEMORY_LIMIT_FIELD);
PARSER.declareLong(Builder::setBucketAllocationFailuresCount, BUCKET_ALLOCATION_FAILURES_COUNT_FIELD);
PARSER.declareLong(Builder::setTotalByFieldCount, TOTAL_BY_FIELD_COUNT_FIELD);
PARSER.declareLong(Builder::setTotalOverFieldCount, TOTAL_OVER_FIELD_COUNT_FIELD);
Expand Down Expand Up @@ -97,6 +101,8 @@ public String toString() {

private final String jobId;
private final long modelBytes;
private final Long modelBytesExceeded;
private final Long modelBytesMemoryLimit;
private final long totalByFieldCount;
private final long totalOverFieldCount;
private final long totalPartitionFieldCount;
Expand All @@ -105,11 +111,13 @@ public String toString() {
private final Date timestamp;
private final Date logTime;

private ModelSizeStats(String jobId, long modelBytes, long totalByFieldCount, long totalOverFieldCount,
long totalPartitionFieldCount, long bucketAllocationFailuresCount, MemoryStatus memoryStatus,
Date timestamp, Date logTime) {
private ModelSizeStats(String jobId, long modelBytes, Long modelBytesExceeded, Long modelBytesMemoryLimit, long totalByFieldCount,
long totalOverFieldCount, long totalPartitionFieldCount, long bucketAllocationFailuresCount,
MemoryStatus memoryStatus, Date timestamp, Date logTime) {
this.jobId = jobId;
this.modelBytes = modelBytes;
this.modelBytesExceeded = modelBytesExceeded;
this.modelBytesMemoryLimit = modelBytesMemoryLimit;
this.totalByFieldCount = totalByFieldCount;
this.totalOverFieldCount = totalOverFieldCount;
this.totalPartitionFieldCount = totalPartitionFieldCount;
Expand All @@ -126,6 +134,12 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
builder.field(Job.ID.getPreferredName(), jobId);
builder.field(Result.RESULT_TYPE.getPreferredName(), RESULT_TYPE_VALUE);
builder.field(MODEL_BYTES_FIELD.getPreferredName(), modelBytes);
if (modelBytesExceeded != null) {
builder.field(MODEL_BYTES_EXCEEDED_FIELD.getPreferredName(), modelBytesExceeded);
}
if (modelBytesMemoryLimit != null) {
builder.field(MODEL_BYTES_MEMORY_LIMIT_FIELD.getPreferredName(), modelBytesMemoryLimit);
}
builder.field(TOTAL_BY_FIELD_COUNT_FIELD.getPreferredName(), totalByFieldCount);
builder.field(TOTAL_OVER_FIELD_COUNT_FIELD.getPreferredName(), totalOverFieldCount);
builder.field(TOTAL_PARTITION_FIELD_COUNT_FIELD.getPreferredName(), totalPartitionFieldCount);
Expand All @@ -148,6 +162,14 @@ public long getModelBytes() {
return modelBytes;
}

/**
 * Returns the value serialized under {@code model_bytes_exceeded}.
 *
 * @return the stored value; may be {@code null}, in which case the field
 *         is left out of the XContent output
 */
public Long getModelBytesExceeded() {
    return this.modelBytesExceeded;
}

/**
 * Returns the value serialized under {@code model_bytes_memory_limit}.
 *
 * @return the stored value; may be {@code null}, in which case the field
 *         is left out of the XContent output
 */
public Long getModelBytesMemoryLimit() {
    return this.modelBytesMemoryLimit;
}

/**
 * Returns the value serialized under {@code total_by_field_count}.
 *
 * @return the stored count as a primitive {@code long}
 */
public long getTotalByFieldCount() {
    return this.totalByFieldCount;
}
Expand Down Expand Up @@ -188,8 +210,8 @@ public Date getLogTime() {

@Override
public int hashCode() {
return Objects.hash(jobId, modelBytes, totalByFieldCount, totalOverFieldCount, totalPartitionFieldCount,
this.bucketAllocationFailuresCount, memoryStatus, timestamp, logTime);
return Objects.hash(jobId, modelBytes, modelBytesExceeded, modelBytesMemoryLimit, totalByFieldCount, totalOverFieldCount,
totalPartitionFieldCount, this.bucketAllocationFailuresCount, memoryStatus, timestamp, logTime);
}

/**
Expand All @@ -207,7 +229,8 @@ public boolean equals(Object other) {

ModelSizeStats that = (ModelSizeStats) other;

return this.modelBytes == that.modelBytes && this.totalByFieldCount == that.totalByFieldCount
return this.modelBytes == that.modelBytes && Objects.equals(this.modelBytesExceeded, that.modelBytesExceeded)
&& Objects.equals(this.modelBytesMemoryLimit, that.modelBytesMemoryLimit) && this.totalByFieldCount == that.totalByFieldCount
&& this.totalOverFieldCount == that.totalOverFieldCount && this.totalPartitionFieldCount == that.totalPartitionFieldCount
&& this.bucketAllocationFailuresCount == that.bucketAllocationFailuresCount
&& Objects.equals(this.memoryStatus, that.memoryStatus) && Objects.equals(this.timestamp, that.timestamp)
Expand All @@ -219,6 +242,8 @@ public static class Builder {

private final String jobId;
private long modelBytes;
private Long modelBytesExceeded;
private Long modelBytesMemoryLimit;
private long totalByFieldCount;
private long totalOverFieldCount;
private long totalPartitionFieldCount;
Expand All @@ -236,6 +261,8 @@ public Builder(String jobId) {
public Builder(ModelSizeStats modelSizeStats) {
this.jobId = modelSizeStats.jobId;
this.modelBytes = modelSizeStats.modelBytes;
this.modelBytesExceeded = modelSizeStats.modelBytesExceeded;
this.modelBytesMemoryLimit = modelSizeStats.modelBytesMemoryLimit;
this.totalByFieldCount = modelSizeStats.totalByFieldCount;
this.totalOverFieldCount = modelSizeStats.totalOverFieldCount;
this.totalPartitionFieldCount = modelSizeStats.totalPartitionFieldCount;
Expand All @@ -250,6 +277,16 @@ public Builder setModelBytes(long modelBytes) {
return this;
}

/**
 * Stores the value that will be serialized under {@code model_bytes_exceeded}.
 * The primitive argument is boxed, so once this has been called the
 * built object reports a non-null value for this field.
 *
 * @param modelBytesExceeded the value to store
 * @return this builder, so that calls can be chained
 */
public Builder setModelBytesExceeded(long modelBytesExceeded) {
    this.modelBytesExceeded = Long.valueOf(modelBytesExceeded);
    return this;
}

/**
 * Stores the value that will be serialized under {@code model_bytes_memory_limit}.
 * The primitive argument is boxed, so once this has been called the
 * built object reports a non-null value for this field.
 *
 * @param modelBytesMemoryLimit the value to store
 * @return this builder, so that calls can be chained
 */
public Builder setModelBytesMemoryLimit(long modelBytesMemoryLimit) {
    this.modelBytesMemoryLimit = Long.valueOf(modelBytesMemoryLimit);
    return this;
}

public Builder setTotalByFieldCount(long totalByFieldCount) {
this.totalByFieldCount = totalByFieldCount;
return this;
Expand Down Expand Up @@ -287,8 +324,8 @@ public Builder setLogTime(Date logTime) {
}

public ModelSizeStats build() {
return new ModelSizeStats(jobId, modelBytes, totalByFieldCount, totalOverFieldCount, totalPartitionFieldCount,
bucketAllocationFailuresCount, memoryStatus, timestamp, logTime);
return new ModelSizeStats(jobId, modelBytes, modelBytesExceeded, modelBytesMemoryLimit, totalByFieldCount, totalOverFieldCount,
totalPartitionFieldCount, bucketAllocationFailuresCount, memoryStatus, timestamp, logTime);
}
}
}
Loading

0 comments on commit a68b04e

Please sign in to comment.