Skip to content
This repository has been archived by the owner on Aug 2, 2022. It is now read-only.

not return estimated minutes remaining until cold start is finished #210

Merged
merged 3 commits into from
Aug 18, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ public AnomalyDetectorProfileRunner(
this.client = client;
this.xContentRegistry = xContentRegistry;
this.nodeFilter = nodeFilter;
if (requiredSamples <= 0) {
throw new IllegalArgumentException("required samples should be a positive number, but was " + requiredSamples);
Comment on lines +79 to +80
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

testing. the testing of this condition is missing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added

}
this.requiredSamples = requiredSamples;
}

Expand Down Expand Up @@ -287,14 +290,7 @@ private ActionListener<GetResponse> onGetDetectorForInitProgress(
ensureExpectedToken(XContentParser.Token.START_OBJECT, parser.nextToken(), parser::getTokenLocation);
AnomalyDetector detector = AnomalyDetector.parse(parser, detectorId);
long intervalMins = ((IntervalTimeConfiguration) detector.getDetectionInterval()).toDuration().toMinutes();
float percent = (100.0f * totalUpdates) / requiredSamples;
int neededPoints = (int) (requiredSamples - totalUpdates);
InitProgressProfile initProgress = new InitProgressProfile(
// rounding: 93.456 => 93%, 93.556 => 94%
String.format("%.0f%%", percent),
intervalMins * neededPoints,
neededPoints
);
InitProgressProfile initProgress = computeInitProgressProfile(totalUpdates, intervalMins);

listener.onResponse(new DetectorProfile.Builder().initProgress(initProgress).build());
} catch (Exception t) {
Expand All @@ -308,6 +304,17 @@ private ActionListener<GetResponse> onGetDetectorForInitProgress(
}, exception -> { listener.failImmediately(FAIL_TO_FIND_DETECTOR_MSG + detectorId, exception); });
}

private InitProgressProfile computeInitProgressProfile(long totalUpdates, long intervalMins) {
float percent = (100.0f * totalUpdates) / requiredSamples;
ohltyler marked this conversation as resolved.
Show resolved Hide resolved
int neededPoints = (int) (requiredSamples - totalUpdates);
return new InitProgressProfile(
// rounding: 93.456 => 93%, 93.556 => 94%
String.format("%.0f%%", percent),
intervalMins * neededPoints,
neededPoints
);
}

private void profileModels(
String detectorId,
Set<ProfileName> profiles,
Expand Down Expand Up @@ -357,7 +364,7 @@ private ActionListener<RCFPollingResponse> onPollRCFUpdates(
return ActionListener.wrap(rcfPollResponse -> {
long totalUpdates = rcfPollResponse.getTotalUpdates();
if (totalUpdates < requiredSamples) {
processInitResponse(detectorId, profilesToCollect, listener, totalUpdates);
processInitResponse(detectorId, profilesToCollect, listener, totalUpdates, false);
} else {
if (profilesToCollect.contains(ProfileName.STATE)) {
listener.onResponse(new DetectorProfile.Builder().state(DetectorState.RUNNING).build());
Expand All @@ -379,7 +386,11 @@ private ActionListener<RCFPollingResponse> onPollRCFUpdates(
|| (causeException instanceof IndexNotFoundException
&& causeException.getMessage().contains(CommonName.CHECKPOINT_INDEX_NAME))) {
// cannot find checkpoint
processInitResponse(detectorId, profilesToCollect, listener, 0L);
// We don't want to show the estimated time remaining to initialize
// a detector before cold start finishes, where the actual
// initialization time may be much shorter if sufficient historical
// data exists.
processInitResponse(detectorId, profilesToCollect, listener, 0L, true);
} else {
logger.error(new ParameterizedMessage("Fail to get init progress through messaging for {}", detectorId), exception);
listener.failImmediately(FAIL_TO_GET_PROFILE_MSG + detectorId, exception);
Expand All @@ -391,19 +402,26 @@ private void processInitResponse(
String detectorId,
Set<ProfileName> profilesToCollect,
MultiResponsesDelegateActionListener<DetectorProfile> listener,
long totalUpdates
long totalUpdates,
boolean hideMinutesLeft
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this should be hide estimate, which affects minutes left and data points needed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you meant the variable name should be hideEstimate? It only affect minutes left, not data points needed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see the data needed and time needed together. Before model training is done, they should be hidden together by a message such as "Model training is starting soon". Once that is done, the detailed information on needed data and time is now shown, such as "need x data and y time".

It'd be confusing to see "need x data and no time". What does the ux look like in this case?

Copy link
Contributor

@ohltyler ohltyler Aug 13, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as ux goes, we will probably add an intermediate callout saying something like "attempting to initialize with existing/historical data", and show that callout until we can retrieve minutesLeft from the backend profile call. Then we show the regular progress bar callout, which includes the "need x data and y time" info. See Kibana issue here.

I guess it's kind of arbitrary on if we hide minutes left and/or data points left, but need some way to indicate on the frontend that the cold start process isn't finished.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The initial temporal call out is informative enough. Before a model is trained, both data and time that the (non-existent) model will need to wait for should be equally unknown.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we have a separate callout, both of these would be ignored as far as frontend is concerned. Wouldn't we want a different callout in this case? Seems that we wouldn't want to show 0% progress either, when that may immediately go to 100% and disappear if there is sufficient historical data.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The call out for the case before model training may just be "attempting to initialize with existing/historical data"/"Model training is starting soon", no data/time/progress needed to cause confusion or panic (yet).

Copy link
Member Author

@kaituo kaituo Aug 17, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

First, UX does not show needed shingles. They only look at progress and estimated time. Second, as an API user, I don't feel it causes confusion when we show needed shingles. For example, one minute ago, we need 128 shingles. After 1 minute, we need 0. It means we find enough training data. At that particular moment, the needed shingles are accurate. If cold start is stuck or retrying, the number also gives clarification on the state.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not apply a more consistent logic to make code more maintainable and understandable? that is before a model is created, the time and data it needs to wait is either both unknown or both guessed. Or, what's the benefit of having mixed results from different logic for each field?

this is a question not a blocker

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The data it needs to wait is not guessed. The time is. Without a checkpoint, it is accurate to say we need 128 shingles.

The benefit is provide transparency instead of guessing.

Thanks for not blocking.

) {
if (profilesToCollect.contains(ProfileName.STATE)) {
listener.onResponse(new DetectorProfile.Builder().state(DetectorState.INIT).build());
}

if (profilesToCollect.contains(ProfileName.INIT_PROGRESS)) {
GetRequest getDetectorRequest = new GetRequest(ANOMALY_DETECTORS_INDEX, detectorId);
client
.get(
getDetectorRequest,
onGetDetectorForInitProgress(listener, detectorId, profilesToCollect, totalUpdates, requiredSamples)
);
if (hideMinutesLeft) {
InitProgressProfile initProgress = computeInitProgressProfile(totalUpdates, 0);
listener.onResponse(new DetectorProfile.Builder().initProgress(initProgress).build());
} else {
GetRequest getDetectorRequest = new GetRequest(ANOMALY_DETECTORS_INDEX, detectorId);
client
.get(
getDetectorRequest,
onGetDetectorForInitProgress(listener, detectorId, profilesToCollect, totalUpdates, requiredSamples)
);
}

}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -663,4 +663,30 @@ public void testInitNoUpdateNoIndex() throws IOException, InterruptedException {
}), stateInitProgress);
assertTrue(inProgressLatch.await(100, TimeUnit.SECONDS));
}

public void testInitNoIndex() throws IOException, InterruptedException {
setUpClientGet(DetectorStatus.EXIST, JobStatus.ENABLED, RCFPollingStatus.INDEX_NOT_FOUND, ErrorResultStatus.NO_ERROR);
DetectorProfile expectedProfile = new DetectorProfile.Builder()
.state(DetectorState.INIT)
.initProgress(new InitProgressProfile("0%", 0, requiredSamples))
.build();
final CountDownLatch inProgressLatch = new CountDownLatch(1);

runner.profile(detector.getDetectorId(), ActionListener.wrap(response -> {
assertEquals(expectedProfile, response);
inProgressLatch.countDown();
}, exception -> {
logger.error(exception);
for (StackTraceElement ste : exception.getStackTrace()) {
logger.info(ste);
}
assertTrue("Should not reach here ", false);
inProgressLatch.countDown();
}), stateInitProgress);
assertTrue(inProgressLatch.await(100, TimeUnit.SECONDS));
}

public void testInvalidRequiredSamples() {
expectThrows(IllegalArgumentException.class, () -> new AnomalyDetectorProfileRunner(client, xContentRegistry(), nodeFilter, 0));
}
}