Adding negative cache to throttle extra request https://github.com/opendistro-for-elasticsearch/anomaly-detection/issues/33 #40
FeatureManager.java
@@ -35,6 +35,7 @@
 import com.amazon.opendistroforelasticsearch.ad.model.AnomalyDetector;
 import com.amazon.opendistroforelasticsearch.ad.model.IntervalTimeConfiguration;
+import com.amazon.opendistroforelasticsearch.ad.transport.ADStateManager;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.elasticsearch.action.ActionListener;
@@ -115,15 +116,16 @@ public FeatureManager(
      * @param detector anomaly detector for which the features are returned
      * @param startTime start time of the data point in epoch milliseconds
      * @param endTime end time of the data point in epoch milliseconds
+     * @param stateManager ADStateManager
      * @return unprocessed features and processed features for the current data point
      */
     @Deprecated
-    public SinglePointFeatures getCurrentFeatures(AnomalyDetector detector, long startTime, long endTime) {
+    public SinglePointFeatures getCurrentFeatures(AnomalyDetector detector, long startTime, long endTime, ADStateManager stateManager) {
Comment: The added parameter should be dependency-injected rather than passed all the way down the stack.
Reply: Agree. We should not pass the state manager around.
         double[][] currentPoints = null;
         Deque<Entry<Long, double[]>> shingle = detectorIdsToTimeShingles
             .computeIfAbsent(detector.getDetectorId(), id -> new ArrayDeque<Entry<Long, double[]>>(shingleSize));
         if (shingle.isEmpty() || shingle.getLast().getKey() < endTime) {
-            Optional<double[]> point = searchFeatureDao.getFeaturesForPeriod(detector, startTime, endTime);
+            Optional<double[]> point = searchFeatureDao.getFeaturesForPeriod(detector, startTime, endTime, stateManager);
             if (point.isPresent()) {
                 if (shingle.size() == shingleSize) {
                     shingle.remove();
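A minimal sketch of the constructor injection suggested in the comment above, assuming FeatureManager already receives its collaborators at construction time; the constructor shape and the hypothetical buildFeatures helper are illustrative, not the project's actual wiring:

```java
// Hypothetical sketch: ADStateManager is injected once at construction, so
// the public signature keeps its original shape instead of threading the
// manager through every call in the stack.
public class FeatureManager {

    private final SearchFeatureDao searchFeatureDao;
    private final ADStateManager stateManager; // injected dependency

    public FeatureManager(SearchFeatureDao searchFeatureDao, ADStateManager stateManager) {
        this.searchFeatureDao = searchFeatureDao;
        this.stateManager = stateManager;
    }

    // Callers never see the state manager; the method reads the field
    // wherever the diff above passes the extra parameter.
    public SinglePointFeatures getCurrentFeatures(AnomalyDetector detector, long startTime, long endTime) {
        Optional<double[]> point = searchFeatureDao.getFeaturesForPeriod(detector, startTime, endTime, stateManager);
        return buildFeatures(point); // hypothetical helper standing in for the elided logic
    }

    private SinglePointFeatures buildFeatures(Optional<double[]> point) {
        // placeholder for the shingling logic shown in the diff above
        return null;
    }
}
```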
@@ -174,13 +176,16 @@ private double[][] filterAndFill(Deque<Entry<Long, double[]>> shingle, long endT
      * in dimension via shingling.
      *
      * @param detector contains data info (indices, documents, etc)
+     * @param stateManager ADStateManager
      * @return data for cold-start training, or empty if unavailable
      */
     @Deprecated
-    public Optional<double[][]> getColdStartData(AnomalyDetector detector) {
+    public Optional<double[][]> getColdStartData(AnomalyDetector detector, ADStateManager stateManager) {
         return searchFeatureDao
             .getLatestDataTime(detector)
-            .flatMap(latest -> searchFeatureDao.getFeaturesForSampledPeriods(detector, maxTrainSamples, maxSampleStride, latest))
+            .flatMap(
+                latest -> searchFeatureDao.getFeaturesForSampledPeriods(detector, maxTrainSamples, maxSampleStride, latest, stateManager)
+            )
             .map(
                 samples -> transpose(
                     interpolator.interpolate(transpose(samples.getKey()), samples.getValue() * (samples.getKey().length - 1) + 1)
SearchFeatureDao.java
@@ -30,6 +30,7 @@
 import com.amazon.opendistroforelasticsearch.ad.model.AnomalyDetector;
 import com.amazon.opendistroforelasticsearch.ad.model.IntervalTimeConfiguration;
+import com.amazon.opendistroforelasticsearch.ad.transport.ADStateManager;
 import com.amazon.opendistroforelasticsearch.ad.util.ClientUtil;
 import com.amazon.opendistroforelasticsearch.ad.util.ParseUtils;
 import org.apache.logging.log4j.LogManager;
@@ -114,18 +115,24 @@ public Optional<Long> getLatestDataTime(AnomalyDetector detector) {
     }

     /**
-     * Gets features for the given time period.
+     * Gets features for the given time period. This function also add given detector to negative cache before sending es request.
Comment: Minor. "Adds". Also, use descriptive language instead of prescriptive; see the Javadoc conventions.
Reply: Will update, thanks for the suggestion.
+     * Once we get response/exception within timeout, we treat this request as complete and clear the negative cache.
Comment: Minor. "We" is prescriptive (giving orders to code), not descriptive (stating what the code does).
+     * Otherwise this detector entry remain in the negative to reject further request.
      *
      * @param detector info about indices, documents, feature query
      * @param startTime epoch milliseconds at the beginning of the period
      * @param endTime epoch milliseconds at the end of the period
+     * @param stateManager ADStateManager
      * @throws IllegalStateException when unexpected failures happen
      * @return features from search results, empty when no data found
      */
-    public Optional<double[]> getFeaturesForPeriod(AnomalyDetector detector, long startTime, long endTime) {
+    public Optional<double[]> getFeaturesForPeriod(AnomalyDetector detector, long startTime, long endTime, ADStateManager stateManager) {
Comment: The needed dependency can be injected in this class.
         SearchRequest searchRequest = createFeatureSearchRequest(detector, startTime, endTime, Optional.empty());
+        // add (detectorId, filteredQuery) to negative cache
+        stateManager.insertFilteredQuery(detector, searchRequest);
+        // send throttled request: this request will clear the negative cache if the request finished within timeout
         return clientUtil
-            .<SearchRequest, SearchResponse>timedRequest(searchRequest, logger, client::search)
+            .<SearchRequest, SearchResponse>throttledTimedRequest(searchRequest, logger, client::search, stateManager, detector)
             .flatMap(resp -> parseResponse(resp, detector.getEnabledFeatureIds()));
     }
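Taken together, the two wording comments point toward a descriptive Javadoc for this method; a possible phrasing (a sketch of the suggested rewrite, not the merged text):

```java
/**
 * Gets features for the given time period. Adds the given detector to the
 * negative cache before sending the ES request. A response or exception
 * received within the timeout marks the request complete and clears the
 * negative cache entry; otherwise the entry remains in the negative cache
 * and further requests for this detector are rejected.
 */
```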
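The new clientUtil.throttledTimedRequest is referenced but not shown in this diff. A plausible sketch of its semantics, modeled on the latch-based timedRequest pattern; the exact signature and the requestTimeout field (a java.time.Duration here) are assumptions, and it relies on org.elasticsearch.action.LatchedActionListener plus java.util.concurrent primitives:

```java
public <Request extends ActionRequest, Response extends ActionResponse> Optional<Response> throttledTimedRequest(
    Request request,
    Logger logger,
    BiConsumer<Request, ActionListener<Response>> consumer,
    ADStateManager stateManager,
    AnomalyDetector detector
) {
    try {
        AtomicReference<Response> respReference = new AtomicReference<>();
        CountDownLatch latch = new CountDownLatch(1);
        consumer.accept(request, new LatchedActionListener<>(ActionListener.wrap(response -> {
            // completed within the timeout: treat the request as done and
            // clear this detector's negative cache entry
            stateManager.clearFilteredQuery(detector);
            respReference.set(response);
        }, exception -> {
            // an exception within the timeout also completes the request
            stateManager.clearFilteredQuery(detector);
            logger.error("Request failed for detector " + detector.getDetectorId(), exception);
        }), latch));
        if (!latch.await(requestTimeout.getSeconds(), TimeUnit.SECONDS)) {
            // timed out: the negative cache entry stays behind and rejects
            // further requests for this detector
            logger.warn("Timed out waiting for response for detector {}", detector.getDetectorId());
        }
        return Optional.ofNullable(respReference.get());
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        return Optional.empty();
    }
}
```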
@@ -242,20 +249,22 @@ public void getFeatureSamplesForPeriods(
      * @param maxSamples the maximum number of samples to return
      * @param maxStride the maximum number of periods between samples
      * @param endTime the end time of the latest period
+     * @param stateManager ADStateManager
      * @return sampled features and stride, empty when no data found
      */
     public Optional<Entry<double[][], Integer>> getFeaturesForSampledPeriods(
         AnomalyDetector detector,
         int maxSamples,
         int maxStride,
-        long endTime
+        long endTime,
+        ADStateManager stateManager
     ) {
         Map<Long, double[]> cache = new HashMap<>();
         int currentStride = maxStride;
         Optional<double[][]> features = Optional.empty();
         while (currentStride >= 1) {
             boolean isInterpolatable = currentStride < maxStride;
-            features = getFeaturesForSampledPeriods(detector, maxSamples, currentStride, endTime, cache, isInterpolatable);
+            features = getFeaturesForSampledPeriods(detector, maxSamples, currentStride, endTime, cache, isInterpolatable, stateManager);
             if (!features.isPresent() || features.get().length > maxSamples / 2 || currentStride == 1) {
                 break;
             } else {
@@ -275,7 +284,8 @@ private Optional<double[][]> getFeaturesForSampledPeriods(
         int stride,
         long endTime,
         Map<Long, double[]> cache,
-        boolean isInterpolatable
+        boolean isInterpolatable,
+        ADStateManager stateManager
     ) {
         ArrayDeque<double[]> sampledFeatures = new ArrayDeque<>(maxSamples);
         for (int i = 0; i < maxSamples; i++) {
@@ -284,7 +294,7 @@ private Optional<double[][]> getFeaturesForSampledPeriods(
             if (cache.containsKey(end)) {
                 sampledFeatures.addFirst(cache.get(end));
             } else {
-                Optional<double[]> features = getFeaturesForPeriod(detector, end - span, end);
+                Optional<double[]> features = getFeaturesForPeriod(detector, end - span, end, stateManager);
                 if (features.isPresent()) {
                     cache.put(end, features.get());
                     sampledFeatures.addFirst(features.get());
ADStateManager.java
@@ -37,6 +37,7 @@
 import org.apache.logging.log4j.Logger;
 import org.elasticsearch.action.get.GetRequest;
 import org.elasticsearch.action.get.GetResponse;
+import org.elasticsearch.action.search.SearchRequest;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
@@ -55,6 +56,9 @@ public class ADStateManager {
     private static final Logger LOG = LogManager.getLogger(ADStateManager.class);
     private ConcurrentHashMap<String, Entry<AnomalyDetector, Instant>> currentDetectors;
     private ConcurrentHashMap<String, Entry<Integer, Instant>> partitionNumber;
+    // negativeCache is used to reject a search query if the given detector already has one query running;
+    // the key is the detectorId, and the value is an entry whose key is the SearchRequest and whose value is the timestamp
+    private ConcurrentHashMap<String, Entry<SearchRequest, Instant>> negativeCache;
     private Client client;
     private Random random;
     private ModelManager modelManager;
@@ -83,6 +87,7 @@ public ADStateManager(
         this.partitionNumber = new ConcurrentHashMap<>();
         this.clientUtil = clientUtil;
         this.backpressureMuter = new ConcurrentHashMap<>();
+        this.negativeCache = new ConcurrentHashMap<>();
         this.clock = clock;
         this.settings = settings;
         this.stateTtl = stateTtl;
@@ -119,6 +124,47 @@ public int getPartitionNumber(String adID) throws InterruptedException {
         return partitionNum;
     }

+    /**
+     * Get the negative cache value (SearchRequest, Instant) for the given detector.
+     * If detectorId is null, return Optional.empty()
Comment: Minor. This can be assumed to be unlikely. If it is a real concern that must be addressed, the proper behavior is to throw an exception.
+     * @param detector AnomalyDetector
+     * @return negative cache value (SearchRequest, Instant)
+     */
+    public Optional<Entry<SearchRequest, Instant>> getFilteredQuery(AnomalyDetector detector) {
+        if (detector.getDetectorId() == null) {
+            return Optional.empty();
+        }
+        if (negativeCache.containsKey(detector.getDetectorId())) {
+            return Optional.of(negativeCache.get(detector.getDetectorId()));
Comment: Minor. This method can be simplified to this line.
+        }
+        return Optional.empty();
+    }
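The one-line form the reviewer likely has in mind is the Optional.ofNullable idiom, which also drops the null-id guard in line with the earlier comment (a sketch):

```java
public Optional<Entry<SearchRequest, Instant>> getFilteredQuery(AnomalyDetector detector) {
    // ConcurrentHashMap.get(null) throws NullPointerException, matching the
    // earlier suggestion to fail loudly on a null detector id; a cache miss
    // yields Optional.empty() as before.
    return Optional.ofNullable(negativeCache.get(detector.getDetectorId()));
}
```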
+
+    /**
+     * Insert the negative cache entry for the given detector.
+     * If detectorId is null, do nothing.
+     * @param detector AnomalyDetector
+     * @param searchRequest ES search request
+     */
+    public void insertFilteredQuery(AnomalyDetector detector, SearchRequest searchRequest) {
+        if (detector.getDetectorId() == null) {
+            return;
+        }
+        negativeCache.putIfAbsent(detector.getDetectorId(), new SimpleEntry<>(searchRequest, clock.instant()));
Comment: Question. Is put more expected by the caller? Otherwise the insert call returns but the entry is still not updated. If that is by design, the documentation should make it clear.
Reply: Yes, I think put is better than putIfAbsent here. Will update.
+    }
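Per the reply, the follow-up change would presumably be the unconditional overwrite:

```java
// put refreshes both the stored request and its timestamp on every call;
// putIfAbsent would silently keep a stale entry if one is already present.
negativeCache.put(detector.getDetectorId(), new SimpleEntry<>(searchRequest, clock.instant()));
```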
+
+    /**
+     * Clear the negative cache for the given detector.
+     * If detectorId is null, do nothing.
+     * @param detector AnomalyDetector
+     */
+    public void clearFilteredQuery(AnomalyDetector detector) {
+        if (detector.getDetectorId() == null) {
+            return;
+        }
+        negativeCache.keySet().removeIf(key -> key.equals(detector.getDetectorId()));
Comment: Minor. Map::remove should work.
+    }
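And the direct removal the reviewer suggests:

```java
// Map::remove is a single keyed lookup; removeIf over the key set scans
// every key to delete at most one entry.
negativeCache.remove(detector.getDetectorId());
```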
+
     public Optional<AnomalyDetector> getAnomalyDetector(String adID) {
         Entry<AnomalyDetector, Instant> detectorAndTime = currentDetectors.get(adID);
         if (detectorAndTime != null) {
Comment: Question. From #33, a query cannot be aborted. This comment indicates the opposite. Is it confusing?
Reply: Good catch, that's true. Currently anomaly-detection cannot abort a running query in Elasticsearch. If a query runs longer than expected, anomaly-detection will not wait for it even though the query is still running. To solve this, we will 1) stop accepting new queries when this happens, which is #33, and 2) add a daily cron that cleans up running queries using the ES task management API (https://www.elastic.co/guide/en/elasticsearch/reference/current/tasks.html).
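A sketch of the rejection path in point 1, assuming a caller consults the negative cache before issuing a new search; the method names mirror the ones added in this diff, but the surrounding control flow is illustrative:

```java
// Hypothetical guard: if a previous query for this detector never completed
// within its timeout, its negative cache entry is still present, so the new
// request is rejected instead of stacking another search on the cluster.
Optional<Entry<SearchRequest, Instant>> pending = stateManager.getFilteredQuery(detector);
if (pending.isPresent()) {
    logger.info("Rejecting query for detector {}; a previous query has been running since {}",
        detector.getDetectorId(), pending.get().getValue());
    return Optional.empty();
}
```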