This repository has been archived by the owner on Aug 2, 2022. It is now read-only.

FieldData and Shard Request Cache RCA #265

Merged: 3 commits, Jul 23, 2020
Changes from 1 commit
@@ -59,6 +59,17 @@ public class ResourceUtil {
.setResourceEnum(ResourceEnum.SEARCH_THREADPOOL)
.setMetricEnum(MetricEnum.QUEUE_REJECTION).build();

// cache
public static final Resource FIELD_DATA_CACHE_EVICTION = Resource.newBuilder()
.setResourceEnum(ResourceEnum.FIELD_DATA_CACHE)
.setMetricEnum(MetricEnum.CACHE_EVICTION).build();
public static final Resource SHARD_REQUEST_CACHE_EVICTION = Resource.newBuilder()
.setResourceEnum(ResourceEnum.SHARD_REQUEST_CACHE)
.setMetricEnum(MetricEnum.CACHE_EVICTION).build();
public static final Resource SHARD_REQUEST_CACHE_HIT = Resource.newBuilder()
.setResourceEnum(ResourceEnum.SHARD_REQUEST_CACHE)
.setMetricEnum(MetricEnum.CACHE_HIT).build();

/**
* Read the resourceType name from the ResourceType object
* @param resource grpc Resource object
@@ -0,0 +1,48 @@
/*
* Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.cache;

import com.amazon.opendistro.elasticsearch.performanceanalyzer.metricsdb.MetricsDB;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Metric;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jooq.Record;

public class CacheUtil {
private static final Logger LOG = LogManager.getLogger(CacheUtil.class);
private static final double CONVERT_BYTES_TO_MEGABYTES = Math.pow(1024, 2); // 1 MB = 1024^2 bytes

public static Double getTotalSizeInMB(final Metric sizeMetric) {
double sizeTotalInMB = 0;

// we expect the Metric to have a single flow unit since it is consumed locally
MetricFlowUnit flowUnit = sizeMetric.getFlowUnits().get(0);
if (flowUnit.isEmpty() || flowUnit.getData() == null) {
return sizeTotalInMB;
}

for (Record record : flowUnit.getData()) {
double size = record.getValue(MetricsDB.MAX, Double.class);
if (Double.isNaN(size)) {
LOG.error("Failed to parse metric in FlowUnit from {}", sizeMetric.getClass().getName());
} else {
sizeTotalInMB += size / CONVERT_BYTES_TO_MEGABYTES;
Contributor:
Floating point comparisons have rounding errors which can affect the places where this value is used. Should we just convert it to KB and use long values so that we tolerate a rounding error of 1 KB?

Contributor (author):
Thank you, Vigya, for the suggestion. I agree we can go with KB and tolerate the 1 KB rounding error; I have updated the code accordingly.

}
}
return sizeTotalInMB;
}
}
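
Following the review thread above, here is a minimal sketch of what the KB-based variant could look like (the method and constant names are hypothetical, not necessarily what the updated commit uses):

    // Hypothetical KB-based variant, per the review thread above: summing in
    // whole kilobytes as a long tolerates a rounding error of at most 1 KB.
    private static final long KILOBYTE_IN_BYTES = 1024;

    public static long getTotalSizeInKB(final Metric sizeMetric) {
        long sizeTotalInKB = 0;
        // same single-flow-unit assumption as getTotalSizeInMB above
        MetricFlowUnit flowUnit = sizeMetric.getFlowUnits().get(0);
        if (flowUnit.isEmpty() || flowUnit.getData() == null) {
            return sizeTotalInKB;
        }
        for (Record record : flowUnit.getData()) {
            double sizeInBytes = record.getValue(MetricsDB.MAX, Double.class);
            if (!Double.isNaN(sizeInBytes)) {
                sizeTotalInKB += (long) (sizeInBytes / KILOBYTE_IN_BYTES);
            }
        }
        return sizeTotalInKB;
    }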
@@ -0,0 +1,204 @@
/*
* Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.cache;

import static com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.FIELD_DATA_CACHE_EVICTION;
import static com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.cache.CacheUtil.getTotalSizeInMB;

import com.amazon.opendistro.elasticsearch.performanceanalyzer.grpc.FlowUnitMessage;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.grpc.Resource;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.metricsdb.MetricsDB;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Metric;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Rca;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Resources;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.reader.ClusterDetailsEventProcessor;
import com.google.common.annotations.VisibleForTesting;
import java.time.Clock;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
 * The Field Data Cache RCA identifies when the cache is unhealthy (thrashing); otherwise it is
 * considered healthy. The dimensions used for this analysis are the cache eviction count, the
 * current cache weight (size), and the configured maximum cache weight (size).
 * Note: For the Field Data Cache, hit and miss metrics aren't available.
 *
 * <p>Cache eviction within Elasticsearch happens in the following scenarios:
 * <ol>
 * <li>Mutation to the cache (entry insertion/promotion and manual invalidation)
 * <li>Explicit call to refresh()
 * </ol>
 *
 * <p>Cache eviction requires that either the cache weight exceeds the maximum weight OR the
 * entry TTL has expired. For the Field Data Cache, no expire setting is present, so eviction
 * (removal from the cache map and LRU linked list, with the entry updated to EVICTED) happens
 * only when cache_weight exceeds max_cache_weight.
 *
 * <p>In contrast, cache invalidation is performed manually on cache clear() and index close()
 * invocation, with removalReason set to INVALIDATED, and a forced eviction is performed to
 * ensure cleanup.
 *
 * <p>This RCA reads 'fieldDataCacheEvictions', 'fieldDataCacheSize' and 'fieldDataCacheMaxSize'
 * from upstream metrics and maintains a collector that tracks the time window period (tp) over
 * which we have repeatedly seen evictions. This RCA is marked unhealthy if tp is above the
 * threshold (300 seconds) and the cache size exceeds the configured maximum cache size.
 */
public class FieldDataCacheRca extends Rca<ResourceFlowUnit<HotNodeSummary>> {
private static final Logger LOG = LogManager.getLogger(FieldDataCacheRca.class);
private static final long EVICTION_THRESHOLD_TIME_PERIOD_IN_MILLISECOND = TimeUnit.SECONDS.toMillis(300);

private final Metric fieldDataCacheEvictions;
private final Metric fieldDataCacheSize;
private final Metric fieldDataCacheMaxSize;
private final int rcaPeriod;
private int counter;
private boolean exceedsSize;
protected Clock clock;
private final CacheEvictionCollector cacheEvictionCollector;

public <M extends Metric> FieldDataCacheRca(final int rcaPeriod, final M fieldDataCacheEvictions,
final M fieldDataCacheSize, final M fieldDataCacheMaxSize) {
super(5);
this.rcaPeriod = rcaPeriod;
this.fieldDataCacheEvictions = fieldDataCacheEvictions;
this.fieldDataCacheSize = fieldDataCacheSize;
this.fieldDataCacheMaxSize = fieldDataCacheMaxSize;
this.counter = 0;
this.exceedsSize = false;
this.clock = Clock.systemUTC();
this.cacheEvictionCollector = new CacheEvictionCollector(FIELD_DATA_CACHE_EVICTION,
fieldDataCacheEvictions, EVICTION_THRESHOLD_TIME_PERIOD_IN_MILLISECOND);
}

@VisibleForTesting
public void setClock(Clock clock) {
this.clock = clock;
}

@Override
public ResourceFlowUnit<HotNodeSummary> operate() {
counter += 1;
long currTimestamp = clock.millis();

cacheEvictionCollector.collect(currTimestamp);
if (counter >= rcaPeriod) {
ResourceContext context;
HotNodeSummary nodeSummary;

ClusterDetailsEventProcessor.NodeDetails currentNode = ClusterDetailsEventProcessor.getCurrentNodeDetails();
double cacheSize = getTotalSizeInMB(fieldDataCacheSize);
double cacheMaxSize = getTotalSizeInMB(fieldDataCacheMaxSize);
exceedsSize = cacheMaxSize != 0 && cacheSize > cacheMaxSize;
if (cacheEvictionCollector.isUnhealthy(currTimestamp) && exceedsSize) {
Contributor:
Do we need the exceedsSize check for unhealthy? It's not clear to me why.

@rguo-aws (Contributor), Jul 10, 2020:
Agree, and I don't think we need exceedsSize here. exceedsSize will immediately jump to true when a scaling-down action is applied, so it is not a good indicator of cache healthiness.

Contributor (author):
The Field Data Cache eviction metric is present for cases other than size-based eviction, for example when the cache is invalidated via the clear() API. We can go with the proposed cacheMaxSize - cacheSize > threshold check instead of exceedsSize.

As for "exceedsSize will immediately jump to true when a scaling-down action is applied" -- we will have to handle that case separately in the decider anyway, since we will see evictions after the cache scales down.
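A minimal sketch of the proposed alternative, read literally from the comment above (the threshold name and value are hypothetical, not from this PR):

    // Hypothetical: require headroom between the max size and the current size,
    // per the author's "cacheMaxSize - cacheSize > threshold" proposal; the
    // unhealthy branch could then use !hasHeadroom in place of exceedsSize.
    double headroomThresholdInMB = 10.0; // illustrative value only
    boolean hasHeadroom = (cacheMaxSize - cacheSize) > headroomThresholdInMB;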

context = new ResourceContext(Resources.State.UNHEALTHY);
nodeSummary = new HotNodeSummary(currentNode.getId(), currentNode.getHostAddress());
nodeSummary.appendNestedSummary(cacheEvictionCollector.generateSummary(currTimestamp));
}
else {
context = new ResourceContext(Resources.State.HEALTHY);
nodeSummary = null;
}

counter = 0;
exceedsSize = false;
return new ResourceFlowUnit<>(currTimestamp, context, nodeSummary, !currentNode.getIsMasterNode());
}
else {
return new ResourceFlowUnit<>(currTimestamp);
}
}

@Override
public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) {
final List<FlowUnitMessage> flowUnitMessages =
args.getWireHopper().readFromWire(args.getNode());
List<ResourceFlowUnit<HotNodeSummary>> flowUnitList = new ArrayList<>();
LOG.debug("rca: Executing fromWire: {}", this.getClass().getSimpleName());
for (FlowUnitMessage flowUnitMessage : flowUnitMessages) {
flowUnitList.add(ResourceFlowUnit.buildFlowUnitFromWrapper(flowUnitMessage));
}
setFlowUnits(flowUnitList);
}

/**
 * A collector class that tracks eviction metrics and how long evictions have persisted.
 */
private static class CacheEvictionCollector {
private final Resource cache;
private final Metric cacheEvictionMetrics;
private boolean hasEvictions;
private long evictionTimestamp;
private long evictionTimePeriodThreshold;

private CacheEvictionCollector(final Resource cache, final Metric cacheEvictionMetrics,
final long threshold) {
this.cache = cache;
this.cacheEvictionMetrics = cacheEvictionMetrics;
this.hasEvictions = false;
this.evictionTimestamp = 0;
this.evictionTimePeriodThreshold = threshold;
}

public void collect(final long currTimestamp) {
for (MetricFlowUnit flowUnit : cacheEvictionMetrics.getFlowUnits()) {
if (flowUnit.isEmpty() || flowUnit.getData() == null) {
continue;
}

double evictionCount = flowUnit.getData().stream().mapToDouble(
Contributor:
flowUnit.getData().stream() -- Can each flowUnit here have multiple data points?

Contributor (author):
Yes, this is how flowUnit.getData() looks:

+------------+-------+-----+-----+-----+-----+
|IndexName   |ShardID|  sum|  avg|  min|  max|
+------------+-------+-----+-----+-----+-----+
|.kibana_1   |0      |  0.0|  0.0|  0.0|  0.0|
|osmgeoshapes|1      |  0.0|  0.0|  0.0|  0.0|
|osmgeoshapes|3      |  0.0|  0.0|  0.0|  0.0|
|sonested    |0      |243.0|243.0|243.0|243.0|
+------------+-------+-----+-----+-----+-----+

Contributor:
Got it, so we need to sum the stream because this is a shard-level metric?

record -> record.getValue(MetricsDB.MAX, Double.class)).sum();
if (!Double.isNaN(evictionCount)) {
if (evictionCount > 0) {
if (!hasEvictions) {
evictionTimestamp = currTimestamp;
}
hasEvictions = true;
}
else {
hasEvictions = false;
}
}
else {
LOG.error("Failed to parse metric from cache {}", cache.toString());
}
}
}

public boolean isUnhealthy(final long currTimestamp) {
return hasEvictions && (currTimestamp - evictionTimestamp) >= evictionTimePeriodThreshold;
}

private HotResourceSummary generateSummary(final long currTimestamp) {
HotResourceSummary resourceSummary = null;
if (isUnhealthy(currTimestamp)) {
resourceSummary = new HotResourceSummary(cache,
TimeUnit.MILLISECONDS.toSeconds(evictionTimePeriodThreshold),
TimeUnit.MILLISECONDS.toSeconds(currTimestamp - evictionTimestamp),
0);
}
return resourceSummary;
}
}
}
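
For context, a minimal sketch of how this RCA might be wired into an analysis graph (the metric class names, window lengths, and rcaPeriod below are illustrative assumptions, not taken from this PR):

    // Hypothetical wiring inside an AnalysisGraph construct() method; metric
    // class names and interval values are assumptions for illustration.
    Metric fieldDataCacheEvictions = new Cache_FieldData_Eviction(5);
    Metric fieldDataCacheSize = new Cache_FieldData_Size(5);
    Metric fieldDataCacheMaxSize = new Cache_Max_Size(5);

    FieldDataCacheRca fieldDataCacheRca = new FieldDataCacheRca(12,
        fieldDataCacheEvictions, fieldDataCacheSize, fieldDataCacheMaxSize);
    fieldDataCacheRca.addAllUpstreams(Arrays.asList(
        fieldDataCacheEvictions, fieldDataCacheSize, fieldDataCacheMaxSize));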