Skip to content
This repository has been archived by the owner on Aug 2, 2022. It is now read-only.

Publish fault detection metrics #470

Merged
merged 5 commits into from
Oct 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1056,6 +1056,68 @@ public static class Constants {
public static final String SHARD_OP_COUNT_VALUE = "ShardEvents";
}
}
/*
 * The FollowerCheck_Latency, LeaderCheck_Latency, FollowerCheck_Failure and
 * LeaderCheck_Failure tables all share the same column layout:
 *
 * SourceNodeID | TargetNodeID | sum | avg | min | max
 *
 * <p>Example row:
 * chMe07whRwGrOAqyLTP9vw|hgi7an4RwGrOAqyLTP9vw|1.0|0.2|0.0|1.0
 */

/**
 * Fault detection metrics: latency and failure of follower checks and leader checks.
 *
 * <p>The string form of each constant (see {@link #toString()}) is the metric name defined in
 * {@link Constants}.
 */
public enum FaultDetectionMetric implements MetricValue {
  FOLLOWER_CHECK_LATENCY(Constants.FOLLOWER_CHECK_LATENCY),
  LEADER_CHECK_LATENCY(Constants.LEADER_CHECK_LATENCY),
  FOLLOWER_CHECK_FAILURE(Constants.FOLLOWER_CHECK_FAILURE),
  LEADER_CHECK_FAILURE(Constants.LEADER_CHECK_FAILURE);

  /** Metric name reported by {@link #toString()}. */
  private final String metricName;

  FaultDetectionMetric(String metricName) {
    this.metricName = metricName;
  }

  /** Returns the metric name rather than the enum constant identifier. */
  @Override
  public String toString() {
    return this.metricName;
  }

  /** String literals backing the enum constants. */
  public static class Constants {
    public static final String FOLLOWER_CHECK_LATENCY = "FollowerCheck_Latency";
    public static final String LEADER_CHECK_LATENCY = "LeaderCheck_Latency";
    public static final String FOLLOWER_CHECK_FAILURE = "FollowerCheck_Failure";
    public static final String LEADER_CHECK_FAILURE = "LeaderCheck_Failure";
  }
}

/**
 * Dimensions attached to every fault detection metric: the node that issued the check and the
 * node that was checked.
 *
 * <p>The string form of each constant (see {@link #toString()}) is the dimension name defined in
 * {@link Constants}.
 */
public enum FaultDetectionDimension implements MetricDimension {
  SOURCE_NODE_ID(Constants.SOURCE_NODE_ID),
  TARGET_NODE_ID(Constants.TARGET_NODE_ID);

  /** Dimension name reported by {@link #toString()}. */
  private final String dimensionName;

  FaultDetectionDimension(String dimensionName) {
    this.dimensionName = dimensionName;
  }

  /** Returns the dimension name rather than the enum constant identifier. */
  @Override
  public String toString() {
    return this.dimensionName;
  }

  /** String literals backing the enum constants. */
  public static class Constants {
    public static final String SOURCE_NODE_ID = "SourceNodeID";
    public static final String TARGET_NODE_ID = "TargetNodeID";
  }
}

public enum CommonDimension implements MetricDimension {
INDEX_NAME(Constants.INDEX_NAME_VALUE),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ public class PerformanceAnalyzerMetrics {
public static final String sShardFetchPath = "shardfetch";
public static final String sShardQueryPath = "shardquery";
public static final String sMasterTaskPath = "master_task";
// First component of a fault detection event key, e.g. fault_detection/follower_check/76532/start.
public static final String sFaultDetection = "fault_detection";
public static final String sHttpPath = "http";
public static final String sOSPath = "os_metrics";
public static final String sHeapPath = "heap_metrics";
Expand All @@ -62,6 +63,9 @@ public class PerformanceAnalyzerMetrics {
public static final String MASTER_CURRENT = "current";
public static final String MASTER_META_DATA = "metadata";
public static final String METRIC_CURRENT_TIME = "current_time";
// Fault detection type for follower checks; appears as the second component of the event key.
public static final String FAULT_DETECTION_FOLLOWER_CHECK = "follower_check";
// Fault detection type for leader checks; presumably used in the same key position - TODO confirm.
public static final String FAULT_DETECTION_LEADER_CHECK = "leader_check";
// Key of the fault field in the value part of a fault detection finish event (e.g. "fault:0").
public static final String FAULT = "fault";
public static final int QUEUE_SIZE = PluginSettings.instance().getWriterQueueSize();

// TODO: Comeup with a more sensible number.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,26 @@ public class MetricsModel {
new MetricAttributes(
MetricUnits.MILLISECOND.toString(), AllMetrics.MasterMetricDimensions.values()));

// Fault Detection Metrics: check latencies are registered in milliseconds and check failures
// as counts; all four use the fault detection dimensions.
allMetricsInitializer.put(
AllMetrics.FaultDetectionMetric.FOLLOWER_CHECK_LATENCY.toString(),
new MetricAttributes(
MetricUnits.MILLISECOND.toString(), AllMetrics.FaultDetectionDimension.values()));

allMetricsInitializer.put(
AllMetrics.FaultDetectionMetric.LEADER_CHECK_LATENCY.toString(),
new MetricAttributes(
MetricUnits.MILLISECOND.toString(), AllMetrics.FaultDetectionDimension.values()));

allMetricsInitializer.put(
AllMetrics.FaultDetectionMetric.FOLLOWER_CHECK_FAILURE.toString(),
new MetricAttributes(
MetricUnits.COUNT.toString(), AllMetrics.FaultDetectionDimension.values()));

allMetricsInitializer.put(
AllMetrics.FaultDetectionMetric.LEADER_CHECK_FAILURE.toString(),
new MetricAttributes(
MetricUnits.COUNT.toString(), AllMetrics.FaultDetectionDimension.values()));

// Master Throttling Metrics
allMetricsInitializer.put(
AllMetrics.MasterThrottlingValue.MASTER_THROTTLED_PENDING_TASK_COUNT.toString(),
new MetricAttributes(MetricUnits.COUNT.toString(), EmptyDimension.values()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ public enum ExceptionsAndErrors implements MeasurementSet {

SHARD_STATE_COLLECTOR_ERROR("ShardStateCollectorError"),

MASTER_THROTTLING_COLLECTOR_ERROR("MasterThrottlingMetricsCollector");
MASTER_THROTTLING_COLLECTOR_ERROR("MasterThrottlingMetricsCollector"),

FAULT_DETECTION_COLLECTOR_ERROR("FaultDetectionMetricsCollector");

/** What we want to appear as the metric name. */
private String name;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,10 @@ public enum ReaderMetrics implements MeasurementSet {
* Amount of time taken to emit Master throttling metrics.
*/
MASTER_THROTTLING_EMITTER_EXECUTION_TIME("MasterThrottlingEmitterExecutionTime", "millis",
Arrays.asList(Statistics.MAX, Statistics.MIN, Statistics.MEAN, Statistics.COUNT, Statistics.SUM));
Arrays.asList(Statistics.MAX, Statistics.MIN, Statistics.MEAN, Statistics.COUNT, Statistics.SUM)),

FAULT_DETECTION_METRICS_EMITTER_EXECUTION_TIME("FaultDetectionMetricsEmitterExecutionTime", "millis",
Arrays.asList(Statistics.MAX, Statistics.MIN, Statistics.MEAN, Statistics.COUNT, Statistics.SUM));
/** What we want to appear as the metric name. */
private String name;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ public enum WriterMetrics implements MeasurementSet {
Statistics.MAX, Statistics.MIN, Statistics.MEAN, Statistics.COUNT, Statistics.SUM)),

MASTER_THROTTLING_COLLECTOR_NOT_AVAILABLE("MasterThrottlingCollectorNotAvailable", "count", Arrays.asList(
Statistics.MAX, Statistics.MIN, Statistics.MEAN, Statistics.COUNT, Statistics.SUM));
Statistics.MAX, Statistics.MIN, Statistics.MEAN, Statistics.COUNT, Statistics.SUM)),

FAULT_DETECTION_COLLECTOR_EXECUTION_TIME("FaultDetectionCollectorExecutionTime", "millis", Arrays.asList(
Statistics.MAX, Statistics.MIN, Statistics.MEAN, Statistics.COUNT, Statistics.SUM));

/** What we want to appear as the metric name. */
private String name;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
/*
* Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package com.amazon.opendistro.elasticsearch.performanceanalyzer.reader;
khushbr marked this conversation as resolved.
Show resolved Hide resolved

import com.amazon.opendistro.elasticsearch.performanceanalyzer.collectors.StatExceptionCode;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.collectors.StatsCollector;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.metrics.AllMetrics.CommonMetric;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.metrics.AllMetrics.FaultDetectionDimension;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.metrics.PerformanceAnalyzerMetrics;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.reader_writer_shared.Event;
import java.io.File;
import java.sql.Connection;
import java.util.Map;
import java.util.NavigableMap;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jooq.BatchBindStep;

public class FaultDetectionMetricsProcessor implements EventProcessor {
private static final Logger LOG = LogManager.getLogger(FaultDetectionMetricsProcessor.class);
private FaultDetectionMetricsSnapshot faultDetectionMetricsSnapshot;
private long startTime;
private long endTime;
private BatchBindStep handle;

public FaultDetectionMetricsProcessor(FaultDetectionMetricsSnapshot faultDetectionMetricsSnapshot) {
this.faultDetectionMetricsSnapshot = faultDetectionMetricsSnapshot;
}

static FaultDetectionMetricsProcessor buildFaultDetectionMetricsProcessor(
long currWindowStartTime,
Connection conn,
NavigableMap<Long, FaultDetectionMetricsSnapshot>
faultDetectionMetricsMap) {

if (faultDetectionMetricsMap.get(currWindowStartTime) == null) {
FaultDetectionMetricsSnapshot faultDetectionMetricsSnapshot =
new FaultDetectionMetricsSnapshot(conn, currWindowStartTime);
Map.Entry<Long, FaultDetectionMetricsSnapshot> entry = faultDetectionMetricsMap.lastEntry();
if (entry != null) {
faultDetectionMetricsSnapshot.rolloverInFlightRequests(entry.getValue());
}
faultDetectionMetricsMap.put(currWindowStartTime, faultDetectionMetricsSnapshot);
return new FaultDetectionMetricsProcessor(faultDetectionMetricsSnapshot);
} else {
return new FaultDetectionMetricsProcessor(faultDetectionMetricsMap.get(currWindowStartTime));
}
}

@Override
public void initializeProcessing(long startTime, long endTime) {
this.startTime = startTime;
this.endTime = endTime;
this.handle = faultDetectionMetricsSnapshot.startBatchPut();
}

@Override
public void finalizeProcessing() {
if (handle.size() > 0) {
handle.execute();
}
LOG.debug("Final Fault Detection request metrics {}", faultDetectionMetricsSnapshot.fetchAll());
}

@Override
public void processEvent(Event event) {
String[] keyItems = event.key.split(File.separatorChar == '\\' ? "\\\\" : File.separator);
assert keyItems.length == 4;
if (keyItems[0].equals(PerformanceAnalyzerMetrics.sFaultDetection)) {
if (keyItems[3].equals(PerformanceAnalyzerMetrics.START_FILE_NAME)) {
emitStartMetric(event, keyItems);
} else if (keyItems[3].equals(PerformanceAnalyzerMetrics.FINISH_FILE_NAME)) {
Comment on lines +83 to +86
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume the keyItems or event.key will always have the expected values an index 0 and 3 here ?
What happens if keyItems[3] throws an ArrayIndexOutOfBounds Exception?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That shouldn't be the case. If this happens somehow, it will have an ArrayIndexOutOfBoundsException. Can add assert here which can check the size of keyItems.

emitFinishMetric(event, keyItems);
}
}
}

@Override
public boolean shouldProcessEvent(Event event) {
return event.key.contains(PerformanceAnalyzerMetrics.sFaultDetection);
}

@Override
public void commitBatchIfRequired() {
if (handle.size() > BATCH_LIMIT) {
handle.execute();
handle = faultDetectionMetricsSnapshot.startBatchPut();
}
}

/**
* A keyItem is of the form : [fault_detection, follower_check, 76532, start]
* Example value part of the entry is:
* current_time:1566413979979
* StartTime:1566413987986
* SourceNodeID:g52i9a93a762cd59dda8d3379b09a752a
* TargetNodeID:b2a5a93a762cd59dda8d3379b09a752a
* $
* @param entry fault detection event.
* @param keyItems keys extracted from metrics path
*/
private void emitStartMetric(Event entry, String[] keyItems) {
Map<String, String> keyValueMap = ReaderMetricsProcessor.extractEntryData(entry.value);

String sourceNodeId = keyValueMap.get(FaultDetectionDimension.SOURCE_NODE_ID.toString());
String targetNodeId = keyValueMap.get(FaultDetectionDimension.TARGET_NODE_ID.toString());
String startTimeVal = keyValueMap.get(CommonMetric.START_TIME.toString());

try {
long st = Long.parseLong(startTimeVal);

String fault_detection_type = keyItems[1];
String rid = keyItems[2];
// A keyItem is of the form : [fault_detection, follower_check, 76543, start]
handle.bind(rid, sourceNodeId, targetNodeId, fault_detection_type, st, null, 0);
} catch (NumberFormatException e) {
LOG.error("Unable to parse string. StartTime:{}", startTimeVal);
StatsCollector.instance().logException(StatExceptionCode.READER_PARSER_ERROR);
throw e;
}
}

/**
* A keyItem is of the form : [fault_detection, follower_check, 76532, start]
* Example value part of the entry is:
* current_time:1566413979979
* FinishTime:1566413987986
* SourceNodeID:g52i9a93a762cd59dda8d3379b09a752a
* TargetNodeID:b2a5a93a762cd59dda8d3379b09a752a
* fault:0
* $
* @param entry fault detection event.
* @param keyItems keys extracted from metrics path
*/
private void emitFinishMetric(Event entry, String[] keyItems) {
Map<String, String> keyValueMap = ReaderMetricsProcessor.extractEntryData(entry.value);

String sourceNodeId = keyValueMap.get(FaultDetectionDimension.SOURCE_NODE_ID.toString());
String targetNodeId = keyValueMap.get(FaultDetectionDimension.TARGET_NODE_ID.toString());
String finishTimeVal = keyValueMap.get(CommonMetric.FINISH_TIME.toString());
String faultString = keyValueMap.get(PerformanceAnalyzerMetrics.FAULT);

try {
long et = Long.parseLong(finishTimeVal);
int fault = Integer.parseInt(faultString);

String fault_detection_type = keyItems[1];
String rid = keyItems[2];
// A keyItem is of the form : [fault_detection, follower_check, 76543, finish]
handle.bind(rid, sourceNodeId, targetNodeId, fault_detection_type, null, et, fault);
} catch (NumberFormatException e) {
LOG.error("Unable to parse string. StartTime:{}, Error:{}", finishTimeVal, faultString);
StatsCollector.instance().logException(StatExceptionCode.READER_PARSER_ERROR);
throw e;
}
}
}
Loading