Skip to content
This repository has been archived by the owner on Aug 2, 2022. It is now read-only.

Add integ test for queue rejection cluster RCA #370

Merged
merged 6 commits into from
Aug 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Metric;

public class ThreadPool_QueueCapacity extends Metric {
public static final String NAME = AllMetrics.ThreadPoolValue.THREADPOOL_QUEUE_CAPACITY.toString();

public ThreadPool_QueueCapacity() {
super(AllMetrics.ThreadPoolValue.THREADPOOL_QUEUE_CAPACITY.toString(), 5);
super(NAME, 5);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Metric;

public class ThreadPool_RejectedReqs extends Metric {
public static final String NAME = AllMetrics.ThreadPoolValue.THREADPOOL_REJECTED_REQS.toString();

public ThreadPool_RejectedReqs(long evaluationIntervalSeconds) {
super(AllMetrics.ThreadPoolValue.THREADPOOL_REJECTED_REQS.toString(), evaluationIntervalSeconds);
super(NAME, evaluationIntervalSeconds);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ public class Consts {
public static final String RCAIT_DEFAULT_RCA_CONF_STANDBY_MASTER_NODE = TEST_RESOURCES_DIR + "rca_master.conf";
public static final String RCAIT_DEFAULT_RCA_CONF_DATA_NODE = TEST_RESOURCES_DIR + "rca.conf";

public static final String INTEG_TESTS_SRC_DIR =
"./src/test/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/integTests/";

public static final String HOST_ID_KEY = "hostId";
public static final String HOST_ROLE_KEY = "hostRole";
public static final String DATA_KEY = "data";
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.tests.queue_tuning;

import com.amazon.opendistro.elasticsearch.performanceanalyzer.metrics.AllMetrics.ThreadPoolDimension;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.metrics.AllMetrics.ThreadPoolType;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.ThreadPool_QueueCapacity;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.ThreadPool_RejectedReqs;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.RcaItMarker;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.annotations.AClusterType;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.annotations.AErrorPatternIgnored;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.annotations.AExpect;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.annotations.AMetric;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.annotations.ARcaConf;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.annotations.ARcaGraph;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.annotations.ATable;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.annotations.ATuple;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.configs.ClusterType;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.configs.Consts;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.configs.HostTag;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.runners.RcaItNotEncryptedRunner;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.tests.queue_tuning.validator.QueueRejectionValidator;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.ElasticSearchAnalysisGraph;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.cluster.QueueRejectionClusterRca;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;

@RunWith(RcaItNotEncryptedRunner.class)

@Category(RcaItMarker.class)
@AClusterType(ClusterType.MULTI_NODE_CO_LOCATED_MASTER)
@ARcaGraph(ElasticSearchAnalysisGraph.class)
//specify a custom rca.conf to set the rejection-time-period-in-seconds to 5s to reduce runtime
@ARcaConf(dataNode = RcaItQueueTuning.QUEUE_TUNING_RESOURCES_DIR + "rca.conf")
@AMetric(name = ThreadPool_RejectedReqs.class,
dimensionNames = {ThreadPoolDimension.Constants.TYPE_VALUE},
tables = {
@ATable(hostTag = HostTag.DATA_0,
tuple = {
@ATuple(dimensionValues = {ThreadPoolType.Constants.WRITE_NAME},
sum = 1.0, avg = 1.0, min = 1.0, max = 1.0),
@ATuple(dimensionValues = {ThreadPoolType.Constants.SEARCH_NAME},
sum = 0.0, avg = 0.0, min = 0.0, max = 0.0)
}
),
@ATable(hostTag = {HostTag.ELECTED_MASTER},
tuple = {
@ATuple(dimensionValues = {ThreadPoolType.Constants.WRITE_NAME},
sum = 0.0, avg = 0.0, min = 0.0, max = 0.0),
@ATuple(dimensionValues = {ThreadPoolType.Constants.SEARCH_NAME},
sum = 0.0, avg = 0.0, min = 0.0, max = 0.0)
}
)
}
)

@AMetric(name = ThreadPool_QueueCapacity.class,
dimensionNames = {ThreadPoolDimension.Constants.TYPE_VALUE},
tables = {
@ATable(hostTag = HostTag.DATA_0,
tuple = {
@ATuple(dimensionValues = {ThreadPoolType.Constants.WRITE_NAME},
sum = 500, avg = 500, min = 500, max = 500),
@ATuple(dimensionValues = {ThreadPoolType.Constants.SEARCH_NAME},
sum = 1500, avg = 1500, min = 1500, max = 1500)
}
),
@ATable(hostTag = {HostTag.ELECTED_MASTER},
tuple = {
@ATuple(dimensionValues = {ThreadPoolType.Constants.WRITE_NAME},
sum = 0.0, avg = 0.0, min = 0.0, max = 0.0),
@ATuple(dimensionValues = {ThreadPoolType.Constants.SEARCH_NAME},
sum = 1500, avg = 1500, min = 1500, max = 1500)
}
)
}
)
public class RcaItQueueTuning {
public static final String QUEUE_TUNING_RESOURCES_DIR = Consts.INTEG_TESTS_SRC_DIR + "./tests/queue_tuning/resource/";

// This integ test is built to test queue rejection RCA + queue rejection cluster RCA
// This test injects queue rejection metrics on one of the data node and queries the
// rest API on master to check whether queue rejection cluster RCA becomes unhealthy
//TODO : extend this integ test to cover Decision Maker framework and queue remediation actions
@Test
yojs marked this conversation as resolved.
Show resolved Hide resolved
@AExpect(
what = AExpect.Type.REST_API,
on = HostTag.ELECTED_MASTER,
validator = QueueRejectionValidator.class,
forRca = QueueRejectionClusterRca.class,
timeoutSeconds = 500)
@AErrorPatternIgnored(pattern = "CacheUtil:getCacheMaxSize()",
reason = "Cache related configs are expected to be missing in this integ test")
@AErrorPatternIgnored(pattern = "AggregateMetric:gather()",
reason = "Cache metrics are expected to be missing in this integ test")
public void testQueueRejectionRca() {
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
{
"analysis-graph-implementor":
"com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.AnalysisGraphTest",
// it can be file:// or s3://
"rca-store-location": "s3://sifi-store/rcas/",

//it can be file:// or s3://
"threshold-store-location": "s3://sifi-store/thresholds/",

"new-rca-check-minutes": 60,

"new-threshold-check-minutes": 30,

// The size of the task queue for all networking operations.
// Small size queues may result in dropping of flow units, while large size queues can lead to a
// bigger backlog of tasks resulting in delays in sending and receiving.
"network-queue-length": 200,

// The size of the per-vertex buffer for flow units received from remote nodes.
// Small buffer sizes may result in dropping of flow units, while large buffer size can lead to
// high memory consumptions depending on how the analysis graph is configured.
"max-flow-units-per-vertex-buffer": 200,

"tags": {
"locus": "data-node",
"disk": "ssd",
"region": "use1",
"instance-type": "i3.8xl",
"domain": "rca-test-cluster"
},

"remote-peers": ["ip1", "ip2", "ip3"],

// Tells the runtime where the RCAs will be stored.
"datastore": {
// accepted types are sqlite, in-memory.
"type": "sqlite",
"location-dir": "/tmp",
"filename": "rca.sqlite",
"storage-file-retention-count": 5,
// How often the sqlite file be repeated in seconds. This file contains RCAs and therefore rotating it too frequently
// might not be as fruitful as there might not be any data.
"rotation-period-seconds": 21600
},

// Add config settings for different RCAs
"rca-config-settings": {
// old gen rca
"high-heap-usage-old-gen-rca": {
"top-k" : 3
},
//young gen rca
"high-heap-usage-young-gen-rca": {
"promotion-rate-mb-per-second" : 500,
"young-gen-gc-time-ms-per-second" : 400
},
"queue-rejection-rca": {
"rejection-time-period-in-seconds" : 5
},
//hot shard rca
"hot-shard-rca": {
"cpu-utilization" : 0.01,
"io-total-throughput-in-bytes" : 250000.0,
"io-total-syscallrate-per-second" : 0.1
}
},

"muted-rcas": [],
"muted-deciders": [],
"muted-actions": [],

"decider-config-settings": {
"cache-decider-config": {
"field-data-cache-upper-bound" : 0.4,
"shard-request-cache-upper-bound" : 0.05
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.tests.queue_tuning.validator;

import static com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary.SQL_SCHEMA_CONSTANTS.HOST_IP_ADDRESS_COL_NAME;
import static com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary.SQL_SCHEMA_CONSTANTS.NODE_ID_COL_NAME;
import static com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary.SQL_SCHEMA_CONSTANTS.RESOURCE_TYPE_COL_NAME;

import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotClusterSummary;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.framework.api.IValidator;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.tests.util.JsonParserUtil;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.cluster.QueueRejectionClusterRca;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import org.junit.Assert;

public class QueueRejectionValidator implements IValidator {
long startTime;

public QueueRejectionValidator() {
startTime = System.currentTimeMillis();
}

/**
* {"rca_name":"QueueRejectionClusterRca",
* "timestamp":1596557050522,
* "state":"unhealthy",
* "HotClusterSummary":[
* {"number_of_nodes":1,"number_of_unhealthy_nodes":1}
* ]}
*/
@Override
public boolean check(JsonElement response) {
JsonArray array = response.getAsJsonObject().get("data").getAsJsonArray();
if (array.size() == 0) {
return false;
}

for (int i = 0; i < array.size(); i++) {
JsonObject object = array.get(i).getAsJsonObject();
if (object.get("rca_name").getAsString().equals(QueueRejectionClusterRca.RCA_TABLE_NAME)) {
return checkClusterRca(object);
}
}
return false;
}

/**
* {"rca_name":"QueueRejectionClusterRca",
* "timestamp":1597167456322,
* "state":"unhealthy",
* "HotClusterSummary":[{"number_of_nodes":1,"number_of_unhealthy_nodes":1}]
* }
*/
boolean checkClusterRca(JsonObject rcaObject) {
if (!"unhealthy".equals(rcaObject.get("state").getAsString())) {
return false;
}
Assert.assertEquals(1,
JsonParserUtil.getSummaryJsonSize(rcaObject, HotClusterSummary.HOT_CLUSTER_SUMMARY_TABLE));
JsonObject clusterSummaryJson =
JsonParserUtil.getSummaryJson(rcaObject, HotClusterSummary.HOT_CLUSTER_SUMMARY_TABLE, 0);
Assert.assertNotNull(clusterSummaryJson);
Assert.assertEquals(1, clusterSummaryJson.get("number_of_unhealthy_nodes").getAsInt());

Assert.assertEquals(1,
JsonParserUtil.getSummaryJsonSize(clusterSummaryJson, HotNodeSummary.HOT_NODE_SUMMARY_TABLE));
JsonObject nodeSummaryJson =
JsonParserUtil.getSummaryJson(clusterSummaryJson, HotNodeSummary.HOT_NODE_SUMMARY_TABLE, 0);
Assert.assertNotNull(nodeSummaryJson);
Assert.assertEquals("DATA_0", nodeSummaryJson.get(NODE_ID_COL_NAME).getAsString());
Assert.assertEquals("127.0.0.1", nodeSummaryJson.get(HOST_IP_ADDRESS_COL_NAME).getAsString());

Assert.assertEquals(1,
JsonParserUtil.getSummaryJsonSize(nodeSummaryJson, HotResourceSummary.HOT_RESOURCE_SUMMARY_TABLE));
JsonObject resourceSummaryJson =
JsonParserUtil.getSummaryJson(nodeSummaryJson, HotResourceSummary.HOT_RESOURCE_SUMMARY_TABLE, 0);
Assert.assertNotNull(resourceSummaryJson);
Assert.assertEquals("write threadpool", resourceSummaryJson.get(RESOURCE_TYPE_COL_NAME).getAsString());
return true;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.integTests.tests.util;

import com.google.gson.JsonArray;
import com.google.gson.JsonObject;

public class JsonParserUtil {
public static int getSummaryJsonSize(JsonObject jsonObject, String summaryName) {
JsonArray array = jsonObject.get(summaryName).getAsJsonArray();
if (array == null) {
return 0;
}
return array.size();
}

public static JsonObject getSummaryJson(JsonObject jsonObject, String summaryName, int idx) {
JsonArray array = jsonObject.get(summaryName).getAsJsonArray();
if (array == null) {
return null;
}
if (idx >= array.size()) {
return null;
}
return array.get(idx).getAsJsonObject();
}
}