Skip to content

Commit

Permalink
[Segment Replication] Fix testAllocationWithDisruption flakyness
Browse files Browse the repository at this point in the history
Signed-off-by: Suraj Singh <[email protected]>
  • Loading branch information
dreamer-89 committed Mar 26, 2023
1 parent 07565ad commit eb8405a
Showing 1 changed file with 21 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,17 @@ public void testSingleIndexShardAllocation() throws Exception {
}

/**
* Similar to testSingleIndexShardAllocation test but creates multiple indices, multiple node adding in and getting
* removed. The test asserts post each such event that primary shard distribution is balanced across single index.
* Similar to testSingleIndexShardAllocation test but creates multiple indices, multiple nodes adding in and getting
* removed. The test asserts post each such event that primary shard distribution is balanced for each index.
*/
@AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/6565")
public void testAllocationWithDisruption() throws Exception {
internalCluster().startClusterManagerOnlyNode();
final int maxReplicaCount = 2;
final int maxShardCount = 5;
final int nodeCount = randomIntBetween(maxReplicaCount + 1, 10);
// Create maxReplicaCount+2 number of nodes. maxReplicaCount for replica shards & 1 node for primary shard
// allocation. One extra node to ensure post failover, primary shards do not remain stuck on one node due to
// SameShardAllocationDecider preventing re-balancing
final int nodeCount = randomIntBetween(maxReplicaCount + 2, 10);
final int numberOfIndices = randomIntBetween(1, 10);

logger.info("--> Creating {} nodes", nodeCount);
Expand All @@ -184,13 +186,11 @@ public void testAllocationWithDisruption() throws Exception {
}
enablePreferPrimaryBalance();

int shardCount, replicaCount, totalShardCount = 0, totalReplicaCount = 0;
int shardCount, replicaCount;
ClusterState state;
for (int i = 0; i < numberOfIndices; i++) {
shardCount = randomIntBetween(1, maxShardCount);
totalShardCount += shardCount;
replicaCount = randomIntBetween(1, maxReplicaCount);
totalReplicaCount += replicaCount;
logger.info("--> Creating index test{} with primary {} and replica {}", i, shardCount, replicaCount);
createIndex("test" + i, shardCount, replicaCount, i % 2 == 0);
ensureGreen(TimeValue.timeValueSeconds(60));
Expand All @@ -212,13 +212,15 @@ public void testAllocationWithDisruption() throws Exception {
logger.info(ShardAllocations.printShardDistribution(state));
verifyPerIndexPrimaryBalance();

logger.info("--> Stop one third nodes");
for (int i = 0; i < nodeCount; i += 3) {
internalCluster().stopRandomNode(InternalTestCluster.nameFilter(nodeNames.get(i)));
int nodeCountToStop = additionalNodeCount;
while (nodeCountToStop > 0) {
internalCluster().stopRandomDataNode();
// give replica a chance to promote as primary before terminating node containing the replica
ensureGreen(TimeValue.timeValueSeconds(60));
nodeCountToStop--;
}
state = client().admin().cluster().prepareState().execute().actionGet().getState();
logger.info("--> Cluster state post nodes stop {}", state);
logger.info(ShardAllocations.printShardDistribution(state));
verifyPerIndexPrimaryBalance();
}
Expand All @@ -240,6 +242,15 @@ private void verifyPerIndexPrimaryBalance() throws Exception {
.filter(ShardRouting::primary)
.collect(Collectors.toList())
.size();
if (primaryCount > avgPrimaryShardsPerNode) {
logger.info(
"--> Primary shard balance assertion failure for index {} on node {} {} <= {}",
index.key,
node.node().getName(),
primaryCount,
avgPrimaryShardsPerNode
);
}
assertTrue(primaryCount <= avgPrimaryShardsPerNode);
}
}
Expand Down

0 comments on commit eb8405a

Please sign in to comment.