Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add validation of the number_of_shards parameter in Shrink Action of ILM #74219

Merged
merged 14 commits into from
Aug 16, 2021
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -621,9 +621,6 @@ public void testRetryPolicy() throws Exception {
.put("index.lifecycle.name", "my_policy")
.build());
client.indices().create(createIndexRequest, RequestOptions.DEFAULT);
assertBusy(() -> assertNotNull(client.indexLifecycle()
.explainLifecycle(new ExplainLifecycleRequest("my_index"), RequestOptions.DEFAULT)
.getIndexResponses().get("my_index").getFailedStep()), 30, TimeUnit.SECONDS);
}

// tag::ilm-retry-lifecycle-policy-request
Expand All @@ -644,8 +641,8 @@ public void testRetryPolicy() throws Exception {

assertTrue(acknowledged);
} catch (ElasticsearchException e) {
// the retry API might fail as the shrink action steps are retryable (so if the retry API reaches ES when ILM is retrying the
// failed `shrink` step, the retry API will fail)
// the retry API might fail as the shrink action steps are retryable (ILM will stuck in the `check-target-shards-count` step
// with no failure, the retry API will fail)
// assert that's the exception we encountered (we want to test to fail if there is an actual error with the retry api)
assertThat(e.getMessage(), containsStringIgnoringCase("reason=cannot retry an action for an index [my_index] that has not " +
"encountered an error when running a Lifecycle Policy"));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
package org.elasticsearch.xpack.core.ilm;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.index.Index;
import org.elasticsearch.xpack.core.ilm.step.info.SingleMessageFieldInfo;

import java.util.Locale;

/**
* This step checks whether the new shrunken index's shards count is a factor of the source index's shards count.
*/
public class CheckTargetShardsCountStep extends ClusterStateWaitStep {

public static final String NAME = "check-target-shards-count";

private final Integer numberOfShards;

private static final Logger logger = LogManager.getLogger(CheckTargetShardsCountStep.class);

CheckTargetShardsCountStep(StepKey key, StepKey nextStepKey, Integer numberOfShards) {
super(key, nextStepKey);
this.numberOfShards = numberOfShards;
}

@Override
public boolean isRetryable() {
return true;
}

public Integer getNumberOfShards() {
return numberOfShards;
}

@Override
public Result isConditionMet(Index index, ClusterState clusterState) {
IndexMetadata indexMetadata = clusterState.metadata().index(index);
gaobinlong marked this conversation as resolved.
Show resolved Hide resolved
if (indexMetadata == null) {
// Index must have been since deleted, ignore it
logger.debug("[{}] lifecycle action for index [{}] executed but index no longer exists",
getKey().getAction(), index.getName());
return new Result(false, null);
}
String indexName = indexMetadata.getIndex().getName();
if (numberOfShards != null) {
int sourceNumberOfShards = indexMetadata.getNumberOfShards();
if (sourceNumberOfShards % numberOfShards != 0) {
String policyName = indexMetadata.getSettings().get(LifecycleSettings.LIFECYCLE_NAME);
String errorMessage = String.format(Locale.ROOT, "lifecycle action of policy [%s] for index [%s] cannot make progress " +
"because the target shards count [%d] must be a factor of the source index's shards count [%d]",
policyName, indexName, numberOfShards, sourceNumberOfShards);
logger.debug(errorMessage);
return new Result(false, new SingleMessageFieldInfo(errorMessage));
}
}

return new Result(true, null);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ public List<Step> toSteps(Client client, String phase, Step.StepKey nextStepKey)
StepKey checkNotWriteIndex = new StepKey(phase, NAME, CheckNotDataStreamWriteIndexStep.NAME);
StepKey waitForNoFollowerStepKey = new StepKey(phase, NAME, WaitForNoFollowersStep.NAME);
StepKey readOnlyKey = new StepKey(phase, NAME, ReadOnlyAction.NAME);
StepKey checkTargetShardsCountKey = new StepKey(phase, NAME, CheckTargetShardsCountStep.NAME);
StepKey cleanupShrinkIndexKey = new StepKey(phase, NAME, CleanupShrinkIndexStep.NAME);
StepKey generateShrinkIndexNameKey = new StepKey(phase, NAME, GenerateUniqueIndexNameStep.NAME);
StepKey setSingleNodeKey = new StepKey(phase, NAME, SetSingleNodeAllocateStep.NAME);
Expand Down Expand Up @@ -167,7 +168,9 @@ public List<Step> toSteps(Client client, String phase, Step.StepKey nextStepKey)
CheckNotDataStreamWriteIndexStep checkNotWriteIndexStep = new CheckNotDataStreamWriteIndexStep(checkNotWriteIndex,
waitForNoFollowerStepKey);
WaitForNoFollowersStep waitForNoFollowersStep = new WaitForNoFollowersStep(waitForNoFollowerStepKey, readOnlyKey, client);
ReadOnlyStep readOnlyStep = new ReadOnlyStep(readOnlyKey, cleanupShrinkIndexKey, client);
ReadOnlyStep readOnlyStep = new ReadOnlyStep(readOnlyKey, checkTargetShardsCountKey, client);
CheckTargetShardsCountStep checkTargetShardsCountStep = new CheckTargetShardsCountStep(checkTargetShardsCountKey,
cleanupShrinkIndexKey, numberOfShards);
// we generate a unique shrink index name but we also retry if the allocation of the shrunk index is not possible, so we want to
// delete the "previously generated" shrink index (this is a no-op if it's the first run of the action and he haven't generated a
// shrink index name)
Expand Down Expand Up @@ -211,9 +214,9 @@ public List<Step> toSteps(Client client, String phase, Step.StepKey nextStepKey)
DeleteStep deleteSourceIndexStep = new DeleteStep(deleteIndexKey, isShrunkIndexKey, client);
ShrunkenIndexCheckStep waitOnShrinkTakeover = new ShrunkenIndexCheckStep(isShrunkIndexKey, nextStepKey);
return Arrays.asList(conditionalSkipShrinkStep, checkNotWriteIndexStep, waitForNoFollowersStep, readOnlyStep,
cleanupShrinkIndexStep, generateUniqueIndexNameStep, setSingleNodeStep, checkShrinkReadyStep, shrink, allocated,
copyMetadata, isDataStreamBranchingStep, aliasSwapAndDelete, waitOnShrinkTakeover, replaceDataStreamBackingIndex,
deleteSourceIndexStep);
checkTargetShardsCountStep, cleanupShrinkIndexStep, generateUniqueIndexNameStep, setSingleNodeStep, checkShrinkReadyStep,
shrink, allocated, copyMetadata, isDataStreamBranchingStep, aliasSwapAndDelete, waitOnShrinkTakeover,
replaceDataStreamBackingIndex, deleteSourceIndexStep);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
package org.elasticsearch.xpack.core.ilm;

import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.xpack.core.ilm.Step.StepKey;
import org.elasticsearch.xpack.core.ilm.step.info.SingleMessageFieldInfo;

import static org.hamcrest.Matchers.is;

public class CheckTargetShardsCountStepTests extends AbstractStepTestCase<CheckTargetShardsCountStep> {

@Override
protected CheckTargetShardsCountStep createRandomInstance() {
return new CheckTargetShardsCountStep(randomStepKey(), randomStepKey(), null);
}

@Override
protected CheckTargetShardsCountStep mutateInstance(CheckTargetShardsCountStep instance) {
StepKey key = instance.getKey();
StepKey nextKey = instance.getNextStepKey();

switch (between(0, 1)) {
case 0:
key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
break;
case 1:
nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
break;
default:
throw new AssertionError("Illegal randomisation branch");
}

return new CheckTargetShardsCountStep(key, nextKey, null);
}

@Override
protected CheckTargetShardsCountStep copyInstance(CheckTargetShardsCountStep instance) {
return new CheckTargetShardsCountStep(instance.getKey(), instance.getNextStepKey(), instance.getNumberOfShards());
}

public void testStepCompleteIfTargetShardsCountIsValid() {
String policyName = "test-ilm-policy";
IndexMetadata indexMetadata =
IndexMetadata.builder(randomAlphaOfLength(10)).settings(settings(Version.CURRENT)
.put(LifecycleSettings.LIFECYCLE_NAME, policyName))
.numberOfShards(10).numberOfReplicas(randomIntBetween(0, 5)).build();

ClusterState clusterState = ClusterState.builder(emptyClusterState()).metadata(
Metadata.builder().put(indexMetadata, true).build()).build();

CheckTargetShardsCountStep checkTargetShardsCountStep = new CheckTargetShardsCountStep(randomStepKey(), randomStepKey(), 2);

ClusterStateWaitStep.Result result = checkTargetShardsCountStep.isConditionMet(indexMetadata.getIndex(), clusterState);
assertThat(result.isComplete(), is(true));
}

public void testStepIncompleteIfTargetShardsCountNotValid() {
String indexName = randomAlphaOfLength(10);
String policyName = "test-ilm-policy";
IndexMetadata indexMetadata =
IndexMetadata.builder(indexName).settings(settings(Version.CURRENT).put(LifecycleSettings.LIFECYCLE_NAME, policyName))
.numberOfShards(10).numberOfReplicas(randomIntBetween(0, 5)).build();

ClusterState clusterState = ClusterState.builder(emptyClusterState()).metadata(
Metadata.builder().put(indexMetadata, true).build()).build();

CheckTargetShardsCountStep checkTargetShardsCountStep = new CheckTargetShardsCountStep(randomStepKey(), randomStepKey(), 3);

ClusterStateWaitStep.Result result = checkTargetShardsCountStep.isConditionMet(indexMetadata.getIndex(), clusterState);
assertThat(result.isComplete(), is(false));
SingleMessageFieldInfo info = (SingleMessageFieldInfo) result.getInfomationContext();
assertThat(info.getMessage(), is("lifecycle action of policy [" + policyName + "] for index [" + indexName +
"] cannot make progress because the target shards count [3] must be a factor of the source index's shards count [10]"));
}
}
Loading