Skip to content

Commit

Permalink
ILM fix the init step to actually be retryable (#52076) (#52375)
Browse files Browse the repository at this point in the history
We marked the `init` ILM step as retryable, but our test used `waitUntil`
without an assert, so we didn’t catch the fact that we were not actually
able to retry this step, as our ILM state didn’t contain any information
about the policy execution (as we were in the process of initialising
it).

This commit manually sets the current step to `init` when we’re moving
the ILM policy into the ERROR step (this enables us to successfully
move to the error step and later retry the step).

* ShrunkenIndexCheckStep: Use correct logger

(cherry picked from commit f78d4b3)
Signed-off-by: Andrei Dan <[email protected]>
  • Loading branch information
andreidan authored Feb 15, 2020
1 parent 0656a33 commit bd3a70d
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,22 +38,26 @@ public ClusterState performAction(Index index, ClusterState clusterState) {
return clusterState;
}

LifecycleExecutionState lifecycleState = LifecycleExecutionState
.fromIndexMetadata(indexMetaData);

if (lifecycleState.getLifecycleDate() != null) {
return clusterState;
}

IndexMetaData.Builder indexMetadataBuilder = IndexMetaData.builder(indexMetaData);
if (shouldParseIndexName(indexMetaData.getSettings())) {
long parsedOriginationDate = parseIndexNameAndExtractDate(index.getName());
indexMetadataBuilder.settingsVersion(indexMetaData.getSettingsVersion() + 1)
.settings(Settings.builder()
.put(indexMetaData.getSettings())
.put(LifecycleSettings.LIFECYCLE_ORIGINATION_DATE, parsedOriginationDate)
.build()
);
LifecycleExecutionState lifecycleState;
try {
lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
if (lifecycleState.getLifecycleDate() != null) {
return clusterState;
}

if (shouldParseIndexName(indexMetaData.getSettings())) {
long parsedOriginationDate = parseIndexNameAndExtractDate(index.getName());
indexMetadataBuilder.settingsVersion(indexMetaData.getSettingsVersion() + 1)
.settings(Settings.builder()
.put(indexMetaData.getSettings())
.put(LifecycleSettings.LIFECYCLE_ORIGINATION_DATE, parsedOriginationDate)
.build()
);
}
} catch (Exception e) {
String policy = indexMetaData.getSettings().get(LifecycleSettings.LIFECYCLE_NAME);
throw new InitializePolicyException(policy, index.getName(), e);
}

ClusterState.Builder newClusterStateBuilder = ClusterState.builder(clusterState);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.core.ilm;

import org.elasticsearch.ElasticsearchException;

import java.util.Locale;

/**
 * Signals that initialising an ILM policy for an index failed. The exception message names both the
 * policy and the index, and the triggering failure is handed to the superclass as the cause.
 */
public class InitializePolicyException extends ElasticsearchException {

    /** Message template; the placeholders are filled with the policy name and the index name, in that order. */
    private static final String MESSAGE_FORMAT = "unable to initialize policy [%s] for index [%s]";

    /**
     * Creates the exception for a failed policy initialisation.
     *
     * @param policy name of the policy that could not be initialised
     * @param index  name of the index the policy was being initialised for
     * @param cause  the underlying failure, forwarded to the superclass constructor
     */
    public InitializePolicyException(String policy, String index, Throwable cause) {
        super(describeFailure(policy, index), cause);
    }

    /** Renders the message with {@link Locale#ROOT} so the text does not depend on the default locale. */
    private static String describeFailure(String policyName, String indexName) {
        return String.format(Locale.ROOT, MESSAGE_FORMAT, policyName, indexName);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
*/
public class ShrunkenIndexCheckStep extends ClusterStateWaitStep {
public static final String NAME = "is-shrunken-index";
private static final Logger logger = LogManager.getLogger(InitializePolicyContextStep.class);
private static final Logger logger = LogManager.getLogger(ShrunkenIndexCheckStep.class);
private String shrunkIndexPrefix;

public ShrunkenIndexCheckStep(StepKey key, StepKey nextStepKey, String shrunkIndexPrefix) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@ public InitializePolicyContextStep mutateInstance(InitializePolicyContextStep in
StepKey nextKey = instance.getNextStepKey();

switch (between(0, 1)) {
case 0:
key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
break;
case 1:
nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
break;
default:
throw new AssertionError("Illegal randomisation branch");
case 0:
key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
break;
case 1:
nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
break;
default:
throw new AssertionError("Illegal randomisation branch");
}

return new InitializePolicyContextStep(key, nextKey);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1185,26 +1185,26 @@ public void testRolloverStepRetriesUntilRolledOverIndexIsDeleted() throws Except
// {@link org.elasticsearch.xpack.core.ilm.ErrorStep} in order to retry the failing step. As {@link #assertBusy}
// increases the wait time between calls exponentially, we might miss the window where the policy is on
// {@link WaitForRolloverReadyStep} and the move to `attempt-rollover` request will not be successful.
waitUntil(() -> {
assertTrue(waitUntil(() -> {
try {
return client().performRequest(moveToStepRequest).getStatusLine().getStatusCode() == 200;
} catch (IOException e) {
return false;
}
}, 30, TimeUnit.SECONDS);
}, 30, TimeUnit.SECONDS));

// Similar to above, using {@link #waitUntil} as we want to make sure the `attempt-rollover` step started failing and is being
// retried (which means ILM moves back and forth between the `attempt-rollover` step and the `error` step)
waitUntil(() -> {
assertTrue("ILM did not start retrying the attempt-rollover step", waitUntil(() -> {
try {
Map<String, Object> explainIndexResponse = explainIndex(index);
String step = (String) explainIndexResponse.get("step");
String failedStep = (String) explainIndexResponse.get("failed_step");
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
return step != null && step.equals("attempt-rollover") && retryCount != null && retryCount >= 1;
return failedStep != null && failedStep.equals("attempt-rollover") && retryCount != null && retryCount >= 1;
} catch (IOException e) {
return false;
}
}, 30, TimeUnit.SECONDS);
}, 30, TimeUnit.SECONDS));

deleteIndex(rolledIndex);

Expand Down Expand Up @@ -1246,16 +1246,17 @@ public void testUpdateRolloverLifecycleDateStepRetriesWhenRolloverInfoIsMissing(
"}");
client().performRequest(moveToStepRequest);

waitUntil(() -> {
assertTrue("ILM did not start retrying the update-rollover-lifecycle-date step", waitUntil(() -> {
try {
Map<String, Object> explainIndexResponse = explainIndex(index);
String step = (String) explainIndexResponse.get("step");
String failedStep = (String) explainIndexResponse.get("failed_step");
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
return step != null && step.equals(UpdateRolloverLifecycleDateStep.NAME) && retryCount != null && retryCount >= 1;
return failedStep != null && failedStep.equals(UpdateRolloverLifecycleDateStep.NAME) && retryCount != null
&& retryCount >= 1;
} catch (IOException e) {
return false;
}
});
}, 30, TimeUnit.SECONDS));

index(client(), index, "1", "foo", "bar");
Request refreshIndex = new Request("POST", "/" + index + "/_refresh");
Expand Down Expand Up @@ -1441,16 +1442,17 @@ public void testRetryableInitializationStep() throws Exception {
assertOK(client().performRequest(startReq));

// Wait until an error has occurred.
waitUntil(() -> {
assertTrue("ILM did not start retrying the init step", waitUntil(() -> {
try {
Map<String, Object> explainIndexResponse = explainIndex(index);
String step = (String) explainIndexResponse.get("step");
String failedStep = (String) explainIndexResponse.get("failed_step");
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
return step != null && step.equals(InitializePolicyContextStep.KEY.getAction()) && retryCount != null && retryCount >= 1;
return failedStep != null && failedStep.equals(InitializePolicyContextStep.KEY.getAction()) && retryCount != null
&& retryCount >= 1;
} catch (IOException e) {
return false;
}
}, 30, TimeUnit.SECONDS);
}, 30, TimeUnit.SECONDS));

// Turn origination date parsing back off
updateIndexSettings(index, Settings.builder()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.elasticsearch.xpack.core.ilm.ErrorStep;
import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata;
import org.elasticsearch.xpack.core.ilm.InitializePolicyContextStep;
import org.elasticsearch.xpack.core.ilm.InitializePolicyException;
import org.elasticsearch.xpack.core.ilm.LifecycleExecutionState;
import org.elasticsearch.xpack.core.ilm.LifecyclePolicyMetadata;
import org.elasticsearch.xpack.core.ilm.LifecycleSettings;
Expand Down Expand Up @@ -133,8 +134,16 @@ static ClusterState moveClusterStateToErrorStep(Index index, ClusterState cluste
ElasticsearchException.generateThrowableXContent(causeXContentBuilder, STACKTRACE_PARAMS, cause);
causeXContentBuilder.endObject();
LifecycleExecutionState currentState = LifecycleExecutionState.fromIndexMetadata(idxMeta);
Step.StepKey currentStep = Objects.requireNonNull(LifecycleExecutionState.getCurrentStepKey(currentState),
"unable to move to an error step where there is no current step, state: " + currentState);
Step.StepKey currentStep;
// if an error is encountered while initialising the policy the lifecycle execution state will not yet contain any step information
// as we haven't yet initialised the policy, so we'll manually set the current step to be the "initialize policy" step so we can
// record the error (and later retry the init policy step)
if (cause instanceof InitializePolicyException) {
currentStep = InitializePolicyContextStep.KEY;
} else {
currentStep = Objects.requireNonNull(LifecycleExecutionState.getCurrentStepKey(currentState),
"unable to move to an error step where there is no current step, state: " + currentState);
}
LifecycleExecutionState nextStepState = updateExecutionStateToStep(policyMetadata, currentState,
new Step.StepKey(currentStep.getPhase(), currentStep.getAction(), ErrorStep.NAME), nowSupplier, false);

Expand Down

0 comments on commit bd3a70d

Please sign in to comment.