Enhance the DSL error store to capture more error info (elastic#101475)
This enhances the DSL error store to capture more information about each
error, namely: the timestamp when the error was first encountered, the
timestamp of its most recent occurrence, and the retry count.
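
The error store diff further down constructs these entries via
`new ErrorEntry(recordedTimestamp, newError, recordedTimestamp, 0)` and reads them through
`error()` and `retryCount()`, which suggests roughly the following shape. This is only a
sketch: the real class is `org.elasticsearch.action.datastreams.lifecycle.ErrorEntry`, and
the two timestamp field names here are assumptions.

    import java.util.function.LongSupplier;

    // Sketch of the shape implied by the diff below; timestamp field names are assumptions.
    public record ErrorEntry(long firstOccurrenceTimestamp, String error, long recordedTimestamp, int retryCount) {

        // Keeps the original error text and first-occurrence timestamp, refreshes the
        // last-recorded timestamp, and bumps the retry count (mirrors the call in recordError below).
        public static ErrorEntry incrementRetryCount(ErrorEntry existing, LongSupplier nowSupplier) {
            return new ErrorEntry(
                existing.firstOccurrenceTimestamp(),
                existing.error(),
                nowSupplier.getAsLong(),
                existing.retryCount() + 1
            );
        }
    }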

This also introduces a new setting,
`data_streams.lifecycle.signalling.error_retry_interval`, to control when
we emit an `error` log entry. It defaults to 10 retries: if the error for
an index remains the same for 10 consecutive DSL runs, we log a message at
the `error` level.
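
A hedged sketch of how such a setting could be declared. The constant name
`DATA_STREAM_SIGNALLING_ERROR_RETRY_INTERVAL_SETTING` and its registration appear in the
plugin diff below, but the minimum value and the `Dynamic`/`NodeScope` properties here are
assumptions; only the key and the default of 10 come from this commit message.

    import org.elasticsearch.common.settings.Setting;

    // Illustrative only: the real constant lives in DataStreamLifecycleService.
    class SignallingSettingSketch {
        static final Setting<Integer> DATA_STREAM_SIGNALLING_ERROR_RETRY_INTERVAL_SETTING = Setting.intSetting(
            "data_streams.lifecycle.signalling.error_retry_interval",
            10,   // default: log at `error` level once the same error persists for 10 DSL runs
            1,    // assumed minimum
            Setting.Property.Dynamic,
            Setting.Property.NodeScope
        );
    }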

This also exposes all the new error information as part of the
`_lifecycle/explain` API.
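
Grounded in the explain-API test change below, a consumer that previously received a plain
error string now gets the richer entry; a rough, hedged fragment (the helper class here is
hypothetical):

    import org.elasticsearch.action.datastreams.lifecycle.ErrorEntry;
    import org.elasticsearch.action.datastreams.lifecycle.ExplainIndexDataStreamLifecycle;

    // Hedged example of consuming the new error fields from one explain response entry.
    final class ExplainErrorSummary {

        // Returns a short description of the recorded error, or null when the index has none.
        static String describe(ExplainIndexDataStreamLifecycle explainIndex) {
            ErrorEntry error = explainIndex.getError();
            if (error == null) {
                return null;
            }
            // error() is the (truncated) exception text; retryCount() is how many consecutive
            // DSL runs have re-recorded the same error.
            return error.error() + " (retries: " + error.retryCount() + ")";
        }
    }
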
andreidan authored Oct 31, 2023
1 parent 93b69a9 commit 3a087e4
Showing 16 changed files with 365 additions and 59 deletions.

@@ -27,6 +27,7 @@
 import org.elasticsearch.action.datastreams.CreateDataStreamAction;
 import org.elasticsearch.action.datastreams.GetDataStreamAction;
 import org.elasticsearch.action.datastreams.ModifyDataStreamsAction;
+import org.elasticsearch.action.datastreams.lifecycle.ErrorEntry;
 import org.elasticsearch.action.datastreams.lifecycle.ExplainIndexDataStreamLifecycle;
 import org.elasticsearch.action.index.IndexRequest;
 import org.elasticsearch.cluster.metadata.ComposableIndexTemplate;
@@ -405,7 +406,7 @@ public void testErrorRecordingOnRollover() throws Exception {
 
         assertBusy(() -> {
             String writeIndexName = getBackingIndices(dataStreamName).get(1);
-            String writeIndexRolloverError = null;
+            ErrorEntry writeIndexRolloverError = null;
             Iterable<DataStreamLifecycleService> lifecycleServices = internalCluster().getInstances(DataStreamLifecycleService.class);
 
             for (DataStreamLifecycleService lifecycleService : lifecycleServices) {
@@ -416,7 +417,7 @@
             }
 
             assertThat(writeIndexRolloverError, is(notNullValue()));
-            assertThat(writeIndexRolloverError, containsString("maximum normal shards open"));
+            assertThat(writeIndexRolloverError.error(), containsString("maximum normal shards open"));
         });
 
         // let's reset the cluster max shards per node limit to allow rollover to proceed and check the error store is empty
@@ -497,7 +498,7 @@ public void testErrorRecordingOnRetention() throws Exception {
             String writeIndex = backingIndices.get(1).getName();
             assertThat(writeIndex, backingIndexEqualTo(dataStreamName, 2));
 
-            String recordedRetentionExecutionError = null;
+            ErrorEntry recordedRetentionExecutionError = null;
             Iterable<DataStreamLifecycleService> lifecycleServices = internalCluster().getInstances(DataStreamLifecycleService.class);
 
             for (DataStreamLifecycleService lifecycleService : lifecycleServices) {
@@ -508,7 +509,7 @@
             }
 
             assertThat(recordedRetentionExecutionError, is(notNullValue()));
-            assertThat(recordedRetentionExecutionError, containsString("blocked by: [FORBIDDEN/5/index read-only (api)"));
+            assertThat(recordedRetentionExecutionError.error(), containsString("blocked by: [FORBIDDEN/5/index read-only (api)"));
         });
 
         // let's mark the index as writeable and make sure it's deleted and the error store is empty

@@ -49,6 +49,7 @@
 import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.notNullValue;
 import static org.hamcrest.Matchers.nullValue;
@@ -252,7 +253,9 @@ public void testExplainLifecycleForIndicesWithErrors() throws Exception {
                 // index has not been rolled over yet
                 assertThat(explainIndex.getGenerationTime(System::currentTimeMillis), nullValue());
 
-                assertThat(explainIndex.getError(), containsString("maximum normal shards open"));
+                assertThat(explainIndex.getError(), notNullValue());
+                assertThat(explainIndex.getError().error(), containsString("maximum normal shards open"));
+                assertThat(explainIndex.getError().retryCount(), greaterThanOrEqualTo(1));
             }
         });
 

@@ -141,6 +141,7 @@ public List<Setting<?>> getSettings() {
         pluginSettings.add(DataStreamLifecycleService.DATA_STREAM_LIFECYCLE_POLL_INTERVAL_SETTING);
         pluginSettings.add(DataStreamLifecycleService.DATA_STREAM_MERGE_POLICY_TARGET_FLOOR_SEGMENT_SETTING);
         pluginSettings.add(DataStreamLifecycleService.DATA_STREAM_MERGE_POLICY_TARGET_FACTOR_SETTING);
+        pluginSettings.add(DataStreamLifecycleService.DATA_STREAM_SIGNALLING_ERROR_RETRY_INTERVAL_SETTING);
         return pluginSettings;
     }
 
@@ -155,7 +156,7 @@ public Collection<?> createComponents(PluginServices services) {
         );
         this.updateTimeSeriesRangeService.set(updateTimeSeriesRangeService);
         components.add(this.updateTimeSeriesRangeService.get());
-        errorStoreInitialisationService.set(new DataStreamLifecycleErrorStore());
+        errorStoreInitialisationService.set(new DataStreamLifecycleErrorStore(services.threadPool()::absoluteTimeInMillis));
         dataLifecycleInitialisationService.set(
             new DataStreamLifecycleService(
                 settings,

@@ -9,12 +9,14 @@
 package org.elasticsearch.datastreams.lifecycle;
 
 import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.action.datastreams.lifecycle.ErrorEntry;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.core.Nullable;
 
 import java.util.List;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
+import java.util.function.LongSupplier;
 
 import static org.elasticsearch.xcontent.ToXContent.EMPTY_PARAMS;
 
@@ -26,7 +28,12 @@
 public class DataStreamLifecycleErrorStore {
 
     public static final int MAX_ERROR_MESSAGE_LENGTH = 1000;
-    private final ConcurrentMap<String, String> indexNameToError = new ConcurrentHashMap<>();
+    private final ConcurrentMap<String, ErrorEntry> indexNameToError = new ConcurrentHashMap<>();
+    private final LongSupplier nowSupplier;
+
+    public DataStreamLifecycleErrorStore(LongSupplier nowSupplier) {
+        this.nowSupplier = nowSupplier;
+    }
 
     /**
      * Records a string representation of the provided exception for the provided index.
@@ -35,13 +42,24 @@ public class DataStreamLifecycleErrorStore {
      * Returns the previously recorded error for the provided index, or null otherwise.
      */
     @Nullable
-    public String recordError(String indexName, Exception e) {
+    public ErrorEntry recordError(String indexName, Exception e) {
         String exceptionToString = Strings.toString((builder, params) -> {
             ElasticsearchException.generateThrowableXContent(builder, EMPTY_PARAMS, e);
             return builder;
         });
-        String recordedError = Strings.substring(exceptionToString, 0, MAX_ERROR_MESSAGE_LENGTH);
-        return indexNameToError.put(indexName, recordedError);
+        String newError = Strings.substring(exceptionToString, 0, MAX_ERROR_MESSAGE_LENGTH);
+        ErrorEntry existingError = indexNameToError.get(indexName);
+        long recordedTimestamp = nowSupplier.getAsLong();
+        if (existingError == null) {
+            indexNameToError.put(indexName, new ErrorEntry(recordedTimestamp, newError, recordedTimestamp, 0));
+        } else {
+            if (existingError.error().equals(newError)) {
+                indexNameToError.put(indexName, ErrorEntry.incrementRetryCount(existingError, nowSupplier));
+            } else {
+                indexNameToError.put(indexName, new ErrorEntry(recordedTimestamp, newError, recordedTimestamp, 0));
+            }
+        }
+        return existingError;
     }
 
     /**
@@ -62,7 +80,7 @@ public void clearStore() {
      * Retrieves the recorded error for the provided index.
      */
     @Nullable
-    public String getError(String indexName) {
+    public ErrorEntry getError(String indexName) {
         return indexNameToError.get(indexName);
     }
 
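To make the new `recordError` semantics above concrete, a hedged usage sketch; the index
name and exception are purely illustrative, and the retry count of 1 follows from the
diff's logic of incrementing when an identical error is recorded again:

    import org.elasticsearch.datastreams.lifecycle.DataStreamLifecycleErrorStore;

    // Illustrative only: shows the behaviour implied by the recordError diff above.
    // Re-recording an identical error keeps the first-occurrence timestamp and increments
    // the retry count; a different error starts a fresh entry with retry count 0.
    public class ErrorStoreUsageSketch {
        public static void main(String[] args) {
            DataStreamLifecycleErrorStore store = new DataStreamLifecycleErrorStore(System::currentTimeMillis);

            // first DSL run fails for this (hypothetical) backing index
            store.recordError(".ds-my-stream-000001", new IllegalStateException("maximum normal shards open"));
            // next run fails with the same error -> the existing entry is kept, retry count becomes 1
            store.recordError(".ds-my-stream-000001", new IllegalStateException("maximum normal shards open"));

            System.out.println(store.getError(".ds-my-stream-000001").retryCount()); // prints 1
        }
    }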