Fix Snapshot Corruption in Edge Case (#47552) (#47636)
This fixes a failure to mark shard snapshots as failed when
multiple data nodes are lost during the snapshot process, or when
shard snapshot failures have occurred before a node left the cluster.

The problem was that we were simply not adding any shard entries for completed
shards on node-left events. This has no effect for a successful shard, but for a
failed shard it would lead to that shard not being marked as failed during
snapshot finalization. Fixed by correctly keeping track of all previously
completed shard states in this case as well.
Also, added an assertion that, without this fix, would trip on almost every run of the
resiliency tests, and adjusted the serialization of SnapshotsInProgress.Entry so that
we have a proper assertion message.

Closes #47550
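
To make the fix concrete, here is a minimal sketch of the node-left handling after the change. It uses simplified, hypothetical types and names (not the actual SnapshotsService code): shard entries that are already completed, including failed ones, are carried over into the rebuilt status map instead of being dropped, and the new assertion checks that the shard count does not change.

```java
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

final class NodeLeftSketch {

    enum State {
        INIT, STARTED, SUCCESS, FAILED;

        boolean completed() {
            return this == SUCCESS || this == FAILED;
        }
    }

    // Simplified stand-in for SnapshotsInProgress.ShardSnapshotStatus.
    record ShardStatus(String nodeId, State state, String reason) {}

    // Hypothetical helper mirroring the logic of the fix, not the real API.
    static Map<String, ShardStatus> rebuildShardsAfterNodeLeft(Map<String, ShardStatus> current,
                                                               Set<String> liveNodes) {
        Map<String, ShardStatus> rebuilt = new HashMap<>();
        for (Map.Entry<String, ShardStatus> entry : current.entrySet()) {
            ShardStatus status = entry.getValue();
            if (!status.state().completed() && status.nodeId() != null) {
                if (liveNodes.contains(status.nodeId())) {
                    // Shard snapshot is still running on a live node: keep it as-is.
                    rebuilt.put(entry.getKey(), status);
                } else {
                    // Owning node left the cluster: mark the running shard as failed.
                    rebuilt.put(entry.getKey(),
                        new ShardStatus(status.nodeId(), State.FAILED, "node shutdown"));
                }
            } else {
                // The fix: completed shards (SUCCESS *and* FAILED) are kept as well,
                // so a failed shard is still reported during snapshot finalization.
                rebuilt.put(entry.getKey(), status);
            }
        }
        // Mirrors the new assertion: rebuilding must not change the shard count.
        assert rebuilt.size() == current.size();
        return rebuilt;
    }
}
```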
original-brownbear authored Oct 7, 2019
1 parent e0c5fa3 commit 29c5391
Showing 4 changed files with 120 additions and 21 deletions.
@@ -121,22 +121,24 @@ protected void masterOperation(final SnapshotsStatusRequest request,
new TransportNodesSnapshotsStatus.Request(nodesIds.toArray(new String[nodesIds.size()]))
.snapshots(snapshots).timeout(request.masterNodeTimeout());
transportNodesSnapshotsStatus.execute(nodesRequest, new ActionListener<TransportNodesSnapshotsStatus.NodesSnapshotStatus>() {
@Override
public void onResponse(TransportNodesSnapshotsStatus.NodesSnapshotStatus nodeSnapshotStatuses) {
try {
List<SnapshotsInProgress.Entry> currentSnapshots =
snapshotsService.currentSnapshots(request.repository(), Arrays.asList(request.snapshots()));
listener.onResponse(buildResponse(request, currentSnapshots, nodeSnapshotStatuses));
} catch (Exception e) {
listener.onFailure(e);
}
}

@Override
public void onFailure(Exception e) {
@Override
public void onResponse(TransportNodesSnapshotsStatus.NodesSnapshotStatus nodeSnapshotStatuses) {
threadPool.generic().execute(() -> {
try {
List<SnapshotsInProgress.Entry> currentSnapshots =
snapshotsService.currentSnapshots(request.repository(), Arrays.asList(request.snapshots()));
listener.onResponse(buildResponse(request, currentSnapshots, nodeSnapshotStatuses));
} catch (Exception e) {
listener.onFailure(e);
}
});
}

@Override
public void onFailure(Exception e) {
listener.onFailure(e);
}
});
} else {
// We don't have any in-progress shards, just return current stats
listener.onResponse(buildResponse(request, currentSnapshots, null));
@@ -25,6 +25,7 @@
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterState.Custom;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.collect.ImmutableOpenMap;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -84,7 +85,7 @@ public String toString() {
return builder.append("]").toString();
}

public static class Entry {
public static class Entry implements ToXContent {
private final State state;
private final Snapshot snapshot;
private final boolean includeGlobalState;
@@ -211,7 +212,50 @@ public int hashCode() {

@Override
public String toString() {
return snapshot.toString();
return Strings.toString(this);
}

@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
builder.field(REPOSITORY, snapshot.getRepository());
builder.field(SNAPSHOT, snapshot.getSnapshotId().getName());
builder.field(UUID, snapshot.getSnapshotId().getUUID());
builder.field(INCLUDE_GLOBAL_STATE, includeGlobalState());
builder.field(PARTIAL, partial);
builder.field(STATE, state);
builder.startArray(INDICES);
{
for (IndexId index : indices) {
index.toXContent(builder, params);
}
}
builder.endArray();
builder.humanReadableField(START_TIME_MILLIS, START_TIME, new TimeValue(startTime));
builder.field(REPOSITORY_STATE_ID, repositoryStateId);
builder.startArray(SHARDS);
{
for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shardEntry : shards) {
ShardId shardId = shardEntry.key;
ShardSnapshotStatus status = shardEntry.value;
builder.startObject();
{
builder.field(INDEX, shardId.getIndex());
builder.field(SHARD, shardId.getId());
builder.field(STATE, status.state());
builder.field(NODE, status.nodeId());
}
builder.endObject();
}
}
builder.endArray();
builder.endObject();
return builder;
}

@Override
public boolean isFragment() {
return false;
}

// package private for testing
@@ -527,7 +571,7 @@ public void writeTo(StreamOutput out) throws IOException {
public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) throws IOException {
builder.startArray(SNAPSHOTS);
for (Entry entry : entries) {
toXContent(entry, builder, params);
entry.toXContent(builder, params);
}
builder.endArray();
return builder;
@@ -765,18 +765,20 @@ public ClusterState execute(ClusterState currentState) {
ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> shards = ImmutableOpenMap.builder();
boolean snapshotChanged = false;
for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shardEntry : snapshot.shards()) {
ShardSnapshotStatus shardStatus = shardEntry.value;
final ShardSnapshotStatus shardStatus = shardEntry.value;
final ShardId shardId = shardEntry.key;
if (!shardStatus.state().completed() && shardStatus.nodeId() != null) {
if (nodes.nodeExists(shardStatus.nodeId())) {
shards.put(shardEntry.key, shardEntry.value);
shards.put(shardId, shardStatus);
} else {
// TODO: Restart snapshot on another node?
snapshotChanged = true;
logger.warn("failing snapshot of shard [{}] on closed node [{}]",
shardEntry.key, shardStatus.nodeId());
shards.put(shardEntry.key,
new ShardSnapshotStatus(shardStatus.nodeId(), State.FAILED, "node shutdown"));
shardId, shardStatus.nodeId());
shards.put(shardId, new ShardSnapshotStatus(shardStatus.nodeId(), State.FAILED, "node shutdown"));
}
} else {
shards.put(shardId, shardStatus);
}
}
if (snapshotChanged) {
@@ -808,6 +810,8 @@ public void onFailure(Exception e) {
}
}, updatedSnapshot.getRepositoryStateId(), false);
}
assert updatedSnapshot.shards().size() == snapshot.shards().size()
: "Shard count changed during snapshot status update from [" + snapshot + "] to [" + updatedSnapshot + "]";
}
if (changed) {
return ClusterState.builder(currentState)
@@ -1229,6 +1229,55 @@ public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {
}, 60L, TimeUnit.SECONDS);
}

public void testDataNodeRestartAfterShardSnapshotFailure() throws Exception {
logger.info("--> starting a master node and two data nodes");
internalCluster().startMasterOnlyNode();
final List<String> dataNodes = internalCluster().startDataOnlyNodes(2);
logger.info("--> creating repository");
assertAcked(client().admin().cluster().preparePutRepository("test-repo")
.setType("mock").setSettings(Settings.builder()
.put("location", randomRepoPath())
.put("compress", randomBoolean())
.put("chunk_size", randomIntBetween(100, 1000), ByteSizeUnit.BYTES)));
assertAcked(prepareCreate("test-idx", 0, Settings.builder()
.put("number_of_shards", 2).put("number_of_replicas", 0)));
ensureGreen();
logger.info("--> indexing some data");
final int numdocs = randomIntBetween(50, 100);
IndexRequestBuilder[] builders = new IndexRequestBuilder[numdocs];
for (int i = 0; i < builders.length; i++) {
builders[i] = client().prepareIndex("test-idx", "type1",
Integer.toString(i)).setSource("field1", "bar " + i);
}
indexRandom(true, builders);
flushAndRefresh();
blockAllDataNodes("test-repo");
logger.info("--> snapshot");
client(internalCluster().getMasterName()).admin().cluster()
.prepareCreateSnapshot("test-repo", "test-snap").setWaitForCompletion(false).setIndices("test-idx").get();
logger.info("--> restarting first data node, which should cause the primary shard on it to be failed");
internalCluster().restartNode(dataNodes.get(0), InternalTestCluster.EMPTY_CALLBACK);

logger.info("--> wait for shard snapshot of first primary to show as failed");
assertBusy(() -> assertThat(
client().admin().cluster().prepareSnapshotStatus("test-repo").setSnapshots("test-snap").get().getSnapshots()
.get(0).getShardsStats().getFailedShards(), is(1)), 60L, TimeUnit.SECONDS);

logger.info("--> restarting second data node, which should cause the primary shard on it to be failed");
internalCluster().restartNode(dataNodes.get(1), InternalTestCluster.EMPTY_CALLBACK);

// check that snapshot completes with both failed shards being accounted for in the snapshot result
assertBusy(() -> {
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
.prepareGetSnapshots("test-repo").setSnapshots("test-snap").setIgnoreUnavailable(true).get();
assertEquals(1, snapshotsStatusResponse.getSnapshots().size());
SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
assertTrue(snapshotInfo.state().toString(), snapshotInfo.state().completed());
assertThat(snapshotInfo.totalShards(), is(2));
assertThat(snapshotInfo.shardFailures(), hasSize(2));
}, 60L, TimeUnit.SECONDS);
}

private long calculateTotalFilesSize(List<Path> files) {
return files.stream().mapToLong(f -> {
try {
