From 3ff2a9fa22062227213409c162629d020fb1195c Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 3 Aug 2021 11:42:40 +0100 Subject: [PATCH] Include extra snapshot details in logs/APIs (#75917) Today we do not expose the shard generations in the cluster state API, nor do we indicate which snapshots are being deleted in the cluster state update description; these data would have been useful in a recent debugging session. This commit adds these extra details, and also expands some comments on `ClusterState` describing how it's used as an `XContentFragment`. --- .../elasticsearch/cluster/ClusterState.java | 38 +++++++++++-------- .../cluster/SnapshotsInProgress.java | 15 ++++++++ .../cluster/metadata/Metadata.java | 9 +++++ .../snapshots/SnapshotsService.java | 7 ++-- ...SnapshotsInProgressSerializationTests.java | 16 ++++++-- 5 files changed, 62 insertions(+), 23 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/ClusterState.java b/server/src/main/java/org/elasticsearch/cluster/ClusterState.java index 755122a53f108..7ded08c537d24 100644 --- a/server/src/main/java/org/elasticsearch/cluster/ClusterState.java +++ b/server/src/main/java/org/elasticsearch/cluster/ClusterState.java @@ -58,24 +58,23 @@ /** * Represents the current state of the cluster. *

- * The cluster state object is immutable with the exception of the {@link RoutingNodes} structure, which is - * built on demand from the {@link RoutingTable}. - * The cluster state can be updated only on the master node. All updates are performed by on a - * single thread and controlled by the {@link ClusterService}. After every update the - * {@link Discovery#publish} method publishes a new version of the cluster state to all other nodes in the - * cluster. The actual publishing mechanism is delegated to the {@link Discovery#publish} method and depends on - * the type of discovery. + * The cluster state object is immutable with the exception of the {@link RoutingNodes} structure, which is built on demand from the {@link + * RoutingTable}. The cluster state can be updated only on the master node. All updates are performed on a single thread and controlled + * by the {@link ClusterService}. After every update the {@link Discovery#publish} method publishes a new version of the cluster state to + * all other nodes in the cluster. *

- * The cluster state implements the {@link Diffable} interface in order to support publishing of cluster state - * differences instead of the entire state on each change. The publishing mechanism should only send differences - * to a node if this node was present in the previous version of the cluster state. If a node was - * not present in the previous version of the cluster state, this node is unlikely to have the previous cluster - * state version and should be sent a complete version. In order to make sure that the differences are applied to the - * correct version of the cluster state, each cluster state version update generates {@link #stateUUID} that uniquely - * identifies this version of the state. This uuid is verified by the {@link ClusterStateDiff#apply} method to - * make sure that the correct diffs are applied. If uuids don’t match, the {@link ClusterStateDiff#apply} method - * throws the {@link IncompatibleClusterStateVersionException}, which causes the publishing mechanism to send + * Implements the {@link Diffable} interface in order to support publishing of cluster state differences instead of the entire state on each + * change. The publishing mechanism only sends differences to a node if this node was present in the previous version of the cluster state. + * If a node was not present in the previous version of the cluster state, this node is unlikely to have the previous cluster state version + * and should be sent a complete version. In order to make sure that the differences are applied to the correct version of the cluster + * state, each cluster state version update generates {@link #stateUUID} that uniquely identifies this version of the state. This uuid is + * verified by the {@link ClusterStateDiff#apply} method to make sure that the correct diffs are applied. 
If uuids don’t match, the {@link + * ClusterStateDiff#apply} method throws the {@link IncompatibleClusterStateVersionException}, which causes the publishing mechanism to send * a full version of the cluster state to the node on which this exception was thrown. + *

+ * Implements {@link ToXContentFragment} to be exposed in REST APIs (e.g. {@code GET _cluster/state} and {@code POST _cluster/reroute}) and + * to be indexed by monitoring, mostly just for diagnostics purposes. The XContent representation does not need to be 100% faithful since we + * never reconstruct a cluster state from its XContent representation, but the more faithful it is the more useful it is for diagnostics. */ public class ClusterState implements ToXContentFragment, Diffable { @@ -135,6 +134,13 @@ default boolean isPrivate() { return false; } + /** + * Serialize this {@link Custom} for diagnostic purposes, exposed by the

{@code GET _cluster/state}
API etc. The XContent + * representation does not need to be 100% faithful since we never reconstruct a cluster state from its XContent representation, but + * the more faithful it is the more useful it is for diagnostics. + */ + @Override + XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException; } private static final NamedDiffableValueSerializer CUSTOM_VALUE_SERIALIZER = new NamedDiffableValueSerializer<>(Custom.class); diff --git a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java index 581f70d689a7d..d1bf9da68ebc5 100644 --- a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java +++ b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java @@ -942,7 +942,22 @@ private void writeShardSnapshotStatus(XContentBuilder builder, ToXContent indexI builder.field("index", indexId); builder.field("shard", shardId); builder.field("state", status.state()); + builder.field("generation", status.generation()); builder.field("node", status.nodeId()); + + if (status.state() == ShardState.SUCCESS) { + final ShardSnapshotResult result = status.shardSnapshotResult(); + builder.startObject("result"); + builder.field("generation", result.getGeneration()); + builder.humanReadableField("size_in_bytes", "size", result.getSize()); + builder.field("segments", result.getSegmentCount()); + builder.endObject(); + } + + if (status.reason() != null) { + builder.field("reason", status.reason()); + } + builder.endObject(); } diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/Metadata.java b/server/src/main/java/org/elasticsearch/cluster/metadata/Metadata.java index 96c8c1bc1cdab..f4321a7a5aac8 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/Metadata.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/Metadata.java @@ -29,6 +29,7 @@ import 
org.elasticsearch.cluster.block.ClusterBlockLevel; import org.elasticsearch.cluster.coordination.CoordinationMetadata; import org.elasticsearch.common.xcontent.NamedObjectNotFoundException; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.common.xcontent.ToXContent; import org.elasticsearch.common.xcontent.ToXContentFragment; import org.elasticsearch.common.xcontent.XContentBuilder; @@ -78,6 +79,10 @@ import static org.elasticsearch.common.settings.Settings.readSettingsFromStream; import static org.elasticsearch.common.settings.Settings.writeSettingsToStream; +/** + * {@link Metadata} is the part of the {@link ClusterState} which persists across restarts. This persistence is XContent-based, so a + * round-trip through XContent must be faithful in {@link XContentContext#GATEWAY} context. + */ public class Metadata implements Iterable, Diffable, ToXContentFragment { private static final Logger logger = LogManager.getLogger(Metadata.class); @@ -120,6 +125,10 @@ public enum XContentContext { */ public static EnumSet ALL_CONTEXTS = EnumSet.allOf(XContentContext.class); + /** + * Custom metadata that persists (via XContent) across restarts. The deserialization method for each implementation must be registered + * with the {@link NamedXContentRegistry}. 
+ */ public interface Custom extends NamedDiffable, ToXContentFragment, ClusterState.FeatureAware { EnumSet context(); diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java index a35bf0c846516..4e181cf3a4cf4 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java +++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java @@ -2519,6 +2519,7 @@ public void deleteSnapshots(final DeleteSnapshotRequest request, final ActionLis ); final Repository repository = repositoriesService.repository(repoName); + final String taskDescription = "delete snapshot [" + repository + "]" + Arrays.toString(snapshotNames); repository.executeConsistentStateUpdate(repositoryData -> new ClusterStateUpdateTask(request.masterNodeTimeout()) { private Snapshot runningSnapshot; @@ -2645,7 +2646,7 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS listener.onResponse(null); } else { clusterService.submitStateUpdateTask( - "delete snapshot", + taskDescription, createDeleteStateUpdate(outstandingDeletes, repoName, repositoryData, Priority.IMMEDIATE, listener) ); } @@ -2655,7 +2656,7 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS addListener(runningSnapshot, ActionListener.wrap(result -> { logger.debug("deleted snapshot completed - deleting files"); clusterService.submitStateUpdateTask( - "delete snapshot", + taskDescription, createDeleteStateUpdate(outstandingDeletes, repoName, result.v1(), Priority.IMMEDIATE, listener) ); }, e -> { @@ -2671,7 +2672,7 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS } })); } - }, "delete snapshot", listener::onFailure); + }, taskDescription, listener::onFailure); } private static List matchingSnapshotIds( diff --git a/server/src/test/java/org/elasticsearch/snapshots/SnapshotsInProgressSerializationTests.java 
b/server/src/test/java/org/elasticsearch/snapshots/SnapshotsInProgressSerializationTests.java index ba1aa724455ab..3bc23b01f280a 100644 --- a/server/src/test/java/org/elasticsearch/snapshots/SnapshotsInProgressSerializationTests.java +++ b/server/src/test/java/org/elasticsearch/snapshots/SnapshotsInProgressSerializationTests.java @@ -375,9 +375,13 @@ public void testXContent() throws IOException { new ShardId("index", "uuid", 0), SnapshotsInProgress.ShardSnapshotStatus.success( "nodeId", - new ShardSnapshotResult("generation", new ByteSizeValue(1L), 1) + new ShardSnapshotResult("shardgen", new ByteSizeValue(1L), 1) ) ) + .fPut( + new ShardId("index", "uuid", 1), + new SnapshotsInProgress.ShardSnapshotStatus("nodeId", ShardState.FAILED, "failure-reason", "fail-gen") + ) .build(), null, null, @@ -398,9 +402,13 @@ public void testXContent() throws IOException { "{\"snapshots\":[{\"repository\":\"repo\",\"snapshot\":\"name\",\"uuid\":\"uuid\"," + "\"include_global_state\":true,\"partial\":true,\"state\":\"SUCCESS\"," + "\"indices\":[{\"name\":\"index\",\"id\":\"uuid\"}],\"start_time\":\"1970-01-01T00:20:34.567Z\"," - + "\"start_time_millis\":1234567,\"repository_state_id\":0," - + "\"shards\":[{\"index\":{\"index_name\":\"index\",\"index_uuid\":\"uuid\"}," - + "\"shard\":0,\"state\":\"SUCCESS\",\"node\":\"nodeId\"}],\"feature_states\":[],\"data_streams\":[]}]}" + + "\"start_time_millis\":1234567,\"repository_state_id\":0,\"shards\":[" + + "{\"index\":{\"index_name\":\"index\",\"index_uuid\":\"uuid\"},\"shard\":0,\"state\":\"SUCCESS\"," + + "\"generation\":\"shardgen\",\"node\":\"nodeId\"," + + "\"result\":{\"generation\":\"shardgen\",\"size\":\"1b\",\"size_in_bytes\":1,\"segments\":1}}," + + "{\"index\":{\"index_name\":\"index\",\"index_uuid\":\"uuid\"},\"shard\":1,\"state\":\"FAILED\"," + + "\"generation\":\"fail-gen\",\"node\":\"nodeId\",\"reason\":\"failure-reason\"}" + + "],\"feature_states\":[],\"data_streams\":[]}]}" ) ); }