Skip to content

Commit

Permalink
Add ephemeral node id to shutdown metadata
Browse files Browse the repository at this point in the history
Shutdown metadata is keyed on node id. This makes sense since only one
node with a given node id can exist within a cluster. However, it is
possible that shutdown was initiated for one instance of a node, but
that node is restarted. This commit adds the ephemeral node id to
shutdown metadata so that nodes with the same id but different ephemeral
id can be distinguished.
  • Loading branch information
rjernst committed Dec 14, 2024
1 parent b456e16 commit 3409255
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ static TransportVersion def(int id) {
public static final TransportVersion KNN_QUERY_RESCORE_OVERSAMPLE = def(8_806_00_0);
public static final TransportVersion SEMANTIC_QUERY_LENIENT = def(8_807_00_0);
public static final TransportVersion ESQL_QUERY_BUILDER_IN_SEARCH_FUNCTIONS = def(8_808_00_0);
public static final TransportVersion NODE_SHUTDOWN_EPHEMERAL_ID_ADDED = def(8_809_00_0);

/*
* STOP! READ THIS FIRST! No, really,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import java.util.Locale;
import java.util.Objects;

import static org.elasticsearch.TransportVersions.NODE_SHUTDOWN_EPHEMERAL_ID_ADDED;
import static org.elasticsearch.core.Strings.format;

/**
Expand All @@ -40,6 +41,7 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
public static final TransportVersion GRACE_PERIOD_ADDED_VERSION = TransportVersions.V_8_9_X;

public static final ParseField NODE_ID_FIELD = new ParseField("node_id");
public static final ParseField EPHEMERAL_NODE_ID_FIELD = new ParseField("ephemeral_node_id");
public static final ParseField TYPE_FIELD = new ParseField("type");
public static final ParseField REASON_FIELD = new ParseField("reason");
public static final String STARTED_AT_READABLE_FIELD = "shutdown_started";
Expand All @@ -53,18 +55,20 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
"node_shutdown_info",
a -> new SingleNodeShutdownMetadata(
(String) a[0],
Type.valueOf((String) a[1]),
(String) a[2],
(long) a[3],
(boolean) a[4],
(TimeValue) a[5],
(String) a[6],
(TimeValue) a[7]
(String) a[1],
Type.valueOf((String) a[2]),
(String) a[3],
(long) a[4],
(boolean) a[5],
(TimeValue) a[6],
(String) a[7],
(TimeValue) a[8]
)
);

static {
PARSER.declareString(ConstructingObjectParser.constructorArg(), NODE_ID_FIELD);
PARSER.declareString(ConstructingObjectParser.constructorArg(), EPHEMERAL_NODE_ID_FIELD);
PARSER.declareString(ConstructingObjectParser.constructorArg(), TYPE_FIELD);
PARSER.declareString(ConstructingObjectParser.constructorArg(), REASON_FIELD);
PARSER.declareLong(ConstructingObjectParser.constructorArg(), STARTED_AT_MILLIS_FIELD);
Expand All @@ -91,6 +95,7 @@ public static SingleNodeShutdownMetadata parse(XContentParser parser) {
public static final TimeValue DEFAULT_RESTART_SHARD_ALLOCATION_DELAY = TimeValue.timeValueMinutes(5);

private final String nodeId;
private final String ephemeralNodeId;
private final Type type;
private final String reason;
private final long startedAtMillis;
Expand All @@ -110,6 +115,7 @@ public static SingleNodeShutdownMetadata parse(XContentParser parser) {
*/
private SingleNodeShutdownMetadata(
String nodeId,
String ephemeralNodeId,
Type type,
String reason,
long startedAtMillis,
Expand All @@ -119,6 +125,7 @@ private SingleNodeShutdownMetadata(
@Nullable TimeValue gracePeriod
) {
this.nodeId = Objects.requireNonNull(nodeId, "node ID must not be null");
this.ephemeralNodeId = Objects.requireNonNull(ephemeralNodeId, "ephemeral node ID must not be null");
this.type = Objects.requireNonNull(type, "shutdown type must not be null");
this.reason = Objects.requireNonNull(reason, "shutdown reason must not be null");
this.startedAtMillis = startedAtMillis;
Expand Down Expand Up @@ -157,6 +164,11 @@ private SingleNodeShutdownMetadata(

public SingleNodeShutdownMetadata(StreamInput in) throws IOException {
this.nodeId = in.readString();
if (in.getTransportVersion().onOrAfter(NODE_SHUTDOWN_EPHEMERAL_ID_ADDED)) {
this.ephemeralNodeId = in.readString();
} else {
this.ephemeralNodeId = ""; // empty when talking to old nodes, meaning the persistent node id is the only differentiator
}
this.type = in.readEnum(Type.class);
this.reason = in.readString();
this.startedAtMillis = in.readVLong();
Expand All @@ -181,6 +193,13 @@ public String getNodeId() {
return nodeId;
}

/**
 * Returns the ephemeral node ID recorded in this {@link SingleNodeShutdownMetadata}.
 * <p>
 * The value is the empty string when this instance was deserialized from a stream whose
 * transport version predates {@code NODE_SHUTDOWN_EPHEMERAL_ID_ADDED}; in that case only the
 * persistent node ID is available to identify the node.
 *
 * @return the ephemeral node ID held by this metadata
 */
public String getEphemeralNodeId() {
    return this.ephemeralNodeId;
}

/**
* @return The type of shutdown this is (shutdown vs. permanent).
*/
Expand Down Expand Up @@ -241,6 +260,9 @@ public TimeValue getGracePeriod() {
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(nodeId);
if (out.getTransportVersion().onOrAfter(NODE_SHUTDOWN_EPHEMERAL_ID_ADDED)) {
out.writeString(ephemeralNodeId);
}
if ((out.getTransportVersion().before(REPLACE_SHUTDOWN_TYPE_ADDED_VERSION) && this.type == SingleNodeShutdownMetadata.Type.REPLACE)
|| (out.getTransportVersion().before(SIGTERM_ADDED_VERSION) && this.type == Type.SIGTERM)) {
out.writeEnum(SingleNodeShutdownMetadata.Type.REMOVE);
Expand All @@ -264,6 +286,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
builder.startObject();
{
builder.field(NODE_ID_FIELD.getPreferredName(), nodeId);
builder.field(EPHEMERAL_NODE_ID_FIELD.getPreferredName(), ephemeralNodeId);
builder.field(TYPE_FIELD.getPreferredName(), type);
builder.field(REASON_FIELD.getPreferredName(), reason);
builder.timestampFieldsFromUnixEpochMillis(
Expand Down Expand Up @@ -323,6 +346,9 @@ public String toString() {
.append("nodeId=[")
.append(nodeId)
.append(']')
.append("ephemeralNodeId=[")
.append(ephemeralNodeId)
.append(']')
.append(", type=[")
.append(type)
.append("], reason=[")
Expand Down Expand Up @@ -350,6 +376,7 @@ public static Builder builder(SingleNodeShutdownMetadata original) {
return builder();
}
return new Builder().setNodeId(original.getNodeId())
.setEphemeralNodeId(original.getEphemeralNodeId())
.setType(original.getType())
.setReason(original.getReason())
.setStartedAtMillis(original.getStartedAtMillis())
Expand All @@ -359,6 +386,7 @@ public static Builder builder(SingleNodeShutdownMetadata original) {

public static class Builder {
private String nodeId;
private String ephemeralNodeId;
private Type type;
private String reason;
private long startedAtMillis = -1;
Expand All @@ -378,6 +406,15 @@ public Builder setNodeId(String nodeId) {
return this;
}

/**
 * Records the ephemeral node ID to use for the metadata being built.
 * <p>
 * The value is stored as-is; no validation happens in this setter.
 *
 * @param ephemeralNodeId the ephemeral ID of the node instance the shutdown metadata refers to
 * @return this builder, to allow call chaining
 */
public Builder setEphemeralNodeId(String ephemeralNodeId) {
    this.ephemeralNodeId = ephemeralNodeId;
    return this;
}

/**
* @param type The type of shutdown.
* @return This builder.
Expand Down Expand Up @@ -444,6 +481,7 @@ public SingleNodeShutdownMetadata build() {

return new SingleNodeShutdownMetadata(
nodeId,
ephemeralNodeId,
type,
reason,
startedAtMillis,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ public class TransportPutShutdownNodeAction extends AcknowledgedTransportMasterN
private static boolean putShutdownNodeState(
Map<String, SingleNodeShutdownMetadata> shutdownMetadata,
Predicate<String> nodeExists,
Request request
Request request,
String ephemeralNodeId
) {
if (isNoop(shutdownMetadata, request)) {
return false;
Expand All @@ -58,6 +59,7 @@ private static boolean putShutdownNodeState(
final boolean nodeSeen = nodeExists.test(request.getNodeId());
SingleNodeShutdownMetadata newNodeMetadata = SingleNodeShutdownMetadata.builder()
.setNodeId(request.getNodeId())
.setEphemeralNodeId(ephemeralNodeId)
.setType(request.getType())
.setReason(request.getReason())
.setStartedAtMillis(System.currentTimeMillis())
Expand Down Expand Up @@ -103,8 +105,9 @@ public ClusterState execute(BatchExecutionContext<PutShutdownNodeTask> batchExec
boolean needsReroute = false;
for (final var taskContext : batchExecutionContext.taskContexts()) {
var request = taskContext.getTask().request();
var ephemeralNodeId = initialState.nodes().getNodes().get(request.getNodeId()).getEphemeralId();
try (var ignored = taskContext.captureResponseHeaders()) {
changed |= putShutdownNodeState(shutdownMetadata, nodeExistsPredicate, request);
changed |= putShutdownNodeState(shutdownMetadata, nodeExistsPredicate, request, ephemeralNodeId);
} catch (Exception e) {
taskContext.onFailure(e);
continue;
Expand Down

0 comments on commit 3409255

Please sign in to comment.