Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Periodic warning for 1-node cluster w/ seed hosts #88013

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/88013.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 88013
summary: Periodic warning for 1-node cluster w/ seed hosts
area: Cluster Coordination
type: enhancement
issues:
- 85222
5 changes: 5 additions & 0 deletions docs/reference/modules/discovery/discovery-settings.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,11 @@ Sets how long the master node waits for each cluster state update to be
completely published to all nodes, unless `discovery.type` is set to
`single-node`. The default value is `30s`. See <<cluster-state-publishing>>.

`cluster.discovery_configuration_check.interval`::
(<<static-cluster-setting,Static>>)
Sets the interval between periodic checks that log a warning if the discovery
configuration looks incorrect, for example when a fully-formed single-node
cluster is still configured with seed hosts. The default value is `30s`.

`cluster.join_validation.cache_timeout`::
(<<static-cluster-setting,Static>>)
When a node requests to join the cluster, the elected master node sends it a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@

import static org.elasticsearch.cluster.coordination.NoMasterBlockService.NO_MASTER_BLOCK_ID;
import static org.elasticsearch.core.Strings.format;
import static org.elasticsearch.discovery.SettingsBasedSeedHostsProvider.DISCOVERY_SEED_HOSTS_SETTING;
import static org.elasticsearch.gateway.ClusterStateUpdaters.hideStateIfNotRecovered;
import static org.elasticsearch.gateway.GatewayService.STATE_NOT_RECOVERED_BLOCK;
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
Expand All @@ -116,6 +117,13 @@ public class Coordinator extends AbstractLifecycleComponent implements ClusterSt
Setting.Property.NodeScope
);

public static final Setting<TimeValue> SINGLE_NODE_CLUSTER_SEED_HOSTS_CHECK_INTERVAL_SETTING = Setting.timeSetting(
kingherc marked this conversation as resolved.
Show resolved Hide resolved
"cluster.discovery_configuration_check.interval",
TimeValue.timeValueMillis(30000),
TimeValue.timeValueMillis(1),
Setting.Property.NodeScope
);

public static final String COMMIT_STATE_ACTION_NAME = "internal:cluster/coordination/commit_state";

private final Settings settings;
Expand All @@ -140,6 +148,9 @@ public class Coordinator extends AbstractLifecycleComponent implements ClusterSt
private final SeedHostsResolver configuredHostsResolver;
private final TimeValue publishTimeout;
private final TimeValue publishInfoTimeout;
private final TimeValue singleNodeClusterSeedHostsCheckInterval;
@Nullable
private Scheduler.Cancellable singleNodeClusterChecker = null;
private final PublicationTransportHandler publicationHandler;
private final LeaderChecker leaderChecker;
private final FollowersChecker followersChecker;
Expand Down Expand Up @@ -218,6 +229,7 @@ public Coordinator(
this.joinAccumulator = new InitialJoinAccumulator();
this.publishTimeout = PUBLISH_TIMEOUT_SETTING.get(settings);
this.publishInfoTimeout = PUBLISH_INFO_TIMEOUT_SETTING.get(settings);
this.singleNodeClusterSeedHostsCheckInterval = SINGLE_NODE_CLUSTER_SEED_HOSTS_CHECK_INTERVAL_SETTING.get(settings);
this.random = random;
this.electionSchedulerFactory = new ElectionSchedulerFactory(settings, random, transportService.getThreadPool());
this.preVoteCollector = new PreVoteCollector(
Expand Down Expand Up @@ -739,6 +751,38 @@ private void processJoinRequest(JoinRequest joinRequest, ActionListener<Void> jo
}
}

private void cancelSingleNodeClusterChecker() {
kingherc marked this conversation as resolved.
Show resolved Hide resolved
assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
if (singleNodeClusterChecker != null) {
singleNodeClusterChecker.cancel();
singleNodeClusterChecker = null;
}
}

/**
 * Body of the periodic single-node-cluster check. Logs a warning when the applied cluster state contains at most one
 * node while the discovery seed hosts setting is explicitly configured with a non-empty value. Does nothing if the
 * cluster has more than one node, or if the setting is absent or set to an empty list.
 */
private void checkSingleNodeCluster() {
    final boolean hasMultipleNodes = applierState.nodes().size() > 1;
    if (hasMultipleNodes) {
        return;
    }

    // Nothing to report unless seed hosts were explicitly configured ...
    if (DISCOVERY_SEED_HOSTS_SETTING.exists(settings) == false) {
        return;
    }

    // ... and for a single-node cluster, the only acceptable explicit value is an empty list.
    if (DISCOVERY_SEED_HOSTS_SETTING.get(settings).isEmpty()) {
        return;
    }

    logger.warn(
        """
            This node is a fully-formed single-node cluster with cluster UUID [{}], but it is configured as if to \
            discover other nodes and form a multi-node cluster via the [{}] setting. Fully-formed clusters do not \
            attempt to discover other nodes, and nodes with different cluster UUIDs cannot belong to the same cluster. \
            The cluster UUID persists across restarts and can only be changed by deleting the contents of the node's \
            data path(s). Remove the discovery configuration to suppress this message.""",
        applierState.metadata().clusterUUID(),
        DISCOVERY_SEED_HOSTS_SETTING.getKey() + "=" + DISCOVERY_SEED_HOSTS_SETTING.get(settings)
    );
}

void becomeCandidate(String method) {
assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
logger.debug(
Expand All @@ -748,6 +792,7 @@ void becomeCandidate(String method) {
mode,
lastKnownLeader
);
cancelSingleNodeClusterChecker();

if (mode != Mode.CANDIDATE) {
final Mode prevMode = mode;
Expand Down Expand Up @@ -803,6 +848,13 @@ private void becomeLeader() {

assert leaderChecker.leader() == null : leaderChecker.leader();
followersChecker.updateFastResponseState(getCurrentTerm(), mode);

if (applierState.nodes().size() > 1) {
cancelSingleNodeClusterChecker();
} else if (singleNodeClusterChecker == null) {
singleNodeClusterChecker = transportService.getThreadPool()
.scheduleWithFixedDelay(() -> { checkSingleNodeCluster(); }, this.singleNodeClusterSeedHostsCheckInterval, Names.SAME);
}
}

void becomeFollower(String method, DiscoveryNode leaderNode) {
Expand All @@ -822,6 +874,7 @@ void becomeFollower(String method, DiscoveryNode leaderNode) {
lastKnownLeader
);
}
cancelSingleNodeClusterChecker();

final boolean restartLeaderChecker = (mode == Mode.FOLLOWER && Optional.of(leaderNode).equals(lastKnownLeader)) == false;

Expand Down Expand Up @@ -1028,6 +1081,10 @@ assert getLocalNode().equals(applierState.nodes().getMasterNode())
: coordinationState.get().getLastAcceptedConfiguration()
+ " != "
+ coordinationState.get().getLastCommittedConfiguration();

if (coordinationState.get().getLastAcceptedState().nodes().size() == 1) {
assert singleNodeClusterChecker != null;
}
} else if (mode == Mode.FOLLOWER) {
assert coordinationState.get().electionWon() == false : getLocalNode() + " is FOLLOWER so electionWon() should be false";
assert lastKnownLeader.isPresent() && (lastKnownLeader.get().equals(getLocalNode()) == false);
Expand All @@ -1045,6 +1102,7 @@ assert getLocalNode().equals(applierState.nodes().getMasterNode())
assert currentPublication.map(Publication::isCommitted).orElse(true);
assert preVoteCollector.getLeader().equals(lastKnownLeader.get()) : preVoteCollector;
assert clusterFormationFailureHelper.isRunning() == false;
assert singleNodeClusterChecker == null;
} else {
assert mode == Mode.CANDIDATE;
assert joinAccumulator instanceof JoinHelper.CandidateJoinAccumulator;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,7 @@ public void apply(Settings value, Settings current, Settings previous) {
ElectionSchedulerFactory.ELECTION_DURATION_SETTING,
Coordinator.PUBLISH_TIMEOUT_SETTING,
Coordinator.PUBLISH_INFO_TIMEOUT_SETTING,
Coordinator.SINGLE_NODE_CLUSTER_SEED_HOSTS_CHECK_INTERVAL_SETTING,
JoinValidationService.JOIN_VALIDATION_CACHE_TIMEOUT_SETTING,
FollowersChecker.FOLLOWER_CHECK_TIMEOUT_SETTING,
FollowersChecker.FOLLOWER_CHECK_INTERVAL_SETTING,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
import static org.elasticsearch.cluster.coordination.NoMasterBlockService.NO_MASTER_BLOCK_WRITES;
import static org.elasticsearch.cluster.coordination.Reconfigurator.CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION;
import static org.elasticsearch.discovery.PeerFinder.DISCOVERY_FIND_PEERS_INTERVAL_SETTING;
import static org.elasticsearch.discovery.SettingsBasedSeedHostsProvider.DISCOVERY_SEED_HOSTS_SETTING;
import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
import static org.elasticsearch.test.NodeRoles.nonMasterNode;
Expand Down Expand Up @@ -2109,6 +2110,61 @@ public void assertMatched() {
}
}

/**
 * Verifies that a stabilised single-node cluster which has discovery seed hosts configured repeatedly logs the
 * "fully-formed single-node cluster" warning, once per check interval, and that the warning reports the cluster UUID
 * of the applied cluster state. The check interval is either left at its default or set to a random value.
 */
@TestLogging(
    reason = "testing warning of a single-node cluster having discovery seed hosts",
    value = "org.elasticsearch.cluster.coordination.Coordinator:WARN"
)
public void testLogsWarningPeriodicallyIfSingleNodeClusterHasSeedHosts() throws IllegalAccessException {
    final long warningDelayMillis;
    final Settings settings;
    final String fakeSeedHost = buildNewFakeTransportAddress().toString();
    if (randomBoolean()) {
        // Leave the check interval at its default and read that default back.
        settings = Settings.builder().putList(DISCOVERY_SEED_HOSTS_SETTING.getKey(), fakeSeedHost).build();
        warningDelayMillis = Coordinator.SINGLE_NODE_CLUSTER_SEED_HOSTS_CHECK_INTERVAL_SETTING.get(settings).millis();
    } else {
        // Configure the interval of the check under test. Setting any other key here would leave the checker
        // running at its default interval, so waiting for warningDelayMillis would not reliably observe a warning.
        warningDelayMillis = randomLongBetween(1, 100000);
        settings = Settings.builder()
            .put(Coordinator.SINGLE_NODE_CLUSTER_SEED_HOSTS_CHECK_INTERVAL_SETTING.getKey(), warningDelayMillis + "ms")
            .putList(DISCOVERY_SEED_HOSTS_SETTING.getKey(), fakeSeedHost)
            .build();
    }
    logger.info("--> emitting warnings every [{}ms]", warningDelayMillis);

    try (Cluster cluster = new Cluster(1, true, settings)) {
        cluster.runRandomly();
        cluster.stabilise();

        // Observe several consecutive intervals to confirm that the warning is periodic rather than one-off.
        for (int i = scaledRandomIntBetween(1, 10); i >= 0; i--) {
            final MockLogAppender mockLogAppender = new MockLogAppender();
            try {
                mockLogAppender.start();
                Loggers.addAppender(LogManager.getLogger(Coordinator.class), mockLogAppender);
                mockLogAppender.addExpectation(new MockLogAppender.LoggingExpectation() {
                    String loggedClusterUuid;

                    @Override
                    public void match(LogEvent event) {
                        final String message = event.getMessage().getFormattedMessage();
                        assertThat(message, startsWith("This node is a fully-formed single-node cluster with cluster UUID"));
                        // The first message parameter of the warning is the cluster UUID.
                        loggedClusterUuid = (String) event.getMessage().getParameters()[0];
                    }

                    @Override
                    public void assertMatched() {
                        final String clusterUuid = cluster.getAnyNode().getLastAppliedClusterState().metadata().clusterUUID();
                        // Compare the UUID captured from the log event; this also fails (null) if no warning was logged.
                        assertThat(loggedClusterUuid + " vs " + clusterUuid, loggedClusterUuid, equalTo(clusterUuid));
                    }
                });
                cluster.runFor(warningDelayMillis + DEFAULT_DELAY_VARIABILITY, "waiting for warning to be emitted");
                mockLogAppender.assertAllExpectationsMatched();
            } finally {
                Loggers.removeAppender(LogManager.getLogger(Coordinator.class), mockLogAppender);
                mockLogAppender.stop();
            }
        }
    }
}

@TestLogging(
reason = "testing LagDetector and CoordinatorPublication logging",
value = "org.elasticsearch.cluster.coordination.LagDetector:DEBUG,"
Expand Down