Skip to content

Commit

Permalink
[PLAT-15225] Verify there is no running master on nodes selected for …
Browse files Browse the repository at this point in the history
…master replacement

Summary:
This adds remote call to check process status for master. As this is run in findReplacementMaster which is invoked in a transaction block, call site is changed to precheck to not hold up the transaction. This changes the object references. So, node name is returned
to avoid making mistakes by using the node object to update universe record. The call site move means master state also needs to be checked because isMaster can be false.

Test Plan:
1. Created a 5 node RF 3 cluster.
2. Stopped node with master spins up another master on node.
3. Started the node and started master on the tserver only node.
4. Stopped this new master node. Precheck failed.
5. Stoppped the master from the tserver only node.
6. Stopped the new master node. This time, it succeeded.
7. Repeat the same for remove node.
8. Tested master failover by stopping a master VM.

{F287918}

Reviewers: cwang, sanketh, yshchetinin

Reviewed By: cwang, yshchetinin

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D38254
  • Loading branch information
nkhogen committed Sep 26, 2024
1 parent d6a19da commit acbb1ba
Show file tree
Hide file tree
Showing 14 changed files with 333 additions and 125 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import com.yugabyte.yw.common.ImageBundleUtil;
import com.yugabyte.yw.common.NodeManager;
import com.yugabyte.yw.common.NodeUIApiHelper;
import com.yugabyte.yw.common.NodeUniverseManager;
import com.yugabyte.yw.common.PlatformExecutorFactory;
import com.yugabyte.yw.common.ReleaseManager;
import com.yugabyte.yw.common.RestoreManagerYb;
Expand Down Expand Up @@ -93,6 +94,7 @@ public abstract class AbstractTaskBase implements ITask {
protected final ReleaseManager releaseManager;
protected final YsqlQueryExecutor ysqlQueryExecutor;
protected final GFlagsValidation gFlagsValidation;
protected final NodeUniverseManager nodeUniverseManager;

@Inject
protected AbstractTaskBase(BaseTaskDependencies baseTaskDependencies) {
Expand Down Expand Up @@ -120,6 +122,7 @@ protected AbstractTaskBase(BaseTaskDependencies baseTaskDependencies) {
this.releaseManager = baseTaskDependencies.getReleaseManager();
this.ysqlQueryExecutor = baseTaskDependencies.getYsqlQueryExecutor();
this.gFlagsValidation = baseTaskDependencies.getGFlagsValidation();
this.nodeUniverseManager = baseTaskDependencies.getNodeUniverseManager();
}

protected ITaskParams taskParams() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,8 @@ Map<String, Long> getFollowerLagMs(String ip, int port) {
private CustomerTask submitMasterFailoverTask(
Customer customer, Universe universe, Action action) {
NodeDetails node = universe.getNode(action.getNodeName());
NodeDetails possibleReplacementCandidate = findReplacementMaster(universe, node);
String possibleReplacementCandidate =
findReplacementMaster(universe, node, true /* pickNewNode */);
if (possibleReplacementCandidate == null) {
log.error(
"No replacement master found for node {} in universe {}",
Expand All @@ -483,7 +484,7 @@ private CustomerTask submitMasterFailoverTask(
}
log.debug(
"Found a possible replacement master candidate {} for universe {}",
possibleReplacementCandidate.getNodeName(),
possibleReplacementCandidate,
universe.getUniverseUUID());
Set<String> leaderlessTablets = getLeaderlessTablets(universe.getUniverseUUID());
if (CollectionUtils.isNotEmpty(leaderlessTablets)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -435,8 +435,8 @@ private void detectMasterFailure(
}
// Verify that the replacement node exists before creating the failover schedule.
NodeDetails node = universe.getNode(action.getNodeName());
NodeDetails possibleReplacementCandidate =
autoMasterFailover.findReplacementMaster(universe, node);
String possibleReplacementCandidate =
autoMasterFailover.findReplacementMaster(universe, node, true /* pickNewNode */);
if (possibleReplacementCandidate == null) {
disableSchedule(customer, failoverScheduleName, true);
log.info(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import com.yugabyte.yw.common.ImageBundleUtil;
import com.yugabyte.yw.common.NodeManager;
import com.yugabyte.yw.common.NodeUIApiHelper;
import com.yugabyte.yw.common.NodeUniverseManager;
import com.yugabyte.yw.common.PlatformExecutorFactory;
import com.yugabyte.yw.common.ReleaseManager;
import com.yugabyte.yw.common.RestoreManagerYb;
Expand Down Expand Up @@ -62,4 +63,5 @@ public class BaseTaskDependencies {
private final ReleaseManager releaseManager;
private final YsqlQueryExecutor ysqlQueryExecutor;
private final GFlagsValidation gFlagsValidation;
private final NodeUniverseManager nodeUniverseManager;
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import com.yugabyte.yw.models.NodeInstance;
import com.yugabyte.yw.models.Universe;
import com.yugabyte.yw.models.helpers.NodeDetails;
import com.yugabyte.yw.models.helpers.NodeDetails.MasterState;
import com.yugabyte.yw.models.helpers.NodeDetails.NodeState;
import java.util.Collection;
import java.util.Collections;
Expand Down Expand Up @@ -134,6 +135,9 @@ private void configureNodeDetails(Universe universe) {
private void freezeUniverseInTxn(Universe universe) {
NodeDetails universeNode = universe.getNode(taskParams().nodeName);
universeNode.nodeUuid = currentNode.nodeUuid;
if (addMaster) {
universeNode.masterState = MasterState.ToStart;
}
// Confirm the node on hold.
commitReservedNodes();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
@Retryable
public class MasterFailover extends UniverseDefinitionTaskBase {

private String replacementMasterName;

@Inject
protected MasterFailover(BaseTaskDependencies baseTaskDependencies) {
super(baseTaskDependencies);
Expand All @@ -29,7 +31,7 @@ protected NodeTaskParams taskParams() {
return (NodeTaskParams) taskParams;
}

private void freezeUniverseInTxn(Universe universe) {
private NodeDetails runBasicChecks(Universe universe) {
NodeDetails node = universe.getNode(taskParams().nodeName);
if (node == null) {
String errMsg =
Expand All @@ -39,24 +41,40 @@ private void freezeUniverseInTxn(Universe universe) {
log.error(errMsg);
throw new RuntimeException(errMsg);
}
if (!node.isMaster) {
if (!node.isMaster && node.masterState != MasterState.ToStop) {
// On first try, isMaster must be set. On retry, masterState must be set to ToStop.
String errMsg =
String.format(
"Node %s must be a master in the universe %s",
taskParams().nodeName, universe.getUniverseUUID());
log.error(errMsg);
throw new RuntimeException(errMsg);
}
NodeDetails newMasterNode = super.findReplacementMaster(universe, node);
if (newMasterNode == null) {
return node;
}

private void freezeUniverseInTxn(Universe universe) {
NodeDetails node = universe.getNode(taskParams().nodeName);
if (node == null) {
String errMsg =
String.format(
"No replacement is found for master %s in the universe %s",
"Node %s is not found in universe %s",
taskParams().nodeName, universe.getUniverseUUID());
log.error(errMsg);
throw new RuntimeException(errMsg);
}
newMasterNode.masterState = MasterState.ToStart;
if (!node.isMaster) {
String errMsg =
String.format(
"Node %s must be a master in the universe %s",
taskParams().nodeName, universe.getUniverseUUID());
log.error(errMsg);
throw new RuntimeException(errMsg);
}
NodeDetails replacementMaster = universe.getNode(replacementMasterName);
if (replacementMaster.masterState == null) {
replacementMaster.masterState = MasterState.ToStart;
}
node.masterState = MasterState.ToStop;
}

Expand All @@ -74,30 +92,23 @@ public void validateParams(boolean isFirstTry) {
throw new RuntimeException(errMsg);
}
}
NodeDetails node = universe.getNode(taskParams().nodeName);
if (node == null) {
runBasicChecks(universe);
}

@Override
protected void createPrecheckTasks(Universe universe) {
NodeDetails node = runBasicChecks(universe);
super.addBasicPrecheckTasks();
// Pick new only on first try.
replacementMasterName = super.findReplacementMaster(universe, node, isFirstTry());
if (replacementMasterName == null) {
String errMsg =
String.format(
"Node %s is not found in universe %s",
"No replacement is found for master %s in the universe %s",
taskParams().nodeName, universe.getUniverseUUID());
log.error(errMsg);
throw new RuntimeException(errMsg);
}
if (isFirstTry()) {
if (!node.isMaster) {
String errMsg =
String.format(
"Node %s must be a master in the universe %s",
taskParams().nodeName, universe.getUniverseUUID());
log.error(errMsg);
throw new RuntimeException(errMsg);
}
}
}

@Override
protected void createPrecheckTasks(Universe universe) {
super.addBasicPrecheckTasks();
}

@Override
Expand Down Expand Up @@ -137,7 +148,7 @@ public void run() {
createMasterReplacementTasks(
universe,
currentNode,
() -> super.findReplacementMaster(universe, currentNode),
() -> replacementMasterName == null ? null : universe.getNode(replacementMasterName),
super.instanceExists(taskParams()),
true /*ignoreStopErrors*/,
true /*ignoreMasterAddrsUpdateError*/,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
@Retryable
public class RemoveNodeFromUniverse extends UniverseDefinitionTaskBase {

private String replacementMasterName;

@Inject
protected RemoveNodeFromUniverse(BaseTaskDependencies baseTaskDependencies) {
super(baseTaskDependencies);
Expand All @@ -55,16 +57,17 @@ protected NodeTaskParams taskParams() {
return (NodeTaskParams) taskParams;
}

protected NodeDetails findNewMasterIfApplicable(Universe universe, NodeDetails currentNode) {
private String findReplacementMasterIfApplicable(
Universe universe, NodeDetails currentNode, boolean pickNewNode) {
boolean startMasterOnRemoveNode =
confGetter.getGlobalConf(GlobalConfKeys.startMasterOnRemoveNode);
if (startMasterOnRemoveNode && NodeActionFormData.startMasterOnRemoveNode) {
return super.findReplacementMaster(universe, currentNode);
return super.findReplacementMaster(universe, currentNode, pickNewNode);
}
return null;
}

private void runBasicChecks(Universe universe) {
private NodeDetails runBasicChecks(Universe universe) {
NodeDetails currentNode = universe.getNode(taskParams().nodeName);
if (currentNode == null) {
String msg = "No node " + taskParams().nodeName + " found in universe " + universe.getName();
Expand All @@ -75,6 +78,7 @@ private void runBasicChecks(Universe universe) {
if (isFirstTry()) {
currentNode.validateActionOnState(NodeActionType.REMOVE);
}
return currentNode;
}

@Override
Expand All @@ -86,13 +90,33 @@ public void validateParams(boolean isFirstTry) {
@Override
protected void createPrecheckTasks(Universe universe) {
// Check again after locking.
runBasicChecks(getUniverse());
NodeDetails currentNode = runBasicChecks(getUniverse());
boolean alwaysWaitForDataMove =
confGetter.getConfForScope(getUniverse(), UniverseConfKeys.alwaysWaitForDataMove);
if (alwaysWaitForDataMove) {
performPrecheck();
}
addBasicPrecheckTasks();
// Pick new only on first try.
replacementMasterName = findReplacementMasterIfApplicable(universe, currentNode, isFirstTry());
}

private void freezeUniverseInTxn(Universe universe) {
NodeDetails node = universe.getNode(taskParams().nodeName);
if (node == null) {
String msg = "No node " + taskParams().nodeName + " found in universe " + universe.getName();
log.error(msg);
throw new RuntimeException(msg);
}
if (node.isMaster) {
if (replacementMasterName != null) {
NodeDetails replacementMaster = universe.getNode(replacementMasterName);
if (replacementMaster.masterState == null) {
replacementMaster.masterState = MasterState.ToStart;
}
}
node.masterState = MasterState.ToStop;
}
}

@Override
Expand All @@ -108,23 +132,7 @@ public void run() {

Universe universe =
lockAndFreezeUniverseForUpdate(
taskParams().expectedUniverseVersion,
u -> {
NodeDetails node = u.getNode(taskParams().nodeName);
if (node == null) {
String msg =
"No node " + taskParams().nodeName + " found in universe " + u.getName();
log.error(msg);
throw new RuntimeException(msg);
}
if (node.isMaster) {
NodeDetails newMasterNode = findNewMasterIfApplicable(u, node);
if (newMasterNode != null && newMasterNode.masterState == null) {
newMasterNode.masterState = MasterState.ToStart;
}
node.masterState = MasterState.ToStop;
}
});
taskParams().expectedUniverseVersion, this::freezeUniverseInTxn);
try {
preTaskActions();
NodeDetails currentNode = universe.getNode(taskParams().nodeName);
Expand Down Expand Up @@ -176,7 +184,7 @@ public void run() {
createMasterReplacementTasks(
universe,
currentNode,
() -> findNewMasterIfApplicable(universe, currentNode),
() -> replacementMasterName == null ? null : universe.getNode(replacementMasterName),
masterReachable,
true /* ignore stop error */);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ public class StopNodeInUniverse extends UniverseDefinitionTaskBase {

@Inject private RuntimeConfGetter confGetter;

private String replacementMasterName;

@Inject
protected StopNodeInUniverse(BaseTaskDependencies baseTaskDependencies) {
super(baseTaskDependencies);
Expand All @@ -45,10 +47,11 @@ protected NodeTaskParams taskParams() {
return (NodeTaskParams) taskParams;
}

protected NodeDetails findNewMasterIfApplicable(Universe universe, NodeDetails currentNode) {
protected String findReplacementMasterIfApplicable(
Universe universe, NodeDetails currentNode, boolean pickNewNode) {
boolean startMasterOnStopNode = confGetter.getGlobalConf(GlobalConfKeys.startMasterOnStopNode);
if (startMasterOnStopNode && NodeActionFormData.startMasterOnStopNode) {
return super.findReplacementMaster(universe, currentNode);
return super.findReplacementMaster(universe, currentNode, pickNewNode);
}
return null;
}
Expand Down Expand Up @@ -89,6 +92,8 @@ protected void createPrecheckTasks(Universe universe) {
false);
}
addBasicPrecheckTasks();
// Pick new only on first try.
replacementMasterName = findReplacementMasterIfApplicable(universe, currentNode, isFirstTry());
}

private void freezeUniverseInTxn(Universe universe) {
Expand All @@ -99,9 +104,11 @@ private void freezeUniverseInTxn(Universe universe) {
throw new RuntimeException(msg);
}
if (node.isMaster) {
NodeDetails newMasterNode = findNewMasterIfApplicable(universe, node);
if (newMasterNode != null && newMasterNode.masterState == null) {
newMasterNode.masterState = MasterState.ToStart;
if (replacementMasterName != null) {
NodeDetails replacementMaster = universe.getNode(replacementMasterName);
if (replacementMaster.masterState == null) {
replacementMaster.masterState = MasterState.ToStart;
}
}
node.masterState = MasterState.ToStop;
}
Expand Down Expand Up @@ -164,7 +171,7 @@ public void run() {
createMasterReplacementTasks(
universe,
currentNode,
() -> findNewMasterIfApplicable(universe, currentNode),
() -> replacementMasterName == null ? null : universe.getNode(replacementMasterName),
instanceExists,
false /* ignore stop error */);

Expand Down
Loading

0 comments on commit acbb1ba

Please sign in to comment.