Skip to content

Commit

Permalink
[PLAT-14366] Basic local provider test for master auto failover
Browse files Browse the repository at this point in the history
Summary: Local provider test for auto-master failover.

Test Plan:
Ran the failover in a docker container.

```
[info] Test run com.yugabyte.yw.commissioner.tasks.local.AutoMasterFailoverLocalTest finished: 0 failed, 0 ignored, 1 total, 364.275s
[info] Passed: Total 1, Failed 0, Errors 0, Passed 1
```

Reviewers: sanketh, cwang, yshchetinin

Reviewed By: cwang

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D36726
  • Loading branch information
nkhogen committed Jul 24, 2024
1 parent 49523f5 commit b039d1a
Show file tree
Hide file tree
Showing 18 changed files with 626 additions and 154 deletions.
8 changes: 4 additions & 4 deletions managed/devops/opscli/ybops/cloud/common/method.py
Original file line number Diff line number Diff line change
Expand Up @@ -854,8 +854,8 @@ def callback(self, args):
raise YBOpsRecoverableError("Could not connect({}) into node {}:{} using username {}"
.format(host_port_user["connection_type"],
host_port_user["host"],
host_port_user["user"],
host_port_user["port"]))
host_port_user["port"],
host_port_user["user"]))

def get_device_names(self, args, host_info=None):
return self.cloud.get_device_names(args)
Expand Down Expand Up @@ -2183,5 +2183,5 @@ def callback(self, args):
raise YBOpsRecoverableError("Could not connect({}) into node {}:{} using username {}"
.format(host_port_user["connection_type"],
host_port_user["host"],
host_port_user["user"],
host_port_user["port"]))
host_port_user["port"],
host_port_user["user"]))
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,6 @@
import com.yugabyte.yw.models.helpers.NodeDetails.NodeState;
import com.yugabyte.yw.models.helpers.TaskType;
import com.yugabyte.yw.models.helpers.schedule.JobConfig.RuntimeParams;
import java.time.Duration;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
Expand All @@ -37,7 +34,9 @@
import java.util.UUID;
import lombok.Builder;
import lombok.Getter;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.yb.client.GetMasterHeartbeatDelaysResponse;
import org.yb.client.YBClient;
import org.yb.util.ServerInfo;
Expand All @@ -55,6 +54,7 @@ public class AutoMasterFailover extends UniverseDefinitionTaskBase {

@Builder
@Getter
@ToString
// The fail-over action to be performed as a result of the detection.
static class Action {
@Builder.Default ActionType actionType = ActionType.NONE;
Expand Down Expand Up @@ -413,7 +413,7 @@ public Action getAllowedMasterFailoverAction(Customer customer, Universe univers
@VisibleForTesting
Map<String, Long> getFollowerLagMs(String ip, int port) {
String endpoint = String.format(FOLLOWER_LAG_URL_FORMAT, ip, port);
log.info("Getting follower lag for endpoint {} {}", endpoint, nodeUIApiHelper);
log.info("Getting follower lag for endpoint {}", endpoint);
try {
JsonNode currentNodeMetricsJson = nodeUIApiHelper.getRequest(endpoint);
JsonNode errors = currentNodeMetricsJson.get("error");
Expand All @@ -439,21 +439,6 @@ Map<String, Long> getFollowerLagMs(String ip, int port) {

private CustomerTask submitMasterFailoverTask(
Customer customer, Universe universe, String failedNodeName) {
CustomerTask lastTask = CustomerTask.getLastTaskByTargetUuid(universe.getUniverseUUID());
if (lastTask != null && lastTask.getCompletionTime() != null) {
// Cooldown is calculated from the last task.
Duration cooldownPeriod =
confGetter.getConfForScope(universe, UniverseConfKeys.autoMasterFailoverCooldown);
Instant restrictionEndTime =
lastTask
.getCompletionTime()
.toInstant()
.plus(cooldownPeriod.getSeconds(), ChronoUnit.SECONDS);
if (restrictionEndTime.isAfter(Instant.now())) {
log.info("Universe {} is cooling down", universe.getUniverseUUID());
return null;
}
}
NodeDetails node = universe.getNode(failedNodeName);
NodeDetails possibleReplacementCandidate = findReplacementMaster(universe, node);
if (possibleReplacementCandidate == null) {
Expand All @@ -467,6 +452,14 @@ private CustomerTask submitMasterFailoverTask(
"Found a possible replacement master candidate {} for universe {}",
possibleReplacementCandidate.getNodeName(),
universe.getUniverseUUID());
Set<String> leaderlessTablets = getLeaderlessTablets(universe.getUniverseUUID());
if (CollectionUtils.isNotEmpty(leaderlessTablets)) {
log.error(
"Leaderless tablets {} found for universe {}",
Iterables.limit(leaderlessTablets, 10),
universe.getUniverseUUID());
return null;
}
NodeTaskParams taskParams = new NodeTaskParams();
taskParams.setUniverseUUID(universe.getUniverseUUID());
taskParams.nodeName = failedNodeName;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,24 @@
import com.yugabyte.yw.commissioner.Common.CloudType;
import com.yugabyte.yw.common.PlatformExecutorFactory;
import com.yugabyte.yw.common.PlatformScheduler;
import com.yugabyte.yw.common.Util;
import com.yugabyte.yw.common.config.GlobalConfKeys;
import com.yugabyte.yw.common.config.RuntimeConfGetter;
import com.yugabyte.yw.common.config.UniverseConfKeys;
import com.yugabyte.yw.forms.UniverseDefinitionTaskParams.UserIntent;
import com.yugabyte.yw.models.Customer;
import com.yugabyte.yw.models.CustomerTask;
import com.yugabyte.yw.models.JobSchedule;
import com.yugabyte.yw.models.TaskInfo;
import com.yugabyte.yw.models.Universe;
import com.yugabyte.yw.models.helpers.TaskType;
import com.yugabyte.yw.models.helpers.schedule.JobConfig;
import com.yugabyte.yw.models.helpers.schedule.JobConfig.RuntimeParams;
import com.yugabyte.yw.models.helpers.schedule.ScheduleConfig;
import com.yugabyte.yw.scheduler.JobScheduler;
import java.time.Duration;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
Expand All @@ -40,13 +46,11 @@
@Singleton
@Slf4j
public class AutoMasterFailoverScheduler {
// Polling interval for checking if a universe has the detection schedule created.
private static final String AUTO_MASTER_FAILOVER_POLLER_INTERVAL =
"yb.auto_master_failover.poller_interval";

private static final String AUTO_MASTER_FAILOVER_POOL_NAME = "auto_master_failover.executor";
private static final String AUTO_MASTER_FAILOVER_SCHEDULE_NAME_FORMAT = "AutoMasterFailover_%s";
private static final String DETECT_MASTER_FAILURE_SCHEDULE_NAME_FORMAT = "DetectMasterFailure_%s";
private static final String SUPPORTED_DB_STABLE_VERSION = "2.20.3.0-b10";
private static final String SUPPORTED_DB_PREVIEW_VERSION = "2.21.0.0-b309";

private final RuntimeConfGetter confGetter;
private final PlatformExecutorFactory platformExecutorFactory;
Expand Down Expand Up @@ -117,10 +121,10 @@ public CompletableFuture<?> executeJob(RuntimeParams runtime) {
.ifPresent(
tf -> {
if (tf.getTaskState() == TaskInfo.State.Success) {
// Task executed successfully. Delete the fail-over schedule.
// Task executed successfully. Disable the schedule for tracking.
runtime
.getJobScheduler()
.deleteSchedule(runtime.getJobSchedule().getUuid());
.disableSchedule(runtime.getJobSchedule().getUuid(), true);
} else {
// Fail the job and keep the schedule to keep track of the failed
// counts.
Expand All @@ -142,7 +146,7 @@ public CompletableFuture<?> executeJob(RuntimeParams runtime) {

public void init() {
Duration pollingInterval =
confGetter.getStaticConf().getDuration(AUTO_MASTER_FAILOVER_POLLER_INTERVAL);
confGetter.getGlobalConf(GlobalConfKeys.autoMasterFailoverPollerInterval);
platformScheduler.schedule(
getClass().getSimpleName(), pollingInterval, pollingInterval, this::createSchedules);
failoverExecutor =
Expand All @@ -169,19 +173,51 @@ private void createSchedules() {
u -> {
UserIntent userIntent =
u.getUniverseDetails().getPrimaryCluster().userIntent;
if (userIntent != null
&& userIntent.providerType != CloudType.kubernetes) {
try {
createDetectMasterFailureSchedule(c, u);
} catch (Exception e) {
log.error(
"Error in creating master failure detection schedule for universe"
+ " {} - {}",
u.getUniverseUUID(),
e.getMessage());
} finally {
universeUuids.add(u.getUniverseUUID());
}
if (userIntent == null
|| userIntent.providerType == CloudType.kubernetes) {
return;
}
boolean isFailoverEnabled =
confGetter.getConfForScope(
u, UniverseConfKeys.enableAutoMasterFailover);
if (!isFailoverEnabled) {
log.debug(
"Automated master failover for universe {} is disabled",
u.getUniverseUUID());
return;
}
if (u.getUniverseDetails().universePaused) {
log.debug(
"Automated master failover for universe {} is paused",
u.getUniverseUUID());
return;
}
String ybDbVersion = userIntent.ybSoftwareVersion;
if (Util.compareYBVersions(
ybDbVersion,
SUPPORTED_DB_STABLE_VERSION,
SUPPORTED_DB_PREVIEW_VERSION,
true)
< 0) {
log.info(
"Auto master failover not supported in current version {}",
ybDbVersion);
log.info(
"Supported versions are from {} (stable) and {} (preview)",
SUPPORTED_DB_STABLE_VERSION,
SUPPORTED_DB_PREVIEW_VERSION);
return;
}
try {
createDetectMasterFailureSchedule(c, u);
} catch (Exception e) {
log.error(
"Error in creating master failure detection schedule for universe"
+ " {} - {}",
u.getUniverseUUID(),
e.getMessage());
} finally {
universeUuids.add(u.getUniverseUUID());
}
}));
try {
Expand All @@ -199,41 +235,22 @@ private void createSchedules() {

/** Create a schedule to detect master failure for the given universe */
private void createDetectMasterFailureSchedule(Customer customer, Universe universe) {
boolean isFailoverEnabled =
confGetter.getConfForScope(universe, UniverseConfKeys.enableAutoMasterFailover);
if (!isFailoverEnabled) {
log.debug(
"Skipping automated master failover for universe {} because it is disabled",
universe.getUniverseUUID(),
isFailoverEnabled);
// Schedule is no longer needed.
jobScheduler
.maybeGetSchedule(customer.getUuid(), getDetectMasterFailureScheduleName(universe))
.ifPresent(s -> jobScheduler.deleteSchedule(s.getUuid()));
return;
}
if (universe.getUniverseDetails().universePaused) {
log.debug(
"Skipping automated master failover for universe {} because it is paused",
universe.getUniverseUUID());
// Schedule is no longer needed.
jobScheduler
.maybeGetSchedule(customer.getUuid(), getDetectMasterFailureScheduleName(universe))
.ifPresent(s -> jobScheduler.deleteSchedule(s.getUuid()));
return;
}
log.trace(
"Creating master failure detection schedule for universe {} if it is absent",
universe.getUniverseUUID());
Duration detectionInterval =
confGetter.getConfForScope(universe, UniverseConfKeys.autoMasterFailoverDetectionInterval);
log.debug(
"Master failover detection interval is set to {} seconds for universe {}",
detectionInterval.getSeconds(),
universe.getUniverseUUID());
String scheduleName = getDetectMasterFailureScheduleName(universe);
Optional<JobSchedule> optional =
jobScheduler.maybeGetSchedule(customer.getUuid(), scheduleName);
if (optional.isPresent()) {
ScheduleConfig scheduleConfig = optional.get().getScheduleConfig();
if (!detectionInterval.equals(scheduleConfig.getInterval())) {
log.debug(
log.info(
"Failover detection schedule has changed from {} to {}",
scheduleConfig.getInterval(),
detectionInterval);
Expand All @@ -242,7 +259,7 @@ private void createDetectMasterFailureSchedule(Customer customer, Universe unive
scheduleConfig.toBuilder().interval(detectionInterval).build());
}
} else {
log.debug(
log.info(
"Creating master failure detection schedule for universe {}", universe.getUniverseUUID());
JobSchedule jobSchedule = new JobSchedule();
jobSchedule.setCustomerUuid(customer.getUuid());
Expand Down Expand Up @@ -270,13 +287,6 @@ private void detectMasterFailure(
// Let the creator of this schedule handle the life-cycle.
return;
}
if (universe.getUniverseDetails().universePaused) {
log.debug(
"Skipping automated master failover for universe {} because it is paused",
universe.getUniverseUUID());
// Let the creator of this schedule handle the life-cycle.
return;
}
if (universe.universeIsLocked()) {
log.info(
"Skipping master failover for universe {} because it is already being updated",
Expand All @@ -287,26 +297,77 @@ private void detectMasterFailure(
String scheduleName = getAutoMasterFailoverScheduleName(universe);
Action action = autoMasterFailover.getAllowedMasterFailoverAction(customer, universe);
if (action.getActionType() == ActionType.NONE) {
// No fail-over action can be performed. Remove the schedule to restart fresh.
// No fail-over action can be performed. Disable to keep track of the last run.
jobScheduler
.maybeGetSchedule(customer.getUuid(), scheduleName)
.ifPresent(s -> jobScheduler.deleteSchedule(s.getUuid()));
.ifPresent(s -> jobScheduler.disableSchedule(s.getUuid(), true));
return;
}
log.debug("Detected master failure for universe {}", universe.getUniverseUUID());
log.info(
"Detected master failure for universe {}, next action {}",
universe.getUniverseUUID(),
action);
if (action.getActionType() == ActionType.SUBMIT
&& action.getTaskType() == TaskType.MasterFailover) {
Optional<CustomerTask> optional =
CustomerTask.maybeGetLastTaskByTargetUuidTaskType(
universe.getUniverseUUID(), CustomerTask.TaskType.MasterFailover);
if (optional.isPresent()) {
// Cooldown for a new master failover is calculated from the master failover task.
Duration cooldownPeriod =
confGetter.getConfForScope(universe, UniverseConfKeys.autoMasterFailoverCooldown);
log.debug(
"Cooldown period is set to {} seconds for universe {}",
cooldownPeriod.getSeconds(),
universe.getUniverseUUID());
Instant restrictionEndTime =
optional
.get()
.getCompletionTime()
.toInstant()
.plus(cooldownPeriod.getSeconds(), ChronoUnit.SECONDS);
Instant now = Instant.now();
if (restrictionEndTime.isAfter(now)) {
long diffSecs = now.until(restrictionEndTime, ChronoUnit.SECONDS);
log.info(
"Universe {} is cooling down for {} seconds", universe.getUniverseUUID(), diffSecs);
jobScheduler
.maybeGetSchedule(customer.getUuid(), scheduleName)
.ifPresent(s -> jobScheduler.disableSchedule(s.getUuid(), true));
return;
}
}
}
Duration taskInterval =
confGetter.getConfForScope(universe, UniverseConfKeys.autoMasterFailoverTaskInterval);
log.debug(
"Task interval period is set to {} seconds for universe {}",
taskInterval.getSeconds(),
universe.getUniverseUUID());
Optional<JobSchedule> optional =
jobScheduler.maybeGetSchedule(customer.getUuid(), scheduleName);
if (optional.isPresent()) {
ScheduleConfig scheduleConfig = optional.get().getScheduleConfig();
if (!taskInterval.equals(scheduleConfig.getInterval())) {
if (scheduleConfig.isDisabled()) {
jobScheduler.disableSchedule(optional.get().getUuid(), false);
log.info(
"Enabled schedule for action {} to perform on universe {} in {} seconds",
action,
universe.getUniverseUUID(),
taskInterval.getSeconds());
} else if (!taskInterval.equals(scheduleConfig.getInterval())) {
log.debug(
"Task submission schedule has changed from {} to {}",
scheduleConfig.getInterval(),
taskInterval);
jobScheduler.updateSchedule(
optional.get().getUuid(), scheduleConfig.toBuilder().interval(taskInterval).build());
optional.get().getUuid(),
scheduleConfig.toBuilder().disabled(false).interval(taskInterval).build());
log.info(
"Updated schedule for action {} to perform on universe {} in {} seconds",
action,
universe.getUniverseUUID(),
taskInterval.getSeconds());
}
} else {
JobSchedule jobSchedule = new JobSchedule();
Expand All @@ -320,6 +381,11 @@ private void detectMasterFailure(
.failoverJobType(FailoverJobType.MASTER_FAILOVER)
.build());
jobScheduler.submitSchedule(jobSchedule);
log.info(
"Scheduled action {} to perform on universe {} in {} seconds",
action,
universe.getUniverseUUID(),
taskInterval.getSeconds());
}
}
}
Loading

0 comments on commit b039d1a

Please sign in to comment.