Skip to content

Commit

Permalink
[PLAT-15682] Try to re-install YBC on nodes which fail ping check bef…
Browse files Browse the repository at this point in the history
…ore backup/restore

Summary:
Some infra-side operations, such as disk replacement, can sometimes cause the running YBC package
to be removed. I catch these scenarios via a YBC ping check before triggering backup/restore and do
a "best effort" re-install. The subtasks are added with `ignoreErrors` because there can be genuine
node-outage scenarios where the YBC ping check fails. In such cases we should simply skip the
failed subtasks and proceed.

We will run this only in cases where a YBC upgrade is not being performed.
In a separate PR, I am going to change the YBC upgrade for VMs to use Ansible so that an
unavailable YBC server does not lead to an upgrade failure.

- For VMs
 # Do a configure by copying the package, creating symlinks, etc.
 # Do a serverctl stop and start

- For K8s
 # Copy package
 # Trigger start

Test Plan:
Manually verified by removing the YBC package and stopping the running YBC process. The
next backup attempt downloads the package and sets it up correctly.

Reviewers: anijhawan, nsingh, vpatibandla

Reviewed By: anijhawan, nsingh, vpatibandla

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D38919
  • Loading branch information
kv83821-yb committed Oct 30, 2024
1 parent 06e596b commit 50766f4
Show file tree
Hide file tree
Showing 21 changed files with 287 additions and 142 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
group: "{{ user_name }}"
tags:
- ybc-install
- reinstall-ybc

- name: Download ybc package | Move unpacked tarball to "{{ tmp_ybc_package_location }}"
copy:
Expand All @@ -21,6 +22,7 @@
remote_src: True
tags:
- ybc-install
- reinstall-ybc

- name: Download ybc package | Create release ybc directory
file:
Expand All @@ -31,13 +33,15 @@
group: "{{ user_name }}"
tags:
- ybc-install
- reinstall-ybc

- name: Download ybc package | Clean up temporary ybc remote downloads
file:
path: "{{ remote_ybc_tmp_package }}"
state: absent
tags:
- ybc-install
- reinstall-ybc

- name: Download ybc package | Unpack package "{{ tmp_ybc_package_location }}"
unarchive:
Expand All @@ -48,10 +52,12 @@
group: "{{ user_name }}"
tags:
- ybc-install
- reinstall-ybc

- name: Download ybc package | Remove "{{ tmp_ybc_package_location }}"
file:
path: "{{ tmp_ybc_package_location }}"
state: absent
tags:
- ybc-install
- reinstall-ybc
9 changes: 8 additions & 1 deletion managed/devops/roles/configure-ybc-server/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
become: yes
become_method: sudo
become_user: "{{ user_name }}"
tags: ybc-install
tags:
- ybc-install
- reinstall-ybc

- name: Create ybc symlink folders
file:
Expand All @@ -24,6 +26,7 @@
group: "{{ user_name }}"
tags:
- ybc-install
- reinstall-ybc

- name: Install | Create symlink to ybc package folder
file:
Expand All @@ -35,6 +38,7 @@
state: link
tags:
- ybc-install
- reinstall-ybc

- block:
- set_fact:
Expand All @@ -50,6 +54,7 @@
group: "{{ user_name }}"
tags:
- ybc-install
- reinstall-ybc

- name: Setup | Symlink {{ yb_process_type }} logs directory
file:
Expand All @@ -62,6 +67,7 @@
force: yes
tags:
- ybc-install
- reinstall-ybc

- name: Setup | Create ybc config directory
file:
Expand All @@ -72,6 +78,7 @@
group: "{{ user_name }}"
tags:
- ybc-install
- reinstall-ybc

- name: Configure | Create ybc gflags
set_fact:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ protected SubTaskGroup createSubTaskGroup(String name) {
}

protected SubTaskGroup createSubTaskGroup(String name, boolean ignoreErrors) {
return createSubTaskGroup(name, SubTaskGroupType.Invalid);
return createSubTaskGroup(name, SubTaskGroupType.Invalid, ignoreErrors);
}

protected SubTaskGroup createSubTaskGroup(String name, SubTaskGroupType subTaskGroupType) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,6 @@ public void createUpgradeTask(
if (enableYbc) {
Set<NodeDetails> primaryTservers = new HashSet<>(universe.getTServersInPrimaryCluster());
installYbcOnThePods(
universe.getName(),
primaryTservers,
false,
ybcSoftwareVersion,
Expand Down Expand Up @@ -429,11 +428,7 @@ public void createUpgradeTask(
Set<NodeDetails> replicaTservers =
new HashSet<NodeDetails>(universe.getNodesInCluster(asyncCluster.uuid));
installYbcOnThePods(
universe.getName(),
replicaTservers,
true,
ybcSoftwareVersion,
asyncCluster.userIntent.ybcFlags);
replicaTservers, true, ybcSoftwareVersion, asyncCluster.userIntent.ybcFlags);
performYbcAction(replicaTservers, true, "stop");
createWaitForYbcServerTask(replicaTservers);
}
Expand Down Expand Up @@ -524,7 +519,6 @@ public void createNonRollingGflagUpgradeTask(
Set<NodeDetails> primaryTservers =
new HashSet<NodeDetails>(universe.getTServersInPrimaryCluster());
installYbcOnThePods(
universe.getName(),
primaryTservers,
false,
ybcSoftwareVersion,
Expand Down Expand Up @@ -565,7 +559,6 @@ public void createNonRollingGflagUpgradeTask(
universe.getNodesInCluster(
universe.getUniverseDetails().getReadOnlyClusters().get(0).uuid));
installYbcOnThePods(
universe.getName(),
replicaTservers,
true,
ybcSoftwareVersion,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -679,7 +679,7 @@ private void waitForSubTasks(boolean abortOnFailure) {
anyEx = (anyEx != null) ? anyEx : e.getCause();
removeCompletedSubTask(iter, runnableSubTask, e.getCause());
// Call parent task abort if abortOnFailure set.
if (abortOnFailure) {
if (abortOnFailure && !ignoreErrors) {
runnableTask.setAbortTime(Instant.now());
runnableTask.cancelWaiterIfAborted();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,26 +122,29 @@ public void run() {
getRunnableTask().reset();

if (isFirstTry()) {
if (ybcBackup
&& universe.isYbcEnabled()
&& !universe
.getUniverseDetails()
.getYbcSoftwareVersion()
.equals(ybcManager.getStableYbcVersion())) {

if (universe
if (ybcBackup && universe.isYbcEnabled()) {
if (!universe
.getUniverseDetails()
.getPrimaryCluster()
.userIntent
.providerType
.equals(Common.CloudType.kubernetes)) {
createUpgradeYbcTaskOnK8s(
params().getUniverseUUID(), ybcManager.getStableYbcVersion())
.setSubTaskGroupType(SubTaskGroupType.UpgradingYbc);
.getYbcSoftwareVersion()
.equals(ybcManager.getStableYbcVersion())) {
if (universe
.getUniverseDetails()
.getPrimaryCluster()
.userIntent
.providerType
.equals(Common.CloudType.kubernetes)) {
createUpgradeYbcTaskOnK8s(
params().getUniverseUUID(), ybcManager.getStableYbcVersion())
.setSubTaskGroupType(SubTaskGroupType.UpgradingYbc);
} else {
createUpgradeYbcTask(
params().getUniverseUUID(), ybcManager.getStableYbcVersion(), true)
.setSubTaskGroupType(SubTaskGroupType.UpgradingYbc);
}
} else {
createUpgradeYbcTask(
params().getUniverseUUID(), ybcManager.getStableYbcVersion(), true)
.setSubTaskGroupType(SubTaskGroupType.UpgradingYbc);
// Try re-install ybc if ping check fails
// Skip upgrade case, since upgrade will anyway re-configure it
handleUnavailableYbcServers(universe, ybcManager);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -220,14 +220,12 @@ public void run() {
// Install YBC on the pods
if (taskParams().isEnableYbc()) {
installYbcOnThePods(
universe.getName(),
tserversAdded,
false,
taskParams().getYbcSoftwareVersion(),
taskParams().getPrimaryCluster().userIntent.ybcFlags);
if (readClusters.size() == 1) {
installYbcOnThePods(
universe.getName(),
readOnlyTserversAdded,
true,
taskParams().getYbcSoftwareVersion(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,6 @@ private boolean editCluster(

if (universe.isYbcEnabled()) {
installYbcOnThePods(
universe.getName(),
tserversToAdd,
isReadOnlyCluster,
ybcManager.getStableYbcVersion(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1380,71 +1380,50 @@ public KubernetesCommandExecutor createKubernetesExecutorTask(
}

public void installYbcOnThePods(
String universeName,
Set<NodeDetails> servers,
boolean isReadOnlyCluster,
String ybcSoftwareVersion,
Map<String, String> ybcGflags) {
SubTaskGroup ybcUpload =
createSubTaskGroup(
KubernetesCommandExecutor.CommandType.COPY_PACKAGE.getSubTaskGroupName());
createKubernetesYbcExecutorTask(
ybcUpload,
universeName,
KubernetesCommandExecutor.CommandType.COPY_PACKAGE,
servers,
isReadOnlyCluster,
ybcSoftwareVersion,
ybcGflags);
createSubTaskGroup(KubernetesCommandExecutor.CommandType.COPY_PACKAGE.getSubTaskGroupName())
.setSubTaskGroupType(SubTaskGroupType.ConfigureUniverse);
createKubernetesYbcCopyPackageTask(
ybcUpload, servers, isReadOnlyCluster, ybcSoftwareVersion, ybcGflags);
getRunnableTask().addSubTaskGroup(ybcUpload);
}

public void performYbcAction(
Set<NodeDetails> servers, boolean isReadOnlyCluster, String command) {
SubTaskGroup ybcAction =
createSubTaskGroup(KubernetesCommandExecutor.CommandType.YBC_ACTION.getSubTaskGroupName());
createKubernetesYbcExecutorTask(
ybcAction,
KubernetesCommandExecutor.CommandType.YBC_ACTION,
servers,
isReadOnlyCluster,
command);
createSubTaskGroup(KubernetesCommandExecutor.CommandType.YBC_ACTION.getSubTaskGroupName())
.setSubTaskGroupType(SubTaskGroupType.StartingNodeProcesses);
createKubernetesYbcActionTask(ybcAction, servers, isReadOnlyCluster, command);
getRunnableTask().addSubTaskGroup(ybcAction);
}

// Create Kubernetes Executor task for copying YBC package and conf file to the pod
public void createKubernetesYbcExecutorTask(
public void createKubernetesYbcCopyPackageTask(
SubTaskGroup subTaskGroup,
String universeName,
KubernetesCommandExecutor.CommandType commandType,
Set<NodeDetails> servers,
boolean isReadOnlyCluster,
String ybcSoftwareVersion,
Map<String, String> ybcGflags) {
for (NodeDetails node : servers) {
KubernetesCommandExecutor.Params params = new KubernetesCommandExecutor.Params();
Cluster primaryCluster = taskParams().getPrimaryCluster();
Universe universe = Universe.getOrBadRequest(taskParams().getUniverseUUID());
if (primaryCluster == null) {
primaryCluster = universe.getUniverseDetails().getPrimaryCluster();
}
params.commandType = commandType;
params.setUniverseUUID(taskParams().getUniverseUUID());
params.ybcServerName = node.nodeName;
params.setYbcSoftwareVersion(ybcSoftwareVersion);
params.ybcGflags = ybcGflags;
List<Cluster> readOnlyClusters = taskParams().getReadOnlyClusters();
if (isReadOnlyCluster && readOnlyClusters.size() == 0) {
readOnlyClusters = universe.getUniverseDetails().getReadOnlyClusters();
}
params.providerUUID =
UUID providerUUID =
isReadOnlyCluster
? UUID.fromString(readOnlyClusters.get(0).userIntent.provider)
: UUID.fromString(primaryCluster.userIntent.provider);
KubernetesCommandExecutor task = createTask(KubernetesCommandExecutor.class);
task.initialize(params);
task.setUserTaskUUID(getUserTaskUUID());
subTaskGroup.addSubTask(task);
createKubernetesYbcCopyPackageSubTask(
subTaskGroup, node, providerUUID, ybcSoftwareVersion, ybcGflags);
}
}

Expand Down Expand Up @@ -1512,14 +1491,12 @@ public void createKubernetesYbcExecutorTask(
}

// Create Kubernetes Executor task for performing a YBC action
public void createKubernetesYbcExecutorTask(
public void createKubernetesYbcActionTask(
SubTaskGroup subTaskGroup,
KubernetesCommandExecutor.CommandType commandType,
Set<NodeDetails> servers,
boolean isReadOnlyCluster,
String command) {
for (NodeDetails node : servers) {
KubernetesCommandExecutor.Params params = new KubernetesCommandExecutor.Params();
Cluster primaryCluster = taskParams().getPrimaryCluster();
List<Cluster> readOnlyClusters = taskParams().getReadOnlyClusters();
Universe universe = Universe.getOrBadRequest(taskParams().getUniverseUUID());
Expand All @@ -1529,19 +1506,12 @@ public void createKubernetesYbcExecutorTask(
if (isReadOnlyCluster && readOnlyClusters.size() == 0) {
readOnlyClusters = universe.getUniverseDetails().getReadOnlyClusters();
}
params.commandType = commandType;
params.setUniverseUUID(taskParams().getUniverseUUID());
params.ybcServerName = node.nodeName;
params.isReadOnlyCluster = isReadOnlyCluster;
params.providerUUID =
UUID providerUUID =
isReadOnlyCluster
? UUID.fromString(readOnlyClusters.get(0).userIntent.provider)
: UUID.fromString(primaryCluster.userIntent.provider);
params.command = command;
KubernetesCommandExecutor task = createTask(KubernetesCommandExecutor.class);
task.initialize(params);
task.setUserTaskUUID(getUserTaskUUID());
subTaskGroup.addSubTask(task);
createKubernetesYbcActionSubTask(
subTaskGroup, node, providerUUID, isReadOnlyCluster, command);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ public void run() {
// Install YBC on the RR tservers and wait for its completion
if (universe.isYbcEnabled()) {
installYbcOnThePods(
universe.getName(),
tserversAdded,
true,
ybcManager.getStableYbcVersion(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,25 +53,29 @@ public void run() {

if (isFirstTry()) {
backupHelper.validateRestoreOverwrites(taskParams().backupStorageInfoList, universe);
if (universe.isYbcEnabled()
&& !universe
.getUniverseDetails()
.getYbcSoftwareVersion()
.equals(ybcManager.getStableYbcVersion())) {

if (universe
if (universe.isYbcEnabled()) {
if (!universe
.getUniverseDetails()
.getPrimaryCluster()
.userIntent
.providerType
.equals(Common.CloudType.kubernetes)) {
createUpgradeYbcTaskOnK8s(
taskParams().getUniverseUUID(), ybcManager.getStableYbcVersion())
.setSubTaskGroupType(SubTaskGroupType.UpgradingYbc);
.getYbcSoftwareVersion()
.equals(ybcManager.getStableYbcVersion())) {
if (universe
.getUniverseDetails()
.getPrimaryCluster()
.userIntent
.providerType
.equals(Common.CloudType.kubernetes)) {
createUpgradeYbcTaskOnK8s(
taskParams().getUniverseUUID(), ybcManager.getStableYbcVersion())
.setSubTaskGroupType(SubTaskGroupType.UpgradingYbc);
} else {
createUpgradeYbcTask(
taskParams().getUniverseUUID(), ybcManager.getStableYbcVersion(), true)
.setSubTaskGroupType(SubTaskGroupType.UpgradingYbc);
}
} else {
createUpgradeYbcTask(
taskParams().getUniverseUUID(), ybcManager.getStableYbcVersion(), true)
.setSubTaskGroupType(SubTaskGroupType.UpgradingYbc);
// Try re-install ybc if ping check fails
// Skip upgrade case, since upgrade will anyway re-configure it
handleUnavailableYbcServers(universe, ybcManager);
}
}
}
Expand Down
Loading

0 comments on commit 50766f4

Please sign in to comment.