Skip to content

Commit

Permalink
[BACKPORT 2024.1][PLAT-14696] DDL atomicity check metric and alert1
Browse files Browse the repository at this point in the history
Summary:
Adding DDL atomicity check to the health check script.
Check is performed in case special argument is passed to the script. NOt performed by default.
Passing required arguments from the HealthChecker once an hour for each universe.
Check is performed on one tserver node only. We try to run on master leader node. If it's a dedicated master - we'll run on the TServer node from the same region. If it's not present - will run on random TServer node.
Exporting metric status yb_ddl_atomicity_check from YBA for this check
Also added an alert based on the metric.

Original diff: https://phorge.dev.yugabyte.com/D36886

Test Plan:
Created multiple universes with various versions.
Make sure DDL atomicity check runs on universes with versions between 2.18.4.0-b23 and 2.19.0.0-b1 and higher than 2.19.1.0-b301.
Make sure DDL atomicity check passes by default.
Force DDL atomicity issues on universes with versions 2.18.4.0-b23 and 2.20.0.0-b50.
Make sure health check result shows the list of errors.
MAke sure alert is raised for both universes.
Fix the issues by deleting the affected tables.
Make sure alert is cleaned after next DDL atomicity check passes.

Reviewers: #yba-api-review!, vbansal

Reviewed By: vbansal

Subscribers: yugaware, sanketh

Tags: #jenkins-ready

Differential Revision: https://phorge.dev.yugabyte.com/D36949
  • Loading branch information
anmalysh-yb committed Jul 31, 2024
1 parent 35881f7 commit c82b1c4
Show file tree
Hide file tree
Showing 13 changed files with 383 additions and 110 deletions.
1 change: 1 addition & 0 deletions managed/RUNTIME-FLAGS.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@
| "Clock Skew" | "yb.alert.max_clock_skew_ms" | "UNIVERSE" | "Default threshold for Clock Skew alert" | "Duration" |
| "Health Log Output" | "yb.health.logOutput" | "UNIVERSE" | "It determines whether to log the output of the node health check script to the console" | "Boolean" |
| "Node Checkout Time" | "yb.health.nodeCheckTimeoutSec" | "UNIVERSE" | "The timeout (in seconds) for node check operation as part of universe health check" | "Integer" |
| "DDL Atomicity Check Interval" | "yb.health.ddl_atomicity_interval_sec" | "UNIVERSE" | "The interval (in seconds) between DDL atomicity checks" | "Integer" |
| "YB Upgrade Blacklist Leaders" | "yb.upgrade.blacklist_leaders" | "UNIVERSE" | "Determines (boolean) whether we enable/disable leader blacklisting when performing universe/node tasks" | "Boolean" |
| "YB Upgrade Blacklist Leader Wait Time in Ms" | "yb.upgrade.blacklist_leader_wait_time_ms" | "UNIVERSE" | "The timeout (in milliseconds) that we wait of leader blacklisting on a node to complete" | "Integer" |
| "Fail task on leader blacklist timeout" | "yb.node_ops.leader_blacklist.fail_on_timeout" | "UNIVERSE" | "Determines (boolean) whether we fail the task after waiting for leader blacklist timeout is reached" | "Boolean" |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import static com.yugabyte.yw.common.metrics.MetricService.DEFAULT_METRIC_EXPIRY_SEC;
import static com.yugabyte.yw.models.helpers.CommonUtils.nowPlusWithoutMillis;

import autovalue.shaded.com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableList;
import com.yugabyte.yw.common.metrics.MetricLabelsBuilder;
import com.yugabyte.yw.models.Customer;
Expand All @@ -26,6 +27,7 @@
import java.time.temporal.ChronoUnit;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
Expand All @@ -47,6 +49,7 @@ public class HealthCheckMetrics {
private static final String CORE_FILES_CHECK = "Core files";
static final String OPENED_FILE_DESCRIPTORS_CHECK = "Opened file descriptors";
static final String CLOCK_SYNC_CHECK = "Clock synchronization";
static final String DDL_ATOMICITY_CHECK = "DDL atomicity";
private static final String NODE_TO_NODE_CA_CERT_CHECK = "Node To Node CA Cert Expiry Days";
private static final String NODE_TO_NODE_CERT_CHECK = "Node To Node Cert Expiry Days";
private static final String CLIENT_TO_NODE_CA_CERT_CHECK = "Client To Node CA Cert Expiry Days";
Expand All @@ -57,6 +60,11 @@ public class HealthCheckMetrics {
private static final String YB_CONTROLLER_CHECK = "YB-Controller server check";

public static final String CUSTOM_NODE_METRICS_COLLECTION_METRIC = "yb_node_custom_node_metrics";
public static final String DDL_ATOMICITY_CHECK_METRIC = "yb_ddl_atomicity_check";
public static final Set<String> UNIVERSE_WIDE_CHECK_METRICS =
ImmutableSet.of(DDL_ATOMICITY_CHECK_METRIC);
public static final Set<String> SKIP_CLEANUP_METRICS =
ImmutableSet.of(DDL_ATOMICITY_CHECK_METRIC);

public static final List<PlatformMetrics> HEALTH_CHECK_METRICS_WITHOUT_STATUS =
ImmutableList.<PlatformMetrics>builder()
Expand Down Expand Up @@ -171,17 +179,20 @@ private static List<Metric> buildNodeMetric(
.setSourceUuid(universe.getUniverseUUID())
.setLabels(
MetricLabelsBuilder.create().appendSource(universe).getMetricLabels())
.setKeyLabel(KnownAlertLabels.NODE_NAME, nodeData.getNodeName())
.setLabel(KnownAlertLabels.NODE_ADDRESS, nodeData.getNode())
.setLabel(KnownAlertLabels.NODE_IDENTIFIER, nodeData.getNodeIdentifier())
.setValue(value.getValue());
if (nodeData.getNodeName() != null
&& universe.getNode(nodeData.getNodeName()) != null) {
NodeDetails nodeDetails = universe.getNode(nodeData.getNodeName());
result.setLabel(KnownAlertLabels.NODE_REGION, nodeDetails.getRegion());
result.setLabel(
KnownAlertLabels.NODE_CLUSTER_TYPE,
universe.getCluster(nodeDetails.placementUuid).clusterType.name());
if (!UNIVERSE_WIDE_CHECK_METRICS.contains(metric.getName())) {
result
.setKeyLabel(KnownAlertLabels.NODE_NAME, nodeData.getNodeName())
.setLabel(KnownAlertLabels.NODE_ADDRESS, nodeData.getNode())
.setLabel(KnownAlertLabels.NODE_IDENTIFIER, nodeData.getNodeIdentifier());
if (nodeData.getNodeName() != null
&& universe.getNode(nodeData.getNodeName()) != null) {
NodeDetails nodeDetails = universe.getNode(nodeData.getNodeName());
result.setLabel(KnownAlertLabels.NODE_REGION, nodeDetails.getRegion());
result.setLabel(
KnownAlertLabels.NODE_CLUSTER_TYPE,
universe.getCluster(nodeDetails.placementUuid).clusterType.name());
}
}
if (CollectionUtils.isNotEmpty(value.getLabels())) {
value
Expand Down
146 changes: 93 additions & 53 deletions managed/src/main/java/com/yugabyte/yw/commissioner/HealthChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,7 @@

package com.yugabyte.yw.commissioner;

import static com.yugabyte.yw.commissioner.HealthCheckMetrics.CLOCK_SYNC_CHECK;
import static com.yugabyte.yw.commissioner.HealthCheckMetrics.CUSTOM_NODE_METRICS_COLLECTION_METRIC;
import static com.yugabyte.yw.commissioner.HealthCheckMetrics.HEALTH_CHECK_METRICS;
import static com.yugabyte.yw.commissioner.HealthCheckMetrics.HEALTH_CHECK_METRICS_WITHOUT_STATUS;
import static com.yugabyte.yw.commissioner.HealthCheckMetrics.NODE_EXPORTER_CHECK;
import static com.yugabyte.yw.commissioner.HealthCheckMetrics.OPENED_FILE_DESCRIPTORS_CHECK;
import static com.yugabyte.yw.commissioner.HealthCheckMetrics.UPTIME_CHECK;
import static com.yugabyte.yw.commissioner.HealthCheckMetrics.getCountMetricByCheckName;
import static com.yugabyte.yw.commissioner.HealthCheckMetrics.getNodeMetrics;
import static com.yugabyte.yw.commissioner.HealthCheckMetrics.*;
import static com.yugabyte.yw.common.metrics.MetricService.STATUS_OK;
import static com.yugabyte.yw.common.metrics.MetricService.buildMetricTemplate;
import static play.mvc.Http.Status.BAD_REQUEST;
Expand All @@ -33,14 +25,7 @@
import com.yugabyte.yw.commissioner.Common.CloudType;
import com.yugabyte.yw.commissioner.tasks.KubernetesTaskBase;
import com.yugabyte.yw.commissioner.tasks.UniverseTaskBase;
import com.yugabyte.yw.common.EmailHelper;
import com.yugabyte.yw.common.FileHelperService;
import com.yugabyte.yw.common.NodeUniverseManager;
import com.yugabyte.yw.common.PlatformExecutorFactory;
import com.yugabyte.yw.common.PlatformScheduler;
import com.yugabyte.yw.common.PlatformServiceException;
import com.yugabyte.yw.common.ShellProcessContext;
import com.yugabyte.yw.common.ShellResponse;
import com.yugabyte.yw.common.*;
import com.yugabyte.yw.common.alerts.MaintenanceService;
import com.yugabyte.yw.common.alerts.SmtpData;
import com.yugabyte.yw.common.config.RuntimeConfGetter;
Expand Down Expand Up @@ -78,6 +63,8 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.Duration;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
Expand Down Expand Up @@ -127,6 +114,9 @@ public class HealthChecker {
private static final String MAX_NUM_THREADS_NODE_CHECK_KEY =
"yb.health.max_num_parallel_node_checks";

private static final String DDL_ATOMICITY_CHECK_RELEASE = "2.18.4.0-b23";
private static final String DDL_ATOMICITY_CHECK_PREVIEW_RELEASE = "2.19.1.0-b301";

private final Environment environment;

private final Config config;
Expand Down Expand Up @@ -159,6 +149,7 @@ public class HealthChecker {

// We upload health check script to the node only when NodeInfo is updates
private final Map<Pair<UUID, String>, NodeInfo> uploadedNodeInfo = new ConcurrentHashMap<>();
private final Map<UUID, Instant> ddlAtomicityCheckTimestamp = new ConcurrentHashMap<>();

private final Set<String> healthScriptMetrics =
Collections.newSetFromMap(new ConcurrentHashMap<>());
Expand Down Expand Up @@ -246,22 +237,19 @@ public void initialize() {
// The interval at which the checker will run.
// Can be overridden per customer.
private long healthCheckIntervalMs() {
Long interval = config.getLong("yb.health.check_interval_ms");
return interval == null ? 0 : interval;
return config.getLong("yb.health.check_interval_ms");
}

// The interval at which check result will be stored to DB
// Can be overridden per customer.
private long healthCheckStoreIntervalMs() {
Long interval = config.getLong("yb.health.store_interval_ms");
return interval == null ? 0 : interval;
return config.getLong("yb.health.store_interval_ms");
}

// The interval at which to send a status update of all the current universes.
// Can be overridden per customer.
private long statusUpdateIntervalMs() {
Long interval = config.getLong("yb.health.status_interval_ms");
return interval == null ? 0 : interval;
return config.getLong("yb.health.status_interval_ms");
}

/**
Expand Down Expand Up @@ -331,7 +319,12 @@ private void processMetrics(Customer c, Universe u, Details report) {
}
if (shouldCollectNodeMetrics
|| checkName.equals(OPENED_FILE_DESCRIPTORS_CHECK)
|| checkName.equals(CLOCK_SYNC_CHECK)) {
|| checkName.equals(CLOCK_SYNC_CHECK)
|| checkName.equals(DDL_ATOMICITY_CHECK)) {
if (checkName.equals(DDL_ATOMICITY_CHECK)) {
ddlAtomicityCheckTimestamp.put(
u.getUniverseUUID(), report.getTimestampIso().toInstant());
}
// Used FD count metric is always collected through health check as it's not
// calculated properly from inside the collect_metrics service - it gets service limit
// instead of user limit for file descriptors
Expand All @@ -344,11 +337,14 @@ private void processMetrics(Customer c, Universe u, Details report) {
}

healthScriptMetrics.addAll(
metrics.stream().map(Metric::getName).collect(Collectors.toList()));
metrics.stream()
.map(Metric::getName)
.filter(n -> !SKIP_CLEANUP_METRICS.contains(n))
.toList());
metrics.addAll(
platformMetrics.entrySet().stream()
.map(e -> buildMetricTemplate(e.getKey(), u).setValue(e.getValue().doubleValue()))
.collect(Collectors.toList()));
.toList());
// Clean all health check metrics for universe before saving current values
// just in case list of nodes changed between runs.
MetricFilter toClean =
Expand Down Expand Up @@ -526,7 +522,7 @@ public void markUniverseForReUpload(UUID universeUUID) {
List<Pair<UUID, String>> universeNodeInfos =
uploadedNodeInfo.keySet().stream()
.filter(key -> key.getFirst().equals(universeUUID))
.collect(Collectors.toList());
.toList();
universeNodeInfos.forEach(uploadedNodeInfo::remove);
}

Expand All @@ -536,7 +532,7 @@ public void handleUniverseRemoval(UUID universeUUID) {
List<Pair<UUID, String>> universeNodeInfos =
uploadedNodeInfo.keySet().stream()
.filter(key -> key.getFirst().equals(universeUUID))
.collect(Collectors.toList());
.toList();
universeNodeInfos.forEach(uploadedNodeInfo::remove);
}

Expand Down Expand Up @@ -613,7 +609,7 @@ public CompletableFuture<Void> runHealthCheck(

private String getAlertDestinations(Universe u, Customer c) {
List<String> destinations = emailHelper.getDestinations(c.getUuid());
if (destinations.size() == 0) {
if (destinations.isEmpty()) {
return null;
}

Expand Down Expand Up @@ -683,9 +679,7 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) {
}
providerCode = provider.getCode();
List<NodeDetails> activeNodes =
details.getNodesInCluster(cluster.uuid).stream()
.filter(NodeDetails::isActive)
.collect(Collectors.toList());
details.getNodesInCluster(cluster.uuid).stream().filter(NodeDetails::isActive).toList();
for (NodeDetails nd : activeNodes) {
if (nd.cloudInfo.private_ip == null) {
log.warn(
Expand All @@ -697,9 +691,7 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) {
}
}
List<NodeDetails> sortedDetails =
activeNodes.stream()
.sorted(Comparator.comparing(NodeDetails::getNodeName))
.collect(Collectors.toList());
activeNodes.stream().sorted(Comparator.comparing(NodeDetails::getNodeName)).toList();
Set<UUID> nodeUuids =
sortedDetails.stream()
.map(NodeDetails::getNodeUuid)
Expand Down Expand Up @@ -792,7 +784,7 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) {
return;
}

List<NodeData> nodeReports = checkNodes(params.universe, nodeMetadata);
List<NodeData> nodeReports = checkNodes(params, nodeMetadata);

Details fullReport =
new Details()
Expand Down Expand Up @@ -822,9 +814,7 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) {
durationMs);
if (healthCheckReport.getHasError()) {
List<NodeData> failedChecks =
healthCheckReport.getData().stream()
.filter(NodeData::getHasError)
.collect(Collectors.toList());
healthCheckReport.getData().stream().filter(NodeData::getHasError).toList();
log.warn(
"Following checks failed for universe {}:\n{}",
params.universe.getName(),
Expand Down Expand Up @@ -853,20 +843,58 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) {
buildMetricTemplate(PlatformMetrics.HEALTH_CHECK_STATUS, params.universe));
}

private List<NodeData> checkNodes(Universe universe, List<NodeInfo> nodes) {
private List<NodeData> checkNodes(CheckSingleUniverseParams params, List<NodeInfo> nodes) {
// Check if it should log the output of the command.
Universe universe = params.universe;
boolean shouldLogOutput =
confGetter.getConfForScope(universe, UniverseConfKeys.healthLogOutput);
int nodeCheckTimeoutSec =
confGetter.getConfForScope(universe, UniverseConfKeys.nodeCheckTimeoutSec);

int ddlAtomicityIntervalSec =
confGetter.getConfForScope(universe, UniverseConfKeys.ddlAtomicityIntervalSec);

Instant lastDdlAtomicityCheckTimestamp =
ddlAtomicityCheckTimestamp.get(universe.getUniverseUUID());
String nodeToRunDdlTAtomicityCheck = null;
String masterLeaderUrl = null;
String ybDbRelease =
universe.getUniverseDetails().getPrimaryCluster().userIntent.ybSoftwareVersion;
boolean ddlAtomicityCheckSupported =
CommonUtils.isReleaseBetween(DDL_ATOMICITY_CHECK_RELEASE, "2.19.0.0-b0", ybDbRelease)
|| CommonUtils.isReleaseEqualOrAfter(DDL_ATOMICITY_CHECK_PREVIEW_RELEASE, ybDbRelease);
if (ddlAtomicityCheckSupported
&& !params.onlyMetrics
&& (lastDdlAtomicityCheckTimestamp == null
|| lastDdlAtomicityCheckTimestamp
.plus(ddlAtomicityIntervalSec, ChronoUnit.SECONDS)
.isBefore(Instant.now()))) {
// We should schedule DDL atomicity check.
NodeDetails masterLeader = universe.getMasterLeaderNode();
if (masterLeader != null) {
NodeDetails nodeToRun = CommonUtils.getServerToRunYsqlQuery(universe);
boolean httpsEnabledUI =
universe.getConfig().getOrDefault(Universe.HTTPS_ENABLED_UI, "false").equals("true");
masterLeaderUrl =
(httpsEnabledUI ? "https" : "http")
+ "://"
+ masterLeader.cloudInfo.private_ip
+ ":"
+ masterLeader.masterHttpPort;
nodeToRunDdlTAtomicityCheck = nodeToRun.getNodeName();
}
}
Map<String, CompletableFuture<Details>> nodeChecks = new HashMap<>();
for (NodeInfo nodeInfo : nodes) {
NodeCheckContext context =
new NodeCheckContext().setLogOutput(shouldLogOutput).setTimeoutSec(nodeCheckTimeoutSec);
if (nodeInfo.getNodeName().equals(nodeToRunDdlTAtomicityCheck)) {
context.setDdlAtomicityCheck(true);
context.setMasterLeaderUrl(masterLeaderUrl);
}
nodeChecks.put(
nodeInfo.getNodeName(),
CompletableFuture.supplyAsync(
() -> checkNode(universe, nodeInfo, shouldLogOutput, nodeCheckTimeoutSec),
nodeExecutor));
() -> checkNode(universe, nodeInfo, context), nodeExecutor));
}

List<NodeData> result = new ArrayList<>();
Expand Down Expand Up @@ -916,14 +944,14 @@ private List<NodeData> checkNodes(Universe universe, List<NodeInfo> nodes) {
}

private Details checkNode(
Universe universe, NodeInfo nodeInfo, boolean logOutput, int timeoutSec) {
Universe universe, NodeInfo nodeInfo, NodeCheckContext nodeCheckContext) {
Pair<UUID, String> nodeKey = new Pair<>(universe.getUniverseUUID(), nodeInfo.getNodeName());
NodeInfo uploadedInfo = uploadedNodeInfo.get(nodeKey);
ShellProcessContext context =
ShellProcessContext.builder()
.logCmdOutput(logOutput)
.logCmdOutput(nodeCheckContext.isLogOutput())
.traceLogging(true)
.timeoutSecs(timeoutSec)
.timeoutSecs(nodeCheckContext.getTimeoutSec())
.build();
if (uploadedInfo == null && !nodeInfo.isK8s()) {
// Only upload it once for new node, as it only depends on yb home dir.
Expand Down Expand Up @@ -966,9 +994,15 @@ private Details checkNode(
}
uploadedNodeInfo.put(nodeKey, nodeInfo);

List<String> commandToRun = new ArrayList<>();
commandToRun.add(scriptPath);
if (nodeCheckContext.ddlAtomicityCheck) {
commandToRun.add("--ddl_atomicity_check=true");
commandToRun.add("--master_leader_url=" + nodeCheckContext.getMasterLeaderUrl());
}
ShellResponse response =
nodeUniverseManager
.runCommand(nodeInfo.getNodeDetails(), universe, scriptPath, context)
.runCommand(nodeInfo.getNodeDetails(), universe, commandToRun, context)
.processErrors();

return Json.fromJson(Json.parse(response.extractRunCommandOutput()), Details.class);
Expand Down Expand Up @@ -1027,8 +1061,7 @@ private String generateNodeCheckScript(UUID universeUuid, NodeInfo nodeInfo) {
private MetricFilter metricSourceKeysFilterWithHealthScriptMetrics(
Customer customer, Universe universe, List<PlatformMetrics> metrics) {
Set<String> allMetricNames = new HashSet<>(healthScriptMetrics);
allMetricNames.addAll(
metrics.stream().map(PlatformMetrics::getMetricName).collect(Collectors.toList()));
allMetricNames.addAll(metrics.stream().map(PlatformMetrics::getMetricName).toList());
List<MetricSourceKey> metricSourceKeys =
allMetricNames.stream()
.map(
Expand All @@ -1038,7 +1071,7 @@ private MetricFilter metricSourceKeysFilterWithHealthScriptMetrics(
.name(metricName)
.sourceUuid(universe.getUniverseUUID())
.build())
.collect(Collectors.toList());
.toList();
return MetricFilter.builder().sourceKeys(metricSourceKeys).build();
}

Expand Down Expand Up @@ -1100,11 +1133,18 @@ public static class NodeInfo {
@JsonIgnore @EqualsAndHashCode.Exclude private NodeDetails nodeDetails;
}

@Data
@Accessors(chain = true)
private static class NodeCheckContext {
private boolean logOutput;
private int timeoutSec;
private boolean ddlAtomicityCheck;
private String masterLeaderUrl;
}

private Details removeMetricOnlyChecks(Details details) {
List<NodeData> nodeReports =
details.getData().stream()
.filter(data -> !data.getMetricsOnly())
.collect(Collectors.toList());
details.getData().stream().filter(data -> !data.getMetricsOnly()).toList();
return new Details()
.setTimestampIso(details.getTimestampIso())
.setYbVersion(details.getYbVersion())
Expand Down
Loading

0 comments on commit c82b1c4

Please sign in to comment.