diff --git a/managed/RUNTIME-FLAGS.md b/managed/RUNTIME-FLAGS.md index 90308912c50e..435d4fc951dd 100644 --- a/managed/RUNTIME-FLAGS.md +++ b/managed/RUNTIME-FLAGS.md @@ -131,6 +131,7 @@ | "Clock Skew" | "yb.alert.max_clock_skew_ms" | "UNIVERSE" | "Default threshold for Clock Skew alert" | "Duration" | | "Health Log Output" | "yb.health.logOutput" | "UNIVERSE" | "It determines whether to log the output of the node health check script to the console" | "Boolean" | | "Node Checkout Time" | "yb.health.nodeCheckTimeoutSec" | "UNIVERSE" | "The timeout (in seconds) for node check operation as part of universe health check" | "Integer" | +| "DDL Atomicity Check Interval" | "yb.health.ddl_atomicity_interval_sec" | "UNIVERSE" | "The interval (in seconds) between DDL atomicity checks" | "Integer" | | "YB Upgrade Blacklist Leaders" | "yb.upgrade.blacklist_leaders" | "UNIVERSE" | "Determines (boolean) whether we enable/disable leader blacklisting when performing universe/node tasks" | "Boolean" | | "YB Upgrade Blacklist Leader Wait Time in Ms" | "yb.upgrade.blacklist_leader_wait_time_ms" | "UNIVERSE" | "The timeout (in milliseconds) that we wait of leader blacklisting on a node to complete" | "Integer" | | "Fail task on leader blacklist timeout" | "yb.node_ops.leader_blacklist.fail_on_timeout" | "UNIVERSE" | "Determines (boolean) whether we fail the task after waiting for leader blacklist timeout is reached" | "Boolean" | diff --git a/managed/src/main/java/com/yugabyte/yw/commissioner/HealthCheckMetrics.java b/managed/src/main/java/com/yugabyte/yw/commissioner/HealthCheckMetrics.java index 8be10936aae8..fa6308850958 100644 --- a/managed/src/main/java/com/yugabyte/yw/commissioner/HealthCheckMetrics.java +++ b/managed/src/main/java/com/yugabyte/yw/commissioner/HealthCheckMetrics.java @@ -13,6 +13,7 @@ import static com.yugabyte.yw.common.metrics.MetricService.DEFAULT_METRIC_EXPIRY_SEC; import static com.yugabyte.yw.models.helpers.CommonUtils.nowPlusWithoutMillis; +import 
autovalue.shaded.com.google.common.collect.ImmutableSet; import com.google.common.collect.ImmutableList; import com.yugabyte.yw.common.metrics.MetricLabelsBuilder; import com.yugabyte.yw.models.Customer; @@ -26,6 +27,7 @@ import java.time.temporal.ChronoUnit; import java.util.Collections; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import lombok.extern.slf4j.Slf4j; import org.apache.commons.collections4.CollectionUtils; @@ -47,6 +49,7 @@ public class HealthCheckMetrics { private static final String CORE_FILES_CHECK = "Core files"; static final String OPENED_FILE_DESCRIPTORS_CHECK = "Opened file descriptors"; static final String CLOCK_SYNC_CHECK = "Clock synchronization"; + static final String DDL_ATOMICITY_CHECK = "DDL atomicity"; private static final String NODE_TO_NODE_CA_CERT_CHECK = "Node To Node CA Cert Expiry Days"; private static final String NODE_TO_NODE_CERT_CHECK = "Node To Node Cert Expiry Days"; private static final String CLIENT_TO_NODE_CA_CERT_CHECK = "Client To Node CA Cert Expiry Days"; @@ -57,6 +60,11 @@ public class HealthCheckMetrics { private static final String YB_CONTROLLER_CHECK = "YB-Controller server check"; public static final String CUSTOM_NODE_METRICS_COLLECTION_METRIC = "yb_node_custom_node_metrics"; + public static final String DDL_ATOMICITY_CHECK_METRIC = "yb_ddl_atomicity_check"; + public static final Set UNIVERSE_WIDE_CHECK_METRICS = + ImmutableSet.of(DDL_ATOMICITY_CHECK_METRIC); + public static final Set SKIP_CLEANUP_METRICS = + ImmutableSet.of(DDL_ATOMICITY_CHECK_METRIC); public static final List HEALTH_CHECK_METRICS_WITHOUT_STATUS = ImmutableList.builder() @@ -171,17 +179,20 @@ private static List buildNodeMetric( .setSourceUuid(universe.getUniverseUUID()) .setLabels( MetricLabelsBuilder.create().appendSource(universe).getMetricLabels()) - .setKeyLabel(KnownAlertLabels.NODE_NAME, nodeData.getNodeName()) - .setLabel(KnownAlertLabels.NODE_ADDRESS, nodeData.getNode()) - 
.setLabel(KnownAlertLabels.NODE_IDENTIFIER, nodeData.getNodeIdentifier()) .setValue(value.getValue()); - if (nodeData.getNodeName() != null - && universe.getNode(nodeData.getNodeName()) != null) { - NodeDetails nodeDetails = universe.getNode(nodeData.getNodeName()); - result.setLabel(KnownAlertLabels.NODE_REGION, nodeDetails.getRegion()); - result.setLabel( - KnownAlertLabels.NODE_CLUSTER_TYPE, - universe.getCluster(nodeDetails.placementUuid).clusterType.name()); + if (!UNIVERSE_WIDE_CHECK_METRICS.contains(metric.getName())) { + result + .setKeyLabel(KnownAlertLabels.NODE_NAME, nodeData.getNodeName()) + .setLabel(KnownAlertLabels.NODE_ADDRESS, nodeData.getNode()) + .setLabel(KnownAlertLabels.NODE_IDENTIFIER, nodeData.getNodeIdentifier()); + if (nodeData.getNodeName() != null + && universe.getNode(nodeData.getNodeName()) != null) { + NodeDetails nodeDetails = universe.getNode(nodeData.getNodeName()); + result.setLabel(KnownAlertLabels.NODE_REGION, nodeDetails.getRegion()); + result.setLabel( + KnownAlertLabels.NODE_CLUSTER_TYPE, + universe.getCluster(nodeDetails.placementUuid).clusterType.name()); + } } if (CollectionUtils.isNotEmpty(value.getLabels())) { value diff --git a/managed/src/main/java/com/yugabyte/yw/commissioner/HealthChecker.java b/managed/src/main/java/com/yugabyte/yw/commissioner/HealthChecker.java index 251cc6d01571..974435fec9a6 100644 --- a/managed/src/main/java/com/yugabyte/yw/commissioner/HealthChecker.java +++ b/managed/src/main/java/com/yugabyte/yw/commissioner/HealthChecker.java @@ -10,15 +10,7 @@ package com.yugabyte.yw.commissioner; -import static com.yugabyte.yw.commissioner.HealthCheckMetrics.CLOCK_SYNC_CHECK; -import static com.yugabyte.yw.commissioner.HealthCheckMetrics.CUSTOM_NODE_METRICS_COLLECTION_METRIC; -import static com.yugabyte.yw.commissioner.HealthCheckMetrics.HEALTH_CHECK_METRICS; -import static com.yugabyte.yw.commissioner.HealthCheckMetrics.HEALTH_CHECK_METRICS_WITHOUT_STATUS; -import static 
com.yugabyte.yw.commissioner.HealthCheckMetrics.NODE_EXPORTER_CHECK; -import static com.yugabyte.yw.commissioner.HealthCheckMetrics.OPENED_FILE_DESCRIPTORS_CHECK; -import static com.yugabyte.yw.commissioner.HealthCheckMetrics.UPTIME_CHECK; -import static com.yugabyte.yw.commissioner.HealthCheckMetrics.getCountMetricByCheckName; -import static com.yugabyte.yw.commissioner.HealthCheckMetrics.getNodeMetrics; +import static com.yugabyte.yw.commissioner.HealthCheckMetrics.*; import static com.yugabyte.yw.common.metrics.MetricService.STATUS_OK; import static com.yugabyte.yw.common.metrics.MetricService.buildMetricTemplate; import static play.mvc.Http.Status.BAD_REQUEST; @@ -33,14 +25,7 @@ import com.yugabyte.yw.commissioner.Common.CloudType; import com.yugabyte.yw.commissioner.tasks.KubernetesTaskBase; import com.yugabyte.yw.commissioner.tasks.UniverseTaskBase; -import com.yugabyte.yw.common.EmailHelper; -import com.yugabyte.yw.common.FileHelperService; -import com.yugabyte.yw.common.NodeUniverseManager; -import com.yugabyte.yw.common.PlatformExecutorFactory; -import com.yugabyte.yw.common.PlatformScheduler; -import com.yugabyte.yw.common.PlatformServiceException; -import com.yugabyte.yw.common.ShellProcessContext; -import com.yugabyte.yw.common.ShellResponse; +import com.yugabyte.yw.common.*; import com.yugabyte.yw.common.alerts.MaintenanceService; import com.yugabyte.yw.common.alerts.SmtpData; import com.yugabyte.yw.common.config.RuntimeConfGetter; @@ -78,6 +63,8 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.time.Duration; +import java.time.Instant; +import java.time.temporal.ChronoUnit; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -127,6 +114,9 @@ public class HealthChecker { private static final String MAX_NUM_THREADS_NODE_CHECK_KEY = "yb.health.max_num_parallel_node_checks"; + private static final String DDL_ATOMICITY_CHECK_RELEASE = "2.18.4.0-b23"; + private static final String 
DDL_ATOMICITY_CHECK_PREVIEW_RELEASE = "2.19.1.0-b301"; + private final Environment environment; private final Config config; @@ -159,6 +149,7 @@ public class HealthChecker { // We upload health check script to the node only when NodeInfo is updates private final Map, NodeInfo> uploadedNodeInfo = new ConcurrentHashMap<>(); + private final Map ddlAtomicityCheckTimestamp = new ConcurrentHashMap<>(); private final Set healthScriptMetrics = Collections.newSetFromMap(new ConcurrentHashMap<>()); @@ -246,22 +237,19 @@ public void initialize() { // The interval at which the checker will run. // Can be overridden per customer. private long healthCheckIntervalMs() { - Long interval = config.getLong("yb.health.check_interval_ms"); - return interval == null ? 0 : interval; + return config.getLong("yb.health.check_interval_ms"); } // The interval at which check result will be stored to DB // Can be overridden per customer. private long healthCheckStoreIntervalMs() { - Long interval = config.getLong("yb.health.store_interval_ms"); - return interval == null ? 0 : interval; + return config.getLong("yb.health.store_interval_ms"); } // The interval at which to send a status update of all the current universes. // Can be overridden per customer. private long statusUpdateIntervalMs() { - Long interval = config.getLong("yb.health.status_interval_ms"); - return interval == null ? 
0 : interval; + return config.getLong("yb.health.status_interval_ms"); } /** @@ -331,7 +319,12 @@ private void processMetrics(Customer c, Universe u, Details report) { } if (shouldCollectNodeMetrics || checkName.equals(OPENED_FILE_DESCRIPTORS_CHECK) - || checkName.equals(CLOCK_SYNC_CHECK)) { + || checkName.equals(CLOCK_SYNC_CHECK) + || checkName.equals(DDL_ATOMICITY_CHECK)) { + if (checkName.equals(DDL_ATOMICITY_CHECK)) { + ddlAtomicityCheckTimestamp.put( + u.getUniverseUUID(), report.getTimestampIso().toInstant()); + } // Used FD count metric is always collected through health check as it's not // calculated properly from inside the collect_metrics service - it gets service limit // instead of user limit for file descriptors @@ -344,11 +337,14 @@ private void processMetrics(Customer c, Universe u, Details report) { } healthScriptMetrics.addAll( - metrics.stream().map(Metric::getName).collect(Collectors.toList())); + metrics.stream() + .map(Metric::getName) + .filter(n -> !SKIP_CLEANUP_METRICS.contains(n)) + .toList()); metrics.addAll( platformMetrics.entrySet().stream() .map(e -> buildMetricTemplate(e.getKey(), u).setValue(e.getValue().doubleValue())) - .collect(Collectors.toList())); + .toList()); // Clean all health check metrics for universe before saving current values // just in case list of nodes changed between runs. 
MetricFilter toClean = @@ -526,7 +522,7 @@ public void markUniverseForReUpload(UUID universeUUID) { List> universeNodeInfos = uploadedNodeInfo.keySet().stream() .filter(key -> key.getFirst().equals(universeUUID)) - .collect(Collectors.toList()); + .toList(); universeNodeInfos.forEach(uploadedNodeInfo::remove); } @@ -536,7 +532,7 @@ public void handleUniverseRemoval(UUID universeUUID) { List> universeNodeInfos = uploadedNodeInfo.keySet().stream() .filter(key -> key.getFirst().equals(universeUUID)) - .collect(Collectors.toList()); + .toList(); universeNodeInfos.forEach(uploadedNodeInfo::remove); } @@ -613,7 +609,7 @@ public CompletableFuture runHealthCheck( private String getAlertDestinations(Universe u, Customer c) { List destinations = emailHelper.getDestinations(c.getUuid()); - if (destinations.size() == 0) { + if (destinations.isEmpty()) { return null; } @@ -683,9 +679,7 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) { } providerCode = provider.getCode(); List activeNodes = - details.getNodesInCluster(cluster.uuid).stream() - .filter(NodeDetails::isActive) - .collect(Collectors.toList()); + details.getNodesInCluster(cluster.uuid).stream().filter(NodeDetails::isActive).toList(); for (NodeDetails nd : activeNodes) { if (nd.cloudInfo.private_ip == null) { log.warn( @@ -697,9 +691,7 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) { } } List sortedDetails = - activeNodes.stream() - .sorted(Comparator.comparing(NodeDetails::getNodeName)) - .collect(Collectors.toList()); + activeNodes.stream().sorted(Comparator.comparing(NodeDetails::getNodeName)).toList(); Set nodeUuids = sortedDetails.stream() .map(NodeDetails::getNodeUuid) @@ -792,7 +784,7 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) { return; } - List nodeReports = checkNodes(params.universe, nodeMetadata); + List nodeReports = checkNodes(params, nodeMetadata); Details fullReport = new Details() @@ -822,9 +814,7 @@ public void 
checkSingleUniverse(CheckSingleUniverseParams params) { durationMs); if (healthCheckReport.getHasError()) { List failedChecks = - healthCheckReport.getData().stream() - .filter(NodeData::getHasError) - .collect(Collectors.toList()); + healthCheckReport.getData().stream().filter(NodeData::getHasError).toList(); log.warn( "Following checks failed for universe {}:\n{}", params.universe.getName(), @@ -853,20 +843,58 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) { buildMetricTemplate(PlatformMetrics.HEALTH_CHECK_STATUS, params.universe)); } - private List checkNodes(Universe universe, List nodes) { + private List checkNodes(CheckSingleUniverseParams params, List nodes) { // Check if it should log the output of the command. + Universe universe = params.universe; boolean shouldLogOutput = confGetter.getConfForScope(universe, UniverseConfKeys.healthLogOutput); int nodeCheckTimeoutSec = confGetter.getConfForScope(universe, UniverseConfKeys.nodeCheckTimeoutSec); - + int ddlAtomicityIntervalSec = + confGetter.getConfForScope(universe, UniverseConfKeys.ddlAtomicityIntervalSec); + + Instant lastDdlAtomicityCheckTimestamp = + ddlAtomicityCheckTimestamp.get(universe.getUniverseUUID()); + String nodeToRunDdlTAtomicityCheck = null; + String masterLeaderUrl = null; + String ybDbRelease = + universe.getUniverseDetails().getPrimaryCluster().userIntent.ybSoftwareVersion; + boolean ddlAtomicityCheckSupported = + CommonUtils.isReleaseBetween(DDL_ATOMICITY_CHECK_RELEASE, "2.19.0.0-b0", ybDbRelease) + || CommonUtils.isReleaseEqualOrAfter(DDL_ATOMICITY_CHECK_PREVIEW_RELEASE, ybDbRelease); + if (ddlAtomicityCheckSupported + && !params.onlyMetrics + && (lastDdlAtomicityCheckTimestamp == null + || lastDdlAtomicityCheckTimestamp + .plus(ddlAtomicityIntervalSec, ChronoUnit.SECONDS) + .isBefore(Instant.now()))) { + // We should schedule DDL atomicity check. 
+ NodeDetails masterLeader = universe.getMasterLeaderNode(); + if (masterLeader != null) { + NodeDetails nodeToRun = CommonUtils.getServerToRunYsqlQuery(universe); + boolean httpsEnabledUI = + universe.getConfig().getOrDefault(Universe.HTTPS_ENABLED_UI, "false").equals("true"); + masterLeaderUrl = + (httpsEnabledUI ? "https" : "http") + + "://" + + masterLeader.cloudInfo.private_ip + + ":" + + masterLeader.masterHttpPort; + nodeToRunDdlTAtomicityCheck = nodeToRun.getNodeName(); + } + } Map> nodeChecks = new HashMap<>(); for (NodeInfo nodeInfo : nodes) { + NodeCheckContext context = + new NodeCheckContext().setLogOutput(shouldLogOutput).setTimeoutSec(nodeCheckTimeoutSec); + if (nodeInfo.getNodeName().equals(nodeToRunDdlTAtomicityCheck)) { + context.setDdlAtomicityCheck(true); + context.setMasterLeaderUrl(masterLeaderUrl); + } nodeChecks.put( nodeInfo.getNodeName(), CompletableFuture.supplyAsync( - () -> checkNode(universe, nodeInfo, shouldLogOutput, nodeCheckTimeoutSec), - nodeExecutor)); + () -> checkNode(universe, nodeInfo, context), nodeExecutor)); } List result = new ArrayList<>(); @@ -916,14 +944,14 @@ private List checkNodes(Universe universe, List nodes) { } private Details checkNode( - Universe universe, NodeInfo nodeInfo, boolean logOutput, int timeoutSec) { + Universe universe, NodeInfo nodeInfo, NodeCheckContext nodeCheckContext) { Pair nodeKey = new Pair<>(universe.getUniverseUUID(), nodeInfo.getNodeName()); NodeInfo uploadedInfo = uploadedNodeInfo.get(nodeKey); ShellProcessContext context = ShellProcessContext.builder() - .logCmdOutput(logOutput) + .logCmdOutput(nodeCheckContext.isLogOutput()) .traceLogging(true) - .timeoutSecs(timeoutSec) + .timeoutSecs(nodeCheckContext.getTimeoutSec()) .build(); if (uploadedInfo == null && !nodeInfo.isK8s()) { // Only upload it once for new node, as it only depends on yb home dir. 
@@ -966,9 +994,15 @@ private Details checkNode( } uploadedNodeInfo.put(nodeKey, nodeInfo); + List commandToRun = new ArrayList<>(); + commandToRun.add(scriptPath); + if (nodeCheckContext.ddlAtomicityCheck) { + commandToRun.add("--ddl_atomicity_check=true"); + commandToRun.add("--master_leader_url=" + nodeCheckContext.getMasterLeaderUrl()); + } ShellResponse response = nodeUniverseManager - .runCommand(nodeInfo.getNodeDetails(), universe, scriptPath, context) + .runCommand(nodeInfo.getNodeDetails(), universe, commandToRun, context) .processErrors(); return Json.fromJson(Json.parse(response.extractRunCommandOutput()), Details.class); @@ -1027,8 +1061,7 @@ private String generateNodeCheckScript(UUID universeUuid, NodeInfo nodeInfo) { private MetricFilter metricSourceKeysFilterWithHealthScriptMetrics( Customer customer, Universe universe, List metrics) { Set allMetricNames = new HashSet<>(healthScriptMetrics); - allMetricNames.addAll( - metrics.stream().map(PlatformMetrics::getMetricName).collect(Collectors.toList())); + allMetricNames.addAll(metrics.stream().map(PlatformMetrics::getMetricName).toList()); List metricSourceKeys = allMetricNames.stream() .map( @@ -1038,7 +1071,7 @@ private MetricFilter metricSourceKeysFilterWithHealthScriptMetrics( .name(metricName) .sourceUuid(universe.getUniverseUUID()) .build()) - .collect(Collectors.toList()); + .toList(); return MetricFilter.builder().sourceKeys(metricSourceKeys).build(); } @@ -1100,11 +1133,18 @@ public static class NodeInfo { @JsonIgnore @EqualsAndHashCode.Exclude private NodeDetails nodeDetails; } + @Data + @Accessors(chain = true) + private static class NodeCheckContext { + private boolean logOutput; + private int timeoutSec; + private boolean ddlAtomicityCheck; + private String masterLeaderUrl; + } + private Details removeMetricOnlyChecks(Details details) { List nodeReports = - details.getData().stream() - .filter(data -> !data.getMetricsOnly()) - .collect(Collectors.toList()); + 
details.getData().stream().filter(data -> !data.getMetricsOnly()).toList(); return new Details() .setTimestampIso(details.getTimestampIso()) .setYbVersion(details.getYbVersion()) diff --git a/managed/src/main/java/com/yugabyte/yw/common/AlertTemplate.java b/managed/src/main/java/com/yugabyte/yw/common/AlertTemplate.java index 8bed48c32deb..3b315d7eb897 100644 --- a/managed/src/main/java/com/yugabyte/yw/common/AlertTemplate.java +++ b/managed/src/main/java/com/yugabyte/yw/common/AlertTemplate.java @@ -42,6 +42,8 @@ public enum AlertTemplate { DB_QUEUES_OVERFLOW, DB_DRIVE_FAILURE, DB_WRITE_READ_TEST_ERROR, + + DDL_ATOMICITY_CHECK, NODE_TO_NODE_CA_CERT_EXPIRY, NODE_TO_NODE_CERT_EXPIRY, CLIENT_TO_NODE_CA_CERT_EXPIRY, diff --git a/managed/src/main/java/com/yugabyte/yw/common/config/UniverseConfKeys.java b/managed/src/main/java/com/yugabyte/yw/common/config/UniverseConfKeys.java index 9319582b0557..f34e96a80d5a 100644 --- a/managed/src/main/java/com/yugabyte/yw/common/config/UniverseConfKeys.java +++ b/managed/src/main/java/com/yugabyte/yw/common/config/UniverseConfKeys.java @@ -46,6 +46,16 @@ public class UniverseConfKeys extends RuntimeConfigKeysModule { "The timeout (in seconds) for node check operation as part of universe health check", ConfDataType.IntegerType, ImmutableList.of(ConfKeyTags.PUBLIC)); + + public static final ConfKeyInfo ddlAtomicityIntervalSec = + new ConfKeyInfo<>( + "yb.health.ddl_atomicity_interval_sec", + ScopeType.UNIVERSE, + "DDL Atomicity Check Interval", + "The interval (in seconds) between DDL atomicity checks", + ConfDataType.IntegerType, + ImmutableList.of(ConfKeyTags.PUBLIC)); + public static final ConfKeyInfo ybUpgradeBlacklistLeaders = new ConfKeyInfo<>( "yb.upgrade.blacklist_leaders", diff --git a/managed/src/main/java/com/yugabyte/yw/models/helpers/CommonUtils.java b/managed/src/main/java/com/yugabyte/yw/models/helpers/CommonUtils.java index f0ef70727125..438747ac9d62 100644 --- 
a/managed/src/main/java/com/yugabyte/yw/models/helpers/CommonUtils.java +++ b/managed/src/main/java/com/yugabyte/yw/models/helpers/CommonUtils.java @@ -772,26 +772,50 @@ public static Optional isAnnotatedWith( public static NodeDetails getARandomLiveTServer(Universe universe) { UniverseDefinitionTaskParams.Cluster primaryCluster = universe.getUniverseDetails().getPrimaryCluster(); - List tserverLiveNodes = - universe.getUniverseDetails().getNodesInCluster(primaryCluster.uuid).stream() - .filter(nodeDetails -> nodeDetails.isTserver) - .filter(nodeDetails -> nodeDetails.state == NodeState.Live) - .collect(Collectors.toList()); - if (tserverLiveNodes.isEmpty()) { + NodeDetails randomLiveTServer = + getARandomLiveTServer(universe.getUniverseDetails().getNodesInCluster(primaryCluster.uuid)); + if (randomLiveTServer == null) { throw new IllegalStateException( "No live TServers found for Universe UUID: " + universe.getUniverseUUID()); } - return tserverLiveNodes.get(new Random().nextInt(tserverLiveNodes.size())); + return randomLiveTServer; } public static NodeDetails getServerToRunYsqlQuery(Universe universe) { // Prefer the master leader since that will result in a faster query. - // If the leader does not have a tserver process though, select any random tserver. - NodeDetails nodeToUse = universe.getMasterLeaderNode(); - if (nodeToUse == null || !nodeToUse.isTserver) { - nodeToUse = getARandomLiveTServer(universe); + // If the master leader is not a tserver - prefer tserver in the same region. + // If no tserver in the same region either - use random live tserver. + NodeDetails masterLeader = universe.getMasterLeaderNode(); + if (masterLeader != null) { + if (masterLeader.isTserver) { + // If master leader is a TServer - use that. 
+ return masterLeader; + } + UniverseDefinitionTaskParams.Cluster primaryCluster = + universe.getUniverseDetails().getPrimaryCluster(); + List sameRegionNodes = + universe.getUniverseDetails().getNodesInCluster(primaryCluster.uuid).stream() + .filter(n -> n.getRegion().equals(masterLeader.getRegion())) + .toList(); + NodeDetails sameRegionTServer = getARandomLiveTServer(sameRegionNodes); + if (sameRegionTServer != null) { + // Live TServer present in master leader region - use that. + return sameRegionTServer; + } } - return nodeToUse; + return getARandomLiveTServer(universe); + } + + private static NodeDetails getARandomLiveTServer(Collection nodes) { + List tserverLiveNodes = + nodes.stream() + .filter(nodeDetails -> nodeDetails.isTserver) + .filter(nodeDetails -> nodeDetails.state == NodeState.Live) + .collect(Collectors.toList()); + if (tserverLiveNodes.isEmpty()) { + return null; + } + return tserverLiveNodes.get(new Random().nextInt(tserverLiveNodes.size())); } public static String logTableName(String tableName) { diff --git a/managed/src/main/resources/alert/alert_templates.yml b/managed/src/main/resources/alert/alert_templates.yml index becc5a03f475..439dc8833ed7 100644 --- a/managed/src/main/resources/alert/alert_templates.yml +++ b/managed/src/main/resources/alert/alert_templates.yml @@ -654,6 +654,29 @@ templates: Affected nodes: {{ $labels.affected_node_names }} + DDL_ATOMICITY_CHECK: + name: DDL Atomicity Check Failed + description: Some failed DDL operations were not atomic, which can cause subsequent backups to require a manual fixup before they can be restored. 
+ queryTemplate: yb_ddl_atomicity_check{universe_uuid="__universeUuid__"} + {{ query_condition }} {{ query_threshold }} + createForNewCustomer: true + defaultThresholdMap: + SEVERE: + threshold: 1.0 + targetType: UNIVERSE + defaultThresholdCondition: LESS_THAN + defaultThresholdUnit: STATUS + thresholdMaxValue: 1.0 + thresholdReadOnly: true + labels: + affected_node_names: >- + {{ range $index, $element := query "max by (universe_uuid, node_name) + (up{universe_uuid='{{ $labels.universe_uuid }}'})" }}{{if $index}},{{end}}{{ $element.Labels.node_name }}{{ end }} + annotations: + summary: >- + DDL atomicity issues found for universe '{{ $labels.source_name }}'. + Please see health check results for more details. + ENCRYPTION_AT_REST_CONFIG_EXPIRY: name: Encryption At Rest config expiry description: Encryption At Rest config expires soon diff --git a/managed/src/main/resources/db/migration/default_/postgres/V364__DDL_Atomicity_Check_Alert.sql b/managed/src/main/resources/db/migration/default_/postgres/V364__DDL_Atomicity_Check_Alert.sql new file mode 100644 index 000000000000..b07d1882f7cb --- /dev/null +++ b/managed/src/main/resources/db/migration/default_/postgres/V364__DDL_Atomicity_Check_Alert.sql @@ -0,0 +1,21 @@ +-- Copyright (c) YugaByte, Inc. 
+ + -- DDL Atomicity check + insert into alert_configuration + (uuid, customer_uuid, name, description, create_time, target_type, target, thresholds, threshold_unit, template, active, default_destination) +select + gen_random_uuid(), + uuid, + 'DDL Atomicity Check Failed', + 'Some failed DDL operations were not atomic, which can cause subsequent backups to require a manual fixup before they can be restored.', + current_timestamp, + 'UNIVERSE', + '{"all":true}', + '{"SEVERE":{"condition":"LESS_THAN", "threshold":1.0}}', + 'STATUS', + 'DDL_ATOMICITY_CHECK', + true, + true +from customer; + +select create_universe_alert_definitions('DDL Atomicity Check Failed'); diff --git a/managed/src/main/resources/health/node_health.py.template b/managed/src/main/resources/health/node_health.py.template index e05559f80f83..a1855b4bc081 100755 --- a/managed/src/main/resources/health/node_health.py.template +++ b/managed/src/main/resources/health/node_health.py.template @@ -39,11 +39,18 @@ import subprocess import sys import time +try: + import html +except Exception as e: + import HTMLParser + html = HTMLParser.HTMLParser() + try: from builtins import RuntimeError except Exception as e: from exceptions import RuntimeError from datetime import datetime, timedelta +from collections import defaultdict NODE_INFO = '{{NODE_INFO}}' @@ -207,6 +214,10 @@ YB_NODE_CLOCK_SKEW_CHECK = MetricDefinition( YB_NODE_CONTROLLER_CHECK = MetricDefinition( "yb_node_controller_check", "YB-Controller server check") +YB_DDL_ATOMICITY_CHECK = MetricDefinition( + "yb_ddl_atomicity_check", + "Status of DDL atomicity check" +) ################################################################################################### # Reporting @@ -494,7 +505,8 @@ class NodeChecker(): yb_home_dir, ybc_dir, start_time_ms, ysql_port, ycql_port, redis_port, enable_tls_client, enable_tls, root_and_client_root_ca_same, ssl_protocol, enable_ysql, enable_ysql_auth, master_http_port, tserver_http_port, - 
ysql_server_http_port, node_version, is_ybc_enabled, ybc_port): + ysql_server_http_port, node_version, is_ybc_enabled, ybc_port, + ddl_atomicity_check, master_leader_url): self.node = node self.node_name = node_name self.node_identifier = node_identifier @@ -520,6 +532,8 @@ class NodeChecker(): self.is_ybc_enabled = is_ybc_enabled self.ybc_port = ybc_port self.additional_info = {} + self.ddl_atomicity_check = ddl_atomicity_check + self.master_leader_url = master_leader_url def _new_entry(self, message, process=None): return Entry(message, self.node, process, self.node_name, self.node_identifier) @@ -1238,7 +1252,7 @@ class NodeChecker(): metric = Metric.from_definition(YB_NODE_REDIS_CONNECT).add_value(0 if has_errors else 1) return e.fill_and_return_entry(errors, has_error=has_errors, metrics=[metric]) - def create_ysqlsh_command(self): + def create_ysqlsh_command(self, db_name="system_platform"): ysqlsh = '{}/bin/ysqlsh'.format(self.yb_tserver_dir()) port_args = "-p {}".format(self.ysql_port) host = self.node @@ -1254,8 +1268,9 @@ class NodeChecker(): else: raise RuntimeError("Could not find local socket") - ysqlsh_cmd = "{} {} -h {} {} -U yugabyte -d system_platform".format( - 'env sslmode="require"' if (self.enable_tls_client) else '', ysqlsh, host, port_args, ) + ysqlsh_cmd = "{} {} -h {} {} -U yugabyte -d {}".format( + 'env sslmode="require"' if (self.enable_tls_client) else '', + ysqlsh, host, port_args, db_name) return ysqlsh_cmd @@ -1399,6 +1414,123 @@ class NodeChecker(): metric = Metric.from_definition(YB_NODE_CLOCK_SKEW_CHECK).add_value(0 if has_errors else 1) return e.fill_and_return_entry(errors, has_error=has_errors, metrics=[metric]) + def check_ddl_atomicity(self): + logging.info("Checking DDL atomicity on node {}".format(self.node)) + e = self._new_entry("DDL atomicity") + metric = Metric.from_definition(YB_DDL_ATOMICITY_CHECK) + + try: + ysqlsh_cmd = self.create_ysqlsh_command("") + except RuntimeError as re: + metric.add_value(0) + return 
e.fill_and_return_entry([str(re)], has_error=True, metrics=[metric]) + + errors = [] + try: + # Get table data + tables_output = (json.loads(self.http_request( + "{}/api/v1/tables".format(self.master_leader_url)))) + table_data_json = tables_output["user"] + table_data_json += tables_output["index"] + + # Initialize a dictionary to store table data by database + db_tables = {} + + # Iterate through each line of table data + for table in table_data_json: + pg_oid = table["ysql_oid"] + dbname = table["keyspace"] + + # Skip over tables that aren't in YSQL/are hidden. + if pg_oid == "" or table["hidden"]: + continue + # Extract table oid + yb_pg_table_oid = str(int(table["uuid"][-4:], 16)) + + # Add table to the database's list in the dictionary + if dbname not in db_tables: + db_tables[dbname] = [] + db_tables[dbname].append( + (table["table_name"], pg_oid, yb_pg_table_oid, table["uuid"])) + + # Iterate through each database + for dbname, tables in db_tables.items(): + pg_class_cmd = "{}{} -t -c \"{}\"".format(ysqlsh_cmd, dbname, + "SELECT json_agg(row_to_json(t)) FROM \ + (SELECT relname, oid, relfilenode FROM pg_class WHERE oid >= 16384) t;") + + # Fetch all user tables from pg_class for the database + pg_class_output = json.loads(self._check_output(pg_class_cmd).strip()) + pg_class_oid_tableinfo_dict = {} + # Use relfilenode if it exists (as the table may be rewritten) + for table in pg_class_output: + if table['relfilenode'] != '0': + pg_class_oid_tableinfo_dict[table['relfilenode']] = table + else: + pg_class_oid_tableinfo_dict[table['oid']] = table + + pg_attribute_cmd = "{}{} -t -c \"{}\"".format(ysqlsh_cmd, dbname, + "SELECT json_agg(row_to_json(t)) FROM \ + (SELECT attname, attrelid FROM pg_attribute WHERE attrelid >= 16384) t;") + pg_attribute_output = json.loads(self._check_output(pg_attribute_cmd).strip()) + pg_attribute_attrelid_attnames_dict = defaultdict(list) + for attribute in pg_attribute_output: + 
(pg_attribute_attrelid_attnames_dict[attribute['attrelid']] + .append(attribute['attname'])) + + # Iterate through each table + for tablename, pg_oid, yb_pg_table_oid, tableid in tables: + # Check if the table exists in pg_class + if yb_pg_table_oid not in pg_class_oid_tableinfo_dict: + # Note: on versions older than 2024.1, the oid in this log + # will refer to the relfilenode for materialized views. + errors.append(("Table {} with oid {} and uuid {} does not exist in " + "database {} - ORPHANED TABLE NEEDS TO BE DROPPED") + .format(tablename, pg_oid, tableid, dbname)) + continue + + pg_class_entry = pg_class_oid_tableinfo_dict[yb_pg_table_oid] + # work-around for versions older than 2024.1, as master UI doesn't populate + # YSQL table oid on the UI correctly + # (it populated it with relfilenode oid instead). + pg_oid = pg_class_entry['oid'] + + if tablename != pg_class_entry['relname']: + errors.append(("Table {} with oid {} and uuid {} exists in {} but has a " + "mismatched table name - TABLE NAME NEEDS TO BE FIXED") + .format(tablename, pg_oid, tableid, dbname)) + continue + + # Get columns + table_schema_json = json.loads(self.http_request( + "{}/api/v1/table?id={}".format(self.master_leader_url, tableid))) + columns = [html.unescape( + column['column']) for column in table_schema_json["columns"]] + # Check if each column exists in pg_attribute + for column in columns: + if (column == "ybrowid" or column == "ybuniqueidxkeysuffix" + or column == "ybidxbasectid"): + continue + if column not in pg_attribute_attrelid_attnames_dict[pg_oid]: + errors.append(("Column {} does not exist in table {} in database {} - " + "ORPHANED COLUMN NEEDS TO BE DROPPED") + .format(column, tablename, dbname)) + continue + except Exception as ex: + metric.add_value(0) + return e.fill_and_return_entry([str(ex)], has_error=True, metrics=[metric]) + + has_errors = len(errors) > 0 + if has_errors: + msgs = ["Found {} errors:".format(len(errors))] + msgs.extend(errors[:10]) + if 
len(errors) > 10: + msgs.append("...") + else: + msgs = ["No errors found"] + metric.add_value(0 if has_errors else 1) + return e.fill_and_return_entry(msgs, has_error=has_errors, metrics=[metric]) + def check_openssl_availability(self): cmd = "which openssl &>/dev/null; echo $?" output = self._check_output(cmd).rstrip() @@ -1617,6 +1749,11 @@ def main(): help='Output file to which the metrics will be written to.') parser.add_argument('--retry_interval_secs', type=int, required=False, default=30, help='Time to wait between retries of failed checks.') + parser.add_argument('--ddl_atomicity_check', default=False, required=False, + type=lambda v: v.lower() in ('1', 'true'), help='Run DDL atomicity check.') + parser.add_argument('--master_leader_url', type=str, default="", required=False, + help='Master leader URL.') + args = parser.parse_args() if args.metrics_only: @@ -1638,8 +1775,8 @@ def main(): n.yb_home_dir, n.ybc_dir, n.node_start_time, n.ysql_port, n.ycql_port, n.redis_port, n.enable_tls_client, n.enable_tls, n.root_and_client_root_ca_same, n.ssl_protocol, n.enable_ysql, n.enable_ysql_auth, n.master_http_port, n.tserver_http_port, - n.ysql_server_http_port, n.yb_version, n.is_ybc_enabled, n.ybc_port) - + n.ysql_server_http_port, n.yb_version, n.is_ybc_enabled, n.ybc_port, + args.ddl_atomicity_check, args.master_leader_url) coordinator.add_precheck(checker, "check_openssl_availability") coordinator.add_check(checker, "check_node_metrics_collection") @@ -1703,6 +1840,9 @@ def main(): if not n.is_k8s: coordinator.add_check(checker, "check_oom_kills") + if args.ddl_atomicity_check: + coordinator.add_check(checker, "check_ddl_atomicity") + entries = coordinator.run() for e in entries: report.add_entry(e) diff --git a/managed/src/main/resources/reference.conf b/managed/src/main/resources/reference.conf index ded3893a92d2..ec5c4fd5877e 100644 --- a/managed/src/main/resources/reference.conf +++ b/managed/src/main/resources/reference.conf @@ -787,6 +787,8 @@ yb {
store_interval_ms = 300000 # Interval at which to send a status report email. Default: 12 hours. status_interval_ms = 43200000 + # Interval at which to check for DDL atomicity. Default: 1 hour. + ddl_atomicity_interval_sec = 3600 logOutput = false nodeCheckTimeoutSec = 180 diff --git a/managed/src/main/resources/swagger-strict.json b/managed/src/main/resources/swagger-strict.json index fef249b8e1e3..387e6b9e2171 100644 --- a/managed/src/main/resources/swagger-strict.json +++ b/managed/src/main/resources/swagger-strict.json @@ -673,7 +673,7 @@ }, "template" : { "description" : "Template name", - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", 
"PRIVATE_ACCESS_KEY_STATUS", "NEW_YSQL_TABLES_ADDED", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "NODE_AGENT_DOWN", "TABLET_PEERS_GUARDRAIL" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "NEW_YSQL_TABLES_ADDED", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", 
"INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "NODE_AGENT_DOWN", "TABLET_PEERS_GUARDRAIL" ], "type" : "string" }, "thresholdUnit" : { @@ -726,7 +726,7 @@ "type" : "string" }, "template" : { - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "NEW_YSQL_TABLES_ADDED", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", 
"TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "NODE_AGENT_DOWN", "TABLET_PEERS_GUARDRAIL" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "NEW_YSQL_TABLES_ADDED", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "UNIVERSE_RELEASE_FILES_STATUS", 
"HA_VERSION_MISMATCH", "NODE_AGENT_DOWN", "TABLET_PEERS_GUARDRAIL" ], "type" : "string" }, "uuids" : { @@ -887,7 +887,7 @@ }, "template" : { "description" : "Template name", - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "NEW_YSQL_TABLES_ADDED", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "NODE_AGENT_DOWN", "TABLET_PEERS_GUARDRAIL" ], + 
"enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "NEW_YSQL_TABLES_ADDED", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "NODE_AGENT_DOWN", "TABLET_PEERS_GUARDRAIL" ], "type" : "string" }, "thresholdConditionReadOnly" : { @@ -2816,7 +2816,7 @@ "type" : "object" } ] }, - "BootstrapBackupParams" : { + "BootstarpBackupParams" 
: { "description" : "Backup parameters for bootstrapping", "properties" : { "parallelism" : { @@ -2837,7 +2837,7 @@ "description" : "Bootstrap parameters", "properties" : { "backupRequestParams" : { - "$ref" : "#/definitions/BootstrapBackupParams", + "$ref" : "#/definitions/BootstarpBackupParams", "description" : "Parameters used to do Backup/restore" }, "tables" : { @@ -9929,7 +9929,7 @@ "description" : "Bootstrap parameters for restarting", "properties" : { "backupRequestParams" : { - "$ref" : "#/definitions/BootstrapBackupParams", + "$ref" : "#/definitions/BootstarpBackupParams", "description" : "Parameters used to do Backup/restore" } }, diff --git a/managed/src/main/resources/swagger.json b/managed/src/main/resources/swagger.json index 4c2bcb4cec92..aab548f2be36 100644 --- a/managed/src/main/resources/swagger.json +++ b/managed/src/main/resources/swagger.json @@ -685,7 +685,7 @@ }, "template" : { "description" : "Template name", - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", 
"PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "NEW_YSQL_TABLES_ADDED", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "NODE_AGENT_DOWN", "TABLET_PEERS_GUARDRAIL" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", 
"HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "NEW_YSQL_TABLES_ADDED", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "NODE_AGENT_DOWN", "TABLET_PEERS_GUARDRAIL" ], "type" : "string" }, "thresholdUnit" : { @@ -738,7 +738,7 @@ "type" : "string" }, "template" : { - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", 
"YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "NEW_YSQL_TABLES_ADDED", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "NODE_AGENT_DOWN", "TABLET_PEERS_GUARDRAIL" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", 
"NEW_YSQL_TABLES_ADDED", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "NODE_AGENT_DOWN", "TABLET_PEERS_GUARDRAIL" ], "type" : "string" }, "uuids" : { @@ -899,7 +899,7 @@ }, "template" : { "description" : "Template name", - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "NEW_YSQL_TABLES_ADDED", "UNIVERSE_OS_UPDATE_REQUIRED", 
"DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "NODE_AGENT_DOWN", "TABLET_PEERS_GUARDRAIL" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "NEW_YSQL_TABLES_ADDED", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", 
"REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "NODE_AGENT_DOWN", "TABLET_PEERS_GUARDRAIL" ], "type" : "string" }, "thresholdConditionReadOnly" : { @@ -2832,7 +2832,7 @@ "type" : "object" } ] }, - "BootstrapBackupParams" : { + "BootstarpBackupParams" : { "description" : "Backup parameters for bootstrapping", "properties" : { "parallelism" : { @@ -2853,7 +2853,7 @@ "description" : "Bootstrap parameters", "properties" : { "backupRequestParams" : { - "$ref" : "#/definitions/BootstrapBackupParams", + "$ref" : "#/definitions/BootstarpBackupParams", "description" : "Parameters used to do Backup/restore" }, "tables" : { @@ -10033,7 +10033,7 @@ "description" : "Bootstrap parameters for restarting", "properties" : { "backupRequestParams" : { - "$ref" : "#/definitions/BootstrapBackupParams", + "$ref" : "#/definitions/BootstarpBackupParams", "description" : "Parameters used to do Backup/restore" } }, diff --git a/managed/src/test/java/com/yugabyte/yw/commissioner/HealthCheckerTest.java b/managed/src/test/java/com/yugabyte/yw/commissioner/HealthCheckerTest.java index 43ad5c261a16..9cd265988264 100644 --- a/managed/src/test/java/com/yugabyte/yw/commissioner/HealthCheckerTest.java +++ b/managed/src/test/java/com/yugabyte/yw/commissioner/HealthCheckerTest.java @@ -12,10 +12,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.anyBoolean; -import static org.mockito.ArgumentMatchers.anyString; -import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.ArgumentMatchers.*; import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.spy; @@ -24,6 +21,7 @@ import static org.mockito.Mockito.when; import 
com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.common.collect.ImmutableList; import com.typesafe.config.Config; import com.yugabyte.yw.common.ApiUtils; import com.yugabyte.yw.common.AssertHelper; @@ -65,10 +63,7 @@ import jakarta.mail.MessagingException; import java.nio.file.Files; import java.nio.file.Paths; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; +import java.util.*; import java.util.concurrent.ExecutorService; import junitparams.JUnitParamsRunner; import junitparams.Parameters; @@ -150,6 +145,9 @@ public void setUp() { when(mockConfGetter.getConfForScope( any(Universe.class), eq(UniverseConfKeys.nodeCheckTimeoutSec))) .thenReturn(1); + when(mockConfGetter.getConfForScope( + any(Universe.class), eq(UniverseConfKeys.ddlAtomicityIntervalSec))) + .thenReturn(3600); when(mockConfGetter.getGlobalConf(eq(GlobalConfKeys.backwardCompatibleDate))).thenReturn(false); when(mockFileHelperService.createTempFile(anyString(), anyString())) .thenAnswer( @@ -262,11 +260,11 @@ private Universe setupDisabledAlertsConfig(String email, long disabledUntilSecs) private void verifyNodeUniverseManager(int uploads, int commands) { verify(mockNodeUniverseManager, times(uploads)) .uploadFileToNode(any(), any(), any(), any(), any(), any()); - verify(mockNodeUniverseManager, times(commands)).runCommand(any(), any(), anyString(), any()); + verify(mockNodeUniverseManager, times(commands)).runCommand(any(), any(), anyList(), any()); } private void verifyK8sHealthManager() { - ArgumentCaptor command = ArgumentCaptor.forClass(String.class); + ArgumentCaptor> command = ArgumentCaptor.forClass(List.class); verify(mockNodeUniverseManager, times(1)).runCommand(any(), any(), command.capture(), any()); // TODO assert } @@ -344,7 +342,7 @@ public void testCheckStatusMetricsCleaned() { ShellResponse dummyShellResponseFail = ShellResponse.create(1, "Should error"); doAnswer(i -> dummyShellResponseFail) 
.when(mockNodeUniverseManager) - .runCommand(any(), any(), anyString(), any()); + .runCommand(any(), any(), anyList(), any()); testSingleUniverse(u, null, true, 2); } @@ -624,7 +622,7 @@ public void testTimingLogic() throws MessagingException { public void testCheckSingleUniverse_ScriptFailure() { ShellResponse dummyShellResponseFail = ShellResponse.create(1, "Should error"); - when(mockNodeUniverseManager.runCommand(any(), any(), anyString(), any())) + when(mockNodeUniverseManager.runCommand(any(), any(), anyList(), any())) .thenReturn(dummyShellResponseFail); Universe u = setupUniverse("univ1"); setupAlertingData(null, false, false); @@ -660,11 +658,12 @@ private void testSingleUniverseWithYedisState(boolean enabledYEDIS) { healthChecker.checkSingleUniverse( new HealthChecker.CheckSingleUniverseParams(u, defaultCustomer, true, false, false, null)); - ArgumentCaptor expectedCommand = ArgumentCaptor.forClass(String.class); + ArgumentCaptor> expectedCommand = ArgumentCaptor.forClass(List.class); verify(mockNodeUniverseManager, times(4)) .runCommand(any(), any(), expectedCommand.capture(), any()); - assertThat(expectedCommand.getValue(), equalTo("/home/yugabyte/bin/node_health.py")); + assertThat( + expectedCommand.getValue(), equalTo(ImmutableList.of("/home/yugabyte/bin/node_health.py"))); } @Test @@ -710,7 +709,7 @@ public void testCheckSingleUniverse_EmailSentWithTwoContentTypes() throws Messag } private void mockGoodHealthResponse() { - when(mockNodeUniverseManager.runCommand(any(), any(), anyString(), any())) + when(mockNodeUniverseManager.runCommand(any(), any(), anyList(), any())) .thenAnswer( i -> { NodeDetails nodeDetails = i.getArgument(0); @@ -729,7 +728,7 @@ private void mockGoodHealthResponse() { } private void mockBadHealthResponse() { - when(mockNodeUniverseManager.runCommand(any(), any(), anyString(), any())) + when(mockNodeUniverseManager.runCommand(any(), any(), anyList(), any())) .thenAnswer( i -> { NodeDetails nodeDetails = i.getArgument(0); @@ 
-805,7 +804,7 @@ public void testNodeCheckTimeout() { .thenReturn(EmailFixtures.createSmtpData()); setupAlertingData(YB_ALERT_TEST_EMAIL, false, false); - when(mockNodeUniverseManager.runCommand(any(), any(), anyString(), any())) + when(mockNodeUniverseManager.runCommand(any(), any(), anyList(), any())) .thenReturn(ShellResponse.create(9, StringUtils.EMPTY)); healthChecker.checkSingleUniverse( new HealthChecker.CheckSingleUniverseParams(