Skip to content

Commit

Permalink
[PLAT-14165] Health check to report on time drift.
Browse files Browse the repository at this point in the history
Summary:
Added a new health check that uses chrony to report warnings and errors
depending on the amount of time drift (by default, 250 ms for a warning
and 400 ms for an error).

If chrony is not installed, the health check is skipped and does not
report anything.

Test Plan:
1. chrony installed with 0 offset (check passed)
2. chrony installed with large offset (check reported error)
3. chrony not installed (check skipped)
4. chrony stopped (check reported error)
5. hard coded drift_ms for error and warnings (reported error/warning as expected)

Reviewers: vbansal, yshchetinin, svarshney, amalyshev

Reviewed By: amalyshev

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D36145
  • Loading branch information
shubin-yb committed Jun 28, 2024
1 parent 25d427a commit 26cd461
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,15 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) {
if (!provider.getCode().equals(CloudType.onprem.toString())
&& !provider.getCode().equals(CloudType.kubernetes.toString())) {
nodeInfo.setCheckClock(true);
if (confGetter.getConfForScope(params.universe, UniverseConfKeys.healthCheckTimeDrift)) {
nodeInfo.setCheckTimeDrift(true);
nodeInfo.setTimeDriftWrnThreshold(
confGetter.getConfForScope(
params.universe, UniverseConfKeys.healthCheckTimeDriftWrnThreshold));
nodeInfo.setTimeDriftErrThreshold(
confGetter.getConfForScope(
params.universe, UniverseConfKeys.healthCheckTimeDriftErrThreshold));
}
}
if (params.universe.isYbcEnabled()) {
nodeInfo
Expand Down Expand Up @@ -1083,6 +1092,9 @@ public static class NodeInfo {
private int tserverHttpPort = 9000;
private int ysqlServerHttpPort = 13000;
private boolean checkClock = false;
// Time drift check is off by default, like checkClock. It is switched on via
// setCheckTimeDrift(true) only when the yb.health_checks.check_clock_time_drift runtime
// flag is enabled; a default of true would make that flag unable to disable the check.
private boolean checkTimeDrift = false;
// Drift, in ms, above which the check reports a warning (overridden from runtime config).
private int timeDriftWrnThreshold = 250;
// Drift, in ms, above which the check reports an error (overridden from runtime config).
private int timeDriftErrThreshold = 400;
private Long nodeStartTime = null;
private boolean testReadWrite = true;
private boolean testYsqlshConnectivity = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1224,4 +1224,28 @@ public class UniverseConfKeys extends RuntimeConfigKeysModule {
"CPU usage alert aggregation interval in seconds.",
ConfDataType.IntegerType,
ImmutableList.of(ConfKeyTags.PUBLIC));
/** Universe-scoped switch that enables the node time drift health check. */
public static final ConfKeyInfo<Boolean> healthCheckTimeDrift =
    new ConfKeyInfo<>(
        "yb.health_checks.check_clock_time_drift",
        ScopeType.UNIVERSE,
        "Enable health checks for time drift between nodes",
        "Enable health checks for time drift between nodes.",
        ConfDataType.BooleanType,
        ImmutableList.of(ConfKeyTags.PUBLIC));

/** Time drift, in milliseconds, above which the health check raises a warning. */
public static final ConfKeyInfo<Integer> healthCheckTimeDriftWrnThreshold =
    new ConfKeyInfo<>(
        "yb.health_checks.time_drift_wrn_threshold_ms",
        ScopeType.UNIVERSE,
        "Time drift threshold for warning health check",
        "Threshold (in ms) to raise a warning when time drift exceeds this amount.",
        ConfDataType.IntegerType,
        ImmutableList.of(ConfKeyTags.PUBLIC));

/** Time drift, in milliseconds, above which the health check raises an error. */
public static final ConfKeyInfo<Integer> healthCheckTimeDriftErrThreshold =
    new ConfKeyInfo<>(
        "yb.health_checks.time_drift_err_threshold_ms",
        ScopeType.UNIVERSE,
        "Time drift threshold for error health check",
        "Threshold (in ms) to raise an error when time drift exceeds this amount.",
        ConfDataType.IntegerType,
        ImmutableList.of(ConfKeyTags.PUBLIC));
}
59 changes: 56 additions & 3 deletions managed/src/main/resources/health/node_health.py.template
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,10 @@ YB_NODE_CLOCK_SKEW_CHECK = MetricDefinition(
YB_NODE_CONTROLLER_CHECK = MetricDefinition(
"yb_node_controller_check",
"YB-Controller server check")
# Metric definition used by the node clock drift health check
# (check_yb_node_clock_drift), which records the measured drift in ms as its value.
YB_NODE_CLOCK_DRIFT_CHECK = MetricDefinition(
    "yb_node_clock_drift_check",
    "Time Drift between nodes within limits"
)

###################################################################################################
# Reporting
Expand Down Expand Up @@ -342,10 +346,11 @@ class Entry:
self.ignore_result = True
return self

def fill_and_return_warning_entry(self, details):
def fill_and_return_warning_entry(self, details, metrics=None):
    """
    Mark this entry as a warning (and not an error), record its details and
    optional metrics, and return the entry itself so calls can be chained.
    """
    self.has_warning, self.has_error = True, False
    self.metrics = metrics
    self.details = details
    return self

def as_json(self):
Expand Down Expand Up @@ -493,7 +498,8 @@ class NodeChecker():
yb_home_dir, ybc_dir, start_time_ms, ysql_port, ycql_port, redis_port,
enable_tls_client, enable_tls, root_and_client_root_ca_same, ssl_protocol,
enable_ysql, enable_ysql_auth, master_http_port, tserver_http_port,
ysql_server_http_port, node_version, is_ybc_enabled, ybc_port):
ysql_server_http_port, node_version, is_ybc_enabled, ybc_port,
time_drift_wrn_threshold, time_drift_err_threshold):
self.node = node
self.node_name = node_name
self.node_identifier = node_identifier
Expand All @@ -518,6 +524,8 @@ class NodeChecker():
self.node_version = node_version
self.is_ybc_enabled = is_ybc_enabled
self.ybc_port = ybc_port
self.time_drift_wrn_threshold = time_drift_wrn_threshold
self.time_drift_err_threshold = time_drift_err_threshold
self.additional_info = {}

def _new_entry(self, message, process=None):
Expand Down Expand Up @@ -1410,6 +1418,27 @@ class NodeChecker():
metric.add_value(0 if has_errors else 1)
return e.fill_and_return_entry(errors, has_error=has_errors, metrics=[metric])

def check_yb_node_clock_drift(self):
    """
    Health check for the node clock drift reported by chrony.

    The check is ignored when chronyc is not available. It reports an error when the
    drift cannot be determined or is above time_drift_err_threshold, a warning when it
    is above time_drift_wrn_threshold, and a passing entry with the drift otherwise.
    """
    entry = self._new_entry("Node Clock Drift")
    if not chrony_exists():
        return entry.ignore_check()

    drift_metric = Metric.from_definition(YB_NODE_CLOCK_DRIFT_CHECK)
    measured = get_clock_drift_ms()
    if isinstance(measured, str):
        # A string result is an error description, not a drift value.
        return entry.fill_and_return_entry([measured], has_error=True)

    drift_metric.add_value(measured)
    over_limit = "Node clock drift is {} ms, over {} ms"
    if measured > self.time_drift_err_threshold:
        return entry.fill_and_return_entry(
            [over_limit.format(measured, self.time_drift_err_threshold)],
            has_error=True, metrics=[drift_metric])
    if measured > self.time_drift_wrn_threshold:
        return entry.fill_and_return_warning_entry(
            [over_limit.format(measured, self.time_drift_wrn_threshold)],
            metrics=[drift_metric])
    return entry.fill_and_return_entry(
        ["%s ms" % measured], has_error=False, metrics=[drift_metric])

###################################################################################################
# Utility functions
###################################################################################################
Expand Down Expand Up @@ -1478,6 +1507,24 @@ def is_equal_release_build(release_build1, release_build2):
return (is_equal_or_newer_release_build(release_build1, release_build2) and
is_equal_or_newer_release_build(release_build2, release_build1))

def get_clock_drift_ms():
    """
    Get the clock drift in milliseconds, as reported by `chronyc tracking`.

    Returns the absolute value of the drift as an int (whole milliseconds, truncated) on
    success. Returns an error string starting with "Error" when chronyd is not reported
    as active or when the drift cannot be read from the chronyc output; callers tell the
    two cases apart by the type of the result.
    """
    env = os.environ.copy()
    chrony_out = check_output("systemctl status chronyd.service", env)
    if "Error" in chrony_out or "Active: active" not in chrony_out:
        return "Error: Unknown time service"

    # Check drift using chrony
    out = check_output("chronyc tracking", env)
    match = re.search("System time.*: (.*) second", out, re.MULTILINE)
    if not match:
        return "Error: Unknown time service"
    try:
        # Convert seconds to milliseconds. abs() enforces the documented contract, since
        # callers compare the result against positive thresholds.
        return abs(int(float(match.group(1)) * 1000))
    except (ValueError, OverflowError):
        # The captured text was not a finite number; report it instead of raising.
        return "Error: Unable to parse clock drift from chronyc output"

def chrony_exists():
    """
    Report whether chronyc can be found on this node.

    Runs `command -v chronyc` and treats any output containing "Error" as not found.
    """
    lookup_result = check_output("command -v chronyc", os.environ.copy())
    if "Error" in lookup_result:
        return False
    return True

class CheckCoordinator:
class PreCheckRunInfo:
Expand Down Expand Up @@ -1583,6 +1630,9 @@ class NodeInfo:
self.tserver_http_port = data["tserverHttpPort"]
self.ysql_server_http_port = data["ysqlServerHttpPort"]
self.check_clock = data["checkClock"]
self.check_time_drift = data["checkTimeDrift"]
self.time_drift_wrn_threshold = data["timeDriftWrnThreshold"]
self.time_drift_err_threshold = data["timeDriftErrThreshold"]
self.test_read_write = data["testReadWrite"]
self.test_ysqlsh_connectivity = data["testYsqlshConnectivity"]
self.test_cqlsh_connectivity = data["testCqlshConnectivity"]
Expand Down Expand Up @@ -1624,7 +1674,8 @@ def main():
n.yb_home_dir, n.ybc_dir, n.node_start_time, n.ysql_port, n.ycql_port, n.redis_port,
n.enable_tls_client, n.enable_tls, n.root_and_client_root_ca_same, n.ssl_protocol,
n.enable_ysql, n.enable_ysql_auth, n.master_http_port, n.tserver_http_port,
n.ysql_server_http_port, n.yb_version, n.is_ybc_enabled, n.ybc_port)
n.ysql_server_http_port, n.yb_version, n.is_ybc_enabled, n.ybc_port,
n.time_drift_wrn_threshold, n.time_drift_err_threshold)

coordinator.add_precheck(checker, "check_openssl_availability")

Expand All @@ -1648,6 +1699,8 @@ def main():
coordinator.add_check(checker, "check_file_descriptors")
if n.check_clock:
coordinator.add_check(checker, "check_clock_skew")
if n.check_time_drift:
coordinator.add_check(checker, "check_yb_node_clock_drift")

if n.master_index >= 0:
coordinator.add_check(checker, "check_uptime_for_process", MASTER)
Expand Down
6 changes: 6 additions & 0 deletions managed/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,12 @@ yb {
tmpDirectory = "/tmp"
}

# Node health check settings for clock time drift.
health_checks {
# Enables the health check for time drift between nodes.
check_clock_time_drift = true
# Time drift (in ms) above which the health check raises a warning.
time_drift_wrn_threshold_ms = 250
# Time drift (in ms) above which the health check raises an error.
time_drift_err_threshold_ms = 400
}

# Alerts thresholds
alert {
# Value of maximum allowed clock skew before an alert is generated (in ms).
Expand Down

0 comments on commit 26cd461

Please sign in to comment.