From 5cb74a7cfeb0295d91065d2787c2abdbc3993c80 Mon Sep 17 00:00:00 2001 From: Daniel Shubin Date: Thu, 25 Jul 2024 16:26:41 +0000 Subject: [PATCH] [PLAT-14164] New Alert for clock drift Summary: Created a new alert to report when chronyc (and soon ntpd) report clock skew >200 (warning) and > 400 (severe) milliseconds. This leverages the clock drift health check, which currently only uses chrony, but will be enhanced for ntpd. Test Plan: created universe and manually caused clock skew. Validated alert was correctly raised Reviewers: sanketh, amalyshev, skurapati Reviewed By: amalyshev Subscribers: yugaware Differential Revision: https://phorge.dev.yugabyte.com/D36850 --- .../com/yugabyte/yw/common/AlertTemplate.java | 3 ++- .../main/resources/alert/alert_templates.yml | 25 +++++++++++++++++++ .../postgres/V366__Alert_Clock_Drift.sql | 21 ++++++++++++++++ .../resources/health/node_health.py.template | 13 +++++----- .../src/main/resources/swagger-strict.json | 6 ++--- managed/src/main/resources/swagger.json | 6 ++--- 6 files changed, 61 insertions(+), 13 deletions(-) create mode 100644 managed/src/main/resources/db/migration/default_/postgres/V366__Alert_Clock_Drift.sql diff --git a/managed/src/main/java/com/yugabyte/yw/common/AlertTemplate.java b/managed/src/main/java/com/yugabyte/yw/common/AlertTemplate.java index 76de95942d99..4e84b76593d2 100644 --- a/managed/src/main/java/com/yugabyte/yw/common/AlertTemplate.java +++ b/managed/src/main/java/com/yugabyte/yw/common/AlertTemplate.java @@ -81,5 +81,6 @@ public enum AlertTemplate { UNIVERSE_RELEASE_FILES_STATUS, HA_VERSION_MISMATCH, TABLET_PEERS_GUARDRAIL, - XCLUSTER_CONFIG_TABLE_BAD_STATE; + XCLUSTER_CONFIG_TABLE_BAD_STATE, + NODE_CLOCK_DRIFT; } diff --git a/managed/src/main/resources/alert/alert_templates.yml b/managed/src/main/resources/alert/alert_templates.yml index 06f00f1084ff..c8b4b34a6e06 100644 --- a/managed/src/main/resources/alert/alert_templates.yml +++ 
b/managed/src/main/resources/alert/alert_templates.yml @@ -1901,3 +1901,28 @@ templates: Percentage of tablet peers is high for universe '{{ $labels.source_name }}'. Current value is {{ $value | printf "%.0f" }} %. Affected nodes: {{ $labels.affected_node_names }} + + NODE_CLOCK_DRIFT: + name: High clock drift + description: Local clock on the node has drift too far from the actual time in the past 10 minutes. + queryTemplate: max by (universe_uuid) (yb_node_clock_drift_check_ms{universe_uuid="__universeUuid__"}) {{ query_condition }} {{ query_threshold }} + createForNewCustomer: true + defaultThresholdMap: + WARNING: + threshold: 200.0 + SEVERE: + threshold: 400.0 + targetType: UNIVERSE + defaultThresholdCondition: GREATER_THAN + defaultThresholdUnit: MILLISECOND + thresholdUnitName: ms + labels: + affected_node_names: >- + {{ range $index, $element := query "max by (universe_uuid, node_name) + (yb_node_clock_drift_check_ms{universe_uuid='{{ $labels.universe_uuid }}'}) + {{ query_condition }} {{ query_threshold }}" }}{{if $index}},{{end}}{{ $element.Labels.node_name }}{{ end }} + annotations: + summary: >- + Clock drift is high for universe '{{ $labels.source_name }}'. + Current drift from actual clock is {{ $value }} milliseconds. + Affected nodes: {{ $labels.affected_node_names }} \ No newline at end of file diff --git a/managed/src/main/resources/db/migration/default_/postgres/V366__Alert_Clock_Drift.sql b/managed/src/main/resources/db/migration/default_/postgres/V366__Alert_Clock_Drift.sql new file mode 100644 index 000000000000..b1cb5c7c4ffb --- /dev/null +++ b/managed/src/main/resources/db/migration/default_/postgres/V366__Alert_Clock_Drift.sql @@ -0,0 +1,21 @@ +-- Copyright (c) YugaByte, Inc. 
+ + -- Clock skew alert + insert into alert_configuration + (uuid, customer_uuid, name, description, create_time, target_type, target, thresholds, threshold_unit, template, active, default_destination) +select + gen_random_uuid(), + uuid, + 'High clock drift', + 'Local clock on the node has drift too far from the actual time.', + current_timestamp, + 'UNIVERSE', + '{"all":true}', + '{"WARNING":{"condition":"GREATER_THAN","threshold":200.0},"SEVERE":{"condition":"GREATER_THAN", "threshold":400.0}}', + 'MILLISECOND', + 'NODE_CLOCK_DRIFT', + true, + true +from customer; + +select create_universe_alert_definitions('High clock drift'); \ No newline at end of file diff --git a/managed/src/main/resources/health/node_health.py.template b/managed/src/main/resources/health/node_health.py.template index 8f30ff6f0c26..23a51177a814 100755 --- a/managed/src/main/resources/health/node_health.py.template +++ b/managed/src/main/resources/health/node_health.py.template @@ -221,9 +221,9 @@ YB_NODE_CONTROLLER_CHECK = MetricDefinition( "yb_node_controller_check", "YB-Controller server check") YB_NODE_CLOCK_DRIFT_CHECK = MetricDefinition( - "yb_node_clock_drift_check", - "Time Drift betwen nodes within limits" -) + "yb_node_clock_drift_check_ms", + "Time Drift betwen nodes within limits", + "millisec") YB_DDL_ATOMICITY_CHECK = MetricDefinition( "yb_ddl_atomicity_check", "Status of DDL atomicity check" @@ -1914,7 +1914,7 @@ def get_clock_drift_ms(): out = check_output("chronyc tracking", env) match = re.search("System time.*: (.*) second", out, re.MULTILINE) if match: - return int(float(match.group(1))*1000) # Convert seconds to milliseconds + return float(match.group(1))*1000 # Convert seconds to milliseconds return "Error: Unknown time service" def chrony_exists(): @@ -2109,8 +2109,9 @@ def main(): coordinator.add_check(checker, "check_file_descriptors") if n.check_clock: coordinator.add_check(checker, "check_clock_skew") - if n.check_time_drift: - coordinator.add_check(checker, 
"check_yb_node_clock_drift") + + if n.check_time_drift: + coordinator.add_check(checker, "check_yb_node_clock_drift") if n.master_index >= 0: coordinator.add_check(checker, "check_uptime_for_process", MASTER) diff --git a/managed/src/main/resources/swagger-strict.json b/managed/src/main/resources/swagger-strict.json index 0692ad026f5a..c9e305c2f644 100644 --- a/managed/src/main/resources/swagger-strict.json +++ b/managed/src/main/resources/swagger-strict.json @@ -673,7 +673,7 @@ }, "template" : { "description" : "Template name", - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", 
"UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", 
"TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT" ], "type" : "string" }, "thresholdUnit" : { @@ -726,7 +726,7 @@ "type" : "string" }, "template" : { - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", 
"TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", 
"HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT" ], "type" : "string" }, "uuids" : { @@ -887,7 +887,7 @@ }, "template" : { "description" : "Template name", - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", 
"NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", 
"NODE_CLOCK_DRIFT" ], "type" : "string" }, "thresholdConditionReadOnly" : { diff --git a/managed/src/main/resources/swagger.json b/managed/src/main/resources/swagger.json index c98d19e3d1dc..a16cabc32a0e 100644 --- a/managed/src/main/resources/swagger.json +++ b/managed/src/main/resources/swagger.json @@ -685,7 +685,7 @@ }, "template" : { "description" : "Template name", - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", 
"TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", 
"UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT" ], "type" : "string" }, "thresholdUnit" : { @@ -738,7 +738,7 @@ "type" : "string" }, "template" : { - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", 
"UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT" ], "type" 
: "string" }, "uuids" : { @@ -899,7 +899,7 @@ }, "template" : { "description" : "Template name", - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", 
"CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT" ], "type" : "string" }, "thresholdConditionReadOnly" : {