From 776e8eab262ce4a781cb732b7ef6e8c10d9ac24d Mon Sep 17 00:00:00 2001 From: Aleksandr Malyshev Date: Wed, 18 Sep 2024 00:10:33 +0300 Subject: [PATCH] [Backport 2.20][PLAT-15326] Proper error handling in DDL atomicity check Summary: Currently various errors are not handled properly in ddl atomicity check. Fixing it. Original diff: https://phorge.dev.yugabyte.com/D38142 Test Plan: Install 2.20.0.0_b50 universe. Make sure DDL atomicity check passes. Force DDL atomicity errors. Force check. Make sure check shows valid errors. Stop TServer on master leader node. Make sure check shows 'TServer is not running on this node' error Stop 2 masters. Make sure check shows 'Master Leader HTTP endpoint is not running' error Reviewers: vbansal, skurapati Reviewed By: vbansal Subscribers: yugaware Differential Revision: https://phorge.dev.yugabyte.com/D38153 --- .../resources/health/node_health.py.template | 119 +++++++++++++++--- 1 file changed, 99 insertions(+), 20 deletions(-) diff --git a/managed/src/main/resources/health/node_health.py.template b/managed/src/main/resources/health/node_health.py.template index 296170ca65ca..6591eb21a339 100755 --- a/managed/src/main/resources/health/node_health.py.template +++ b/managed/src/main/resources/health/node_health.py.template @@ -474,7 +474,7 @@ def wrap_command(command_str): return "timeout {} bash -c 'set -o pipefail; {}'".format(CMD_TIMEOUT_SEC, command_str) -def has_errors(str): +def check_for_errors(str): return str is not None and str.startswith('Error') @@ -558,7 +558,7 @@ class NodeChecker(): found_error = False output = self.get_disk_utilization() msgs = [] - if has_errors(output): + if check_for_errors(output): return e.fill_and_return_entry([output], has_error=True) # Do not process the headers. 
@@ -636,7 +636,7 @@ class NodeChecker(): return e.fill_and_return_warning_entry(["OpenSSL is not installed, skipped"]) output = self.get_certificate_expiration_date(cert_path) - if has_errors(output): + if check_for_errors(output): return e.fill_and_return_entry([output], has_error=True) if output == 'File not found': @@ -798,7 +798,7 @@ class NodeChecker(): cmd = 'curl -s -L --insecure {}'.format(endpoint) output = self._check_output(cmd).strip() - if has_errors(output): + if check_for_errors(output): logging.info("HTTP request to {} returned error: {}".format(endpoint, output)) return None return output @@ -882,7 +882,7 @@ class NodeChecker(): '-mmin -{}'.format(FATAL_TIME_THRESHOLD_MINUTES), log_severity) output = self._check_output(cmd) - if has_errors(output): + if check_for_errors(output): return e.fill_and_return_entry([output], has_error=True) log_files = self.check_logs_find_output(output) @@ -917,7 +917,7 @@ class NodeChecker(): yb_cores_dir, yb_cores_dir, '-mmin -{}'.format(FATAL_TIME_THRESHOLD_MINUTES)) output = self._check_output(cmd) - if has_errors(output): + if check_for_errors(output): return e.fill_and_return_entry([output], has_error=True) files = [] @@ -986,11 +986,11 @@ class NodeChecker(): else: e = self._new_metric_entry("Uptime", process) uptime = self.get_uptime_for_process(process) - if has_errors(uptime): + if check_for_errors(uptime): return e.fill_and_return_entry([uptime], has_error=True) boot_time = self.get_boot_time_for_process(process) - if has_errors(boot_time): + if check_for_errors(boot_time): return e.fill_and_return_entry([boot_time], has_error=True) if uptime is None or not uptime: @@ -1155,7 +1155,7 @@ class NodeChecker(): logging.info("Checking node exporter on node {}".format(self.node)) e = self._new_metric_entry("Node exporter") output = self.get_command_for_process("node_exporter") - if has_errors(output): + if check_for_errors(output): return e.fill_and_return_entry([output], has_error=True) metric = 
Metric.from_definition(YB_NODE_CUSTOM_NODE_METRICS) running = ('node_exporter' in output) @@ -1193,7 +1193,7 @@ class NodeChecker(): awk '\"'\"'{print $1}'\"'\"' /proc/sys/fs/file-nr" output = self._check_output(cmd) - if has_errors(output): + if check_for_errors(output): return e.fill_and_return_entry([output], has_error=True) counts = output.split('\n') @@ -1440,6 +1440,13 @@ class NodeChecker(): e = self._new_entry("DDL atomicity") metric = Metric.from_definition(YB_DDL_ATOMICITY_CHECK) + tserver_pid = self.get_process_pid_by_name(TSERVER) + if tserver_pid is None: + metric.add_value(0) + return e.fill_and_return_entry(["TServer is not running on this node"], + has_error=True, + metrics=[metric]) + try: ysqlsh_cmd = self.create_ysqlsh_command("") except RuntimeError as re: @@ -1449,8 +1456,23 @@ class NodeChecker(): errors = [] try: # Get table data - tables_output = (json.loads(self.http_request( - "{}/api/v1/tables".format(self.master_leader_url)))) + tables_response = self.http_request( + "{}/api/v1/tables".format(self.master_leader_url)) + if not tables_response: + metric.add_value(0) + return e.fill_and_return_entry( + ['Master Leader HTTP endpoint is not running'], + has_error=True, + metrics=[metric]) + try: + tables_output = json.loads(tables_response) + except Exception as ex: + logging.warning("Tables HTTP API response is not a valid json: %s", tables_response) + metric.add_value(0) + return e.fill_and_return_entry( + ['Tables HTTP API response is not a valid json'], + has_error=True, + metrics=[metric]) table_data_json = tables_output["user"] table_data_json += tables_output["index"] @@ -1481,10 +1503,27 @@ class NodeChecker(): (SELECT relname, oid, relfilenode FROM pg_class WHERE oid >= 16384) t;") # Fetch all user tables from pg_class for the database - pg_class_output = json.loads(self._check_output(pg_class_cmd).strip()) + pg_class_output = self._check_output(pg_class_cmd).strip() + + if check_for_errors(pg_class_output): + metric.add_value(0) + 
return e.fill_and_return_entry( + ["Failed to retrieve pg_class info: {}".format(pg_class_output)], + has_error=True, + metrics=[metric]) + try: + pg_class_json = json.loads(pg_class_output) + except Exception as ex: + logging.warning("pg_class query returned invalid json: %s", + pg_class_output) + metric.add_value(0) + return e.fill_and_return_entry( + ['pg_class query returned invalid json'], + has_error=True, + metrics=[metric]) pg_class_oid_tableinfo_dict = {} # Use relfilenode if it exists (as the table may be rewritten) - for table in pg_class_output: + for table in pg_class_json: if table['relfilenode'] != '0': pg_class_oid_tableinfo_dict[table['relfilenode']] = table else: @@ -1493,9 +1532,28 @@ class NodeChecker(): pg_attribute_cmd = "{}{} -t -c \"{}\"".format(ysqlsh_cmd, dbname, "SELECT json_agg(row_to_json(t)) FROM \ (SELECT attname, attrelid FROM pg_attribute WHERE attrelid >= 16384) t;") - pg_attribute_output = json.loads(self._check_output(pg_attribute_cmd).strip()) + + pg_attribute_output = self._check_output(pg_attribute_cmd).strip() + + if check_for_errors(pg_attribute_output): + metric.add_value(0) + return e.fill_and_return_entry( + ["Failed to retrieve pg_attribute info: {}".format(pg_attribute_output)], + has_error=True, + metrics=[metric]) + + try: + pg_attribute_json = json.loads(pg_attribute_output) + except Exception as ex: + logging.warning("pg_attribute query returned invalid json: %s", + pg_attribute_output) + metric.add_value(0) + return e.fill_and_return_entry( + ['pg_attribute query returned invalid json'], + has_error=True, + metrics=[metric]) pg_attribute_attrelid_attnames_dict = defaultdict(list) - for attribute in pg_attribute_output: + for attribute in pg_attribute_json: (pg_attribute_attrelid_attnames_dict[attribute['attrelid']] .append(attribute['attname'])) @@ -1523,8 +1581,25 @@ class NodeChecker(): continue # Get columns - table_schema_json = json.loads(self.http_request( - 
"{}/api/v1/table?id={}".format(self.master_leader_url, tableid))) + table_schema_response = self.http_request( + "{}/api/v1/table?id={}".format(self.master_leader_url, tableid)) + if not table_schema_response: + metric.add_value(0) + return e.fill_and_return_entry( + ['Master Leader HTTP endpoint is not running'], + has_error=True, + metrics=[metric]) + + try: + table_schema_json = json.loads(table_schema_response) + except Exception as ex: + logging.warning("Table HTTP API response is not a valid json: %s", + table_schema_response) + metric.add_value(0) + return e.fill_and_return_entry( + ['Table HTTP API response is not a valid json'], + has_error=True, + metrics=[metric]) columns = [html.unescape( column['column']) for column in table_schema_json["columns"]] # Check if each column exists in pg_attribute @@ -1538,8 +1613,12 @@ class NodeChecker(): .format(column, tablename, dbname)) continue except Exception as ex: + logging.exception('Got exception on while performing DDL Atomicity check') metric.add_value(0) - return e.fill_and_return_entry([str(ex)], has_error=True, metrics=[metric]) + return e.fill_and_return_entry( + ["Unexpected error occurred"], + has_error=True, + metrics=[metric]) has_errors = len(errors) > 0 if has_errors: @@ -1558,7 +1637,7 @@ class NodeChecker(): logging.info("OpenSSL installed state for node %s: %s", self.node, output) return {"ssl_installed:" + self.node: (output == "0") - if not has_errors(output) else None} + if not check_for_errors(output) else None} def check_yb_controller_availability(self): controller_cli = '{}/bin/yb-controller-cli'.format(self.yb_controller_dir())