Skip to content

Commit

Permalink
[Backport 2.20][PLAT-15326] Proper error handling in DDL atomicity check
Browse files Browse the repository at this point in the history
Summary:
Currently, various errors are not handled properly in the DDL atomicity check.
This change fixes that.

Original diff: https://phorge.dev.yugabyte.com/D38142

Test Plan:
Install 2.20.0.0_b50 universe.
Make sure DDL atomicity check passes.
Force DDL atomicity errors. Force check.
Make sure the check shows valid errors.
Stop TServer on master leader node.
Make sure check shows 'TServer is not running on this node' error
Stop 2 masters.
Make sure check shows 'Master Leader HTTP endpoint is not running' error

Reviewers: vbansal, skurapati

Reviewed By: vbansal

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D38153
  • Loading branch information
anmalysh-yb committed Sep 18, 2024
1 parent 4017745 commit 776e8ea
Showing 1 changed file with 99 additions and 20 deletions.
119 changes: 99 additions & 20 deletions managed/src/main/resources/health/node_health.py.template
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ def wrap_command(command_str):
return "timeout {} bash -c 'set -o pipefail; {}'".format(CMD_TIMEOUT_SEC, command_str)


def has_errors(str):
def check_for_errors(s):
    """Return True if *s* is a non-None string that starts with 'Error'.

    Used throughout this script to detect the error-reporting convention
    of the command/output helpers, which prefix failures with 'Error'.
    None (and any non-error string) yields False.
    """
    # Parameter renamed from 'str' to 's' to avoid shadowing the builtin.
    return s is not None and s.startswith('Error')


Expand Down Expand Up @@ -558,7 +558,7 @@ class NodeChecker():
found_error = False
output = self.get_disk_utilization()
msgs = []
if has_errors(output):
if check_for_errors(output):
return e.fill_and_return_entry([output], has_error=True)

# Do not process the headers.
Expand Down Expand Up @@ -636,7 +636,7 @@ class NodeChecker():
return e.fill_and_return_warning_entry(["OpenSSL is not installed, skipped"])

output = self.get_certificate_expiration_date(cert_path)
if has_errors(output):
if check_for_errors(output):
return e.fill_and_return_entry([output], has_error=True)

if output == 'File not found':
Expand Down Expand Up @@ -798,7 +798,7 @@ class NodeChecker():
cmd = 'curl -s -L --insecure {}'.format(endpoint)

output = self._check_output(cmd).strip()
if has_errors(output):
if check_for_errors(output):
logging.info("HTTP request to {} returned error: {}".format(endpoint, output))
return None
return output
Expand Down Expand Up @@ -882,7 +882,7 @@ class NodeChecker():
'-mmin -{}'.format(FATAL_TIME_THRESHOLD_MINUTES),
log_severity)
output = self._check_output(cmd)
if has_errors(output):
if check_for_errors(output):
return e.fill_and_return_entry([output], has_error=True)

log_files = self.check_logs_find_output(output)
Expand Down Expand Up @@ -917,7 +917,7 @@ class NodeChecker():
yb_cores_dir, yb_cores_dir,
'-mmin -{}'.format(FATAL_TIME_THRESHOLD_MINUTES))
output = self._check_output(cmd)
if has_errors(output):
if check_for_errors(output):
return e.fill_and_return_entry([output], has_error=True)

files = []
Expand Down Expand Up @@ -986,11 +986,11 @@ class NodeChecker():
else:
e = self._new_metric_entry("Uptime", process)
uptime = self.get_uptime_for_process(process)
if has_errors(uptime):
if check_for_errors(uptime):
return e.fill_and_return_entry([uptime], has_error=True)

boot_time = self.get_boot_time_for_process(process)
if has_errors(boot_time):
if check_for_errors(boot_time):
return e.fill_and_return_entry([boot_time], has_error=True)

if uptime is None or not uptime:
Expand Down Expand Up @@ -1155,7 +1155,7 @@ class NodeChecker():
logging.info("Checking node exporter on node {}".format(self.node))
e = self._new_metric_entry("Node exporter")
output = self.get_command_for_process("node_exporter")
if has_errors(output):
if check_for_errors(output):
return e.fill_and_return_entry([output], has_error=True)
metric = Metric.from_definition(YB_NODE_CUSTOM_NODE_METRICS)
running = ('node_exporter' in output)
Expand Down Expand Up @@ -1193,7 +1193,7 @@ class NodeChecker():
awk '\"'\"'{print $1}'\"'\"' /proc/sys/fs/file-nr"
output = self._check_output(cmd)

if has_errors(output):
if check_for_errors(output):
return e.fill_and_return_entry([output], has_error=True)

counts = output.split('\n')
Expand Down Expand Up @@ -1440,6 +1440,13 @@ class NodeChecker():
e = self._new_entry("DDL atomicity")
metric = Metric.from_definition(YB_DDL_ATOMICITY_CHECK)

tserver_pid = self.get_process_pid_by_name(TSERVER)
if tserver_pid is None:
metric.add_value(0)
return e.fill_and_return_entry(["TServer is not running on this node"],
has_error=True,
metrics=[metric])

try:
ysqlsh_cmd = self.create_ysqlsh_command("")
except RuntimeError as re:
Expand All @@ -1449,8 +1456,23 @@ class NodeChecker():
errors = []
try:
# Get table data
tables_output = (json.loads(self.http_request(
"{}/api/v1/tables".format(self.master_leader_url))))
tables_response = self.http_request(
"{}/api/v1/tables".format(self.master_leader_url))
if not tables_response:
metric.add_value(0)
return e.fill_and_return_entry(
['Master Leader HTTP endpoint is not running'],
has_error=True,
metrics=[metric])
try:
tables_output = json.loads(tables_response)
except Exception as ex:
logging.warning("Tables HTTP API response is not a valid json: %s", tables_response)
metric.add_value(0)
return e.fill_and_return_entry(
['Tables HTTP API response is not a valid json'],
has_error=True,
metrics=[metric])
table_data_json = tables_output["user"]
table_data_json += tables_output["index"]

Expand Down Expand Up @@ -1481,10 +1503,27 @@ class NodeChecker():
(SELECT relname, oid, relfilenode FROM pg_class WHERE oid >= 16384) t;")

# Fetch all user tables from pg_class for the database
pg_class_output = json.loads(self._check_output(pg_class_cmd).strip())
pg_class_output = self._check_output(pg_class_cmd).strip()

if check_for_errors(pg_class_output):
metric.add_value(0)
return e.fill_and_return_entry(
["Failed to retrieve pg_class info: {}".format(pg_class_output)],
has_error=True,
metrics=[metric])
try:
pg_class_json = json.loads(pg_class_output)
except Exception as ex:
logging.warning("pg_class query returned invalid json: %s",
pg_class_output)
metric.add_value(0)
return e.fill_and_return_entry(
['pg_class query returned invalid json'],
has_error=True,
metrics=[metric])
pg_class_oid_tableinfo_dict = {}
# Use relfilenode if it exists (as the table may be rewritten)
for table in pg_class_output:
for table in pg_class_json:
if table['relfilenode'] != '0':
pg_class_oid_tableinfo_dict[table['relfilenode']] = table
else:
Expand All @@ -1493,9 +1532,28 @@ class NodeChecker():
pg_attribute_cmd = "{}{} -t -c \"{}\"".format(ysqlsh_cmd, dbname,
"SELECT json_agg(row_to_json(t)) FROM \
(SELECT attname, attrelid FROM pg_attribute WHERE attrelid >= 16384) t;")
pg_attribute_output = json.loads(self._check_output(pg_attribute_cmd).strip())

pg_attribute_output = self._check_output(pg_attribute_cmd).strip()

if check_for_errors(pg_attribute_output):
metric.add_value(0)
return e.fill_and_return_entry(
["Failed to retrieve pg_attribute info: {}".format(pg_attribute_output)],
has_error=True,
metrics=[metric])

try:
pg_attribute_json = json.loads(pg_attribute_output)
except Exception as ex:
logging.warning("pg_attribute query returned invalid json: %s",
pg_attribute_output)
metric.add_value(0)
return e.fill_and_return_entry(
['pg_attribute query returned invalid json'],
has_error=True,
metrics=[metric])
pg_attribute_attrelid_attnames_dict = defaultdict(list)
for attribute in pg_attribute_output:
for attribute in pg_attribute_json:
(pg_attribute_attrelid_attnames_dict[attribute['attrelid']]
.append(attribute['attname']))

Expand Down Expand Up @@ -1523,8 +1581,25 @@ class NodeChecker():
continue

# Get columns
table_schema_json = json.loads(self.http_request(
"{}/api/v1/table?id={}".format(self.master_leader_url, tableid)))
table_schema_response = self.http_request(
"{}/api/v1/table?id={}".format(self.master_leader_url, tableid))
if not table_schema_response:
metric.add_value(0)
return e.fill_and_return_entry(
['Master Leader HTTP endpoint is not running'],
has_error=True,
metrics=[metric])

try:
table_schema_json = json.loads(table_schema_response)
except Exception as ex:
logging.warning("Table HTTP API response is not a valid json: %s",
table_schema_response)
metric.add_value(0)
return e.fill_and_return_entry(
['Table HTTP API response is not a valid json'],
has_error=True,
metrics=[metric])
columns = [html.unescape(
column['column']) for column in table_schema_json["columns"]]
# Check if each column exists in pg_attribute
Expand All @@ -1538,8 +1613,12 @@ class NodeChecker():
.format(column, tablename, dbname))
continue
except Exception as ex:
logging.exception('Got exception on while performing DDL Atomicity check')
metric.add_value(0)
return e.fill_and_return_entry([str(ex)], has_error=True, metrics=[metric])
return e.fill_and_return_entry(
["Unexpected error occurred"],
has_error=True,
metrics=[metric])

has_errors = len(errors) > 0
if has_errors:
Expand All @@ -1558,7 +1637,7 @@ class NodeChecker():
logging.info("OpenSSL installed state for node %s: %s", self.node, output)

return {"ssl_installed:" + self.node: (output == "0")
if not has_errors(output) else None}
if not check_for_errors(output) else None}

def check_yb_controller_availability(self):
controller_cli = '{}/bin/yb-controller-cli'.format(self.yb_controller_dir())
Expand Down

0 comments on commit 776e8ea

Please sign in to comment.