Skip to content

Commit

Permalink
Backups: Fail restore process on YSQL Dump apply error
Browse files Browse the repository at this point in the history
Summary:
1. Stop backup-restore process if YSQL Dump Apply (via `ysqlsh` tool) returns a error.

2. If an YSQLDump apply error happens - do clean-up on exit: `DROP DATABASE <target-db-name>`.

Added new `yb_backup.py` arguments:
`--ysql_ignore_restore_errors` - Do NOT use ON_ERROR_STOP mode when applying YSQL dumps.
`--ysql_disable_db_drop_on_restore_error` - Do NOT drop just created (or existing) YSQL DB if YSQL dump apply failed.

Global python variable in `yb_backup.py` `ENABLE_STOP_ON_YSQL_DUMP_RESTORE_ERROR` controls the feature by default (now `= False`).

Test Plan:
Jenkins: all tests

ybd  --cxx-test yb-backup-test --gtest_filter YBBackupTest.TestYSQLKeyspaceBackup
ybd  --cxx-test yb-backup-test --gtest_filter YBBackupTest.TestYSQLKeyspaceBackupWithDropTable
ybd  --cxx-test yb-backup-test --gtest_filter YBBackupTest.TestYSQLBackupWithDropYugabyteDB

Other backup tests

Reviewers: mihnea, tverona, yguan

Reviewed By: yguan

Subscribers: yql, yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D30253
  • Loading branch information
OlegLoginov committed Dec 11, 2023
1 parent b758786 commit b969e29
Showing 1 changed file with 79 additions and 6 deletions.
85 changes: 79 additions & 6 deletions managed/devops/bin/yb_backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@
STARTED_SNAPSHOT_CREATION_RE = re.compile(r'[\S\s]*Started snapshot creation: (?P<uuid>.*)')
YSQL_CATALOG_VERSION_RE = re.compile(r'[\S\s]*Version: (?P<version>.*)')

YSQL_SHELL_ERROR_RE = re.compile('.*ERROR.*')
ENABLE_STOP_ON_YSQL_DUMP_RESTORE_ERROR = False

ROCKSDB_PATH_PREFIX = '/yb-data/tserver/data/rocksdb'

SNAPSHOT_DIR_GLOB = '*/table-*/tablet-*.snapshots/*'
Expand Down Expand Up @@ -148,6 +151,11 @@ class YbAdminOpNotSupportedException(BackupException):
pass


class YsqlDumpApplyFailedException(BackupException):
"""Exception raised if 'ysqlsh' tool failed to apply the YSQL dump file."""
pass


def split_by_tab(line):
return [item.replace(' ', '') for item in line.split("\t")]

Expand Down Expand Up @@ -1208,8 +1216,14 @@ def parse_arguments(self):
'--remote_ysql_shell_binary', default=DEFAULT_REMOTE_YSQL_SHELL_PATH,
help="Path to the remote ysql shell binary")
parser.add_argument(
'--pg_based_backup', action='store_true', default=False, help="Use it to trigger "
"pg based backup.")
'--pg_based_backup', action='store_true', default=False,
help="Use it to trigger pg based backup.")
parser.add_argument(
'--ysql_ignore_restore_errors', action='store_true', default=False,
help="Do NOT use ON_ERROR_STOP mode when applying YSQL dumps in backup-restore.")
parser.add_argument(
'--ysql_disable_db_drop_on_restore_errors', action='store_true', default=False,
help="Do NOT drop just created (or existing) YSQL DB if YSQL dump apply failed.")
parser.add_argument(
'--disable_xxhash_checksum', action='store_true', default=False,
help="Disables xxhash algorithm for checksum computation.")
Expand Down Expand Up @@ -1525,7 +1539,7 @@ def post_process_arguments(self):
xxh64_bin = XXH64_X86_BIN
self.xxhash_checksum_path = os.path.join(xxhash_tool_path, xxh64_bin)
except Exception:
logging.warn("[app] xxhsum tool missing on the host, continuing with sha256")
logging.warning("[app] xxhsum tool missing on the host, continuing with sha256")
else:
raise BackupException("No Live TServer exists. "
"Check the TServer nodes status & try again.")
Expand Down Expand Up @@ -3429,8 +3443,12 @@ def import_ysql_dump(self, dump_file_path):
"""
Import the YSQL dump using the provided file.
"""
new_db_name = None
if self.args.keyspace:
on_error_stop = (not self.args.ysql_ignore_restore_errors and
ENABLE_STOP_ON_YSQL_DUMP_RESTORE_ERROR)

old_db_name = None
if self.args.keyspace or on_error_stop:
# Get YSQL DB name from the dump file.
cmd = get_db_name_cmd(dump_file_path)

if self.args.local_yb_admin_binary:
Expand All @@ -3443,6 +3461,8 @@ def import_ysql_dump(self, dump_file_path):
# ssh_librabry(yugabyte-db/managed/devops/opscli/ybops/utils/ssh.py).
old_db_name = old_db_name.splitlines()[-1].strip()

new_db_name = None
if self.args.keyspace:
if old_db_name:
new_db_name = keyspace_name(self.args.keyspace[0])
if new_db_name == old_db_name:
Expand Down Expand Up @@ -3472,7 +3492,45 @@ def import_ysql_dump(self, dump_file_path):
else:
self.run_ssh_cmd(cmd, self.get_main_host_ip())

self.run_ysql_shell(['--echo-all', '--file=' + dump_file_path])
ysql_shell_args = ['--echo-all', '--file=' + dump_file_path]

if on_error_stop:
logging.info(
"[app] Enable ON_ERROR_STOP mode when applying '{}'".format(dump_file_path))
ysql_shell_args.append('--variable=ON_ERROR_STOP=1')
error_msg = None
try:
self.run_ysql_shell(ysql_shell_args)
except subprocess.CalledProcessError as ex:
output = str(ex.output.decode('utf-8', errors='replace')
.encode("ascii", "ignore")
.decode("ascii"))
errors = []
for line in output.splitlines():
if YSQL_SHELL_ERROR_RE.match(line):
errors.append(line)

error_msg = "Failed to apply YSQL dump '{}' with RC={}: {}".format(
dump_file_path, ex.returncode, '; '.join(errors))
except Exception as ex:
error_msg = "Failed to apply YSQL dump '{}': {}".format(dump_file_path, ex)

if error_msg:
logging.error("[app] {}".format(error_msg))
db_name_to_drop = new_db_name if new_db_name else old_db_name
if db_name_to_drop:
logging.info("[app] YSQL DB '{}' will be deleted at exit...".format(
db_name_to_drop))
atexit.register(self.delete_created_ysql_db, db_name_to_drop)
else:
logging.warning("[app] Skip new YSQL DB drop because YSQL DB name "
"was not found in file '{}'".format(dump_file_path))

raise YsqlDumpApplyFailedException(error_msg)
else:
logging.warning("[app] Ignoring YSQL errors when applying '{}'".format(dump_file_path))
self.run_ysql_shell(ysql_shell_args)

return new_db_name

def import_snapshot(self, metadata_file_path):
Expand Down Expand Up @@ -3872,6 +3930,21 @@ def delete_created_snapshot(self, snapshot_id):

return self.run_yb_admin(['delete_snapshot', snapshot_id])

def delete_created_ysql_db(self, db_name):
"""
Callback run on exit to delete incomplete/partly created YSQL DB.
"""
if db_name:
if self.args.ysql_disable_db_drop_on_restore_errors:
logging.warning("[app] Skip YSQL DB '{}' drop because "
"the clean-up is disabled".format(db_name))
else:
ysql_shell_args = ['-c', 'DROP DATABASE {};'.format(db_name)]
logging.info("[app] Do clean-up due to previous error: "
"'{}'".format(' '.join(ysql_shell_args)))
output = self.run_ysql_shell(ysql_shell_args)
logging.info("YSQL DB drop result: {}".format(output.replace('\n', ' ')))

def TEST_yb_admin_unsupported_commands(self):
try:
self.run_yb_admin(["fake_command"])
Expand Down

0 comments on commit b969e29

Please sign in to comment.