diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ee378ab0..2d6e3ec0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -77,7 +77,7 @@ jobs: wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - sudo apt-get update # Setup build deps - sudo apt-get install -y libsnappy-dev postgresql-10 postgresql-11 postgresql-12 postgresql-13 postgresql-14 postgresql-15 postgresql-16 + sudo apt-get install -y libsnappy-dev postgresql-12 postgresql-13 postgresql-14 postgresql-15 postgresql-16 postgresql-17 # Setup common python dependencies python -m pip install --upgrade pip pip install . diff --git a/README.rst b/README.rst index 242244b5..565e80fe 100644 --- a/README.rst +++ b/README.rst @@ -133,7 +133,7 @@ python 3.10, 3.11 and 3.12 virtual environments and installations of postgresql By default vagrant up will start a Virtualbox environment. The Vagrantfile will also work for libvirt, just prefix ``VAGRANT_DEFAULT_PROVIDER=libvirt`` to the ``vagrant up`` command. -Any combination of Python (3.10, 3.11 and 3.12) and Postgresql (12, 13, 14, 15 and 16) +Any combination of Python (3.10, 3.11 and 3.12) and Postgresql (12, 13, 14, 15, 16 and 17) Bring up vagrant instance and connect via ssh:: diff --git a/Vagrantfile b/Vagrantfile index 489c428d..82cb4c90 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -39,7 +39,7 @@ Vagrant.configure("2") do |config| sed -i "s/^#create_main_cluster.*/create_main_cluster=false/g" /etc/postgresql-common/createcluster.conf apt-get install -y python{3.10,3.11,3.12} python{3.10,3.11,3.12}-dev python{3.10,3.11,3.12}-venv - apt-get install -y postgresql-{12,13,14,15,16} postgresql-server-dev-{12,13,14,15,16} + apt-get install -y postgresql-{12,13,14,15,16,17} postgresql-server-dev-{12,13,14,15,16,17} username="$(< /dev/urandom tr -dc a-z | head -c${1:-32};echo;)" password=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-32};echo;) diff --git a/docs/development.rst b/docs/development.rst index 576e01ec..273a62d7 100644 --- a/docs/development.rst +++ b/docs/development.rst @@ -54,12 +54,12 @@ Vagrant ======= The Vagrantfile can be used to setup a vagrant development environment. The vagrant environment has -python 3.10, 3.11 and 3.12 virtual environments and installations of postgresql 12, 13, 14, 15 and 16. +python 3.10, 3.11 and 3.12 virtual environments and installations of postgresql 12, 13, 14, 15, 16 and 17. By default vagrant up will start a Virtualbox environment. The Vagrantfile will also work for libvirt, just prefix ``VAGRANT_DEFAULT_PROVIDER=libvirt`` to the ``vagrant up`` command. -Any combination of Python (3.10, 3.11 and 3.12) and Postgresql (12, 13, 14, 15 and 16) +Any combination of Python (3.10, 3.11 and 3.12) and Postgresql (12, 13, 14, 15, 16 and 17) Bring up vagrant instance and connect via ssh:: diff --git a/pghoard/basebackup/base.py b/pghoard/basebackup/base.py index 260bd850..91c639b3 100644 --- a/pghoard/basebackup/base.py +++ b/pghoard/basebackup/base.py @@ -16,6 +16,7 @@ import tarfile import time from contextlib import suppress +from dataclasses import dataclass from pathlib import Path from tempfile import NamedTemporaryFile from typing import Any, Dict, Optional, Tuple @@ -52,6 +53,13 @@ ] +@dataclass +class BackupStopResult: + lsn: str + backup_label: str + backup_end_time: datetime.datetime + + class PGBaseBackup(PGHoardThread): def __init__( self, @@ -670,11 +678,7 @@ def run_local_tar_basebackup(self, delta: bool = False, with_delta_stats: bool = with open(os.path.join(pgdata, "global", "pg_control"), "rb") as fp: pg_control = fp.read() - if self.pg_version_server >= 150000: - cursor.execute("SELECT labelfile FROM pg_backup_stop()") - else: - cursor.execute("SELECT labelfile FROM pg_stop_backup(false)") - backup_label = cursor.fetchone()[0] # type: ignore + backup_stop_result = self._stop_backup(cursor) db_conn.commit() backup_stopped = True @@ -690,13 +694,13 @@ def run_local_tar_basebackup(self, delta: bool = False, with_delta_stats: bool = finally: db_conn.rollback() if not backup_stopped: - self._stop_backup(cursor) + _ = self._stop_backup(cursor) db_conn.commit() - assert backup_label - backup_label_data = backup_label.encode("utf-8") + assert backup_stop_result + backup_label_data = backup_stop_result.backup_label.encode("utf-8") backup_start_wal_segment, backup_start_time = self.parse_backup_label(backup_label_data) - backup_end_wal_segment, backup_end_time = self.get_backup_end_segment_and_time(db_conn) + backup_end_wal_segment, backup_end_time = self.get_backup_end_segment_and_time(backup_stop_result, db_conn) # Generate and upload the metadata chunk metadata = { @@ -769,7 +773,7 @@ def find_and_split_files_to_backup(self, *, pgdata, tablespaces, target_chunk_si chunks.append(one_chunk_files) return total_file_count, chunks - def get_backup_end_segment_and_time(self, db_conn): + def get_backup_end_segment_and_time(self, backup_stop_result: BackupStopResult, db_conn): """Grab a timestamp and WAL segment name after the end of the backup: this is a point in time to which we must be able to recover to, and the last WAL segment that is required for the backup to be consistent. @@ -792,6 +796,23 @@ def get_backup_end_segment_and_time(self, db_conn): db_conn.commit() return None, backup_end_time + if self.pg_version_server >= 170000: + # In PostgreSQL versions prior to 17, the `pg_walfile_name` function could return the previous WAL file + # if the provided LSN fell at the boundary between WAL segments (e.g., at the start of the next WAL file). + # This could lead to inconsistencies in the backup metadata, as the LSN obtained from `pg_backup_stop` + # might not correspond to the actual WAL file that contains the backup's end LSN. + # + # To handle this, we had to fetch the current LSN after stopping the backup and use it to determine the + # correct WAL file name, ensuring the backup's end segment was accurately identified. + # + # However, in PostgreSQL 17 and later, this issue is resolved. `pg_walfile_name` now always returns + # the current WAL file, regardless of where the LSN falls within the segment. As a result, the extra step + # of manually fetching the LSN after `pg_backup_stop` is no longer necessary— the output from + # `pg_backup_stop` is now accurate and can be relied upon directly. + + cursor.execute(f"SELECT pg_walfile_name('{backup_stop_result.lsn}')") + return cursor.fetchone()[0], backup_stop_result.backup_end_time + if self.pg_version_server >= 100000: cursor.execute("SELECT pg_walfile_name(pg_current_wal_lsn()), txid_current()") else: @@ -820,8 +841,12 @@ def _start_backup(self, cursor: psycopg2.extensions.cursor, basebackup_name: str else: cursor.execute("SELECT pg_start_backup(%s, true, false)", [basebackup_name]) - def _stop_backup(self, cursor: psycopg2.extensions.cursor) -> None: - if self.pg_version_server >= 150000: - cursor.execute("SELECT pg_backup_stop()") - else: - cursor.execute("SELECT pg_stop_backup(false)") + def _stop_backup(self, cursor: psycopg2.extensions.cursor) -> BackupStopResult: + stop_backup_func = "pg_backup_stop()" if self.pg_version_server >= 150000 else "pg_stop_backup(false)" + cursor.execute(f"SELECT lsn, labelfile, now() FROM {stop_backup_func}") + result = cursor.fetchone() + + if not isinstance(result, tuple): # mypy: help + raise ValueError("No rows returned after trying to stop backup.") + + return BackupStopResult(*result) diff --git a/pghoard/config.py b/pghoard/config.py index 058b03c8..a79a6484 100644 --- a/pghoard/config.py +++ b/pghoard/config.py @@ -17,7 +17,7 @@ from pghoard.common import (extract_pg_command_version_string, pg_major_version, pg_version_string_to_number) from pghoard.postgres_command import PGHOARD_HOST, PGHOARD_PORT -SUPPORTED_VERSIONS = ["16", "15", "14", "13", "12", "11", "10", "9.6", "9.5", "9.4", "9.3"] +SUPPORTED_VERSIONS = ["17", "16", "15", "14", "13", "12", "11", "10", "9.6", "9.5", "9.4", "9.3"] def get_cpu_count(): diff --git a/pghoard/wal.py b/pghoard/wal.py index 58f55ede..8e850a80 100644 --- a/pghoard/wal.py +++ b/pghoard/wal.py @@ -32,6 +32,7 @@ 0xD10D: 140000, 0xD110: 150000, 0xD113: 160000, + 0xD116: 170000, } WAL_MAGIC_BY_VERSION = {value: key for key, value in WAL_MAGIC.items()} diff --git a/test/basebackup/test_basebackup.py b/test/basebackup/test_basebackup.py index 961e524d..93c458b4 100644 --- a/test/basebackup/test_basebackup.py +++ b/test/basebackup/test_basebackup.py @@ -231,6 +231,72 @@ def test_find_and_split_files_to_backup(self, tmpdir): assert chunk3[2] == "pgdata/split_top/split_sub/f2" assert chunk3[3] == "pgdata/split_top/split_sub/f3" + def test_backup_end_segment_and_time(self, db, pg_version: str): + conn = db.connect() + conn.autocommit = True + cursor = conn.cursor() + + # Force a WAL switch to guarantee that the next LSN is at the start of a new WAL segment + cursor.execute("SELECT pg_switch_wal();") + + # Generate enough activity to ensure we approach a segment boundary (16 MB) + cursor.execute("DROP TABLE IF EXISTS wal_test;") + cursor.execute("CREATE TABLE wal_test (id serial, data text);") + for _ in range(1024): + cursor.execute("INSERT INTO wal_test (data) SELECT 'x' || repeat('y', 1024) FROM generate_series(1, 1024);") + cursor.execute("CHECKPOINT;") + cursor.execute("SELECT pg_switch_wal();") + + # basic PGBaseBackup, just use backup_end_segment_and_time for noticing discrepancies among versions + # when LSN falls on a boundary + + pgb = PGBaseBackup( + config=SIMPLE_BACKUP_CONFIG, + site="foosite", + connection_info=None, + basebackup_path=None, + compression_queue=None, + storage=None, + transfer_queue=None, + metrics=metrics.Metrics(statsd={}), + pg_version_server=int(pg_version) * 10000, + ) + + basebackup_name = "test_backup" + + # Manually start and stop backup (could be better, but just need to test basic behavior) + # pylint: disable=protected-access + pgb._start_backup(cursor, basebackup_name) + stop_backup_result = pgb._stop_backup(cursor) + + expected_segment = self._calculate_wal_segment(lsn=stop_backup_result.lsn) + + # for >=PG17 must rely on stop_backup_result LSN, not on current_lsn. For demo this, + # will change pg_version_server to PG16 and show what will happen if fix in pg_walfile_name is not considered + if int(pg_version) >= 17: + pgb.pg_version_server = 160000 + result_end_segment, _ = pgb.get_backup_end_segment_and_time(stop_backup_result, conn) + assert result_end_segment != expected_segment # relying on steps for str: + # Extract XLog ID and Offset from LSN (format: XLogID/Offset) + xlog_id, xlog_offset = map(lambda x: int(x, 16), lsn.split("/")) + + # Calculate the WAL segment number + segment_number = (xlog_id << 32 | xlog_offset) // 0x01000000 + + # Format the segment number into XLog ID and Offset + xlog_id_formatted = f"{segment_number >> 32:08X}" + xlog_offset_formatted = f"{segment_number & 0xFFFFFFFF:08X}" + + return f"{timeline}{xlog_id_formatted}{xlog_offset_formatted}" + def _test_create_basebackup(self, capsys, db, pghoard, mode, replica=False, active_backup_mode="archive_command"): pghoard.create_backup_site_paths(pghoard.test_site) basebackup_path = os.path.join(pghoard.config["backup_location"], pghoard.test_site, "basebackup") diff --git a/test/conftest.py b/test/conftest.py index cabeed52..a98ccece 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -35,7 +35,7 @@ logutil.configure_logging() -DEFAULT_PG_VERSIONS = ["16", "15", "14", "13", "12"] +DEFAULT_PG_VERSIONS = ["17", "16", "15", "14", "13", "12"] def port_is_listening(hostname: str, port: int, timeout: float = 0.5) -> bool: diff --git a/test/test_webserver.py b/test/test_webserver.py index 6825e8de..7c7f0976 100644 --- a/test/test_webserver.py +++ b/test/test_webserver.py @@ -242,7 +242,7 @@ def _switch_wal(self, db, count): conn.close() return start_wal, end_wal - def test_archive_sync(self, db, pghoard): + def test_archive_sync(self, db, pghoard, pg_version: str): log = logging.getLogger("test_archive_sync") store = pghoard.transfer_agents[0].get_object_storage(pghoard.test_site) @@ -273,7 +273,11 @@ def list_archive(folder): # cluster between all tests) pg_wal_dir = get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site]) pg_wals = {f for f in os.listdir(pg_wal_dir) if wal.WAL_RE.match(f) and f > start_wal} - assert len(pg_wals) >= 4 + + # consider changes in pg_walfile_name, as pg_walfile_name(pg_current_wal_lsn()) might return + # previous walfile name and not current. + expected_min_wals = 4 if int(pg_version) < 17 else 3 + assert len(pg_wals) >= expected_min_wals # create a couple of "recycled" xlog files that we must ignore last_wal = sorted(pg_wals)[-1] @@ -291,7 +295,7 @@ def write_dummy_wal(inc): # check what we have archived, there should be at least the three # above WALs that are NOT there at the moment archived_wals = set(list_archive("xlog")) - assert len(pg_wals - archived_wals) >= 4 + assert len(pg_wals - archived_wals) >= expected_min_wals # now perform an archive sync arsy = ArchiveSync() arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path]) @@ -329,7 +333,7 @@ def write_dummy_wal(inc): "restore_command = 'false'", ] if Version(db.pgver).major >= 12: - with open(os.path.join(db.pgdata, "standby.signal"), "w") as fp: + with open(os.path.join(db.pgdata, "standby.signal"), "w"): pass recovery_conf_path = "postgresql.auto.conf" @@ -339,7 +343,7 @@ def write_dummy_wal(inc): recovery_conf_path = "recovery.conf" open_mode = "w" - with open(os.path.join(db.pgdata, recovery_conf_path), open_mode) as fp: + with open(os.path.join(db.pgdata, recovery_conf_path), open_mode) as fp: # type: ignore fp.write("\n".join(recovery_conf) + "\n") # start PG and promote it