From af68b0a32afe3f035d3008ce4b4faa4ff98d2f58 Mon Sep 17 00:00:00 2001 From: Vitaliy Kukharik <37010174+vitabaks@users.noreply.github.com> Date: Tue, 12 Nov 2024 12:04:33 +0300 Subject: [PATCH] pgBackRest: Point-In-Time Recovery (PITR) Improvements (#765) --- automation/inventory | 4 - automation/roles/patroni/tasks/main.yml | 121 ++++++++++++++++++------ automation/vars/main.yml | 5 +- 3 files changed, 96 insertions(+), 34 deletions(-) diff --git a/automation/inventory b/automation/inventory index 9f7628657..17bbf3980 100644 --- a/automation/inventory +++ b/automation/inventory @@ -61,7 +61,3 @@ ansible_ssh_port='22' #ansible_ssh_pass='secretpassword' # "sshpass" package is required for use "ansible_ssh_pass" #ansible_ssh_private_key_file= #ansible_python_interpreter='/usr/bin/python3' - -[pgbackrest:vars] -#ansible_user='postgres' -#ansible_ssh_pass='secretpassword' diff --git a/automation/roles/patroni/tasks/main.yml b/automation/roles/patroni/tasks/main.yml index 123945fea..709cf5ee4 100644 --- a/automation/roles/patroni/tasks/main.yml +++ b/automation/roles/patroni/tasks/main.yml @@ -507,7 +507,7 @@ - name: Prepare PostgreSQL | start PostgreSQL on Master become: true become_user: postgres - ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl start -D {{ postgresql_data_dir }} -w -t 1800" + ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl start -D {{ postgresql_data_dir }} -w -t {{ pg_ctl_timeout | default(3600) }}" when: pg_ctl_status_result.rc == 3 - name: Prepare PostgreSQL | check PostgreSQL is accepting connections @@ -584,7 +584,7 @@ - name: Prepare PostgreSQL | stop PostgreSQL (will be managed by patroni) become: true become_user: postgres - ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t 1800" + ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t {{ pg_ctl_timeout | default(3600) }}" when: checkpoint_result.rc is defined and checkpoint_result.rc == 0 - name: Prepare PostgreSQL | check PostgreSQL is stopped @@ -598,7 +598,7 @@ tags: patroni, patroni_start_master - block: # PITR (custom bootstrap) - # Prepare (install pexpect, ruamel.yaml) + # Prepare (install pexpect, ruamel.yaml) - name: Prepare | Make sure the ansible required python library is exist ansible.builtin.pip: name: "{{ item }}" @@ -612,7 +612,8 @@ environment: PATH: "{{ ansible_env.PATH }}:/usr/local/bin:/usr/bin" PIP_BREAK_SYSTEM_PACKAGES: "1" - # Run PITR + + # Run PITR - name: Stop patroni service on the Replica servers (if running) ansible.builtin.systemd: name: patroni @@ -625,6 +626,21 @@ state: stopped when: is_master | bool + - name: Check that PostgreSQL is stopped + become: true + become_user: postgres + ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl status -D {{ postgresql_data_dir }}" + register: pg_ctl_status_result + changed_when: false + failed_when: false + + - name: Stop PostgreSQL + become: true + become_user: postgres + ansible.builtin.command: >- + {{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t {{ pg_ctl_timeout | default(3600) }} + when: pg_ctl_status_result.rc is defined and (pg_ctl_status_result.rc != 3 and pg_ctl_status_result.rc != 4) + - name: Remove patroni cluster "{{ patroni_cluster_name }}" from DCS (if exist) become: true become_user: postgres @@ -648,7 +664,7 @@ ansible.builtin.command: > {{ pgbackrest_patroni_cluster_restore_command }} {{ '--target-action=promote' if pgbackrest_patroni_cluster_restore_command is search('--type=') else '' }} - async: 86400 # timeout 24 hours + async: "{{ cluster_restore_timeout | default(86400) }}" # timeout 24 hours poll: 0 register: pgbackrest_restore_master when: is_master | bool @@ -658,7 +674,7 @@ ansible.builtin.command: > {{ pgbackrest_patroni_cluster_restore_command }} {{ '--target-action=shutdown' if pgbackrest_patroni_cluster_restore_command is search('--type=') else '' }} - async: 86400 # timeout 24 hours + async: "{{ cluster_restore_timeout | default(86400) }}" # timeout 24 hours poll: 0 register: pgbackrest_restore_replica when: not is_master | bool and 'pgbackrest' in patroni_create_replica_methods @@ -673,7 +689,7 @@ label: "{{ item.changed }}" register: pgbackrest_restore_jobs_result until: pgbackrest_restore_jobs_result.finished - retries: 2880 # timeout 24 hours + retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours delay: 30 when: item.ansible_job_id is defined @@ -685,20 +701,52 @@ when: not keep_patroni_dynamic_json|bool - name: Start PostgreSQL for Recovery - ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl start -D {{ postgresql_data_dir }} -o '-c hot_standby=off' -w -t 1800" + ansible.builtin.command: >- + {{ postgresql_bin_dir }}/pg_ctl start -D {{ postgresql_data_dir }} -w -t {{ pg_ctl_timeout | default(3600) }} + -o '--config-file={{ postgresql_conf_dir }}/postgresql.conf' + -o '-c hot_standby=off' + {% if postgresql_version | int >= 12 %} + -o '-c restore_command="pgbackrest --stanza={{ pgbackrest_stanza }} archive-get %f %p"' + {% endif %} + -o '-c archive_command=/bin/true' + -l /tmp/pg_recovery_{{ ansible_date_time.date }}.log + async: "{{ pg_ctl_timeout | default(3600) }}" # run the command asynchronously + poll: 0 + register: pg_ctl_start_result when: is_master | bool or (not is_master | bool and 'pgbackrest' in patroni_create_replica_methods) - - name: Waiting for PostgreSQL Recovery to complete (WAL apply) + - name: Wait for the PostgreSQL start command to complete + ansible.builtin.async_status: + jid: "{{ pg_ctl_start_result.ansible_job_id }}" + register: pg_ctl_start_job_result + until: pg_ctl_start_job_result.finished + retries: "{{ (pg_ctl_timeout | default(3600) | int) // 10 }}" + delay: 10 + when: is_master | bool or (not is_master | bool and 'pgbackrest' in patroni_create_replica_methods) + + - name: Wait for PostgreSQL recovery to complete (WAL apply) ansible.builtin.command: >- - {{ postgresql_bin_dir }}/psql -p {{ postgresql_port }} -U {{ patroni_superuser_username }} -d postgres -tAXc - "select pg_is_in_recovery()" + {{ postgresql_bin_dir }}/psql -p {{ postgresql_port }} -U {{ patroni_superuser_username }} -d postgres + -tAXc "select pg_is_in_recovery()" register: pg_is_in_recovery - until: pg_is_in_recovery.stdout != "t" - retries: 1200 # timeout 10 hours + until: pg_is_in_recovery.stdout == "f" + retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours delay: 30 changed_when: false failed_when: false - when: is_master | bool or (not is_master | bool and 'pgbackrest' in patroni_create_replica_methods) + when: is_master | bool + + - name: Check PostgreSQL recovery log + ansible.builtin.command: "grep -A2 'recovery stopping' /tmp/pg_recovery_{{ ansible_date_time.date }}.log" + register: pg_recovery_result + changed_when: false + failed_when: false + when: is_master | bool + + - name: PostgreSQL recovery details + ansible.builtin.debug: + msg: '{{ pg_recovery_result.stdout_lines }}' + when: pg_recovery_result.stdout_lines is defined - name: Check that PostgreSQL is stopped ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl status -D {{ postgresql_data_dir }}" @@ -707,7 +755,8 @@ failed_when: false - name: Stop PostgreSQL - ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t 1800" + ansible.builtin.command: >- + {{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t {{ pg_ctl_timeout | default(3600) }} when: pg_ctl_status_result.rc is defined and (pg_ctl_status_result.rc != 3 and pg_ctl_status_result.rc != 4) when: patroni_cluster_bootstrap_method == "pgbackrest" become: true @@ -718,17 +767,20 @@ tags: patroni, point_in_time_recovery - block: # PITR (custom bootstrap) - disable archive_command - - name: Check the patroni.dynamic.json exists + - name: Check if patroni.dynamic.json exists ansible.builtin.stat: path: "{{ postgresql_data_dir }}/patroni.dynamic.json" register: patroni_dynamic_json + when: not keep_patroni_dynamic_json | bool - name: Remove patroni.dynamic.json file ansible.builtin.file: path: "{{ postgresql_data_dir }}/patroni.dynamic.json" state: absent - when: patroni_dynamic_json.stat.exists and - not keep_patroni_dynamic_json|bool + when: + - patroni_dynamic_json is defined + - patroni_dynamic_json.stat is defined + - patroni_dynamic_json.stat.exists - name: Edit patroni.dynamic.json | disable archive_command (if enabled) yedit: @@ -736,18 +788,17 @@ key: postgresql.parameters.archive_command value: "cd ." # not doing anything yet with WAL-s content_type: json - when: patroni_dynamic_json.stat.exists and - keep_patroni_dynamic_json|bool and disable_archive_command|bool + when: disable_archive_command | bool - name: Edit patroni.yml | disable archive_command (if enabled) yedit: src: /etc/patroni/patroni.yml key: bootstrap.dcs.postgresql.parameters.archive_command value: "cd ." # not doing anything yet with WAL-s - when: disable_archive_command|bool + when: disable_archive_command | bool when: patroni_cluster_bootstrap_method != "initdb" and - (pgbackrest_install|bool or wal_g_install|bool) and - (existing_pgcluster is not defined or not existing_pgcluster|bool) + (pgbackrest_install | bool or wal_g_install | bool) and + (existing_pgcluster is not defined or not existing_pgcluster | bool) become: true become_user: postgres tags: patroni, point_in_time_recovery @@ -791,13 +842,27 @@ "select pg_is_in_recovery()" register: pg_is_in_recovery until: pg_is_in_recovery.stdout == "f" - retries: 1200 # timeout 10 hours + retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours delay: 30 changed_when: false failed_when: false when: patroni_cluster_bootstrap_method == "wal-g" - - name: Check PostgreSQL is started and accepting connections on Master + - name: Wait for the Standby cluster initialization to complete + ansible.builtin.uri: + url: "http://{{ inventory_hostname }}:{{ patroni_restapi_port }}/standby-leader" + status_code: 200 + register: standby_leader_result + until: standby_leader_result.status == 200 + retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours + delay: 30 + environment: + no_proxy: "{{ inventory_hostname }}" + when: + - (patroni_standby_cluster.host is defined and patroni_standby_cluster.host | length > 0) + - not ansible_check_mode + + - name: Check PostgreSQL is started and accepting connections become: true become_user: postgres ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_isready -p {{ postgresql_port }}" @@ -853,8 +918,8 @@ {{ postgresql_bin_dir }}/psql -p {{ postgresql_port }} -U {{ patroni_superuser_username }} -d postgres -tAXc "select pg_is_in_recovery()" register: pg_is_in_recovery - until: pg_is_in_recovery.stdout != "t" - retries: 1200 # timeout 10 hours + until: pg_is_in_recovery.stdout == "f" + retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours delay: 30 changed_when: false when: is_master | bool @@ -961,7 +1026,7 @@ status_code: 200 register: replica_result until: replica_result.status == 200 - retries: 1200 # timeout 10 hours + retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours delay: 30 environment: no_proxy: "{{ inventory_hostname }}" diff --git a/automation/vars/main.yml b/automation/vars/main.yml index 9646c72ca..e58b248b5 100644 --- a/automation/vars/main.yml +++ b/automation/vars/main.yml @@ -460,11 +460,11 @@ patroni_create_replica_methods: - basebackup pgbackrest: - - { option: "command", value: "/usr/bin/pgbackrest --stanza={{ pgbackrest_stanza }} --delta restore" } + - { option: "command", value: "{{ pgbackrest_patroni_cluster_restore_command }}" } - { option: "keep_data", value: "True" } - { option: "no_params", value: "True" } wal_g: - - { option: "command", value: "{{ wal_g_path }} backup-fetch {{ postgresql_data_dir }} LATEST" } + - { option: "command", value: "{{ wal_g_patroni_cluster_bootstrap_command }}" } - { option: "no_params", value: "True" } basebackup: - { option: "max-rate", value: "1000M" } @@ -645,6 +645,7 @@ pgbackrest_cron_jobs: # PITR mode (if patroni_cluster_bootstrap_method: "pgbackrest" or "wal-g"): # 1) The database cluster directory will be cleaned (for "wal-g") or overwritten (for "pgbackrest" --delta restore). # 2) And also the patroni cluster "{{ patroni_cluster_name }}" will be removed from the DCS (if exist) before recovery. +cluster_restore_timeout: 86400 # backup and WAL restore timeout in seconds (24 hours) disable_archive_command: true # or 'false' to not disable archive_command after restore keep_patroni_dynamic_json: true # or 'false' to remove patroni.dynamic.json after restore (if exists)