Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pgBackRest: Point-In-Time Recovery (PITR) Improvements #765

Merged
merged 29 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
53f68c6
Add cluster restore timeout
vitabaks Sep 20, 2024
9770ee0
Add pg_ctl_timeout variable
vitabaks Sep 20, 2024
b9a64c4
Add Standby cluster initialization check
vitabaks Sep 20, 2024
6b3a050
cluster_restore_timeout
vitabaks Sep 23, 2024
59df9d0
remove duplicate pgbackrest vars from the inventory
vitabaks Sep 30, 2024
dd786ee
update task "Start PostgreSQL for Recovery"
vitabaks Sep 30, 2024
9d1079a
temporarily disable tests for Citus (repository problem: Error 402)
vitabaks Sep 30, 2024
c13fe1c
fix typo
vitabaks Sep 30, 2024
e53e0f2
enable citus tests
vitabaks Oct 1, 2024
f95cbfd
Update main.yml
vitabaks Oct 2, 2024
5bac0cd
Update main.yml
vitabaks Oct 2, 2024
a41ca81
formatting
vitabaks Oct 2, 2024
c1769f4
Check PostgreSQL is started
vitabaks Oct 2, 2024
e7524f0
Update main.yml
vitabaks Oct 2, 2024
54ea8b8
Update main.yml
vitabaks Oct 2, 2024
9deee34
Merge branch 'master' into restore-timeout
vitabaks Oct 11, 2024
c4a3089
Update main.yml
vitabaks Oct 21, 2024
372be89
Update pg_ctl start command
vitabaks Nov 1, 2024
4048b39
Check that PostgreSQL is stopped
vitabaks Nov 1, 2024
16eebaa
disable archive_command after recovery
vitabaks Nov 1, 2024
83518f8
fix typo
vitabaks Nov 1, 2024
d72b41e
add restore_command
vitabaks Nov 1, 2024
d5efcae
Update main.yml
vitabaks Nov 1, 2024
269be4a
Print recovery log
vitabaks Nov 1, 2024
9118c58
Update main.yml
vitabaks Nov 1, 2024
28efdc3
Update main.yml
vitabaks Nov 1, 2024
3540d14
update condition
vitabaks Nov 1, 2024
704119c
Update main.yml
vitabaks Nov 1, 2024
d4e0063
PostgreSQL recovery details
vitabaks Nov 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions automation/inventory
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,3 @@ ansible_ssh_port='22'
#ansible_ssh_pass='secretpassword' # the "sshpass" package is required to use "ansible_ssh_pass"
#ansible_ssh_private_key_file=
#ansible_python_interpreter='/usr/bin/python3'

[pgbackrest:vars]
#ansible_user='postgres'
#ansible_ssh_pass='secretpassword'
121 changes: 93 additions & 28 deletions automation/roles/patroni/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -507,7 +507,7 @@
- name: Prepare PostgreSQL | start PostgreSQL on Master
become: true
become_user: postgres
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl start -D {{ postgresql_data_dir }} -w -t 1800"
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl start -D {{ postgresql_data_dir }} -w -t {{ pg_ctl_timeout | default(3600) }}"
when: pg_ctl_status_result.rc == 3

- name: Prepare PostgreSQL | check PostgreSQL is accepting connections
Expand Down Expand Up @@ -584,7 +584,7 @@
- name: Prepare PostgreSQL | stop PostgreSQL (will be managed by patroni)
become: true
become_user: postgres
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t 1800"
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t {{ pg_ctl_timeout | default(3600) }}"
when: checkpoint_result.rc is defined and checkpoint_result.rc == 0

- name: Prepare PostgreSQL | check PostgreSQL is stopped
Expand All @@ -598,7 +598,7 @@
tags: patroni, patroni_start_master

- block: # PITR (custom bootstrap)
# Prepare (install pexpect, ruamel.yaml)
# Prepare (install pexpect, ruamel.yaml)
- name: Prepare | Make sure the ansible required python library is exist
ansible.builtin.pip:
name: "{{ item }}"
Expand All @@ -612,7 +612,8 @@
environment:
PATH: "{{ ansible_env.PATH }}:/usr/local/bin:/usr/bin"
PIP_BREAK_SYSTEM_PACKAGES: "1"
# Run PITR

# Run PITR
- name: Stop patroni service on the Replica servers (if running)
ansible.builtin.systemd:
name: patroni
Expand All @@ -625,6 +626,21 @@
state: stopped
when: is_master | bool

- name: Check that PostgreSQL is stopped
become: true
become_user: postgres
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl status -D {{ postgresql_data_dir }}"
register: pg_ctl_status_result
changed_when: false
failed_when: false

- name: Stop PostgreSQL
become: true
become_user: postgres
ansible.builtin.command: >-
{{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t {{ pg_ctl_timeout | default(3600) }}
when: pg_ctl_status_result.rc is defined and (pg_ctl_status_result.rc != 3 and pg_ctl_status_result.rc != 4)

- name: Remove patroni cluster "{{ patroni_cluster_name }}" from DCS (if exist)
become: true
become_user: postgres
Expand All @@ -648,7 +664,7 @@
ansible.builtin.command: >
{{ pgbackrest_patroni_cluster_restore_command }}
{{ '--target-action=promote' if pgbackrest_patroni_cluster_restore_command is search('--type=') else '' }}
async: 86400 # timeout 24 hours
async: "{{ cluster_restore_timeout | default(86400) }}" # timeout 24 hours
poll: 0
register: pgbackrest_restore_master
when: is_master | bool
Expand All @@ -658,7 +674,7 @@
ansible.builtin.command: >
{{ pgbackrest_patroni_cluster_restore_command }}
{{ '--target-action=shutdown' if pgbackrest_patroni_cluster_restore_command is search('--type=') else '' }}
async: 86400 # timeout 24 hours
async: "{{ cluster_restore_timeout | default(86400) }}" # timeout 24 hours
poll: 0
register: pgbackrest_restore_replica
when: not is_master | bool and 'pgbackrest' in patroni_create_replica_methods
Expand All @@ -673,7 +689,7 @@
label: "{{ item.changed }}"
register: pgbackrest_restore_jobs_result
until: pgbackrest_restore_jobs_result.finished
retries: 2880 # timeout 24 hours
retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours
delay: 30
when: item.ansible_job_id is defined

Expand All @@ -685,20 +701,52 @@
when: not keep_patroni_dynamic_json|bool

- name: Start PostgreSQL for Recovery
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl start -D {{ postgresql_data_dir }} -o '-c hot_standby=off' -w -t 1800"
ansible.builtin.command: >-
{{ postgresql_bin_dir }}/pg_ctl start -D {{ postgresql_data_dir }} -w -t {{ pg_ctl_timeout | default(3600) }}
-o '--config-file={{ postgresql_conf_dir }}/postgresql.conf'
-o '-c hot_standby=off'
{% if postgresql_version | int >= 12 %}
-o '-c restore_command="pgbackrest --stanza={{ pgbackrest_stanza }} archive-get %f %p"'
{% endif %}
-o '-c archive_command=/bin/true'
-l /tmp/pg_recovery_{{ ansible_date_time.date }}.log
async: "{{ pg_ctl_timeout | default(3600) }}" # run the command asynchronously
poll: 0
register: pg_ctl_start_result
when: is_master | bool or (not is_master | bool and 'pgbackrest' in patroni_create_replica_methods)

- name: Waiting for PostgreSQL Recovery to complete (WAL apply)
- name: Wait for the PostgreSQL start command to complete
ansible.builtin.async_status:
jid: "{{ pg_ctl_start_result.ansible_job_id }}"
register: pg_ctl_start_job_result
until: pg_ctl_start_job_result.finished
retries: "{{ (pg_ctl_timeout | default(3600) | int) // 10 }}"
delay: 10
when: is_master | bool or (not is_master | bool and 'pgbackrest' in patroni_create_replica_methods)

- name: Wait for PostgreSQL recovery to complete (WAL apply)
ansible.builtin.command: >-
{{ postgresql_bin_dir }}/psql -p {{ postgresql_port }} -U {{ patroni_superuser_username }} -d postgres -tAXc
"select pg_is_in_recovery()"
{{ postgresql_bin_dir }}/psql -p {{ postgresql_port }} -U {{ patroni_superuser_username }} -d postgres
-tAXc "select pg_is_in_recovery()"
register: pg_is_in_recovery
until: pg_is_in_recovery.stdout != "t"
retries: 1200 # timeout 10 hours
until: pg_is_in_recovery.stdout == "f"
retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours
delay: 30
changed_when: false
failed_when: false
when: is_master | bool or (not is_master | bool and 'pgbackrest' in patroni_create_replica_methods)
when: is_master | bool

- name: Check PostgreSQL recovery log
ansible.builtin.command: "grep -A2 'recovery stopping' /tmp/pg_recovery_{{ ansible_date_time.date }}.log"
register: pg_recovery_result
changed_when: false
failed_when: false
when: is_master | bool

- name: PostgreSQL recovery details
ansible.builtin.debug:
msg: '{{ pg_recovery_result.stdout_lines }}'
when: pg_recovery_result.stdout_lines is defined

- name: Check that PostgreSQL is stopped
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl status -D {{ postgresql_data_dir }}"
Expand All @@ -707,7 +755,8 @@
failed_when: false

- name: Stop PostgreSQL
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t 1800"
ansible.builtin.command: >-
{{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t {{ pg_ctl_timeout | default(3600) }}
when: pg_ctl_status_result.rc is defined and (pg_ctl_status_result.rc != 3 and pg_ctl_status_result.rc != 4)
when: patroni_cluster_bootstrap_method == "pgbackrest"
become: true
Expand All @@ -718,36 +767,38 @@
tags: patroni, point_in_time_recovery

- block: # PITR (custom bootstrap) - disable archive_command
- name: Check the patroni.dynamic.json exists
- name: Check if patroni.dynamic.json exists
ansible.builtin.stat:
path: "{{ postgresql_data_dir }}/patroni.dynamic.json"
register: patroni_dynamic_json
when: not keep_patroni_dynamic_json | bool

- name: Remove patroni.dynamic.json file
ansible.builtin.file:
path: "{{ postgresql_data_dir }}/patroni.dynamic.json"
state: absent
when: patroni_dynamic_json.stat.exists and
not keep_patroni_dynamic_json|bool
when:
- patroni_dynamic_json is defined
- patroni_dynamic_json.stat is defined
- patroni_dynamic_json.stat.exists

- name: Edit patroni.dynamic.json | disable archive_command (if enabled)
yedit:
src: "{{ postgresql_data_dir }}/patroni.dynamic.json"
key: postgresql.parameters.archive_command
value: "cd ." # not doing anything yet with WAL-s
content_type: json
when: patroni_dynamic_json.stat.exists and
keep_patroni_dynamic_json|bool and disable_archive_command|bool
when: disable_archive_command | bool

- name: Edit patroni.yml | disable archive_command (if enabled)
yedit:
src: /etc/patroni/patroni.yml
key: bootstrap.dcs.postgresql.parameters.archive_command
value: "cd ." # not doing anything yet with WAL-s
when: disable_archive_command|bool
when: disable_archive_command | bool
when: patroni_cluster_bootstrap_method != "initdb" and
(pgbackrest_install|bool or wal_g_install|bool) and
(existing_pgcluster is not defined or not existing_pgcluster|bool)
(pgbackrest_install | bool or wal_g_install | bool) and
(existing_pgcluster is not defined or not existing_pgcluster | bool)
become: true
become_user: postgres
tags: patroni, point_in_time_recovery
Expand Down Expand Up @@ -791,13 +842,27 @@
"select pg_is_in_recovery()"
register: pg_is_in_recovery
until: pg_is_in_recovery.stdout == "f"
retries: 1200 # timeout 10 hours
retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours
delay: 30
changed_when: false
failed_when: false
when: patroni_cluster_bootstrap_method == "wal-g"

- name: Check PostgreSQL is started and accepting connections on Master
- name: Wait for the Standby cluster initialization to complete
ansible.builtin.uri:
url: "http://{{ inventory_hostname }}:{{ patroni_restapi_port }}/standby-leader"
status_code: 200
register: standby_leader_result
until: standby_leader_result.status == 200
retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours
delay: 30
environment:
no_proxy: "{{ inventory_hostname }}"
when:
- (patroni_standby_cluster.host is defined and patroni_standby_cluster.host | length > 0)
- not ansible_check_mode

- name: Check PostgreSQL is started and accepting connections
become: true
become_user: postgres
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_isready -p {{ postgresql_port }}"
Expand Down Expand Up @@ -853,8 +918,8 @@
{{ postgresql_bin_dir }}/psql -p {{ postgresql_port }} -U {{ patroni_superuser_username }} -d postgres -tAXc
"select pg_is_in_recovery()"
register: pg_is_in_recovery
until: pg_is_in_recovery.stdout != "t"
retries: 1200 # timeout 10 hours
until: pg_is_in_recovery.stdout == "f"
retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours
delay: 30
changed_when: false
when: is_master | bool
Expand Down Expand Up @@ -961,7 +1026,7 @@
status_code: 200
register: replica_result
until: replica_result.status == 200
retries: 1200 # timeout 10 hours
retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours
delay: 30
environment:
no_proxy: "{{ inventory_hostname }}"
Expand Down
5 changes: 3 additions & 2 deletions automation/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -460,11 +460,11 @@ patroni_create_replica_methods:
- basebackup

pgbackrest:
- { option: "command", value: "/usr/bin/pgbackrest --stanza={{ pgbackrest_stanza }} --delta restore" }
- { option: "command", value: "{{ pgbackrest_patroni_cluster_restore_command }}" }
- { option: "keep_data", value: "True" }
- { option: "no_params", value: "True" }
wal_g:
- { option: "command", value: "{{ wal_g_path }} backup-fetch {{ postgresql_data_dir }} LATEST" }
- { option: "command", value: "{{ wal_g_patroni_cluster_bootstrap_command }}" }
- { option: "no_params", value: "True" }
basebackup:
- { option: "max-rate", value: "1000M" }
Expand Down Expand Up @@ -645,6 +645,7 @@ pgbackrest_cron_jobs:
# PITR mode (if patroni_cluster_bootstrap_method: "pgbackrest" or "wal-g"):
# 1) The database cluster directory will be cleaned (for "wal-g") or overwritten (for "pgbackrest" --delta restore).
# 2) The patroni cluster "{{ patroni_cluster_name }}" will also be removed from the DCS (if it exists) before recovery.
cluster_restore_timeout: 86400 # backup and WAL restore timeout in seconds (24 hours)

disable_archive_command: true # or 'false' to not disable archive_command after restore
keep_patroni_dynamic_json: true # or 'false' to remove patroni.dynamic.json after restore (if exists)
Expand Down
Loading