#!/usr/bin/env -S LD_LIBRARY_PATH=lib python3
# Copyright 2021 Canonical Ltd.
# See LICENSE file for licensing details.
"""Charmed Machine Operator for the PostgreSQL database."""
import json
import logging
import os
import platform
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Literal, Optional, Set, Tuple, get_args
import psycopg2
from charms.data_platform_libs.v0.data_interfaces import DataPeerData, DataPeerUnitData
from charms.data_platform_libs.v0.data_models import TypedCharmBase
from charms.grafana_agent.v0.cos_agent import COSAgentProvider
from charms.operator_libs_linux.v2 import snap
from charms.postgresql_k8s.v0.postgresql import (
REQUIRED_PLUGINS,
PostgreSQL,
PostgreSQLCreateUserError,
PostgreSQLEnableDisableExtensionError,
PostgreSQLListUsersError,
PostgreSQLUpdateUserPasswordError,
)
from charms.postgresql_k8s.v0.postgresql_tls import PostgreSQLTLS
from charms.rolling_ops.v0.rollingops import RollingOpsManager, RunWithLock
from charms.tempo_k8s.v1.charm_tracing import trace_charm
from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer
from ops import JujuVersion
from ops.charm import (
ActionEvent,
HookEvent,
InstallEvent,
LeaderElectedEvent,
RelationDepartedEvent,
StartEvent,
)
from ops.framework import EventBase
from ops.main import main
from ops.model import (
ActiveStatus,
BlockedStatus,
MaintenanceStatus,
ModelError,
Relation,
Unit,
WaitingStatus,
)
from tenacity import RetryError, Retrying, retry, stop_after_attempt, stop_after_delay, wait_fixed
from backups import CANNOT_RESTORE_PITR, MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, PostgreSQLBackups
from cluster import (
NotReadyError,
Patroni,
RemoveRaftMemberFailedError,
SwitchoverFailedError,
)
from cluster_topology_observer import (
ClusterTopologyChangeCharmEvents,
ClusterTopologyObserver,
)
from config import CharmConfig
from constants import (
APP_SCOPE,
BACKUP_USER,
METRICS_PORT,
MONITORING_PASSWORD_KEY,
MONITORING_SNAP_SERVICE,
MONITORING_USER,
PATRONI_CONF_PATH,
PEER,
POSTGRESQL_SNAP_NAME,
REPLICATION_PASSWORD_KEY,
REWIND_PASSWORD_KEY,
SECRET_DELETED_LABEL,
SECRET_INTERNAL_LABEL,
SECRET_KEY_OVERRIDES,
SNAP_PACKAGES,
SYSTEM_USERS,
TLS_CA_FILE,
TLS_CERT_FILE,
TLS_KEY_FILE,
TRACING_PROTOCOL,
TRACING_RELATION_NAME,
UNIT_SCOPE,
USER,
USER_PASSWORD_KEY,
)
from relations.async_replication import (
REPLICATION_CONSUMER_RELATION,
REPLICATION_OFFER_RELATION,
PostgreSQLAsyncReplication,
)
from relations.db import EXTENSIONS_BLOCKING_MESSAGE, DbProvides
from relations.postgresql_provider import PostgreSQLProvider
from upgrade import PostgreSQLUpgrade, get_postgresql_dependencies_model
from utils import new_password
logger = logging.getLogger(__name__)
PRIMARY_NOT_REACHABLE_MESSAGE = "waiting for primary to be reachable from this unit"
EXTENSIONS_DEPENDENCY_MESSAGE = "Unsatisfied plugin dependencies. Please check the logs"
Scopes = Literal[APP_SCOPE, UNIT_SCOPE]
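# Note: Scopes is introspected at runtime via typing.get_args() in the secret
# helpers below (get_secret/set_secret/remove_secret) to validate their scope argument.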
@trace_charm(
tracing_endpoint="tracing_endpoint",
extra_types=(
ClusterTopologyObserver,
COSAgentProvider,
DbProvides,
Patroni,
PostgreSQL,
PostgreSQLAsyncReplication,
PostgreSQLBackups,
PostgreSQLProvider,
PostgreSQLTLS,
PostgreSQLUpgrade,
RollingOpsManager,
),
)
class PostgresqlOperatorCharm(TypedCharmBase[CharmConfig]):
"""Charmed Operator for the PostgreSQL database."""
config_type = CharmConfig
on = ClusterTopologyChangeCharmEvents()
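# Overriding `on` with a custom CharmEvents class exposes the extra
# `cluster_topology_change` event emitted by the ClusterTopologyObserver,
# alongside the standard lifecycle events.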
def __init__(self, *args):
super().__init__(*args)
# Support for disabling the operator.
disable_file = Path(f"{os.environ.get('CHARM_DIR')}/disable")
if disable_file.exists():
logger.warning(
f"\n\tDisable file `{disable_file.resolve()}` found, the charm will skip all events."
"\n\tTo resume normal operations, please remove the file."
)
self.unit.status = BlockedStatus("Disabled")
sys.exit(0)
self.peer_relation_app = DataPeerData(
self.model,
relation_name=PEER,
secret_field_name=SECRET_INTERNAL_LABEL,
deleted_label=SECRET_DELETED_LABEL,
)
self.peer_relation_unit = DataPeerUnitData(
self.model,
relation_name=PEER,
secret_field_name=SECRET_INTERNAL_LABEL,
deleted_label=SECRET_DELETED_LABEL,
)
juju_version = JujuVersion.from_environ()
if juju_version.major > 2:
run_cmd = "/usr/bin/juju-exec"
else:
run_cmd = "/usr/bin/juju-run"
self._observer = ClusterTopologyObserver(self, run_cmd)
self.framework.observe(self.on.cluster_topology_change, self._on_cluster_topology_change)
self.framework.observe(self.on.install, self._on_install)
self.framework.observe(self.on.leader_elected, self._on_leader_elected)
self.framework.observe(self.on.config_changed, self._on_config_changed)
self.framework.observe(self.on.get_primary_action, self._on_get_primary)
self.framework.observe(self.on[PEER].relation_changed, self._on_peer_relation_changed)
self.framework.observe(self.on.secret_changed, self._on_peer_relation_changed)
self.framework.observe(self.on[PEER].relation_departed, self._on_peer_relation_departed)
self.framework.observe(self.on.pgdata_storage_detaching, self._on_pgdata_storage_detaching)
self.framework.observe(self.on.start, self._on_start)
self.framework.observe(self.on.get_password_action, self._on_get_password)
self.framework.observe(self.on.set_password_action, self._on_set_password)
self.framework.observe(self.on.update_status, self._on_update_status)
self.cluster_name = self.app.name
self._member_name = self.unit.name.replace("/", "-")
self._storage_path = self.meta.storages["pgdata"].location
self.upgrade = PostgreSQLUpgrade(
self,
model=get_postgresql_dependencies_model(),
relation_name="upgrade",
substrate="vm",
)
self.postgresql_client_relation = PostgreSQLProvider(self)
self.legacy_db_relation = DbProvides(self, admin=False)
self.legacy_db_admin_relation = DbProvides(self, admin=True)
self.backup = PostgreSQLBackups(self, "s3-parameters")
self.tls = PostgreSQLTLS(self, PEER)
self.async_replication = PostgreSQLAsyncReplication(self)
self.restart_manager = RollingOpsManager(
charm=self, relation="restart", callback=self._restart
)
self._observer.start_observer()
self._grafana_agent = COSAgentProvider(
self,
metrics_endpoints=[{"path": "/metrics", "port": METRICS_PORT}],
scrape_configs=self.patroni_scrape_config,
refresh_events=[
self.on[PEER].relation_changed,
self.on.secret_changed,
self.on.secret_remove,
],
log_slots=[f"{POSTGRESQL_SNAP_NAME}:logs"],
)
self._tracing = TracingEndpointRequirer(
self, relation_name=TRACING_RELATION_NAME, protocols=[TRACING_PROTOCOL]
)
def patroni_scrape_config(self) -> List[Dict]:
"""Generates scrape config for the Patroni metrics endpoint."""
return [
{
"metrics_path": "/metrics",
"static_configs": [{"targets": [f"{self._unit_ip}:8008"]}],
"tls_config": {"insecure_skip_verify": True},
"scheme": "https" if self.is_tls_enabled else "http",
}
]
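# For illustration, with TLS enabled and a unit IP of 10.1.2.3 (hypothetical
# values), the method above yields:
#   [{"metrics_path": "/metrics",
#     "static_configs": [{"targets": ["10.1.2.3:8008"]}],
#     "tls_config": {"insecure_skip_verify": True},
#     "scheme": "https"}]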
@property
def app_units(self) -> set[Unit]:
"""The peer-related units in the application."""
if not self._peers:
return set()
return {self.unit, *self._peers.units}
@property
def app_peer_data(self) -> Dict:
"""Application peer relation data object."""
relation = self.model.get_relation(PEER)
if relation is None:
return {}
return relation.data[self.app]
@property
def unit_peer_data(self) -> Dict:
"""Unit peer relation data object."""
relation = self.model.get_relation(PEER)
if relation is None:
return {}
return relation.data[self.unit]
@property
def tracing_endpoint(self) -> Optional[str]:
"""Otlp http endpoint for charm instrumentation."""
if self._tracing.is_ready():
return self._tracing.get_endpoint(TRACING_PROTOCOL)
def _peer_data(self, scope: Scopes) -> Dict:
"""Return corresponding databag for app/unit."""
relation = self.model.get_relation(PEER)
if relation is None:
return {}
return relation.data[self._scope_obj(scope)]
def _scope_obj(self, scope: Scopes):
if scope == APP_SCOPE:
return self.app
if scope == UNIT_SCOPE:
return self.unit
def peer_relation_data(self, scope: Scopes) -> DataPeerData:
"""Returns the peer relation data per scope."""
if scope == APP_SCOPE:
return self.peer_relation_app
elif scope == UNIT_SCOPE:
return self.peer_relation_unit
def _translate_field_to_secret_key(self, key: str) -> str:
"""Change 'key' to secrets-compatible key field."""
if not JujuVersion.from_environ().has_secrets:
return key
key = SECRET_KEY_OVERRIDES.get(key, key)
new_key = key.replace("_", "-")
return new_key.strip("-")
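# For example (illustrative, assuming no SECRET_KEY_OVERRIDES entry applies):
# "monitoring_password" is normalized to the dash-separated secret field
# name "monitoring-password".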
def get_secret(self, scope: Scopes, key: str) -> Optional[str]:
"""Get secret from the secret storage."""
if scope not in get_args(Scopes):
raise RuntimeError("Unknown secret scope.")
peers = self.model.get_relation(PEER)
if not peers:
return None
secret_key = self._translate_field_to_secret_key(key)
# Old translation in databag is to be taken
if key != secret_key and (
result := self.peer_relation_data(scope).fetch_my_relation_field(peers.id, key)
):
return result
return self.peer_relation_data(scope).get_secret(peers.id, secret_key)
def set_secret(self, scope: Scopes, key: str, value: Optional[str]) -> Optional[str]:
"""Set secret from the secret storage."""
if scope not in get_args(Scopes):
raise RuntimeError("Unknown secret scope.")
if not value:
return self.remove_secret(scope, key)
peers = self.model.get_relation(PEER)
secret_key = self._translate_field_to_secret_key(key)
# Old translation in databag is to be deleted
if key != secret_key and self.peer_relation_data(scope).fetch_my_relation_field(
peers.id, key
):
self.peer_relation_data(scope).delete_relation_data(peers.id, [key])
self.peer_relation_data(scope).set_secret(peers.id, secret_key, value)
def remove_secret(self, scope: Scopes, key: str) -> None:
"""Removing a secret."""
if scope not in get_args(Scopes):
raise RuntimeError("Unknown secret scope.")
peers = self.model.get_relation(PEER)
secret_key = self._translate_field_to_secret_key(key)
if scope == APP_SCOPE:
self.peer_relation_app.delete_relation_data(peers.id, [secret_key])
else:
self.peer_relation_unit.delete_relation_data(peers.id, [secret_key])
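# Minimal usage sketch of the secret helpers above (key name illustrative):
#   self.set_secret(APP_SCOPE, "operator-password", new_password())
#   self.get_secret(APP_SCOPE, "operator-password")
#   self.set_secret(APP_SCOPE, "operator-password", None)  # delegates to remove_secret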
@property
def is_cluster_initialised(self) -> bool:
"""Returns whether the cluster is already initialised."""
return "cluster_initialised" in self.app_peer_data
@property
def postgresql(self) -> PostgreSQL:
"""Returns an instance of the object used to interact with the database."""
return PostgreSQL(
primary_host=self.primary_endpoint,
current_host=self._unit_ip,
user=USER,
password=self.get_secret(APP_SCOPE, f"{USER}-password"),
database="postgres",
system_users=SYSTEM_USERS,
)
@property
def primary_endpoint(self) -> Optional[str]:
"""Returns the endpoint of the primary instance or None when no primary available."""
if not self._peers:
logger.debug("primary endpoint early exit: Peer relation not joined yet.")
return None
try:
for attempt in Retrying(stop=stop_after_delay(5), wait=wait_fixed(3)):
with attempt:
primary = self._patroni.get_primary()
if primary is None and (standby_leader := self._patroni.get_standby_leader()):
primary = standby_leader
primary_endpoint = self._patroni.get_member_ip(primary)
# Force a retry if there is no primary or the member that was
# returned is not in the list of the current cluster members
# (like when the cluster was not updated yet after a failed switchover).
if not primary_endpoint or primary_endpoint not in self._units_ips:
# TODO figure out why peer data is not available
if (
primary_endpoint
and len(self._units_ips) == 1
and len(self._peers.units) > 1
):
logger.warning(
"Possibly incoplete peer data: Will not map primary IP to unit IP"
)
return primary_endpoint
logger.debug(
"primary endpoint early exit: Primary IP not in cached peer list."
)
primary_endpoint = None
except RetryError:
return None
else:
return primary_endpoint
def get_hostname_by_unit(self, _) -> str:
"""Create a DNS name for a PostgreSQL unit.
Returns:
A string representing the hostname of the PostgreSQL unit.
"""
# For now, as there are no DNS hostnames on VMs, and it would also depend on
# the underlying provider (LXD, MAAS, etc.), the unit IP is returned.
return self._unit_ip
def _on_get_primary(self, event: ActionEvent) -> None:
"""Get primary instance."""
try:
primary = self._patroni.get_primary(unit_name_pattern=True)
event.set_results({"primary": primary})
except RetryError as e:
logger.error(f"failed to get primary with error {e}")
def _updated_synchronous_node_count(self, num_units: Optional[int] = None) -> bool:
"""Tries to update synchronous_node_count configuration and reports the result."""
try:
self._patroni.update_synchronous_node_count(num_units)
return True
except RetryError:
logger.debug("Unable to set synchronous_node_count")
return False
def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None:
"""The leader removes the departing units from the list of cluster members."""
# Don't handle this event in the same unit that is departing.
if event.departing_unit == self.unit:
logger.debug("Early exit on_peer_relation_departed: Skipping departing unit")
return
# Remove the departing member from the raft cluster.
try:
departing_member = event.departing_unit.name.replace("/", "-")
member_ip = self._patroni.get_member_ip(departing_member)
self._patroni.remove_raft_member(member_ip)
except RemoveRaftMemberFailedError:
logger.debug(
"Deferring on_peer_relation_departed: Failed to remove member from raft cluster"
)
event.defer()
return
# Allow leader to update the cluster members.
if not self.unit.is_leader():
return
if "cluster_initialised" not in self._peers.data[
self.app
] or not self._updated_synchronous_node_count(len(self._units_ips)):
logger.debug("Deferring on_peer_relation_departed: cluster not initialized")
event.defer()
return
# Remove cluster members one at a time.
for member_ip in self._get_ips_to_remove():
# Check that all members are ready before removing unit from the cluster.
if not self._patroni.are_all_members_ready():
logger.info("Deferring reconfigure: another member doing sync right now")
event.defer()
return
# Update the list of the current members.
self._remove_from_members_ips(member_ip)
self.update_config()
if self.primary_endpoint:
self._update_relation_endpoints()
else:
self.unit.status = WaitingStatus(PRIMARY_NOT_REACHABLE_MESSAGE)
return
# Update the sync-standby endpoint in the async replication data.
self.async_replication.update_async_replication_data()
def _on_pgdata_storage_detaching(self, _) -> None:
# Change the primary if it's the unit that is being removed.
try:
primary = self._patroni.get_primary(unit_name_pattern=True)
except RetryError:
# Ignore the event if the primary couldn't be retrieved.
# If a switchover is needed, an automatic failover will be triggered
# when the unit is removed.
logger.debug("Early exit on_pgdata_storage_detaching: primary cannot be retrieved")
return
if self.unit.name != primary:
return
if not self._patroni.are_all_members_ready():
logger.warning(
"could not switchover because not all members are ready"
" - an automatic failover will be triggered"
)
return
# Try to switchover to another member and raise an exception if it doesn't succeed.
# If it doesn't happen on time, Patroni will automatically run a fail-over.
try:
# Get the current primary to check if it has changed later.
current_primary = self._patroni.get_primary()
# Trigger the switchover.
self._patroni.switchover()
# Wait for the switchover to complete.
self._patroni.primary_changed(current_primary)
logger.info("successful switchover")
except (RetryError, SwitchoverFailedError) as e:
logger.warning(
f"switchover failed with reason: {e} - an automatic failover will be triggered"
)
return
# Only update the connection endpoints if there is a primary.
# A cluster can have all members as replicas for some time after
# a failed switchover, so wait until the primary is elected.
if self.primary_endpoint:
self._update_relation_endpoints()
def _on_peer_relation_changed(self, event: HookEvent):
"""Reconfigure cluster members when something changes."""
# Prevents the cluster from being reconfigured before it's bootstrapped on the leader.
if "cluster_initialised" not in self._peers.data[self.app]:
logger.debug("Deferring on_peer_relation_changed: cluster not initialized")
event.defer()
return
# If the unit is the leader, it can reconfigure the cluster.
if self.unit.is_leader() and not self._reconfigure_cluster(event):
event.defer()
return
if self._update_member_ip():
return
# Don't update this member before it's part of the members list.
if self._unit_ip not in self.members_ips:
logger.debug("Early exit on_peer_relation_changed: Unit not in the members list")
return
# Update the list of the cluster members in the replicas to make them know each other.
try:
# Update the members of the cluster in the Patroni configuration on this unit.
self.update_config()
except RetryError:
self.unit.status = BlockedStatus("failed to update cluster members on member")
return
except ValueError as e:
self.unit.status = BlockedStatus("Configuration Error. Please check the logs")
logger.error("Invalid configuration: %s", str(e))
return
# If the PITR restore failed, wait for it to be resolved.
if (
"restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data
) and isinstance(self.unit.status, BlockedStatus):
event.defer()
return
# Start can be called here multiple times, as it's idempotent.
# At this point, it starts Patroni for the first time, once the data has been
# received in the relation.
self._patroni.start_patroni()
# Assert the member is up and running before marking the unit as active.
if not self._patroni.member_started:
logger.debug("Deferring on_peer_relation_changed: awaiting for member to start")
self.unit.status = WaitingStatus("awaiting for member to start")
event.defer()
return
# Restart the workload if it's stuck on the starting state after a timeline divergence
# due to a backup that was restored.
if (
not self.is_primary
and not self.is_standby_leader
and (
self._patroni.member_replication_lag == "unknown"
or int(self._patroni.member_replication_lag) > 1000
)
):
self._patroni.reinitialize_postgresql()
logger.debug("Deferring on_peer_relation_changed: reinitialising replica")
self.unit.status = MaintenanceStatus("reinitialising replica")
event.defer()
return
self._start_stop_pgbackrest_service(event)
self._update_new_unit_status()
# Split off into a separate function to reduce the complexity of _on_peer_relation_changed.
def _start_stop_pgbackrest_service(self, event: HookEvent) -> None:
# Start or stop the pgBackRest TLS server service when TLS certificate change.
if not self.backup.start_stop_pgbackrest_service():
logger.debug(
"Deferring on_peer_relation_changed: awaiting for TLS server service to start on primary"
)
event.defer()
return
self.backup.coordinate_stanza_fields()
self.backup.check_stanza()
if "exporter-started" not in self.unit_peer_data:
self._setup_exporter()
def _update_new_unit_status(self) -> None:
"""Update the status of a new unit that recently joined the cluster."""
# Only update the connection endpoints if there is a primary.
# A cluster can have all members as replicas for some time after
# a failed switchover, so wait until the primary is elected.
if self.primary_endpoint:
self._update_relation_endpoints()
self.async_replication.handle_read_only_mode()
else:
self.unit.status = WaitingStatus(PRIMARY_NOT_REACHABLE_MESSAGE)
def _reconfigure_cluster(self, event: HookEvent):
"""Reconfigure the cluster by adding and removing members IPs to it.
Returns:
Whether it was possible to reconfigure the cluster.
"""
if (
hasattr(event, "unit")
and event.relation.data.get(event.unit) is not None
and event.relation.data[event.unit].get("ip-to-remove") is not None
):
ip_to_remove = event.relation.data[event.unit].get("ip-to-remove")
logger.info("Removing %s from the cluster due to IP change", ip_to_remove)
try:
self._patroni.remove_raft_member(ip_to_remove)
except RemoveRaftMemberFailedError:
logger.debug("Deferring on_peer_relation_changed: failed to remove raft member")
return False
if ip_to_remove in self.members_ips:
self._remove_from_members_ips(ip_to_remove)
self._add_members(event)
return True
def _update_member_ip(self) -> bool:
"""Update the member IP in the unit databag.
Returns:
Whether the IP was updated.
"""
# Stop Patroni (and update the member IP) if it was previously isolated
# from the cluster network. Patroni will start back when its IP address is
# updated in all the units through the peer relation changed event (in that
# hook, the configuration is updated and the service is started - or only
# reloaded in the other units).
stored_ip = self.unit_peer_data.get("ip")
current_ip = self.get_hostname_by_unit(None)
if stored_ip is None:
self.unit_peer_data.update({"ip": current_ip})
return False
elif current_ip != stored_ip:
logger.info(f"ip changed from {stored_ip} to {current_ip}")
self.unit_peer_data.update({"ip-to-remove": stored_ip})
self.unit_peer_data.update({"ip": current_ip})
self._patroni.stop_patroni()
self._update_certificate()
return True
else:
self.unit_peer_data.update({"ip-to-remove": ""})
return False
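# The "ip-to-remove" flag set above is consumed by the leader in
# _reconfigure_cluster (on peer-relation-changed), which removes the stale
# raft member and drops the old IP from members_ips.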
def _add_members(self, event):
"""Add new cluster members.
This method is responsible for adding new members to the cluster
when new units are added to the application. This event is deferred if
one of the current units is copying data from the primary, to avoid
multiple units copying data at the same time, which can cause slow
transfer rates in these processes and overload the primary instance.
"""
try:
# Compare set of Patroni cluster members and Juju hosts
# to avoid the unnecessary reconfiguration.
if self._patroni.cluster_members == self._hosts:
logger.debug("Early exit add_members: Patroni members equal Juju hosts")
return
logger.info("Reconfiguring cluster")
self.unit.status = MaintenanceStatus("reconfiguring cluster")
for member in self._hosts - self._patroni.cluster_members:
logger.debug("Adding %s to cluster", member)
self.add_cluster_member(member)
self._patroni.update_synchronous_node_count()
except NotReadyError:
logger.info("Deferring reconfigure: another member doing sync right now")
event.defer()
except RetryError:
logger.info("Deferring reconfigure: couldn't retrieve current cluster members")
event.defer()
def add_cluster_member(self, member: str) -> None:
"""Add member to the cluster if all members are already up and running.
Raises:
NotReadyError if either the new member or the current members are not ready.
"""
unit = self.model.get_unit("/".join(member.rsplit("-", 1)))
member_ip = self._get_unit_ip(unit)
if not self._patroni.are_all_members_ready():
logger.info("not all members are ready")
raise NotReadyError("not all members are ready")
# Add the member to the list that should be updated in each other member.
self._add_to_members_ips(member_ip)
# Update Patroni configuration file.
try:
self.update_config()
except RetryError:
self.unit.status = BlockedStatus("failed to update cluster members on member")
def _get_unit_ip(self, unit: Unit) -> Optional[str]:
"""Get the IP address of a specific unit."""
# Check if host is current host.
if unit == self.unit:
return str(self.model.get_binding(PEER).network.bind_address)
# Check if host is a peer.
elif unit in self._peers.data:
return str(self._peers.data[unit].get("private-address"))
# Return None if the unit is neither a peer nor the current unit.
else:
return None
@property
def _hosts(self) -> set:
"""List of the current Juju hosts.
Returns:
a set containing the current Juju hosts
with the names using - instead of /
to match Patroni member names
"""
peers = self.model.get_relation(PEER)
hosts = [self.unit.name.replace("/", "-")] + [
unit.name.replace("/", "-") for unit in peers.units
]
return set(hosts)
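# Example (illustrative): units postgresql/0..postgresql/2 yield
# {"postgresql-0", "postgresql-1", "postgresql-2"}.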
@property
def _patroni(self) -> Patroni:
"""Returns an instance of the Patroni object."""
return Patroni(
self,
self._unit_ip,
self.cluster_name,
self._member_name,
self.app.planned_units(),
self._peer_members_ips,
self._get_password(),
self._replication_password,
self.get_secret(APP_SCOPE, REWIND_PASSWORD_KEY),
bool(self.unit_peer_data.get("tls")),
)
@property
def is_primary(self) -> bool:
"""Return whether this unit is the primary instance."""
return self.unit.name == self._patroni.get_primary(unit_name_pattern=True)
@property
def is_standby_leader(self) -> bool:
"""Return whether this unit is the standby leader instance."""
return self.unit.name == self._patroni.get_standby_leader(unit_name_pattern=True)
@property
def is_tls_enabled(self) -> bool:
"""Return whether TLS is enabled."""
return all(self.tls.get_tls_files())
@property
def _peer_members_ips(self) -> Set[str]:
"""Fetch current list of peer members IPs.
Returns:
A list of peer members addresses (strings).
"""
# Get all members IPs and remove the current unit IP from the list.
addresses = self.members_ips
current_unit_ip = self._unit_ip
if current_unit_ip in addresses:
addresses.remove(current_unit_ip)
return addresses
@property
def _units_ips(self) -> Set[str]:
"""Fetch current list of peers IPs.
Returns:
A list of peers addresses (strings).
"""
# Get all members IPs and remove the current unit IP from the list.
addresses = {self._get_unit_ip(unit) for unit in self._peers.units}
addresses.add(self._unit_ip)
return addresses
@property
def members_ips(self) -> Set[str]:
"""Returns the list of IPs addresses of the current members of the cluster."""
return set(json.loads(self._peers.data[self.app].get("members_ips", "[]")))
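# The leader persists this set in the app databag as a JSON-encoded list,
# e.g. (illustrative) members_ips='["10.1.2.3", "10.1.2.4"]'; see
# _update_members_ips below for the write side.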
def _add_to_members_ips(self, ip: str) -> None:
"""Add one IP to the members list."""
self._update_members_ips(ip_to_add=ip)
def _remove_from_members_ips(self, ip: str) -> None:
"""Remove IPs from the members list."""
self._update_members_ips(ip_to_remove=ip)
def _update_members_ips(self, ip_to_add: Optional[str] = None, ip_to_remove: Optional[str] = None) -> None:
"""Update cluster member IPs on application data.
Member IPs on application data are used to determine when a unit of PostgreSQL
should be added or removed from the PostgreSQL cluster.
NOTE: this function does not update the IPs on the PostgreSQL cluster
in the Patroni configuration.
"""
# Allow leader to reset which members are part of the cluster.
if not self.unit.is_leader():
return
ips = json.loads(self._peers.data[self.app].get("members_ips", "[]"))
if ip_to_add and ip_to_add not in ips:
ips.append(ip_to_add)
elif ip_to_remove:
ips.remove(ip_to_remove)
self._peers.data[self.app]["members_ips"] = json.dumps(ips)
@retry(
stop=stop_after_delay(60),
wait=wait_fixed(5),
reraise=True,
)
def _change_primary(self) -> None:
"""Change the primary member of the cluster."""
# Try to switchover to another member and raise an exception if it doesn't succeed.
# If it doesn't happen on time, Patroni will automatically run a fail-over.
try:
# Get the current primary to check if it has changed later.
current_primary = self._patroni.get_primary()
# Trigger the switchover.
self._patroni.switchover()
# Wait for the switchover to complete.
self._patroni.primary_changed(current_primary)
logger.info("successful switchover")
except (RetryError, SwitchoverFailedError) as e:
logger.warning(
f"switchover failed with reason: {e} - an automatic failover will be triggered"
)
@property
def _unit_ip(self) -> str:
"""Current unit ip."""
return str(self.model.get_binding(PEER).network.bind_address)
def _on_cluster_topology_change(self, _):
"""Updates endpoints and (optionally) certificates when the cluster topology changes."""
logger.info("Cluster topology changed")
if self.primary_endpoint:
self._update_relation_endpoints()
self.unit.status = ActiveStatus()
def _on_install(self, event: InstallEvent) -> None:
"""Install prerequisites for the application."""
logger.debug("Install start time: %s", datetime.now())
if not self._is_storage_attached():
self._reboot_on_detached_storage(event)
return
self.unit.status = MaintenanceStatus("installing PostgreSQL")
# Install the charmed PostgreSQL snap.
try:
self._install_snap_packages(packages=SNAP_PACKAGES)
except snap.SnapError:
self.unit.status = BlockedStatus("failed to install snap packages")
return
cache = snap.SnapCache()
postgres_snap = cache[POSTGRESQL_SNAP_NAME]
postgres_snap.alias("patronictl")
postgres_snap.alias("psql")
# Create the user home directory for the snap_daemon user.
# This is needed due to https://bugs.launchpad.net/snapd/+bug/2011581.
try:
subprocess.check_call("mkdir -p /home/snap_daemon".split())
subprocess.check_call("chown snap_daemon:snap_daemon /home/snap_daemon".split())
subprocess.check_call("usermod -d /home/snap_daemon snap_daemon".split())
except subprocess.CalledProcessError:
logger.exception("Unable to create snap_daemon home dir")
self.unit.status = WaitingStatus("waiting to start PostgreSQL")
def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
"""Handle the leader-elected event."""
# The leader sets the needed passwords if they weren't set before.
if self.get_secret(APP_SCOPE, USER_PASSWORD_KEY) is None:
self.set_secret(APP_SCOPE, USER_PASSWORD_KEY, new_password())
if self.get_secret(APP_SCOPE, REPLICATION_PASSWORD_KEY) is None:
self.set_secret(APP_SCOPE, REPLICATION_PASSWORD_KEY, new_password())
if self.get_secret(APP_SCOPE, REWIND_PASSWORD_KEY) is None:
self.set_secret(APP_SCOPE, REWIND_PASSWORD_KEY, new_password())
if self.get_secret(APP_SCOPE, MONITORING_PASSWORD_KEY) is None:
self.set_secret(APP_SCOPE, MONITORING_PASSWORD_KEY, new_password())
# Update the list of the current PostgreSQL hosts when a new leader is elected.
# Add this unit to the list of cluster members
# (the cluster should start with only this member).
if self._unit_ip not in self.members_ips:
self._add_to_members_ips(self._unit_ip)
# Remove departing units when the leader changes.
for ip in self._get_ips_to_remove():
logger.info("Removing %s from the cluster", ip)
self._remove_from_members_ips(ip)
self.update_config()
# Don't update connection endpoints the first time this event runs for
# this application, because there is no primary or replicas yet.
if "cluster_initialised" not in self._peers.data[self.app]:
logger.debug("Early exit on_leader_elected: Cluster not initialized")
return
# Only update the connection endpoints if there is a primary.
# A cluster can have all members as replicas for some time after
# a failed switchover, so wait until the primary is elected.
if self.primary_endpoint:
self._update_relation_endpoints()
else:
self.unit.status = WaitingStatus(PRIMARY_NOT_REACHABLE_MESSAGE)
def _on_config_changed(self, event) -> None:
"""Handle configuration changes, like enabling plugins."""
if not self.is_cluster_initialised:
logger.debug("Defer on_config_changed: cluster not initialised yet")
event.defer()
return
if not self.upgrade.idle:
logger.debug("Defer on_config_changed: upgrade in progress")
event.defer()
return
try:
self._validate_config_options()
# update config on every run
self.update_config()
except psycopg2.OperationalError:
logger.debug("Defer on_config_changed: Cannot connect to database")
event.defer()
return
except ValueError as e:
self.unit.status = BlockedStatus("Configuration Error. Please check the logs")
logger.error("Invalid configuration: %s", str(e))
return
if self.is_blocked and "Configuration Error" in self.unit.status.message:
self.unit.status = ActiveStatus()
# Update the sync-standby endpoint in the async replication data.
self.async_replication.update_async_replication_data()
if not self.unit.is_leader():
return
# Enable and/or disable the extensions.
self.enable_disable_extensions()
# Unblock the charm after extensions are enabled (only if it's blocked due to application
# charms requesting extensions).
if self.unit.status.message != EXTENSIONS_BLOCKING_MESSAGE:
return
for relation in [
*self.model.relations.get("db", []),
*self.model.relations.get("db-admin", []),
]:
if not self.legacy_db_relation.set_up_relation(relation):
logger.debug(
"Early exit on_config_changed: legacy relation requested extensions that are still disabled"
)
return
def enable_disable_extensions(self, database: Optional[str] = None) -> None:
"""Enable/disable PostgreSQL extensions set through config options.
Args:
database: optional database where to enable/disable the extension.
"""
if self._patroni.get_primary() is None:
logger.debug("Early exit enable_disable_extensions: standby cluster")
return
spi_module = ["refint", "autoinc", "insert_username", "moddatetime"]
plugins_exception = {"uuid_ossp": '"uuid-ossp"'}
original_status = self.unit.status
extensions = {}
# collect extensions