[DPE-3684] Three units scenarios #663

Draft · wants to merge 16 commits into base: dpe-3684-reinitialise-raft
19 changes: 15 additions & 4 deletions src/charm.py
@@ -639,6 +639,7 @@ def _raft_reinitialisation(self) -> None:
self.unit_peer_data.pop("raft_stopped", None)
self.update_config()
self._patroni.start_patroni()
self._set_primary_status_message()

if self.unit.is_leader():
self._stuck_raft_cluster_cleanup()
@@ -1594,6 +1595,8 @@ def _can_run_on_update_status(self) -> bool:

if self.has_raft_keys():
logger.debug("Early exit on_update_status: Raft recovery in progress")
if self.unit.is_leader():
self._raft_reinitialisation()
return False

if not self.upgrade.idle:
@@ -1670,10 +1673,18 @@ def _set_primary_status_message(self) -> None:
self.app_peer_data["s3-initialization-block-message"]
)
return
if self._patroni.get_primary(unit_name_pattern=True) == self.unit.name:
self.unit.status = ActiveStatus("Primary")
elif self.is_standby_leader:
self.unit.status = ActiveStatus("Standby")
if (
self._patroni.get_primary(unit_name_pattern=True) == self.unit.name
or self.is_standby_leader
):
danger_state = ""
if not self._patroni.has_raft_quorum():
danger_state = " (disaster)"
elif len(self._patroni.get_running_cluster_members()) < self.app.planned_units():
danger_state = " (degraded)"
self.unit.status = ActiveStatus(
f"{'Standby' if self.is_standby_leader else 'Primary'}{danger_state}"
)
elif self._patroni.member_started:
self.unit.status = ActiveStatus()
except (RetryError, ConnectionError) as e:
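For reference, the reworked branch above can be read as the following standalone rule (a sketch distilled from the diff; the function and parameter names are illustrative and not part of the charm):

# Illustrative sketch of the new status-suffix logic, not code from the PR.
def status_suffix(has_quorum: bool, running_members: int, planned_units: int) -> str:
    if not has_quorum:
        return " (disaster)"  # raft quorum lost
    if running_members < planned_units:
        return " (degraded)"  # some planned units are not running
    return ""  # healthy: plain "Primary"/"Standby" status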
52 changes: 46 additions & 6 deletions src/cluster.py
@@ -18,6 +18,7 @@
import requests
from charms.operator_libs_linux.v2 import snap
from jinja2 import Template
from ops import MaintenanceStatus
from pysyncobj.utility import TcpUtility, UtilityException
from tenacity import (
AttemptManager,
@@ -774,6 +775,19 @@ def primary_changed(self, old_primary: str) -> bool:
primary = self.get_primary()
return primary != old_primary

def has_raft_quorum(self) -> bool:
"""Check if raft cluster has quorum."""
# Get the status of the raft cluster.
syncobj_util = TcpUtility(password=self.raft_password, timeout=3)

raft_host = "127.0.0.1:2222"
try:
raft_status = syncobj_util.executeCommand(raft_host, ["status"])
except UtilityException:
logger.warning("Has raft quorum: Cannot connect to raft cluster")
return False
return raft_status["has_quorum"]

def remove_raft_data(self) -> None:
"""Stops Patroni and removes the raft journals."""
logger.info("Stopping patroni")
@@ -827,6 +841,33 @@ def reinitialise_raft_data(self) -> None:
raise RaftPostgresqlNotUpError()
logger.info("Raft should be unstuck")

def _get_role(self) -> str | None:
"""Return this unit's Patroni role from the cluster status endpoint."""
members = requests.get(
f"{self._patroni_url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}",
verify=self.verify,
timeout=API_REQUEST_TIMEOUT,
auth=self._patroni_auth,
).json()["members"]
# Convert the Juju unit name (e.g. "postgresql/0") to the Patroni member name ("postgresql-0") by replacing the last "/".
member_name = self.charm.unit.name[::-1].replace("/", "-", 1)[::-1]
for member in members:
if member["name"] == member_name:
return member["role"]

def get_running_cluster_members(self) -> list[str]:
"""List running patroni members."""
try:
members = requests.get(
f"{self._patroni_url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}",
verify=self.verify,
timeout=API_REQUEST_TIMEOUT,
auth=self._patroni_auth,
).json()["members"]
return [
member["name"] for member in members if member["state"] in ("streaming", "running")
]
except Exception:
return []

def remove_raft_member(self, member_ip: str) -> None:
"""Remove a member from the raft cluster.

@@ -860,16 +901,15 @@ def remove_raft_member(self, member_ip: str) -> None:
if not raft_status["has_quorum"] and (
not raft_status["leader"] or raft_status["leader"].host == member_ip
):
self.charm.unit.status = MaintenanceStatus("Reinitialising raft")
logger.warning("Remove raft member: Stuck raft cluster detected")
data_flags = {"raft_stuck": "True"}
try:
health_status = self.get_patroni_health()
candidate = self._get_role() in ["leader", "sync_standby"]
except Exception:
logger.warning("Remove raft member: Unable to get health status")
health_status = {}
if health_status.get("role") in ("leader", "master") or health_status.get(
"sync_standby"
):
logger.warning("Remove raft member: Unable to get cluster status")
candidate = False
if candidate:
logger.info(f"{self.charm.unit.name} is raft candidate")
data_flags["raft_candidate"] = "True"
self.charm.unit_peer_data.update(data_flags)
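The new Patroni helpers are only exercised indirectly through the charm unit tests below; a direct unit test could look roughly like this (a sketch in the style of tests/unit/test_cluster.py, reusing the existing patroni fixture and patch targets — it is not part of this PR):

from unittest.mock import patch

from pysyncobj.utility import UtilityException


def test_new_cluster_helpers(patroni):
    # has_raft_quorum() reports the "has_quorum" flag from the raft status call
    # and treats a failed connection as "no quorum".
    with patch("cluster.TcpUtility") as _tcp_utility:
        _tcp_utility.return_value.executeCommand.return_value = {"has_quorum": True}
        assert patroni.has_raft_quorum()

        _tcp_utility.return_value.executeCommand.side_effect = UtilityException
        assert not patroni.has_raft_quorum()

    # get_running_cluster_members() keeps only members in a running or streaming state.
    with patch("cluster.requests.get") as _get:
        _get.return_value.json.return_value = {
            "members": [
                {"name": "postgresql-0", "state": "running"},
                {"name": "postgresql-1", "state": "streaming"},
                {"name": "postgresql-2", "state": "stopped"},
            ]
        }
        assert patroni.get_running_cluster_members() == ["postgresql-0", "postgresql-1"]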
108 changes: 108 additions & 0 deletions tests/integration/ha_tests/test_scaling_three_units.py
@@ -0,0 +1,108 @@
#!/usr/bin/env python3
# Copyright 2024 Canonical Ltd.
# See LICENSE file for licensing details.
import logging
from asyncio import gather, sleep

import pytest
from pytest_operator.plugin import OpsTest

from .. import markers
from ..helpers import (
CHARM_BASE,
DATABASE_APP_NAME,
get_machine_from_unit,
stop_machine,
)
from .conftest import APPLICATION_NAME
from .helpers import (
app_name,
are_writes_increasing,
check_writes,
get_cluster_roles,
start_continuous_writes,
)

logger = logging.getLogger(__name__)

charm = None


@pytest.mark.group(1)
@markers.juju3
@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test: OpsTest) -> None:
"""Build and deploy two PostgreSQL clusters."""
# This is a potentially destructive test, so it shouldn't be run against existing clusters
charm = await ops_test.build_charm(".")
async with ops_test.fast_forward():
# Deploy a three-unit PostgreSQL cluster alongside the continuous-writes test application
await gather(
ops_test.model.deploy(
charm,
application_name=DATABASE_APP_NAME,
num_units=3,
base=CHARM_BASE,
config={"profile": "testing"},
),
ops_test.model.deploy(
APPLICATION_NAME,
application_name=APPLICATION_NAME,
base=CHARM_BASE,
channel="edge",
),
)

await ops_test.model.wait_for_idle(status="active", timeout=1500)


@pytest.mark.group(1)
@markers.juju3
@pytest.mark.parametrize(
"roles",
[
["primaries"],
["sync_standbys"],
["replicas"],
["primaries", "replicas"],
["sync_standbys", "replicas"],
],
)
@pytest.mark.abort_on_fail
async def test_removing_unit(ops_test: OpsTest, roles: list[str], continuous_writes) -> None:
"""Stop and remove units holding the given roles, check writes, then scale back up."""
logger.info(f"removing {', '.join(roles)}")
# Start an application that continuously writes data to the database.
app = await app_name(ops_test)
original_roles = await get_cluster_roles(
ops_test, ops_test.model.applications[DATABASE_APP_NAME].units[0].name
)
await start_continuous_writes(ops_test, app)
logger.info("Stopping unit")
units = [original_roles[role][0] for role in roles]
for unit in units:
await stop_machine(ops_test, await get_machine_from_unit(ops_test, unit))
await sleep(15)
logger.info("Deleting unit")
for unit in units:
await ops_test.model.destroy_unit(unit, force=True, destroy_storage=False, max_wait=1500)

await ops_test.model.wait_for_idle(status="active", timeout=600, idle_period=45)

await are_writes_increasing(ops_test, unit)

logger.info("Scaling back up")
await ops_test.model.applications[DATABASE_APP_NAME].add_unit(count=len(roles))
await ops_test.model.wait_for_idle(status="active", timeout=1500)

new_roles = await get_cluster_roles(
ops_test, ops_test.model.applications[DATABASE_APP_NAME].units[0].name
)
assert len(new_roles["primaries"]) == 1
assert len(new_roles["sync_standbys"]) == 1
assert len(new_roles["replicas"]) == 1
if "primaries" in roles:
assert new_roles["primaries"][0] == original_roles["sync_standbys"][0]
else:
assert new_roles["primaries"][0] == original_roles["primaries"][0]

await check_writes(ops_test)
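For readers following the assertions above: get_cluster_roles is imported from the shared test helpers, and its return shape can be inferred from its usage here (illustrative values, not the helper's implementation):

# Inferred shape of the get_cluster_roles() result in a healthy three-unit cluster.
example_roles = {
    "primaries": ["postgresql/0"],
    "sync_standbys": ["postgresql/1"],
    "replicas": ["postgresql/2"],
}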
7 changes: 6 additions & 1 deletion tests/unit/test_charm.py
@@ -1989,6 +1989,7 @@ def test_raft_reinitialisation(harness):
patch("charm.Patroni.remove_raft_data") as _remove_raft_data,
patch("charm.Patroni.reinitialise_raft_data") as _reinitialise_raft_data,
patch("charm.PostgresqlOperatorCharm.update_config") as _update_config,
patch("charm.PostgresqlOperatorCharm._set_primary_status_message"),
):
# No data
harness.charm._raft_reinitialisation()
@@ -2569,6 +2570,8 @@ def test_update_new_unit_status(harness):
@pytest.mark.parametrize("is_leader", [True, False])
def test_set_primary_status_message(harness, is_leader):
with (
patch("charm.Patroni.has_raft_quorum", return_value=True),
patch("charm.Patroni.get_running_cluster_members", return_value=["test"]),
patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started,
patch(
"charm.PostgresqlOperatorCharm.is_standby_leader", new_callable=PropertyMock
@@ -2602,7 +2605,9 @@ def test_set_primary_status_message(harness, is_leader):
assert isinstance(harness.charm.unit.status, MaintenanceStatus)
else:
_is_standby_leader.side_effect = None
_is_standby_leader.return_value = values[1]
_is_standby_leader.return_value = (
values[0] != harness.charm.unit.name and values[1]
)
harness.charm._set_primary_status_message()
assert isinstance(
harness.charm.unit.status,
14 changes: 9 additions & 5 deletions tests/unit/test_cluster.py
@@ -735,7 +735,7 @@ def test_remove_raft_member(patroni):
def test_remove_raft_member_no_quorum(patroni, harness):
with (
patch("cluster.TcpUtility") as _tcp_utility,
patch("cluster.Patroni.get_patroni_health") as _get_patroni_health,
patch("cluster.requests.get") as _get,
patch(
"charm.PostgresqlOperatorCharm.unit_peer_data", new_callable=PropertyMock
) as _unit_peer_data,
@@ -747,7 +747,9 @@ def test_remove_raft_member_no_quorum(patroni, harness):
"has_quorum": False,
"leader": None,
}
_get_patroni_health.return_value = {"role": "replica", "sync_standby": False}
_get.return_value.json.return_value = {
"members": [{"role": "async_replica", "name": "postgresql-0"}]
}

patroni.remove_raft_member("1.2.3.4")
assert harness.charm.unit_peer_data == {"raft_stuck": "True"}
@@ -759,7 +761,7 @@ def test_remove_raft_member_no_quorum(patroni, harness):
"has_quorum": False,
"leader": None,
}
_get_patroni_health.side_effect = Exception
_get.side_effect = Exception

patroni.remove_raft_member("1.2.3.4")

@@ -774,8 +776,10 @@ def test_remove_raft_member_no_quorum(patroni, harness):
"has_quorum": False,
"leader": leader_mock,
}
_get_patroni_health.side_effect = None
_get_patroni_health.return_value = {"role": "replica", "sync_standby": True}
_get.side_effect = None
_get.return_value.json.return_value = {
"members": [{"role": "sync_standby", "name": "postgresql-0"}]
}

patroni.remove_raft_member("1.2.3.4")
