ducktape: test for node wise recovery.
bharathv committed Nov 29, 2023
1 parent 7f76cf5 commit f75a059
Showing 2 changed files with 148 additions and 1 deletion.
13 changes: 13 additions & 0 deletions tests/rptest/services/admin.py
@@ -747,6 +747,19 @@ def force_abort_partition_move(self,
path = f"partitions/{namespace}/{topic}/{partition}/unclean_abort_reconfiguration"
return self._request('post', path, node=node)

def get_majority_lost_partitions_from_nodes(self,
defunct_brokers: list[int],
node=None):
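"""Return the admin API's report of partitions that lose (or would lose)
their raft majority if the given brokers are treated as defunct; decoded
JSON list from GET partitions/majority_lost."""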
assert defunct_brokers
brokers_csv = ','.join(str(b) for b in defunct_brokers)
path = f"partitions/majority_lost?defunct_nodes={brokers_csv}"
return self._request('get', path, node).json()

def force_recover_partitions_from_nodes(self, payload: dict, node=None):
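"""Ask the controller to force-reconfigure the partitions named in `payload`
onto surviving replicas, treating the listed nodes as defunct
(POST partitions/force_recover_from_nodes)."""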
assert payload
path = "partitions/force_recover_from_nodes"
return self._request('post', path, node, json=payload)

def create_user(self, username, password, algorithm):
self.redpanda.logger.debug(
f"Creating user {username}:{password}:{algorithm}")
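Taken together, the two new wrappers map directly onto the admin API's partitions/majority_lost and partitions/force_recover_from_nodes endpoints. A minimal sketch of how a test chains them, assuming an already-started RedpandaService fixture named `redpanda` and illustrative broker ids (everything else mirrors the test below):

from rptest.services.admin import Admin

admin = Admin(redpanda)   # `redpanda` is an assumed, already-started RedpandaService fixture
dead_broker_ids = [1, 2]  # illustrative node ids of the brokers to treat as lost

# Ask the controller which partitions lose their raft majority with these brokers gone.
lost = admin.get_majority_lost_partitions_from_nodes(defunct_brokers=dead_broker_ids)

# Force-reconfigure those partitions onto the surviving replicas.
admin.force_recover_partitions_from_nodes(payload={
    "defunct_nodes": dead_broker_ids,
    "partitions_to_force_recover": lost,
})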
136 changes: 135 additions & 1 deletion tests/rptest/tests/partition_force_reconfiguration_test.py
@@ -6,6 +6,7 @@
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0
import random
import requests
from rptest.services.cluster import cluster
from rptest.clients.types import TopicSpec
@@ -30,7 +31,7 @@ def __init__(self, test_context, *args, **kwargs):
self).__init__(test_context, *args, **kwargs)

SPEC = TopicSpec(name="topic", replication_factor=5)
-WAIT_TIMEOUT_S = 30
+WAIT_TIMEOUT_S = 60

def _start_redpanda(self, acks=-1):
self.start_redpanda(
@@ -284,3 +285,136 @@ def get_lso():
self.await_num_produced(min_records=10000)
self.start_consumer()
self.run_validation()

@cluster(num_nodes=5)
@matrix(defunct_node_count=[1, 2])
def test_node_wise_recovery(self, defunct_node_count):
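# Continuous autobalancing lets the partition balancer drain replicas off the
# defunct nodes once recovery completes; the leader balancer is disabled so
# leadership stays put while the test waits for stable leaders.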
self.start_redpanda(num_nodes=5,
extra_rp_conf={
"partition_autobalancing_mode": "continuous",
"enable_leader_balancer": False,
})
num_topics = 20
# Create a mix of rf=1 and rf=3 topics.
topics = []
for i in range(0, num_topics):
rf = 3 if i % 2 == 0 else 1
parts = random.randint(1, 3)
spec = TopicSpec(name=f"topic-{i}",
replication_factor=rf,
partition_count=parts)
topics.append(spec)
self.client().create_topic(spec)

admin = self.redpanda._admin

to_kill_nodes = random.sample(self.redpanda.started_nodes(),
defunct_node_count)
to_kill_node_ids = [
int(self.redpanda.node_id(n)) for n in to_kill_nodes
]

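# Snapshot, while all nodes are still up, the partitions that will lose
# their raft majority once the chosen brokers go down.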
partitions_lost_majority = admin.get_majority_lost_partitions_from_nodes(
defunct_brokers=to_kill_node_ids)

self.logger.debug(f"Stopping nodes: {to_kill_node_ids}")
self.redpanda.for_nodes(to_kill_nodes, self.redpanda.stop_node)

def controller_available():
controller = self.redpanda.controller()
return controller is not None and self.redpanda.node_id(
controller) not in to_kill_node_ids

wait_until(controller_available,
timeout_sec=self.WAIT_TIMEOUT_S,
backoff_sec=3,
err_msg="Controller not available")

def make_recovery_payload(defunct_nodes: list[int],
partitions_lost_majority: dict):
return {
"defunct_nodes": defunct_nodes,
"partitions_to_force_recover": partitions_lost_majority
}

payload = make_recovery_payload(to_kill_node_ids,
partitions_lost_majority)
self.logger.debug(f"payload: {payload}")

surviving_node = random.choice([
n for n in self.redpanda.started_nodes()
if self.redpanda.node_id(n) not in to_kill_node_ids
])

# issue a node wise recovery
self.redpanda._admin.force_recover_partitions_from_nodes(
payload=make_recovery_payload(to_kill_node_ids,
partitions_lost_majority),
node=surviving_node)

# Ensure the brokers are marked defunct.
def brokers_are_defunct():
brokers = admin.get_brokers(node=surviving_node)
for broker in brokers:
expected_liveness_state = "defunct" if broker[
"node_id"] in to_kill_node_ids else "functional"
assert "liveness_status" in broker.keys()
if expected_liveness_state != broker["liveness_status"]:
return False
return len(brokers) == len(self.redpanda.nodes)

wait_until(brokers_are_defunct,
timeout_sec=self.WAIT_TIMEOUT_S,
backoff_sec=2,
err_msg="Brokers are not designated defunct")

# Wait until there are no partition assignments with majority loss due to dead nodes.
wait_until(lambda: len(
admin.get_majority_lost_partitions_from_nodes(
defunct_brokers=to_kill_node_ids, node=surviving_node)) == 0,
timeout_sec=self.WAIT_TIMEOUT_S,
backoff_sec=3,
err_msg="Node wise recovery failed")

def pending_force_reconfigurations():
try:
return admin.get_partition_balancer_status(
)["partitions_pending_force_recovery_count"]
except Exception:
return -1

wait_until(lambda: pending_force_reconfigurations() == 0,
timeout_sec=self.WAIT_TIMEOUT_S,
backoff_sec=2,
err_msg="reported force recovery count is non zero")

# Ensure every partition has a stable leader.
for topic in topics:
for part in range(0, topic.partition_count):
self.redpanda._admin.await_stable_leader(
topic=topic.name,
partition=part,
timeout_s=self.WAIT_TIMEOUT_S,
backoff_s=2,
hosts=self._alive_nodes())

def nodes_have_no_replicas(nodes: list[int]):
"""Returns True if there are no replicas of user ntps on input set of nodes."""
node_set = set(nodes)
partitions = []
for topic in topics:
p_list = self.redpanda._admin.get_partitions(topic=topic.name)
for partition in p_list:
replicas = set(
[r['node_id'] for r in partition["replicas"]])
if any([r in node_set for r in replicas]):
partitions.append(partition)
self.logger.debug(
f"partition with replicas on: {nodes} list: {partitions}")
return len(partitions) == 0

# Ensure the partition balancer drains all replicas from defunct nodes.
wait_until(lambda: nodes_have_no_replicas(to_kill_node_ids),
timeout_sec=self.WAIT_TIMEOUT_S,
backoff_sec=3,
err_msg="Not all replicas are drained from defunct nodes")
