From 0f921ac9126520d8a3985132d4c68e0530a696dd Mon Sep 17 00:00:00 2001
From: Ilya Maximets
Date: Sun, 11 Aug 2024 22:28:52 +0200
Subject: [PATCH] raft: Fix cluster breakdown on leaving with some nodes down.

WIP

Signed-off-by: Ilya Maximets
---
 ovsdb/raft.c           |  37 ++++++++---
 tests/ovsdb-cluster.at | 143 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 170 insertions(+), 10 deletions(-)

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 9c3c351b5be..2b541c677de 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -436,7 +436,7 @@ raft_alloc(void)
     hmap_node_nullify(&raft->hmap_node);
     hmap_init(&raft->servers);
     raft->log_start = raft->log_end = 1;
-    raft->role = RAFT_FOLLOWER;
+    raft->role = RAFT_FOLLOWER; VLOG_INFO("%s: FOLLOWER", __func__);
     sset_init(&raft->remote_addresses);
     raft->join_timeout = LLONG_MAX;
     ovs_list_init(&raft->waiters);
@@ -1860,10 +1860,6 @@ raft_start_election(struct raft *raft, bool is_prevote,
     /* Leadership transfer doesn't use pre-vote. */
     ovs_assert(!is_prevote || !leadership_transfer);
 
-    if (raft->leaving) {
-        return;
-    }
-
     struct raft_server *me = raft_find_server(raft, &raft->sid);
     if (!me) {
         return;
@@ -1876,7 +1872,7 @@ raft_start_election(struct raft *raft, bool is_prevote,
     ovs_assert(raft->role != RAFT_LEADER);
 
     raft->leader_sid = UUID_ZERO;
-    raft->role = RAFT_CANDIDATE;
+    raft->role = RAFT_CANDIDATE; VLOG_INFO("%s: CANDIDATE", __func__);
     raft->prevote_passed = !is_prevote;
 
     if (is_prevote || leadership_transfer) {
@@ -1990,6 +1986,12 @@ raft_conn_should_stay_open(struct raft *raft, struct raft_conn *conn)
         return true;
     }
 
+    /* Keep the connection until we send a RemoveServerReply. */
+    if (raft->remove_server
+        && uuid_equals(&conn->sid, &raft->remove_server->sid)) {
+        return true;
+    }
+
     /* We have joined the cluster. If we did that "recently", then there is a
      * chance that we do not have the most recent server configuration log
      * entry. If so, it's a waste to disconnect from the servers that were in
@@ -2116,6 +2118,8 @@ raft_run(struct raft *raft)
                 count ++;
             }
         }
+        VLOG_INFO("Replied: %d out of %d servers",
+                  count, (int) hmap_count(&raft->servers));
         if (count >= hmap_count(&raft->servers) / 2) {
             HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
                 server->replied = false;
@@ -2132,6 +2136,11 @@ raft_run(struct raft *raft)
     }
 
     if (raft->leaving && time_msec() >= raft->leave_timeout) {
+        if (raft->role == RAFT_LEADER) {
+            raft_transfer_leadership(raft,
+                                     "this server is leaving the cluster");
+            raft_become_follower(raft);
+        }
         raft_send_remove_server_requests(raft);
     }
 
@@ -2440,7 +2449,7 @@ raft_command_execute__(struct raft *raft, const struct json *data,
                        const struct json *servers, uint64_t election_timer,
                        const struct uuid *prereq, struct uuid *result)
 {
-    if (raft->joining || raft->leaving || raft->left || raft->failed) {
+    if (raft->joining || raft->left || raft->failed) {
        return raft_command_create_completed(RAFT_CMD_SHUTDOWN);
     }
 
@@ -2778,7 +2787,7 @@ raft_become_follower(struct raft *raft)
         return;
     }
 
-    raft->role = RAFT_FOLLOWER;
+    raft->role = RAFT_FOLLOWER; VLOG_INFO("%s: FOLLOWER", __func__);
     raft_reset_election_timer(raft);
 
     /* Notify clients about lost leadership.
@@ -2906,7 +2915,7 @@ raft_become_leader(struct raft *raft)
               raft->n_votes, hmap_count(&raft->servers));
 
     ovs_assert(raft->role != RAFT_LEADER);
-    raft->role = RAFT_LEADER;
+    raft->role = RAFT_LEADER; VLOG_INFO("%s: LEADER", __func__);
     raft->election_won = time_msec();
     raft_set_leader(raft, &raft->sid);
     raft_reset_election_timer(raft);
@@ -3367,7 +3376,7 @@ raft_update_leader(struct raft *raft, const struct uuid *sid)
          * least as large as the candidate's current term, then the
          * candidate recognizes the leader as legitimate and returns to
          * follower state. */
-        raft->role = RAFT_FOLLOWER;
+        raft->role = RAFT_FOLLOWER; VLOG_INFO("%s: FOLLOWER", __func__);
     }
     return true;
 }
@@ -4143,6 +4152,14 @@ raft_handle_remove_server_request(struct raft *raft,
         return;
     }
 
+    /* Check for a server being removed right now. */
+    if (raft->remove_server
+        && uuid_equals(&rq->sid, &raft->remove_server->sid)) {
+        raft_send_remove_server_reply(raft, rq,
+                                      false, RAFT_SERVER_IN_PROGRESS);
+        return;
+    }
+
     /* If the server isn't configured, report that. */
     target = raft_find_server(raft, &rq->sid);
     if (!target) {
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
index 9d8b4d06a4a..19925ac4478 100644
--- a/tests/ovsdb-cluster.at
+++ b/tests/ovsdb-cluster.at
@@ -578,6 +578,149 @@ for i in $(seq $n); do
     OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
 done
 
+AT_CLEANUP
+
+AT_SETUP([OVSDB cluster - leaving the cluster with some servers down])
+AT_KEYWORDS([ovsdb server negative unix cluster leave qwe])
+
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db \
+          $top_srcdir/vswitchd/vswitch.ovsschema unix:s1.raft], [0], [], [stderr])
+schema_name=$(ovsdb-tool schema-name $top_srcdir/vswitchd/vswitch.ovsschema)
+for i in 2 3 4 5; do
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
+done
+
+on_exit 'kill $(cat *.pid)'
+on_exit "
+  for i in \$(ls $(pwd)/s[[0-5]]); do
+    ovs-appctl --timeout 1 -t \$i cluster/status $schema_name;
+  done
+"
+dnl Starting all the servers.
+for i in 1 2 3 4 5; do
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+              --detach --no-chdir --log-file=s$i.log \
+              --pidfile=s$i.pid --unixctl=s$i \
+              --remote=punix:s$i.ovsdb s$i.db])
+done
+
+dnl Make sure that all servers joined the cluster.
+for i in 1 2 3 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+dnl Make sure the cluster is operational.
+m4_define([DB_REMOTE], [unix:s1.ovsdb,unix:s2.ovsdb,unix:s3.ovsdb,unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait init])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-1], [0], [ignore])
+
+dnl Stop servers 1 and 2.
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s1], [s1.pid])
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s2], [s2.pid])
+
+dnl Make sure that all remaining servers are functional as a cluster.
+for i in 3 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s3.ovsdb,unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-2], [0], [ignore])
+
+dnl Servers 1 and 2 in a cluster of 5 are down, 3 are still alive.
+dnl Server 3 can't leave, because the NEW configuration will be a cluster of
+dnl 4 with 2 servers down and it doesn't have a quorum. Try it.
+AT_CHECK([ovs-appctl -t $(pwd)/s3 cluster/leave $schema_name])
+OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s3 cluster/status $schema_name \
+                   | grep -q leaving])
+dnl Make sure that all the servers, including server 3, are still part
+dnl of the cluster and know about server 3.
+for i in 3 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+    OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s$i cluster/status $schema_name \
+                       | grep -q '\'])
+done
+
+dnl Make sure the cluster is still operational and server 3 can commit.
+m4_define([DB_REMOTE], [unix:s3.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-3], [0], [ignore])
+
+dnl Now bring back server 2. This should allow server 3 to leave.
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+          --detach --no-chdir --log-file=s2.log \
+          --pidfile=s2.pid --unixctl=s2 \
+          --remote=punix:s2.ovsdb s2.db])
+
+dnl Wait for server 3 to actually leave and stop the server.
+AT_CHECK([ovsdb_client_wait unix:s3.ovsdb $schema_name removed])
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s3], [s3.pid])
+
+dnl Make sure that all remaining servers are functional as a cluster.
+for i in 2 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s2.ovsdb,unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-4], [0], [ignore])
+
+
+dnl Now we have a cluster of 4 servers (1, 2, 4, 5) with 1 server down.
+dnl Server 2 should be able to leave, because the NEW configuration will
+dnl be a cluster of 3 servers with 1 down and it has a quorum.
+AT_CHECK([ovs-appctl -t $(pwd)/s2 cluster/leave $schema_name])
+dnl Wait for server 2 to actually leave and stop the server.
+AT_CHECK([ovsdb_client_wait unix:s2.ovsdb $schema_name removed])
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s2], [s2.pid])
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-5], [0], [ignore])
+
+dnl Now we have a cluster of 3 servers (1, 4, 5) with 1 server down.
+dnl None of the alive servers can leave, because the NEW configuration
+dnl will be a cluster of 2 with 1 server down and it has no quorum.
+dnl Request to leave anyway.
+for i in 4 5; do
+    AT_CHECK([ovs-appctl -t $(pwd)/s$i cluster/leave $schema_name])
+    OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s$i cluster/status $schema_name \
+                       | grep -q leaving])
+done
+
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-6], [0], [ignore])
+
+dnl Now bring back the first server.
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+          --detach --no-chdir --log-file=s1.log \
+          --pidfile=s1.pid --unixctl=s1 \
+          --remote=punix:s1.ovsdb s1.db])
+
+dnl Now it should be possible for all the other servers to leave, so we
+dnl should end up with a single-node cluster that consists of server 1.
+for i in 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name removed])
+done
+for i in 4 5; do
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
+done
+
+dnl Wait for the first server to become a leader of a single-node cluster.
+OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s1 cluster/status $schema_name \
+                   | grep -q 'Role: leader'])
+dnl Check that the database is operational and the data is still there.
+AT_CHECK([ovs-vsctl --db="unix:s1.ovsdb" --no-wait create QoS type=test-7], [0], [ignore])
+AT_CHECK([ovs-vsctl --db="unix:s1.ovsdb" --no-wait \
+            --columns=type --bare list QoS | sort], [0], [dnl
+test-1
+test-2
+test-3
+test-4
+test-5
+test-6
+test-7
+])
+
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s1], [s1.pid])
 AT_CLEANUP
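
Editor's note (illustration only, not part of the patch): the test comments
above keep applying one quorum rule: a server may leave only if the NEW
configuration, i.e. the cluster minus the leaving server, still has a strict
majority of its members up. The stand-alone C program below is a minimal
sketch of just that arithmetic; the helper name leave_has_quorum() and the
program itself are hypothetical and do not exist in ovsdb/raft.c.

#include <stdbool.h>
#include <stdio.h>

/* Returns true if one live server may leave a cluster of 'n_servers' of
 * which 'n_down' are currently down, i.e. whether the servers that remain
 * up form a strict majority of the post-leave configuration. */
static bool
leave_has_quorum(int n_servers, int n_down)
{
    int new_size = n_servers - 1;              /* Configuration after leaving. */
    int alive_after = n_servers - n_down - 1;  /* Up servers minus the leaver. */

    return alive_after > new_size / 2;
}

int
main(void)
{
    /* The scenarios exercised by the test above. */
    printf("5 servers, 2 down: %s\n",   /* Server 3 cannot leave yet. */
           leave_has_quorum(5, 2) ? "can leave" : "cannot leave");
    printf("4 servers, 1 down: %s\n",   /* Server 2 can leave. */
           leave_has_quorum(4, 1) ? "can leave" : "cannot leave");
    printf("3 servers, 1 down: %s\n",   /* Servers 4 and 5 cannot leave. */
           leave_has_quorum(3, 1) ? "can leave" : "cannot leave");
    return 0;
}

Under these assumptions the program prints "cannot leave" for the 5/2 and 3/1
cases and "can leave" for the 4/1 case, which is the behavior the test waits
for at each step.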