From 0f921ac9126520d8a3985132d4c68e0530a696dd Mon Sep 17 00:00:00 2001
From: Ilya Maximets
Date: Sun, 11 Aug 2024 22:28:52 +0200
Subject: [PATCH] raft: Fix cluster breakdown on leaving with some nodes down.

WIP

Signed-off-by: Ilya Maximets
---
 ovsdb/raft.c           |  37 ++++++++---
 tests/ovsdb-cluster.at | 143 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 170 insertions(+), 10 deletions(-)

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 9c3c351b5be..2b541c677de 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -436,7 +436,7 @@ raft_alloc(void)
     hmap_node_nullify(&raft->hmap_node);
     hmap_init(&raft->servers);
     raft->log_start = raft->log_end = 1;
-    raft->role = RAFT_FOLLOWER;
+    raft->role = RAFT_FOLLOWER; VLOG_INFO("%s: FOLLOWER", __func__);
     sset_init(&raft->remote_addresses);
     raft->join_timeout = LLONG_MAX;
     ovs_list_init(&raft->waiters);
@@ -1860,10 +1860,6 @@ raft_start_election(struct raft *raft, bool is_prevote,
     /* Leadership transfer doesn't use pre-vote. */
     ovs_assert(!is_prevote || !leadership_transfer);
 
-    if (raft->leaving) {
-        return;
-    }
-
     struct raft_server *me = raft_find_server(raft, &raft->sid);
     if (!me) {
         return;
@@ -1876,7 +1872,7 @@ raft_start_election(struct raft *raft, bool is_prevote,
     ovs_assert(raft->role != RAFT_LEADER);
 
     raft->leader_sid = UUID_ZERO;
-    raft->role = RAFT_CANDIDATE;
+    raft->role = RAFT_CANDIDATE; VLOG_INFO("%s: CANDIDATE", __func__);
     raft->prevote_passed = !is_prevote;
 
     if (is_prevote || leadership_transfer) {
@@ -1990,6 +1986,12 @@ raft_conn_should_stay_open(struct raft *raft, struct raft_conn *conn)
         return true;
     }
 
+    /* Keep the connection until we send a RemoveServerReply. */
+    if (raft->remove_server
+        && uuid_equals(&conn->sid, &raft->remove_server->sid)) {
+        return true;
+    }
+
     /* We have joined the cluster. If we did that "recently", then there is a
      * chance that we do not have the most recent server configuration log
      * entry. If so, it's a waste to disconnect from the servers that were in
@@ -2116,6 +2118,8 @@ raft_run(struct raft *raft)
                 count ++;
             }
         }
+        VLOG_INFO("Replied: %d out of %d servers",
+                  count, (int) hmap_count(&raft->servers));
         if (count >= hmap_count(&raft->servers) / 2) {
             HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
                 server->replied = false;
@@ -2132,6 +2136,11 @@ raft_run(struct raft *raft)
     }
 
     if (raft->leaving && time_msec() >= raft->leave_timeout) {
+        if (raft->role == RAFT_LEADER) {
+            raft_transfer_leadership(raft,
+                                     "this server is leaving the cluster");
+            raft_become_follower(raft);
+        }
         raft_send_remove_server_requests(raft);
     }
 
@@ -2440,7 +2449,7 @@ raft_command_execute__(struct raft *raft, const struct json *data,
                        const struct json *servers, uint64_t election_timer,
                        const struct uuid *prereq, struct uuid *result)
 {
-    if (raft->joining || raft->leaving || raft->left || raft->failed) {
+    if (raft->joining || raft->left || raft->failed) {
        return raft_command_create_completed(RAFT_CMD_SHUTDOWN);
     }
 
@@ -2778,7 +2787,7 @@ raft_become_follower(struct raft *raft)
         return;
     }
 
-    raft->role = RAFT_FOLLOWER;
+    raft->role = RAFT_FOLLOWER; VLOG_INFO("%s: FOLLOWER", __func__);
     raft_reset_election_timer(raft);
 
     /* Notify clients about lost leadership.
@@ -2906,7 +2915,7 @@ raft_become_leader(struct raft *raft)
               raft->n_votes, hmap_count(&raft->servers));
 
     ovs_assert(raft->role != RAFT_LEADER);
-    raft->role = RAFT_LEADER;
+    raft->role = RAFT_LEADER; VLOG_INFO("%s: LEADER", __func__);
     raft->election_won = time_msec();
     raft_set_leader(raft, &raft->sid);
     raft_reset_election_timer(raft);
@@ -3367,7 +3376,7 @@ raft_update_leader(struct raft *raft, const struct uuid *sid)
          * least as large as the candidate's current term, then the
          * candidate recognizes the leader as legitimate and returns to
          * follower state. */
-        raft->role = RAFT_FOLLOWER;
+        raft->role = RAFT_FOLLOWER; VLOG_INFO("%s: FOLLOWER", __func__);
     }
     return true;
 }
@@ -4143,6 +4152,14 @@ raft_handle_remove_server_request(struct raft *raft,
         return;
     }
 
+    /* Check for a server being removed right now. */
+    if (raft->remove_server
+        && uuid_equals(&rq->sid, &raft->remove_server->sid)) {
+        raft_send_remove_server_reply(raft, rq,
+                                      false, RAFT_SERVER_IN_PROGRESS);
+        return;
+    }
+
     /* If the server isn't configured, report that. */
     target = raft_find_server(raft, &rq->sid);
     if (!target) {
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
index 9d8b4d06a4a..19925ac4478 100644
--- a/tests/ovsdb-cluster.at
+++ b/tests/ovsdb-cluster.at
@@ -578,6 +578,149 @@ for i in $(seq $n); do
     OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
 done
 
+AT_CLEANUP
+
+AT_SETUP([OVSDB cluster - leaving the cluster with some servers down])
+AT_KEYWORDS([ovsdb server negative unix cluster leave qwe])
+
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db \
+          $top_srcdir/vswitchd/vswitch.ovsschema unix:s1.raft], [0], [], [stderr])
+schema_name=$(ovsdb-tool schema-name $top_srcdir/vswitchd/vswitch.ovsschema)
+for i in 2 3 4 5; do
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
+done
+
+on_exit 'kill $(cat *.pid)'
+on_exit "
+  for i in \$(ls $(pwd)/s[[0-5]]); do
+    ovs-appctl --timeout 1 -t \$i cluster/status $schema_name;
+  done
+"
+dnl Starting all the servers.
+for i in 1 2 3 4 5; do
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+              --detach --no-chdir --log-file=s$i.log \
+              --pidfile=s$i.pid --unixctl=s$i \
+              --remote=punix:s$i.ovsdb s$i.db])
+done
+
+dnl Make sure that all servers joined the cluster.
+for i in 1 2 3 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+dnl Make sure the cluster is operational.
+m4_define([DB_REMOTE], [unix:s1.ovsdb,unix:s2.ovsdb,unix:s3.ovsdb,unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait init])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-1], [0], [ignore])
+
+dnl Stop servers 1 and 2.
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s1], [s1.pid])
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s2], [s2.pid])
+
+dnl Make sure that all remaining servers are functional as a cluster.
+for i in 3 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s3.ovsdb,unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-2], [0], [ignore])
+
+dnl Servers 1 and 2 in a cluster of 5 are down, 3 are still alive.
+dnl Server 3 can't leave, because the NEW configuration will be a cluster of
+dnl 4 with 2 servers down and it doesn't have a quorum. Try it.
+AT_CHECK([ovs-appctl -t $(pwd)/s3 cluster/leave $schema_name])
+OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s3 cluster/status $schema_name \
+                   | grep -q leaving])
+dnl Make sure that all the servers, including server 3, are still part
+dnl of the cluster and know about server 3.
+for i in 3 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+    OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s$i cluster/status $schema_name \
+                       | grep -q '\'])
+done
+
+dnl Make sure the cluster is still operational and server 3 can commit.
+m4_define([DB_REMOTE], [unix:s3.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-3], [0], [ignore])
+
+dnl Now bring back server 2. This should allow server 3 to leave.
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+          --detach --no-chdir --log-file=s2.log \
+          --pidfile=s2.pid --unixctl=s2 \
+          --remote=punix:s2.ovsdb s2.db])
+
+dnl Wait for server 3 to actually leave and stop the server.
+AT_CHECK([ovsdb_client_wait unix:s3.ovsdb $schema_name removed])
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s3], [s3.pid])
+
+dnl Make sure that all remaining servers are functional as a cluster.
+for i in 2 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s2.ovsdb,unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-4], [0], [ignore])
+
+
+dnl Now we have a cluster of 4 servers (1, 2, 4, 5) with 1 server down.
+dnl Server 2 should be able to leave, because the NEW configuration will
+dnl be a cluster of 3 servers with 1 down and it has a quorum.
+AT_CHECK([ovs-appctl -t $(pwd)/s2 cluster/leave $schema_name])
+dnl Wait for server 2 to actually leave and stop the server.
+AT_CHECK([ovsdb_client_wait unix:s2.ovsdb $schema_name removed])
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s2], [s2.pid])
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-5], [0], [ignore])
+
+dnl Now we have a cluster of 3 servers (1, 4, 5) with 1 server down.
+dnl None of the alive servers can leave, because the NEW configuration
+dnl will be a cluster of 2 with 1 server down and it has no quorum.
+dnl Request to leave anyway.
+for i in 4 5; do
+    AT_CHECK([ovs-appctl -t $(pwd)/s$i cluster/leave $schema_name])
+    OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s$i cluster/status $schema_name \
+                       | grep -q leaving])
+done
+
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-6], [0], [ignore])
+
+dnl Now bring back the first server.
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+          --detach --no-chdir --log-file=s1.log \
+          --pidfile=s1.pid --unixctl=s1 \
+          --remote=punix:s1.ovsdb s1.db])
+
+dnl Now it should be possible for all the other servers to leave, so we
+dnl should end up with a single-node cluster that consists of server 1.
+for i in 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name removed])
+done
+for i in 4 5; do
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
+done
+
+dnl Wait for the first server to become a leader of a single-node cluster.
+OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s1 cluster/status $schema_name \
+                   | grep -q 'Role: leader'])
+dnl Check that the database is operational and the data is still there.
+AT_CHECK([ovs-vsctl --db="unix:s1.ovsdb" --no-wait create QoS type=test-7], [0], [ignore])
+AT_CHECK([ovs-vsctl --db="unix:s1.ovsdb" --no-wait \
+            --columns=type --bare list QoS | sort], [0], [dnl
+test-1
+test-2
+test-3
+test-4
+test-5
+test-6
+test-7
+])
+
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s1], [s1.pid])
 AT_CLEANUP
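
Editor's note (illustration only, not part of the patch): the test comments
above keep applying one quorum rule: a server may leave only if the NEW
configuration, i.e. the cluster minus the leaving server, still has a strict
majority of its members up. The stand-alone C program below is a minimal
sketch of just that arithmetic; the helper name leave_has_quorum() and the
program itself are hypothetical and do not exist in ovsdb/raft.c.

#include <stdbool.h>
#include <stdio.h>

/* Returns true if one live server may leave a cluster of 'n_servers' of
 * which 'n_down' are currently down, i.e. whether the servers that remain
 * up form a strict majority of the post-leave configuration. */
static bool
leave_has_quorum(int n_servers, int n_down)
{
    int new_size = n_servers - 1;              /* Configuration after leaving. */
    int alive_after = n_servers - n_down - 1;  /* Up servers minus the leaver. */

    return alive_after > new_size / 2;
}

int
main(void)
{
    /* The scenarios exercised by the test above. */
    printf("5 servers, 2 down: %s\n",   /* Server 3 cannot leave yet. */
           leave_has_quorum(5, 2) ? "can leave" : "cannot leave");
    printf("4 servers, 1 down: %s\n",   /* Server 2 can leave. */
           leave_has_quorum(4, 1) ? "can leave" : "cannot leave");
    printf("3 servers, 1 down: %s\n",   /* Servers 4 and 5 cannot leave. */
           leave_has_quorum(3, 1) ? "can leave" : "cannot leave");
    return 0;
}

Under these assumptions the program prints "cannot leave" for the 5/2 and 3/1
cases and "can leave" for the 4/1 case, which is the behavior the test waits
for at each step.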