raft: Fix cluster breakdown on leaving with some nodes down. WIP
Signed-off-by: Ilya Maximets <[email protected]>
igsilya committed Aug 11, 2024
1 parent 365d415 commit 0f921ac
Showing 2 changed files with 170 additions and 10 deletions.
37 changes: 27 additions & 10 deletions ovsdb/raft.c
@@ -436,7 +436,7 @@ raft_alloc(void)
    hmap_node_nullify(&raft->hmap_node);
    hmap_init(&raft->servers);
    raft->log_start = raft->log_end = 1;
-    raft->role = RAFT_FOLLOWER;
+    raft->role = RAFT_FOLLOWER; VLOG_INFO("%s: FOLLOWER", __func__);
    sset_init(&raft->remote_addresses);
    raft->join_timeout = LLONG_MAX;
    ovs_list_init(&raft->waiters);
@@ -1860,10 +1860,6 @@ raft_start_election(struct raft *raft, bool is_prevote,
    /* Leadership transfer doesn't use pre-vote. */
    ovs_assert(!is_prevote || !leadership_transfer);

-    if (raft->leaving) {
-        return;
-    }
-
    struct raft_server *me = raft_find_server(raft, &raft->sid);
    if (!me) {
        return;
@@ -1876,7 +1872,7 @@
    ovs_assert(raft->role != RAFT_LEADER);

    raft->leader_sid = UUID_ZERO;
-    raft->role = RAFT_CANDIDATE;
+    raft->role = RAFT_CANDIDATE; VLOG_INFO("%s: CANDIDATE", __func__);
    raft->prevote_passed = !is_prevote;

    if (is_prevote || leadership_transfer) {
@@ -1990,6 +1986,12 @@ raft_conn_should_stay_open(struct raft *raft, struct raft_conn *conn)
        return true;
    }

+    /* Keep the connection until we send a RemoveServerReply. */
+    if (raft->remove_server
+        && uuid_equals(&conn->sid, &raft->remove_server->sid)) {
+        return true;
+    }
+
    /* We have joined the cluster. If we did that "recently", then there is a
     * chance that we do not have the most recent server configuration log
     * entry. If so, it's a waste to disconnect from the servers that were in
@@ -2116,6 +2118,8 @@ raft_run(struct raft *raft)
                count ++;
            }
        }
+        VLOG_INFO("Replied: %d out of %d servers",
+                  count, (int) hmap_count(&raft->servers));
        if (count >= hmap_count(&raft->servers) / 2) {
            HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
                server->replied = false;
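
A note on the arithmetic in the check above: with integer division,
"count >= n / 2" accepts exactly half of the configured servers, which forms
a majority once the leader's own vote is added. Below is a minimal standalone
sketch of that rule (the helper name is made up, and the assumption that
count excludes the leader itself is ours; the surrounding code is not shown
in this hunk). The two calls use the cluster sizes from the test added below.

#include <stdbool.h>
#include <stdio.h>

/* Sketch: 'n_replied' counts servers other than the leader that replied;
 * together with the leader itself, "n_replied >= n_servers / 2" means a
 * strict majority of 'n_servers' is reachable. */
static bool
heard_from_majority(unsigned int n_replied, unsigned int n_servers)
{
    return n_replied >= n_servers / 2;
}

int
main(void)
{
    /* 5-server cluster, 2 servers down: 2 replies + the leader = 3 of 5,
     * a majority.  4-server cluster, 2 down: 1 reply + the leader = 2 of 4,
     * not a strict majority. */
    printf("%d\n", heard_from_majority(2, 5));  /* 1 */
    printf("%d\n", heard_from_majority(1, 4));  /* 0 */
    return 0;
}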
@@ -2132,6 +2136,11 @@
    }

    if (raft->leaving && time_msec() >= raft->leave_timeout) {
+        if (raft->role == RAFT_LEADER) {
+            raft_transfer_leadership(raft,
+                                     "this server is leaving the cluster");
+            raft_become_follower(raft);
+        }
        raft_send_remove_server_requests(raft);
    }

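The hunk above makes a leaving leader hand leadership off and step down
before (re)sending its RemoveServerRequest once the leave timeout fires.
A self-contained sketch of that control flow follows, with made-up types
and stubbed helpers (this is not the OVS API, only the shape of the change):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum role { FOLLOWER, CANDIDATE, LEADER };

struct node {
    enum role role;
    bool leaving;
    int64_t leave_timeout;      /* When to (re)send the request. */
};

/* Stubs standing in for the real Raft operations. */
static void
transfer_leadership(struct node *n, const char *reason)
{
    (void) n;
    printf("transferring leadership: %s\n", reason);
}

static void
become_follower(struct node *n)
{
    n->role = FOLLOWER;
}

static void
send_remove_server_requests(struct node *n)
{
    (void) n;
    printf("sending RemoveServerRequest\n");
}

/* One periodic tick, mirroring the shape of the raft_run() change: a
 * leaving leader steps down first, then the removal request goes out
 * (and is retried on the next timeout if needed). */
static void
leave_tick(struct node *n, int64_t now)
{
    if (n->leaving && now >= n->leave_timeout) {
        if (n->role == LEADER) {
            transfer_leadership(n, "this server is leaving the cluster");
            become_follower(n);
        }
        send_remove_server_requests(n);
    }
}

int
main(void)
{
    struct node n = { LEADER, true, 0 };
    leave_tick(&n, 1);   /* Steps down, then sends the request. */
    return 0;
}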
@@ -2440,7 +2449,7 @@ raft_command_execute__(struct raft *raft, const struct json *data,
                       const struct json *servers, uint64_t election_timer,
                       const struct uuid *prereq, struct uuid *result)
{
-    if (raft->joining || raft->leaving || raft->left || raft->failed) {
+    if (raft->joining || raft->left || raft->failed) {
        return raft_command_create_completed(RAFT_CMD_SHUTDOWN);
    }

@@ -2778,7 +2787,7 @@ raft_become_follower(struct raft *raft)
        return;
    }

-    raft->role = RAFT_FOLLOWER;
+    raft->role = RAFT_FOLLOWER; VLOG_INFO("%s: FOLLOWER", __func__);
    raft_reset_election_timer(raft);

    /* Notify clients about lost leadership.
@@ -2906,7 +2915,7 @@ raft_become_leader(struct raft *raft)
             raft->n_votes, hmap_count(&raft->servers));

    ovs_assert(raft->role != RAFT_LEADER);
-    raft->role = RAFT_LEADER;
+    raft->role = RAFT_LEADER; VLOG_INFO("%s: LEADER", __func__);
    raft->election_won = time_msec();
    raft_set_leader(raft, &raft->sid);
    raft_reset_election_timer(raft);
@@ -3367,7 +3376,7 @@ raft_update_leader(struct raft *raft, const struct uuid *sid)
         * least as large as the candidate's current term, then the
         * candidate recognizes the leader as legitimate and returns to
         * follower state. */
-        raft->role = RAFT_FOLLOWER;
+        raft->role = RAFT_FOLLOWER; VLOG_INFO("%s: FOLLOWER", __func__);
    }
    return true;
}
@@ -4143,6 +4152,14 @@ raft_handle_remove_server_request(struct raft *raft,
        return;
    }

+    /* Check for a server being removed right now. */
+    if (raft->remove_server
+        && uuid_equals(&rq->sid, &raft->remove_server->sid)) {
+        raft_send_remove_server_reply(raft, rq,
+                                      false, RAFT_SERVER_IN_PROGRESS);
+        return;
+    }
+
    /* If the server isn't configured, report that. */
    target = raft_find_server(raft, &rq->sid);
    if (!target) {
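
The hunk above makes the leader answer a duplicate RemoveServerRequest for a
server whose removal is already pending with RAFT_SERVER_IN_PROGRESS rather
than starting a second removal. A self-contained sketch of that idempotency
guard, with simplified stand-in types (these are not the OVS structures):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical, simplified stand-ins for the OVS types. */
struct uuid { unsigned int parts[4]; };

static bool
uuid_equals(const struct uuid *a, const struct uuid *b)
{
    return !memcmp(a, b, sizeof *a);
}

struct pending_removal { struct uuid sid; };

/* If a removal for this server is already in flight, report "in progress"
 * instead of starting a second one -- the shape of the
 * raft_handle_remove_server_request() guard above. */
static const char *
handle_remove(const struct pending_removal *pending, const struct uuid *sid)
{
    if (pending && uuid_equals(&pending->sid, sid)) {
        return "in progress";
    }
    return "start removal";
}

int
main(void)
{
    struct uuid s3 = { { 3, 0, 0, 0 } };
    struct pending_removal pending = { s3 };

    puts(handle_remove(&pending, &s3));  /* in progress */
    puts(handle_remove(NULL, &s3));      /* start removal */
    return 0;
}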
143 changes: 143 additions & 0 deletions tests/ovsdb-cluster.at
@@ -578,6 +578,149 @@ for i in $(seq $n); do
    OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
done

AT_CLEANUP
+
+AT_SETUP([OVSDB cluster - leaving the cluster with some servers down])
+AT_KEYWORDS([ovsdb server negative unix cluster leave qwe])
+
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db \
+            $top_srcdir/vswitchd/vswitch.ovsschema unix:s1.raft], [0], [], [stderr])
+schema_name=$(ovsdb-tool schema-name $top_srcdir/vswitchd/vswitch.ovsschema)
+for i in 2 3 4 5; do
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
+done
+
+on_exit 'kill $(cat *.pid)'
+on_exit "
+  for i in \$(ls $(pwd)/s[[0-5]]); do
+    ovs-appctl --timeout 1 -t \$i cluster/status $schema_name;
+  done
+"
+dnl Starting all the servers.
+for i in 1 2 3 4 5; do
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+              --detach --no-chdir --log-file=s$i.log \
+              --pidfile=s$i.pid --unixctl=s$i \
+              --remote=punix:s$i.ovsdb s$i.db])
+done
+
+dnl Make sure that all servers joined the cluster.
+for i in 1 2 3 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+dnl Make sure the cluster is operational.
+m4_define([DB_REMOTE], [unix:s1.ovsdb,unix:s2.ovsdb,unix:s3.ovsdb,unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait init])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-1], [0], [ignore])
+
+dnl Stop servers 1 and 2.
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s1], [s1.pid])
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s2], [s2.pid])
+
+dnl Make sure that all remaining servers are functional as a cluster.
+for i in 3 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s3.ovsdb,unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-2], [0], [ignore])
+
+dnl Servers 1 and 2 in a cluster of 5 are down, 3 are still alive.
+dnl Server 3 can't leave, because the NEW configuration will be a cluster of
+dnl 4 with 2 servers down, and that doesn't have a quorum.  Try it.
+AT_CHECK([ovs-appctl -t $(pwd)/s3 cluster/leave $schema_name])
+OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s3 cluster/status $schema_name \
+                  | grep -q leaving])
+dnl Make sure that all the servers, including server 3, are still part
+dnl of the cluster and know about server 3.
+for i in 3 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+    OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s$i cluster/status $schema_name \
+                      | grep -q '\<s3\>'])
+done
+
+dnl Make sure the cluster is still operational and server 3 can commit.
+m4_define([DB_REMOTE], [unix:s3.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-3], [0], [ignore])
+
+dnl Now bring back server 2.  This should allow server 3 to leave.
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+          --detach --no-chdir --log-file=s2.log \
+          --pidfile=s2.pid --unixctl=s2 \
+          --remote=punix:s2.ovsdb s2.db])
+
+dnl Wait for server 3 to actually leave and stop the server.
+AT_CHECK([ovsdb_client_wait unix:s3.ovsdb $schema_name removed])
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s3], [s3.pid])
+
+dnl Make sure that all remaining servers are functional as a cluster.
+for i in 2 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s2.ovsdb,unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-4], [0], [ignore])
+
+
+dnl Now we have a cluster of 4 servers (1, 2, 4, 5) with 1 server down.
+dnl Server 2 should be able to leave, because the NEW configuration will
+dnl be a cluster of 3 servers with 1 being down and it has a quorum.
+AT_CHECK([ovs-appctl -t $(pwd)/s2 cluster/leave $schema_name])
+dnl Wait for server 2 to actually leave and stop the server.
+AT_CHECK([ovsdb_client_wait unix:s2.ovsdb $schema_name removed])
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s2], [s2.pid])
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-5], [0], [ignore])
+
+dnl Now we have a cluster of 3 servers (1, 4, 5) with 1 server down.
+dnl None of the alive servers can leave, because the NEW configuration
+dnl will be a cluster of 2 with 1 server down and it has no quorum.
+dnl Request to leave anyway.
+for i in 4 5; do
+    AT_CHECK([ovs-appctl -t $(pwd)/s$i cluster/leave $schema_name])
+    OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s$i cluster/status $schema_name \
+                      | grep -q leaving])
+done
+
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait create QoS type=test-6], [0], [ignore])
+
+dnl Now bring back the first server.
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+          --detach --no-chdir --log-file=s1.log \
+          --pidfile=s1.pid --unixctl=s1 \
+          --remote=punix:s1.ovsdb s1.db])
+
+dnl Now it should be possible for all the other servers to leave, so we
+dnl should end up with a single-node cluster that consists of server 1.
+for i in 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name removed])
+done
+for i in 4 5; do
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
+done
+
+dnl Wait for the first server to become the leader of a single-node cluster.
+OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s1 cluster/status $schema_name \
+                  | grep -q 'Role: leader'])
+dnl Check that the database is operational and the data is still in there.
+AT_CHECK([ovs-vsctl --db="unix:s1.ovsdb" --no-wait create QoS type=test-7], [0], [ignore])
+AT_CHECK([ovs-vsctl --db="unix:s1.ovsdb" --no-wait \
+            --columns=type --bare list QoS | sort], [0], [dnl
+test-1
+test-2
+test-3
+test-4
+test-5
+test-6
+test-7
+])
+
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s1], [s1.pid])
+AT_CLEANUP


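The leave scenarios in the test above all reduce to the same arithmetic: a
leave can complete only if the NEW, one-server-smaller configuration still
has a majority of live servers. Below is a standalone walkthrough of the
three scenarios (an illustration of the reasoning in the dnl comments, not
OVS code; every name in it is made up for the example):

#include <stdbool.h>
#include <stdio.h>

/* Majority of a cluster of 'n' servers: strictly more than half. */
static bool
has_majority(int alive, int n)
{
    return alive > n / 2;
}

int
main(void)
{
    struct { int n, down; const char *what; } cases[] = {
        { 5, 2, "s3 leaving a 5-node cluster with 2 down" },
        { 4, 1, "s2 leaving a 4-node cluster with 1 down" },
        { 3, 1, "s4/s5 leaving a 3-node cluster with 1 down" },
    };

    for (int i = 0; i < 3; i++) {
        int new_n = cases[i].n - 1;                  /* Config after the leave. */
        int alive = cases[i].n - cases[i].down - 1;  /* Live servers left in it. */
        printf("%s: %s\n", cases[i].what,
               has_majority(alive, new_n)
               ? "can leave" : "stuck until a server comes back");
    }
    return 0;
}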