From 71e09fe4c760c404251846e7ea8160340e6d215a Mon Sep 17 00:00:00 2001
From: Damien Ciabrini
Date: Tue, 19 Nov 2024 14:21:03 +0100
Subject: [PATCH] Improve the failover of galera service

When a galera node is in the process of shutting down (e.g. during a
rolling restart caused by a minor update), the node is unable to serve
SQL queries; however, it is still connected to clients. This confuses
clients, which get an unexpected SQL status [1] that prevents them from
retrying their queries, causing unexpected errors down the road.

Improve the pod stop pre-hook to fail over the active endpoint to
another pod prior to shutting down the galera server, and kill
connected clients to force them to reconnect to the new active
endpoint. At this stage, the galera server can be safely shut down,
as no client will see its WSREP state update.

Also update the failover script:
1) when no endpoint is available, ensure no traffic is going through
   any pod.
2) do not trigger an endpoint failover as long as the current endpoint
   targets a galera node that is still part of the primary partition
   (i.e. it is still able to serve traffic).
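
For reference, galera invokes the wsrep notify script with a status and
optional cluster metadata. A representative call looks as follows (values
illustrative; each --members entry is formatted as
<uuid>/<node name>/<incoming address>, hence the 'cut -d/ -f2' below to
extract node names):

    mysql_wsrep_notify.sh --status Synced \
        --uuid 6b1cb2c7-0000-0000-0000-000000000000 --primary yes --index 0 \
        --members 'uuid-0/galera-0/10.0.0.10:3306,uuid-1/galera-1/10.0.0.11:3306'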

[1] 'WSREP has not yet prepared node for application use'

Jira: OSPRH-11488
---
 templates/galera/bin/mysql_shutdown.sh     |  13 ++-
 templates/galera/bin/mysql_wsrep_notify.sh | 112 +++++++++++++++++----
 2 files changed, 102 insertions(+), 23 deletions(-)
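
Note: the client eviction in mysql_shutdown.sh relies on the fact that
in information_schema.processlist only TCP clients report a host of the
form "host:port"; system threads and socket-local sessions carry no port,
so the '%:%' pattern leaves them untouched. An illustrative run, assuming
two connected clients with thread ids 42 and 57:

    CLIENTS=$(mysql -uroot -p${DB_ROOT_PASSWORD} -nN \
        -e "select id from information_schema.processlist where host like '%:%';")
    echo -n "$CLIENTS" | tr '\n' ','   # yields "42,57", passed to mysqladmin kill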

diff --git a/templates/galera/bin/mysql_shutdown.sh b/templates/galera/bin/mysql_shutdown.sh
index 426e1a0c..ce9cf834 100755
--- a/templates/galera/bin/mysql_shutdown.sh
+++ b/templates/galera/bin/mysql_shutdown.sh
@@ -44,5 +44,16 @@ if curl -s --cacert ${CACERT} --header "Content-Type:application/json" --header
     done
 fi
 
-log "Shutting down local galera node"
+log "Initiating orchestrated shutdown of the local galera node"
+
+log "Fail over service to another available galera node"
+bash $(dirname $0)/mysql_wsrep_notify.sh --status failover
+
+log "Close all active connections to this local galera node"
+# filter out system and localhost connections, only consider clients with a port in the host field
+# from that point, clients will automatically reconnect to another node
+CLIENTS=$(mysql -uroot -p${DB_ROOT_PASSWORD} -nN -e "select id from information_schema.processlist where host like '%:%';")
+echo -n "$CLIENTS" | tr '\n' ',' | xargs mysqladmin -uroot -p${DB_ROOT_PASSWORD} kill
+
+log "Shut down local server"
 mysqladmin -uroot -p"${DB_ROOT_PASSWORD}" shutdown
diff --git a/templates/galera/bin/mysql_wsrep_notify.sh b/templates/galera/bin/mysql_wsrep_notify.sh
index 374e8260..4658f419 100755
--- a/templates/galera/bin/mysql_wsrep_notify.sh
+++ b/templates/galera/bin/mysql_wsrep_notify.sh
@@ -32,18 +32,26 @@ function log_error() {
 function mysql_get_status {
     local name=$1
     mysql -nNE -uroot -p"${DB_ROOT_PASSWORD}" -e "show status like '${name}';" | tail -1
-    if [ $? != 0 ]; then
-        log_error "could not get value of mysql variable '${name}' (rc=$?)"
-        return 1
-    fi
+    local rc=$?
+    [ $rc = 0 ] || log_error "could not get value of mysql variable '${name}' (rc=$rc)"
+}
+
+function mysql_get_members {
+    mysql -nN -uroot -p"${DB_ROOT_PASSWORD}" -e "select node_name from mysql.wsrep_cluster_members;"
+    local rc=$?
+    [ $rc = 0 ] || log_error "could not get cluster members from mysql (rc=$rc)"
 }
 
-# Refresh environment variables with the latest WSREP state from mysql
+# When optional script parameters are not provided, set up the environment
+# variables with the latest WSREP state retrieved from mysql
 function mysql_probe_state {
-    UUID=$(mysql_get_status wsrep_gcomm_uuid)
-    PARTITION=$(mysql_get_status wsrep_cluster_status)
-    INDEX=$(mysql_get_status wsrep_local_index)
-    SIZE=$(mysql_get_status wsrep_cluster_size)
+    [ "$1" = "reprobe" ] && unset UUID PARTITION INDEX SIZE MEMBERS
+    : ${UUID=$(mysql_get_status wsrep_gcomm_uuid)}
+    : ${PARTITION=$(mysql_get_status wsrep_cluster_status)}
+    : ${INDEX=$(mysql_get_status wsrep_local_index)}
+    : ${SIZE=$(mysql_get_status wsrep_cluster_size)}
+    : ${MEMBERS=$(mysql_get_members)}
+    [ -n "${UUID}" -a -n "${PARTITION}" -a -n "${INDEX}" -a -n "${SIZE}" -a -n "${MEMBERS}" ]
 }
 
 # REST API call to the k8s API server
@@ -83,12 +91,10 @@ function api_server {
 # Update the service's active endpoint
 # (parse JSON with python3 as we don't have jq in the container image)
 function service_endpoint {
-    local endpoint=$1
-    if [ -n "${endpoint}" ]; then
-        python3 -c 'import json,sys;s=json.load(sys.stdin);s["spec"]["selector"]["statefulset.kubernetes.io/pod-name"]="'${endpoint}'";print(json.dumps(s,indent=2))'
-    else
-        python3 -c 'import json,sys;s=json.load(sys.stdin);s["spec"]["selector"].pop("statefulset.kubernetes.io/pod-name", None);print(json.dumps(s,indent=2))'
-    fi
+    local endpoint="$1"
+    # note: an empty endpoint means "block incoming traffic", so the selector must still
+    # be present, otherwise k8s would balance incoming traffic to _any_ available pod.
+    python3 -c 'import json,sys;s=json.load(sys.stdin);s["spec"]["selector"]["statefulset.kubernetes.io/pod-name"]="'${endpoint}'";print(json.dumps(s,indent=2))'
     [ $? == 0 ] || log_error "Could not parse json endpoint (rc=$?)"
 }
 
@@ -123,7 +129,7 @@ function retry {
         retries=$((retries - 1))
         # reprobe mysql state now, as if the cluster state changed since
         # the start of this script, we might not need to retry the action
-        mysql_probe_state
+        mysql_probe_state reprobe
     done
     if [ $rc -ne 0 ]; then
         log_error "Could not run action after ${RETRIES} tries. Stop retrying."
@@ -149,6 +155,11 @@ function reconfigure_service_endpoint {
     CURRENT_ENDPOINT=$(echo "$CURRENT_SVC" | parse_output '["spec"]["selector"].get("statefulset.kubernetes.io/pod-name","")')
     [ $? == 0 ] || return 1
 
+    # do not reconfigure the endpoint if unnecessary, to avoid client disconnections
+    if [ -n "${CURRENT_ENDPOINT}" ] && echo "$MEMBERS" | grep -q "^${CURRENT_ENDPOINT}\$"; then
+        log "Active endpoint ${CURRENT_ENDPOINT} is still part of the primary partition. Nothing to be done."
+        return 0
+    fi
     if [ "${CURRENT_ENDPOINT}" == "${PODNAME}" ]; then
         log "Node ${PODNAME} is currently the active endpoint for service ${SERVICE}. Nothing to be done."
         return 0
@@ -164,6 +175,39 @@ function reconfigure_service_endpoint {
     return 0
 }
 
+## Fail over to another node if we are the current Active endpoint
+function failover_service_endpoint {
+    if [ $PARTITION != "Primary" ]; then
+        log "Node ${PODNAME} is not in the Primary partition. Nothing to be done."
+        return 0
+    fi
+
+    CURRENT_SVC=$(api_server GET "$SERVICE")
+    local rc=$?
+    [ $rc == 0 ] || return $rc
+
+    CURRENT_ENDPOINT=$(echo "$CURRENT_SVC" | parse_output '["spec"]["selector"].get("statefulset.kubernetes.io/pod-name","")')
+    [ $? == 0 ] || return 1
+    if [ "${CURRENT_ENDPOINT}" != "${PODNAME}" ]; then
+        log "Node ${PODNAME} is not the active endpoint. Nothing to be done."
+        return 0
+    fi
+    # select the first available node in the primary partition to be the failover endpoint
+    NEW_ENDPOINT=$(echo "$MEMBERS" | grep -v "${PODNAME}" | head -1)
+    if [ -z "${NEW_ENDPOINT}" ]; then
+        log "No other node available to become the active endpoint."
+    fi
+
+    NEW_SVC=$(echo "$CURRENT_SVC" | service_endpoint "$NEW_ENDPOINT")
+    [ $? == 0 ] || return 1
+
+    log "Configuring a new active endpoint for service ${SERVICE}: '${CURRENT_ENDPOINT}' -> '${NEW_ENDPOINT}'"
+    UPDATE_RESULT=$(echo "$NEW_SVC" | api_server PUT "$SERVICE")
+    [ $? == 0 ] || return 1
+
+    return 0
+}
+
 ## Change the Active endpoint from the service
 function remove_service_endpoint {
     CURRENT_SVC=$(api_server GET "$SERVICE")
@@ -194,17 +238,29 @@
 log "called with args: $*"
 
 # Galera always calls script with --status argument
-# All other arguments (uuid,partition,index...) are optional,
-# so get those values by probing mysql directly
-STATUS=""
-PARTITION=""
-INDEX=""
+# All other optional arguments (uuid,partition,index...):
+#   UUID: cluster's current UUID
+#   MEMBERS: galera nodes connected to the cluster
+#   SIZE: number of nodes in the cluster
+#   INDEX: member index in the cluster
+#   PARTITION: cluster partition we're in (Primary, Non-primary)
 while [ $# -gt 0 ]; do
     case $1 in
         --status)
           STATUS=$2
           shift;;
-        --uuid|--members|--primary|--index)
+        --members)
+          MEMBERS=$(echo "$2" | tr ',' '\n' | cut -d/ -f2)
+          SIZE=$(echo "$MEMBERS" | wc -l)
+          shift;;
+        --primary)
+          [ "$2" = "yes" ] && PARTITION="Primary"
+          [ "$2" = "no" ] && PARTITION="Non-primary"
+          shift;;
+        --index)
+          INDEX=$2
+          shift;;
+        --uuid)
          shift;;
     esac
     shift
@@ -215,6 +271,15 @@ if [ -z "${STATUS}" ]; then
     exit 1
 fi
 
+# Condition: ask for a failover. This should be called when mysql is running
+if echo "${STATUS}" | grep -i -q -e 'failover'; then
+    mysql_probe_state
+    if [ $? != 0 ]; then
+        log_error "Could not probe missing mysql information. Aborting"
+    fi
+    retry "failover_service_endpoint"
+fi
+
 # Condition: disconnecting -> remove oneself from endpoint if Active
 if echo "${STATUS}" | grep -i -q -e 'disconnecting'; then
     retry "remove_service_endpoint"
@@ -228,6 +293,9 @@ fi
 
 # At this point mysql is started, query missing arguments
 mysql_probe_state
+if [ $? != 0 ]; then
+    log_error "Could not probe missing mysql information. Aborting"
+fi
 
 # Condition: first member of the primary partition -> set as Active endpoint
 if [ $PARTITION = "Primary" -a $SIZE -ge 0 -a "$INDEX" = "0" ]; then
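
Note: the selector update performed by service_endpoint() is the REST
equivalent of the following kubectl invocation (a sketch with illustrative
names; the script drives the API through curl and python3 since jq, and
presumably kubectl, are not shipped in the container image). Setting
pod-name to the empty string keeps the selector in place while matching
no pod, which is what blocks all traffic when no failover endpoint is
available:

    kubectl patch service "$SERVICE" --type merge \
        -p '{"spec":{"selector":{"statefulset.kubernetes.io/pod-name":"galera-0"}}}'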