From 8b5705fd2585599dfe53a42a75fcef4af1ae7db8 Mon Sep 17 00:00:00 2001
From: Luca Miccini <lmiccini@redhat.com>
Date: Thu, 5 Dec 2024 09:09:50 +0100
Subject: [PATCH] Configure keystonemiddleware/oslo to deal with memcached pod
 failures

Whenever one of the memcached pods disappears, because of a rolling
restart during a minor update or as a result of a failure, APIs can
take a long time to detect that the pod went away and keep trying
to reconnect.

From a quick round of tests we saw downtimes of up to ~150s.

By tuning memcache_pool_dead_retry and memcache_pool_conn_get_timeout
the behavior seems much more acceptable.

Since neutron also uses memcached directly, we also need to tweak
the [cache] section, enabling the retry mechanism in the client and
applying similar defaults.

Jira: https://issues.redhat.com/browse/OSPRH-11935
---
 templates/neutronapi/config/01-neutron.conf | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/templates/neutronapi/config/01-neutron.conf b/templates/neutronapi/config/01-neutron.conf
index 6d3407b1..8f498c58 100644
--- a/templates/neutronapi/config/01-neutron.conf
+++ b/templates/neutronapi/config/01-neutron.conf
@@ -54,6 +54,8 @@ ovn_sb_ca_cert = /etc/pki/tls/certs/ovndbca.crt
 www_authenticate_uri = {{ .KeystonePublicURL }}
 auth_url = {{ .KeystoneInternalURL }}
 memcached_servers={{ .MemcachedServersWithInet }}
+memcache_pool_dead_retry = 10
+memcache_pool_conn_get_timeout = 2
 auth_type = password
 project_domain_name = Default
 user_domain_name = Default
@@ -91,9 +93,13 @@ lock_path = /var/lib/neutron/tmp
 {{if .MemcachedTLS}}
 backend = dogpile.cache.pymemcache
 memcache_servers = {{ .MemcachedServers }}
+enable_retry_client = true
+retry_attempts = 2
+retry_delay = 0
 {{else}}
 backend = dogpile.cache.memcached
 memcache_servers = {{ .MemcachedServersWithInet }}
+memcache_dead_retry = 10
 {{end}}
 enabled=true
 tls_enabled={{ .MemcachedTLS }}