From 8b5705fd2585599dfe53a42a75fcef4af1ae7db8 Mon Sep 17 00:00:00 2001 From: Luca Miccini Date: Thu, 5 Dec 2024 09:09:50 +0100 Subject: [PATCH] Configure keystonemiddleware/oslo to deal with memcached pods failures Whenever one of the memcached pods disappears, because of a rolling restart during a minor update or as a result of a failure, APIs can take a long time to detect that the pod went away and keep trying to reconnect. From a quick round of tests we saw downtimes up to ~150s. By tuning memcache_pool_dead_retry and memcache_pool_conn_get_timeout the behavior seems much more acceptable. Since neutron also uses memcached directly we also need to tweak the [cache] section enabling the retry mechanism in the client and apply similar defaults. Jira: https://issues.redhat.com/browse/OSPRH-11935 --- templates/neutronapi/config/01-neutron.conf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/templates/neutronapi/config/01-neutron.conf b/templates/neutronapi/config/01-neutron.conf index 6d3407b1..8f498c58 100644 --- a/templates/neutronapi/config/01-neutron.conf +++ b/templates/neutronapi/config/01-neutron.conf @@ -54,6 +54,8 @@ ovn_sb_ca_cert = /etc/pki/tls/certs/ovndbca.crt www_authenticate_uri = {{ .KeystonePublicURL }} auth_url = {{ .KeystoneInternalURL }} memcached_servers={{ .MemcachedServersWithInet }} +memcache_pool_dead_retry = 10 +memcache_pool_conn_get_timeout = 2 auth_type = password project_domain_name = Default user_domain_name = Default @@ -91,9 +93,13 @@ lock_path = /var/lib/neutron/tmp {{if .MemcachedTLS}} backend = dogpile.cache.pymemcache memcache_servers = {{ .MemcachedServers }} +enable_retry_client = true +retry_attempts = 2 +retry_delay = 0 {{else}} backend = dogpile.cache.memcached memcache_servers = {{ .MemcachedServersWithInet }} +memcache_dead_retry = 10 {{end}} enabled=true tls_enabled={{ .MemcachedTLS }}