From 19e1b0906b4b0afa64a2dd2baa38a0e69a3b4eeb Mon Sep 17 00:00:00 2001 From: Geoffroy Couprie Date: Fri, 28 Jul 2023 10:39:40 +0200 Subject: [PATCH 1/5] add a timeout to redis queries --- apollo-router/src/cache/redis.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/apollo-router/src/cache/redis.rs b/apollo-router/src/cache/redis.rs index db9fbf25dd..2caf54b36e 100644 --- a/apollo-router/src/cache/redis.rs +++ b/apollo-router/src/cache/redis.rs @@ -9,6 +9,7 @@ use fred::prelude::RedisError; use fred::prelude::RedisErrorKind; use fred::types::Expiration; use fred::types::FromRedis; +use fred::types::PerformanceConfig; use fred::types::ReconnectPolicy; use fred::types::RedisConfig; use url::Url; @@ -114,8 +115,11 @@ impl RedisCacheStorage { let client = RedisClient::new( config, - None, //perf policy - Some(ReconnectPolicy::new_exponential(10, 1, 2000, 10)), + Some(PerformanceConfig { + default_command_timeout_ms: 1, + ..Default::default() + }), + Some(ReconnectPolicy::new_exponential(10, 1, 200, 5)), ); let _handle = client.connect(); From ac072d184f0aeb2f4e9f9b172e95e6ba76daf35c Mon Sep 17 00:00:00 2001 From: Geoffroy Couprie Date: Fri, 28 Jul 2023 10:42:27 +0200 Subject: [PATCH 2/5] SubSelectionKey needs a custom serializer it is used as key in a HashMap, and in JSON keys must be serialized as strings --- apollo-router/src/spec/query/subselections.rs | 60 ++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/apollo-router/src/spec/query/subselections.rs b/apollo-router/src/spec/query/subselections.rs index cb5ae374e6..4bbb16ac45 100644 --- a/apollo-router/src/spec/query/subselections.rs +++ b/apollo-router/src/spec/query/subselections.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; +use serde::de::Visitor; use serde::Deserialize; use serde::Serialize; use serde_json_bytes::ByteString; @@ -14,12 +15,69 @@ use crate::spec::IncludeSkip; use crate::spec::SpecError; use crate::Configuration; -#[derive(Debug, Serialize, Deserialize, PartialEq, Hash, Eq)] +#[derive(Debug, PartialEq, Hash, Eq)] pub(crate) struct SubSelectionKey { pub(crate) defer_label: Option, pub(crate) defer_conditions: BooleanValues, } +// Do not replace this with a derived Serialize implementation +// SubSelectionKey must serialize to a string because it is used as a JSON object key +impl Serialize for SubSelectionKey { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let s = format!( + "{:?}|{}", + self.defer_conditions.bits, + self.defer_label.as_deref().unwrap_or("") + ); + serializer.serialize_str(&s) + } +} + +impl<'de> Deserialize<'de> for SubSelectionKey { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_str(SubSelectionKeyVisitor) + } +} + +struct SubSelectionKeyVisitor; +impl<'de> Visitor<'de> for SubSelectionKeyVisitor { + type Value = SubSelectionKey; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter + .write_str("a string containing the defer label and defer conditions separated by |") + } + + fn visit_str(self, s: &str) -> Result + where + E: serde::de::Error, + { + if let Some((bits_str, label)) = s.split_once('|') { + Ok(SubSelectionKey { + defer_conditions: BooleanValues { + bits: bits_str + .parse::() + .map_err(|_| E::custom("expected a number"))?, + }, + defer_label: if label.is_empty() { + None + } else { + Some(label.to_string()) + }, + }) + } else { + Err(E::custom("invalid subselection")) + } + } +} + #[derive(Debug, Serialize, Deserialize)] pub(crate) struct SubSelectionValue { pub(crate) selection_set: Vec, From 1048e2ff5cb037e36756abe1924d2537db24332f Mon Sep 17 00:00:00 2001 From: Geoffroy Couprie Date: Fri, 28 Jul 2023 11:22:15 +0200 Subject: [PATCH 3/5] do not set a maximum number of reconnection attempts with fred's reconnection policy, once the max attempts is reached, it will never try again. We actually want it to retry at some point, so we use the exponential delay retry, along with a timeout so that reconnecting does not block requests. The maximum delay for reconnection is 2 seconds, so the router will retry at this interval --- apollo-router/src/cache/redis.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apollo-router/src/cache/redis.rs b/apollo-router/src/cache/redis.rs index 2caf54b36e..1dae895287 100644 --- a/apollo-router/src/cache/redis.rs +++ b/apollo-router/src/cache/redis.rs @@ -119,7 +119,7 @@ impl RedisCacheStorage { default_command_timeout_ms: 1, ..Default::default() }), - Some(ReconnectPolicy::new_exponential(10, 1, 200, 5)), + Some(ReconnectPolicy::new_exponential(0, 1, 2000, 5)), ); let _handle = client.connect(); From 96e4fc9d3c16b27ab45eae1d95ebc79969eab735 Mon Sep 17 00:00:00 2001 From: Geoffroy Couprie Date: Fri, 28 Jul 2023 11:49:00 +0200 Subject: [PATCH 4/5] do not hold the in memory cache lock while waiting for redis if we are reconnecting, we might wait for a long time before we get an answer, and in the meantime the in memory cache could answer other requests --- apollo-router/src/cache/storage.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index f11c39429b..9996f1d23a 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -82,9 +82,10 @@ where } pub(crate) async fn get(&self, key: &K) -> Option { - let mut guard = self.inner.lock().await; let instant_memory = Instant::now(); - match guard.get(key) { + let res = self.inner.lock().await.get(key).cloned(); + + match res { Some(v) => { tracing::info!( monotonic_counter.apollo_router_cache_hit_count = 1u64, @@ -97,7 +98,7 @@ where kind = %self.caller, storage = &tracing::field::display(CacheStorageName::Memory), ); - Some(v.clone()) + Some(v) } None => { let duration = instant_memory.elapsed().as_secs_f64(); @@ -117,7 +118,8 @@ where let inner_key = RedisKey(key.clone()); match redis.get::(inner_key).await { Some(v) => { - guard.put(key.clone(), v.0.clone()); + self.inner.lock().await.put(key.clone(), v.0.clone()); + tracing::info!( monotonic_counter.apollo_router_cache_hit_count = 1u64, kind = %self.caller, From b128afb515fb7fcf575d7a400ea741b65a93447d Mon Sep 17 00:00:00 2001 From: Geoffroy Couprie Date: Fri, 28 Jul 2023 12:06:47 +0200 Subject: [PATCH 5/5] changeset --- .changesets/fix_geal_redis_fixes.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .changesets/fix_geal_redis_fixes.md diff --git a/.changesets/fix_geal_redis_fixes.md b/.changesets/fix_geal_redis_fixes.md new file mode 100644 index 0000000000..abb955e25c --- /dev/null +++ b/.changesets/fix_geal_redis_fixes.md @@ -0,0 +1,9 @@ +### Fix redis reconnections ([Issue #3045](https://github.com/apollographql/router/issues/3045)) + +The reconnection policy was using an exponential backoff delay with a maximum number of attempts. Once that maximum is reached, reconnection was never tried again (there's no baseline retry). We change that behaviour by adding infinite retries with a maximum delay of 2 seconds, and a timeout of 1 millisecond on redis commands, so that the router can continue serving requests in the meantime. + +This commit contains additional fixes: +- release the lock on the in memory cache while waiting for redis, to let the in memory cache serve other requests +- add a custom serializer for `SubSelectionKey`: this type is used as key in a `HashMap`, which is converted to a JSON object, and object keys must be strings, so a specific serializer is needed instead of the derived one + +By [@Geal](https://github.com/Geal) in https://github.com/apollographql/router/pull/3509 \ No newline at end of file