Fix flaky memberlist tests

Lots of tests fail due to the probe interval being 5s, which is the same time we usually poll for This should fix these issues: - #389 - #220
grafana · Oct 8, 2024 · de540a5 · de540a5
1 parent 53283a0
commit de540a5
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 6 deletions.
diff --git a/.github/workflows/find-flaky-tests.yml b/.github/workflows/find-flaky-tests.yml
@@ -8,6 +8,7 @@ jobs:
   find-flaky-tests:
     # Only run this workflow when the comment contains '/find-flaky-tests' or it's manually triggered
     if: contains(github.event.comment.body, '/find-flaky-tests') || github.event_name == 'workflow_dispatch'
+    fail-fast: false
     strategy:
       matrix:
         runs: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] # Run the build workflow 20 times

diff --git a/kv/memberlist/memberlist_client.go b/kv/memberlist/memberlist_client.go
@@ -137,6 +137,8 @@ type KVConfig struct {
 	GossipToTheDeadTime time.Duration `yaml:"gossip_to_dead_nodes_time" category:"advanced"`
 	DeadNodeReclaimTime time.Duration `yaml:"dead_node_reclaim_time" category:"advanced"`
 	EnableCompression   bool          `yaml:"compression_enabled" category:"advanced"`
+	ProbeInterval       time.Duration `yaml:"probe_interval" category:"advanced"` // Probe a random node every this interval. This setting is also the total timeout for the direct + indirect probes.
+	ProbeTimeout        time.Duration `yaml:"probe_timeout" category:"advanced"`  // Timeout for the direct probe.
 
 	// ip:port to advertise other cluster members. Used for NAT traversal
 	AdvertiseAddr string `yaml:"advertise_addr"`
@@ -201,6 +203,12 @@ func (cfg *KVConfig) RegisterFlagsWithPrefix(f *flag.FlagSet, prefix string) {
 	f.BoolVar(&cfg.ClusterLabelVerificationDisabled, prefix+"memberlist.cluster-label-verification-disabled", mlDefaults.SkipInboundLabelCheck, "When true, memberlist doesn't verify that inbound packets and gossip streams have the cluster label matching the configured one. This verification should be disabled while rolling out the change to the configured cluster label in a live memberlist cluster.")
 	f.DurationVar(&cfg.BroadcastTimeoutForLocalUpdatesOnShutdown, prefix+"memberlist.broadcast-timeout-for-local-updates-on-shutdown", 10*time.Second, "Timeout for broadcasting all remaining locally-generated updates to other nodes when shutting down. Only used if there are nodes left in the memberlist cluster, and only applies to locally-generated updates, not to broadcast messages that are result of incoming gossip updates. 0 = no timeout, wait until all locally-generated updates are sent.")
 
+	// For our use cases, we don't need a very fast detection of dead nodes. Since we use a TCP transport
+	// and we open a new TCP connection for each packet, we prefer to reduce the probe frequency and increase
+	// the timeout compared to defaults.
+	f.DurationVar(&cfg.ProbeInterval, prefix+"memberlist.probe-interval", 5*time.Second, "Probe a random node every this interval. This setting is also the total timeout for the direct + indirect probes.")
+	f.DurationVar(&cfg.ProbeTimeout, prefix+"memberlist.probe-timeout", 2*time.Second, "Timeout for the direct probe.")
+
 	cfg.TCPTransport.RegisterFlagsWithPrefix(f, prefix)
 }
 
@@ -404,6 +412,8 @@ func (m *KV) buildMemberlistConfig() (*memberlist.Config, error) {
 	mlCfg.GossipToTheDeadTime = m.cfg.GossipToTheDeadTime
 	mlCfg.DeadNodeReclaimTime = m.cfg.DeadNodeReclaimTime
 	mlCfg.EnableCompression = m.cfg.EnableCompression
+	mlCfg.ProbeInterval = m.cfg.ProbeInterval
+	mlCfg.ProbeTimeout = m.cfg.ProbeTimeout
 
 	mlCfg.AdvertiseAddr = m.cfg.AdvertiseAddr
 	mlCfg.AdvertisePort = m.cfg.AdvertisePort
@@ -425,12 +435,6 @@ func (m *KV) buildMemberlistConfig() (*memberlist.Config, error) {
 	// As we don't use UDP for sending packets, we can use higher value here.
 	mlCfg.UDPBufferSize = 10 * 1024 * 1024
 
-	// For our use cases, we don't need a very fast detection of dead nodes. Since we use a TCP transport
-	// and we open a new TCP connection for each packet, we prefer to reduce the probe frequency and increase
-	// the timeout compared to defaults.
-	mlCfg.ProbeInterval = 5 * time.Second // Probe a random node every this interval. This setting is also the total timeout for the direct + indirect probes.
-	mlCfg.ProbeTimeout = 2 * time.Second  // Timeout for the direct probe.
-
 	// Since we use a custom transport based on TCP, having TCP-based fallbacks doesn't give us any benefit.
 	// On the contrary, if we keep TCP pings enabled, each node will effectively run 2x pings against a dead
 	// node, because the TCP-based fallback will always trigger.

diff --git a/kv/memberlist/memberlist_client_test.go b/kv/memberlist/memberlist_client_test.go
@@ -566,6 +566,7 @@ func defaultKVConfig(i int) KVConfig {
 	cfg.GossipInterval = 100 * time.Millisecond
 	cfg.GossipNodes = 10
 	cfg.PushPullInterval = 5 * time.Second
+	cfg.ProbeInterval = 1 * time.Second // Default is 5s, but we want to speed up tests.
 
 	cfg.TCPTransport = TCPTransportConfig{
 		BindAddrs: getLocalhostAddrs(),