Skip to content

Commit

Permalink
Fix flaky memberlist tests
Browse files Browse the repository at this point in the history
Lots of tests fail due to the probe interval being 5s, which is the same time we usually poll for
This should fix these issues:
- #389
- #220
  • Loading branch information
julienduchesne committed Oct 8, 2024
1 parent 53283a0 commit de540a5
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 6 deletions.
1 change: 1 addition & 0 deletions .github/workflows/find-flaky-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ jobs:
find-flaky-tests:
# Only run this workflow when the comment contains '/find-flaky-tests' or it's manually triggered
if: contains(github.event.comment.body, '/find-flaky-tests') || github.event_name == 'workflow_dispatch'
fail-fast: false
strategy:
matrix:
runs: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] # Run the build workflow 20 times
Expand Down
16 changes: 10 additions & 6 deletions kv/memberlist/memberlist_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ type KVConfig struct {
GossipToTheDeadTime time.Duration `yaml:"gossip_to_dead_nodes_time" category:"advanced"`
DeadNodeReclaimTime time.Duration `yaml:"dead_node_reclaim_time" category:"advanced"`
EnableCompression bool `yaml:"compression_enabled" category:"advanced"`
ProbeInterval time.Duration `yaml:"probe_interval" category:"advanced"` // Probe a random node every this interval. This setting is also the total timeout for the direct + indirect probes.
ProbeTimeout time.Duration `yaml:"probe_timeout" category:"advanced"` // Timeout for the direct probe.

// ip:port to advertise other cluster members. Used for NAT traversal
AdvertiseAddr string `yaml:"advertise_addr"`
Expand Down Expand Up @@ -201,6 +203,12 @@ func (cfg *KVConfig) RegisterFlagsWithPrefix(f *flag.FlagSet, prefix string) {
f.BoolVar(&cfg.ClusterLabelVerificationDisabled, prefix+"memberlist.cluster-label-verification-disabled", mlDefaults.SkipInboundLabelCheck, "When true, memberlist doesn't verify that inbound packets and gossip streams have the cluster label matching the configured one. This verification should be disabled while rolling out the change to the configured cluster label in a live memberlist cluster.")
f.DurationVar(&cfg.BroadcastTimeoutForLocalUpdatesOnShutdown, prefix+"memberlist.broadcast-timeout-for-local-updates-on-shutdown", 10*time.Second, "Timeout for broadcasting all remaining locally-generated updates to other nodes when shutting down. Only used if there are nodes left in the memberlist cluster, and only applies to locally-generated updates, not to broadcast messages that are result of incoming gossip updates. 0 = no timeout, wait until all locally-generated updates are sent.")

// For our use cases, we don't need a very fast detection of dead nodes. Since we use a TCP transport
// and we open a new TCP connection for each packet, we prefer to reduce the probe frequency and increase
// the timeout compared to defaults.
f.DurationVar(&cfg.ProbeInterval, prefix+"memberlist.probe-interval", 5*time.Second, "Probe a random node every this interval. This setting is also the total timeout for the direct + indirect probes.")
f.DurationVar(&cfg.ProbeTimeout, prefix+"memberlist.probe-timeout", 2*time.Second, "Timeout for the direct probe.")

cfg.TCPTransport.RegisterFlagsWithPrefix(f, prefix)
}

Expand Down Expand Up @@ -404,6 +412,8 @@ func (m *KV) buildMemberlistConfig() (*memberlist.Config, error) {
mlCfg.GossipToTheDeadTime = m.cfg.GossipToTheDeadTime
mlCfg.DeadNodeReclaimTime = m.cfg.DeadNodeReclaimTime
mlCfg.EnableCompression = m.cfg.EnableCompression
mlCfg.ProbeInterval = m.cfg.ProbeInterval
mlCfg.ProbeTimeout = m.cfg.ProbeTimeout

mlCfg.AdvertiseAddr = m.cfg.AdvertiseAddr
mlCfg.AdvertisePort = m.cfg.AdvertisePort
Expand All @@ -425,12 +435,6 @@ func (m *KV) buildMemberlistConfig() (*memberlist.Config, error) {
// As we don't use UDP for sending packets, we can use higher value here.
mlCfg.UDPBufferSize = 10 * 1024 * 1024

// For our use cases, we don't need a very fast detection of dead nodes. Since we use a TCP transport
// and we open a new TCP connection for each packet, we prefer to reduce the probe frequency and increase
// the timeout compared to defaults.
mlCfg.ProbeInterval = 5 * time.Second // Probe a random node every this interval. This setting is also the total timeout for the direct + indirect probes.
mlCfg.ProbeTimeout = 2 * time.Second // Timeout for the direct probe.

// Since we use a custom transport based on TCP, having TCP-based fallbacks doesn't give us any benefit.
// On the contrary, if we keep TCP pings enabled, each node will effectively run 2x pings against a dead
// node, because the TCP-based fallback will always trigger.
Expand Down
1 change: 1 addition & 0 deletions kv/memberlist/memberlist_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,7 @@ func defaultKVConfig(i int) KVConfig {
cfg.GossipInterval = 100 * time.Millisecond
cfg.GossipNodes = 10
cfg.PushPullInterval = 5 * time.Second
cfg.ProbeInterval = 1 * time.Second // Default is 5s, but we want to speed up tests.

cfg.TCPTransport = TCPTransportConfig{
BindAddrs: getLocalhostAddrs(),
Expand Down

0 comments on commit de540a5

Please sign in to comment.