From 7a76e51e542d8fea30ec7889f7371d00fe3870d6 Mon Sep 17 00:00:00 2001 From: Wen Xu Date: Tue, 29 Aug 2023 18:46:23 -0400 Subject: [PATCH] add configurable final-sleep time for basic lifecycler (#5517) --- CHANGELOG.md | 1 + docs/blocks-storage/store-gateway.md | 5 +++++ docs/configuration/config-file-reference.md | 15 +++++++++++++++ docs/configuration/v1-guarantees.md | 4 ++++ pkg/alertmanager/alertmanager_ring.go | 4 ++++ pkg/alertmanager/multitenant_test.go | 1 + pkg/ring/basic_lifecycler.go | 3 +++ pkg/ruler/ruler_ring.go | 4 ++++ pkg/ruler/ruler_test.go | 1 + pkg/storegateway/gateway_ring.go | 5 +++++ 10 files changed, 43 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f3367ee659..9aa2b897b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,6 +51,7 @@ * [ENHANCEMENT] DDBKV: Change metric name from dynamodb_kv_read_capacity_total to dynamodb_kv_consumed_capacity_total and include Delete, Put, Batch dimension. #5481 * [ENHANCEMENT] Compactor: allow unregisteronshutdown to be configurable. #5503 * [ENHANCEMENT] Store Gateway: add metric `cortex_bucket_store_chunk_refetches_total` for number of chunk refetches. #5532 +* [ENHANCEMENT] BasicLifeCycler: allow final-sleep during shutdown #5517 * [BUGFIX] Ruler: Validate if rule group can be safely converted back to rule group yaml from protobuf message #5265 * [BUGFIX] Querier: Convert gRPC `ResourceExhausted` status code from store gateway to 422 limit error. #5286 * [BUGFIX] Alertmanager: Route web-ui requests to the alertmanager distributor when sharding is enabled. #5293 diff --git a/docs/blocks-storage/store-gateway.md b/docs/blocks-storage/store-gateway.md index 3a127edd10..891b5824cf 100644 --- a/docs/blocks-storage/store-gateway.md +++ b/docs/blocks-storage/store-gateway.md @@ -309,6 +309,11 @@ store_gateway: # CLI flag: -store-gateway.sharding-ring.wait-stability-max-duration [wait_stability_max_duration: | default = 5m] + # The sleep seconds when store-gateway is shutting down. 
Need to be close to + # or larger than KV Store information propagation delay + # CLI flag: -store-gateway.sharding-ring.final-sleep + [final_sleep: | default = 0s] + # Name of network interface to read address from. # CLI flag: -store-gateway.sharding-ring.instance-interface-names [instance_interface_names: | default = [eth0 en0]] diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index b80b6bc0de..076267f92b 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -369,6 +369,11 @@ sharding_ring: # CLI flag: -alertmanager.sharding-ring.zone-awareness-enabled [zone_awareness_enabled: | default = false] + # The sleep seconds when alertmanager is shutting down. Need to be close to or + # larger than KV Store information propagation delay + # CLI flag: -alertmanager.sharding-ring.final-sleep + [final_sleep: | default = 0s] + # Name of network interface to read address from. # CLI flag: -alertmanager.sharding-ring.instance-interface-names [instance_interface_names: | default = [eth0 en0]] @@ -3945,6 +3950,11 @@ ring: # CLI flag: -ruler.ring.num-tokens [num_tokens: | default = 128] + # The sleep seconds when ruler is shutting down. Need to be close to or larger + # than KV Store information propagation delay + # CLI flag: -ruler.ring.final-sleep + [final_sleep: | default = 0s] + # Period with which to attempt to flush rule groups. # CLI flag: -ruler.flush-period [flush_period: | default = 1m] @@ -4836,6 +4846,11 @@ sharding_ring: # CLI flag: -store-gateway.sharding-ring.wait-stability-max-duration [wait_stability_max_duration: | default = 5m] + # The sleep seconds when store-gateway is shutting down. Need to be close to + # or larger than KV Store information propagation delay + # CLI flag: -store-gateway.sharding-ring.final-sleep + [final_sleep: | default = 0s] + # Name of network interface to read address from. 
# CLI flag: -store-gateway.sharding-ring.instance-interface-names [instance_interface_names: | default = [eth0 en0]] diff --git a/docs/configuration/v1-guarantees.md b/docs/configuration/v1-guarantees.md index 1a125ac783..382cc5c502 100644 --- a/docs/configuration/v1-guarantees.md +++ b/docs/configuration/v1-guarantees.md @@ -108,3 +108,7 @@ Currently experimental features are: - Store Gateway Zone Stable Shuffle Sharding - `-store-gateway.sharding-ring.zone-stable-shuffle-sharding` CLI flag - `zone_stable_shuffle_sharding` (boolean) field in config file +- Basic Lifecycler (Storegateway, Alertmanager, Ruler) Final Sleep on shutdown, which tells the pod to wait before shutting down, allowing a delay to propagate ring changes. + - `-ruler.ring.final-sleep` (duration) CLI flag + - `-store-gateway.sharding-ring.final-sleep` (duration) CLI flag + - `-alertmanager.sharding-ring.final-sleep` (duration) CLI flag diff --git a/pkg/alertmanager/alertmanager_ring.go b/pkg/alertmanager/alertmanager_ring.go index 66532a3744..dc26f6a4db 100644 --- a/pkg/alertmanager/alertmanager_ring.go +++ b/pkg/alertmanager/alertmanager_ring.go @@ -49,6 +49,8 @@ type RingConfig struct { ReplicationFactor int `yaml:"replication_factor"` ZoneAwarenessEnabled bool `yaml:"zone_awareness_enabled"` + FinalSleep time.Duration `yaml:"final_sleep"` + // Instance details InstanceID string `yaml:"instance_id" doc:"hidden"` InstanceInterfaceNames []string `yaml:"instance_interface_names"` @@ -79,6 +81,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { cfg.KVStore.RegisterFlagsWithPrefix(rfprefix, "alertmanagers/", f) f.DurationVar(&cfg.HeartbeatPeriod, rfprefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.") f.DurationVar(&cfg.HeartbeatTimeout, rfprefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which alertmanagers are considered unhealthy within the ring. 
0 = never (timeout disabled).") + f.DurationVar(&cfg.FinalSleep, rfprefix+"final-sleep", 0*time.Second, "The sleep seconds when alertmanager is shutting down. Need to be close to or larger than KV Store information propagation delay") f.IntVar(&cfg.ReplicationFactor, rfprefix+"replication-factor", 3, "The replication factor to use when sharding the alertmanager.") f.BoolVar(&cfg.ZoneAwarenessEnabled, rfprefix+"zone-awareness-enabled", false, "True to enable zone-awareness and replicate alerts across different availability zones.") @@ -110,6 +113,7 @@ func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecycl TokensObservePeriod: 0, Zone: cfg.InstanceZone, NumTokens: RingNumTokens, + FinalSleep: cfg.FinalSleep, }, nil } diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index 4f6680ba26..7c17b80838 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -82,6 +82,7 @@ func mockAlertmanagerConfig(t *testing.T) *MultitenantAlertmanagerConfig { cfg.ShardingRing.InstanceID = "test" cfg.ShardingRing.InstanceAddr = "127.0.0.1" cfg.PollInterval = time.Minute + cfg.ShardingRing.FinalSleep = 0 return cfg } diff --git a/pkg/ring/basic_lifecycler.go b/pkg/ring/basic_lifecycler.go index 41910be9a2..19b3e9cb40 100644 --- a/pkg/ring/basic_lifecycler.go +++ b/pkg/ring/basic_lifecycler.go @@ -55,6 +55,8 @@ type BasicLifecyclerConfig struct { // If true lifecycler doesn't unregister instance from the ring when it's stopping. Default value is false, // which means unregistering. 
KeepInstanceInTheRingOnShutdown bool + + FinalSleep time.Duration } // BasicLifecycler is a basic ring lifecycler which allows to hook custom @@ -251,6 +253,7 @@ heartbeatLoop: level.Info(l.logger).Log("msg", "instance removed from the ring", "ring", l.ringName) } + time.Sleep(l.cfg.FinalSleep) return nil } diff --git a/pkg/ruler/ruler_ring.go b/pkg/ruler/ruler_ring.go index 9017a8429b..1f5422f060 100644 --- a/pkg/ruler/ruler_ring.go +++ b/pkg/ruler/ruler_ring.go @@ -42,6 +42,8 @@ type RingConfig struct { InstanceAddr string `yaml:"instance_addr" doc:"hidden"` NumTokens int `yaml:"num_tokens"` + FinalSleep time.Duration `yaml:"final_sleep"` + // Injected internally ListenPort int `yaml:"-"` @@ -60,6 +62,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { cfg.KVStore.RegisterFlagsWithPrefix("ruler.ring.", "rulers/", f) f.DurationVar(&cfg.HeartbeatPeriod, "ruler.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.") f.DurationVar(&cfg.HeartbeatTimeout, "ruler.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which rulers are considered unhealthy within the ring. 0 = never (timeout disabled).") + f.DurationVar(&cfg.FinalSleep, "ruler.ring.final-sleep", 0*time.Second, "The sleep seconds when ruler is shutting down. 
Need to be close to or larger than KV Store information propagation delay") // Instance flags cfg.InstanceInterfaceNames = []string{"eth0", "en0"} @@ -86,6 +89,7 @@ func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecycl HeartbeatPeriod: cfg.HeartbeatPeriod, TokensObservePeriod: 0, NumTokens: cfg.NumTokens, + FinalSleep: cfg.FinalSleep, }, nil } diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 295b326ff5..c8034ee62f 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -74,6 +74,7 @@ func defaultRulerConfig(t testing.TB) Config { cfg.Ring.ListenPort = 0 cfg.Ring.InstanceAddr = "localhost" cfg.Ring.InstanceID = "localhost" + cfg.Ring.FinalSleep = 0 cfg.EnableQueryStats = false return cfg diff --git a/pkg/storegateway/gateway_ring.go b/pkg/storegateway/gateway_ring.go index 987be83863..06d2836835 100644 --- a/pkg/storegateway/gateway_ring.go +++ b/pkg/storegateway/gateway_ring.go @@ -73,6 +73,8 @@ type RingConfig struct { WaitStabilityMinDuration time.Duration `yaml:"wait_stability_min_duration"` WaitStabilityMaxDuration time.Duration `yaml:"wait_stability_max_duration"` + FinalSleep time.Duration `yaml:"final_sleep"` + // Instance details InstanceID string `yaml:"instance_id" doc:"hidden"` InstanceInterfaceNames []string `yaml:"instance_interface_names"` @@ -109,6 +111,8 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { f.DurationVar(&cfg.WaitStabilityMinDuration, ringFlagsPrefix+"wait-stability-min-duration", time.Minute, "Minimum time to wait for ring stability at startup. 0 to disable.") f.DurationVar(&cfg.WaitStabilityMaxDuration, ringFlagsPrefix+"wait-stability-max-duration", 5*time.Minute, "Maximum time to wait for ring stability at startup. If the store-gateway ring keeps changing after this period of time, the store-gateway will start anyway.") + f.DurationVar(&cfg.FinalSleep, ringFlagsPrefix+"final-sleep", 0*time.Second, "The sleep seconds when store-gateway is shutting down. 
Need to be close to or larger than KV Store information propagation delay") + // Instance flags cfg.InstanceInterfaceNames = []string{"eth0", "en0"} f.Var((*flagext.StringSlice)(&cfg.InstanceInterfaceNames), ringFlagsPrefix+"instance-interface-names", "Name of network interface to read address from.") @@ -150,5 +154,6 @@ func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecycl TokensObservePeriod: 0, NumTokens: RingNumTokens, KeepInstanceInTheRingOnShutdown: cfg.KeepInstanceInTheRingOnShutdown, + FinalSleep: cfg.FinalSleep, }, nil }