Skip to content

Commit

Permalink
add configurable final-sleep time for basic lifecycler (#5517)
Browse files Browse the repository at this point in the history
  • Loading branch information
wenxu1024 authored Aug 29, 2023
1 parent b91a24d commit 7a76e51
Show file tree
Hide file tree
Showing 10 changed files with 43 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
* [ENHANCEMENT] DDBKV: Change metric name from dynamodb_kv_read_capacity_total to dynamodb_kv_consumed_capacity_total and include Delete, Put, Batch dimension. #5481
* [ENHANCEMENT] Compactor: allow unregisteronshutdown to be configurable. #5503
* [ENHANCEMENT] Store Gateway: add metric `cortex_bucket_store_chunk_refetches_total` for number of chunk refetches. #5532
* [ENHANCEMENT] BasicLifecycler: allow final-sleep during shutdown #5517
* [BUGFIX] Ruler: Validate if rule group can be safely converted back to rule group yaml from protobuf message #5265
* [BUGFIX] Querier: Convert gRPC `ResourceExhausted` status code from store gateway to 422 limit error. #5286
* [BUGFIX] Alertmanager: Route web-ui requests to the alertmanager distributor when sharding is enabled. #5293
Expand Down
5 changes: 5 additions & 0 deletions docs/blocks-storage/store-gateway.md
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,11 @@ store_gateway:
# CLI flag: -store-gateway.sharding-ring.wait-stability-max-duration
[wait_stability_max_duration: <duration> | default = 5m]

# The sleep seconds when store-gateway is shutting down. Need to be close to
# or larger than KV Store information propagation delay
# CLI flag: -store-gateway.sharding-ring.final-sleep
[final_sleep: <duration> | default = 0s]

# Name of network interface to read address from.
# CLI flag: -store-gateway.sharding-ring.instance-interface-names
[instance_interface_names: <list of string> | default = [eth0 en0]]
Expand Down
15 changes: 15 additions & 0 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,11 @@ sharding_ring:
# CLI flag: -alertmanager.sharding-ring.zone-awareness-enabled
[zone_awareness_enabled: <boolean> | default = false]
# The sleep seconds when alertmanager is shutting down. Need to be close to or
# larger than KV Store information propagation delay
# CLI flag: -alertmanager.sharding-ring.final-sleep
[final_sleep: <duration> | default = 0s]
# Name of network interface to read address from.
# CLI flag: -alertmanager.sharding-ring.instance-interface-names
[instance_interface_names: <list of string> | default = [eth0 en0]]
Expand Down Expand Up @@ -3945,6 +3950,11 @@ ring:
# CLI flag: -ruler.ring.num-tokens
[num_tokens: <int> | default = 128]
# The sleep seconds when ruler is shutting down. Need to be close to or larger
# than KV Store information propagation delay
# CLI flag: -ruler.ring.final-sleep
[final_sleep: <duration> | default = 0s]
# Period with which to attempt to flush rule groups.
# CLI flag: -ruler.flush-period
[flush_period: <duration> | default = 1m]
Expand Down Expand Up @@ -4836,6 +4846,11 @@ sharding_ring:
# CLI flag: -store-gateway.sharding-ring.wait-stability-max-duration
[wait_stability_max_duration: <duration> | default = 5m]
# The sleep seconds when store-gateway is shutting down. Need to be close to
# or larger than KV Store information propagation delay
# CLI flag: -store-gateway.sharding-ring.final-sleep
[final_sleep: <duration> | default = 0s]
# Name of network interface to read address from.
# CLI flag: -store-gateway.sharding-ring.instance-interface-names
[instance_interface_names: <list of string> | default = [eth0 en0]]
Expand Down
4 changes: 4 additions & 0 deletions docs/configuration/v1-guarantees.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,7 @@ Currently experimental features are:
- Store Gateway Zone Stable Shuffle Sharding
- `-store-gateway.sharding-ring.zone-stable-shuffle-sharding` CLI flag
- `zone_stable_shuffle_sharding` (boolean) field in config file
- Basic Lifecycler (Store Gateway, Alertmanager, Ruler) final sleep on shutdown, which makes the instance wait before shutting down, allowing time for ring changes to propagate.
- `-ruler.ring.final-sleep` (duration) CLI flag
- `-store-gateway.sharding-ring.final-sleep` (duration) CLI flag
- `-alertmanager.sharding-ring.final-sleep` (duration) CLI flag
4 changes: 4 additions & 0 deletions pkg/alertmanager/alertmanager_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ type RingConfig struct {
ReplicationFactor int `yaml:"replication_factor"`
ZoneAwarenessEnabled bool `yaml:"zone_awareness_enabled"`

FinalSleep time.Duration `yaml:"final_sleep"`

// Instance details
InstanceID string `yaml:"instance_id" doc:"hidden"`
InstanceInterfaceNames []string `yaml:"instance_interface_names"`
Expand Down Expand Up @@ -79,6 +81,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
cfg.KVStore.RegisterFlagsWithPrefix(rfprefix, "alertmanagers/", f)
f.DurationVar(&cfg.HeartbeatPeriod, rfprefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.")
f.DurationVar(&cfg.HeartbeatTimeout, rfprefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which alertmanagers are considered unhealthy within the ring. 0 = never (timeout disabled).")
f.DurationVar(&cfg.FinalSleep, rfprefix+"final-sleep", 0*time.Second, "The sleep seconds when alertmanager is shutting down. Need to be close to or larger than KV Store information propagation delay")
f.IntVar(&cfg.ReplicationFactor, rfprefix+"replication-factor", 3, "The replication factor to use when sharding the alertmanager.")
f.BoolVar(&cfg.ZoneAwarenessEnabled, rfprefix+"zone-awareness-enabled", false, "True to enable zone-awareness and replicate alerts across different availability zones.")

Expand Down Expand Up @@ -110,6 +113,7 @@ func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecycl
TokensObservePeriod: 0,
Zone: cfg.InstanceZone,
NumTokens: RingNumTokens,
FinalSleep: cfg.FinalSleep,
}, nil
}

Expand Down
1 change: 1 addition & 0 deletions pkg/alertmanager/multitenant_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ func mockAlertmanagerConfig(t *testing.T) *MultitenantAlertmanagerConfig {
cfg.ShardingRing.InstanceID = "test"
cfg.ShardingRing.InstanceAddr = "127.0.0.1"
cfg.PollInterval = time.Minute
cfg.ShardingRing.FinalSleep = 0

return cfg
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/ring/basic_lifecycler.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ type BasicLifecyclerConfig struct {
// If true lifecycler doesn't unregister instance from the ring when it's stopping. Default value is false,
// which means unregistering.
KeepInstanceInTheRingOnShutdown bool

FinalSleep time.Duration
}

// BasicLifecycler is a basic ring lifecycler which allows to hook custom
Expand Down Expand Up @@ -251,6 +253,7 @@ heartbeatLoop:
level.Info(l.logger).Log("msg", "instance removed from the ring", "ring", l.ringName)
}

time.Sleep(l.cfg.FinalSleep)
return nil
}

Expand Down
4 changes: 4 additions & 0 deletions pkg/ruler/ruler_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ type RingConfig struct {
InstanceAddr string `yaml:"instance_addr" doc:"hidden"`
NumTokens int `yaml:"num_tokens"`

FinalSleep time.Duration `yaml:"final_sleep"`

// Injected internally
ListenPort int `yaml:"-"`

Expand All @@ -60,6 +62,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
cfg.KVStore.RegisterFlagsWithPrefix("ruler.ring.", "rulers/", f)
f.DurationVar(&cfg.HeartbeatPeriod, "ruler.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.")
f.DurationVar(&cfg.HeartbeatTimeout, "ruler.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which rulers are considered unhealthy within the ring. 0 = never (timeout disabled).")
f.DurationVar(&cfg.FinalSleep, "ruler.ring.final-sleep", 0*time.Second, "The sleep seconds when ruler is shutting down. Need to be close to or larger than KV Store information propagation delay")

// Instance flags
cfg.InstanceInterfaceNames = []string{"eth0", "en0"}
Expand All @@ -86,6 +89,7 @@ func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecycl
HeartbeatPeriod: cfg.HeartbeatPeriod,
TokensObservePeriod: 0,
NumTokens: cfg.NumTokens,
FinalSleep: cfg.FinalSleep,
}, nil
}

Expand Down
1 change: 1 addition & 0 deletions pkg/ruler/ruler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ func defaultRulerConfig(t testing.TB) Config {
cfg.Ring.ListenPort = 0
cfg.Ring.InstanceAddr = "localhost"
cfg.Ring.InstanceID = "localhost"
cfg.Ring.FinalSleep = 0
cfg.EnableQueryStats = false

return cfg
Expand Down
5 changes: 5 additions & 0 deletions pkg/storegateway/gateway_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ type RingConfig struct {
WaitStabilityMinDuration time.Duration `yaml:"wait_stability_min_duration"`
WaitStabilityMaxDuration time.Duration `yaml:"wait_stability_max_duration"`

FinalSleep time.Duration `yaml:"final_sleep"`

// Instance details
InstanceID string `yaml:"instance_id" doc:"hidden"`
InstanceInterfaceNames []string `yaml:"instance_interface_names"`
Expand Down Expand Up @@ -109,6 +111,8 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
f.DurationVar(&cfg.WaitStabilityMinDuration, ringFlagsPrefix+"wait-stability-min-duration", time.Minute, "Minimum time to wait for ring stability at startup. 0 to disable.")
f.DurationVar(&cfg.WaitStabilityMaxDuration, ringFlagsPrefix+"wait-stability-max-duration", 5*time.Minute, "Maximum time to wait for ring stability at startup. If the store-gateway ring keeps changing after this period of time, the store-gateway will start anyway.")

f.DurationVar(&cfg.FinalSleep, ringFlagsPrefix+"final-sleep", 0*time.Second, "The sleep seconds when store-gateway is shutting down. Need to be close to or larger than KV Store information propagation delay")

// Instance flags
cfg.InstanceInterfaceNames = []string{"eth0", "en0"}
f.Var((*flagext.StringSlice)(&cfg.InstanceInterfaceNames), ringFlagsPrefix+"instance-interface-names", "Name of network interface to read address from.")
Expand Down Expand Up @@ -150,5 +154,6 @@ func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecycl
TokensObservePeriod: 0,
NumTokens: RingNumTokens,
KeepInstanceInTheRingOnShutdown: cfg.KeepInstanceInTheRingOnShutdown,
FinalSleep: cfg.FinalSleep,
}, nil
}

0 comments on commit 7a76e51

Please sign in to comment.