Skip to content

Commit

Permalink
core: server config for plan rejection tracking
Browse files Browse the repository at this point in the history
  • Loading branch information
lgfa29 committed Jun 18, 2022
1 parent 0881131 commit febcf30
Show file tree
Hide file tree
Showing 11 changed files with 190 additions and 2 deletions.
11 changes: 11 additions & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,17 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
return nil, fmt.Errorf("deploy_query_rate_limit must be greater than 0")
}

// Set plan rejection tracker configuration.
if planRejectConf := agentConfig.Server.PlanRejectionTracker; planRejectConf != nil {
conf.NodePlanRejectionThreshold = planRejectConf.NodeThreshold

if planRejectConf.NodeWindow == 0 {
return nil, fmt.Errorf("plan_rejection_tracker.node_window must be greater than 0")
} else {
conf.NodePlanRejectionWindow = planRejectConf.NodeWindow
}
}

// Add Enterprise license configs
conf.LicenseEnv = agentConfig.Server.LicenseEnv
conf.LicensePath = agentConfig.Server.LicensePath
Expand Down
68 changes: 68 additions & 0 deletions command/agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,74 @@ func TestAgent_ServerConfig_Limits_OK(t *testing.T) {
}
}

func TestAgent_ServerConfig_PlanRejectionTracker(t *testing.T) {
ci.Parallel(t)

cases := []struct {
name string
trackerConfig *PlanRejectionTracker
expectedConfig *PlanRejectionTracker
expectedErr string
}{
{
name: "default",
trackerConfig: nil,
expectedConfig: &PlanRejectionTracker{
NodeThreshold: 15,
NodeWindow: 10 * time.Minute,
},
expectedErr: "",
},
{
name: "valid config",
trackerConfig: &PlanRejectionTracker{
NodeThreshold: 123,
NodeWindow: 17 * time.Minute,
},
expectedConfig: &PlanRejectionTracker{
NodeThreshold: 123,
NodeWindow: 17 * time.Minute,
},
expectedErr: "",
},
{
name: "invalid node window",
trackerConfig: &PlanRejectionTracker{
NodeThreshold: 123,
},
expectedErr: "node_window must be greater than 0",
},
}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
config := DevConfig(nil)
require.NoError(t, config.normalizeAddrs())

if tc.trackerConfig != nil {
config.Server.PlanRejectionTracker = tc.trackerConfig
}

serverConfig, err := convertServerConfig(config)

if tc.expectedErr != "" {
require.Error(t, err)
require.Contains(t, err.Error(), tc.expectedErr)
} else {
require.NoError(t, err)
require.Equal(t,
tc.expectedConfig.NodeThreshold,
serverConfig.NodePlanRejectionThreshold,
)
require.Equal(t,
tc.expectedConfig.NodeWindow,
serverConfig.NodePlanRejectionWindow,
)
}
})
}
}

func TestAgent_ServerConfig_RaftMultiplier_Ok(t *testing.T) {
ci.Parallel(t)

Expand Down
52 changes: 52 additions & 0 deletions command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,10 @@ type ServerConfig struct {
// This value is ignored.
DefaultSchedulerConfig *structs.SchedulerConfiguration `hcl:"default_scheduler_config"`

// PlanRejectionTracker configures the node plan rejection tracker that
// detects potentially bad nodes.
PlanRejectionTracker *PlanRejectionTracker `hcl:"plan_rejection_tracker"`

// EnableEventBroker configures whether this server's state store
// will generate events for its event stream.
EnableEventBroker *bool `hcl:"enable_event_broker"`
Expand Down Expand Up @@ -548,6 +552,46 @@ type RaftBoltConfig struct {
NoFreelistSync bool `hcl:"no_freelist_sync"`
}

// PlanRejectionTracker is used in servers to configure the plan rejection
// tracker.
type PlanRejectionTracker struct {
// NodeThreshold is the number of times a node can have plan rejections
// before it is marked as ineligible.
NodeThreshold int `hcl:"node_threshold"`

// NodeWindow is the time window used to track active plan rejections for
// nodes.
NodeWindow time.Duration
NodeWindowHCL string `hcl:"node_window" json:"-"`

// ExtraKeysHCL is used by hcl to surface unexpected keys
ExtraKeysHCL []string `hcl:",unusedKeys" json:"-"`
}

func (p *PlanRejectionTracker) Merge(b *PlanRejectionTracker) *PlanRejectionTracker {
if p == nil {
return b
}

result := *p

if b == nil {
return &result
}

if b.NodeThreshold != 0 {
result.NodeThreshold = b.NodeThreshold
}

if b.NodeWindow != 0 {
result.NodeWindow = b.NodeWindow
}
if b.NodeWindowHCL != "" {
result.NodeWindowHCL = b.NodeWindowHCL
}
return &result
}

// Search is used in servers to configure search API options.
type Search struct {
// FuzzyEnabled toggles whether the FuzzySearch API is enabled. If not
Expand Down Expand Up @@ -985,6 +1029,10 @@ func DefaultConfig() *Config {
EventBufferSize: helper.IntToPtr(100),
RaftProtocol: 3,
StartJoin: []string{},
PlanRejectionTracker: &PlanRejectionTracker{
NodeThreshold: 15,
NodeWindow: 10 * time.Minute,
},
ServerJoin: &ServerJoin{
RetryJoin: []string{},
RetryInterval: 30 * time.Second,
Expand Down Expand Up @@ -1586,6 +1634,10 @@ func (s *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
result.EventBufferSize = b.EventBufferSize
}

if b.PlanRejectionTracker != nil {
result.PlanRejectionTracker = result.PlanRejectionTracker.Merge(b.PlanRejectionTracker)
}

if b.DefaultSchedulerConfig != nil {
c := *b.DefaultSchedulerConfig
result.DefaultSchedulerConfig = &c
Expand Down
8 changes: 6 additions & 2 deletions command/agent/config_parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,12 @@ func ParseConfigFile(path string) (*Config, error) {
VaultRetry: &client.RetryConfig{},
},
},
Server: &ServerConfig{
PlanRejectionTracker: &PlanRejectionTracker{},
ServerJoin: &ServerJoin{},
},
ACL: &ACLConfig{},
Audit: &config.AuditConfig{},
Server: &ServerConfig{ServerJoin: &ServerJoin{}},
Consul: &config.ConsulConfig{},
Autopilot: &config.AutopilotConfig{},
Telemetry: &Telemetry{},
Expand All @@ -54,7 +57,7 @@ func ParseConfigFile(path string) (*Config, error) {

err = hcl.Decode(c, buf.String())
if err != nil {
return nil, err
return nil, fmt.Errorf("failed to decode HCL file %s: %v", path, err)
}

// convert strings to time.Durations
Expand All @@ -66,6 +69,7 @@ func ParseConfigFile(path string) (*Config, error) {
{"server.heartbeat_grace", &c.Server.HeartbeatGrace, &c.Server.HeartbeatGraceHCL, nil},
{"server.min_heartbeat_ttl", &c.Server.MinHeartbeatTTL, &c.Server.MinHeartbeatTTLHCL, nil},
{"server.failover_heartbeat_ttl", &c.Server.FailoverHeartbeatTTL, &c.Server.FailoverHeartbeatTTLHCL, nil},
{"server.plan_rejection_tracker.node_window", &c.Server.PlanRejectionTracker.NodeWindow, &c.Server.PlanRejectionTracker.NodeWindowHCL, nil},
{"server.retry_interval", &c.Server.RetryInterval, &c.Server.RetryIntervalHCL, nil},
{"server.server_join.retry_interval", &c.Server.ServerJoin.RetryInterval, &c.Server.ServerJoin.RetryIntervalHCL, nil},
{"consul.timeout", &c.Consul.Timeout, &c.Consul.TimeoutHCL, nil},
Expand Down
18 changes: 18 additions & 0 deletions command/agent/config_parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ var basicConfig = &Config{
EncryptKey: "abc",
EnableEventBroker: helper.BoolToPtr(false),
EventBufferSize: helper.IntToPtr(200),
PlanRejectionTracker: &PlanRejectionTracker{
NodeThreshold: 100,
NodeWindow: 41 * time.Minute,
NodeWindowHCL: "41m",
},
ServerJoin: &ServerJoin{
RetryJoin: []string{"1.1.1.1", "2.2.2.2"},
RetryInterval: time.Duration(15) * time.Second,
Expand Down Expand Up @@ -540,6 +545,9 @@ func (c *Config) addDefaults() {
if c.Server.ServerJoin == nil {
c.Server.ServerJoin = &ServerJoin{}
}
if c.Server.PlanRejectionTracker == nil {
c.Server.PlanRejectionTracker = &PlanRejectionTracker{}
}
}

// Tests for a panic parsing json with an object of exactly
Expand Down Expand Up @@ -617,6 +625,11 @@ var sample0 = &Config{
RetryJoin: []string{"10.0.0.101", "10.0.0.102", "10.0.0.103"},
EncryptKey: "sHck3WL6cxuhuY7Mso9BHA==",
ServerJoin: &ServerJoin{},
PlanRejectionTracker: &PlanRejectionTracker{
NodeThreshold: 100,
NodeWindow: 31 * time.Minute,
NodeWindowHCL: "31m",
},
},
ACL: &ACLConfig{
Enabled: true,
Expand Down Expand Up @@ -707,6 +720,11 @@ var sample1 = &Config{
RetryJoin: []string{"10.0.0.101", "10.0.0.102", "10.0.0.103"},
EncryptKey: "sHck3WL6cxuhuY7Mso9BHA==",
ServerJoin: &ServerJoin{},
PlanRejectionTracker: &PlanRejectionTracker{
NodeThreshold: 100,
NodeWindow: 31 * time.Minute,
NodeWindowHCL: "31m",
},
},
ACL: &ACLConfig{
Enabled: true,
Expand Down
8 changes: 8 additions & 0 deletions command/agent/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,10 @@ func TestConfig_Merge(t *testing.T) {
UpgradeVersion: "foo",
EnableEventBroker: helper.BoolToPtr(false),
EventBufferSize: helper.IntToPtr(0),
PlanRejectionTracker: &PlanRejectionTracker{
NodeThreshold: 100,
NodeWindow: 11 * time.Minute,
},
},
ACL: &ACLConfig{
Enabled: true,
Expand Down Expand Up @@ -343,6 +347,10 @@ func TestConfig_Merge(t *testing.T) {
UpgradeVersion: "bar",
EnableEventBroker: helper.BoolToPtr(true),
EventBufferSize: helper.IntToPtr(100),
PlanRejectionTracker: &PlanRejectionTracker{
NodeThreshold: 100,
NodeWindow: 11 * time.Minute,
},
},
ACL: &ACLConfig{
Enabled: true,
Expand Down
5 changes: 5 additions & 0 deletions command/agent/testdata/basic.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,11 @@ server {
enable_event_broker = false
event_buffer_size = 200

plan_rejection_tracker {
node_threshold = 100
node_window = "41m"
}

server_join {
retry_join = ["1.1.1.1", "2.2.2.2"]
retry_max = 3
Expand Down
4 changes: 4 additions & 0 deletions command/agent/testdata/basic.json
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@
"node_gc_threshold": "12h",
"non_voting_server": true,
"num_schedulers": 2,
"plan_rejection_tracker": {
"node_threshold": 100,
"node_window": "41m"
},
"raft_protocol": 3,
"raft_multiplier": 4,
"redundancy_zone": "foo",
Expand Down
4 changes: 4 additions & 0 deletions command/agent/testdata/sample0.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@
"bootstrap_expect": 3,
"enabled": true,
"encrypt": "sHck3WL6cxuhuY7Mso9BHA==",
"plan_rejection_tracker": {
"node_threshold": 100,
"node_window": "31m"
},
"retry_join": [
"10.0.0.101",
"10.0.0.102",
Expand Down
4 changes: 4 additions & 0 deletions command/agent/testdata/sample1/sample0.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
"bootstrap_expect": 3,
"enabled": true,
"encrypt": "sHck3WL6cxuhuY7Mso9BHA==",
"plan_rejection_tracker": {
"node_threshold": 100,
"node_window": "31m"
},
"retry_join": [
"10.0.0.101",
"10.0.0.102",
Expand Down
10 changes: 10 additions & 0 deletions nomad/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,14 @@ type Config struct {
// additional delay is selected from this range randomly.
EvalFailedFollowupDelayRange time.Duration

// NodePlanRejectionThreshold is the number of times a node must have a
// plan rejection before it is set as ineligible.
NodePlanRejectionThreshold int

// NodePlanRejectionWindow is the time window used to track plan
// rejections for nodes.
NodePlanRejectionWindow time.Duration

// MinHeartbeatTTL is the minimum time between heartbeats.
// This is used as a floor to prevent excessive updates.
MinHeartbeatTTL time.Duration
Expand Down Expand Up @@ -395,6 +403,8 @@ func DefaultConfig() *Config {
MaxHeartbeatsPerSecond: 50.0,
HeartbeatGrace: 10 * time.Second,
FailoverHeartbeatTTL: 300 * time.Second,
NodePlanRejectionThreshold: 15,
NodePlanRejectionWindow: 10 * time.Minute,
ConsulConfig: config.DefaultConsulConfig(),
VaultConfig: config.DefaultVaultConfig(),
RPCHoldTimeout: 5 * time.Second,
Expand Down

0 comments on commit febcf30

Please sign in to comment.