Skip to content

Commit

Permalink
Backport of Track plan rejection history and automatically mark clien…
Browse files Browse the repository at this point in the history
…ts as ineligible into release/1.3.x (#13729)

This pull request was automerged via backport-assistant
  • Loading branch information
hc-github-team-nomad-core authored Jul 12, 2022
1 parent a2ae1f2 commit e9b3351
Show file tree
Hide file tree
Showing 21 changed files with 763 additions and 12 deletions.
7 changes: 7 additions & 0 deletions .changelog/13421.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
```release-note:improvement
core: automatically mark clients with recurring plan rejections as ineligible
```

```release-note:improvement
metrics: emit `nomad.nomad.plan.rejection_tracker.node_score` metric for the number of times a node had a plan rejection within the past time window
```
14 changes: 14 additions & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,20 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
return nil, fmt.Errorf("deploy_query_rate_limit must be greater than 0")
}

// Set plan rejection tracker configuration.
if planRejectConf := agentConfig.Server.PlanRejectionTracker; planRejectConf != nil {
if planRejectConf.Enabled != nil {
conf.NodePlanRejectionEnabled = *planRejectConf.Enabled
}
conf.NodePlanRejectionThreshold = planRejectConf.NodeThreshold

if planRejectConf.NodeWindow == 0 {
return nil, fmt.Errorf("plan_rejection_tracker.node_window must be greater than 0")
} else {
conf.NodePlanRejectionWindow = planRejectConf.NodeWindow
}
}

// Add Enterprise license configs
conf.LicenseEnv = agentConfig.Server.LicenseEnv
conf.LicensePath = agentConfig.Server.LicensePath
Expand Down
76 changes: 76 additions & 0 deletions command/agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,82 @@ func TestAgent_ServerConfig_Limits_OK(t *testing.T) {
}
}

func TestAgent_ServerConfig_PlanRejectionTracker(t *testing.T) {
ci.Parallel(t)

cases := []struct {
name string
trackerConfig *PlanRejectionTracker
expectedConfig *PlanRejectionTracker
expectedErr string
}{
{
name: "default",
trackerConfig: nil,
expectedConfig: &PlanRejectionTracker{
NodeThreshold: 100,
NodeWindow: 5 * time.Minute,
},
expectedErr: "",
},
{
name: "valid config",
trackerConfig: &PlanRejectionTracker{
Enabled: helper.BoolToPtr(true),
NodeThreshold: 123,
NodeWindow: 17 * time.Minute,
},
expectedConfig: &PlanRejectionTracker{
Enabled: helper.BoolToPtr(true),
NodeThreshold: 123,
NodeWindow: 17 * time.Minute,
},
expectedErr: "",
},
{
name: "invalid node window",
trackerConfig: &PlanRejectionTracker{
NodeThreshold: 123,
},
expectedErr: "node_window must be greater than 0",
},
}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
config := DevConfig(nil)
require.NoError(t, config.normalizeAddrs())

if tc.trackerConfig != nil {
config.Server.PlanRejectionTracker = tc.trackerConfig
}

serverConfig, err := convertServerConfig(config)

if tc.expectedErr != "" {
require.Error(t, err)
require.Contains(t, err.Error(), tc.expectedErr)
} else {
require.NoError(t, err)
if tc.expectedConfig.Enabled != nil {
require.Equal(t,
*tc.expectedConfig.Enabled,
serverConfig.NodePlanRejectionEnabled,
)
}
require.Equal(t,
tc.expectedConfig.NodeThreshold,
serverConfig.NodePlanRejectionThreshold,
)
require.Equal(t,
tc.expectedConfig.NodeWindow,
serverConfig.NodePlanRejectionWindow,
)
}
})
}
}

func TestAgent_ServerConfig_RaftMultiplier_Ok(t *testing.T) {
ci.Parallel(t)

Expand Down
60 changes: 60 additions & 0 deletions command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,10 @@ type ServerConfig struct {
// This value is ignored.
DefaultSchedulerConfig *structs.SchedulerConfiguration `hcl:"default_scheduler_config"`

// PlanRejectionTracker configures the node plan rejection tracker that
// detects potentially bad nodes.
PlanRejectionTracker *PlanRejectionTracker `hcl:"plan_rejection_tracker"`

// EnableEventBroker configures whether this server's state store
// will generate events for its event stream.
EnableEventBroker *bool `hcl:"enable_event_broker"`
Expand Down Expand Up @@ -548,6 +552,53 @@ type RaftBoltConfig struct {
NoFreelistSync bool `hcl:"no_freelist_sync"`
}

// PlanRejectionTracker is used in servers to configure the plan rejection
// tracker.
type PlanRejectionTracker struct {
// Enabled controls if the plan rejection tracker is active or not.
Enabled *bool `hcl:"enabled"`

// NodeThreshold is the number of times a node can have plan rejections
// before it is marked as ineligible.
NodeThreshold int `hcl:"node_threshold"`

// NodeWindow is the time window used to track active plan rejections for
// nodes.
NodeWindow time.Duration
NodeWindowHCL string `hcl:"node_window" json:"-"`

// ExtraKeysHCL is used by hcl to surface unexpected keys
ExtraKeysHCL []string `hcl:",unusedKeys" json:"-"`
}

func (p *PlanRejectionTracker) Merge(b *PlanRejectionTracker) *PlanRejectionTracker {
if p == nil {
return b
}

result := *p

if b == nil {
return &result
}

if b.Enabled != nil {
result.Enabled = b.Enabled
}

if b.NodeThreshold != 0 {
result.NodeThreshold = b.NodeThreshold
}

if b.NodeWindow != 0 {
result.NodeWindow = b.NodeWindow
}
if b.NodeWindowHCL != "" {
result.NodeWindowHCL = b.NodeWindowHCL
}
return &result
}

// Search is used in servers to configure search API options.
type Search struct {
// FuzzyEnabled toggles whether the FuzzySearch API is enabled. If not
Expand Down Expand Up @@ -985,6 +1036,11 @@ func DefaultConfig() *Config {
EventBufferSize: helper.IntToPtr(100),
RaftProtocol: 3,
StartJoin: []string{},
PlanRejectionTracker: &PlanRejectionTracker{
Enabled: helper.BoolToPtr(false),
NodeThreshold: 100,
NodeWindow: 5 * time.Minute,
},
ServerJoin: &ServerJoin{
RetryJoin: []string{},
RetryInterval: 30 * time.Second,
Expand Down Expand Up @@ -1586,6 +1642,10 @@ func (s *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
result.EventBufferSize = b.EventBufferSize
}

if b.PlanRejectionTracker != nil {
result.PlanRejectionTracker = result.PlanRejectionTracker.Merge(b.PlanRejectionTracker)
}

if b.DefaultSchedulerConfig != nil {
c := *b.DefaultSchedulerConfig
result.DefaultSchedulerConfig = &c
Expand Down
8 changes: 6 additions & 2 deletions command/agent/config_parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,12 @@ func ParseConfigFile(path string) (*Config, error) {
VaultRetry: &client.RetryConfig{},
},
},
Server: &ServerConfig{
PlanRejectionTracker: &PlanRejectionTracker{},
ServerJoin: &ServerJoin{},
},
ACL: &ACLConfig{},
Audit: &config.AuditConfig{},
Server: &ServerConfig{ServerJoin: &ServerJoin{}},
Consul: &config.ConsulConfig{},
Autopilot: &config.AutopilotConfig{},
Telemetry: &Telemetry{},
Expand All @@ -54,7 +57,7 @@ func ParseConfigFile(path string) (*Config, error) {

err = hcl.Decode(c, buf.String())
if err != nil {
return nil, err
return nil, fmt.Errorf("failed to decode HCL file %s: %w", path, err)
}

// convert strings to time.Durations
Expand All @@ -66,6 +69,7 @@ func ParseConfigFile(path string) (*Config, error) {
{"server.heartbeat_grace", &c.Server.HeartbeatGrace, &c.Server.HeartbeatGraceHCL, nil},
{"server.min_heartbeat_ttl", &c.Server.MinHeartbeatTTL, &c.Server.MinHeartbeatTTLHCL, nil},
{"server.failover_heartbeat_ttl", &c.Server.FailoverHeartbeatTTL, &c.Server.FailoverHeartbeatTTLHCL, nil},
{"server.plan_rejection_tracker.node_window", &c.Server.PlanRejectionTracker.NodeWindow, &c.Server.PlanRejectionTracker.NodeWindowHCL, nil},
{"server.retry_interval", &c.Server.RetryInterval, &c.Server.RetryIntervalHCL, nil},
{"server.server_join.retry_interval", &c.Server.ServerJoin.RetryInterval, &c.Server.ServerJoin.RetryIntervalHCL, nil},
{"consul.timeout", &c.Consul.Timeout, &c.Consul.TimeoutHCL, nil},
Expand Down
19 changes: 19 additions & 0 deletions command/agent/config_parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,12 @@ var basicConfig = &Config{
EncryptKey: "abc",
EnableEventBroker: helper.BoolToPtr(false),
EventBufferSize: helper.IntToPtr(200),
PlanRejectionTracker: &PlanRejectionTracker{
Enabled: helper.BoolToPtr(true),
NodeThreshold: 100,
NodeWindow: 41 * time.Minute,
NodeWindowHCL: "41m",
},
ServerJoin: &ServerJoin{
RetryJoin: []string{"1.1.1.1", "2.2.2.2"},
RetryInterval: time.Duration(15) * time.Second,
Expand Down Expand Up @@ -543,6 +549,9 @@ func (c *Config) addDefaults() {
if c.Server.ServerJoin == nil {
c.Server.ServerJoin = &ServerJoin{}
}
if c.Server.PlanRejectionTracker == nil {
c.Server.PlanRejectionTracker = &PlanRejectionTracker{}
}
}

// Tests for a panic parsing json with an object of exactly
Expand Down Expand Up @@ -620,6 +629,11 @@ var sample0 = &Config{
RetryJoin: []string{"10.0.0.101", "10.0.0.102", "10.0.0.103"},
EncryptKey: "sHck3WL6cxuhuY7Mso9BHA==",
ServerJoin: &ServerJoin{},
PlanRejectionTracker: &PlanRejectionTracker{
NodeThreshold: 100,
NodeWindow: 31 * time.Minute,
NodeWindowHCL: "31m",
},
},
ACL: &ACLConfig{
Enabled: true,
Expand Down Expand Up @@ -710,6 +724,11 @@ var sample1 = &Config{
RetryJoin: []string{"10.0.0.101", "10.0.0.102", "10.0.0.103"},
EncryptKey: "sHck3WL6cxuhuY7Mso9BHA==",
ServerJoin: &ServerJoin{},
PlanRejectionTracker: &PlanRejectionTracker{
NodeThreshold: 100,
NodeWindow: 31 * time.Minute,
NodeWindowHCL: "31m",
},
},
ACL: &ACLConfig{
Enabled: true,
Expand Down
10 changes: 10 additions & 0 deletions command/agent/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,11 @@ func TestConfig_Merge(t *testing.T) {
UpgradeVersion: "foo",
EnableEventBroker: helper.BoolToPtr(false),
EventBufferSize: helper.IntToPtr(0),
PlanRejectionTracker: &PlanRejectionTracker{
Enabled: helper.BoolToPtr(true),
NodeThreshold: 100,
NodeWindow: 11 * time.Minute,
},
},
ACL: &ACLConfig{
Enabled: true,
Expand Down Expand Up @@ -343,6 +348,11 @@ func TestConfig_Merge(t *testing.T) {
UpgradeVersion: "bar",
EnableEventBroker: helper.BoolToPtr(true),
EventBufferSize: helper.IntToPtr(100),
PlanRejectionTracker: &PlanRejectionTracker{
Enabled: helper.BoolToPtr(true),
NodeThreshold: 100,
NodeWindow: 11 * time.Minute,
},
},
ACL: &ACLConfig{
Enabled: true,
Expand Down
6 changes: 6 additions & 0 deletions command/agent/testdata/basic.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,12 @@ server {
enable_event_broker = false
event_buffer_size = 200

plan_rejection_tracker {
enabled = true
node_threshold = 100
node_window = "41m"
}

server_join {
retry_join = ["1.1.1.1", "2.2.2.2"]
retry_max = 3
Expand Down
5 changes: 5 additions & 0 deletions command/agent/testdata/basic.json
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,11 @@
"node_gc_threshold": "12h",
"non_voting_server": true,
"num_schedulers": 2,
"plan_rejection_tracker": {
"enabled": true,
"node_threshold": 100,
"node_window": "41m"
},
"raft_protocol": 3,
"raft_multiplier": 4,
"redundancy_zone": "foo",
Expand Down
4 changes: 4 additions & 0 deletions command/agent/testdata/sample0.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@
"bootstrap_expect": 3,
"enabled": true,
"encrypt": "sHck3WL6cxuhuY7Mso9BHA==",
"plan_rejection_tracker": {
"node_threshold": 100,
"node_window": "31m"
},
"retry_join": [
"10.0.0.101",
"10.0.0.102",
Expand Down
4 changes: 4 additions & 0 deletions command/agent/testdata/sample1/sample0.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
"bootstrap_expect": 3,
"enabled": true,
"encrypt": "sHck3WL6cxuhuY7Mso9BHA==",
"plan_rejection_tracker": {
"node_threshold": 100,
"node_window": "31m"
},
"retry_join": [
"10.0.0.101",
"10.0.0.102",
Expand Down
14 changes: 14 additions & 0 deletions nomad/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,17 @@ type Config struct {
// additional delay is selected from this range randomly.
EvalFailedFollowupDelayRange time.Duration

// NodePlanRejectionEnabled controls if node rejection tracker is enabled.
NodePlanRejectionEnabled bool

// NodePlanRejectionThreshold is the number of times a node must have a
// plan rejection before it is set as ineligible.
NodePlanRejectionThreshold int

// NodePlanRejectionWindow is the time window used to track plan
// rejections for nodes.
NodePlanRejectionWindow time.Duration

// MinHeartbeatTTL is the minimum time between heartbeats.
// This is used as a floor to prevent excessive updates.
MinHeartbeatTTL time.Duration
Expand Down Expand Up @@ -395,6 +406,9 @@ func DefaultConfig() *Config {
MaxHeartbeatsPerSecond: 50.0,
HeartbeatGrace: 10 * time.Second,
FailoverHeartbeatTTL: 300 * time.Second,
NodePlanRejectionEnabled: false,
NodePlanRejectionThreshold: 15,
NodePlanRejectionWindow: 10 * time.Minute,
ConsulConfig: config.DefaultConsulConfig(),
VaultConfig: config.DefaultVaultConfig(),
RPCHoldTimeout: 5 * time.Second,
Expand Down
Loading

0 comments on commit e9b3351

Please sign in to comment.