Skip to content

Commit

Permalink
Address feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
JoelSpeed committed Feb 26, 2020
1 parent 843e020 commit 5ac9cc8
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 11 deletions.
2 changes: 1 addition & 1 deletion api/v1alpha3/machinehealthcheck_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ type MachineHealthCheckSpec struct {
MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"`

// Machines older than this duration without a node will be considered to have
// failed and will be remediated
// failed and will be remediated.
// +optional
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`
}
Expand Down
13 changes: 10 additions & 3 deletions api/v1alpha3/machinehealthcheck_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ import (
"sigs.k8s.io/controller-runtime/pkg/webhook"
)

var (
// Default time allowed for a node to start up. Can be made longer as part of
// spec if required for particular provider.
// 10 minutes should allow the instance to start and the node to join the
// cluster on most providers.
defaultNodeStartupTimeout = metav1.Duration{Duration: 10 * time.Minute}
)

func (m *MachineHealthCheck) SetupWebhookWithManager(mgr ctrl.Manager) error {
return ctrl.NewWebhookManagedBy(mgr).
For(m).
Expand All @@ -49,7 +57,6 @@ func (m *MachineHealthCheck) Default() {
}

if m.Spec.NodeStartupTimeout == nil {
defaultNodeStartupTimeout := metav1.Duration{Duration: 10 * time.Minute}
m.Spec.NodeStartupTimeout = &defaultNodeStartupTimeout
}
}
Expand Down Expand Up @@ -92,10 +99,10 @@ func (m *MachineHealthCheck) validate(old *MachineHealthCheck) error {
)
}

if m.Spec.NodeStartupTimeout != nil && m.Spec.NodeStartupTimeout.Nanoseconds() <= 0 {
if m.Spec.NodeStartupTimeout != nil && m.Spec.NodeStartupTimeout.Seconds() < 30 {
allErrs = append(
allErrs,
field.Invalid(field.NewPath("spec", "nodeStartupTimeout"), m.Spec.NodeStartupTimeout, "must be greater than 0"),
field.Invalid(field.NewPath("spec", "nodeStartupTimeout"), m.Spec.NodeStartupTimeout, "must be greater than 30s"),
)
}

Expand Down
14 changes: 13 additions & 1 deletion api/v1alpha3/machinehealthcheck_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ func TestMachineHealthCheckClusterNameImmutable(t *testing.T) {

func TestMachineHealthCheckNodeStartupTimeout(t *testing.T) {
zero := metav1.Duration{Duration: 0}
twentyNineSeconds := metav1.Duration{Duration: 29 * time.Second}
thirtySeconds := metav1.Duration{Duration: 30 * time.Second}
oneMinute := metav1.Duration{Duration: 1 * time.Minute}
minusOneMinute := metav1.Duration{Duration: -1 * time.Minute}

Expand All @@ -133,10 +135,20 @@ func TestMachineHealthCheckNodeStartupTimeout(t *testing.T) {
expectErr: false,
},
{
name: "when the nodeStartupTimeout is greater than 0",
name: "when the nodeStartupTimeout is greater than 30s",
timeout: &oneMinute,
expectErr: false,
},
{
name: "when the nodeStartupTimeout is 30s",
timeout: &thirtySeconds,
expectErr: false,
},
{
name: "when the nodeStartupTimeout is 29s",
timeout: &twentyNineSeconds,
expectErr: true,
},
{
name: "when the nodeStartupTimeout is less than 0",
timeout: &minusOneMinute,
Expand Down
10 changes: 5 additions & 5 deletions controllers/machinehealthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ import (
)

const (
mhcClusterNameIndex = "spec.clusterName"
machineNodeNameIndex = "status.nodeRef.name"
mhcClusterNameIndex = "spec.clusterName"
machineNodeNameIndex = "status.nodeRef.name"
defaultTimeoutForMachineToHaveNode = 10 * time.Minute
)

// MachineHealthCheckReconciler reconciles a MachineHealthCheck object
Expand Down Expand Up @@ -185,8 +186,7 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, cluster *c
return ctrl.Result{}, err
}

err = r.watchClusterNodes(ctx, r.Client, cluster)
if err != nil {
if err := r.watchClusterNodes(ctx, r.Client, cluster); err != nil {
logger.Error(err, "Error watching nodes on target cluster")
return ctrl.Result{}, err
}
Expand All @@ -202,7 +202,7 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, cluster *c
m.Status.ExpectedMachines = int32(totalTargets)

// Default to 10 minutes but override if set in MachineHealthCheck
timeoutForMachineToHaveNode := 10 * time.Minute
timeoutForMachineToHaveNode := defaultTimeoutForMachineToHaveNode
if m.Spec.NodeStartupTimeout != nil {
timeoutForMachineToHaveNode = m.Spec.NodeStartupTimeout.Duration
}
Expand Down
10 changes: 9 additions & 1 deletion controllers/machinehealthcheck_targets.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,15 @@ func (t *healthCheckTarget) nodeName() string {
return ""
}

// Determine whether or not a given target needs remediation
// Determine whether or not a given target needs remediation.
// The node will need remediation if any of the following are true:
// - The Machine has failed for some reason
// - The Machine did not get a node before `timeoutForMachineToHaveNode` elapses
// - The Node has gone away
// - Any condition on the node is matched for the given timeout
// If the target doesn't currently need remediation, provide a duration after
// which the target should next be checked.
// The target should be requeued after this duration.
func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode time.Duration) (bool, time.Duration) {
var nextCheckTimes []time.Duration
now := time.Now()
Expand Down

0 comments on commit 5ac9cc8

Please sign in to comment.