
Commit

Address comments
fabriziopandini committed Oct 21, 2020
1 parent ce2b44d commit 364c34b
Showing 6 changed files with 24 additions and 35 deletions.
2 changes: 1 addition & 1 deletion api/v1alpha3/condition_consts.go
@@ -127,7 +127,7 @@ const (
 	// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
 	WaitingForRemediationReason = "WaitingForRemediation"

-	// RemediationFailedReason is the reason used when a machine fails a remediate an unhealthy machine.
+	// RemediationFailedReason is the reason used when a machine fails to be remediated.
 	RemediationFailedReason = "RemediationFailed"

 	// RemediationInProgressReason is the reason used when an unhealthy machine is being remediated.
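The renamed constant is consumed through the util/conditions helpers, as the remediation.go hunk later in this commit shows. For orientation, here is a minimal sketch of setting a failed-remediation condition on a Machine, assuming the v1alpha3 API; markRemediationFailed is an illustrative name, not a function in this commit:

```go
package remediation

import (
	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	"sigs.k8s.io/cluster-api/util/conditions"
)

// markRemediationFailed records on the Machine that KCP owned the
// remediation attempt and that it failed.
func markRemediationFailed(m *clusterv1.Machine, err error) {
	conditions.MarkFalse(m,
		clusterv1.MachineOwnerRemediatedCondition,
		clusterv1.RemediationFailedReason,
		clusterv1.ConditionSeverityWarning,
		"remediation failed: %v", err)
}
```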
4 changes: 2 additions & 2 deletions controlplane/kubeadm/controllers/controller.go
@@ -307,8 +307,8 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *

 	// Reconcile unhealthy machines by triggering deletion and requeue if it is considered safe to remediate,
 	// otherwise continue with the other MHC operations
-	ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
-	if !ret.IsZero() || err != nil {
+
+	if ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane); !ret.IsZero() || err != nil {
 		return ret, err
 	}

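The change above folds the call and the check into Go's if-with-initializer form, which scopes ret and err to the if statement so they cannot leak into, or collide with names in, the rest of reconcile. A self-contained illustration of the pattern, with fetch standing in for reconcileUnhealthyMachines:

```go
package main

import (
	"errors"
	"fmt"
)

func fetch() (int, error) { return 0, errors.New("requeue needed") }

func main() {
	// ret and err exist only inside this if statement; afterwards they
	// are out of scope, preventing accidental reuse further down.
	if ret, err := fetch(); err != nil {
		fmt.Println("returning early:", ret, err)
		return
	}
	fmt.Println("continue with the other operations")
}
```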
20 changes: 8 additions & 12 deletions controlplane/kubeadm/controllers/remediation.go
@@ -32,12 +32,11 @@ import (
 func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, c *internal.ControlPlane) (ret ctrl.Result, retErr error) {
 	logger := r.Log.WithValues("namespace", c.KCP.Namespace, "kubeadmControlPlane", c.KCP.Name, "cluster", c.Cluster.Name)

-	// Gets all machines that have a MachineHealthCheckSucceeded condition set to False,
-	// indicating a problem was detected on the machine, and the MachineOwnerRemediated condition set, indicating that KCP is
-	// responsible of performing remediation as owner of the machine.
+	// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
+	// and `MachineOwnerRemediated` present, indicating that KCP is responsible of performing remediation as owner of the machine.
 	unhealthyMachines := c.UnhealthyMachines()

-	// If there are not unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil).
+	// If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil).
 	if len(unhealthyMachines) == 0 {
 		return ctrl.Result{}, nil
 	}
@@ -70,26 +69,23 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
 		}
 	}()

-	// before starting remediation, run preflight checks in order to verify it is safe to remediate.
+	// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
 	// If those checks are failing, we surface the reason why remediation is not happening into the MachineOwnerRemediated condition
 	// and then we return so KCP can proceed with other operations (ctrl.Result nil).

-	var desiredReplicas int = 0
-	if c.KCP.Spec.Replicas != nil {
-		desiredReplicas = int(*c.KCP.Spec.Replicas)
-	}
+	desiredReplicas := int(*c.KCP.Spec.Replicas)

 	// The cluster MUST have spec.replicas >= 3, because this is the smallest cluster size that allows any etcd failure tolerance.
 	if desiredReplicas < 3 {
-		logger.Info("A control plane machine needs remediation, but there are less than 3 control plane machines. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas)
-		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 control plane machines")
+		logger.Info("A control plane machine needs remediation, but the number of desired replicas is less than 3. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas)
+		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 desired replicas")
 		return ctrl.Result{}, nil
 	}

 	// The number of replicas MUST be equal to or greater than the desired replicas. This rule ensures that when the cluster
 	// is missing replicas, we skip remediation and instead perform regular scale up/rollout operations first.
 	if c.Machines.Len() < desiredReplicas {
-		logger.Info("A control plane machine needs remediation, but the current number of replicas is currently lower that expected. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas, "CurrentReplicas", c.Machines.Len())
+		logger.Info("A control plane machine needs remediation, but the current number of replicas is lower that expected. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas, "CurrentReplicas", c.Machines.Len())
 		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least %d control plane machines before triggering remediation", desiredReplicas)
 		return ctrl.Result{}, nil
 	}
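Two notes on the hunk above. First, the nil guard on c.KCP.Spec.Replicas is dropped in favor of a direct dereference, presumably because defaulting guarantees the field is set by the time this code runs. Second, the `< 3` preflight check encodes standard etcd quorum arithmetic: a cluster of n members needs floor(n/2)+1 members for quorum, so it tolerates floor((n-1)/2) failures, and 3 is the smallest size with any tolerance at all. A quick check of that rule (the function names are illustrative, not part of the codebase):

```go
package main

import "fmt"

// Standard etcd sizing rules:
// quorum(n) = floor(n/2) + 1, tolerance(n) = n - quorum(n) = floor((n-1)/2).
func quorum(n int) int    { return n/2 + 1 }
func tolerance(n int) int { return (n - 1) / 2 }

func main() {
	for _, n := range []int{1, 2, 3, 4, 5} {
		fmt.Printf("replicas=%d quorum=%d tolerates=%d failure(s)\n", n, quorum(n), tolerance(n))
	}
	// replicas=3 is the first size that tolerates a failure, which is why
	// remediating (deleting) a member is refused below that threshold.
}
```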
9 changes: 3 additions & 6 deletions controlplane/kubeadm/controllers/remediation_test.go
@@ -21,6 +21,7 @@ import (
 	"testing"

 	. "github.com/onsi/gomega"
+
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/tools/record"
@@ -48,18 +49,15 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 	t.Run("Remediation does not happen if there are no unhealthy machines", func(t *testing.T) {
 		g := NewWithT(t)

-		m := createUnhealthyMachine(ctx, g, ns.Name, "m1-unhealthy-")
 		controlPlane := &internal.ControlPlane{
 			KCP:      &controlplanev1.KubeadmControlPlane{},
 			Cluster:  &clusterv1.Cluster{},
-			Machines: internal.NewFilterableMachineCollection(m),
+			Machines: internal.NewFilterableMachineCollection(),
 		}
 		ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane)

 		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
 		g.Expect(err).ToNot(HaveOccurred())
-
-		g.Expect(testEnv.Cleanup(ctx, m)).To(Succeed())
 	})

 	t.Run("reconcileUnhealthyMachines return early if the machine to be remediated is marked for deletion", func(t *testing.T) {
@@ -94,7 +92,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {

 		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
 		g.Expect(err).ToNot(HaveOccurred())
-		assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 control plane machines")
+		assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 desired replicas")

 		g.Expect(testEnv.Cleanup(ctx, m)).To(Succeed())
 	})
@@ -548,5 +546,4 @@ func assertMachineCondition(ctx context.Context, g *WithT, m *clusterv1.Machine,
 	g.Expect(machineOwnerRemediatedCondition.Reason).To(Equal(reason))
 	g.Expect(machineOwnerRemediatedCondition.Severity).To(Equal(severity))
 	g.Expect(machineOwnerRemediatedCondition.Message).To(Equal(message))
-
 }
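For reference, the assertMachineCondition helper exercised above can be approximated as below; this hypothetical, simplified version checks an in-memory Machine directly, whereas the real helper (partially shown in the last hunk) first re-fetches the object from the test API server:

```go
package controllers_test

import (
	. "github.com/onsi/gomega"

	corev1 "k8s.io/api/core/v1"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	"sigs.k8s.io/cluster-api/util/conditions"
)

// assertCondition is a simplified stand-in for the test helper: it looks up
// one condition on the Machine and asserts its status and reason.
func assertCondition(g *WithT, m *clusterv1.Machine, condType clusterv1.ConditionType, status corev1.ConditionStatus, reason string) {
	c := conditions.Get(m, condType)
	g.Expect(c).ToNot(BeNil())
	g.Expect(c.Status).To(Equal(status))
	g.Expect(c.Reason).To(Equal(reason))
}
```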
2 changes: 1 addition & 1 deletion controlplane/kubeadm/internal/control_plane_test.go
@@ -21,7 +21,6 @@ import (

 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
-	"sigs.k8s.io/cluster-api/util/conditions"

 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -30,6 +29,7 @@ import (
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
 	bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1alpha3"
 	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
+	"sigs.k8s.io/cluster-api/util/conditions"
 )

 func TestControlPlane(t *testing.T) {
22 changes: 9 additions & 13 deletions controlplane/kubeadm/internal/workload_cluster_etcd.go
@@ -322,29 +322,25 @@ func (w *Workload) EtcdStatus(ctx context.Context) ([]EtcdMemberStatus, error) {

 	statuses := []EtcdMemberStatus{}
 	for _, member := range members {
-		etcdStatus, err := etcdClient.EtcdClient.Status(ctx, staticPodName("etcd", member.Name))
+		responsive := true
+
+		// if it is not possible to get an etcd member status, it is considered it not responsive.
+		etcdStatus, err := etcdClient.EtcdClient.Status(ctx, staticPodName("etcd", member.Name))
 		if err != nil {
-			statuses = append(statuses, EtcdMemberStatus{
-				Name:       member.Name,
-				Responsive: false,
-			})
-			continue
+			responsive = false
 		}

 		// if there are error or alarms on an etcd member status, it is considered it not responsive.
 		// NOTE: in future we might want to make a more granular decision here; in the meantime we
 		// prefer to lean on the safe side.
-		if len(etcdStatus.Errors) > 0 {
-			statuses = append(statuses, EtcdMemberStatus{
-				Name:       member.Name,
-				Responsive: false,
-			})
-			continue
+		if etcdStatus != nil && len(etcdStatus.Errors) > 0 {
+			responsive = false
 		}

+		// Otherwise consider the member as responsive
 		statuses = append(statuses, EtcdMemberStatus{
 			Name:       member.Name,
-			Responsive: true,
+			Responsive: responsive,
 		})
 	}
 	return statuses, nil
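The rewrite above replaces two early append-and-continue branches with a single responsive flag and one append per member, and, since the error branch no longer continues, guards the alarm check with etcdStatus != nil. Here is the shape of the refactor in isolation; statusResponse is a local stand-in for the etcd client's status type, not the real one:

```go
package main

import (
	"errors"
	"fmt"
)

// statusResponse stands in for the etcd client's status response; only the
// Errors field matters for the responsiveness decision.
type statusResponse struct{ Errors []string }

type memberStatus struct {
	Name       string
	Responsive bool
}

// classify accumulates a single verdict instead of appending in each branch,
// treating an unreachable member and reported errors/alarms both as not responsive.
func classify(name string, st *statusResponse, err error) memberStatus {
	responsive := true
	if err != nil {
		responsive = false // could not get the member status
	}
	if st != nil && len(st.Errors) > 0 {
		responsive = false // member reports errors or alarms
	}
	return memberStatus{Name: name, Responsive: responsive}
}

func main() {
	fmt.Println(classify("m1", &statusResponse{}, nil))                      // {m1 true}
	fmt.Println(classify("m2", nil, errors.New("timeout")))                  // {m2 false}
	fmt.Println(classify("m3", &statusResponse{Errors: []string{"x"}}, nil)) // {m3 false}
}
```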
