
🐛 KCP health checks handle errors and check failures consistently #3863

Closed
38 changes: 14 additions & 24 deletions controlplane/kubeadm/controllers/controller.go
@@ -384,34 +384,32 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu
logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name)
logger.Info("Reconcile KubeadmControlPlane deletion")

// Gets all machines, not just control plane machines.
allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster))
Contributor

what about MachinePools?

Member Author

This PR is just shuffling things around to fix a bug, and this bit is not changing from what we currently have.
However, this is a good point and I will open an issue to track that we are missing MachinePools here.

if err != nil {
return ctrl.Result{}, err
}
ownedMachines := allMachines.Filter(machinefilters.OwnedMachines(kcp))

// Ignore the health check results here as well as the errors; the health check functions only set health-related conditions on Machines.
// Errors may be due to not being able to get workload cluster nodes.
workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
if err != nil {
r.Log.V(2).Info("Cannot get remote client to workload cluster during delete reconciliation", "err", err.Error())
} else {
// Do a health check of the Control Plane components
_, err = workloadCluster.ControlPlaneIsHealthy(ctx)
if err != nil {
if err := workloadCluster.ControlPlaneIsHealthy(ctx, ownedMachines.UnsortedList()); err != nil {
// Do nothing
r.Log.V(2).Info("Control plane did not pass control plane health check during delete reconciliation", "err", err.Error())
}

// Do a health check of the etcd
_, err = workloadCluster.EtcdIsHealthy(ctx)
if err != nil {
if err := workloadCluster.EtcdIsHealthy(ctx, ownedMachines.UnsortedList()); err != nil {
// Do nothing
r.Log.V(2).Info("Control plane did not pass etcd health check during delete reconciliation", "err", err.Error())
}
}

// Gets all machines, not just control plane machines.
allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster))
if err != nil {
return ctrl.Result{}, err
}
ownedMachines := allMachines.Filter(machinefilters.OwnedMachines(kcp))

// If no control plane machines remain, remove the finalizer
if len(ownedMachines) == 0 {
controllerutil.RemoveFinalizer(kcp, controlplanev1.KubeadmControlPlaneFinalizer)
@@ -486,31 +484,23 @@ func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneHealth(ctx context.

errList := []error{}

// Do a health check of the Control Plane components
checkResult, err := workloadCluster.ControlPlaneIsHealthy(ctx)
if err != nil {
errList = append(errList, errors.Wrap(err, "failed to pass control-plane health check"))
} else if err := checkResult.Aggregate(controlPlane); err != nil {
// Do a health check of the Control Plane components.
if err := workloadCluster.ControlPlaneIsHealthy(ctx, controlPlane.Machines.UnsortedList()); err != nil {
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass control plane health check to continue reconciliation: %v", err)
errList = append(errList, errors.Wrap(err, "failed to pass control-plane health check"))
}

// If KCP should manage etcd, ensure etcd is healthy.
if controlPlane.IsEtcdManaged() {
checkResult, err := workloadCluster.EtcdIsHealthy(ctx)
if err != nil {
errList = append(errList, errors.Wrap(err, "failed to pass etcd health check"))
} else if err := checkResult.Aggregate(controlPlane); err != nil {
if err := workloadCluster.EtcdIsHealthy(ctx, controlPlane.Machines.UnsortedList()); err != nil {
errList = append(errList, errors.Wrap(err, "failed to pass etcd health check"))
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass etcd health check to continue reconciliation: %v", err)

// If there are any etcd members that do not have corresponding nodes, remove them from etcd and from the kubeadm configmap.
// This will solve issues related to manual control-plane machine deletion.
workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
if err != nil {
errList = append(errList, errors.Wrap(err, "cannot get remote client to workload cluster"))
} else if err := workloadCluster.ReconcileEtcdMembers(ctx); err != nil {
if err := workloadCluster.ReconcileEtcdMembers(ctx); err != nil {
errList = append(errList, errors.Wrap(err, "failed attempt to remove potential hanging etcd members to pass etcd health check to continue reconciliation"))
}
}
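The two hunks above capture the core of the change: the health check functions now return a single error instead of a (HealthCheckResult, error) pair, so callers no longer distinguish between failing to run a check and a check that fails. A before/after sketch distilled from the diff (not additional code in the PR):

```go
// Before: two failure modes, handled separately by every caller.
checkResult, err := workloadCluster.ControlPlaneIsHealthy(ctx)
if err != nil {
	// could not run the check (e.g. cannot reach the workload cluster)
} else if err := checkResult.Aggregate(controlPlane); err != nil {
	// the check ran but a node failed it, or nodes and machines did not match
}

// After: one error covers both failure modes.
if err := workloadCluster.ControlPlaneIsHealthy(ctx, controlPlane.Machines.UnsortedList()); err != nil {
	// handled consistently: wrap the error, record an event, requeue
}
```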
12 changes: 6 additions & 6 deletions controlplane/kubeadm/controllers/fakes_test.go
@@ -98,18 +98,18 @@ func (f fakeWorkloadCluster) UpdateKubeletConfigMap(ctx context.Context, version
return nil
}

func (f fakeWorkloadCluster) ControlPlaneIsHealthy(ctx context.Context) (internal.HealthCheckResult, error) {
func (f fakeWorkloadCluster) ControlPlaneIsHealthy(ctx context.Context, machines []*clusterv1.Machine) error {
if !f.ControlPlaneHealthy {
return nil, errors.New("control plane is not healthy")
return errors.New("control plane is not healthy")
}
return nil, nil
return nil
}

func (f fakeWorkloadCluster) EtcdIsHealthy(ctx context.Context) (internal.HealthCheckResult, error) {
func (f fakeWorkloadCluster) EtcdIsHealthy(ctx context.Context, machines []*clusterv1.Machine) error {
if !f.EtcdHealthy {
return nil, errors.New("etcd is not healthy")
return errors.New("etcd is not healthy")
}
return nil, nil
return nil
}
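The fake above is driven by its ControlPlaneHealthy and EtcdHealthy fields (visible in the method bodies). A hypothetical test exercising it under the new signatures, with all surrounding wiring assumed, might look like:

```go
// Illustrative only; not part of this PR. Assumes the imports already used by
// the controller tests (context, clusterv1, gomega).
func TestFakeWorkloadClusterHealthChecks(t *testing.T) {
	g := NewWithT(t)

	fakeWC := fakeWorkloadCluster{ControlPlaneHealthy: true, EtcdHealthy: false}
	machines := []*clusterv1.Machine{} // the fake ignores its machines argument

	g.Expect(fakeWC.ControlPlaneIsHealthy(context.Background(), machines)).To(Succeed())
	g.Expect(fakeWC.EtcdIsHealthy(context.Background(), machines)).To(HaveOccurred())
}
```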

type fakeMigrator struct {
9 changes: 6 additions & 3 deletions controlplane/kubeadm/internal/cluster_test.go
@@ -96,6 +96,11 @@ func TestCheckStaticPodNotReadyCondition(t *testing.T) {
func TestControlPlaneIsHealthy(t *testing.T) {
g := NewWithT(t)

machines := []*clusterv1.Machine{
controlPlaneMachine("first-control-plane"),
controlPlaneMachine("second-control-plane"),
controlPlaneMachine("third-control-plane"),
}
readyStatus := corev1.PodStatus{
Conditions: []corev1.PodCondition{
{
@@ -118,10 +123,8 @@
},
}

health, err := workloadCluster.ControlPlaneIsHealthy(context.Background())
err := workloadCluster.ControlPlaneIsHealthy(context.Background(), machines)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(health).NotTo(HaveLen(0))
g.Expect(health).To(HaveLen(len(nodeListForTestControlPlaneIsHealthy().Items)))
}
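Both updated tests rely on a controlPlaneMachine helper that is defined elsewhere in the package and is not part of this diff. A plausible minimal sketch (the metav1/corev1 imports and the namespace are assumptions; the nodeRef requirement comes from checkResult.CompareWith in the workload_cluster.go hunks below):

```go
// Hypothetical sketch of the helper referenced above; not shown in this diff.
// The nodeRef must point at the node the health check inspects, otherwise
// CompareWith reports the machine's node as "not checked".
func controlPlaneMachine(name string) *clusterv1.Machine {
	return &clusterv1.Machine{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "default", // assumed
			Name:      name,
		},
		Status: clusterv1.MachineStatus{
			NodeRef: &corev1.ObjectReference{Name: name},
		},
	}
}
```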

func TestGetMachinesForCluster(t *testing.T) {
33 changes: 16 additions & 17 deletions controlplane/kubeadm/internal/workload_cluster.go
@@ -58,8 +58,8 @@ var (
type WorkloadCluster interface {
// Basic health and status checks.
ClusterStatus(ctx context.Context) (ClusterStatus, error)
ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult, error)
EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error)
ControlPlaneIsHealthy(ctx context.Context, machines []*clusterv1.Machine) error
EtcdIsHealthy(ctx context.Context, machines []*clusterv1.Machine) error

// Upgrade related tasks.
ReconcileKubeletRBACBinding(ctx context.Context, version semver.Version) error
@@ -107,17 +107,16 @@ func (w *Workload) getConfigMap(ctx context.Context, configMap ctrlclient.Object
return original.DeepCopy(), nil
}

// HealthCheckResult maps nodes that are checked to any errors the node has related to the check.
type HealthCheckResult map[string]error
// checkResult maps nodes that are checked to any errors the node has related to the check.
type checkResult map[string]error

// Aggregate will analyse HealthCheckResult and report any errors discovered.
// It also ensures there is a 1;1 match between nodes and machines.
func (h HealthCheckResult) Aggregate(controlPlane *ControlPlane) error {
// CompareWith compares the current health check result with the list of machines coming from the control plane.
// Additionally, it checks that for each machine a node is present.
func (c checkResult) CompareWith(machines []*clusterv1.Machine) error {
var errorList []error
kcpMachines := controlPlane.Machines.UnsortedList()
// Make sure Cluster API is aware of all the nodes.

for nodeName, err := range h {
for nodeName, err := range c {
if err != nil {
errorList = append(errorList, fmt.Errorf("node %q: %v", nodeName, err))
}
@@ -128,32 +127,32 @@ func (h HealthCheckResult) Aggregate(controlPlane *ControlPlane) error {

// This check ensures there is a 1 to 1 correspondence of nodes and machines.
// If a machine was not checked this is considered an error.
for _, machine := range kcpMachines {
for _, machine := range machines {
if machine.Status.NodeRef == nil {
// The condition for this case is set by the Machine controller
return errors.Errorf("control plane machine %s/%s has no status.nodeRef", machine.Namespace, machine.Name)
}
if _, ok := h[machine.Status.NodeRef.Name]; !ok {
if _, ok := c[machine.Status.NodeRef.Name]; !ok {
return errors.Errorf("machine's (%s/%s) node (%s) was not checked", machine.Namespace, machine.Name, machine.Status.NodeRef.Name)
}
}
if len(h) != len(kcpMachines) {
if len(c) != len(machines) {
// MachinesReadyCondition covers this health failure.
return errors.Errorf("number of nodes and machines in namespace %s did not match: %d nodes %d machines", controlPlane.Cluster.Namespace, len(h), len(kcpMachines))
return errors.Errorf("number of nodes and machines did not match: %d nodes %d machines", len(c), len(machines))
}
return nil
}
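To make the semantics of the new checkResult type concrete, here is a hypothetical test exercising CompareWith directly. machineWithNodeRef stands in for a helper that returns a *clusterv1.Machine whose status.nodeRef points at the given node; it, the gomega helpers, and the imports are assumptions rather than code from this PR, and the second assertion assumes the collapsed portion of CompareWith returns an aggregate of errorList when it is non-empty, as the visible code implies:

```go
// Illustrative only: exercises the outcomes visible in CompareWith above.
func TestCheckResultCompareWith(t *testing.T) {
	g := NewWithT(t)

	healthy := checkResult{"node-1": nil, "node-2": nil}
	machines := []*clusterv1.Machine{
		machineWithNodeRef("machine-1", "node-1"), // hypothetical helper
		machineWithNodeRef("machine-2", "node-2"),
	}
	// Every node healthy, every machine backed by a checked node: no error.
	g.Expect(healthy.CompareWith(machines)).To(Succeed())

	// A per-node check failure is surfaced as an error.
	unhealthy := checkResult{"node-1": nil, "node-2": errors.New("static pod is not ready")}
	g.Expect(unhealthy.CompareWith(machines)).ToNot(Succeed())

	// A machine whose node was never checked breaks the 1:1 node/machine correspondence.
	extra := append(machines, machineWithNodeRef("machine-3", "node-3"))
	g.Expect(healthy.CompareWith(extra)).ToNot(Succeed())
}
```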

// ControlPlaneIsHealthy does a best effort check of the control plane components the kubeadm control plane cares about.
// Each control plane node is checked and any error encountered is recorded for that node.
// The per-node results are then compared with the given machines via checkResult.CompareWith.
func (w *Workload) ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult, error) {
func (w *Workload) ControlPlaneIsHealthy(ctx context.Context, machines []*clusterv1.Machine) error {
controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
if err != nil {
return nil, err
return err
}

response := make(map[string]error)
response := make(checkResult)
for _, node := range controlPlaneNodes.Items {
name := node.Name
response[name] = nil
@@ -186,7 +185,7 @@ func (w *Workload) ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult
response[name] = checkStaticPodReadyCondition(controllerManagerPod)
}

return response, nil
return response.CompareWith(machines)
}

// UpdateKubernetesVersionInKubeadmConfigMap updates the kubernetes version in the kubeadm config map.
10 changes: 5 additions & 5 deletions controlplane/kubeadm/internal/workload_cluster_etcd.go
@@ -38,17 +38,17 @@ type etcdClientFor interface {
// This is a best effort check and nodes can become unhealthy after the check is complete. It is not a guarantee.
// It's used as a signal for whether we should allow a target cluster to scale up, scale down or upgrade.
// The per-node results are compared with the given machines via checkResult.CompareWith.
func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) {
func (w *Workload) EtcdIsHealthy(ctx context.Context, machines []*clusterv1.Machine) error {
var knownClusterID uint64
var knownMemberIDSet etcdutil.UInt64Set

controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
if err != nil {
return nil, err
return err
}

expectedMembers := 0
response := make(map[string]error)
response := make(checkResult)
for _, node := range controlPlaneNodes.Items {
name := node.Name
response[name] = nil
@@ -127,10 +127,10 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error)
// Check that there is exactly one etcd member for every healthy pod.
// This allows us to handle the expected case where there is a failing pod but it's been removed from the member list.
if expectedMembers != len(knownMemberIDSet) {
return response, errors.Errorf("there are %d healthy etcd pods, but %d etcd members", expectedMembers, len(knownMemberIDSet))
return errors.Errorf("there are %d healthy etcd pods, but %d etcd members", expectedMembers, len(knownMemberIDSet))
}

return response, nil
return response.CompareWith(machines)
}

// ReconcileEtcdMembers iterates over all etcd members and finds members that do not have corresponding nodes.
12 changes: 7 additions & 5 deletions controlplane/kubeadm/internal/workload_cluster_etcd_test.go
@@ -39,6 +39,12 @@ import (
func TestWorkload_EtcdIsHealthy(t *testing.T) {
g := NewWithT(t)

machines := []*clusterv1.Machine{
controlPlaneMachine("test-1"),
controlPlaneMachine("test-2"),
controlPlaneMachine("test-3"),
controlPlaneMachine("test-4"),
}
workload := &Workload{
Client: &fakeClient{
get: map[string]interface{}{
@@ -75,12 +81,8 @@ func TestWorkload_EtcdIsHealthy(t *testing.T) {
},
}
ctx := context.Background()
health, err := workload.EtcdIsHealthy(ctx)
err := workload.EtcdIsHealthy(ctx, machines)
g.Expect(err).NotTo(HaveOccurred())

for _, err := range health {
g.Expect(err).NotTo(HaveOccurred())
}
}

func TestUpdateEtcdVersionInKubeadmConfigMap(t *testing.T) {