From f984dba42469febabee9c48e8752e0f3f9e1eb96 Mon Sep 17 00:00:00 2001 From: Sedef Date: Fri, 9 Oct 2020 03:23:02 -0700 Subject: [PATCH] Add health checks to reconcileDelete --- .../kubeadm/controllers/controller.go | 52 ++++++++++--- .../kubeadm/controllers/fakes_test.go | 13 ++++ controlplane/kubeadm/internal/cluster.go | 36 ++++++--- controlplane/kubeadm/internal/cluster_test.go | 78 ++++++------------- 4 files changed, 104 insertions(+), 75 deletions(-) diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go index 7c2e6dfa136f..099baa3ac952 100644 --- a/controlplane/kubeadm/controllers/controller.go +++ b/controlplane/kubeadm/controllers/controller.go @@ -290,21 +290,19 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * return ctrl.Result{}, err } - ownedMachines := controlPlaneMachines.Filter(machinefilters.OwnedMachines(kcp)) - if len(ownedMachines) != len(controlPlaneMachines) { - logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode") - return ctrl.Result{}, nil - } - - controlPlane, err := internal.NewControlPlane(ctx, r.Client, cluster, kcp, ownedMachines) - if err != nil { + controlPlane, err := r.createControlPlane(ctx, cluster, kcp) + if controlPlane == nil || err != nil { logger.Error(err, "failed to initialize control plane") return ctrl.Result{}, err } + if len(controlPlane.Machines) != len(controlPlaneMachines) { + logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode") + return ctrl.Result{}, nil + } // Aggregate the operational state of all the machines; while aggregating we are adding the // source ref (reason@machine/name) so the problem can be easily tracked down to its source machine. - conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef()) + conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, controlPlane.Machines.ConditionGetters(), conditions.AddSourceRef()) // reconcileControlPlaneHealth returns err if there is a machine being delete or control plane is unhealthy. // If control plane is not initialized, then control-plane machines will be empty and hence health check will not fail. @@ -331,7 +329,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * } // If we've made it this far, we can assume that all ownedMachines are up to date - numMachines := len(ownedMachines) + numMachines := len(controlPlane.Machines) desiredReplicas := int(*kcp.Spec.Replicas) switch { @@ -379,6 +377,24 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * return ctrl.Result{}, nil } +func (r *KubeadmControlPlaneReconciler) createControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (*internal.ControlPlane, error) { + logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name) + + controlPlaneMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster), machinefilters.ControlPlaneMachines(cluster.Name)) + if err != nil { + logger.Error(err, "failed to retrieve control plane machines for cluster") + return nil, err + } + ownedMachines := controlPlaneMachines.Filter(machinefilters.OwnedMachines(kcp)) + + controlPlane, err := internal.NewControlPlane(ctx, r.Client, cluster, kcp, ownedMachines) + if err != nil { + logger.Error(err, "failed to initialize control plane") + return nil, err + } + return controlPlane, nil +} + // reconcileDelete handles KubeadmControlPlane deletion. // The implementation does not take non-control plane workloads into consideration. This may or may not change in the future. // Please see https://github.com/kubernetes-sigs/cluster-api/issues/2064. @@ -386,6 +402,22 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name) logger.Info("Reconcile KubeadmControlPlane deletion") + controlPlane, err := r.createControlPlane(ctx, cluster, kcp) + if controlPlane == nil || err != nil { + logger.Error(err, "failed to initialize control plane") + return ctrl.Result{}, err + } + + // Ignore the health check results here, they are used to set health related conditions on Machines. + _, err = r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) + if err != nil { + return ctrl.Result{}, err + } + _, err = r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) + if err != nil { + return ctrl.Result{}, err + } + allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster)) if err != nil { return ctrl.Result{}, err diff --git a/controlplane/kubeadm/controllers/fakes_test.go b/controlplane/kubeadm/controllers/fakes_test.go index fce1de771386..60f182dd2284 100644 --- a/controlplane/kubeadm/controllers/fakes_test.go +++ b/controlplane/kubeadm/controllers/fakes_test.go @@ -70,6 +70,19 @@ func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ } return nil } +func (f *fakeManagementCluster) TargetClusterEtcdHealthCheck(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) (internal.HealthCheckResult, error) { + if !f.EtcdHealthy { + return nil, errors.New("etcd is not healthy") + } + return nil, nil +} + +func (f *fakeManagementCluster) TargetClusterControlPlaneHealthCheck(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) (internal.HealthCheckResult, error) { + if !f.ControlPlaneHealthy { + return nil, errors.New("control plane is not healthy") + } + return nil, nil +} type fakeWorkloadCluster struct { *internal.Workload diff --git a/controlplane/kubeadm/internal/cluster.go b/controlplane/kubeadm/internal/cluster.go index 363e852411c5..53711331f82a 100644 --- a/controlplane/kubeadm/internal/cluster.go +++ b/controlplane/kubeadm/internal/cluster.go @@ -41,6 +41,8 @@ type ManagementCluster interface { ctrlclient.Reader GetMachinesForCluster(ctx context.Context, cluster client.ObjectKey, filters ...machinefilters.Func) (FilterableMachineCollection, error) + TargetClusterControlPlaneHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) + TargetClusterEtcdHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) TargetClusterEtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error TargetClusterControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error GetWorkloadCluster(ctx context.Context, clusterKey client.ObjectKey) (WorkloadCluster, error) @@ -179,21 +181,14 @@ func (m *Management) getApiServerEtcdClientCert(ctx context.Context, clusterKey return tls.X509KeyPair(crtData, keyData) } -type healthCheck func(context.Context, *ControlPlane) (HealthCheckResult, error) - // HealthCheck will run a generic health check function and report any errors discovered. // In addition to the health check, it also ensures there is a 1;1 match between nodes and machines. // To have access to the owned control-plane machines during health checks, need to pass owningMachines here. -func (m *Management) healthCheck(ctx context.Context, controlPlane *ControlPlane, check healthCheck, clusterKey client.ObjectKey) error { +func (m *Management) healthCheck(controlPlane *ControlPlane, nodeChecks HealthCheckResult, clusterKey client.ObjectKey) error { var errorList []error kcpMachines := controlPlane.Machines.UnsortedList() // Make sure Cluster API is aware of all the nodes. - nodeChecks, err := check(ctx, controlPlane) - if err != nil { - errorList = append(errorList, err) - } - // TODO: If any node has a failure, healthCheck fails. This may be too strict for the health check of HA clusters (Missing a single etcd pod does not indicate a problem)... for nodeName, err := range nodeChecks { if err != nil { @@ -222,22 +217,39 @@ func (m *Management) healthCheck(ctx context.Context, controlPlane *ControlPlane return nil } +func (m *Management) TargetClusterControlPlaneHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) { + workloadCluster, err := m.GetWorkloadCluster(ctx, clusterKey) + if err != nil { + return nil, err + } + return workloadCluster.ControlPlaneIsHealthy(ctx, controlPlane) +} + +func (m *Management) TargetClusterEtcdHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) { + workloadCluster, err := m.GetWorkloadCluster(ctx, clusterKey) + if err != nil { + return nil, err + } + return workloadCluster.EtcdIsHealthy(ctx, controlPlane) +} + // TargetClusterControlPlaneIsHealthy checks every node for control plane health. func (m *Management) TargetClusterControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error { // TODO: add checks for expected taints/labels - cluster, err := m.GetWorkloadCluster(ctx, clusterKey) + + checkResult, err := m.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, clusterKey) if err != nil { return err } - return m.healthCheck(ctx, controlPlane, cluster.ControlPlaneIsHealthy, clusterKey) + return m.healthCheck(controlPlane, checkResult, clusterKey) } // TargetClusterEtcdIsHealthy runs a series of checks over a target cluster's etcd cluster. // In addition, it verifies that there are the same number of etcd members as control plane Machines. func (m *Management) TargetClusterEtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error { - cluster, err := m.GetWorkloadCluster(ctx, clusterKey) + checkResult, err := m.TargetClusterEtcdHealthCheck(ctx, controlPlane, clusterKey) if err != nil { return err } - return m.healthCheck(ctx, controlPlane, cluster.EtcdIsHealthy, clusterKey) + return m.healthCheck(controlPlane, checkResult, clusterKey) } diff --git a/controlplane/kubeadm/internal/cluster_test.go b/controlplane/kubeadm/internal/cluster_test.go index f4193244ac5f..c9cc1f436ce7 100644 --- a/controlplane/kubeadm/internal/cluster_test.go +++ b/controlplane/kubeadm/internal/cluster_test.go @@ -25,9 +25,6 @@ import ( "errors" "fmt" "math/big" - "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" - "sigs.k8s.io/cluster-api/util/kubeconfig" - "sigs.k8s.io/cluster-api/util/secret" "testing" "time" @@ -41,8 +38,11 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" + "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/machinefilters" "sigs.k8s.io/cluster-api/util/certs" + "sigs.k8s.io/cluster-api/util/kubeconfig" + "sigs.k8s.io/cluster-api/util/secret" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -476,19 +476,17 @@ func TestManagementCluster_healthCheck_NoError(t *testing.T) { controlPlane := createControlPlane(threeMachines) tests := []struct { name string - check healthCheck + checkResult HealthCheckResult clusterKey client.ObjectKey controlPlaneName string controlPlane *ControlPlane }{ { name: "simple", - check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": nil, - "three": nil, - }, nil + checkResult: HealthCheckResult{ + "one": nil, + "two": nil, + "three": nil, }, clusterKey: client.ObjectKey{Namespace: "default", Name: "cluster-name"}, controlPlane: controlPlane, @@ -499,11 +497,10 @@ func TestManagementCluster_healthCheck_NoError(t *testing.T) { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) - ctx := context.Background() m := &Management{ Client: &fakeClient{}, } - g.Expect(m.healthCheck(ctx, controlPlane, tt.check, tt.clusterKey)).To(Succeed()) + g.Expect(m.healthCheck(controlPlane, tt.checkResult, tt.clusterKey)).To(Succeed()) }) } } @@ -512,7 +509,7 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { tests := []struct { name string - check healthCheck + checkResult HealthCheckResult clusterKey client.ObjectKey controlPlaneName string controlPlane *ControlPlane @@ -527,27 +524,9 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { controlPlaneMachine("two"), controlPlaneMachine("three"), }), - check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - }, nil - }, - }, - { - name: "health check returns an error not related to the nodes health", - controlPlane: createControlPlane([]*clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }), - check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": errors.New("two"), - "three": errors.New("three"), - }, errors.New("meta") + checkResult: HealthCheckResult{ + "one": nil, }, - expectedErrors: []string{"two", "three", "meta"}, }, { name: "two nodes error on the check but no overall error occurred", @@ -555,12 +534,10 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { controlPlaneMachine("one"), controlPlaneMachine("two"), controlPlaneMachine("three")}), - check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": errors.New("two"), - "three": errors.New("three"), - }, nil + checkResult: HealthCheckResult{ + "one": nil, + "two": errors.New("two"), + "three": errors.New("three"), }, expectedErrors: []string{"two", "three"}, }, @@ -568,12 +545,10 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { name: "more nodes than machines were checked (out of band control plane nodes)", controlPlane: createControlPlane([]*clusterv1.Machine{ controlPlaneMachine("one")}), - check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": nil, - "three": nil, - }, nil + checkResult: HealthCheckResult{ + "one": nil, + "two": nil, + "three": nil, }, }, { @@ -582,12 +557,10 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { controlPlaneMachine("one"), controlPlaneMachine("two"), nilNodeRef(controlPlaneMachine("three"))}), - check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": nil, - "three": nil, - }, nil + checkResult: HealthCheckResult{ + "one": nil, + "two": nil, + "three": nil, }, }, } @@ -595,11 +568,10 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) - ctx := context.Background() clusterKey := client.ObjectKey{Namespace: "default", Name: "cluster-name"} m := &Management{Client: &fakeClient{}} - err := m.healthCheck(ctx, tt.controlPlane, tt.check, clusterKey) + err := m.healthCheck(tt.controlPlane, tt.checkResult, clusterKey) g.Expect(err).To(HaveOccurred()) for _, expectedError := range tt.expectedErrors {