From 0d07944fd3d23ebe476d248b1a594c445273b492 Mon Sep 17 00:00:00 2001 From: Sedef Date: Thu, 15 Oct 2020 15:04:45 -0700 Subject: [PATCH] Refactor controlplane health check in KCP --- .../kubeadm/controllers/controller.go | 105 ++++++++--- .../kubeadm/controllers/controller_test.go | 4 +- .../kubeadm/controllers/fakes_test.go | 17 +- controlplane/kubeadm/controllers/scale.go | 12 +- .../kubeadm/controllers/scale_test.go | 15 +- .../kubeadm/controllers/upgrade_test.go | 8 +- controlplane/kubeadm/internal/cluster.go | 67 ++++--- controlplane/kubeadm/internal/cluster_test.go | 163 +++++++----------- .../kubeadm/internal/machine_collection.go | 8 +- .../kubeadm/internal/workload_cluster.go | 14 +- .../kubeadm/internal/workload_cluster_etcd.go | 2 +- .../internal/workload_cluster_etcd_test.go | 8 +- 12 files changed, 241 insertions(+), 182 deletions(-) diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go index d6a10904cd75..5f8268c96c54 100644 --- a/controlplane/kubeadm/controllers/controller.go +++ b/controlplane/kubeadm/controllers/controller.go @@ -31,13 +31,6 @@ import ( kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/tools/record" "k8s.io/utils/pointer" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/handler" - "sigs.k8s.io/controller-runtime/pkg/source" - clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1alpha3" kubeadmv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/types/v1beta1" @@ -51,6 +44,12 @@ import ( "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/predicates" "sigs.k8s.io/cluster-api/util/secret" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/source" ) // +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch @@ -289,21 +288,25 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * return ctrl.Result{}, err } - ownedMachines := controlPlaneMachines.Filter(machinefilters.OwnedMachines(kcp)) - if len(ownedMachines) != len(controlPlaneMachines) { - logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode") - return ctrl.Result{}, nil - } - - controlPlane, err := internal.NewControlPlane(ctx, r.Client, cluster, kcp, ownedMachines) + controlPlane, err := r.createControlPlane(ctx, cluster, kcp) if err != nil { logger.Error(err, "failed to initialize control plane") return ctrl.Result{}, err } + if len(controlPlane.Machines) != len(controlPlaneMachines) { + logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode") + return ctrl.Result{}, nil + } // Aggregate the operational state of all the machines; while aggregating we are adding the // source ref (reason@machine/name) so the problem can be easily tracked down to its source machine. 
- conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef()) + conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, controlPlane.Machines.ConditionGetters(), conditions.AddSourceRef()) + + // reconcileControlPlaneHealth returns an error if there is a machine being deleted or the control plane is unhealthy. + // If the control plane is not initialized, then the control-plane machines will be empty and hence the health check will not fail. + if result, err := r.reconcileControlPlaneHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() { + return result, err + } // Control plane machine rollout due to configuration changes (e.g. upgrades) takes precedence over other operations. needRollout := controlPlane.MachinesNeedingRollout() @@ -324,7 +327,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * } // If we've made it this far, we can assume that all ownedMachines are up to date - numMachines := len(ownedMachines) + numMachines := len(controlPlane.Machines) desiredReplicas := int(*kcp.Spec.Replicas) switch { @@ -372,6 +375,24 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * return ctrl.Result{}, nil } +func (r *KubeadmControlPlaneReconciler) createControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (*internal.ControlPlane, error) { + logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name) + + controlPlaneMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster), machinefilters.ControlPlaneMachines(cluster.Name)) + if err != nil { + logger.Error(err, "failed to retrieve control plane machines for cluster") + return nil, err + } + ownedMachines := controlPlaneMachines.Filter(machinefilters.OwnedMachines(kcp)) + + controlPlane, err := internal.NewControlPlane(ctx, r.Client, cluster, kcp, ownedMachines) + if err != nil { + logger.Error(err, "failed to initialize control plane") + return nil, err + } + return controlPlane, nil +} + // reconcileDelete handles KubeadmControlPlane deletion. // The implementation does not take non-control plane workloads into consideration. This may or may not change in the future. // Please see https://github.com/kubernetes-sigs/cluster-api/issues/2064. @@ -379,6 +400,26 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name) logger.Info("Reconcile KubeadmControlPlane deletion") + controlPlane, err := r.createControlPlane(ctx, cluster, kcp) + if err != nil { + logger.Error(err, "failed to initialize control plane") + return ctrl.Result{}, err + } + + // Ignore the health check results and errors here; the health check functions are only called to set health-related conditions on Machines. + // Errors may be due to not being able to get workload cluster nodes.
+ _, err = r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) + if err != nil { + // Do nothing + r.Log.Info("Control plane did not pass control plane health check during delete reconciliation", "err", err.Error()) + } + _, err = r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) + if err != nil { + // Do nothing + r.Log.Info("Control plane did not pass etcd health check during delete reconciliation", "err", err.Error()) + } + + // Gets all machines, not just control plane machines. allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster)) if err != nil { return ctrl.Result{}, err } @@ -442,21 +483,43 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M return nil } -// reconcileHealth performs health checks for control plane components and etcd +// reconcileControlPlaneHealth performs health checks for control plane components and etcd // It removes any etcd members that do not have a corresponding node. // Also, as a final step, checks whether any machine is being deleted. -func (r *KubeadmControlPlaneReconciler) reconcileHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) { +func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) { + logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name) + + // If there are no KCP-owned control-plane machines, then the control plane has not been initialized yet. + if controlPlane.Machines.Len() == 0 { + return ctrl.Result{}, nil + } + + for i := range controlPlane.Machines { + m := controlPlane.Machines[i] + // Initialize the patch helper. + patchHelper, err := patch.NewHelper(m, r.Client) + if err != nil { + return ctrl.Result{}, err + } + + defer func() { + // Always attempt to Patch the Machine conditions after each health reconciliation. + if err := patchHelper.Patch(ctx, m); err != nil { + logger.Error(err, "Failed to patch KubeadmControlPlane Machine", "machine", m.Name) + } + }() + } // Do a health check of the Control Plane components - if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil { + if err := r.managementCluster.TargetClusterAnalyseControlPlaneHealth(ctx, controlPlane, util.ObjectKey(cluster)); err != nil { r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass control plane health check to continue reconciliation: %v", err) - return ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}, nil + return ctrl.Result{}, errors.Wrap(err, "failed to pass control-plane health check") } // If KCP should manage etcd, ensure etcd is healthy.
if controlPlane.IsEtcdManaged() { - if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster)); err != nil { + if err := r.managementCluster.TargetClusterAnalyseEtcdHealth(ctx, controlPlane, util.ObjectKey(cluster)); err != nil { errList := []error{errors.Wrap(err, "failed to pass etcd health check")} r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass etcd health check to continue reconciliation: %v", err) diff --git a/controlplane/kubeadm/controllers/controller_test.go b/controlplane/kubeadm/controllers/controller_test.go index 01dd7b2bf6eb..34c21a0a0a9c 100644 --- a/controlplane/kubeadm/controllers/controller_test.go +++ b/controlplane/kubeadm/controllers/controller_test.go @@ -572,7 +572,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) { g := NewWithT(t) cluster, kcp, tmpl := createClusterWithControlPlane() - cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com" + cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com1" cluster.Spec.ControlPlaneEndpoint.Port = 6443 kcp.Spec.Version = version @@ -642,7 +642,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) { g := NewWithT(t) cluster, kcp, tmpl := createClusterWithControlPlane() - cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com" + cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com2" cluster.Spec.ControlPlaneEndpoint.Port = 6443 kcp.Spec.Version = "v1.17.0" diff --git a/controlplane/kubeadm/controllers/fakes_test.go b/controlplane/kubeadm/controllers/fakes_test.go index 46af76804696..7866b8a2c8c8 100644 --- a/controlplane/kubeadm/controllers/fakes_test.go +++ b/controlplane/kubeadm/controllers/fakes_test.go @@ -57,19 +57,32 @@ func (f *fakeManagementCluster) GetMachinesForCluster(c context.Context, n clien return f.Machines, nil } -func (f *fakeManagementCluster) TargetClusterControlPlaneIsHealthy(_ context.Context, _ client.ObjectKey) error { +func (f *fakeManagementCluster) TargetClusterAnalyseControlPlaneHealth(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) error { if !f.ControlPlaneHealthy { return errors.New("control plane is not healthy") } return nil } -func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ client.ObjectKey) error { +func (f *fakeManagementCluster) TargetClusterAnalyseEtcdHealth(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) error { if !f.EtcdHealthy { return errors.New("etcd is not healthy") } return nil } +func (f *fakeManagementCluster) TargetClusterEtcdHealthCheck(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) (internal.HealthCheckResult, error) { + if !f.EtcdHealthy { + return nil, errors.New("etcd is not healthy") + } + return nil, nil +} + +func (f *fakeManagementCluster) TargetClusterControlPlaneHealthCheck(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) (internal.HealthCheckResult, error) { + if !f.ControlPlaneHealthy { + return nil, errors.New("control plane is not healthy") + } + return nil, nil +} type fakeWorkloadCluster struct { *internal.Workload diff --git a/controlplane/kubeadm/controllers/scale.go b/controlplane/kubeadm/controllers/scale.go index a89d5b8e5322..2591ec28c47e 100644 --- a/controlplane/kubeadm/controllers/scale.go +++ b/controlplane/kubeadm/controllers/scale.go @@ -63,11 +63,6 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx 
context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) { logger := controlPlane.Logger() - // reconcileHealth returns err if there is a machine being delete which is a required condition to check before scaling up - if result, err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() { - return result, err - } - // Create the bootstrap configuration bootstrapSpec := controlPlane.JoinControlPlaneConfig() fd := controlPlane.NextFailureDomainForScaleUp() @@ -90,10 +85,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( ) (ctrl.Result, error) { logger := controlPlane.Logger() - if result, err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() { - return result, err - } - workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster)) if err != nil { logger.Error(err, "Failed to create client to workload cluster") @@ -123,7 +114,8 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( } } - if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil { + // TODO: check if this is needed after moving the health check to the main reconcile + if err := r.managementCluster.TargetClusterAnalyseControlPlaneHealth(ctx, controlPlane, util.ObjectKey(cluster)); err != nil { logger.V(2).Info("Waiting for control plane to pass control plane health check before removing a control plane machine", "cause", err) r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass control plane health check before removing a control plane machine: %v", err) diff --git a/controlplane/kubeadm/controllers/scale_test.go b/controlplane/kubeadm/controllers/scale_test.go index ceed678aec27..c971906b90da 100644 --- a/controlplane/kubeadm/controllers/scale_test.go +++ b/controlplane/kubeadm/controllers/scale_test.go @@ -124,6 +124,8 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) { }) t.Run("does not create a control plane Machine if health checks fail", func(t *testing.T) { cluster, kcp, genericMachineTemplate := createClusterWithControlPlane() + cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com" + cluster.Spec.ControlPlaneEndpoint.Port = 6443 initObjs := []runtime.Object{cluster.DeepCopy(), kcp.DeepCopy(), genericMachineTemplate.DeepCopy()} beforeMachines := internal.NewFilterableMachineCollection() @@ -170,18 +172,11 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) { Log: log.Log, recorder: record.NewFakeRecorder(32), } - controlPlane := &internal.ControlPlane{ - KCP: kcp, - Cluster: cluster, - Machines: beforeMachines, - } - result, err := r.scaleUpControlPlane(context.Background(), cluster.DeepCopy(), kcp.DeepCopy(), controlPlane) - if tc.expectErr { - g.Expect(err).To(HaveOccurred()) - } - g.Expect(result).To(Equal(tc.expectResult)) + _, err := r.reconcile(context.Background(), cluster, kcp) + g.Expect(err).To(HaveOccurred()) + // scaleUpControlPlane is never called due to health check failure and new machine is not created to scale up. 
controlPlaneMachines := &clusterv1.MachineList{} g.Expect(fakeClient.List(context.Background(), controlPlaneMachines)).To(Succeed()) g.Expect(controlPlaneMachines.Items).To(HaveLen(len(beforeMachines))) diff --git a/controlplane/kubeadm/controllers/upgrade_test.go b/controlplane/kubeadm/controllers/upgrade_test.go index 53596c317188..f0702e2a3da2 100644 --- a/controlplane/kubeadm/controllers/upgrade_test.go +++ b/controlplane/kubeadm/controllers/upgrade_test.go @@ -36,6 +36,8 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) { g := NewWithT(t) cluster, kcp, genericMachineTemplate := createClusterWithControlPlane() + cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com" + cluster.Spec.ControlPlaneEndpoint.Port = 6443 kcp.Spec.Version = "v1.17.3" kcp.Spec.KubeadmConfigSpec.ClusterConfiguration = nil kcp.Spec.Replicas = pointer.Int32Ptr(1) @@ -89,9 +91,9 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) { // run upgrade a second time, simulate that the node has not appeared yet but the machine exists r.managementCluster.(*fakeManagementCluster).ControlPlaneHealthy = false - result, err = r.upgradeControlPlane(context.Background(), cluster, kcp, controlPlane, needingUpgrade) - g.Expect(result).To(Equal(ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter})) - g.Expect(err).To(BeNil()) + // Unhealthy control plane will be detected during reconcile loop and upgrade will never be called. + _, err = r.reconcile(context.Background(), cluster, kcp) + g.Expect(err).To(HaveOccurred()) g.Expect(fakeClient.List(context.Background(), bothMachines, client.InNamespace(cluster.Namespace))).To(Succeed()) g.Expect(bothMachines.Items).To(HaveLen(2)) diff --git a/controlplane/kubeadm/internal/cluster.go b/controlplane/kubeadm/internal/cluster.go index 0789c27e88f1..9ec096f8685d 100644 --- a/controlplane/kubeadm/internal/cluster.go +++ b/controlplane/kubeadm/internal/cluster.go @@ -28,13 +28,12 @@ import ( "k8s.io/apimachinery/pkg/runtime" kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/kubernetes/scheme" - "sigs.k8s.io/controller-runtime/pkg/client" - ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" - clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/machinefilters" "sigs.k8s.io/cluster-api/util/secret" + "sigs.k8s.io/controller-runtime/pkg/client" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" ) // ManagementCluster defines all behaviors necessary for something to function as a management cluster. 
@@ -42,8 +41,10 @@ type ManagementCluster interface { ctrlclient.Reader GetMachinesForCluster(ctx context.Context, cluster client.ObjectKey, filters ...machinefilters.Func) (FilterableMachineCollection, error) - TargetClusterEtcdIsHealthy(ctx context.Context, clusterKey client.ObjectKey) error - TargetClusterControlPlaneIsHealthy(ctx context.Context, clusterKey client.ObjectKey) error + TargetClusterControlPlaneHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) + TargetClusterEtcdHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) + TargetClusterAnalyseEtcdHealth(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error + TargetClusterAnalyseControlPlaneHealth(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error GetWorkloadCluster(ctx context.Context, clusterKey client.ObjectKey) (WorkloadCluster, error) } @@ -180,16 +181,15 @@ func (m *Management) getApiServerEtcdClientCert(ctx context.Context, clusterKey return tls.X509KeyPair(crtData, keyData) } -type healthCheck func(context.Context) (HealthCheckResult, error) - // HealthCheck will run a generic health check function and report any errors discovered. // In addition to the health check, it also ensures there is a 1:1 match between nodes and machines. -func (m *Management) healthCheck(ctx context.Context, check healthCheck, clusterKey client.ObjectKey) error { +// To have access to the owned control-plane machines during health checks, the ControlPlane is passed in here. +func (m *Management) healthCheck(controlPlane *ControlPlane, nodeChecks HealthCheckResult, clusterKey client.ObjectKey) error { var errorList []error - nodeChecks, err := check(ctx) - if err != nil { - errorList = append(errorList, err) - } + kcpMachines := controlPlane.Machines.UnsortedList() + // Make sure Cluster API is aware of all the nodes. + + // TODO: If any node has a failure, healthCheck fails. This may be too strict for the health check of HA clusters (Missing a single etcd pod does not indicate a problem)... for nodeName, err := range nodeChecks { if err != nil { errorList = append(errorList, fmt.Errorf("node %q: %v", nodeName, err)) @@ -199,44 +199,57 @@ func (m *Management) healthCheck(ctx context.Context, check healthCheck, cluster return kerrors.NewAggregate(errorList) } - // Make sure Cluster API is aware of all the nodes. - machines, err := m.GetMachinesForCluster(ctx, clusterKey, machinefilters.ControlPlaneMachines(clusterKey.Name)) - if err != nil { - return err - } - // This check ensures there is a 1 to 1 correspondence of nodes and machines. // If a machine was not checked this is considered an error.
- for _, machine := range machines { + for _, machine := range kcpMachines { if machine.Status.NodeRef == nil { + // The condition for this case is set by the Machine controller return errors.Errorf("control plane machine %s/%s has no status.nodeRef", machine.Namespace, machine.Name) } if _, ok := nodeChecks[machine.Status.NodeRef.Name]; !ok { return errors.Errorf("machine's (%s/%s) node (%s) was not checked", machine.Namespace, machine.Name, machine.Status.NodeRef.Name) } } - if len(nodeChecks) != len(machines) { - return errors.Errorf("number of nodes and machines in namespace %s did not match: %d nodes %d machines", clusterKey.Namespace, len(nodeChecks), len(machines)) + if len(nodeChecks) != len(kcpMachines) { + // MachinesReadyCondition covers this health failure. + return errors.Errorf("number of nodes and machines in namespace %s did not match: %d nodes %d machines", clusterKey.Namespace, len(nodeChecks), len(kcpMachines)) } return nil } +func (m *Management) TargetClusterControlPlaneHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) { + workloadCluster, err := m.GetWorkloadCluster(ctx, clusterKey) + if err != nil { + return nil, err + } + return workloadCluster.ControlPlaneIsHealthy(ctx, controlPlane) +} + // TargetClusterControlPlaneIsHealthy checks every node for control plane health. -func (m *Management) TargetClusterControlPlaneIsHealthy(ctx context.Context, clusterKey client.ObjectKey) error { +func (m *Management) TargetClusterAnalyseControlPlaneHealth(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error { // TODO: add checks for expected taints/labels - cluster, err := m.GetWorkloadCluster(ctx, clusterKey) + + checkResult, err := m.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, clusterKey) if err != nil { return err } - return m.healthCheck(ctx, cluster.ControlPlaneIsHealthy, clusterKey) + return m.healthCheck(controlPlane, checkResult, clusterKey) +} + +func (m *Management) TargetClusterEtcdHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) { + workloadCluster, err := m.GetWorkloadCluster(ctx, clusterKey) + if err != nil { + return nil, err + } + return workloadCluster.EtcdIsHealthy(ctx, controlPlane) } // TargetClusterEtcdIsHealthy runs a series of checks over a target cluster's etcd cluster. // In addition, it verifies that there are the same number of etcd members as control plane Machines. 
-func (m *Management) TargetClusterEtcdIsHealthy(ctx context.Context, clusterKey client.ObjectKey) error { - cluster, err := m.GetWorkloadCluster(ctx, clusterKey) +func (m *Management) TargetClusterAnalyseEtcdHealth(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error { + checkResult, err := m.TargetClusterEtcdHealthCheck(ctx, controlPlane, clusterKey) if err != nil { return err } - return m.healthCheck(ctx, cluster.EtcdIsHealthy, clusterKey) + return m.healthCheck(controlPlane, checkResult, clusterKey) } diff --git a/controlplane/kubeadm/internal/cluster_test.go b/controlplane/kubeadm/internal/cluster_test.go index 4cc990e90e85..3955df649e59 100644 --- a/controlplane/kubeadm/internal/cluster_test.go +++ b/controlplane/kubeadm/internal/cluster_test.go @@ -38,6 +38,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" + "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/machinefilters" "sigs.k8s.io/cluster-api/util/certs" "sigs.k8s.io/cluster-api/util/kubeconfig" @@ -89,7 +90,7 @@ func TestCheckStaticPodNotReadyCondition(t *testing.T) { Spec: corev1.PodSpec{}, Status: corev1.PodStatus{Conditions: test.conditions}, } - g.Expect(checkStaticPodReadyCondition(pod)).NotTo(Succeed()) + g.Expect(checkStaticPodReadyCondition(pod)).To(Succeed()) }) } } @@ -118,8 +119,10 @@ func TestControlPlaneIsHealthy(t *testing.T) { }, }, } + controlPlane, err := NewControlPlane(ctx, workloadCluster.Client, &clusterv1.Cluster{}, &v1alpha3.KubeadmControlPlane{}, nil) + g.Expect(err).NotTo(HaveOccurred()) - health, err := workloadCluster.ControlPlaneIsHealthy(context.Background()) + health, err := workloadCluster.ControlPlaneIsHealthy(context.Background(), controlPlane) g.Expect(err).NotTo(HaveOccurred()) g.Expect(health).NotTo(HaveLen(0)) g.Expect(health).To(HaveLen(len(nodeListForTestControlPlaneIsHealthy().Items))) @@ -156,7 +159,7 @@ func TestGetMachinesForCluster(t *testing.T) { func TestGetWorkloadCluster(t *testing.T) { g := NewWithT(t) - ns, err := testEnv.CreateNamespace(ctx, "workload-cluster") + ns, err := testEnv.CreateNamespace(ctx, "workload-cluster2") g.Expect(err).ToNot(HaveOccurred()) defer func() { g.Expect(testEnv.Cleanup(ctx, ns)).To(Succeed()) @@ -459,138 +462,105 @@ func (f *fakeClient) Update(_ context.Context, _ runtime.Object, _ ...client.Upd return nil } +func createControlPlane(machines []*clusterv1.Machine) *ControlPlane { + controlPlane, _ := NewControlPlane(ctx, &fakeClient{}, &clusterv1.Cluster{}, &v1alpha3.KubeadmControlPlane{}, NewFilterableMachineCollection(machines...)) + return controlPlane +} + func TestManagementCluster_healthCheck_NoError(t *testing.T) { + threeMachines := []*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + controlPlaneMachine("three"), + } + controlPlane := createControlPlane(threeMachines) tests := []struct { name string - machineList *clusterv1.MachineList - check healthCheck + checkResult HealthCheckResult clusterKey client.ObjectKey controlPlaneName string + controlPlane *ControlPlane }{ { name: "simple", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }, + checkResult: HealthCheckResult{ + "one": nil, + "two": nil, + "three": nil, }, - check: func(ctx context.Context) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": 
nil, - "two": nil, - "three": nil, - }, nil - }, - clusterKey: client.ObjectKey{Namespace: "default", Name: "cluster-name"}, + clusterKey: client.ObjectKey{Namespace: "default", Name: "cluster-name"}, + controlPlane: controlPlane, }, } + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) - ctx := context.Background() m := &Management{ - Client: &fakeClient{list: tt.machineList}, + Client: &fakeClient{}, } - g.Expect(m.healthCheck(ctx, tt.check, tt.clusterKey)).To(Succeed()) + g.Expect(m.healthCheck(controlPlane, tt.checkResult, tt.clusterKey)).To(Succeed()) }) } } func TestManagementCluster_healthCheck_Errors(t *testing.T) { + tests := []struct { name string - machineList *clusterv1.MachineList - check healthCheck + checkResult HealthCheckResult clusterKey client.ObjectKey controlPlaneName string + controlPlane *ControlPlane // expected errors will ensure the error contains this list of strings. // If not supplied, no check on the error's value will occur. expectedErrors []string }{ { name: "machine's node was not checked for health", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }, - }, - check: func(ctx context.Context) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - }, nil - }, - }, - { - name: "health check returns an error not related to the nodes health", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }, + controlPlane: createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + controlPlaneMachine("three"), + }), + checkResult: HealthCheckResult{ + "one": nil, }, - check: func(ctx context.Context) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": errors.New("two"), - "three": errors.New("three"), - }, errors.New("meta") - }, - expectedErrors: []string{"two", "three", "meta"}, }, { name: "two nodes error on the check but no overall error occurred", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }, - }, - check: func(ctx context.Context) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": errors.New("two"), - "three": errors.New("three"), - }, nil + controlPlane: createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + controlPlaneMachine("three")}), + checkResult: HealthCheckResult{ + "one": nil, + "two": errors.New("two"), + "three": errors.New("three"), }, expectedErrors: []string{"two", "three"}, }, { name: "more nodes than machines were checked (out of band control plane nodes)", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - }, - }, - check: func(ctx context.Context) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": nil, - "three": nil, - }, nil + controlPlane: createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one")}), + checkResult: HealthCheckResult{ + "one": nil, + "two": nil, + "three": nil, }, }, { name: "a machine that has a nil node reference", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - nilNodeRef(controlPlaneMachine("three")), - }, - }, - check: 
func(ctx context.Context) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": nil, - "three": nil, - }, nil + controlPlane: createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + nilNodeRef(controlPlaneMachine("three"))}), + checkResult: HealthCheckResult{ + "one": nil, + "two": nil, + "three": nil, }, }, } @@ -598,13 +568,10 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) - ctx := context.Background() clusterKey := client.ObjectKey{Namespace: "default", Name: "cluster-name"} - m := &Management{ - Client: &fakeClient{list: tt.machineList}, - } - err := m.healthCheck(ctx, tt.check, clusterKey) + m := &Management{Client: &fakeClient{}} + err := m.healthCheck(tt.controlPlane, tt.checkResult, clusterKey) g.Expect(err).To(HaveOccurred()) for _, expectedError := range tt.expectedErrors { @@ -614,9 +581,9 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { } } -func controlPlaneMachine(name string) clusterv1.Machine { +func controlPlaneMachine(name string) *clusterv1.Machine { t := true - return clusterv1.Machine{ + return &clusterv1.Machine{ ObjectMeta: metav1.ObjectMeta{ Namespace: "default", Name: name, @@ -637,7 +604,7 @@ func controlPlaneMachine(name string) clusterv1.Machine { } } -func nilNodeRef(machine clusterv1.Machine) clusterv1.Machine { +func nilNodeRef(machine *clusterv1.Machine) *clusterv1.Machine { machine.Status.NodeRef = nil return machine } diff --git a/controlplane/kubeadm/internal/machine_collection.go b/controlplane/kubeadm/internal/machine_collection.go index 425cea20344a..975aea467218 100644 --- a/controlplane/kubeadm/internal/machine_collection.go +++ b/controlplane/kubeadm/internal/machine_collection.go @@ -83,8 +83,8 @@ func (s FilterableMachineCollection) SortedByCreationTimestamp() []*clusterv1.Ma return res } -// unsortedList returns the slice with contents in random order. -func (s FilterableMachineCollection) unsortedList() []*clusterv1.Machine { +// UnsortedList returns the slice with contents in random order. +func (s FilterableMachineCollection) UnsortedList() []*clusterv1.Machine { res := make([]*clusterv1.Machine, 0, len(s)) for _, value := range s { res = append(res, value) @@ -110,12 +110,12 @@ func newFilteredMachineCollection(filter machinefilters.Func, machines ...*clust // Filter returns a FilterableMachineCollection containing only the Machines that match all of the given MachineFilters func (s FilterableMachineCollection) Filter(filters ...machinefilters.Func) FilterableMachineCollection { - return newFilteredMachineCollection(machinefilters.And(filters...), s.unsortedList()...) + return newFilteredMachineCollection(machinefilters.And(filters...), s.UnsortedList()...) } // AnyFilter returns a FilterableMachineCollection containing only the Machines that match any of the given MachineFilters func (s FilterableMachineCollection) AnyFilter(filters ...machinefilters.Func) FilterableMachineCollection { - return newFilteredMachineCollection(machinefilters.Or(filters...), s.unsortedList()...) + return newFilteredMachineCollection(machinefilters.Or(filters...), s.UnsortedList()...) 
} // Oldest returns the Machine with the oldest CreationTimestamp diff --git a/controlplane/kubeadm/internal/workload_cluster.go b/controlplane/kubeadm/internal/workload_cluster.go index d0db1fe684a9..c3807935d83c 100644 --- a/controlplane/kubeadm/internal/workload_cluster.go +++ b/controlplane/kubeadm/internal/workload_cluster.go @@ -53,12 +53,20 @@ var ( ErrControlPlaneMinNodes = errors.New("cluster has fewer than 2 control plane nodes; removing an etcd member is not supported") ) +// Common control-plane pod name prefixes +const ( + KubeAPIServerPodNamePrefix = "kube-apiserver" + KubeControllerManagerPodNamePrefix = "kube-controller-manager" + KubeSchedulerHealthyPodNamePrefix = "kube-scheduler" + EtcdPodNamePrefix = "etcd" +) + // WorkloadCluster defines all behaviors necessary to upgrade kubernetes on a workload cluster type WorkloadCluster interface { // Basic health and status checks. ClusterStatus(ctx context.Context) (ClusterStatus, error) - ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult, error) - EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) + ControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) + EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) // Upgrade related tasks. ReconcileKubeletRBACBinding(ctx context.Context, version semver.Version) error @@ -112,7 +120,7 @@ type HealthCheckResult map[string]error // ControlPlaneIsHealthy does a best-effort check of the control plane components the kubeadm control plane cares about. // The return map maps node names to the error that node encountered. // All nodes will exist in the map with nil errors if there were no errors for that node. -func (w *Workload) ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult, error) { +func (w *Workload) ControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { controlPlaneNodes, err := w.getControlPlaneNodes(ctx) if err != nil { return nil, err diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index 78261db27911..3951ca371497 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -38,7 +38,7 @@ type etcdClientFor interface { // This is a best effort check and nodes can become unhealthy after the check is complete. It is not a guarantee. // It's used as a signal for whether we should allow a target cluster to scale up, scale down or upgrade. // It returns a map of nodes checked along with an error for a given node.
-func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) { +func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { var knownClusterID uint64 var knownMemberIDSet etcdutil.UInt64Set diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go index d4384d4bbec7..e997135494bf 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go @@ -39,6 +39,12 @@ import ( func TestWorkload_EtcdIsHealthy(t *testing.T) { g := NewWithT(t) + controlPlane := createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + controlPlaneMachine("three"), + }) + workload := &Workload{ Client: &fakeClient{ get: map[string]interface{}{ @@ -75,7 +81,7 @@ func TestWorkload_EtcdIsHealthy(t *testing.T) { }, } ctx := context.Background() - health, err := workload.EtcdIsHealthy(ctx) + health, err := workload.EtcdIsHealthy(ctx, controlPlane) g.Expect(err).NotTo(HaveOccurred()) for _, err := range health {
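Note (illustrative sketch, not part of the patch): the core of the refactor is that the workload cluster now only produces a per-node HealthCheckResult, and Management.healthCheck cross-references that result against the KCP-owned machines it is handed. The standalone Go sketch below shows that matching logic under simplified assumptions; the machine and healthCheckResult types are hypothetical stand-ins for the real Cluster API types, and errors.Join (Go 1.20+) stands in for kerrors.NewAggregate used in the actual code.

package main

import (
	"errors"
	"fmt"
)

// healthCheckResult mirrors internal.HealthCheckResult: node name -> error for that node.
type healthCheckResult map[string]error

// machine is a simplified stand-in for a KCP-owned clusterv1.Machine.
type machine struct {
	name    string
	nodeRef string // empty string stands in for a nil status.nodeRef
}

// healthCheck sketches the refactored Management.healthCheck: aggregate the
// per-node errors first, then enforce a 1:1 match between checked nodes and
// KCP-owned machines.
func healthCheck(machines []machine, nodeChecks healthCheckResult) error {
	var errs []error
	for nodeName, err := range nodeChecks {
		if err != nil {
			errs = append(errs, fmt.Errorf("node %q: %v", nodeName, err))
		}
	}
	if len(errs) > 0 {
		return errors.Join(errs...)
	}
	for _, m := range machines {
		if m.nodeRef == "" {
			return fmt.Errorf("control plane machine %s has no status.nodeRef", m.name)
		}
		if _, ok := nodeChecks[m.nodeRef]; !ok {
			return fmt.Errorf("machine %s's node %s was not checked", m.name, m.nodeRef)
		}
	}
	if len(nodeChecks) != len(machines) {
		return fmt.Errorf("number of nodes (%d) and machines (%d) did not match", len(nodeChecks), len(machines))
	}
	return nil
}

func main() {
	machines := []machine{{name: "cp-0", nodeRef: "node-0"}, {name: "cp-1", nodeRef: "node-1"}}
	// One node reports an unhealthy etcd pod, so the aggregate check fails.
	checks := healthCheckResult{"node-0": nil, "node-1": errors.New("etcd pod is not ready")}
	fmt.Println(healthCheck(machines, checks))
}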