[kcp] Combined health checks into a function
Sedef committed Apr 2, 2020
1 parent 603b6f4 commit b0e67d6
Showing 2 changed files with 33 additions and 35 deletions.
27 changes: 27 additions & 0 deletions controlplane/kubeadm/controllers/controller.go
@@ -320,3 +320,30 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M

return nil
}

func (r *KubeadmControlPlaneReconciler) generalHealthCheck(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) error {
logger := controlPlane.Logger()

// Do a health check of the Control Plane components
if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
logger.V(2).Info("Waiting for control plane to pass control plane health check to continue reconciliation", "cause", err)
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass control plane health check to continue reconciliation: %v", err)
return &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
}

// Ensure etcd is healthy
if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
// If any etcd members correspond to nodes that no longer exist, remove them from etcd and from the kubeadm configmap.
// This resolves issues caused by manual control-plane machine deletion.
if err := r.managementCluster.TargetClusterRemoveMissingNodes(ctx, util.ObjectKey(cluster)); err != nil {
logger.V(2).Info("Failed attempt to remove potential hanging etcd members to pass etcd health check to continue reconciliation", "cause", err)
}
logger.V(2).Info("Waiting for control plane to pass etcd health check to continue reconciliation", "cause", err)
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass etcd health check to continue reconciliation: %v", err)
return &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
}

return nil
}
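
For reference (not part of the commit's diff): the scale-up and scale-down paths in scale.go, shown next, consume this helper with the same pattern, converting any failed health check into a delayed requeue. A minimal sketch of that call site, using the reconciler's existing healthCheckFailedRequeueAfter constant:

    // Callers treat any error from the combined health check as a signal to
    // requeue the reconcile after healthCheckFailedRequeueAfter.
    if err := r.generalHealthCheck(ctx, cluster, kcp, controlPlane); err != nil {
        return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
    }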
41 changes: 6 additions & 35 deletions controlplane/kubeadm/controllers/scale.go
@@ -49,20 +49,7 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte
func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, _ internal.FilterableMachineCollection, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
logger := controlPlane.Logger()

if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
logger.V(2).Info("Waiting for control plane to pass control plane health check before adding an additional control plane machine", "cause", err)
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass control plane health check before adding additional control plane machine: %v", err)
return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
}

if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
// If there are any nodes in ETCD members that do not exist, remove them from ETCD and from kubeadm configmap.
// This will solve issues related to manual control-plane machine deletion.
if err := r.managementCluster.TargetClusterRemoveMissingNodes(ctx, util.ObjectKey(cluster)); err != nil {
logger.V(2).Info("Failed attempt to remove potential hanging etcd members to pass etcd health check before adding an additional control plane machine", "cause", err)
}
logger.V(2).Info("Waiting for control plane to pass etcd health check before adding an additional control plane machine", "cause", err)
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass etcd health check before adding additional control plane machine: %v", err)
if err := r.generalHealthCheck(ctx, cluster, kcp, controlPlane); err != nil {
return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
}

@@ -95,13 +82,16 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
}

// We don't want to health check at the beginning of this method to avoid blocking re-entrancy

// Wait for any delete in progress to complete before deleting another Machine
if controlPlane.HasDeletingMachine() {
return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: deleteRequeueAfter}
}

// We don't want to health check at the beginning of this method to avoid blocking re-entrancy
if err := r.generalHealthCheck(ctx, cluster, kcp, controlPlane); err != nil {
return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
}

markedForDeletion := selectedMachines.Filter(machinefilters.HasAnnotationKey(controlplanev1.DeleteForScaleDownAnnotation))
if len(markedForDeletion) == 0 {
fd := controlPlane.FailureDomainWithMostMachines(selectedMachines)
@@ -122,18 +112,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
return ctrl.Result{}, errors.New("failed to pick control plane Machine to delete")
}

// Ensure etcd is healthy prior to attempting to remove the member
if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
// If there are any nodes in ETCD members that do not exist, remove them from ETCD and from kubeadm configmap.
// This will solve issues related to manual control-plane machine deletion.
if err := r.managementCluster.TargetClusterRemoveMissingNodes(ctx, util.ObjectKey(cluster)); err != nil {
logger.V(2).Info("Failed attempt to remove potential hanging etcd members to pass etcd health check before adding an additional control plane machine", "cause", err)
}
logger.V(2).Info("Waiting for control plane to pass etcd health check before removing a control plane machine", "cause", err)
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass etcd health check before removing a control plane machine: %v", err)
return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
}
// If etcd leadership is on machine that is about to be deleted, move it to the newest member available.
etcdLeaderCandidate := ownedMachines.Newest()
if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToDelete, etcdLeaderCandidate); err != nil {
@@ -162,13 +140,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
}
}

// Do a final health check of the Control Plane components prior to actually deleting the machine
if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
logger.V(2).Info("Waiting for control plane to pass control plane health check before removing a control plane machine", "cause", err)
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass control plane health check before removing a control plane machine: %v", err)
return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
}
logger = logger.WithValues("machine", machineToDelete)
if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) {
logger.Error(err, "Failed to delete control plane machine")
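
As an aside on how the RequeueAfterError returned above is typically consumed: a caller further up the stack can unwrap it into a ctrl.Result with a delayed requeue. The helper below is hypothetical and not part of this commit; it assumes capierrors is sigs.k8s.io/cluster-api/errors and that RequeueAfterError exposes a GetRequeueAfter() accessor.

    // resultFromError is a hypothetical helper (not in this commit) that turns a
    // RequeueAfterError into a delayed requeue and passes other errors through.
    func resultFromError(err error) (ctrl.Result, error) {
        var requeueErr *capierrors.RequeueAfterError
        if errors.As(err, &requeueErr) { // standard library "errors" package
            return ctrl.Result{RequeueAfter: requeueErr.GetRequeueAfter()}, nil
        }
        return ctrl.Result{}, err
    }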
