Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP][DNR] 🌱 Improve & fix KCP remediation e2e test #8075

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions controlplane/kubeadm/api/v1alpha3/conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error {
dst.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy
}

if restored.Spec.RemediationStrategy != nil {
dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy
}
if restored.Status.LastRemediation != nil {
dst.Status.LastRemediation = restored.Status.LastRemediation
}

return nil
}

Expand Down
2 changes: 2 additions & 0 deletions controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions controlplane/kubeadm/api/v1alpha4/conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error {
dst.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy
}

if restored.Spec.RemediationStrategy != nil {
dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy
}
if restored.Status.LastRemediation != nil {
dst.Status.LastRemediation = restored.Status.LastRemediation
}

return nil
}

Expand Down Expand Up @@ -173,6 +180,10 @@ func (src *KubeadmControlPlaneTemplate) ConvertTo(dstRaw conversion.Hub) error {
dst.Spec.Template.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.Template.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy
}

if restored.Spec.Template.Spec.RemediationStrategy != nil {
dst.Spec.Template.Spec.RemediationStrategy = restored.Spec.Template.Spec.RemediationStrategy
}

return nil
}

Expand Down Expand Up @@ -262,5 +273,11 @@ func Convert_v1beta1_KubeadmControlPlaneMachineTemplate_To_v1alpha4_KubeadmContr

// Convert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec converts a v1beta1
// KubeadmControlPlaneSpec into its v1alpha4 form by delegating to the generated conversion function.
//
// Fields added in v1beta1 and therefore not handled specially here:
//   - .RolloutBefore
//   - .RemediationStrategy
func Convert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in *controlplanev1.KubeadmControlPlaneSpec, out *KubeadmControlPlaneSpec, scope apiconversion.Scope) error {
	// No manual field handling is needed; hand the whole spec to the generated converter.
	err := autoConvert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in, out, scope)
	return err
}

// Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus converts a v1beta1
// KubeadmControlPlaneStatus into its v1alpha4 form by delegating to the generated conversion function.
//
// Fields added in v1beta1 and therefore not handled specially here:
//   - .LastRemediation
func Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in *controlplanev1.KubeadmControlPlaneStatus, out *KubeadmControlPlaneStatus, scope apiconversion.Scope) error {
	// No manual field handling is needed; hand the whole status to the generated converter.
	err := autoConvert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in, out, scope)
	return err
}
17 changes: 7 additions & 10 deletions controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

86 changes: 86 additions & 0 deletions controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.
package v1beta1

import (
"time"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
Expand Down Expand Up @@ -49,6 +51,23 @@ const (
// KubeadmClusterConfigurationAnnotation is a machine annotation that stores the json-marshalled string of KCP ClusterConfiguration.
// This annotation is used to detect any changes in ClusterConfiguration and trigger machine rollout in KCP.
KubeadmClusterConfigurationAnnotation = "controlplane.cluster.x-k8s.io/kubeadm-cluster-configuration"

// RemediationInProgressAnnotation is used to keep track that a KCP remediation is in progress, and more
// specifically it tracks that the system is in between having deleted an unhealthy machine and recreating its replacement.
// NOTE: if something external to CAPI removes this annotation the system cannot detect the above situation; this can lead to
// failures in updating remediation retry or remediation count (both counters restart from zero).
RemediationInProgressAnnotation = "controlplane.cluster.x-k8s.io/remediation-in-progress"

// RemediationForAnnotation is used to link a new machine to the unhealthy machine it is replacing;
// please note that in case of retry, when also the remediating machine fails, the system keeps track of
// the first machine of the sequence only.
// NOTE: if something external to CAPI removes this annotation, this can lead to
// failures in updating remediation retry (the counter restarts from zero).
RemediationForAnnotation = "controlplane.cluster.x-k8s.io/remediation-for"

// DefaultMinHealthyPeriod defines the default minimum period before we consider a remediation on a
// machine unrelated to the previous remediation.
DefaultMinHealthyPeriod = 1 * time.Hour
)

// KubeadmControlPlaneSpec defines the desired state of KubeadmControlPlane.
Expand Down Expand Up @@ -91,6 +110,10 @@ type KubeadmControlPlaneSpec struct {
// +optional
// +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}}
RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"`

// The RemediationStrategy that controls how control plane machine remediation happens.
// +optional
RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"`
}

// KubeadmControlPlaneMachineTemplate defines the template for Machines
Expand Down Expand Up @@ -158,6 +181,50 @@ type RollingUpdate struct {
MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
}

// RemediationStrategy defines how control plane machine remediation happens.
type RemediationStrategy struct {
// MaxRetry is the maximum number of retries while attempting to remediate an unhealthy machine.
// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
// For example, given a control plane with three machines M1, M2, M3:
//
// M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement.
// If M1-1 (replacement of M1) has problems while bootstrapping it will become unhealthy, and then be
// remediated; such operation is considered a retry, remediation-retry #1.
// If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry #2 will happen, etc.
//
// A retry can happen only after RetryPeriod has elapsed since the previous retry.
// If a machine is marked as unhealthy after MinHealthyPeriod from the previous remediation has expired,
// this is not considered a retry anymore because the new issue is assumed to be unrelated to the previous one.
//
// If not set, the remediation will be retried infinitely.
// +optional
MaxRetry *int32 `json:"maxRetry,omitempty"`

// RetryPeriod is the duration that KCP should wait before remediating a machine that was created as a
// replacement for an unhealthy machine (a retry).
//
// If not set, a retry will happen immediately.
// +optional
RetryPeriod metav1.Duration `json:"retryPeriod,omitempty"`

// MinHealthyPeriod defines the duration after which KCP will consider any failure of a machine unrelated
// to the previous one. In this case the remediation is not considered a retry anymore, and thus the retry
// counter restarts from 0. For example, assuming MinHealthyPeriod is set to 1h (the default):
//
// M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement.
// If M1-1 (replacement of M1) has problems within 1h after its creation, this machine will
// also be remediated and this operation is considered a retry (a problem related
// to the original issue that happened to M1).
//
// If instead the problem on M1-1 happens after MinHealthyPeriod has expired, e.g. four days after
// M1-1 has been created as a remediation of M1, the problem on M1-1 is considered unrelated to
// the original issue that happened to M1.
//
// If not set, this value is defaulted to 1h.
// +optional
MinHealthyPeriod *metav1.Duration `json:"minHealthyPeriod,omitempty"`
}

// KubeadmControlPlaneStatus defines the observed state of KubeadmControlPlane.
type KubeadmControlPlaneStatus struct {
// Selector is the label selector in string format to avoid introspection
Expand Down Expand Up @@ -223,6 +290,25 @@ type KubeadmControlPlaneStatus struct {
// Conditions defines current service state of the KubeadmControlPlane.
// +optional
Conditions clusterv1.Conditions `json:"conditions,omitempty"`

// LastRemediation stores info about last remediation performed.
// +optional
LastRemediation *LastRemediationStatus `json:"lastRemediation,omitempty"`
}

// LastRemediationStatus stores info about the last remediation performed.
// NOTE: if for any reason the information about the last remediation is lost, RetryCount is going to
// restart from 0 and thus more remediations than expected might happen.
type LastRemediationStatus struct {
// Machine is the name of the latest machine being remediated.
Machine string `json:"machine"`

// Timestamp is when the last remediation happened. It is represented in RFC3339 form and is in UTC.
Timestamp metav1.Time `json:"timestamp"`

// RetryCount is used to keep track of remediation retries for the last remediated machine.
// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
RetryCount int32 `json:"retryCount"`
}

// +kubebuilder:object:root=true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ func (in *KubeadmControlPlane) ValidateUpdate(old runtime.Object) error {
{spec, "machineTemplate", "nodeDeletionTimeout"},
{spec, "replicas"},
{spec, "version"},
{spec, "remediationStrategy"},
{spec, "remediationStrategy", "*"},
{spec, "rolloutAfter"},
{spec, "rolloutBefore", "*"},
{spec, "rolloutStrategy", "*"},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,11 @@ func TestKubeadmControlPlaneValidateUpdate(t *testing.T) {
validUpdate.Spec.RolloutBefore = &RolloutBefore{
CertificatesExpiryDays: pointer.Int32(14),
}
validUpdate.Spec.RemediationStrategy = &RemediationStrategy{
MaxRetry: pointer.Int32(50),
MinHealthyPeriod: &metav1.Duration{Duration: 10 * time.Hour},
RetryPeriod: metav1.Duration{Duration: 10 * time.Minute},
}
validUpdate.Spec.KubeadmConfigSpec.Format = bootstrapv1.CloudConfig

scaleToZero := before.DeepCopy()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ type KubeadmControlPlaneTemplateResourceSpec struct {
// +optional
// +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}}
RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"`

// The RemediationStrategy that controls how control plane machine remediation happens.
// +optional
RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"`
}

// KubeadmControlPlaneTemplateMachineTemplate defines the template for Machines
Expand Down
57 changes: 57 additions & 0 deletions controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading