From fba0337063462c37487ff92a780fe31df89dc62c Mon Sep 17 00:00:00 2001 From: Vince Prignano Date: Fri, 31 May 2024 08:34:55 -0700 Subject: [PATCH] :sparkles: Add remediation strategy support for MachineSet Signed-off-by: Vince Prignano tests Signed-off-by: Vince Prignano --- api/v1beta1/machinedeployment_types.go | 30 ++ api/v1beta1/zz_generated.deepcopy.go | 25 ++ api/v1beta1/zz_generated.openapi.go | 30 +- .../cluster.x-k8s.io_clusterclasses.yaml | 30 ++ .../crd/bases/cluster.x-k8s.io_clusters.yaml | 30 ++ .../cluster.x-k8s.io_machinedeployments.yaml | 30 ++ .../desiredstate/desired_state_test.go | 6 + internal/apis/core/v1alpha3/conversion.go | 19 +- .../core/v1alpha3/zz_generated.conversion.go | 31 +- internal/apis/core/v1alpha4/conversion.go | 15 + .../core/v1alpha4/zz_generated.conversion.go | 51 +-- .../machineset/machineset_controller.go | 87 ++++- .../machineset/machineset_controller_test.go | 359 ++++++++++++++++++ internal/webhooks/machinedeployment.go | 17 + internal/webhooks/machinedeployment_test.go | 36 ++ test/e2e/clusterclass_rollout.go | 3 + 16 files changed, 746 insertions(+), 53 deletions(-) diff --git a/api/v1beta1/machinedeployment_types.go b/api/v1beta1/machinedeployment_types.go index 13a023d07a63..41893ef5b4c0 100644 --- a/api/v1beta1/machinedeployment_types.go +++ b/api/v1beta1/machinedeployment_types.go @@ -162,6 +162,11 @@ type MachineDeploymentStrategy struct { // MachineDeploymentStrategyType = RollingUpdate. // +optional RollingUpdate *MachineRollingUpdateDeployment `json:"rollingUpdate,omitempty"` + + // Remediation controls the strategy of remediating unhealthy machines + // and how remediating operations should occur during the lifecycle of the dependant MachineSets. + // +optional + Remediation *RemediationStrategy `json:"remediation,omitempty"` } // ANCHOR_END: MachineDeploymentStrategy @@ -211,6 +216,31 @@ type MachineRollingUpdateDeployment struct { // ANCHOR_END: MachineRollingUpdateDeployment +// ANCHOR: RemediationStrategy + +// RemediationStrategy allows to define how the MachineSet can control scaling operations. +type RemediationStrategy struct { + // MaxInFlight determines how many in flight remediations should happen at the same time. + // + // Remediation only happens on the MachineSet with the most current revision, while + // older MachineSets (usually present during rollout operations) aren't allowed to remediate. + // + // Note: In general (independent of remediations), unhealthy machines are always + // prioritized during scale down operations over healthy ones. + // + // MaxInFlight can be set to a fixed number or a percentage. + // Example: when this is set to 20%, the MachineSet controller deletes at most 20% of + // the desired replicas. + // + // If not set, remediation is limited to all machines (bounded by replicas) + // under the active MachineSet's management. + // + // +optional + MaxInFlight *intstr.IntOrString `json:"maxInFlight,omitempty"` +} + +// ANCHOR_END: RemediationStrategy + // ANCHOR: MachineDeploymentStatus // MachineDeploymentStatus defines the observed state of MachineDeployment. diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 695ac1cea34a..aa62c8fb9f4f 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -1182,6 +1182,11 @@ func (in *MachineDeploymentStrategy) DeepCopyInto(out *MachineDeploymentStrategy *out = new(MachineRollingUpdateDeployment) (*in).DeepCopyInto(*out) } + if in.Remediation != nil { + in, out := &in.Remediation, &out.Remediation + *out = new(RemediationStrategy) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MachineDeploymentStrategy. @@ -2069,6 +2074,26 @@ func (in *PatchSelectorMatchMachinePoolClass) DeepCopy() *PatchSelectorMatchMach return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RemediationStrategy) DeepCopyInto(out *RemediationStrategy) { + *out = *in + if in.MaxInFlight != nil { + in, out := &in.MaxInFlight, &out.MaxInFlight + *out = new(intstr.IntOrString) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemediationStrategy. +func (in *RemediationStrategy) DeepCopy() *RemediationStrategy { + if in == nil { + return nil + } + out := new(RemediationStrategy) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Topology) DeepCopyInto(out *Topology) { *out = *in diff --git a/api/v1beta1/zz_generated.openapi.go b/api/v1beta1/zz_generated.openapi.go index 849b9a01c665..4953306c2985 100644 --- a/api/v1beta1/zz_generated.openapi.go +++ b/api/v1beta1/zz_generated.openapi.go @@ -95,6 +95,7 @@ func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenA "sigs.k8s.io/cluster-api/api/v1beta1.PatchSelectorMatch": schema_sigsk8sio_cluster_api_api_v1beta1_PatchSelectorMatch(ref), "sigs.k8s.io/cluster-api/api/v1beta1.PatchSelectorMatchMachineDeploymentClass": schema_sigsk8sio_cluster_api_api_v1beta1_PatchSelectorMatchMachineDeploymentClass(ref), "sigs.k8s.io/cluster-api/api/v1beta1.PatchSelectorMatchMachinePoolClass": schema_sigsk8sio_cluster_api_api_v1beta1_PatchSelectorMatchMachinePoolClass(ref), + "sigs.k8s.io/cluster-api/api/v1beta1.RemediationStrategy": schema_sigsk8sio_cluster_api_api_v1beta1_RemediationStrategy(ref), "sigs.k8s.io/cluster-api/api/v1beta1.Topology": schema_sigsk8sio_cluster_api_api_v1beta1_Topology(ref), "sigs.k8s.io/cluster-api/api/v1beta1.UnhealthyCondition": schema_sigsk8sio_cluster_api_api_v1beta1_UnhealthyCondition(ref), "sigs.k8s.io/cluster-api/api/v1beta1.VariableSchema": schema_sigsk8sio_cluster_api_api_v1beta1_VariableSchema(ref), @@ -2026,11 +2027,17 @@ func schema_sigsk8sio_cluster_api_api_v1beta1_MachineDeploymentStrategy(ref comm Ref: ref("sigs.k8s.io/cluster-api/api/v1beta1.MachineRollingUpdateDeployment"), }, }, + "remediation": { + SchemaProps: spec.SchemaProps{ + Description: "Remediation controls the strategy of remediating unhealthy machines and how remediating operations should occur during the lifecycle of the dependant MachineSets.", + Ref: ref("sigs.k8s.io/cluster-api/api/v1beta1.RemediationStrategy"), + }, + }, }, }, }, Dependencies: []string{ - "sigs.k8s.io/cluster-api/api/v1beta1.MachineRollingUpdateDeployment"}, + "sigs.k8s.io/cluster-api/api/v1beta1.MachineRollingUpdateDeployment", "sigs.k8s.io/cluster-api/api/v1beta1.RemediationStrategy"}, } } @@ -3561,6 +3568,27 @@ func schema_sigsk8sio_cluster_api_api_v1beta1_PatchSelectorMatchMachinePoolClass } } +func schema_sigsk8sio_cluster_api_api_v1beta1_RemediationStrategy(ref common.ReferenceCallback) common.OpenAPIDefinition { + return common.OpenAPIDefinition{ + Schema: spec.Schema{ + SchemaProps: spec.SchemaProps{ + Description: "RemediationStrategy allows to define how the MachineSet can control scaling operations.", + Type: []string{"object"}, + Properties: map[string]spec.Schema{ + "maxInFlight": { + SchemaProps: spec.SchemaProps{ + Description: "MaxInFlight determines how many in flight remediations should happen at the same time.\n\nRemediation only happens on the MachineSet with the most current revision, while older MachineSets (usually present during rollout operations) aren't allowed to remediate.\n\nNote: In general (independent of remediations), unhealthy machines are always prioritized during scale down operations over healthy ones.\n\nMaxInFlight can be set to a fixed number or a percentage. Example: when this is set to 20%, the MachineSet controller deletes at most 20% of the desired replicas.\n\nIf not set, remediation is limited to all machines (bounded by replicas) under the active MachineSet's management.", + Ref: ref("k8s.io/apimachinery/pkg/util/intstr.IntOrString"), + }, + }, + }, + }, + }, + Dependencies: []string{ + "k8s.io/apimachinery/pkg/util/intstr.IntOrString"}, + } +} + func schema_sigsk8sio_cluster_api_api_v1beta1_Topology(ref common.ReferenceCallback) common.OpenAPIDefinition { return common.OpenAPIDefinition{ Schema: spec.Schema{ diff --git a/config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml b/config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml index 698766e5a909..45c8cb9e3bd7 100644 --- a/config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml +++ b/config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml @@ -1363,6 +1363,36 @@ spec: new ones. NOTE: This value can be overridden while defining a Cluster.Topology using this MachineDeploymentClass. properties: + remediation: + description: |- + Remediation controls the strategy of remediating unhealthy machines + and how remediating operations should occur during the lifecycle of the dependant MachineSets. + properties: + maxInFlight: + anyOf: + - type: integer + - type: string + description: |- + MaxInFlight determines how many in flight remediations should happen at the same time. + + + Remediation only happens on the MachineSet with the most current revision, while + older MachineSets (usually present during rollout operations) aren't allowed to remediate. + + + Note: In general (independent of remediations), unhealthy machines are always + prioritized during scale down operations over healthy ones. + + + MaxInFlight can be set to a fixed number or a percentage. + Example: when this is set to 20%, the MachineSet controller deletes at most 20% of + the desired replicas. + + + If not set, remediation is limited to all machines (bounded by replicas) + under the active MachineSet's management. + x-kubernetes-int-or-string: true + type: object rollingUpdate: description: |- Rolling update config params. Present only if diff --git a/config/crd/bases/cluster.x-k8s.io_clusters.yaml b/config/crd/bases/cluster.x-k8s.io_clusters.yaml index 02d46238cb30..0935c3f5ec15 100644 --- a/config/crd/bases/cluster.x-k8s.io_clusters.yaml +++ b/config/crd/bases/cluster.x-k8s.io_clusters.yaml @@ -1405,6 +1405,36 @@ spec: The deployment strategy to use to replace existing machines with new ones. properties: + remediation: + description: |- + Remediation controls the strategy of remediating unhealthy machines + and how remediating operations should occur during the lifecycle of the dependant MachineSets. + properties: + maxInFlight: + anyOf: + - type: integer + - type: string + description: |- + MaxInFlight determines how many in flight remediations should happen at the same time. + + + Remediation only happens on the MachineSet with the most current revision, while + older MachineSets (usually present during rollout operations) aren't allowed to remediate. + + + Note: In general (independent of remediations), unhealthy machines are always + prioritized during scale down operations over healthy ones. + + + MaxInFlight can be set to a fixed number or a percentage. + Example: when this is set to 20%, the MachineSet controller deletes at most 20% of + the desired replicas. + + + If not set, remediation is limited to all machines (bounded by replicas) + under the active MachineSet's management. + x-kubernetes-int-or-string: true + type: object rollingUpdate: description: |- Rolling update config params. Present only if diff --git a/config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml b/config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml index 0dd0672d2cf4..f67b374df74b 100644 --- a/config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml +++ b/config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml @@ -1245,6 +1245,36 @@ spec: The deployment strategy to use to replace existing machines with new ones. properties: + remediation: + description: |- + Remediation controls the strategy of remediating unhealthy machines + and how remediating operations should occur during the lifecycle of the dependant MachineSets. + properties: + maxInFlight: + anyOf: + - type: integer + - type: string + description: |- + MaxInFlight determines how many in flight remediations should happen at the same time. + + + Remediation only happens on the MachineSet with the most current revision, while + older MachineSets (usually present during rollout operations) aren't allowed to remediate. + + + Note: In general (independent of remediations), unhealthy machines are always + prioritized during scale down operations over healthy ones. + + + MaxInFlight can be set to a fixed number or a percentage. + Example: when this is set to 20%, the MachineSet controller deletes at most 20% of + the desired replicas. + + + If not set, remediation is limited to all machines (bounded by replicas) + under the active MachineSet's management. + x-kubernetes-int-or-string: true + type: object rollingUpdate: description: |- Rolling update config params. Present only if diff --git a/exp/topology/desiredstate/desired_state_test.go b/exp/topology/desiredstate/desired_state_test.go index 7ef777cdf1aa..dc0aecb69456 100644 --- a/exp/topology/desiredstate/desired_state_test.go +++ b/exp/topology/desiredstate/desired_state_test.go @@ -1329,6 +1329,9 @@ func TestComputeMachineDeployment(t *testing.T) { var clusterClassMinReadySeconds int32 = 20 clusterClassStrategy := clusterv1.MachineDeploymentStrategy{ Type: clusterv1.OnDeleteMachineDeploymentStrategyType, + Remediation: &clusterv1.RemediationStrategy{ + MaxInFlight: ptr.To(intstr.FromInt32(5)), + }, } md1 := builder.MachineDeploymentClass("linux-worker"). WithLabels(labels). @@ -1390,6 +1393,9 @@ func TestComputeMachineDeployment(t *testing.T) { var topologyMinReadySeconds int32 = 10 topologyStrategy := clusterv1.MachineDeploymentStrategy{ Type: clusterv1.RollingUpdateMachineDeploymentStrategyType, + Remediation: &clusterv1.RemediationStrategy{ + MaxInFlight: ptr.To(intstr.FromInt32(5)), + }, } mdTopology := clusterv1.MachineDeploymentTopology{ Metadata: clusterv1.ObjectMeta{ diff --git a/internal/apis/core/v1alpha3/conversion.go b/internal/apis/core/v1alpha3/conversion.go index 9352eb4de3af..405cf123f813 100644 --- a/internal/apis/core/v1alpha3/conversion.go +++ b/internal/apis/core/v1alpha3/conversion.go @@ -187,14 +187,17 @@ func (src *MachineDeployment) ConvertTo(dstRaw conversion.Hub) error { return err } - if restored.Spec.Strategy != nil && restored.Spec.Strategy.RollingUpdate != nil { + if restored.Spec.Strategy != nil { if dst.Spec.Strategy == nil { dst.Spec.Strategy = &clusterv1.MachineDeploymentStrategy{} } - if dst.Spec.Strategy.RollingUpdate == nil { - dst.Spec.Strategy.RollingUpdate = &clusterv1.MachineRollingUpdateDeployment{} + if restored.Spec.Strategy.RollingUpdate != nil { + if dst.Spec.Strategy.RollingUpdate == nil { + dst.Spec.Strategy.RollingUpdate = &clusterv1.MachineRollingUpdateDeployment{} + } + dst.Spec.Strategy.RollingUpdate.DeletePolicy = restored.Spec.Strategy.RollingUpdate.DeletePolicy } - dst.Spec.Strategy.RollingUpdate.DeletePolicy = restored.Spec.Strategy.RollingUpdate.DeletePolicy + dst.Spec.Strategy.Remediation = restored.Spec.Strategy.Remediation } dst.Spec.Template.Spec.NodeDeletionTimeout = restored.Spec.Template.Spec.NodeDeletionTimeout @@ -330,3 +333,11 @@ func Convert_v1alpha3_MachineStatus_To_v1beta1_MachineStatus(in *MachineStatus, // Status.version has been removed in v1beta1, thus requiring custom conversion function. the information will be dropped. return autoConvert_v1alpha3_MachineStatus_To_v1beta1_MachineStatus(in, out, s) } + +func Convert_v1beta1_MachineDeploymentStrategy_To_v1alpha3_MachineDeploymentStrategy(in *clusterv1.MachineDeploymentStrategy, out *MachineDeploymentStrategy, s apiconversion.Scope) error { + return autoConvert_v1beta1_MachineDeploymentStrategy_To_v1alpha3_MachineDeploymentStrategy(in, out, s) +} + +func Convert_v1beta1_MachineSetSpec_To_v1alpha3_MachineSetSpec(in *clusterv1.MachineSetSpec, out *MachineSetSpec, s apiconversion.Scope) error { + return autoConvert_v1beta1_MachineSetSpec_To_v1alpha3_MachineSetSpec(in, out, s) +} diff --git a/internal/apis/core/v1alpha3/zz_generated.conversion.go b/internal/apis/core/v1alpha3/zz_generated.conversion.go index 9aa52bb74333..1d8c7451e31c 100644 --- a/internal/apis/core/v1alpha3/zz_generated.conversion.go +++ b/internal/apis/core/v1alpha3/zz_generated.conversion.go @@ -170,11 +170,6 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } - if err := s.AddGeneratedConversionFunc((*v1beta1.MachineDeploymentStrategy)(nil), (*MachineDeploymentStrategy)(nil), func(a, b interface{}, scope conversion.Scope) error { - return Convert_v1beta1_MachineDeploymentStrategy_To_v1alpha3_MachineDeploymentStrategy(a.(*v1beta1.MachineDeploymentStrategy), b.(*MachineDeploymentStrategy), scope) - }); err != nil { - return err - } if err := s.AddGeneratedConversionFunc((*MachineHealthCheck)(nil), (*v1beta1.MachineHealthCheck)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1alpha3_MachineHealthCheck_To_v1beta1_MachineHealthCheck(a.(*MachineHealthCheck), b.(*v1beta1.MachineHealthCheck), scope) }); err != nil { @@ -250,11 +245,6 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } - if err := s.AddGeneratedConversionFunc((*v1beta1.MachineSetSpec)(nil), (*MachineSetSpec)(nil), func(a, b interface{}, scope conversion.Scope) error { - return Convert_v1beta1_MachineSetSpec_To_v1alpha3_MachineSetSpec(a.(*v1beta1.MachineSetSpec), b.(*MachineSetSpec), scope) - }); err != nil { - return err - } if err := s.AddGeneratedConversionFunc((*MachineSetStatus)(nil), (*v1beta1.MachineSetStatus)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1alpha3_MachineSetStatus_To_v1beta1_MachineSetStatus(a.(*MachineSetStatus), b.(*v1beta1.MachineSetStatus), scope) }); err != nil { @@ -335,6 +325,11 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddConversionFunc((*v1beta1.MachineDeploymentStrategy)(nil), (*MachineDeploymentStrategy)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1beta1_MachineDeploymentStrategy_To_v1alpha3_MachineDeploymentStrategy(a.(*v1beta1.MachineDeploymentStrategy), b.(*MachineDeploymentStrategy), scope) + }); err != nil { + return err + } if err := s.AddConversionFunc((*v1beta1.MachineHealthCheckSpec)(nil), (*MachineHealthCheckSpec)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1beta1_MachineHealthCheckSpec_To_v1alpha3_MachineHealthCheckSpec(a.(*v1beta1.MachineHealthCheckSpec), b.(*MachineHealthCheckSpec), scope) }); err != nil { @@ -345,6 +340,11 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddConversionFunc((*v1beta1.MachineSetSpec)(nil), (*MachineSetSpec)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1beta1_MachineSetSpec_To_v1alpha3_MachineSetSpec(a.(*v1beta1.MachineSetSpec), b.(*MachineSetSpec), scope) + }); err != nil { + return err + } if err := s.AddConversionFunc((*v1beta1.MachineSetStatus)(nil), (*MachineSetStatus)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1beta1_MachineSetStatus_To_v1alpha3_MachineSetStatus(a.(*v1beta1.MachineSetStatus), b.(*MachineSetStatus), scope) }); err != nil { @@ -853,14 +853,10 @@ func autoConvert_v1beta1_MachineDeploymentStrategy_To_v1alpha3_MachineDeployment } else { out.RollingUpdate = nil } + // WARNING: in.Remediation requires manual conversion: does not exist in peer-type return nil } -// Convert_v1beta1_MachineDeploymentStrategy_To_v1alpha3_MachineDeploymentStrategy is an autogenerated conversion function. -func Convert_v1beta1_MachineDeploymentStrategy_To_v1alpha3_MachineDeploymentStrategy(in *v1beta1.MachineDeploymentStrategy, out *MachineDeploymentStrategy, s conversion.Scope) error { - return autoConvert_v1beta1_MachineDeploymentStrategy_To_v1alpha3_MachineDeploymentStrategy(in, out, s) -} - func autoConvert_v1alpha3_MachineHealthCheck_To_v1beta1_MachineHealthCheck(in *MachineHealthCheck, out *v1beta1.MachineHealthCheck, s conversion.Scope) error { out.ObjectMeta = in.ObjectMeta if err := Convert_v1alpha3_MachineHealthCheckSpec_To_v1beta1_MachineHealthCheckSpec(&in.Spec, &out.Spec, s); err != nil { @@ -1154,11 +1150,6 @@ func autoConvert_v1beta1_MachineSetSpec_To_v1alpha3_MachineSetSpec(in *v1beta1.M return nil } -// Convert_v1beta1_MachineSetSpec_To_v1alpha3_MachineSetSpec is an autogenerated conversion function. -func Convert_v1beta1_MachineSetSpec_To_v1alpha3_MachineSetSpec(in *v1beta1.MachineSetSpec, out *MachineSetSpec, s conversion.Scope) error { - return autoConvert_v1beta1_MachineSetSpec_To_v1alpha3_MachineSetSpec(in, out, s) -} - func autoConvert_v1alpha3_MachineSetStatus_To_v1beta1_MachineSetStatus(in *MachineSetStatus, out *v1beta1.MachineSetStatus, s conversion.Scope) error { out.Selector = in.Selector out.Replicas = in.Replicas diff --git a/internal/apis/core/v1alpha4/conversion.go b/internal/apis/core/v1alpha4/conversion.go index 3b58a00e7ad5..ba4469e52445 100644 --- a/internal/apis/core/v1alpha4/conversion.go +++ b/internal/apis/core/v1alpha4/conversion.go @@ -277,6 +277,13 @@ func (src *MachineDeployment) ConvertTo(dstRaw conversion.Hub) error { dst.Spec.Template.Spec.NodeDeletionTimeout = restored.Spec.Template.Spec.NodeDeletionTimeout dst.Spec.Template.Spec.NodeVolumeDetachTimeout = restored.Spec.Template.Spec.NodeVolumeDetachTimeout dst.Spec.RolloutAfter = restored.Spec.RolloutAfter + + if restored.Spec.Strategy != nil { + if dst.Spec.Strategy == nil { + dst.Spec.Strategy = &clusterv1.MachineDeploymentStrategy{} + } + dst.Spec.Strategy.Remediation = restored.Spec.Strategy.Remediation + } return nil } @@ -391,3 +398,11 @@ func Convert_v1beta1_WorkersTopology_To_v1alpha4_WorkersTopology(in *clusterv1.W // WorkersTopology.MachinePools has been added in v1beta1. return autoConvert_v1beta1_WorkersTopology_To_v1alpha4_WorkersTopology(in, out, s) } + +func Convert_v1beta1_MachineDeploymentStrategy_To_v1alpha4_MachineDeploymentStrategy(in *clusterv1.MachineDeploymentStrategy, out *MachineDeploymentStrategy, s apiconversion.Scope) error { + return autoConvert_v1beta1_MachineDeploymentStrategy_To_v1alpha4_MachineDeploymentStrategy(in, out, s) +} + +func Convert_v1beta1_MachineSetSpec_To_v1alpha4_MachineSetSpec(in *clusterv1.MachineSetSpec, out *MachineSetSpec, s apiconversion.Scope) error { + return autoConvert_v1beta1_MachineSetSpec_To_v1alpha4_MachineSetSpec(in, out, s) +} diff --git a/internal/apis/core/v1alpha4/zz_generated.conversion.go b/internal/apis/core/v1alpha4/zz_generated.conversion.go index e5d5473e2d2c..e8f640274d79 100644 --- a/internal/apis/core/v1alpha4/zz_generated.conversion.go +++ b/internal/apis/core/v1alpha4/zz_generated.conversion.go @@ -245,11 +245,6 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } - if err := s.AddGeneratedConversionFunc((*v1beta1.MachineDeploymentStrategy)(nil), (*MachineDeploymentStrategy)(nil), func(a, b interface{}, scope conversion.Scope) error { - return Convert_v1beta1_MachineDeploymentStrategy_To_v1alpha4_MachineDeploymentStrategy(a.(*v1beta1.MachineDeploymentStrategy), b.(*MachineDeploymentStrategy), scope) - }); err != nil { - return err - } if err := s.AddGeneratedConversionFunc((*MachineDeploymentTopology)(nil), (*v1beta1.MachineDeploymentTopology)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1alpha4_MachineDeploymentTopology_To_v1beta1_MachineDeploymentTopology(a.(*MachineDeploymentTopology), b.(*v1beta1.MachineDeploymentTopology), scope) }); err != nil { @@ -340,11 +335,6 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } - if err := s.AddGeneratedConversionFunc((*v1beta1.MachineSetSpec)(nil), (*MachineSetSpec)(nil), func(a, b interface{}, scope conversion.Scope) error { - return Convert_v1beta1_MachineSetSpec_To_v1alpha4_MachineSetSpec(a.(*v1beta1.MachineSetSpec), b.(*MachineSetSpec), scope) - }); err != nil { - return err - } if err := s.AddGeneratedConversionFunc((*MachineSetStatus)(nil), (*v1beta1.MachineSetStatus)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1alpha4_MachineSetStatus_To_v1beta1_MachineSetStatus(a.(*MachineSetStatus), b.(*v1beta1.MachineSetStatus), scope) }); err != nil { @@ -450,11 +440,21 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddConversionFunc((*v1beta1.MachineDeploymentStrategy)(nil), (*MachineDeploymentStrategy)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1beta1_MachineDeploymentStrategy_To_v1alpha4_MachineDeploymentStrategy(a.(*v1beta1.MachineDeploymentStrategy), b.(*MachineDeploymentStrategy), scope) + }); err != nil { + return err + } if err := s.AddConversionFunc((*v1beta1.MachineDeploymentTopology)(nil), (*MachineDeploymentTopology)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1beta1_MachineDeploymentTopology_To_v1alpha4_MachineDeploymentTopology(a.(*v1beta1.MachineDeploymentTopology), b.(*MachineDeploymentTopology), scope) }); err != nil { return err } + if err := s.AddConversionFunc((*v1beta1.MachineSetSpec)(nil), (*MachineSetSpec)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1beta1_MachineSetSpec_To_v1alpha4_MachineSetSpec(a.(*v1beta1.MachineSetSpec), b.(*MachineSetSpec), scope) + }); err != nil { + return err + } if err := s.AddConversionFunc((*v1beta1.MachineSpec)(nil), (*MachineSpec)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1beta1_MachineSpec_To_v1alpha4_MachineSpec(a.(*v1beta1.MachineSpec), b.(*MachineSpec), scope) }); err != nil { @@ -1138,7 +1138,15 @@ func autoConvert_v1alpha4_MachineDeploymentSpec_To_v1beta1_MachineDeploymentSpec if err := Convert_v1alpha4_MachineTemplateSpec_To_v1beta1_MachineTemplateSpec(&in.Template, &out.Template, s); err != nil { return err } - out.Strategy = (*v1beta1.MachineDeploymentStrategy)(unsafe.Pointer(in.Strategy)) + if in.Strategy != nil { + in, out := &in.Strategy, &out.Strategy + *out = new(v1beta1.MachineDeploymentStrategy) + if err := Convert_v1alpha4_MachineDeploymentStrategy_To_v1beta1_MachineDeploymentStrategy(*in, *out, s); err != nil { + return err + } + } else { + out.Strategy = nil + } out.MinReadySeconds = (*int32)(unsafe.Pointer(in.MinReadySeconds)) out.RevisionHistoryLimit = (*int32)(unsafe.Pointer(in.RevisionHistoryLimit)) out.Paused = in.Paused @@ -1159,7 +1167,15 @@ func autoConvert_v1beta1_MachineDeploymentSpec_To_v1alpha4_MachineDeploymentSpec if err := Convert_v1beta1_MachineTemplateSpec_To_v1alpha4_MachineTemplateSpec(&in.Template, &out.Template, s); err != nil { return err } - out.Strategy = (*MachineDeploymentStrategy)(unsafe.Pointer(in.Strategy)) + if in.Strategy != nil { + in, out := &in.Strategy, &out.Strategy + *out = new(MachineDeploymentStrategy) + if err := Convert_v1beta1_MachineDeploymentStrategy_To_v1alpha4_MachineDeploymentStrategy(*in, *out, s); err != nil { + return err + } + } else { + out.Strategy = nil + } out.MinReadySeconds = (*int32)(unsafe.Pointer(in.MinReadySeconds)) out.RevisionHistoryLimit = (*int32)(unsafe.Pointer(in.RevisionHistoryLimit)) out.Paused = in.Paused @@ -1217,14 +1233,10 @@ func Convert_v1alpha4_MachineDeploymentStrategy_To_v1beta1_MachineDeploymentStra func autoConvert_v1beta1_MachineDeploymentStrategy_To_v1alpha4_MachineDeploymentStrategy(in *v1beta1.MachineDeploymentStrategy, out *MachineDeploymentStrategy, s conversion.Scope) error { out.Type = MachineDeploymentStrategyType(in.Type) out.RollingUpdate = (*MachineRollingUpdateDeployment)(unsafe.Pointer(in.RollingUpdate)) + // WARNING: in.Remediation requires manual conversion: does not exist in peer-type return nil } -// Convert_v1beta1_MachineDeploymentStrategy_To_v1alpha4_MachineDeploymentStrategy is an autogenerated conversion function. -func Convert_v1beta1_MachineDeploymentStrategy_To_v1alpha4_MachineDeploymentStrategy(in *v1beta1.MachineDeploymentStrategy, out *MachineDeploymentStrategy, s conversion.Scope) error { - return autoConvert_v1beta1_MachineDeploymentStrategy_To_v1alpha4_MachineDeploymentStrategy(in, out, s) -} - func autoConvert_v1alpha4_MachineDeploymentTopology_To_v1beta1_MachineDeploymentTopology(in *MachineDeploymentTopology, out *v1beta1.MachineDeploymentTopology, s conversion.Scope) error { if err := Convert_v1alpha4_ObjectMeta_To_v1beta1_ObjectMeta(&in.Metadata, &out.Metadata, s); err != nil { return err @@ -1543,11 +1555,6 @@ func autoConvert_v1beta1_MachineSetSpec_To_v1alpha4_MachineSetSpec(in *v1beta1.M return nil } -// Convert_v1beta1_MachineSetSpec_To_v1alpha4_MachineSetSpec is an autogenerated conversion function. -func Convert_v1beta1_MachineSetSpec_To_v1alpha4_MachineSetSpec(in *v1beta1.MachineSetSpec, out *MachineSetSpec, s conversion.Scope) error { - return autoConvert_v1beta1_MachineSetSpec_To_v1alpha4_MachineSetSpec(in, out, s) -} - func autoConvert_v1alpha4_MachineSetStatus_To_v1beta1_MachineSetStatus(in *MachineSetStatus, out *v1beta1.MachineSetStatus, s conversion.Scope) error { out.Selector = in.Selector out.Replicas = in.Replicas diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index 85254155bb8a..f976a46bbdda 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -19,6 +19,7 @@ package machineset import ( "context" "fmt" + "sort" "strings" "time" @@ -29,11 +30,13 @@ import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/labels" kerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apiserver/pkg/storage/names" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -687,6 +690,19 @@ func (r *Reconciler) updateExternalObject(ctx context.Context, obj client.Object return nil } +func (r *Reconciler) getOwnerMachineDeployment(ctx context.Context, machineSet *clusterv1.MachineSet) (*clusterv1.MachineDeployment, error) { + mdName := machineSet.Labels[clusterv1.MachineDeploymentNameLabel] + if mdName == "" { + return nil, fmt.Errorf("no owner MachineDeployment found for MachineSet %s", klog.KObj(machineSet)) + } + + md := &clusterv1.MachineDeployment{} + if err := r.Client.Get(ctx, client.ObjectKey{Namespace: machineSet.Namespace, Name: mdName}, md); err != nil { + return nil, fmt.Errorf("failed to retrieve owner MachineDeployment for MachineSet %s: %w", klog.KObj(machineSet), err) + } + return md, nil +} + // machineLabelsFromMachineSet computes the labels the Machine created from this MachineSet should have. func machineLabelsFromMachineSet(machineSet *clusterv1.MachineSet) map[string]string { machineLabels := map[string]string{} @@ -842,6 +858,12 @@ func (r *Reconciler) getMachineSetsForMachine(ctx context.Context, m *clusterv1. return mss, nil } +// isDeploymentChild returns true if the MachineSet originated from a MachineDeployment by checking its labels. +func (r *Reconciler) isDeploymentChild(ms *clusterv1.MachineSet) bool { + _, ok := ms.Labels[clusterv1.MachineDeploymentNameLabel] + return ok +} + // shouldAdopt returns true if the MachineSet should be adopted as a stand-alone MachineSet directly owned by the Cluster. func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool { // if the MachineSet is controlled by a MachineDeployment, or if it is a stand-alone MachinesSet directly owned by the Cluster, then no-op. @@ -852,10 +874,7 @@ func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool { // If the MachineSet is originated by a MachineDeployment object, it should not be adopted directly by the Cluster as a stand-alone MachineSet. // Note: this is required because after restore from a backup both the MachineSet controller and the // MachineDeployment controller are racing to adopt MachineSets, see https://github.com/kubernetes-sigs/cluster-api/issues/7529 - if _, ok := ms.Labels[clusterv1.MachineDeploymentNameLabel]; ok { - return false - } - return true + return !r.isDeploymentChild(ms) } // updateStatus updates the Status field for the MachineSet @@ -978,12 +997,47 @@ func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Clus func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) (ctrl.Result, error) { log := ctrl.LoggerFrom(ctx) + + // Calculate how many in flight machines we should remediate. + // By default, we allow all machines to be remediated at the same time. + maxInFlight := len(filteredMachines) + + // If the MachineSet is part of a MachineDeployment, only allow remediations if + // it's the desired revision. + if r.isDeploymentChild(ms) { + owner, err := r.getOwnerMachineDeployment(ctx, ms) + if err != nil { + return ctrl.Result{}, err + } + + if owner.Annotations[clusterv1.RevisionAnnotation] != ms.Annotations[clusterv1.RevisionAnnotation] { + // MachineSet is part of a MachineDeployment but isn't the current revision, no remediations allowed. + return ctrl.Result{}, nil + } + + if owner.Spec.Strategy != nil && owner.Spec.Strategy.Remediation != nil { + if owner.Spec.Strategy.Remediation.MaxInFlight != nil { + var err error + replicas := int(ptr.Deref(owner.Spec.Replicas, 1)) + maxInFlight, err = intstr.GetScaledValueFromIntOrPercent(owner.Spec.Strategy.Remediation.MaxInFlight, replicas, true) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to calculate maxInFlight to remediate machines: %v", err) + } + log = log.WithValues("maxInFlight", maxInFlight, "replicas", replicas) + } + } + } + // List all unhealthy machines. machinesToRemediate := make([]*clusterv1.Machine, 0, len(filteredMachines)) for _, m := range filteredMachines { // filteredMachines contains machines in deleting status to calculate correct status. // skip remediation for those in deleting status. if !m.DeletionTimestamp.IsZero() { + if conditions.IsTrue(m, clusterv1.MachineOwnerRemediatedCondition) { + // Machine has been remediated by this controller and still in flight. + maxInFlight-- + } continue } if conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) { @@ -995,7 +1049,28 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *cl if len(machinesToRemediate) == 0 { return ctrl.Result{}, nil } + // Check if we can remediate any machines. + if maxInFlight <= 0 { + // No tokens available to remediate machines. + log.V(3).Info("Remediation strategy is set, and maximum in flight has been reached", "machinesToBeRemediated", len(machinesToRemediate)) + return ctrl.Result{}, nil + } + + // Sort the machines from newest to oldest. + // We are trying to remediate machines failing to come up first because + // there is a chance that they are not hosting any workloads (minimize disruption). + sort.SliceStable(machinesToRemediate, func(i, j int) bool { + return machinesToRemediate[i].CreationTimestamp.After(machinesToRemediate[j].CreationTimestamp.Time) + }) + + // Check if we should limit the in flight operations. + if len(machinesToRemediate) > maxInFlight { + log.V(5).Info("Remediation strategy is set, limiting in flight operations", "machinesToBeRemediated", len(machinesToRemediate)) + // We have more machines to remediate than tokens available. + machinesToRemediate = machinesToRemediate[:maxInFlight] + } + // Run preflight checks. preflightChecksResult, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Machine Remediation") if err != nil { // If err is not nil use that as the preflightCheckErrMessage @@ -1029,9 +1104,9 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *cl // Remediate unhealthy machines by deleting them. var errs []error for _, m := range machinesToRemediate { - log.Info(fmt.Sprintf("Deleting Machine %s because it was marked as unhealthy by the MachineHealthCheck controller", klog.KObj(m))) + log.Info("Deleting unhealthy Machine", "Machine", klog.KObj(m)) patch := client.MergeFrom(m.DeepCopy()) - if err := r.Client.Delete(ctx, m); err != nil { + if err := r.Client.Delete(ctx, m); err != nil && !apierrors.IsNotFound(err) { errs = append(errs, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(m))) continue } diff --git a/internal/controllers/machineset/machineset_controller_test.go b/internal/controllers/machineset/machineset_controller_test.go index 9e11d8ec3e2d..0f0e80cead69 100644 --- a/internal/controllers/machineset/machineset_controller_test.go +++ b/internal/controllers/machineset/machineset_controller_test.go @@ -17,6 +17,7 @@ limitations under the License. package machineset import ( + "fmt" "testing" "time" @@ -25,11 +26,13 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/client-go/tools/record" utilfeature "k8s.io/component-base/featuregate/testing" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/reconcile" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -411,6 +414,19 @@ func TestMachineSetOwnerReference(t *testing.T) { ObjectMeta: metav1.ObjectMeta{Namespace: metav1.NamespaceDefault, Name: testClusterName}, } + validMD := &clusterv1.MachineDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "valid-machinedeployment", + Namespace: metav1.NamespaceDefault, + Labels: map[string]string{ + clusterv1.ClusterNameLabel: testCluster.Name, + }, + }, + Spec: clusterv1.MachineDeploymentSpec{ + ClusterName: testCluster.Name, + }, + } + ms1 := newMachineSet("machineset1", "valid-cluster", int32(0)) ms2 := newMachineSet("machineset2", "invalid-cluster", int32(0)) ms3 := newMachineSet("machineset3", "valid-cluster", int32(0)) @@ -469,6 +485,7 @@ func TestMachineSetOwnerReference(t *testing.T) { ms1, ms2, ms3, + validMD, ).WithStatusSubresource(&clusterv1.MachineSet{}).Build() msr := &Reconciler{ Client: c, @@ -1471,6 +1488,348 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { g.Expect(conditions.Has(m, condition)). To(BeFalse(), "Machine should not have the %s condition set", condition) }) + + t.Run("should only try to remediate MachineOwnerRemediated if MachineSet is current", func(t *testing.T) { + g := NewWithT(t) + + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{}, + } + + machineDeployment := &clusterv1.MachineDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-machinedeployment", + Namespace: "default", + Annotations: map[string]string{ + clusterv1.RevisionAnnotation: "10", + }, + }, + Spec: clusterv1.MachineDeploymentSpec{ + ClusterName: "test-cluster", + }, + } + + machineSetOld := &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-machinedeployment-old", + Namespace: "default", + Labels: map[string]string{ + clusterv1.MachineDeploymentNameLabel: "test-machinedeployment", + }, + Annotations: map[string]string{ + clusterv1.RevisionAnnotation: "7", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "MachineDeployment", + Name: "test-machinedeployment", + }, + }, + }, + Spec: clusterv1.MachineSetSpec{ + ClusterName: "test-cluster", + }, + } + + machineSetCurrent := machineSetOld.DeepCopy() + machineSetCurrent.Name = "test-machinedeployment-current" + machineSetCurrent.Annotations[clusterv1.RevisionAnnotation] = "10" + + unhealthyMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "unhealthy-machine", + Namespace: "default", + }, + Status: clusterv1.MachineStatus{ + Conditions: []clusterv1.Condition{ + { + Type: clusterv1.MachineOwnerRemediatedCondition, + Status: corev1.ConditionFalse, + }, + }, + }, + } + healthyMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "healthy-machine", + Namespace: "default", + }, + } + + machines := []*clusterv1.Machine{unhealthyMachine, healthyMachine} + fakeClient := fake.NewClientBuilder().WithObjects( + machineDeployment, + machineSetOld, + machineSetCurrent, + unhealthyMachine, + healthyMachine, + ).WithStatusSubresource(&clusterv1.Machine{}, &clusterv1.MachineSet{}, &clusterv1.MachineDeployment{}).Build() + r := &Reconciler{ + Client: fakeClient, + } + + // Test first with the old MachineSet. + _, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSetOld, machines) + g.Expect(err).ToNot(HaveOccurred()) + + condition := clusterv1.MachineOwnerRemediatedCondition + m := &clusterv1.Machine{} + + // Verify that no action was taken on the Machine: MachineOwnerRemediated should be false + // and the Machine wasn't deleted. + g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m)).To(Succeed()) + g.Expect(conditions.Has(m, condition)). + To(BeTrue(), "Machine should have the %s condition set", condition) + machineOwnerRemediatedCondition := conditions.Get(m, condition) + g.Expect(machineOwnerRemediatedCondition.Status). + To(Equal(corev1.ConditionFalse), "%s condition status should be false", condition) + g.Expect(unhealthyMachine.DeletionTimestamp).Should(BeZero()) + + // Verify the healthy machine continues to not have the MachineOwnerRemediated condition. + m = &clusterv1.Machine{} + g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) + g.Expect(conditions.Has(m, condition)). + To(BeFalse(), "Machine should not have the %s condition set", condition) + + // Test with the current MachineSet. + _, err = r.reconcileUnhealthyMachines(ctx, cluster, machineSetCurrent, machines) + g.Expect(err).ToNot(HaveOccurred()) + + // Verify the unhealthy machine has been deleted. + err = r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m) + g.Expect(apierrors.IsNotFound(err)).To(BeTrue()) + + // Verify (again) the healthy machine continues to not have the MachineOwnerRemediated condition. + m = &clusterv1.Machine{} + g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) + g.Expect(conditions.Has(m, condition)). + To(BeFalse(), "Machine should not have the %s condition set", condition) + }) + + t.Run("should only try to remediate up to MaxInFlight unhealthy", func(t *testing.T) { + g := NewWithT(t) + + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{}, + } + + maxInFlight := 3 + machineDeployment := &clusterv1.MachineDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-machinedeployment", + Namespace: "default", + Annotations: map[string]string{ + clusterv1.RevisionAnnotation: "10", + }, + }, + Spec: clusterv1.MachineDeploymentSpec{ + ClusterName: "test-cluster", + Strategy: &clusterv1.MachineDeploymentStrategy{ + Remediation: &clusterv1.RemediationStrategy{ + MaxInFlight: ptr.To(intstr.FromInt32(int32(maxInFlight))), + }, + }, + }, + } + + machineSet := &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-machinedeployment-old", + Namespace: "default", + Labels: map[string]string{ + clusterv1.MachineDeploymentNameLabel: "test-machinedeployment", + }, + Annotations: map[string]string{ + clusterv1.RevisionAnnotation: "10", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "MachineDeployment", + Name: "test-machinedeployment", + }, + }, + }, + Spec: clusterv1.MachineSetSpec{ + ClusterName: "test-cluster", + }, + } + + unhealthyMachines := []*clusterv1.Machine{} + total := 8 + for i := range total { + unhealthyMachines = append(unhealthyMachines, &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("unhealthy-machine-%d", i), + Namespace: "default", + CreationTimestamp: metav1.Time{Time: metav1.Now().Add(time.Duration(i) * time.Second)}, + }, + Status: clusterv1.MachineStatus{ + Conditions: []clusterv1.Condition{ + { + Type: clusterv1.MachineOwnerRemediatedCondition, + Status: corev1.ConditionFalse, + }, + }, + }, + }) + } + + healthyMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "healthy-machine", + Namespace: "default", + }, + } + + fakeClient := fake.NewClientBuilder().WithObjects(cluster, machineDeployment, healthyMachine). + WithStatusSubresource(&clusterv1.Machine{}, &clusterv1.MachineSet{}, &clusterv1.MachineDeployment{}) + // Create the unhealthy machines. + for _, machine := range unhealthyMachines { + fakeClient.WithObjects(machine) + } + r := &Reconciler{ + Client: fakeClient.Build(), + } + + // + // First pass. + // + _, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, append(unhealthyMachines, healthyMachine)) + g.Expect(err).ToNot(HaveOccurred()) + + condition := clusterv1.MachineOwnerRemediatedCondition + + // Iterate over the unhealthy machines and verify that the last maxInFlight were deleted. + for i := range unhealthyMachines { + m := unhealthyMachines[i] + + err = r.Client.Get(ctx, client.ObjectKeyFromObject(m), m) + if i < total-maxInFlight { + // Machines before the maxInFlight should not be deleted. + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(conditions.Has(m, condition)). + To(BeTrue(), "Machine should have the %s condition set", condition) + machineOwnerRemediatedCondition := conditions.Get(m, condition) + g.Expect(machineOwnerRemediatedCondition.Status). + To(Equal(corev1.ConditionFalse), "%s condition status should be false", condition) + } else { + // Machines after maxInFlight, should be deleted. + g.Expect(apierrors.IsNotFound(err)).To(BeTrue(), "expected machine %d to be deleted", i) + } + } + + // Verify the healthy machine continues to not have the MachineOwnerRemediated condition. + m := &clusterv1.Machine{} + g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) + g.Expect(conditions.Has(m, condition)). + To(BeFalse(), "Machine should not have the %s condition set", condition) + + // + // Second pass. + // + // Set a finalizer on the next set of machines that should be remediated. + for i := maxInFlight - 1; i < total-maxInFlight; i++ { + m := unhealthyMachines[i] + m.Finalizers = append(m.Finalizers, "test") + g.Expect(r.Client.Update(ctx, m)).To(Succeed()) + } + + // Perform the second pass. + var allMachines = func() (res []*clusterv1.Machine) { + var machineList clusterv1.MachineList + g.Expect(r.Client.List(ctx, &machineList)).To(Succeed()) + for i := range machineList.Items { + m := &machineList.Items[i] + res = append(res, m) + } + return + } + + _, err = r.reconcileUnhealthyMachines(ctx, cluster, machineSet, allMachines()) + g.Expect(err).ToNot(HaveOccurred()) + + var validateSecondPass = func(cleanFinalizer bool) { + t.Helper() + for i := range unhealthyMachines { + m := unhealthyMachines[i] + + err = r.Client.Get(ctx, client.ObjectKeyFromObject(m), m) + if i < total-(maxInFlight*2) { + // Machines before the maxInFlight*2 should not be deleted, and should have the remediated condition to false. + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(conditions.Has(m, condition)). + To(BeTrue(), "Machine should have the %s condition set", condition) + machineOwnerRemediatedCondition := conditions.Get(m, condition) + g.Expect(machineOwnerRemediatedCondition.Status). + To(Equal(corev1.ConditionFalse), "%s condition status should be false", condition) + } else if i < total-maxInFlight { + // Machines before the maxInFlight should have a deletion timestamp + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(conditions.Has(m, condition)). + To(BeTrue(), "Machine should have the %s condition set", condition) + machineOwnerRemediatedCondition := conditions.Get(m, condition) + g.Expect(machineOwnerRemediatedCondition.Status). + To(Equal(corev1.ConditionTrue), "%s condition status should be true", condition) + g.Expect(m.DeletionTimestamp).ToNot(BeZero()) + + if cleanFinalizer { + g.Expect(controllerutil.RemoveFinalizer(m, "test")).To(BeTrue()) + g.Expect(r.Client.Update(ctx, m)).To(Succeed()) + } + } else { + // Machines after maxInFlight, should be deleted. + g.Expect(apierrors.IsNotFound(err)).To(BeTrue(), "expected machine %d to be deleted", i) + } + } + } + validateSecondPass(false) + + // Verify (again) the healthy machine continues to not have the MachineOwnerRemediated condition. + g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) + g.Expect(conditions.Has(m, condition)). + To(BeFalse(), "Machine should not have the %s condition set", condition) + + // Perform another pass with the same exact configuration. + // This is testing that, given that we have Machines that are being deleted and are in flight, + // we have reached the maximum amount of tokens we have and we should wait to remediate the rest. + _, err = r.reconcileUnhealthyMachines(ctx, cluster, machineSet, allMachines()) + g.Expect(err).ToNot(HaveOccurred()) + + // Validate and remove finalizers for in flight machines. + validateSecondPass(true) + + // Verify (again) the healthy machine continues to not have the MachineOwnerRemediated condition. + g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) + g.Expect(conditions.Has(m, condition)). + To(BeFalse(), "Machine should not have the %s condition set", condition) + + // Call again to verify that the remaining unhealthy machines are deleted, + // at this point all unhealthy machines should be deleted given the max in flight + // is greater than the number of unhealthy machines. + _, err = r.reconcileUnhealthyMachines(ctx, cluster, machineSet, allMachines()) + g.Expect(err).ToNot(HaveOccurred()) + + // Iterate over the unhealthy machines and verify that all were deleted. + for i, m := range unhealthyMachines { + err = r.Client.Get(ctx, client.ObjectKeyFromObject(m), m) + g.Expect(apierrors.IsNotFound(err)).To(BeTrue(), "expected machine %d to be deleted: %v", i) + } + + // Verify (again) the healthy machine continues to not have the MachineOwnerRemediated condition. + g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed()) + g.Expect(conditions.Has(m, condition)). + To(BeFalse(), "Machine should not have the %s condition set", condition) + }) } func TestMachineSetReconciler_syncReplicas(t *testing.T) { diff --git a/internal/webhooks/machinedeployment.go b/internal/webhooks/machinedeployment.go index 843f6d0f6d12..7f1441bc74a4 100644 --- a/internal/webhooks/machinedeployment.go +++ b/internal/webhooks/machinedeployment.go @@ -268,6 +268,23 @@ func (webhook *MachineDeployment) validate(oldMD, newMD *clusterv1.MachineDeploy } } + if newMD.Spec.Strategy != nil && newMD.Spec.Strategy.Remediation != nil { + total := 1 + if newMD.Spec.Replicas != nil { + total = int(*newMD.Spec.Replicas) + } + + if newMD.Spec.Strategy.Remediation.MaxInFlight != nil { + if _, err := intstr.GetScaledValueFromIntOrPercent(newMD.Spec.Strategy.Remediation.MaxInFlight, total, true); err != nil { + allErrs = append( + allErrs, + field.Invalid(specPath.Child("strategy", "remediation", "maxInFlight"), + newMD.Spec.Strategy.Remediation.MaxInFlight.String(), fmt.Sprintf("must be either an int or a percentage: %v", err.Error())), + ) + } + } + } + if newMD.Spec.Template.Spec.Version != nil { if !version.KubeSemver.MatchString(*newMD.Spec.Template.Spec.Version) { allErrs = append(allErrs, field.Invalid(specPath.Child("template", "spec", "version"), *newMD.Spec.Template.Spec.Version, "must be a valid semantic version")) diff --git a/internal/webhooks/machinedeployment_test.go b/internal/webhooks/machinedeployment_test.go index e563717d8676..d3def492651f 100644 --- a/internal/webhooks/machinedeployment_test.go +++ b/internal/webhooks/machinedeployment_test.go @@ -251,12 +251,15 @@ func TestCalculateMachineDeploymentReplicas(t *testing.T) { func TestMachineDeploymentValidation(t *testing.T) { badMaxSurge := intstr.FromString("1") badMaxUnavailable := intstr.FromString("0") + badMaxInFlight := intstr.FromString("1") goodMaxSurgePercentage := intstr.FromString("1%") goodMaxUnavailablePercentage := intstr.FromString("0%") + goodMaxInFlightPercentage := intstr.FromString("20%") goodMaxSurgeInt := intstr.FromInt(1) goodMaxUnavailableInt := intstr.FromInt(0) + goodMaxInFlightInt := intstr.FromInt(5) tests := []struct { name string md *clusterv1.MachineDeployment @@ -352,6 +355,39 @@ func TestMachineDeploymentValidation(t *testing.T) { }, expectErr: true, }, + { + name: "should return error for invalid remediation maxInFlight", + selectors: map[string]string{"foo": "bar"}, + labels: map[string]string{"foo": "bar"}, + strategy: clusterv1.MachineDeploymentStrategy{ + Remediation: &clusterv1.RemediationStrategy{ + MaxInFlight: &badMaxInFlight, + }, + }, + expectErr: true, + }, + { + name: "should not return error for valid percentage remediation maxInFlight", + selectors: map[string]string{"foo": "bar"}, + labels: map[string]string{"foo": "bar"}, + strategy: clusterv1.MachineDeploymentStrategy{ + Remediation: &clusterv1.RemediationStrategy{ + MaxInFlight: &goodMaxInFlightPercentage, + }, + }, + expectErr: false, + }, + { + name: "should not return error for valid int remediation maxInFlight", + selectors: map[string]string{"foo": "bar"}, + labels: map[string]string{"foo": "bar"}, + strategy: clusterv1.MachineDeploymentStrategy{ + Remediation: &clusterv1.RemediationStrategy{ + MaxInFlight: &goodMaxInFlightInt, + }, + }, + expectErr: false, + }, { name: "should not return error for valid int maxSurge and maxUnavailable", selectors: map[string]string{"foo": "bar"}, diff --git a/test/e2e/clusterclass_rollout.go b/test/e2e/clusterclass_rollout.go index d0bd47a2c773..e9ca19ea4581 100644 --- a/test/e2e/clusterclass_rollout.go +++ b/test/e2e/clusterclass_rollout.go @@ -187,6 +187,9 @@ func ClusterClassRolloutSpec(ctx context.Context, inputGetter func() ClusterClas MaxSurge: &intstr.IntOrString{Type: intstr.Int, IntVal: 5 + rand.Int31n(20)}, //nolint:gosec DeletePolicy: ptr.To(string(clusterv1.NewestMachineSetDeletePolicy)), }, + Remediation: &clusterv1.RemediationStrategy{ + MaxInFlight: &intstr.IntOrString{Type: intstr.Int, IntVal: 2 + rand.Int31n(20)}, //nolint:gosec + }, } }, WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),