Skip to content

Commit

Permalink
Merge pull request #8464 from ykakarap/pr-mhc-upgrade-deadlock-fix
Browse files Browse the repository at this point in the history
🐛 avoid errors when MHC and upgrade occur together in classy clusters
  • Loading branch information
k8s-ci-robot authored Apr 25, 2023
2 parents a2c1f5d + 20032af commit 8ca8170
Show file tree
Hide file tree
Showing 9 changed files with 357 additions and 194 deletions.
5 changes: 5 additions & 0 deletions internal/controllers/topology/cluster/conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,11 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste
case s.UpgradeTracker.ControlPlane.IsScaling:
msgBuilder.WriteString(" Control plane is reconciling desired replicas")

case len(s.UpgradeTracker.MachineDeployments.UpgradingNames()) > 0:
fmt.Fprintf(msgBuilder, " MachineDeployment(s) %s are upgrading",
computeMachineDeploymentNameList(s.UpgradeTracker.MachineDeployments.UpgradingNames()),
)

case s.Current.MachineDeployments.IsAnyRollingOut():
fmt.Fprintf(msgBuilder, " MachineDeployment(s) %s are rolling out",
computeMachineDeploymentNameList(s.UpgradeTracker.MachineDeployments.RolloutNames()),
Expand Down
106 changes: 42 additions & 64 deletions internal/controllers/topology/cluster/conditions_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ import (
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
runtimehooksv1 "sigs.k8s.io/cluster-api/exp/runtime/hooks/api/v1alpha1"
Expand All @@ -32,11 +35,16 @@ import (
)

func TestReconcileTopologyReconciledCondition(t *testing.T) {
g := NewWithT(t)
scheme := runtime.NewScheme()
g.Expect(clusterv1.AddToScheme(scheme)).To(Succeed())

tests := []struct {
name string
reconcileErr error
s *scope.Scope
cluster *clusterv1.Cluster
machines []*clusterv1.Machine
wantConditionStatus corev1.ConditionStatus
wantConditionReason string
wantConditionMessage string
Expand Down Expand Up @@ -382,7 +390,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
wantConditionStatus: corev1.ConditionTrue,
},
{
name: "should set the condition to false is some machine deployments have not picked the new version because other machine deployments are rolling out (not all replicas ready)",
name: "should set the condition to false is some machine deployments have not picked the new version because other machine deployments are upgrading",
reconcileErr: nil,
cluster: &clusterv1.Cluster{},
s: &scope.Scope{
Expand All @@ -404,6 +412,11 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
Object: builder.MachineDeployment("ns1", "md0-abc123").
WithReplicas(2).
WithVersion("v1.22.0").
WithSelector(metav1.LabelSelector{
MatchLabels: map[string]string{
clusterv1.ClusterTopologyMachineDeploymentNameLabel: "md0",
},
}).
WithStatus(clusterv1.MachineDeploymentStatus{
// MD is not ready because we don't have 2 updated, ready and available replicas.
Replicas: int32(2),
Expand All @@ -418,67 +431,11 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
Object: builder.MachineDeployment("ns1", "md1-abc123").
WithReplicas(2).
WithVersion("v1.21.2").
WithStatus(clusterv1.MachineDeploymentStatus{
Replicas: int32(2),
UpdatedReplicas: int32(2),
ReadyReplicas: int32(2),
AvailableReplicas: int32(2),
UnavailableReplicas: int32(0),
WithSelector(metav1.LabelSelector{
MatchLabels: map[string]string{
clusterv1.ClusterTopologyMachineDeploymentNameLabel: "md1",
},
}).
Build(),
},
},
},
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.PendingUpgrade = false
ut.MachineDeployments.MarkRollingOut("md0-abc123")
ut.MachineDeployments.MarkPendingUpgrade("md1-abc123")
return ut
}(),
HookResponseTracker: scope.NewHookResponseTracker(),
},
wantConditionStatus: corev1.ConditionFalse,
wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsUpgradePendingReason,
wantConditionMessage: "MachineDeployment(s) md1-abc123 upgrade to version v1.22.0 on hold. MachineDeployment(s) md0-abc123 are rolling out",
},
{
name: "should set the condition to false if some machine deployments have not picked the new version because other machine deployments are rolling out (unavailable replica)",
reconcileErr: nil,
cluster: &clusterv1.Cluster{},
s: &scope.Scope{
Blueprint: &scope.ClusterBlueprint{
Topology: &clusterv1.Topology{
Version: "v1.22.0",
},
},
Current: &scope.ClusterState{
Cluster: &clusterv1.Cluster{},
ControlPlane: &scope.ControlPlaneState{
Object: builder.ControlPlane("ns1", "controlplane1").
WithVersion("v1.22.0").
WithReplicas(3).
Build(),
},
MachineDeployments: scope.MachineDeploymentsStateMap{
"md0": &scope.MachineDeploymentState{
Object: builder.MachineDeployment("ns1", "md0-abc123").
WithReplicas(2).
WithVersion("v1.22.0").
WithStatus(clusterv1.MachineDeploymentStatus{
// MD is not ready because we still have an unavailable replica.
Replicas: int32(2),
UpdatedReplicas: int32(2),
ReadyReplicas: int32(2),
AvailableReplicas: int32(2),
UnavailableReplicas: int32(1),
}).
Build(),
},
"md1": &scope.MachineDeploymentState{
Object: builder.MachineDeployment("ns1", "md1-abc123").
WithReplicas(2).
WithVersion("v1.21.2").
WithStatus(clusterv1.MachineDeploymentStatus{
Replicas: int32(2),
UpdatedReplicas: int32(2),
Expand All @@ -493,15 +450,25 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.PendingUpgrade = false
ut.MachineDeployments.MarkRollingOut("md0-abc123")
ut.MachineDeployments.MarkUpgradingAndRollingOut("md0-abc123")
ut.MachineDeployments.MarkPendingUpgrade("md1-abc123")
return ut
}(),
HookResponseTracker: scope.NewHookResponseTracker(),
},
machines: []*clusterv1.Machine{
builder.Machine("ns1", "md0-machine0").
WithLabels(map[string]string{clusterv1.ClusterTopologyMachineDeploymentNameLabel: "md0"}).
WithVersion("v1.21.2"). // Machine's version does not match MachineDeployment's version
Build(),
builder.Machine("ns1", "md1-machine0").
WithLabels(map[string]string{clusterv1.ClusterTopologyMachineDeploymentNameLabel: "md1"}).
WithVersion("v1.21.2").
Build(),
},
wantConditionStatus: corev1.ConditionFalse,
wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsUpgradePendingReason,
wantConditionMessage: "MachineDeployment(s) md1-abc123 upgrade to version v1.22.0 on hold. MachineDeployment(s) md0-abc123 are rolling out",
wantConditionMessage: "MachineDeployment(s) md1-abc123 upgrade to version v1.22.0 on hold. MachineDeployment(s) md0-abc123 are upgrading",
},
{
name: "should set the condition to false if some machine deployments have not picked the new version because their upgrade has been deferred",
Expand Down Expand Up @@ -624,7 +591,18 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
g := NewWithT(t)

r := &Reconciler{}
objs := []client.Object{}
if tt.s != nil && tt.s.Current != nil {
for _, mds := range tt.s.Current.MachineDeployments {
objs = append(objs, mds.Object)
}
}
for _, m := range tt.machines {
objs = append(objs, m)
}
fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build()

r := &Reconciler{Client: fakeClient}
err := r.reconcileTopologyReconciledCondition(tt.s, tt.cluster, tt.reconcileErr)
if tt.wantErr {
g.Expect(err).To(HaveOccurred())
Expand Down
24 changes: 17 additions & 7 deletions internal/controllers/topology/cluster/desired_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ func (r *Reconciler) computeDesiredState(ctx context.Context, s *scope.Scope) (*
// If required, compute the desired state of the MachineDeployments from the list of MachineDeploymentTopologies
// defined in the cluster.
if s.Blueprint.HasMachineDeployments() {
desiredState.MachineDeployments, err = computeMachineDeployments(ctx, s, desiredState.ControlPlane)
desiredState.MachineDeployments, err = r.computeMachineDeployments(ctx, s, desiredState.ControlPlane)
if err != nil {
return nil, errors.Wrapf(err, "failed to compute MachineDeployments")
}
Expand Down Expand Up @@ -523,12 +523,22 @@ func calculateRefDesiredAPIVersion(currentRef *corev1.ObjectReference, desiredRe
}

// computeMachineDeployments computes the desired state of the list of MachineDeployments.
func computeMachineDeployments(ctx context.Context, s *scope.Scope, desiredControlPlaneState *scope.ControlPlaneState) (scope.MachineDeploymentsStateMap, error) {
// Mark all the machine deployments that are currently rolling out.
// This captured information will be used for
// - Building the TopologyReconciled condition.
// - Making upgrade decisions on machine deployments.
func (r *Reconciler) computeMachineDeployments(ctx context.Context, s *scope.Scope, desiredControlPlaneState *scope.ControlPlaneState) (scope.MachineDeploymentsStateMap, error) {
// Mark all the MachineDeployments that are currently upgrading.
// This captured information is used for:
// - Building the TopologyReconciled condition.
// - Making upgrade decisions on machine deployments.
upgradingMDs, err := s.Current.MachineDeployments.Upgrading(ctx, r.Client)
if err != nil {
return nil, err
}
s.UpgradeTracker.MachineDeployments.MarkUpgradingAndRollingOut(upgradingMDs...)

// Mark all MachineDeployments that are currently rolling out.
// This captured information is used for:
// - Building the TopologyReconciled condition (when control plane upgrade is pending)
s.UpgradeTracker.MachineDeployments.MarkRollingOut(s.Current.MachineDeployments.RollingOut()...)

machineDeploymentsStateMap := make(scope.MachineDeploymentsStateMap)
for _, mdTopology := range s.Blueprint.Topology.Workers.MachineDeployments {
desiredMachineDeployment, err := computeMachineDeployment(ctx, s, desiredControlPlaneState, mdTopology)
Expand Down Expand Up @@ -844,7 +854,7 @@ func computeMachineDeploymentVersion(s *scope.Scope, machineDeploymentTopology c

// Control plane and machine deployments are stable.
// Ready to pick up the topology version.
s.UpgradeTracker.MachineDeployments.MarkRollingOut(currentMDState.Object.Name)
s.UpgradeTracker.MachineDeployments.MarkUpgradingAndRollingOut(currentMDState.Object.Name)
return desiredVersion, nil
}

Expand Down
Loading

0 comments on commit 8ca8170

Please sign in to comment.