From 48ca777fa0f3083ae4f45bdb9ea1a78cf5bfaa39 Mon Sep 17 00:00:00 2001 From: Ben Moss Date: Tue, 31 Mar 2020 21:06:49 +0000 Subject: [PATCH 1/6] Merge removeMemberForNode into RemoveEtcdMemberForMachine Just makes things more confusing to have all the logic in a separate private method --- controlplane/kubeadm/internal/cluster_test.go | 21 ----- .../kubeadm/internal/workload_cluster_etcd.go | 77 +++++++++---------- 2 files changed, 35 insertions(+), 63 deletions(-) diff --git a/controlplane/kubeadm/internal/cluster_test.go b/controlplane/kubeadm/internal/cluster_test.go index f6512affb418..ef903b3b116e 100644 --- a/controlplane/kubeadm/internal/cluster_test.go +++ b/controlplane/kubeadm/internal/cluster_test.go @@ -471,27 +471,6 @@ func nilNodeRef(machine clusterv1.Machine) clusterv1.Machine { return machine } -func TestRemoveMemberForNode_ErrControlPlaneMinNodes(t *testing.T) { - t.Run("do not remove the etcd member if the cluster has fewer than 2 control plane nodes", func(t *testing.T) { - g := NewWithT(t) - - expectedErr := ErrControlPlaneMinNodes - - workloadCluster := &Workload{ - Client: &fakeClient{ - list: &corev1.NodeList{ - Items: []corev1.Node{ - nodeNamed("first-control-plane"), - }, - }, - }, - } - - err := workloadCluster.removeMemberForNode(context.Background(), "first-control-plane") - g.Expect(err).To(MatchError(expectedErr)) - }) -} - func TestPickFirstNodeNotMatching(t *testing.T) { g := NewWithT(t) diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index a345e654b4bd..bd69bdd07932 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -148,13 +148,47 @@ func (w *Workload) UpdateEtcdVersionInKubeadmConfigMap(ctx context.Context, imag } // RemoveEtcdMemberForMachine removes the etcd member from the target cluster's etcd cluster. +// Removing the last remaining member of the cluster is not supported. func (w *Workload) RemoveEtcdMemberForMachine(ctx context.Context, machine *clusterv1.Machine) error { if machine == nil || machine.Status.NodeRef == nil { // Nothing to do, no node for Machine return nil } - return w.removeMemberForNode(ctx, machine.Status.NodeRef.Name) + // Pick a different node to talk to etcd + controlPlaneNodes, err := w.getControlPlaneNodes(ctx) + if err != nil { + return err + } + if len(controlPlaneNodes.Items) < 2 { + return ErrControlPlaneMinNodes + } + anotherNode := firstNodeNotMatchingName(machine.Status.NodeRef.Name, controlPlaneNodes.Items) + if anotherNode == nil { + return errors.Errorf("failed to find a control plane node whose name is not %s", machine.Status.NodeRef.Name) + } + etcdClient, err := w.etcdClientGenerator.forNode(ctx, anotherNode.Name) + if err != nil { + return errors.Wrap(err, "failed to create etcd client") + } + + // List etcd members. This checks that the member is healthy, because the request goes through consensus. + members, err := etcdClient.Members(ctx) + if err != nil { + return errors.Wrap(err, "failed to list etcd members using etcd client") + } + member := etcdutil.MemberForName(members, machine.Status.NodeRef.Name) + + // The member has already been removed, return immediately + if member == nil { + return nil + } + + if err := etcdClient.RemoveMember(ctx, member.ID); err != nil { + return errors.Wrap(err, "failed to remove member from etcd") + } + + return nil } // ForwardEtcdLeadership forwards etcd leadership to the first follower @@ -216,44 +250,3 @@ func (w *Workload) ForwardEtcdLeadership(ctx context.Context, machine *clusterv1 } return nil } - -// removeMemberForNode removes the etcd member for the node. Removing the etcd -// member when the cluster has one control plane node is not supported. To allow -// the removal of a failed etcd member, the etcd API requests are sent to a -// different node. -func (w *Workload) removeMemberForNode(ctx context.Context, name string) error { - // Pick a different node to talk to etcd - controlPlaneNodes, err := w.getControlPlaneNodes(ctx) - if err != nil { - return err - } - if len(controlPlaneNodes.Items) < 2 { - return ErrControlPlaneMinNodes - } - anotherNode := firstNodeNotMatchingName(name, controlPlaneNodes.Items) - if anotherNode == nil { - return errors.Errorf("failed to find a control plane node whose name is not %s", name) - } - etcdClient, err := w.etcdClientGenerator.forNode(ctx, anotherNode.Name) - if err != nil { - return errors.Wrap(err, "failed to create etcd client") - } - - // List etcd members. This checks that the member is healthy, because the request goes through consensus. - members, err := etcdClient.Members(ctx) - if err != nil { - return errors.Wrap(err, "failed to list etcd members using etcd client") - } - member := etcdutil.MemberForName(members, name) - - // The member has already been removed, return immediately - if member == nil { - return nil - } - - if err := etcdClient.RemoveMember(ctx, member.ID); err != nil { - return errors.Wrap(err, "failed to remove member from etcd") - } - - return nil -} From 3ec58ba0b216506b153e073fc7604da7c37ec55d Mon Sep 17 00:00:00 2001 From: Ben Moss Date: Tue, 31 Mar 2020 21:13:19 +0000 Subject: [PATCH 2/6] Reword some test descriptions --- .../internal/workload_cluster_etcd_test.go | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go index a1bc58711d39..9d447cbc265b 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go @@ -204,11 +204,12 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { expectErr bool }{ { - name: "does not panic if machine is nil", + name: "does nothing if the machine is nil", + machine: nil, expectErr: false, }, { - name: "does not panic if machine noderef is nil", + name: "does nothing if the machine has no node", machine: &clusterv1.Machine{ Status: clusterv1.MachineStatus{ NodeRef: nil, @@ -217,13 +218,13 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { expectErr: false, }, { - name: "returns error if there are less than 2 control plane nodes", + name: "returns an error if there are less than 2 control plane nodes", machine: machine, objs: []runtime.Object{cp1}, expectErr: true, }, { - name: "returns error if nodes match node ref name", + name: "returns an error if it can't find a node with a different name", machine: &clusterv1.Machine{ Status: clusterv1.MachineStatus{ NodeRef: &corev1.ObjectReference{ @@ -235,14 +236,14 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { expectErr: true, }, { - name: "returns error if it failed to create etcdClient", + name: "returns an error if it failed to create the etcd client", machine: machine, objs: []runtime.Object{cp1, cp2}, etcdClientGenerator: &fakeEtcdClientGenerator{err: errors.New("no client")}, expectErr: true, }, { - name: "returns error if it failed to get etcd members", + name: "returns an error if the client errors getting etcd members", machine: machine, objs: []runtime.Object{cp1, cp2}, etcdClientGenerator: &fakeEtcdClientGenerator{ @@ -255,7 +256,7 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { expectErr: true, }, { - name: "returns error if it failed to remove etcd member", + name: "returns an error if the client errors removing the etcd member", machine: machine, objs: []runtime.Object{cp1, cp2}, etcdClientGenerator: &fakeEtcdClientGenerator{ @@ -278,7 +279,7 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { expectErr: true, }, { - name: "removes member from etcd", + name: "removes the member from etcd", machine: machine, objs: []runtime.Object{cp1, cp2}, etcdClientGenerator: &fakeEtcdClientGenerator{ From 8553a3755a1e70fc72f7804dee5a65b31ecb8ae0 Mon Sep 17 00:00:00 2001 From: Ben Moss Date: Tue, 31 Mar 2020 21:59:28 +0000 Subject: [PATCH 3/6] Add forLeader to etcdClientGenerator --- .../kubeadm/internal/etcd_client_generator.go | 29 ++++++++++++++++++ .../kubeadm/internal/workload_cluster_etcd.go | 1 + .../internal/workload_cluster_etcd_test.go | 30 +++++++++++-------- 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/controlplane/kubeadm/internal/etcd_client_generator.go b/controlplane/kubeadm/internal/etcd_client_generator.go index 18525b8b1f67..5883d385dce1 100644 --- a/controlplane/kubeadm/internal/etcd_client_generator.go +++ b/controlplane/kubeadm/internal/etcd_client_generator.go @@ -20,7 +20,10 @@ import ( "context" "crypto/tls" + "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/rest" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/proxy" @@ -56,3 +59,29 @@ func (c *etcdClientGenerator) forNode(ctx context.Context, name string) (*etcd.C } return customClient, nil } + +// forLeader takes a list of nodes and returns a client to the leader node +func (c *etcdClientGenerator) forLeader(ctx context.Context, nodes *corev1.NodeList) (*etcd.Client, error) { + var errs []error + + for _, node := range nodes.Items { + client, err := c.forNode(ctx, node.Name) + if err != nil { + errs = append(errs, err) + continue + } + defer client.Close() + members, err := client.Members(ctx) + if err != nil { + errs = append(errs, err) + continue + } + for _, member := range members { + if member.ID == client.LeaderID { + return c.forNode(ctx, member.Name) + } + } + } + + return nil, errors.Wrap(kerrors.NewAggregate(errs), "could not establish a connection to the etcd leader") +} diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index bd69bdd07932..09f9ae967eda 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -30,6 +30,7 @@ import ( type etcdClientFor interface { forNode(ctx context.Context, name string) (*etcd.Client, error) + forLeader(ctx context.Context, nodes *corev1.NodeList) (*etcd.Client, error) } // EtcdIsHealthy runs checks for every etcd member in the cluster to satisfy our definition of healthy. diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go index 9d447cbc265b..0e608af6a3d9 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go @@ -56,7 +56,7 @@ func TestWorkload_EtcdIsHealthy(t *testing.T) { }, }, etcdClientGenerator: &fakeEtcdClientGenerator{ - client: &etcd.Client{ + forNodeClient: &etcd.Client{ EtcdClient: &fake2.FakeEtcdClient{ EtcdEndpoints: []string{}, MemberListResponse: &clientv3.MemberListResponse{ @@ -239,7 +239,7 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { name: "returns an error if it failed to create the etcd client", machine: machine, objs: []runtime.Object{cp1, cp2}, - etcdClientGenerator: &fakeEtcdClientGenerator{err: errors.New("no client")}, + etcdClientGenerator: &fakeEtcdClientGenerator{forNodeErr: errors.New("no client")}, expectErr: true, }, { @@ -247,7 +247,7 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { machine: machine, objs: []runtime.Object{cp1, cp2}, etcdClientGenerator: &fakeEtcdClientGenerator{ - client: &etcd.Client{ + forNodeClient: &etcd.Client{ EtcdClient: &fake2.FakeEtcdClient{ ErrorResponse: errors.New("cannot get etcd members"), }, @@ -260,7 +260,7 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { machine: machine, objs: []runtime.Object{cp1, cp2}, etcdClientGenerator: &fakeEtcdClientGenerator{ - client: &etcd.Client{ + forNodeClient: &etcd.Client{ EtcdClient: &fake2.FakeEtcdClient{ ErrorResponse: errors.New("cannot remove etcd member"), MemberListResponse: &clientv3.MemberListResponse{ @@ -283,7 +283,7 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { machine: machine, objs: []runtime.Object{cp1, cp2}, etcdClientGenerator: &fakeEtcdClientGenerator{ - client: &etcd.Client{ + forNodeClient: &etcd.Client{ EtcdClient: &fake2.FakeEtcdClient{ MemberListResponse: &clientv3.MemberListResponse{ Members: []*pb.Member{ @@ -355,14 +355,14 @@ func TestForwardEtcdLeadership(t *testing.T) { { name: "returns error if cannot find etcdClient for node", machine: machineNoNode, - etcdClientGenerator: &fakeEtcdClientGenerator{err: errors.New("no etcdClient")}, + etcdClientGenerator: &fakeEtcdClientGenerator{forNodeErr: errors.New("no etcdClient")}, expectErr: true, }, { name: "returns error if it failed to get etcd members", machine: machine, etcdClientGenerator: &fakeEtcdClientGenerator{ - client: &etcd.Client{ + forNodeClient: &etcd.Client{ EtcdClient: &fake2.FakeEtcdClient{ ErrorResponse: errors.New("cannot get etcd members"), }, @@ -409,7 +409,7 @@ func TestForwardEtcdLeadership(t *testing.T) { }, } etcdClientGenerator := &fakeEtcdClientGenerator{ - client: &etcd.Client{ + forNodeClient: &etcd.Client{ EtcdClient: fakeEtcdClient, // this etcd client does not belong to the current // machine. Ideally, this would match 101 from members @@ -497,7 +497,7 @@ func TestForwardEtcdLeadership(t *testing.T) { } etcdClientGenerator := &fakeEtcdClientGenerator{ - client: &etcd.Client{ + forNodeClient: &etcd.Client{ EtcdClient: fakeEtcdClient, // this etcdClient belongs to the machine-node LeaderID: 101, @@ -521,12 +521,18 @@ func TestForwardEtcdLeadership(t *testing.T) { } type fakeEtcdClientGenerator struct { - client *etcd.Client - err error + forNodeClient *etcd.Client + forLeaderClient *etcd.Client + forNodeErr error + forLeaderErr error } func (c *fakeEtcdClientGenerator) forNode(_ context.Context, _ string) (*etcd.Client, error) { - return c.client, c.err + return c.forNodeClient, c.forNodeErr +} + +func (c *fakeEtcdClientGenerator) forLeader(_ context.Context, _ *corev1.NodeList) (*etcd.Client, error) { + return c.forLeaderClient, c.forLeaderErr } type podOption func(*corev1.Pod) From 05a8f91b0c6001ced15a69d18afa7963153a124c Mon Sep 17 00:00:00 2001 From: Ben Moss Date: Tue, 31 Mar 2020 22:17:33 +0000 Subject: [PATCH 4/6] RemoveEtcdMemberForMachine uses leader node --- .../kubeadm/internal/workload_cluster_etcd.go | 6 +---- .../internal/workload_cluster_etcd_test.go | 22 +++++-------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index 09f9ae967eda..a4488491b544 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -164,11 +164,7 @@ func (w *Workload) RemoveEtcdMemberForMachine(ctx context.Context, machine *clus if len(controlPlaneNodes.Items) < 2 { return ErrControlPlaneMinNodes } - anotherNode := firstNodeNotMatchingName(machine.Status.NodeRef.Name, controlPlaneNodes.Items) - if anotherNode == nil { - return errors.Errorf("failed to find a control plane node whose name is not %s", machine.Status.NodeRef.Name) - } - etcdClient, err := w.etcdClientGenerator.forNode(ctx, anotherNode.Name) + etcdClient, err := w.etcdClientGenerator.forLeader(ctx, controlPlaneNodes) if err != nil { return errors.Wrap(err, "failed to create etcd client") } diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go index 0e608af6a3d9..e1694128caaa 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go @@ -224,22 +224,10 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { expectErr: true, }, { - name: "returns an error if it can't find a node with a different name", - machine: &clusterv1.Machine{ - Status: clusterv1.MachineStatus{ - NodeRef: &corev1.ObjectReference{ - Name: "cp1", - }, - }, - }, - objs: []runtime.Object{cp1, cp1DiffNS}, - expectErr: true, - }, - { - name: "returns an error if it failed to create the etcd client", + name: "returns an error if it fails to create the etcd client", machine: machine, objs: []runtime.Object{cp1, cp2}, - etcdClientGenerator: &fakeEtcdClientGenerator{forNodeErr: errors.New("no client")}, + etcdClientGenerator: &fakeEtcdClientGenerator{forLeaderErr: errors.New("no client")}, expectErr: true, }, { @@ -247,7 +235,7 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { machine: machine, objs: []runtime.Object{cp1, cp2}, etcdClientGenerator: &fakeEtcdClientGenerator{ - forNodeClient: &etcd.Client{ + forLeaderClient: &etcd.Client{ EtcdClient: &fake2.FakeEtcdClient{ ErrorResponse: errors.New("cannot get etcd members"), }, @@ -260,7 +248,7 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { machine: machine, objs: []runtime.Object{cp1, cp2}, etcdClientGenerator: &fakeEtcdClientGenerator{ - forNodeClient: &etcd.Client{ + forLeaderClient: &etcd.Client{ EtcdClient: &fake2.FakeEtcdClient{ ErrorResponse: errors.New("cannot remove etcd member"), MemberListResponse: &clientv3.MemberListResponse{ @@ -283,7 +271,7 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { machine: machine, objs: []runtime.Object{cp1, cp2}, etcdClientGenerator: &fakeEtcdClientGenerator{ - forNodeClient: &etcd.Client{ + forLeaderClient: &etcd.Client{ EtcdClient: &fake2.FakeEtcdClient{ MemberListResponse: &clientv3.MemberListResponse{ Members: []*pb.Member{ From a5b31e9b6682d6d7808ca108c5c7f94f423fe272 Mon Sep 17 00:00:00 2001 From: Ben Moss Date: Wed, 1 Apr 2020 19:23:41 +0000 Subject: [PATCH 5/6] ForwardEtcdLeadership uses leader node --- controlplane/kubeadm/internal/cluster_test.go | 13 +- .../kubeadm/internal/workload_cluster.go | 9 -- .../kubeadm/internal/workload_cluster_etcd.go | 37 +---- .../internal/workload_cluster_etcd_test.go | 142 +++++++++--------- 4 files changed, 84 insertions(+), 117 deletions(-) diff --git a/controlplane/kubeadm/internal/cluster_test.go b/controlplane/kubeadm/internal/cluster_test.go index ef903b3b116e..eab2bd8143bb 100644 --- a/controlplane/kubeadm/internal/cluster_test.go +++ b/controlplane/kubeadm/internal/cluster_test.go @@ -223,6 +223,7 @@ type fakeClient struct { getErr error patchErr error updateErr error + listErr error } func (f *fakeClient) Get(_ context.Context, key client.ObjectKey, obj runtime.Object) error { @@ -251,6 +252,9 @@ func (f *fakeClient) Get(_ context.Context, key client.ObjectKey, obj runtime.Ob } func (f *fakeClient) List(_ context.Context, list runtime.Object, _ ...client.ListOption) error { + if f.listErr != nil { + return f.listErr + } switch l := f.list.(type) { case *clusterv1.MachineList: l.DeepCopyInto(list.(*clusterv1.MachineList)) @@ -470,12 +474,3 @@ func nilNodeRef(machine clusterv1.Machine) clusterv1.Machine { machine.Status.NodeRef = nil return machine } - -func TestPickFirstNodeNotMatching(t *testing.T) { - g := NewWithT(t) - - name := "first-control-plane" - anotherNode := firstNodeNotMatchingName(name, nodeListForTestControlPlaneIsHealthy().Items) - g.Expect(anotherNode).NotTo(BeNil()) - g.Expect(anotherNode.Name).NotTo(Equal(name)) -} diff --git a/controlplane/kubeadm/internal/workload_cluster.go b/controlplane/kubeadm/internal/workload_cluster.go index 25079b4bc47f..76f1ba516737 100644 --- a/controlplane/kubeadm/internal/workload_cluster.go +++ b/controlplane/kubeadm/internal/workload_cluster.go @@ -435,15 +435,6 @@ func checkNodeNoExecuteCondition(node corev1.Node) error { return nil } -func firstNodeNotMatchingName(name string, nodes []corev1.Node) *corev1.Node { - for _, n := range nodes { - if n.Name != name { - return &n - } - } - return nil -} - // UpdateKubeProxyImageInfo updates kube-proxy image in the kube-proxy DaemonSet. func (w *Workload) UpdateKubeProxyImageInfo(ctx context.Context, kcp *controlplanev1.KubeadmControlPlane) error { ds := &appsv1.DaemonSet{} diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index a4488491b544..ab1c40e43bf5 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -190,24 +190,19 @@ func (w *Workload) RemoveEtcdMemberForMachine(ctx context.Context, machine *clus // ForwardEtcdLeadership forwards etcd leadership to the first follower func (w *Workload) ForwardEtcdLeadership(ctx context.Context, machine *clusterv1.Machine, leaderCandidate *clusterv1.Machine) error { - if machine == nil || machine.Status.NodeRef == nil { - // Nothing to do, no node for Machine + if machine == nil || machine.Status.NodeRef == nil || leaderCandidate == nil { return nil } - - // TODO we'd probably prefer to pass in all the known nodes and let grpc handle retrying connections across them - clientMachineName := machine.Status.NodeRef.Name - if leaderCandidate != nil && leaderCandidate.Status.NodeRef != nil { - // connect to the new leader candidate, in case machine's etcd membership has already been removed - clientMachineName = leaderCandidate.Status.NodeRef.Name + nodes, err := w.getControlPlaneNodes(ctx) + if err != nil { + return errors.Wrap(err, "failed to list control plane nodes") } - etcdClient, err := w.etcdClientGenerator.forNode(ctx, clientMachineName) + etcdClient, err := w.etcdClientGenerator.forLeader(ctx, nodes) if err != nil { - return errors.Wrap(err, "failed to create etcd Client") + return errors.Wrap(err, "failed to create etcd client") } - // List etcd members. This checks that the member is healthy, because the request goes through consensus. members, err := etcdClient.Members(ctx) if err != nil { return errors.Wrap(err, "failed to list etcd members using etcd client") @@ -215,25 +210,7 @@ func (w *Workload) ForwardEtcdLeadership(ctx context.Context, machine *clusterv1 currentMember := etcdutil.MemberForName(members, machine.Status.NodeRef.Name) if currentMember == nil || currentMember.ID != etcdClient.LeaderID { - return nil - } - - // Move the etcd client to the current leader, which in this case is the machine we're about to delete. - etcdClient, err = w.etcdClientGenerator.forNode(ctx, machine.Status.NodeRef.Name) - if err != nil { - return errors.Wrap(err, "failed to create etcd Client") - } - - // If we don't have a leader candidate, move the leader to the next available machine. - if leaderCandidate == nil || leaderCandidate.Status.NodeRef == nil { - for _, member := range members { - if member.ID != currentMember.ID { - if err := etcdClient.MoveLeader(ctx, member.ID); err != nil { - return errors.Wrapf(err, "failed to move leader") - } - break - } - } + // nothing to do, this is not the etcd leader return nil } diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go index e1694128caaa..f7be462cdcae 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go @@ -31,6 +31,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd" fake2 "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd/fake" + "sigs.k8s.io/controller-runtime/pkg/client" ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -169,7 +170,7 @@ kind: ClusterConfiguration } } -func TestRemoveEtcdMemberFromMachine(t *testing.T) { +func TestRemoveEtcdMemberForMachine(t *testing.T) { machine := &clusterv1.Machine{ Status: clusterv1.MachineStatus{ NodeRef: &corev1.ObjectReference{ @@ -311,44 +312,53 @@ func TestRemoveEtcdMemberFromMachine(t *testing.T) { func TestForwardEtcdLeadership(t *testing.T) { t.Run("handles errors correctly", func(t *testing.T) { - machine := &clusterv1.Machine{ - Status: clusterv1.MachineStatus{ - NodeRef: &corev1.ObjectReference{ - Name: "machine-node", - }, - }, - } - machineNoNode := machine.DeepCopy() - machineNoNode.Status.NodeRef.Name = "does-not-exist" + tests := []struct { name string machine *clusterv1.Machine leaderCandidate *clusterv1.Machine etcdClientGenerator etcdClientFor + k8sClient client.Client expectErr bool }{ { - name: "does not panic if machine is nil", + name: "does nothing if the machine is nil", + machine: nil, expectErr: false, }, { - name: "does not panic if machine noderef is nil", - machine: &clusterv1.Machine{ - Status: clusterv1.MachineStatus{ - NodeRef: nil, - }, - }, + name: "does nothing if machine's NodeRef is nil", + machine: defaultMachine(func(m *clusterv1.Machine) { + m.Status.NodeRef = nil + }), expectErr: false, }, { - name: "returns error if cannot find etcdClient for node", - machine: machineNoNode, - etcdClientGenerator: &fakeEtcdClientGenerator{forNodeErr: errors.New("no etcdClient")}, + name: "does nothing if the leader candidate is nil", + machine: defaultMachine(), + leaderCandidate: nil, + expectErr: false, + }, + { + name: "returns an error if it can't retrieve the list of control plane nodes", + machine: defaultMachine(), + leaderCandidate: defaultMachine(), + k8sClient: &fakeClient{listErr: errors.New("failed to list nodes")}, + expectErr: true, + }, + { + name: "returns an error if it can't create an etcd client", + machine: defaultMachine(), + leaderCandidate: defaultMachine(), + k8sClient: &fakeClient{}, + etcdClientGenerator: &fakeEtcdClientGenerator{forLeaderErr: errors.New("no etcdClient")}, expectErr: true, }, { - name: "returns error if it failed to get etcd members", - machine: machine, + name: "returns error if it fails to get etcd members", + machine: defaultMachine(), + leaderCandidate: defaultMachine(), + k8sClient: &fakeClient{}, etcdClientGenerator: &fakeEtcdClientGenerator{ forNodeClient: &etcd.Client{ EtcdClient: &fake2.FakeEtcdClient{ @@ -363,6 +373,7 @@ func TestForwardEtcdLeadership(t *testing.T) { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) w := &Workload{ + Client: tt.k8sClient, etcdClientGenerator: tt.etcdClientGenerator, } ctx := context.TODO() @@ -376,20 +387,12 @@ func TestForwardEtcdLeadership(t *testing.T) { } }) - t.Run("does noop if machine etcd member ID does not match etcdClient leader ID", func(t *testing.T) { + t.Run("does nothing if the machine is not the leader", func(t *testing.T) { g := NewWithT(t) - machine := &clusterv1.Machine{ - Status: clusterv1.MachineStatus{ - NodeRef: &corev1.ObjectReference{ - Name: "machine-node", - }, - }, - } fakeEtcdClient := &fake2.FakeEtcdClient{ MemberListResponse: &clientv3.MemberListResponse{ Members: []*pb.Member{ {Name: "machine-node", ID: uint64(101)}, - {Name: "other-node", ID: uint64(1034)}, }, }, AlarmResponse: &clientv3.AlarmResponse{ @@ -410,29 +413,13 @@ func TestForwardEtcdLeadership(t *testing.T) { etcdClientGenerator: etcdClientGenerator, } ctx := context.TODO() - err := w.ForwardEtcdLeadership(ctx, machine, nil) + err := w.ForwardEtcdLeadership(ctx, defaultMachine(), nil) g.Expect(err).ToNot(HaveOccurred()) g.Expect(fakeEtcdClient.MovedLeader).To(BeEquivalentTo(0)) }) t.Run("move etcd leader", func(t *testing.T) { - machine := &clusterv1.Machine{ - Status: clusterv1.MachineStatus{ - NodeRef: &corev1.ObjectReference{ - Name: "machine-node", - }, - }, - } - leaderCandidate := &clusterv1.Machine{ - Status: clusterv1.MachineStatus{ - NodeRef: &corev1.ObjectReference{ - Name: "leader-node", - }, - }, - } - leaderCandidateBadNodeRef := leaderCandidate.DeepCopy() - leaderCandidateBadNodeRef.Status.NodeRef.Name = "does-not-exist" tests := []struct { name string leaderCandidate *clusterv1.Machine @@ -441,32 +428,32 @@ func TestForwardEtcdLeadership(t *testing.T) { expectErr bool }{ { - name: "to the next available member", - expectedMoveLeader: 1034, + name: "it moves the etcd leadership to the leader candidate", + leaderCandidate: defaultMachine(func(m *clusterv1.Machine) { + m.Status.NodeRef.Name = "candidate-node" + }), + expectedMoveLeader: 12345, }, { - name: "returns error if failed to move to the next available member", + name: "returns error if failed to move to the leader candidate", + leaderCandidate: defaultMachine(func(m *clusterv1.Machine) { + m.Status.NodeRef.Name = "candidate-node" + }), etcdMoveErr: errors.New("move err"), expectErr: true, }, { - name: "to the leader candidate", - leaderCandidate: leaderCandidate, - expectedMoveLeader: 12345, - }, - { - name: "returns error if failed to move to the leader candidate", - leaderCandidate: leaderCandidate, - etcdMoveErr: errors.New("move err"), - expectErr: true, - }, - { - name: "returns error if it cannot find the leader etcd member", - leaderCandidate: leaderCandidateBadNodeRef, - expectErr: true, + name: "returns error if the leader candidate doesn't exist in etcd", + leaderCandidate: defaultMachine(func(m *clusterv1.Machine) { + m.Status.NodeRef.Name = "some other node" + }), + expectErr: true, }, } + currentLeader := defaultMachine(func(m *clusterv1.Machine) { + m.Status.NodeRef.Name = "current-leader" + }) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) @@ -474,9 +461,9 @@ func TestForwardEtcdLeadership(t *testing.T) { ErrorResponse: tt.etcdMoveErr, MemberListResponse: &clientv3.MemberListResponse{ Members: []*pb.Member{ - {Name: "machine-node", ID: uint64(101)}, + {Name: currentLeader.Status.NodeRef.Name, ID: uint64(101)}, {Name: "other-node", ID: uint64(1034)}, - {Name: "leader-node", ID: uint64(12345)}, + {Name: "candidate-node", ID: uint64(12345)}, }, }, AlarmResponse: &clientv3.AlarmResponse{ @@ -485,7 +472,7 @@ func TestForwardEtcdLeadership(t *testing.T) { } etcdClientGenerator := &fakeEtcdClientGenerator{ - forNodeClient: &etcd.Client{ + forLeaderClient: &etcd.Client{ EtcdClient: fakeEtcdClient, // this etcdClient belongs to the machine-node LeaderID: 101, @@ -494,9 +481,12 @@ func TestForwardEtcdLeadership(t *testing.T) { w := &Workload{ etcdClientGenerator: etcdClientGenerator, + Client: &fakeClient{list: &corev1.NodeList{ + Items: []corev1.Node{nodeNamed("leader-node"), nodeNamed("other-node"), nodeNamed("candidate-node")}, + }}, } ctx := context.TODO() - err := w.ForwardEtcdLeadership(ctx, machine, tt.leaderCandidate) + err := w.ForwardEtcdLeadership(ctx, currentLeader, tt.leaderCandidate) if tt.expectErr { g.Expect(err).To(HaveOccurred()) return @@ -551,3 +541,17 @@ func withProviderID(pi string) func(corev1.Node) corev1.Node { return node } } + +func defaultMachine(transforms ...func(m *clusterv1.Machine)) *clusterv1.Machine { + m := &clusterv1.Machine{ + Status: clusterv1.MachineStatus{ + NodeRef: &corev1.ObjectReference{ + Name: "machine-node", + }, + }, + } + for _, t := range transforms { + t(m) + } + return m +} From d823fc7ba56ae98980f148932838be6ee1a8432e Mon Sep 17 00:00:00 2001 From: Ben Moss Date: Wed, 1 Apr 2020 21:45:05 +0000 Subject: [PATCH 6/6] Return an error if the leader candidate is nil --- .../kubeadm/internal/workload_cluster_etcd.go | 6 +++++- .../internal/workload_cluster_etcd_test.go | 16 ++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index ab1c40e43bf5..4edec84adcf3 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -190,9 +190,13 @@ func (w *Workload) RemoveEtcdMemberForMachine(ctx context.Context, machine *clus // ForwardEtcdLeadership forwards etcd leadership to the first follower func (w *Workload) ForwardEtcdLeadership(ctx context.Context, machine *clusterv1.Machine, leaderCandidate *clusterv1.Machine) error { - if machine == nil || machine.Status.NodeRef == nil || leaderCandidate == nil { + if machine == nil || machine.Status.NodeRef == nil { return nil } + if leaderCandidate == nil { + return errors.New("leader candidate cannot be nil") + } + nodes, err := w.getControlPlaneNodes(ctx) if err != nil { return errors.Wrap(err, "failed to list control plane nodes") diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go index f7be462cdcae..d0cbcac83761 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go @@ -334,10 +334,10 @@ func TestForwardEtcdLeadership(t *testing.T) { expectErr: false, }, { - name: "does nothing if the leader candidate is nil", + name: "returns an error if the leader candidate is nil", machine: defaultMachine(), leaderCandidate: nil, - expectErr: false, + expectErr: true, }, { name: "returns an error if it can't retrieve the list of control plane nodes", @@ -400,20 +400,20 @@ func TestForwardEtcdLeadership(t *testing.T) { }, } etcdClientGenerator := &fakeEtcdClientGenerator{ - forNodeClient: &etcd.Client{ + forLeaderClient: &etcd.Client{ EtcdClient: fakeEtcdClient, - // this etcd client does not belong to the current - // machine. Ideally, this would match 101 from members - // list - LeaderID: 555, + LeaderID: 555, }, } w := &Workload{ + Client: &fakeClient{list: &corev1.NodeList{ + Items: []corev1.Node{nodeNamed("leader-node")}, + }}, etcdClientGenerator: etcdClientGenerator, } ctx := context.TODO() - err := w.ForwardEtcdLeadership(ctx, defaultMachine(), nil) + err := w.ForwardEtcdLeadership(ctx, defaultMachine(), defaultMachine()) g.Expect(err).ToNot(HaveOccurred()) g.Expect(fakeEtcdClient.MovedLeader).To(BeEquivalentTo(0))