Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Change the number of expected etcd members #2696

Merged
merged 1 commit into from
Mar 18, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions controlplane/kubeadm/internal/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ func TestCheckStaticPodReadyCondition(t *testing.T) {
}
for _, test := range table {
t.Run(test.name, func(t *testing.T) {
pod := &corev1.Pod{
pod := corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "test-pod",
},
Expand All @@ -82,7 +82,7 @@ func TestCheckStaticPodNotReadyCondition(t *testing.T) {
}
for _, test := range table {
t.Run(test.name, func(t *testing.T) {
pod := &corev1.Pod{
pod := corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "test-pod",
},
Expand Down Expand Up @@ -131,12 +131,16 @@ func TestControlPlaneIsHealthy(t *testing.T) {
}
}

func nodeNamed(name string) corev1.Node {
return corev1.Node{
func nodeNamed(name string, options ...func(n corev1.Node) corev1.Node) corev1.Node {
node := corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: name,
},
}
for _, opt := range options {
node = opt(node)
}
return node
}

func nodeListForTestControlPlaneIsHealthy() *corev1.NodeList {
Expand Down Expand Up @@ -265,6 +269,8 @@ func (f *fakeClient) List(_ context.Context, list runtime.Object, _ ...client.Li
l.DeepCopyInto(list.(*clusterv1.MachineList))
case *corev1.NodeList:
l.DeepCopyInto(list.(*corev1.NodeList))
case *corev1.PodList:
l.DeepCopyInto(list.(*corev1.PodList))
default:
return fmt.Errorf("unknown type: %s", l)
}
Expand Down
172 changes: 22 additions & 150 deletions controlplane/kubeadm/internal/etcd/fake/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,173 +18,45 @@ package fake

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file was out of date since it was unused entirely. I updated it so I could fake etcd responses to simulate the situations necessary for the test.

import (
"context"
"errors"
"fmt"

"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
"go.etcd.io/etcd/clientv3"
)

type FakeEtcdClient struct {
leaderID uint64
members map[uint64]*etcd.Member
healthy map[uint64]bool
alarms []etcd.MemberAlarm
AlarmResponse *clientv3.AlarmResponse
EtcdEndpoints []string
MemberListResponse *clientv3.MemberListResponse
MemberRemoveResponse *clientv3.MemberRemoveResponse
MemberUpdateResponse *clientv3.MemberUpdateResponse
MoveLeaderResponse *clientv3.MoveLeaderResponse
StatusResponse *clientv3.StatusResponse
}

func NewClient() *FakeEtcdClient {
c := FakeEtcdClient{
members: make(map[uint64]*etcd.Member),
healthy: make(map[uint64]bool),
}
return &c
func (c *FakeEtcdClient) Endpoints() []string {
return c.EtcdEndpoints
}

// Receivers that manipulate the state of the fake etcd cluster.

func (c *FakeEtcdClient) AddMember(memberID uint64, peerURLs []string) error {
_, ok := c.members[memberID]
if ok {
return fmt.Errorf("member with ID %d already exists", memberID)
}
c.members[memberID] = &etcd.Member{
ID: memberID,
PeerURLs: peerURLs,
}
c.healthy[memberID] = false
return nil
}

func (c *FakeEtcdClient) StartMember(memberID uint64, name string, clientURLs []string) error {
m, ok := c.members[memberID]
if !ok {
return fmt.Errorf("no member with ID %d", memberID)
}
m.Name = name
m.ClientURLs = clientURLs
c.healthy[memberID] = true
return nil
}

func (c *FakeEtcdClient) SetHealthy(memberID uint64) error {
_, ok := c.members[memberID]
if !ok {
return fmt.Errorf("no member with ID %d", memberID)
}
c.healthy[memberID] = true
return nil
}

func (c *FakeEtcdClient) SetUnhealthy(memberID uint64) error {
_, ok := c.members[memberID]
if !ok {
return fmt.Errorf("no member with ID %d", memberID)
}
c.healthy[memberID] = false
return nil
}

func (c *FakeEtcdClient) SetLeader(memberID uint64) error {
_, ok := c.members[memberID]
if !ok {
return fmt.Errorf("no member with ID %d", memberID)
}
c.leaderID = memberID
return nil
func (c *FakeEtcdClient) MoveLeader(_ context.Context, _ uint64) (*clientv3.MoveLeaderResponse, error) {
return c.MoveLeaderResponse, nil
}

func (c *FakeEtcdClient) Leader() (*etcd.Member, error) {
leader, ok := c.members[c.leaderID]
if !ok {
return nil, errors.New("cluster has no leader")
}
return leader, nil
}

func (c *FakeEtcdClient) SetAlarm(alarmType etcd.AlarmType, memberID uint64) error {
_, ok := c.members[memberID]
if !ok {
return fmt.Errorf("no member with ID %d", memberID)
}
for _, a := range c.alarms {
if a.Type == alarmType && a.MemberID == memberID {
// Alarm is already set
return nil
}
}
mAlarm := etcd.MemberAlarm{
MemberID: memberID,
Type: alarmType,
}
c.alarms = append(c.alarms, mAlarm)
return nil
}

func (c *FakeEtcdClient) ClearAlarm(alarmType etcd.AlarmType, memberID uint64) error {
_, ok := c.members[memberID]
if !ok {
return fmt.Errorf("no member with ID %d", memberID)
}
indexToDelete := -1
for i, a := range c.alarms {
if a.Type == alarmType && a.MemberID == memberID {
indexToDelete = i
break
}
}
if indexToDelete >= 0 {
c.alarms = append(c.alarms[:indexToDelete], c.alarms[indexToDelete:]...)
}
return nil
}

// Receivers that implement the controllers.EtcdClient interface.

func (c *FakeEtcdClient) Close() error {
return nil
}

func (c *FakeEtcdClient) Members(ctx context.Context) ([]*etcd.Member, error) {
members := []*etcd.Member{}
for i := range c.members {
members = append(members, c.members[i])
}
return members, nil
func (c *FakeEtcdClient) AlarmList(_ context.Context) (*clientv3.AlarmResponse, error) {
return c.AlarmResponse, nil
}

func (c *FakeEtcdClient) MoveLeader(ctx context.Context, memberID uint64) error {
_, ok := c.members[memberID]
if !ok {
return fmt.Errorf("no member with ID %d", memberID)
}
c.leaderID = memberID
return nil
func (c *FakeEtcdClient) MemberList(_ context.Context) (*clientv3.MemberListResponse, error) {
return c.MemberListResponse, nil
}

func (c *FakeEtcdClient) RemoveMember(ctx context.Context, memberID uint64) error {
_, ok := c.members[memberID]
if !ok {
return fmt.Errorf("no member with ID %d", memberID)
}
if c.leaderID == memberID {
c.leaderID = 0
}
delete(c.members, memberID)
return nil
func (c *FakeEtcdClient) MemberRemove(_ context.Context, _ uint64) (*clientv3.MemberRemoveResponse, error) {
return c.MemberRemoveResponse, nil
}

func (c *FakeEtcdClient) UpdateMemberPeerURLs(ctx context.Context, memberID uint64, peerURLs []string) ([]*etcd.Member, error) {
m, ok := c.members[memberID]
if !ok {
return nil, fmt.Errorf("no member with ID %d", memberID)
}
m.PeerURLs = peerURLs
return c.Members(ctx)
func (c *FakeEtcdClient) MemberUpdate(_ context.Context, _ uint64, _ []string) (*clientv3.MemberUpdateResponse, error) {
return c.MemberUpdateResponse, nil
}

func (c *FakeEtcdClient) Alarms(ctx context.Context) ([]etcd.MemberAlarm, error) {
alarms := []etcd.MemberAlarm{}
for i := range c.alarms {
alarms = append(alarms, c.alarms[i])
}
return alarms, nil
func (c *FakeEtcdClient) Status(_ context.Context, _ string) (*clientv3.StatusResponse, error) {
return c.StatusResponse, nil
}
42 changes: 32 additions & 10 deletions controlplane/kubeadm/internal/workload_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,8 @@ func (w *Workload) ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult
Namespace: metav1.NamespaceSystem,
Name: staticPodName("kube-apiserver", name),
}

apiServerPod := &corev1.Pod{}
if err := w.Client.Get(ctx, apiServerPodKey, apiServerPod); err != nil {
apiServerPod := corev1.Pod{}
if err := w.Client.Get(ctx, apiServerPodKey, &apiServerPod); err != nil {
response[name] = err
continue
}
Expand All @@ -142,8 +141,8 @@ func (w *Workload) ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult
Namespace: metav1.NamespaceSystem,
Name: staticPodName("kube-controller-manager", name),
}
controllerManagerPod := &corev1.Pod{}
if err := w.Client.Get(ctx, controllerManagerPodKey, controllerManagerPod); err != nil {
controllerManagerPod := corev1.Pod{}
if err := w.Client.Get(ctx, controllerManagerPodKey, &controllerManagerPod); err != nil {
response[name] = err
continue
}
Expand Down Expand Up @@ -207,6 +206,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error)
return nil, err
}

expectedMembers := 0
response := make(map[string]error)
for _, node := range controlPlaneNodes.Items {
name := node.Name
Expand All @@ -216,6 +216,26 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error)
continue
}

// Check to see if the pod is ready
etcdPodKey := ctrlclient.ObjectKey{
Namespace: metav1.NamespaceSystem,
Name: staticPodName("etcd", name),
}
pod := corev1.Pod{}
if err := w.Client.Get(ctx, etcdPodKey, &pod); err != nil {
response[name] = errors.Wrap(err, "failed to get etcd pod")
continue
}
if err := checkStaticPodReadyCondition(pod); err != nil {
// Nothing wrong here, etcd on this node is just not running.
// If it's a true failure the healthcheck will fail since it won't have checked enough members.
continue
}
// Only expect a member reports healthy if its pod is ready.
// This fixes the known state where the control plane has a crash-looping etcd pod that is not part of the
// etcd cluster.
expectedMembers++

// Create the etcd Client for the etcd Pod scheduled on the Node
etcdClient, err := w.etcdClientGenerator.forNode(ctx, name)
if err != nil {
Expand Down Expand Up @@ -259,10 +279,12 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error)
}
}

// Check that there is exactly one etcd member for every control plane machine.
// There should be no etcd members added "out of band.""
if len(controlPlaneNodes.Items) != len(knownMemberIDSet) {
return response, errors.Errorf("there are %d control plane nodes, but %d etcd members", len(controlPlaneNodes.Items), len(knownMemberIDSet))
// TODO: ensure that each pod is owned by a node that we're managing. That would ensure there are no out-of-band etcd members

// Check that there is exactly one etcd member for every healthy pod.
// This allows us to handle the expected case where there is a failing pod but it's been removed from the member list.
if expectedMembers != len(knownMemberIDSet) {
return response, errors.Errorf("there are %d healthy etcd pods, but %d etcd members", expectedMembers, len(knownMemberIDSet))
}

return response, nil
Expand Down Expand Up @@ -591,7 +613,7 @@ func staticPodName(component, nodeName string) string {
return fmt.Sprintf("%s-%s", component, nodeName)
}

func checkStaticPodReadyCondition(pod *corev1.Pod) error {
func checkStaticPodReadyCondition(pod corev1.Pod) error {
found := false
for _, condition := range pod.Status.Conditions {
if condition.Type == corev1.PodReady {
Expand Down
Loading