
Change the number of expected etcd members
Signed-off-by: Chuck Ha <[email protected]>
chuckha committed Mar 17, 2020
1 parent d06be1c commit 2772355
Showing 4 changed files with 150 additions and 165 deletions.
14 changes: 10 additions & 4 deletions controlplane/kubeadm/internal/cluster_test.go
@@ -56,7 +56,7 @@ func TestCheckStaticPodReadyCondition(t *testing.T) {
 	}
 	for _, test := range table {
 		t.Run(test.name, func(t *testing.T) {
-			pod := &corev1.Pod{
+			pod := corev1.Pod{
 				ObjectMeta: metav1.ObjectMeta{
 					Name: "test-pod",
 				},
@@ -82,7 +82,7 @@ func TestCheckStaticPodNotReadyCondition(t *testing.T) {
 	}
 	for _, test := range table {
 		t.Run(test.name, func(t *testing.T) {
-			pod := &corev1.Pod{
+			pod := corev1.Pod{
 				ObjectMeta: metav1.ObjectMeta{
 					Name: "test-pod",
 				},
@@ -131,12 +131,16 @@ func TestControlPlaneIsHealthy(t *testing.T) {
 	}
 }
 
-func nodeNamed(name string) corev1.Node {
-	return corev1.Node{
+func nodeNamed(name string, options ...func(n corev1.Node) corev1.Node) corev1.Node {
+	node := corev1.Node{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: name,
 		},
 	}
+	for _, opt := range options {
+		node = opt(node)
+	}
+	return node
 }
 
 func nodeListForTestControlPlaneIsHealthy() *corev1.NodeList {
@@ -265,6 +269,8 @@ func (f *fakeClient) List(_ context.Context, list runtime.Object, _ ...client.Li
 		l.DeepCopyInto(list.(*clusterv1.MachineList))
 	case *corev1.NodeList:
 		l.DeepCopyInto(list.(*corev1.NodeList))
+	case *corev1.PodList:
+		l.DeepCopyInto(list.(*corev1.PodList))
 	default:
 		return fmt.Errorf("unknown type: %s", l)
 	}
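The reworked nodeNamed helper above switches to the functional-options pattern so tests can tweak a Node without growing the parameter list. A minimal sketch of how a test might use it; withLabel is a hypothetical option invented here for illustration and is not part of this commit:

package internal

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// nodeNamed builds a Node and then applies any number of mutating options,
// mirroring the helper added in cluster_test.go above.
func nodeNamed(name string, options ...func(n corev1.Node) corev1.Node) corev1.Node {
	node := corev1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: name},
	}
	for _, opt := range options {
		node = opt(node)
	}
	return node
}

// withLabel is a hypothetical option showing how callers can decorate a Node.
func withLabel(key, value string) func(corev1.Node) corev1.Node {
	return func(n corev1.Node) corev1.Node {
		if n.Labels == nil {
			n.Labels = map[string]string{}
		}
		n.Labels[key] = value
		return n
	}
}

// Usage: node := nodeNamed("control-plane-1", withLabel("node-role.kubernetes.io/master", ""))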
170 changes: 19 additions & 151 deletions controlplane/kubeadm/internal/etcd/fake/client.go
@@ -18,173 +18,41 @@ package fake
 
 import (
 	"context"
-	"errors"
-	"fmt"
 
-	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
+	"go.etcd.io/etcd/clientv3"
 )
 
 type FakeEtcdClient struct {
-	leaderID uint64
-	members  map[uint64]*etcd.Member
-	healthy  map[uint64]bool
-	alarms   []etcd.MemberAlarm
+	AlarmResponse        *clientv3.AlarmResponse
+	EtcdEndpoints        []string
+	MemberListResponse   *clientv3.MemberListResponse
+	MemberRemoveResponse *clientv3.MemberRemoveResponse
+	MemberUpdateResponse *clientv3.MemberUpdateResponse
+	MoveLeaderResponse   *clientv3.MoveLeaderResponse
 }
 
-func NewClient() *FakeEtcdClient {
-	c := FakeEtcdClient{
-		members: make(map[uint64]*etcd.Member),
-		healthy: make(map[uint64]bool),
-	}
-	return &c
+func (c *FakeEtcdClient) Endpoints() []string {
+	return c.EtcdEndpoints
 }
 
-// Receivers that manipulate the state of the fake etcd cluster.
-
-func (c *FakeEtcdClient) AddMember(memberID uint64, peerURLs []string) error {
-	_, ok := c.members[memberID]
-	if ok {
-		return fmt.Errorf("member with ID %d already exists", memberID)
-	}
-	c.members[memberID] = &etcd.Member{
-		ID:       memberID,
-		PeerURLs: peerURLs,
-	}
-	c.healthy[memberID] = false
-	return nil
-}
-
-func (c *FakeEtcdClient) StartMember(memberID uint64, name string, clientURLs []string) error {
-	m, ok := c.members[memberID]
-	if !ok {
-		return fmt.Errorf("no member with ID %d", memberID)
-	}
-	m.Name = name
-	m.ClientURLs = clientURLs
-	c.healthy[memberID] = true
-	return nil
-}
-
-func (c *FakeEtcdClient) SetHealthy(memberID uint64) error {
-	_, ok := c.members[memberID]
-	if !ok {
-		return fmt.Errorf("no member with ID %d", memberID)
-	}
-	c.healthy[memberID] = true
-	return nil
+func (c *FakeEtcdClient) MoveLeader(_ context.Context, _ uint64) (*clientv3.MoveLeaderResponse, error) {
+	return c.MoveLeaderResponse, nil
 }
 
-func (c *FakeEtcdClient) SetUnhealthy(memberID uint64) error {
-	_, ok := c.members[memberID]
-	if !ok {
-		return fmt.Errorf("no member with ID %d", memberID)
-	}
-	c.healthy[memberID] = false
-	return nil
-}
-
-func (c *FakeEtcdClient) SetLeader(memberID uint64) error {
-	_, ok := c.members[memberID]
-	if !ok {
-		return fmt.Errorf("no member with ID %d", memberID)
-	}
-	c.leaderID = memberID
-	return nil
-}
-
-func (c *FakeEtcdClient) Leader() (*etcd.Member, error) {
-	leader, ok := c.members[c.leaderID]
-	if !ok {
-		return nil, errors.New("cluster has no leader")
-	}
-	return leader, nil
-}
-
-func (c *FakeEtcdClient) SetAlarm(alarmType etcd.AlarmType, memberID uint64) error {
-	_, ok := c.members[memberID]
-	if !ok {
-		return fmt.Errorf("no member with ID %d", memberID)
-	}
-	for _, a := range c.alarms {
-		if a.Type == alarmType && a.MemberID == memberID {
-			// Alarm is already set
-			return nil
-		}
-	}
-	mAlarm := etcd.MemberAlarm{
-		MemberID: memberID,
-		Type:     alarmType,
-	}
-	c.alarms = append(c.alarms, mAlarm)
-	return nil
-}
-
-func (c *FakeEtcdClient) ClearAlarm(alarmType etcd.AlarmType, memberID uint64) error {
-	_, ok := c.members[memberID]
-	if !ok {
-		return fmt.Errorf("no member with ID %d", memberID)
-	}
-	indexToDelete := -1
-	for i, a := range c.alarms {
-		if a.Type == alarmType && a.MemberID == memberID {
-			indexToDelete = i
-			break
-		}
-	}
-	if indexToDelete >= 0 {
-		c.alarms = append(c.alarms[:indexToDelete], c.alarms[indexToDelete:]...)
-	}
-	return nil
-}
-
-// Receivers that implement the controllers.EtcdClient interface.
-
 func (c *FakeEtcdClient) Close() error {
 	return nil
 }
 
-func (c *FakeEtcdClient) Members(ctx context.Context) ([]*etcd.Member, error) {
-	members := []*etcd.Member{}
-	for i := range c.members {
-		members = append(members, c.members[i])
-	}
-	return members, nil
+func (c *FakeEtcdClient) AlarmList(_ context.Context) (*clientv3.AlarmResponse, error) {
+	return c.AlarmResponse, nil
 }
 
-func (c *FakeEtcdClient) MoveLeader(ctx context.Context, memberID uint64) error {
-	_, ok := c.members[memberID]
-	if !ok {
-		return fmt.Errorf("no member with ID %d", memberID)
-	}
-	c.leaderID = memberID
-	return nil
+func (c *FakeEtcdClient) MemberList(_ context.Context) (*clientv3.MemberListResponse, error) {
+	return c.MemberListResponse, nil
 }
 
-func (c *FakeEtcdClient) RemoveMember(ctx context.Context, memberID uint64) error {
-	_, ok := c.members[memberID]
-	if !ok {
-		return fmt.Errorf("no member with ID %d", memberID)
-	}
-	if c.leaderID == memberID {
-		c.leaderID = 0
-	}
-	delete(c.members, memberID)
-	return nil
+func (c *FakeEtcdClient) MemberRemove(_ context.Context, _ uint64) (*clientv3.MemberRemoveResponse, error) {
+	return c.MemberRemoveResponse, nil
 }
 
-func (c *FakeEtcdClient) UpdateMemberPeerURLs(ctx context.Context, memberID uint64, peerURLs []string) ([]*etcd.Member, error) {
-	m, ok := c.members[memberID]
-	if !ok {
-		return nil, fmt.Errorf("no member with ID %d", memberID)
-	}
-	m.PeerURLs = peerURLs
-	return c.Members(ctx)
-}
-
-func (c *FakeEtcdClient) Alarms(ctx context.Context) ([]etcd.MemberAlarm, error) {
-	alarms := []etcd.MemberAlarm{}
-	for i := range c.alarms {
-		alarms = append(alarms, c.alarms[i])
-	}
-	return alarms, nil
+func (c *FakeEtcdClient) MemberUpdate(_ context.Context, _ uint64, _ []string) (*clientv3.MemberUpdateResponse, error) {
+	return c.MemberUpdateResponse, nil
 }
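The fake etcd client no longer simulates cluster state; it simply hands back whatever canned clientv3 responses a test places in its exported fields. A rough sketch of how a test might wire it up — the etcdserverpb import path and the test body are assumptions based on etcd v3.4-era packages, not code from this commit:

package fake_test

import (
	"context"
	"testing"

	"go.etcd.io/etcd/clientv3"
	pb "go.etcd.io/etcd/etcdserver/etcdserverpb"

	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd/fake"
)

func TestFakeEtcdClientMemberList(t *testing.T) {
	// Preload the fake with a single-member list; no real etcd is involved.
	client := &fake.FakeEtcdClient{
		EtcdEndpoints: []string{"https://10.0.0.1:2379"},
		MemberListResponse: &clientv3.MemberListResponse{
			Members: []*pb.Member{
				{ID: 1, Name: "control-plane-1", PeerURLs: []string{"https://10.0.0.1:2380"}},
			},
		},
	}

	resp, err := client.MemberList(context.TODO())
	if err != nil {
		t.Fatal(err)
	}
	if len(resp.Members) != 1 {
		t.Fatalf("expected 1 member, got %d", len(resp.Members))
	}
}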
42 changes: 32 additions & 10 deletions controlplane/kubeadm/internal/workload_cluster.go
@@ -129,9 +129,8 @@ func (w *Workload) ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult
 			Namespace: metav1.NamespaceSystem,
 			Name:      staticPodName("kube-apiserver", name),
 		}
-
-		apiServerPod := &corev1.Pod{}
-		if err := w.Client.Get(ctx, apiServerPodKey, apiServerPod); err != nil {
+		apiServerPod := corev1.Pod{}
+		if err := w.Client.Get(ctx, apiServerPodKey, &apiServerPod); err != nil {
 			response[name] = err
 			continue
 		}
@@ -141,8 +140,8 @@ func (w *Workload) ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult
 			Namespace: metav1.NamespaceSystem,
 			Name:      staticPodName("kube-controller-manager", name),
 		}
-		controllerManagerPod := &corev1.Pod{}
-		if err := w.Client.Get(ctx, controllerManagerPodKey, controllerManagerPod); err != nil {
+		controllerManagerPod := corev1.Pod{}
+		if err := w.Client.Get(ctx, controllerManagerPodKey, &controllerManagerPod); err != nil {
 			response[name] = err
 			continue
 		}
@@ -206,6 +205,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error)
 		return nil, err
 	}
 
+	expectedMembers := 0
 	response := make(map[string]error)
 	for _, node := range controlPlaneNodes.Items {
 		name := node.Name
@@ -215,6 +215,26 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error)
 			continue
 		}
 
+		// Check to see if the pod is ready
+		etcdPodKey := ctrlclient.ObjectKey{
+			Namespace: metav1.NamespaceSystem,
+			Name:      staticPodName("etcd", name),
+		}
+		pod := corev1.Pod{}
+		if err := w.Client.Get(ctx, etcdPodKey, &pod); err != nil {
+			response[name] = errors.Wrap(err, "failed to get etcd pod")
+			continue
+		}
+		if err := checkStaticPodReadyCondition(pod); err != nil {
+			// Nothing wrong here, etcd on this node is just not running.
+			// If it's a true failure the healthcheck will fail since it won't have checked enough members.
+			continue
+		}
+		// Only expect a member reports healthy if its pod is ready.
+		// This fixes the known state where the control plane has a crash-looping etcd pod that is not part of the
+		// etcd cluster.
+		expectedMembers++
+
 		// Create the etcd Client for the etcd Pod scheduled on the Node
 		etcdClient, err := w.etcdClientGenerator.forNode(name)
 		if err != nil {
@@ -258,10 +278,12 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error)
 		}
 	}
 
-	// Check that there is exactly one etcd member for every control plane machine.
-	// There should be no etcd members added "out of band.""
-	if len(controlPlaneNodes.Items) != len(knownMemberIDSet) {
-		return response, errors.Errorf("there are %d control plane nodes, but %d etcd members", len(controlPlaneNodes.Items), len(knownMemberIDSet))
+	// TODO: ensure that each pod is owned by a node that we're managing. That would ensure there are no out-of-band etcd members
+
+	// Check that there is exactly one etcd member for every healthy pod.
+	// This allows us to handle the expected case where there is a failing pod but it's been removed from the member list.
+	if expectedMembers != len(knownMemberIDSet) {
+		return response, errors.Errorf("there are %d healthy etcd pods, but %d etcd members", expectedMembers, len(knownMemberIDSet))
 	}
 
 	return response, nil
@@ -540,7 +562,7 @@ func staticPodName(component, nodeName string) string {
 	return fmt.Sprintf("%s-%s", component, nodeName)
 }
 
-func checkStaticPodReadyCondition(pod *corev1.Pod) error {
+func checkStaticPodReadyCondition(pod corev1.Pod) error {
 	found := false
 	for _, condition := range pod.Status.Conditions {
 		if condition.Type == corev1.PodReady {
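The heart of the change is visible in the EtcdIsHealthy hunks above: the etcd member count is now compared against the number of Ready etcd static pods (expectedMembers) rather than the number of control plane nodes. A simplified, standalone sketch of that invariant; the function and types here are reduced stand-ins for illustration, not the commit's exact API:

package main

import "fmt"

// etcdQuorumCheck condenses the new logic: only nodes whose etcd static pod is
// Ready are expected to appear in the member list, so a crash-looping etcd pod
// whose member was already removed no longer fails the health check.
func etcdQuorumCheck(readyEtcdPods int, knownMemberIDs map[uint64]struct{}) error {
	if readyEtcdPods != len(knownMemberIDs) {
		return fmt.Errorf("there are %d healthy etcd pods, but %d etcd members", readyEtcdPods, len(knownMemberIDs))
	}
	return nil
}

func main() {
	members := map[uint64]struct{}{1: {}, 2: {}}

	// Three control plane nodes, one with a crash-looping etcd pod that was
	// already removed from the etcd cluster: 2 ready pods vs 2 members passes,
	// where the old node-count comparison (3 vs 2) would have reported an error.
	fmt.Println(etcdQuorumCheck(2, members)) // <nil>
	fmt.Println(etcdQuorumCheck(3, members)) // error
}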