Skip to content

Commit

Permalink
fix: clean up couple small issues in the etcd member audit code
Browse files Browse the repository at this point in the history
Removing edge cases, simplifying code a bit.

Signed-off-by: Andrey Smirnov <[email protected]>
  • Loading branch information
smira authored and talos-bot committed Jul 16, 2021
1 parent 9be7b88 commit 119b969
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 52 deletions.
15 changes: 7 additions & 8 deletions controllers/configs.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,17 @@ func (r *TalosControlPlaneReconciler) talosconfigForMachine(ctx context.Context,
return nil, fmt.Errorf("%q machine does not have a nodeRef", machine.Name)
}

// grab all addresses as endpoints
node, err := clientset.CoreV1().Nodes().Get(machine.Status.NodeRef.Name, metav1.GetOptions{})
if err != nil {
return nil, err
}

addrList := []string{}
for _, addr := range node.Status.Addresses {
addrList = append(addrList, addr.Address)
if addr.Type == corev1.NodeExternalIP || addr.Type == corev1.NodeInternalIP {
addrList = append(addrList, addr.Address)
}
}

if len(addrList) == 0 {
Expand All @@ -74,7 +77,8 @@ func (r *TalosControlPlaneReconciler) talosconfigForMachine(ctx context.Context,
found *cabptv1.TalosConfig
)

err = r.Client.List(ctx, &cfgs)
// find talosconfig in the machine's namespace
err = r.Client.List(ctx, &cfgs, client.InNamespace(machine.Namespace))
if err != nil {
return nil, err
}
Expand All @@ -97,10 +101,5 @@ func (r *TalosControlPlaneReconciler) talosconfigForMachine(ctx context.Context,
return nil, err
}

c, err := talosclient.New(ctx, talosclient.WithEndpoints(addrList...), talosclient.WithConfig(t))
if err != nil {
return nil, err
}

return c, nil
return talosclient.New(ctx, talosclient.WithEndpoints(addrList...), talosclient.WithConfig(t))
}
43 changes: 16 additions & 27 deletions controllers/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *
return err
}
}

break
}

return nil
Expand All @@ -53,17 +51,12 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *
func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, memberName string) error {
r.Log.Info("Removing etcd member", "memberName", memberName)

err := c.EtcdRemoveMember(
return c.EtcdRemoveMember(
ctx,
&machine.EtcdRemoveMemberRequest{
Member: memberName,
},
)
if err != nil {
return err
}

return nil
}

// auditEtcd rolls through all etcd members to see if there's a matching controlplane machine
Expand All @@ -89,10 +82,17 @@ func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, cluster cli
var designatedCPMachine capiv1.Machine

for _, machine := range machines {
if machine.ObjectMeta.DeletionTimestamp.IsZero() && machine.Status.NodeRef != nil {
designatedCPMachine = machine
break
if !machine.ObjectMeta.DeletionTimestamp.IsZero() || machine.Status.NodeRef == nil {
continue
}

designatedCPMachine = machine

break
}

if designatedCPMachine.Name == "" {
return fmt.Errorf("no CP machine which is not being deleted and has node ref")
}

clientset, err := r.kubeconfigForCluster(ctx, cluster)
Expand All @@ -105,22 +105,9 @@ func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, cluster cli
return err
}

// Save the first internal IP of the designated machine to use as our node target
// and setup the ctx to target it
var firstIntAddr string

for _, addr := range designatedCPMachine.Status.Addresses {
if addr.Type == capiv1.MachineInternalIP {
firstIntAddr = addr.Address
break
}
}

nodeCtx := talosclient.WithNodes(ctx, firstIntAddr)

response, err := c.EtcdMemberList(nodeCtx, &machine.EtcdMemberListRequest{})
response, err := c.EtcdMemberList(ctx, &machine.EtcdMemberListRequest{})
if err != nil {
return err
return fmt.Errorf("error getting etcd members via %q (endpoints %v): %w", designatedCPMachine.Name, c.GetConfigContext().Endpoints, err)
}

// Only querying one CP node, so only 1 message should return.
Expand All @@ -146,7 +133,9 @@ func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, cluster cli
if !present {
r.Log.Info("found etcd member that doesn't exist as controlplane machine", "member", member)

r.forceEtcdLeave(nodeCtx, c, cluster, member)
if err = r.forceEtcdLeave(ctx, c, cluster, member); err != nil {
return fmt.Errorf("error leaving etcd for member %q via machine %q", member, designatedCPMachine.Name)
}
}
}

Expand Down
38 changes: 21 additions & 17 deletions controllers/taloscontrolplane_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,16 @@ func (r *TalosControlPlaneReconciler) Reconcile(req ctrl.Request) (res ctrl.Resu
numMachines := len(ownedMachines)
desiredReplicas := int(*tcp.Spec.Replicas)

requeue := false

// Audit the etcd member list to remove any nodes that no longer exist
if err := r.auditEtcd(ctx, util.ObjectKey(cluster), controlPlane.TCP.Name); err != nil {
logger.Info("failed to check etcd membership list", "error", err)

// if audit failed, requeue the reconcile in any case
requeue = true
}

switch {
// We are creating the first replica
case numMachines < desiredReplicas && numMachines == 0:
Expand All @@ -232,11 +242,16 @@ func (r *TalosControlPlaneReconciler) Reconcile(req ctrl.Request) (res ctrl.Resu
case numMachines > desiredReplicas:
logger.Info("Scaling down control plane", "Desired", desiredReplicas, "Existing", numMachines)

res, err = r.scaleDownControlPlane(ctx, util.ObjectKey(cluster), controlPlane.TCP.Name)
if err != nil && (res.Requeue || res.RequeueAfter > 0) {
logger.Info("Failed to scale down control plane", "error", err)
return res, nil
res, err = r.scaleDownControlPlane(ctx, util.ObjectKey(cluster), controlPlane.TCP.Name, ownedMachines)
if err != nil {
if res.Requeue || res.RequeueAfter > 0 {
logger.Info("Failed to scale down control plane", "error", err)

return res, nil
}
}

return res, err
}

// Generate Cluster Kubeconfig if needed
Expand All @@ -245,13 +260,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(req ctrl.Request) (res ctrl.Resu
return ctrl.Result{}, err
}

// Audit the etcd member list to remove any nodes that no longer exist
if err := r.auditEtcd(ctx, util.ObjectKey(cluster), controlPlane.TCP.Name); err != nil {
logger.Info("failed to check etcd membership list", "error", err)
return ctrl.Result{Requeue: true}, nil
}

return ctrl.Result{}, nil
return ctrl.Result{Requeue: requeue}, nil
}

// ClusterToTalosControlPlane is a handler.ToRequestsFunc to be used to enqueue requests for reconciliation
Expand Down Expand Up @@ -311,12 +320,7 @@ func newControlPlane(cluster *capiv1.Cluster, tcp *controlplanev1.TalosControlPl
}
}

func (r *TalosControlPlaneReconciler) scaleDownControlPlane(ctx context.Context, cluster client.ObjectKey, cpName string) (ctrl.Result, error) {
machines, err := r.getControlPlaneMachinesForCluster(ctx, cluster, cpName)
if err != nil {
return ctrl.Result{}, err
}

func (r *TalosControlPlaneReconciler) scaleDownControlPlane(ctx context.Context, cluster client.ObjectKey, cpName string, machines []capiv1.Machine) (ctrl.Result, error) {
if len(machines) == 0 {
return ctrl.Result{}, fmt.Errorf("no machines found")
}
Expand Down

0 comments on commit 119b969

Please sign in to comment.