Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(hetzner): missing error return in scale up/down #6750

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 58 additions & 21 deletions cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package hetzner

import (
"context"
"errors"
"fmt"
"maps"
"math/rand"
Expand Down Expand Up @@ -91,12 +92,14 @@ func (n *hetznerNodeGroup) IncreaseSize(delta int) error {
return fmt.Errorf("delta must be positive, have: %d", delta)
}

targetSize := n.targetSize + delta
if targetSize > n.MaxSize() {
return fmt.Errorf("size increase is too large. current: %d desired: %d max: %d", n.targetSize, targetSize, n.MaxSize())
desiredTargetSize := n.targetSize + delta
if desiredTargetSize > n.MaxSize() {
return fmt.Errorf("size increase is too large. current: %d desired: %d max: %d", n.targetSize, desiredTargetSize, n.MaxSize())
}

klog.V(4).Infof("Scaling Instance Pool %s to %d", n.id, targetSize)
actualDelta := delta

klog.V(4).Infof("Scaling Instance Pool %s to %d", n.id, desiredTargetSize)

n.clusterUpdateMutex.Lock()
defer n.clusterUpdateMutex.Unlock()
Expand All @@ -109,25 +112,43 @@ func (n *hetznerNodeGroup) IncreaseSize(delta int) error {
return fmt.Errorf("server type %s not available in region %s", n.instanceType, n.region)
}

defer func() {
// create new servers cache
if _, err := n.manager.cachedServers.servers(); err != nil {
klog.Errorf("failed to update servers cache: %v", err)
}

// Update target size
n.resetTargetSize(actualDelta)
}()

// There is no "Server Group" in Hetzner Cloud, we need to create every
// server manually. This operation might fail for some of the servers
// because of quotas, rate limiting or server type availability. We need to
// collect the errors and inform cluster-autoscaler about this, so it can
// try other node groups if configured.
waitGroup := sync.WaitGroup{}
errsCh := make(chan error, delta)
for i := 0; i < delta; i++ {
waitGroup.Add(1)
go func() {
defer waitGroup.Done()
err := createServer(n)
if err != nil {
targetSize--
klog.Errorf("failed to create error: %v", err)
actualDelta--
errsCh <- err
}
}()
}
waitGroup.Wait()
close(errsCh)

n.targetSize = targetSize

// create new servers cache
if _, err := n.manager.cachedServers.servers(); err != nil {
klog.Errorf("failed to get servers: %v", err)
errs := make([]error, 0, delta)
for err = range errsCh {
errs = append(errs, err)
}
if len(errs) > 0 {
return fmt.Errorf("failed to create all servers: %w", errors.Join(errs...))
}

return nil
Expand All @@ -146,34 +167,50 @@ func (n *hetznerNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
n.clusterUpdateMutex.Lock()
defer n.clusterUpdateMutex.Unlock()

targetSize := n.targetSize - len(nodes)
delta := len(nodes)

targetSize := n.targetSize - delta
if targetSize < n.MinSize() {
return fmt.Errorf("size decrease is too large. current: %d desired: %d min: %d", n.targetSize, targetSize, n.MinSize())
}

waitGroup := sync.WaitGroup{}
actualDelta := delta

defer func() {
// create new servers cache
if _, err := n.manager.cachedServers.servers(); err != nil {
klog.Errorf("failed to update servers cache: %v", err)
}

n.resetTargetSize(-actualDelta)
}()

waitGroup := sync.WaitGroup{}
errsCh := make(chan error, len(nodes))
for _, node := range nodes {
waitGroup.Add(1)
go func(node *apiv1.Node) {
klog.Infof("Evicting server %s", node.Name)

err := n.manager.deleteByNode(node)
if err != nil {
klog.Errorf("failed to delete server ID %s error: %v", node.Name, err)
actualDelta--
errsCh <- fmt.Errorf("failed to delete server for node %q: %w", node.Name, err)
}

waitGroup.Done()
}(node)
}
waitGroup.Wait()
close(errsCh)

// create new servers cache
if _, err := n.manager.cachedServers.servers(); err != nil {
klog.Errorf("failed to get servers: %v", err)
errs := make([]error, 0, len(nodes))
for err := range errsCh {
errs = append(errs, err)
}
if len(errs) > 0 {
return fmt.Errorf("failed to delete all nodes: %w", errors.Join(errs...))
}

n.resetTargetSize(-len(nodes))

return nil
}
Expand Down Expand Up @@ -561,8 +598,8 @@ func waitForServerAction(m *hetznerManager, serverName string, action *hcloud.Ac
func (n *hetznerNodeGroup) resetTargetSize(expectedDelta int) {
servers, err := n.manager.allServers(n.id)
if err != nil {
klog.Errorf("failed to set node pool %s size, using delta %d error: %v", n.id, expectedDelta, err)
n.targetSize = n.targetSize - expectedDelta
klog.Warningf("failed to set node pool %s size, using delta %d error: %v", n.id, expectedDelta, err)
n.targetSize = n.targetSize + expectedDelta
} else {
klog.Infof("Set node group %s size from %d to %d, expected delta %d", n.id, n.targetSize, len(servers), expectedDelta)
n.targetSize = len(servers)
Expand Down
Loading