From 5a781e67c9007eab31a10dec19abb63584c79f9c Mon Sep 17 00:00:00 2001 From: Dominic Date: Wed, 3 Jul 2024 06:15:26 +0000 Subject: [PATCH 1/5] WIP add support for node pool placement group config See #5919 --- .../hetzner/hetzner_cloud_provider.go | 70 +++++++++++++++++++ .../cloudprovider/hetzner/hetzner_manager.go | 7 +- .../hetzner/hetzner_node_group.go | 1 + 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go index ad9e35d2e85e..8bad29d5ec70 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go @@ -17,6 +17,7 @@ limitations under the License. package hetzner import ( + "context" "fmt" "regexp" "strconv" @@ -27,6 +28,7 @@ import ( apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/hetzner/hcloud-go/hcloud" "k8s.io/autoscaler/cluster-autoscaler/config" "k8s.io/autoscaler/cluster-autoscaler/utils/errors" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" @@ -179,6 +181,29 @@ func (d *HetznerCloudProvider) Refresh() error { return nil } +// Check if any defined placement groups could potentially have more than the maximum allowed number of nodes +func getLargePlacementGroups(nodeGroups map[string]*hetznerNodeGroup, threshold int) []hcloud.PlacementGroup { + placementGroupTotals := make(map[hcloud.PlacementGroup]int) + + // Calculate totals for each placement group + for _, nodeGroup := range nodeGroups { + if nodeGroup.placementGroup.Name != "" { // Check if placementGroup is defined + placementGroup := nodeGroup.placementGroup + placementGroupTotals[placementGroup] += nodeGroup.maxSize + } + } + + // Collect placement groups with total maxSize > threshold + var largePlacementGroups []hcloud.PlacementGroup + for placementGroup, totalMaxSize := range placementGroupTotals { + if totalMaxSize > threshold { + largePlacementGroups = append(largePlacementGroups, placementGroup) + } + } + + return largePlacementGroups +} + // BuildHetzner builds the Hetzner cloud provider. func BuildHetzner(_ config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) cloudprovider.CloudProvider { manager, err := newManager() @@ -226,6 +251,51 @@ func BuildHetzner(_ config.AutoscalingOptions, do cloudprovider.NodeGroupDiscove targetSize: len(servers), clusterUpdateMutex: &clusterUpdateLock, } + + // If a placement group was specified, check with the API to see if it exists + if manager.clusterConfig.IsUsingNewFormat && placementGroupRef := manager.clusterConfig.NodeConfigs[spec.name].PlacementGroup; placementGroupRef != nil { + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + placementGroup, _, err := manager.client.PlacementGroup.Get(ctx, placementGroupRef) + + // Check if an error occurred + if err != nil { + if err == context.DeadlineExceeded { + klog.Fatalf("Timed out checking if placement group `%s` exists.", placementGroupRef) + } else { + klog.Fatalf("Failed to verify if placement group `%s` exists error: %v", placementGroupRef, err) + } + } + + // If the placement group exists, add it to the node group config + if placementGroup != nil { + manager.nodeGroups[spec.name].placementGroup = placementGroup + } else { + klog.Fatalf("The requested placement group `%s` does not appear to exist.", placementGroupRef) + } + } + } + + // Get placement groups with total maxSize over the maximum allowed + maxPlacementGroupSize := 10 + + largePlacementGroups := getLargePlacementGroups(manager.nodeGroups, maxPlacementGroupSize) + + // Fail if we have placement groups over the max size + if (len(largePlacementGroups) > 0) { + + // Gather placement group names + var placementGroupNames string + for i, pg := range largePlacementGroups { + if i > 0 { + placementGroupNames += ", " + } + placementGroupNames += pg.Name + } + + klog.Fatalf("The following placement groups have a potential size over the allowed maximum of %d: %s.", maxPlacementGroupSize, placementGroupNames) } return provider diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_manager.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_manager.go index a4071b213792..4f5f5a127920 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_manager.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_manager.go @@ -73,9 +73,10 @@ type ImageList struct { // NodeConfig holds the configuration for a single nodepool type NodeConfig struct { - CloudInit string - Taints []apiv1.Taint - Labels map[string]string + CloudInit string + PlacementGroup string + Taints []apiv1.Taint + Labels map[string]string } // LegacyConfig holds the configuration in the legacy format diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go index 6fef8f37c5fd..99af3900bc3f 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go @@ -47,6 +47,7 @@ type hetznerNodeGroup struct { instanceType string clusterUpdateMutex *sync.Mutex + placementGroup *hcloud.PlacementGroup } type hetznerNodeGroupSpec struct { From 80b9753194725a51fb7f694155ee93a04bd7a008 Mon Sep 17 00:00:00 2001 From: Dominic Date: Mon, 4 Nov 2024 21:26:30 -0800 Subject: [PATCH 2/5] Clean up go syntax and fix conditional logic --- .../hetzner/hetzner_cloud_provider.go | 24 ++++++++++++------- .../hetzner/hetzner_node_group.go | 1 + 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go index 8bad29d5ec70..a139fedaba02 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go @@ -182,19 +182,21 @@ func (d *HetznerCloudProvider) Refresh() error { } // Check if any defined placement groups could potentially have more than the maximum allowed number of nodes -func getLargePlacementGroups(nodeGroups map[string]*hetznerNodeGroup, threshold int) []hcloud.PlacementGroup { +func getLargePlacementGroups(nodeGroups map[string]*hetznerNodeGroup, threshold int) []*hcloud.PlacementGroup { placementGroupTotals := make(map[hcloud.PlacementGroup]int) // Calculate totals for each placement group for _, nodeGroup := range nodeGroups { - if nodeGroup.placementGroup.Name != "" { // Check if placementGroup is defined - placementGroup := nodeGroup.placementGroup - placementGroupTotals[placementGroup] += nodeGroup.maxSize - } + if nodeGroup.placementGroup == nil || nodeGroup.placementGroup.Name == "" { + continue + } + + placementGroup := nodeGroup.placementGroup + placementGroupTotals[placementGroup] += nodeGroup.maxSize } // Collect placement groups with total maxSize > threshold - var largePlacementGroups []hcloud.PlacementGroup + var largePlacementGroups []*hcloud.PlacementGroup for placementGroup, totalMaxSize := range placementGroupTotals { if totalMaxSize > threshold { largePlacementGroups = append(largePlacementGroups, placementGroup) @@ -253,7 +255,13 @@ func BuildHetzner(_ config.AutoscalingOptions, do cloudprovider.NodeGroupDiscove } // If a placement group was specified, check with the API to see if it exists - if manager.clusterConfig.IsUsingNewFormat && placementGroupRef := manager.clusterConfig.NodeConfigs[spec.name].PlacementGroup; placementGroupRef != nil { + if manager.clusterConfig.IsUsingNewFormat { + + placementGroupRef := manager.clusterConfig.NodeConfigs[spec.name].PlacementGroup + + if placementGroupRef == "" { + continue + } ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -284,7 +292,7 @@ func BuildHetzner(_ config.AutoscalingOptions, do cloudprovider.NodeGroupDiscove largePlacementGroups := getLargePlacementGroups(manager.nodeGroups, maxPlacementGroupSize) // Fail if we have placement groups over the max size - if (len(largePlacementGroups) > 0) { + if len(largePlacementGroups) > 0 { // Gather placement group names var placementGroupNames string diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go index 99af3900bc3f..96378b27265f 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go @@ -445,6 +445,7 @@ func createServer(n *hetznerNodeGroup) error { EnableIPv4: n.manager.publicIPv4, EnableIPv6: n.manager.publicIPv6, }, + PlacementGroup: n.placementGroup, } if n.manager.sshKey != nil { opts.SSHKeys = []*hcloud.SSHKey{n.manager.sshKey} From 73278443f5c965901d8fd76890f7c577ef4ec05f Mon Sep 17 00:00:00 2001 From: Dominic Date: Tue, 5 Nov 2024 10:32:59 -0800 Subject: [PATCH 3/5] Run gofmt to fix whitespace --- .../cloudprovider/hetzner/hetzner_cloud_provider.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go index a139fedaba02..5c8a9648da58 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go @@ -187,9 +187,9 @@ func getLargePlacementGroups(nodeGroups map[string]*hetznerNodeGroup, threshold // Calculate totals for each placement group for _, nodeGroup := range nodeGroups { - if nodeGroup.placementGroup == nil || nodeGroup.placementGroup.Name == "" { - continue - } + if nodeGroup.placementGroup == nil || nodeGroup.placementGroup.Name == "" { + continue + } placementGroup := nodeGroup.placementGroup placementGroupTotals[placementGroup] += nodeGroup.maxSize From 49295f8e14edb4795ff946b598c0551c8976fcff Mon Sep 17 00:00:00 2001 From: Dominic Date: Tue, 5 Nov 2024 12:22:08 -0800 Subject: [PATCH 4/5] Fix bugs in function to find large placement groups --- .../hetzner/hetzner_cloud_provider.go | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go index 5c8a9648da58..aa86af25b774 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go @@ -182,28 +182,28 @@ func (d *HetznerCloudProvider) Refresh() error { } // Check if any defined placement groups could potentially have more than the maximum allowed number of nodes -func getLargePlacementGroups(nodeGroups map[string]*hetznerNodeGroup, threshold int) []*hcloud.PlacementGroup { - placementGroupTotals := make(map[hcloud.PlacementGroup]int) +func getLargePlacementGroups(nodeGroups map[string]*hetznerNodeGroup, threshold int) []int64 { + placementGroupTotals := make(map[int64]int) // Calculate totals for each placement group for _, nodeGroup := range nodeGroups { - if nodeGroup.placementGroup == nil || nodeGroup.placementGroup.Name == "" { + if nodeGroup.placementGroup == nil || nodeGroup.placementGroup.ID == 0 { continue } - placementGroup := nodeGroup.placementGroup - placementGroupTotals[placementGroup] += nodeGroup.maxSize + placementGroupID := nodeGroup.placementGroup.ID + placementGroupTotals[placementGroupID] += nodeGroup.maxSize } // Collect placement groups with total maxSize > threshold - var largePlacementGroups []*hcloud.PlacementGroup - for placementGroup, totalMaxSize := range placementGroupTotals { + var largePlacementGroupIDs []int64 + for id, totalMaxSize := range placementGroupTotals { if totalMaxSize > threshold { - largePlacementGroups = append(largePlacementGroups, placementGroup) + largePlacementGroupIDs = append(largePlacementGroupIDs, id) } } - return largePlacementGroups + return largePlacementGroupIDs } // BuildHetzner builds the Hetzner cloud provider. @@ -295,15 +295,15 @@ func BuildHetzner(_ config.AutoscalingOptions, do cloudprovider.NodeGroupDiscove if len(largePlacementGroups) > 0 { // Gather placement group names - var placementGroupNames string - for i, pg := range largePlacementGroups { + var placementGroupIDs string + for i, placementGroupID := range largePlacementGroups { if i > 0 { - placementGroupNames += ", " + placementGroupIDs += ", " } - placementGroupNames += pg.Name + placementGroupIDs += strconv.Itoa(placementGroupID) } - klog.Fatalf("The following placement groups have a potential size over the allowed maximum of %d: %s.", maxPlacementGroupSize, placementGroupNames) + klog.Fatalf("The following placement groups have a potential size over the allowed maximum of %d: %s.", maxPlacementGroupSize, placementGroupIDs) } return provider From 5958eb4199cd2f2aefbc4229ac180fdb0ca639c9 Mon Sep 17 00:00:00 2001 From: Dominic Date: Tue, 5 Nov 2024 12:37:47 -0800 Subject: [PATCH 5/5] Remove unused import and fix int64 to string conversion --- .../cloudprovider/hetzner/hetzner_cloud_provider.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go index aa86af25b774..c3f802714395 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go @@ -28,7 +28,6 @@ import ( apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" - "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/hetzner/hcloud-go/hcloud" "k8s.io/autoscaler/cluster-autoscaler/config" "k8s.io/autoscaler/cluster-autoscaler/utils/errors" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" @@ -300,7 +299,7 @@ func BuildHetzner(_ config.AutoscalingOptions, do cloudprovider.NodeGroupDiscove if i > 0 { placementGroupIDs += ", " } - placementGroupIDs += strconv.Itoa(placementGroupID) + placementGroupIDs += strconv.FormatInt(placementGroupID, 10) } klog.Fatalf("The following placement groups have a potential size over the allowed maximum of %d: %s.", maxPlacementGroupSize, placementGroupIDs)