From b4ef919a1dc62ae5a93f5f757f3f98bfa6596553 Mon Sep 17 00:00:00 2001 From: Robert Bailey Date: Wed, 28 Sep 2022 15:40:58 +0000 Subject: [PATCH] If the user has specified cluster autoscaling behavior for their gameserver then don't overwrite it. --- pkg/apis/agones/v1/gameserver.go | 17 +++-- pkg/apis/agones/v1/gameserver_test.go | 62 ++++++++++++++++++- .../Advanced/scheduling-and-autoscaling.md | 43 ++++++++++--- 3 files changed, 108 insertions(+), 14 deletions(-) diff --git a/pkg/apis/agones/v1/gameserver.go b/pkg/apis/agones/v1/gameserver.go index 2451a3dcc3..f9e7ea3ed6 100644 --- a/pkg/apis/agones/v1/gameserver.go +++ b/pkg/apis/agones/v1/gameserver.go @@ -99,6 +99,10 @@ const ( // becomes ready, so we can track when restarts should occur and when a GameServer // should be moved to Unhealthy. GameServerReadyContainerIDAnnotation = agones.GroupName + "/ready-container-id" + // PodSafeToEvictAnnotation is an annotation that the Kubernetes cluster autoscaler uses to + // determine if a pod can safely be evicted to compact a cluster by moving pods between nodes + // and scaling down nodes. + PodSafeToEvictAnnotation = "cluster-autoscaler.kubernetes.io/safe-to-evict" ) var ( @@ -632,7 +636,7 @@ func (gs *GameServer) podObjectMeta(pod *corev1.Pod) { pod.ObjectMeta.Labels = make(map[string]string, 2) } if pod.ObjectMeta.Annotations == nil { - pod.ObjectMeta.Annotations = make(map[string]string, 1) + pod.ObjectMeta.Annotations = make(map[string]string, 2) } pod.ObjectMeta.Labels[RoleLabel] = GameServerLabelRole // store the GameServer name as a label, for easy lookup later on @@ -642,10 +646,13 @@ func (gs *GameServer) podObjectMeta(pod *corev1.Pod) { ref := metav1.NewControllerRef(gs, SchemeGroupVersion.WithKind("GameServer")) pod.ObjectMeta.OwnerReferences = append(pod.ObjectMeta.OwnerReferences, *ref) - if gs.Spec.Scheduling == apis.Packed { - // This means that the autoscaler cannot remove the Node that this Pod is on. - // (and evict the Pod in the process) - pod.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false" + // This means that the autoscaler cannot remove the Node that this Pod is on. + // (and evict the Pod in the process). Only set the value if it has not already + // been configured in the pod template (to not override user specified behavior). + // We only set this for packed game servers, under the assumption that if + // game servers are distributed then the cluster autoscaler isn't likely running. + if _, exists := pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation]; !exists && gs.Spec.Scheduling == apis.Packed { + pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = "false" } // Add Agones version into Pod Annotations diff --git a/pkg/apis/agones/v1/gameserver_test.go b/pkg/apis/agones/v1/gameserver_test.go index a3b6a93c30..cd48cc0a93 100644 --- a/pkg/apis/agones/v1/gameserver_test.go +++ b/pkg/apis/agones/v1/gameserver_test.go @@ -1265,7 +1265,7 @@ func TestGameServerPodObjectMeta(t *testing.T) { gs.podObjectMeta(pod) f(t, gs, pod) - assert.Equal(t, "false", pod.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"]) + assert.Equal(t, "false", pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation]) }) t.Run("distributed", func(t *testing.T) { @@ -1276,10 +1276,68 @@ func TestGameServerPodObjectMeta(t *testing.T) { gs.podObjectMeta(pod) f(t, gs, pod) - assert.Equal(t, "", pod.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"]) + assert.Equal(t, "", pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation]) }) } +func TestGameServerPodAutoscalerAnnotations(t *testing.T) { + testCases := []struct { + description string + scheduling apis.SchedulingStrategy + setAnnotation bool + expectedAnnotation string + }{ + { + description: "Packed", + scheduling: apis.Packed, + expectedAnnotation: "false", + }, + { + description: "Distributed", + scheduling: apis.Distributed, + expectedAnnotation: "", + }, + { + description: "Packed with autoscaler annotation", + scheduling: apis.Packed, + setAnnotation: true, + expectedAnnotation: "true", + }, + { + description: "Distributed with autoscaler annotation", + scheduling: apis.Distributed, + setAnnotation: true, + expectedAnnotation: "true", + }, + } + + fixture := &GameServer{ + ObjectMeta: metav1.ObjectMeta{Name: "logan"}, + Spec: GameServerSpec{Container: "sheep"}, + } + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + gs := fixture.DeepCopy() + gs.Spec.Scheduling = tc.scheduling + if tc.setAnnotation { + gs.Spec.Template = corev1.PodTemplateSpec{ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{PodSafeToEvictAnnotation: "true"}, + }} + } + pod, err := gs.Pod() + assert.Nil(t, err, "Pod should not return an error") + assert.Equal(t, gs.ObjectMeta.Name, pod.ObjectMeta.Name) + assert.Equal(t, gs.ObjectMeta.Namespace, pod.ObjectMeta.Namespace) + assert.Equal(t, GameServerLabelRole, pod.ObjectMeta.Labels[RoleLabel]) + assert.Equal(t, "gameserver", pod.ObjectMeta.Labels[agones.GroupName+"/role"]) + assert.Equal(t, gs.ObjectMeta.Name, pod.ObjectMeta.Labels[GameServerPodLabel]) + assert.Equal(t, "sheep", pod.ObjectMeta.Annotations[GameServerContainerAnnotation]) + assert.True(t, metav1.IsControlledBy(pod, gs)) + assert.Equal(t, tc.expectedAnnotation, pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation]) + }) + } +} + func TestGameServerPodScheduling(t *testing.T) { fixture := &corev1.Pod{Spec: corev1.PodSpec{}} diff --git a/site/content/en/docs/Advanced/scheduling-and-autoscaling.md b/site/content/en/docs/Advanced/scheduling-and-autoscaling.md index 2f04fb97b1..dc9c9f06ff 100644 --- a/site/content/en/docs/Advanced/scheduling-and-autoscaling.md +++ b/site/content/en/docs/Advanced/scheduling-and-autoscaling.md @@ -35,7 +35,7 @@ or their cloud specific documentation. ## Fleet Autoscaling Fleet autoscaling is the only type of autoscaling that exists in Agones. It is currently available as a -buffer autoscaling strategy or as a webhook driven strategy, such that you can provide your own autoscaling logic. +buffer autoscaling strategy or as a webhook driven strategy, such that you can provide your own autoscaling logic. Have a look at the [Create a Fleet Autoscaler]({{< relref "../Getting Started/create-fleetautoscaler.md" >}}) quickstart, the [Create a Webhook Fleet Autoscaler]({{< relref "../Getting Started/create-webhook-fleetautoscaler.md" >}}) quickstart, @@ -58,7 +58,7 @@ when it is created. ### Fleet Scale Down Strategy -Fleet Scale Down strategy refers to the order in which the `GameServers` that belong to a `Fleet` are deleted, +Fleet Scale Down strategy refers to the order in which the `GameServers` that belong to a `Fleet` are deleted, when Fleets are shrunk in size. ## Fleet Scheduling @@ -86,7 +86,7 @@ spec: image: {{% example-image %}} ``` -This is the *default* Fleet scheduling strategy. It is designed for dynamic Kubernetes environments, wherein you wish +This is the *default* Fleet scheduling strategy. It is designed for dynamic Kubernetes environments, wherein you wish to scale up and down as load increases or decreases, such as in a Cloud environment where you are paying for the infrastructure you use. @@ -97,13 +97,42 @@ This affects the Cluster autoscaler, Allocation Scheduling, Pod Scheduling and F #### Cluster Autoscaler +{{% feature expiryVersion="1.27.0" %}} To ensure that the Cluster Autoscaler doesn't attempt to evict and move `GameServer` `Pods` onto new Nodes during gameplay, Agones adds the annotation [`"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"`](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#what-types-of-pods-can-prevent-ca-from-removing-a-node) to the backing Pod. +{{% /feature %}} + +{{% feature publishVersion="1.27.0" %}} +When using the “Packed” strategy, Agones will ensure that the Cluster Autoscaler doesn't attempt to evict and move `GameServer` `Pods` onto new Nodes during +gameplay by adding the annotation [`"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"`](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#what-types-of-pods-can-prevent-ca-from-removing-a-node) +to the backing Pod. + +However, if a gameserver can tolerate being evicted (generally in combination with setting an appropriate graceful termination period +on the gameserver pod) and you want the Cluster Autoscaler to compact your cluster by evicting game servers when it would allow the +Cluster Autoscaler to reduce the number of nodes in the cluster, then this behavior can be overridden by explicitly setting the +`"cluster-autoscaler.kubernetes.io/safe-to-evict"` annotation to `"true"` in the metadata for the game server pod, e.g. + +``` +apiVersion: "agones.dev/v1" +kind: GameServer +metadata: + name: "simple-game-server" +spec: + template: + # pod metadata. Name & Namespace is overwritten + metadata: + annotations: + cluster-autoscaler.kubernetes.io/safe-to-evict: true + spec: + containers: + - image: {{< example-image >}} +``` +{{% /feature %}} #### Allocation Scheduling Strategy -Under the "Packed" strategy, allocation will prioritise allocating `GameServers` to nodes that are running on +Under the "Packed" strategy, allocation will prioritise allocating `GameServers` to nodes that are running on Nodes that already have allocated `GameServers` running on them. #### Pod Scheduling Strategy @@ -113,13 +142,13 @@ with a `preferredDuringSchedulingIgnoredDuringExecution` affinity with [hostname topology. This attempts to group together `GameServer` Pods within as few nodes in the cluster as it can. {{< alert title="Note" color="info">}} -The default Kubernetes scheduler doesn't do a perfect job of packing, but it's a good enough job for what we need - - at least at this stage. +The default Kubernetes scheduler doesn't do a perfect job of packing, but it's a good enough job for what we need - + at least at this stage. {{< /alert >}} #### Fleet Scale Down Strategy -With the "Packed" strategy, Fleets will remove `Ready` `GameServers` from Nodes with the _least_ number of `Ready` and +With the "Packed" strategy, Fleets will remove `Ready` `GameServers` from Nodes with the _least_ number of `Ready` and `Allocated` `GameServers` on them. Attempting to empty Nodes so that they can be safely removed. ### Distributed