From 69e1a3d0733cc33119eb1bcd6dc8b8c2ebfdb019 Mon Sep 17 00:00:00 2001 From: Zach Loafman Date: Fri, 10 Feb 2023 10:59:16 -0800 Subject: [PATCH] Graduate `SafeToEvict` to Beta (#2950) Graduate `SafeToEvict` to Beta --- build/Makefile | 2 +- cloudbuild.yaml | 6 +- install/helm/agones/defaultfeaturegates.yaml | 2 +- install/yaml/install.yaml | 100 +++++++++++++++++- pkg/apis/agones/v1/gameserver_test.go | 30 +++--- pkg/util/runtime/features.go | 8 +- .../docs/Advanced/controlling-disruption.md | 5 + .../Advanced/scheduling-and-autoscaling.md | 25 ++++- site/content/en/docs/Guides/feature-stages.md | 15 +++ 9 files changed, 163 insertions(+), 30 deletions(-) diff --git a/build/Makefile b/build/Makefile index d9a85daeb8..0269406868 100644 --- a/build/Makefile +++ b/build/Makefile @@ -64,7 +64,7 @@ KIND_CONTAINER_NAME=$(KIND_PROFILE)-control-plane GS_TEST_IMAGE ?= us-docker.pkg.dev/agones-images/examples/simple-game-server:0.14 # Enable all alpha feature gates. Keep in sync with `false` (alpha) entries in pkg/util/runtime/features.go:featureDefaults -ALPHA_FEATURE_GATES ?= "PlayerAllocationFilter=true&PlayerTracking=true&ResetMetricsOnDelete=true&SafeToEvict=true&PodHostname=true&SplitControllerAndExtensions=true&CountsAndLists=true&Example=true" +ALPHA_FEATURE_GATES ?= "PlayerAllocationFilter=true&PlayerTracking=true&ResetMetricsOnDelete=true&PodHostname=true&SplitControllerAndExtensions=true&CountsAndLists=true&Example=true" # Build with Windows support WITH_WINDOWS=1 diff --git a/cloudbuild.yaml b/cloudbuild.yaml index d69e18c0ce..fb98773880 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -295,13 +295,13 @@ steps: do if [ $cloudProduct = generic ] then - featureWithGate="CustomFasSyncInterval=false&SDKGracefulTermination=false&StateAllocationFilter=false&PlayerAllocationFilter=true&PlayerTracking=true&ResetMetricsOnDelete=true&SafeToEvict=true&PodHostname=true&SplitControllerAndExtensions=true&Example=true" + 
featureWithGate="CustomFasSyncInterval=false&SafeToEvict=false&SDKGracefulTermination=false&StateAllocationFilter=false&PlayerAllocationFilter=true&PlayerTracking=true&ResetMetricsOnDelete=true&PodHostname=true&SplitControllerAndExtensions=true&Example=true" featureWithoutGate="" testClusterLocation="us-west1-c" testCluster="e2e-test-cluster" else - featureWithGate="CustomFasSyncInterval=false&SDKGracefulTermination=false&StateAllocationFilter=false&PlayerAllocationFilter=true&PlayerTracking=true&ResetMetricsOnDelete=true&SafeToEvict=true&PodHostname=true&SplitControllerAndExtensions=true&Example=true" - featureWithoutGate="SafeToEvict=true&SplitControllerAndExtensions=true" + featureWithGate="CustomFasSyncInterval=false&SafeToEvict=true&SDKGracefulTermination=false&StateAllocationFilter=false&PlayerAllocationFilter=true&PlayerTracking=true&ResetMetricsOnDelete=true&PodHostname=true&SplitControllerAndExtensions=true&Example=true" + featureWithoutGate="SplitControllerAndExtensions=true" testClusterLocation="us-west1" testCluster="gke-autopilot-e2e-test-cluster-1-24" fi diff --git a/install/helm/agones/defaultfeaturegates.yaml b/install/helm/agones/defaultfeaturegates.yaml index 7a0bc81ae0..d9a860efd5 100644 --- a/install/helm/agones/defaultfeaturegates.yaml +++ b/install/helm/agones/defaultfeaturegates.yaml @@ -16,6 +16,7 @@ # Beta features CustomFasSyncInterval: true +SafeToEvict: true SDKGracefulTermination: true StateAllocationFilter: true @@ -23,7 +24,6 @@ StateAllocationFilter: true PlayerAllocationFilter: false PlayerTracking: false ResetMetricsOnDelete: false -SafeToEvict: false PodHostname: false SplitControllerAndExtensions: false diff --git a/install/yaml/install.yaml b/install/yaml/install.yaml index cd7bb5220e..b240bd874c 100644 --- a/install/yaml/install.yaml +++ b/install/yaml/install.yaml @@ -1,4 +1,16 @@ --- +# Source: agones/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: agones-gameserver-safe-to-evict-false 
+ namespace: default +spec: + maxUnavailable: 0% + selector: + matchLabels: + agones.dev/safe-to-evict: "false" +--- # Source: agones/templates/service/allocation.yaml # Create a ServiceAccount that will be bound to the above role apiVersion: v1 @@ -5087,6 +5099,27 @@ spec: type: integer title: The initial player capacity of this Game Server minimum: 0 + eviction: + type: object + title: Eviction tolerance of the game server + properties: + safe: + type: string + title: Game server supports termination via SIGTERM + description: | + - Never: The game server should run to completion. Agones sets Pod annotation `cluster-autoscaler.kubernetes.io/safe-to-evict: "false"` and label `agones.dev/safe-to-evict: "false"`, which matches a restrictive PodDisruptionBudget. + - OnUpgrade: On SIGTERM, the game server will exit within `terminationGracePeriodSeconds` or be terminated; Agones sets Pod annotation `cluster-autoscaler.kubernetes.io/safe-to-evict: "false"`, which blocks evictions by Cluster Autoscaler. Evictions from node upgrades proceed normally. + - Always: On SIGTERM, the game server will exit within `terminationGracePeriodSeconds` or be terminated, typically within 10m; Agones sets Pod annotation `cluster-autoscaler.kubernetes.io/safe-to-evict: "true"`, which allows evictions by Cluster Autoscaler. + enum: + - Always + - OnUpgrade + - Never + immutableReplicas: + type: integer + title: Immutable count of Pods to a GameServer. Always 1. (Implementation detail of implementing the Scale subresource.) + default: 1 + minimum: 1 + maximum: 1 status: description: 'FleetStatus is the status of a Fleet. 
More info: https://agones.dev/site/docs/reference/agones_crd_api_reference/#agones.dev/v1.Fleet' @@ -10049,7 +10082,28 @@ spec: initialCapacity: type: integer title: The initial player capacity of this Game Server - minimum: 0 + minimum: 0 + eviction: + type: object + title: Eviction tolerance of the game server + properties: + safe: + type: string + title: Game server supports termination via SIGTERM + description: | + - Never: The game server should run to completion. Agones sets Pod annotation `cluster-autoscaler.kubernetes.io/safe-to-evict: "false"` and label `agones.dev/safe-to-evict: "false"`, which matches a restrictive PodDisruptionBudget. + - OnUpgrade: On SIGTERM, the game server will exit within `terminationGracePeriodSeconds` or be terminated; Agones sets Pod annotation `cluster-autoscaler.kubernetes.io/safe-to-evict: "false"`, which blocks evictions by Cluster Autoscaler. Evictions from node upgrades proceed normally. + - Always: On SIGTERM, the game server will exit within `terminationGracePeriodSeconds` or be terminated, typically within 10m; Agones sets Pod annotation `cluster-autoscaler.kubernetes.io/safe-to-evict: "true"`, which allows evictions by Cluster Autoscaler. + enum: + - Always + - OnUpgrade + - Never + immutableReplicas: + type: integer + title: Immutable count of Pods to a GameServer. Always 1. (Implementation detail of implementing the Scale subresource.) + default: 1 + minimum: 1 + maximum: 1 status: description: 'GameServerStatus is the status for a GameServer resource. More info: https://agones.dev/site/docs/reference/agones_crd_api_reference/#agones.dev/v1.GameServer' @@ -10101,6 +10155,29 @@ spec: nullable: true items: type: string + eviction: + type: object + properties: + safe: + type: string + enum: + - Always + - OnUpgrade + - Never + immutableReplicas: + type: integer + title: Immutable count of Pods to a GameServer. Always 1. (Implementation detail of implementing the Scale subresource.) 
+ default: 1 + minimum: 1 + maximum: 1 + subresources: + # scale enables the scale subresource. We can't actually scale GameServers, but this allows + # for the use of PodDisruptionBudget (PDB) without having to use a PDB per Pod. + scale: + # specReplicasPath defines the JSONPath inside of a custom resource that corresponds to Scale.Spec.Replicas. + specReplicasPath: .spec.immutableReplicas + # statusReplicasPath defines the JSONPath inside of a custom resource that corresponds to Scale.Status.Replicas. + statusReplicasPath: .status.immutableReplicas --- # Source: agones/templates/crds/gameserverallocationpolicy.yaml # Copyright 2019 Google LLC All Rights Reserved. @@ -15120,6 +15197,27 @@ spec: type: integer title: The initial player capacity of this Game Server minimum: 0 + eviction: + type: object + title: Eviction tolerance of the game server + properties: + safe: + type: string + title: Game server supports termination via SIGTERM + description: | + - Never: The game server should run to completion. Agones sets Pod annotation `cluster-autoscaler.kubernetes.io/safe-to-evict: "false"` and label `agones.dev/safe-to-evict: "false"`, which matches a restrictive PodDisruptionBudget. + - OnUpgrade: On SIGTERM, the game server will exit within `terminationGracePeriodSeconds` or be terminated; Agones sets Pod annotation `cluster-autoscaler.kubernetes.io/safe-to-evict: "false"`, which blocks evictions by Cluster Autoscaler. Evictions from node upgrades proceed normally. + - Always: On SIGTERM, the game server will exit within `terminationGracePeriodSeconds` or be terminated, typically within 10m; Agones sets Pod annotation `cluster-autoscaler.kubernetes.io/safe-to-evict: "true"`, which allows evictions by Cluster Autoscaler. + enum: + - Always + - OnUpgrade + - Never + immutableReplicas: + type: integer + title: Immutable count of Pods to a GameServer. Always 1. (Implementation detail of implementing the Scale subresource.) 
+ default: 1 + minimum: 1 + maximum: 1 status: description: 'GameServerSetStatus is the status of a GameServerSet. More info: https://agones.dev/site/docs/reference/agones_crd_api_reference/#agones.dev/v1.GameServerSet' diff --git a/pkg/apis/agones/v1/gameserver_test.go b/pkg/apis/agones/v1/gameserver_test.go index 820fbc23d3..b06cb27bf6 100644 --- a/pkg/apis/agones/v1/gameserver_test.go +++ b/pkg/apis/agones/v1/gameserver_test.go @@ -176,6 +176,8 @@ func TestGameServerApplyDefaults(t *testing.T) { GRPCPort: 9357, HTTPPort: 9358, }, + evictionSafeSpec: EvictionSafeNever, + evictionSafeStatus: EvictionSafeNever, } f(&e) return e @@ -301,21 +303,22 @@ func TestGameServerApplyDefaults(t *testing.T) { } }), }, - "SafeToEvict gate off => no SafeToEvict fields": { + "SafeToEvict gate off => no eviction.safe fields": { featureFlags: string(runtime.FeatureSafeToEvict) + "=false", gameServer: defaultGameServerAnd(func(gss *GameServerSpec) {}), - expected: wantDefaultAnd(func(e *expected) {}), + expected: wantDefaultAnd(func(e *expected) { + e.evictionSafeSpec = "" + e.evictionSafeStatus = "" + }), }, - "SafeToEvict gate on => SafeToEvict: Never": { - featureFlags: string(runtime.FeatureSafeToEvict) + "=true", - gameServer: defaultGameServerAnd(func(gss *GameServerSpec) {}), + "defaults are eviction.safe: Never": { + gameServer: defaultGameServerAnd(func(gss *GameServerSpec) {}), expected: wantDefaultAnd(func(e *expected) { e.evictionSafeSpec = EvictionSafeNever e.evictionSafeStatus = EvictionSafeNever }), }, - "SafeToEvict: Always": { - featureFlags: string(runtime.FeatureSafeToEvict) + "=true", + "eviction.safe: Always": { gameServer: defaultGameServerAnd(func(gss *GameServerSpec) { gss.Eviction.Safe = EvictionSafeAlways }), @@ -324,8 +327,7 @@ func TestGameServerApplyDefaults(t *testing.T) { e.evictionSafeStatus = EvictionSafeAlways }), }, - "SafeToEvict: OnUpgrade": { - featureFlags: string(runtime.FeatureSafeToEvict) + "=true", + "eviction.safe: OnUpgrade": { 
gameServer: defaultGameServerAnd(func(gss *GameServerSpec) { gss.Eviction.Safe = EvictionSafeOnUpgrade }), @@ -334,8 +336,7 @@ func TestGameServerApplyDefaults(t *testing.T) { e.evictionSafeStatus = EvictionSafeOnUpgrade }), }, - "SafeToEvict: Never": { - featureFlags: string(runtime.FeatureSafeToEvict) + "=true", + "eviction.safe: Never": { gameServer: defaultGameServerAnd(func(gss *GameServerSpec) { gss.Eviction.Safe = EvictionSafeNever }), @@ -344,8 +345,7 @@ func TestGameServerApplyDefaults(t *testing.T) { e.evictionSafeStatus = EvictionSafeNever }), }, - "SafeToEvict: Always inferred from safe-to-evict=true": { - featureFlags: string(runtime.FeatureSafeToEvict) + "=true", + "eviction.safe: Always inferred from safe-to-evict=true": { gameServer: defaultGameServerAnd(func(gss *GameServerSpec) { gss.Template.ObjectMeta.Annotations = map[string]string{PodSafeToEvictAnnotation: "true"} }), @@ -355,7 +355,6 @@ func TestGameServerApplyDefaults(t *testing.T) { }), }, "Nothing inferred from safe-to-evict=false": { - featureFlags: string(runtime.FeatureSafeToEvict) + "=true", gameServer: defaultGameServerAnd(func(gss *GameServerSpec) { gss.Template.ObjectMeta.Annotations = map[string]string{PodSafeToEvictAnnotation: "false"} }), @@ -364,8 +363,7 @@ func TestGameServerApplyDefaults(t *testing.T) { e.evictionSafeStatus = EvictionSafeNever }), }, - "safe-to-evict=false AND SafeToEvict: Always => SafeToEvict: Always": { - featureFlags: string(runtime.FeatureSafeToEvict) + "=true", + "safe-to-evict=false AND eviction.safe: Always => eviction.safe: Always": { gameServer: defaultGameServerAnd(func(gss *GameServerSpec) { gss.Eviction.Safe = EvictionSafeAlways gss.Template.ObjectMeta.Annotations = map[string]string{PodSafeToEvictAnnotation: "false"} diff --git a/pkg/util/runtime/features.go b/pkg/util/runtime/features.go index db8fec5a83..4e54f0687c 100644 --- a/pkg/util/runtime/features.go +++ b/pkg/util/runtime/features.go @@ -34,6 +34,9 @@ const ( // 
FeatureCustomFasSyncInterval is a feature flag that enables a custom FleetAutoscaler resync interval FeatureCustomFasSyncInterval Feature = "CustomFasSyncInterval" + // FeatureSafeToEvict enables the `SafeToEvict` API to specify disruption tolerance. + FeatureSafeToEvict Feature = "SafeToEvict" + // FeatureSDKGracefulTermination is a feature flag that enables SDK to support gracefulTermination FeatureSDKGracefulTermination Feature = "SDKGracefulTermination" @@ -54,9 +57,6 @@ const ( // relevant metric views to reset their state immediately when an Agones resource is deleted. FeatureResetMetricsOnDelete Feature = "ResetMetricsOnDelete" - // FeatureSafeToEvict enables the `SafeToEvict` API to specify disruption tolerance. - FeatureSafeToEvict Feature = "SafeToEvict" - // FeaturePodHostname enables the Pod Hostname being assigned the name of the GameServer FeaturePodHostname = "PodHostname" @@ -106,6 +106,7 @@ var ( featureDefaults = map[Feature]bool{ // Beta features FeatureCustomFasSyncInterval: true, + FeatureSafeToEvict: true, FeatureSDKGracefulTermination: true, FeatureStateAllocationFilter: true, @@ -113,7 +114,6 @@ var ( FeaturePlayerAllocationFilter: false, FeaturePlayerTracking: false, FeatureResetMetricsOnDelete: false, - FeatureSafeToEvict: false, FeaturePodHostname: false, FeatureSplitControllerAndExtensions: false, diff --git a/site/content/en/docs/Advanced/controlling-disruption.md b/site/content/en/docs/Advanced/controlling-disruption.md index ec384551d2..61ba72d724 100644 --- a/site/content/en/docs/Advanced/controlling-disruption.md +++ b/site/content/en/docs/Advanced/controlling-disruption.md @@ -12,7 +12,12 @@ description: > By default, Agones assumes your game server should never be disrupted voluntarily and configures the `Pod` appropriately - but this isn't always the ideal setting. 
Here we discuss how Agones allows you to control the two most significant sources of voluntary `Pod` evictions, node upgrades and Cluster Autoscaler, using the `eviction` API on the `GameServer` object. +{{% feature publishVersion="1.30.0" %}} +{{< beta title="`eviction` API" gate="SafeToEvict" >}} +{{% /feature %}} +{{% feature expiryVersion="1.30.0" %}} {{< alpha title="`eviction` API" gate="SafeToEvict" >}} +{{% /feature %}} ## Benefits of Allowing Voluntary Disruption diff --git a/site/content/en/docs/Advanced/scheduling-and-autoscaling.md b/site/content/en/docs/Advanced/scheduling-and-autoscaling.md index 498094adec..00c1864507 100644 --- a/site/content/en/docs/Advanced/scheduling-and-autoscaling.md +++ b/site/content/en/docs/Advanced/scheduling-and-autoscaling.md @@ -97,17 +97,33 @@ This affects the Cluster autoscaler, Allocation Scheduling, Pod Scheduling and F #### Cluster Autoscaler +{{% feature publishVersion="1.30.0" %}} +When using the "Packed" strategy, Agones will ensure that the Cluster Autoscaler doesn't attempt to evict and move `GameServer` `Pods` onto new Nodes during +gameplay. + +{{< beta title="`eviction` API" gate="SafeToEvict" >}} + +If a gameserver can tolerate [being evicted](https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/#how-api-initiated-eviction-works) +(generally in combination with setting an appropriate graceful termination period on the gameserver pod) and you +want the Cluster Autoscaler to compact your cluster by evicting game servers when it would allow the Cluster +Autoscaler to reduce the number of nodes in the cluster, [Controlling Disruption]({{< relref "controlling-disruption.md" >}}) describes +how to choose the `.eviction` setting appropriate for your `GameServer` or `Fleet`. 
+{{% /feature %}} + +{{% feature expiryVersion="1.30.0" %}} When using the “Packed” strategy, Agones will ensure that the Cluster Autoscaler doesn't attempt to evict and move `GameServer` `Pods` onto new Nodes during gameplay by adding the annotation [`"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"`](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#what-types-of-pods-can-prevent-ca-from-removing-a-node) to the backing Pod. - -{{< alert title="SafeToEvict Feature Gate" color="info" >}} +{{% /feature %}} +{{< feature expiryVersion="1.30.0" >}} +{{% alert title="SafeToEvict Feature Gate" color="info" %}} The [Alpha]({{< ref "/docs/Guides/feature-stages.md#alpha" >}}) `SafeToEvict` feature allows [controlling disruption]({{< relref "controlling-disruption.md" >}}) in a more holistic way. Please consider enabling `SafeToEvict` and using the new `eviction` API - we welcome your early feedback! -{{< /alert >}} - +{{% /alert %}} +{{< /feature >}} +{{% feature expiryVersion="1.30.0" %}} However, if a gameserver can tolerate [being evicted](https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/#how-api-initiated-eviction-works) (generally in combination with setting an appropriate graceful termination period on the gameserver pod) and you want the Cluster Autoscaler to compact your cluster by evicting game servers when it would allow the Cluster @@ -156,6 +172,7 @@ spec: # grace period for terminating the game server safely. 
terminationGracePeriodSeconds: 300 ``` +{{% /feature %}} #### Allocation Scheduling Strategy diff --git a/site/content/en/docs/Guides/feature-stages.md b/site/content/en/docs/Guides/feature-stages.md index a64aeec7ba..08860b07d3 100644 --- a/site/content/en/docs/Guides/feature-stages.md +++ b/site/content/en/docs/Guides/feature-stages.md @@ -24,6 +24,20 @@ that can be found in the [Helm configuration]({{< ref "/docs/Installation/Instal The current set of `alpha` and `beta` feature gates: +{{% feature publishVersion="1.30.0" %}} +| Feature Name | Gate | Default | Stage | Since | +|-----------------------------------------------------------------------------------------------------------------------|--------------------------|----------|---------|--------| +| [Custom resync period for FleetAutoscaler](https://github.com/googleforgames/agones/issues/1955) | `CustomFasSyncInterval` | Enabled | `Beta` | 1.25.0 | +| [GameServer `eviction` API](https://github.com/googleforgames/agones/issues/2794) | `SafeToEvict` | Enabled | `Beta` | 1.30.0 | +| [Graceful Termination for GameServer SDK](https://github.com/googleforgames/agones/pull/2205) | `SDKGracefulTermination` | Enabled | `Beta` | 1.18.0 | +| [GameServer state filtering on GameServerAllocations](https://github.com/googleforgames/agones/issues/1239) | `StateAllocationFilter` | Enabled | `Beta` | 1.26.0 | +| [GameServer player capacity filtering on GameServerAllocations](https://github.com/googleforgames/agones/issues/1239) | `PlayerAllocationFilter` | Disabled | `Alpha` | 1.14.0 | +| [Player Tracking]({{< ref "/docs/Guides/player-tracking.md" >}}) | `PlayerTracking` | Disabled | `Alpha` | 1.6.0 | +| [Reset Metric Export on Fleet / Autoscaler deletion]({{% relref "./metrics.md#dropping-metric-labels" %}}) | `ResetMetricsOnDelete` | Disabled | `Alpha` | 1.26.0 | +| [GameServer Stable Network ID]({{% ref "/docs/Reference/gameserver.md#stable-network-id" %}}) | `PodHostname` | Disabled | `Alpha` | 1.29.0 | +| Example Gate 
(not in use) | `Example` | Disabled | None | 0.13.0 | +{{% /feature %}} +{{% feature expiryVersion="1.30.0" %}} | Feature Name | Gate | Default | Stage | Since | |-----------------------------------------------------------------------------------------------------------------------|--------------------------|----------|---------|--------| | [Custom resync period for FleetAutoscaler](https://github.com/googleforgames/agones/issues/1955) | `CustomFasSyncInterval` | Enabled | `Beta` | 1.25.0 | @@ -35,6 +49,7 @@ The current set of `alpha` and `beta` feature gates: | [GameServer Stable Network ID]({{% ref "/docs/Reference/gameserver.md#stable-network-id" %}}) | `PodHostname` | Disabled | `Alpha` | 1.29.0 | | [GameServer `SafeToEvict` API](https://github.com/googleforgames/agones/issues/2794) | `SafeToEvict` | Disabled | `Alpha` | 1.29.0 | | Example Gate (not in use) | `Example` | Disabled | None | 0.13.0 | +{{% /feature %}} {{< alert title="Note" color="info" >}} If you aren't sure if Feature Flags have been set correctly, have a look at the