Skip to content

Commit

Permalink
SafeToEvict: Implement Eviction API, add SetEviction cloud product ho…
Browse files Browse the repository at this point in the history
…ok (#2857)

* SafeToEvict feature: Implement the `eviction.safe` API logic

Towards #2794: This commit implements the defaulting/validation for
the eviction.safe field, under the SafeToEvict feature gate. In
particular, it enshrines the defaulting logic for the somewhat
complicated backwards compatibility case (safe-to-evict=true is set
but SafeToEvict is not).

Along the way, refactor a very repetitive unit test table.

* Ran gen-api-docs

* Implement SafeToEvict enforcement of safe-to-evict annotations/label

This commit adds the interpretation of GameServer.Status.Eviction:

* Following the pattern in #2840, moves the enforcement of
cloudproduct specific things up to the controller, adding a new cloudproduct
SetEviction hook to handle it.
  * As a result, when the SafeToEvict feature flag is on, we disable annotation
    in gameserver.go and test that we don't set anything (as a change detection test).

* Adds a generic and GKE Autopilot version of how to set policy from the
eviction.safe status. In Autopilot, we skip using safe-to-evict=false.
  • Loading branch information
zmerlynn authored Jan 5, 2023
1 parent 73da855 commit 7329fee
Show file tree
Hide file tree
Showing 9 changed files with 1,975 additions and 1,433 deletions.
15 changes: 15 additions & 0 deletions install/helm/agones/templates/crds/_gameserverspecschema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,21 @@ properties:
title: The initial player capacity of this Game Server
minimum: 0
{{- if .featureSafeToEvict }}
eviction:
type: object
title: Eviction tolerance of the game server
properties:
safe:
type: string
title: Game server supports termination via SIGTERM
description: |
- Never: The game server should run to completion. Agones sets Pod annotation `cluster-autoscaler.kubernetes.io/safe-to-evict: "false"` and label `agones.dev/safe-to-evict: "false"`, which matches a restrictive PodDisruptionBudget.
- OnUpgrade: On SIGTERM, the game server will exit within `terminationGracePeriodSeconds` or be terminated; Agones sets Pod annotation `cluster-autoscaler.kubernetes.io/safe-to-evict: "false"`, which blocks evictions by Cluster Autoscaler. Evictions from node upgrades proceed normally.
- Always: On SIGTERM, the game server will exit within `terminationGracePeriodSeconds` or be terminated, typically within 10m; Agones sets Pod annotation `cluster-autoscaler.kubernetes.io/safe-to-evict: "true"`, which allows evictions by Cluster Autoscaler.
enum:
- Always
- OnUpgrade
- Never
immutableReplicas:
type: integer
title: Immutable count of Pods to a GameServer. Always 1. (Implementation detail of implementing the Scale subresource.)
Expand Down
9 changes: 9 additions & 0 deletions install/helm/agones/templates/crds/_gameserverstatus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,15 @@ status:
items:
type: string
{{- if .featureSafeToEvict }}
eviction:
type: object
properties:
safe:
type: string
enum:
- Always
- OnUpgrade
- Never
immutableReplicas:
type: integer
title: Immutable count of Pods to a GameServer. Always 1. (Implementation detail of implementing the Scale subresource.)
Expand Down
39 changes: 39 additions & 0 deletions pkg/apis/agones/v1/apihooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
package v1

import (
"agones.dev/agones/pkg/util/runtime"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
Expand All @@ -28,6 +30,9 @@ type APIHooks interface {

// MutateGameServerPodSpec is called by createGameServerPod to allow for product specific pod mutation.
MutateGameServerPodSpec(*GameServerSpec, *corev1.PodSpec) error

// SetEviction is called by gs.Pod to enforce GameServer.Status.Eviction.
SetEviction(EvictionSafe, *corev1.Pod) error
}

var apiHooks APIHooks = generic{}
Expand All @@ -45,3 +50,37 @@ type generic struct{}

func (generic) ValidateGameServerSpec(*GameServerSpec) []metav1.StatusCause { return nil }
func (generic) MutateGameServerPodSpec(*GameServerSpec, *corev1.PodSpec) error { return nil }

// SetEviction sets disruptions controls based on GameServer.Status.Eviction.
func (generic) SetEviction(safe EvictionSafe, pod *corev1.Pod) error {
if !runtime.FeatureEnabled(runtime.FeatureSafeToEvict) {
return nil
}
if _, exists := pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation]; !exists {
switch safe {
case EvictionSafeAlways:
pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = True
case EvictionSafeOnUpgrade, EvictionSafeNever:
// For EvictionSafeOnUpgrade and EvictionSafeNever, we block Cluster Autoscaler.
pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = False
default:
return errors.Errorf("unknown eviction.safe value %q", string(safe))
}
}
if _, exists := pod.ObjectMeta.Labels[SafeToEvictLabel]; !exists {
switch safe {
case EvictionSafeAlways, EvictionSafeOnUpgrade:
// For EvictionSafeAlways and EvictionSafeOnUpgrade, we use a label value
// that does not match the agones-gameserver-safe-to-evict-false PDB. But
// we go ahead and label it, in case someone wants to adopt custom logic
// for this group of game servers.
pod.ObjectMeta.Labels[SafeToEvictLabel] = True
case EvictionSafeNever:
// For EvictionSafeNever, match gones-gameserver-safe-to-evict-false PDB.
pod.ObjectMeta.Labels[SafeToEvictLabel] = False
default:
return errors.Errorf("unknown eviction.safe value %q", string(safe))
}
}
return nil
}
123 changes: 123 additions & 0 deletions pkg/apis/agones/v1/apihooks_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// Copyright 2022 Google LLC All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package v1

import (
"testing"

"agones.dev/agones/pkg/util/runtime"
"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestSetEviction(t *testing.T) {
runtime.FeatureTestMutex.Lock()
defer runtime.FeatureTestMutex.Unlock()

emptyPodAnd := func(f func(*corev1.Pod)) *corev1.Pod {
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{},
Labels: map[string]string{},
},
}
f(pod)
return pod
}
for desc, tc := range map[string]struct {
featureFlags string
safeToEvict EvictionSafe
pod *corev1.Pod
wantPod *corev1.Pod
}{
"SafeToEvict feature gate disabled => no change": {
featureFlags: "SafeToEvict=false",
// intentionally leave pod nil, it'll crash if anything's touched.
},
"SafeToEvict: Always, no incoming labels/annotations": {
featureFlags: "SafeToEvict=true",
safeToEvict: EvictionSafeAlways,
pod: emptyPodAnd(func(*corev1.Pod) {}),
wantPod: emptyPodAnd(func(pod *corev1.Pod) {
pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = True
pod.ObjectMeta.Labels[SafeToEvictLabel] = True
}),
},
"SafeToEvict: OnUpgrade, no incoming labels/annotations": {
featureFlags: "SafeToEvict=true",
safeToEvict: EvictionSafeOnUpgrade,
pod: emptyPodAnd(func(*corev1.Pod) {}),
wantPod: emptyPodAnd(func(pod *corev1.Pod) {
pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = False
pod.ObjectMeta.Labels[SafeToEvictLabel] = True
}),
},
"SafeToEvict: Never, no incoming labels/annotations": {
featureFlags: "SafeToEvict=true",
safeToEvict: EvictionSafeNever,
pod: emptyPodAnd(func(*corev1.Pod) {}),
wantPod: emptyPodAnd(func(pod *corev1.Pod) {
pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = False
pod.ObjectMeta.Labels[SafeToEvictLabel] = False
}),
},
"SafeToEvict: Always, incoming labels/annotations": {
featureFlags: "SafeToEvict=true",
safeToEvict: EvictionSafeAlways,
pod: emptyPodAnd(func(pod *corev1.Pod) {
pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = "just don't touch, ok?"
pod.ObjectMeta.Labels[SafeToEvictLabel] = "seriously, leave it"
}),
wantPod: emptyPodAnd(func(pod *corev1.Pod) {
pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = "just don't touch, ok?"
pod.ObjectMeta.Labels[SafeToEvictLabel] = "seriously, leave it"
}),
},
"SafeToEvict: OnUpgrade, incoming labels/annotations": {
featureFlags: "SafeToEvict=true",
safeToEvict: EvictionSafeOnUpgrade,
pod: emptyPodAnd(func(pod *corev1.Pod) {
pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = "better not touch"
pod.ObjectMeta.Labels[SafeToEvictLabel] = "not another one"
}),
wantPod: emptyPodAnd(func(pod *corev1.Pod) {
pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = "better not touch"
pod.ObjectMeta.Labels[SafeToEvictLabel] = "not another one"
}),
},
"SafeToEvict: Never, incoming labels/annotations": {
featureFlags: "SafeToEvict=true",
safeToEvict: EvictionSafeNever,
pod: emptyPodAnd(func(pod *corev1.Pod) {
pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = "a passthrough"
pod.ObjectMeta.Labels[SafeToEvictLabel] = "or is it passthru?"
}),
wantPod: emptyPodAnd(func(pod *corev1.Pod) {
pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = "a passthrough"
pod.ObjectMeta.Labels[SafeToEvictLabel] = "or is it passthru?"
}),
},
} {
t.Run(desc, func(t *testing.T) {
err := runtime.ParseFeatures(tc.featureFlags)
assert.NoError(t, err)

err = (generic{}).SetEviction(tc.safeToEvict, tc.pod)
assert.NoError(t, err)
assert.Equal(t, tc.wantPod, tc.pod)
})
}
}
87 changes: 80 additions & 7 deletions pkg/apis/agones/v1/gameserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,16 @@ const (
// This will mean that users will need to lookup what port has been opened through the server side SDK.
Passthrough PortPolicy = "Passthrough"

// EvictionSafeAlways means the game server supports termination via SIGTERM, and wants eviction signals
// from Cluster Autoscaler scaledown and node upgrades.
EvictionSafeAlways EvictionSafe = "Always"
// EvictionSafeOnUpgrade means the game server supports termination via SIGTERM, and wants eviction signals
// from node upgrades, but not Cluster Autoscaler scaledown.
EvictionSafeOnUpgrade EvictionSafe = "OnUpgrade"
// EvictionSafeNever means the game server should run to completion and may not understand SIGTERM. Eviction
// from ClusterAutoscaler and upgrades should both be blocked.
EvictionSafeNever EvictionSafe = "Never"

// ProtocolTCPUDP Protocol exposes the hostPort allocated for this container for both TCP and UDP.
ProtocolTCPUDP corev1.Protocol = "TCPUDP"

Expand Down Expand Up @@ -104,6 +114,13 @@ const (
// determine if a pod can safely be evicted to compact a cluster by moving pods between nodes
// and scaling down nodes.
PodSafeToEvictAnnotation = "cluster-autoscaler.kubernetes.io/safe-to-evict"
// SafeToEvictLabel is a label that, when "false", matches the restrictive PDB agones-gameserver-safe-to-evict-false.
SafeToEvictLabel = agones.GroupName + "/safe-to-evict"

// True is the string "true" to appease the goconst lint.
True = "true"
// False is the string "false" to appease the goconst lint.
False = "false"
)

var (
Expand Down Expand Up @@ -162,6 +179,9 @@ type GameServerSpec struct {
// (Alpha, PlayerTracking feature flag) Players provides the configuration for player tracking features.
// +optional
Players *PlayersSpec `json:"players,omitempty"`
// (Alpha, SafeToEvict feature flag) Eviction specifies the eviction tolerance of the GameServer. Defaults to "Never".
// +optional
Eviction Eviction `json:"eviction,omitempty"`
// immutableReplicas is present in gameservers.agones.dev but omitted here (it's always 1).
}

Expand All @@ -170,12 +190,25 @@ type PlayersSpec struct {
InitialCapacity int64 `json:"initialCapacity,omitempty"`
}

// Eviction specifies the eviction tolerance of the GameServer
type Eviction struct {
// (Alpha, SafeToEvict feature flag)
// Game server supports termination via SIGTERM:
// - Always: Allow eviction for both Cluster Autoscaler and node drain for upgrades
// - OnUpgrade: Allow eviction for upgrades alone
// - Never (default): Pod should run to completion
Safe EvictionSafe `json:"safe,omitempty"`
}

// GameServerState is the state for the GameServer
type GameServerState string

// PortPolicy is the port policy for the GameServer
type PortPolicy string

// EvictionSafe specified whether the game server supports termination via SIGTERM
type EvictionSafe string

// Health configures health checking on the GameServer
type Health struct {
// Disabled is whether health checking is disabled or not
Expand Down Expand Up @@ -235,6 +268,8 @@ type GameServerStatus struct {
// [FeatureFlag:PlayerTracking]
// +optional
Players *PlayerStatus `json:"players"`
// (Alpha, SafeToEvict feature flag) Eviction specifies the eviction tolerance of the GameServer.
Eviction Eviction `json:"eviction,omitempty"`
// immutableReplicas is present in gameservers.agones.dev but omitted here (it's always 1).
}

Expand Down Expand Up @@ -271,6 +306,7 @@ func (gss *GameServerSpec) ApplyDefaults() {
gss.applyContainerDefaults()
gss.applyPortDefaults()
gss.applyHealthDefaults()
gss.applyEvictionDefaults()
gss.applySchedulingDefaults()
gss.applySdkServerDefaults()
}
Expand Down Expand Up @@ -330,6 +366,8 @@ func (gs *GameServer) applyStatusDefaults() {
gs.Status.Players.Capacity = gs.Spec.Players.InitialCapacity
}
}

gs.applyEvictionStatus()
}

// applyPortDefaults applies default values for all ports
Expand All @@ -356,6 +394,25 @@ func (gss *GameServerSpec) applySchedulingDefaults() {
}
}

func (gss *GameServerSpec) applyEvictionDefaults() {
if !runtime.FeatureEnabled(runtime.FeatureSafeToEvict) {
return
}
if gss.Eviction.Safe == "" {
gss.Eviction.Safe = EvictionSafeNever
}
}

func (gs *GameServer) applyEvictionStatus() {
if !runtime.FeatureEnabled(runtime.FeatureSafeToEvict) {
return
}
gs.Status.Eviction = gs.Spec.Eviction
if gs.Spec.Template.ObjectMeta.Annotations[PodSafeToEvictAnnotation] == "true" {
gs.Status.Eviction.Safe = EvictionSafeAlways
}
}

// Validate validates the GameServerSpec configuration.
// devAddress is a specific IP address used for local Gameservers, for fleets "" is used
// If a GameServer Spec is invalid there will be > 0 values in
Expand All @@ -373,6 +430,16 @@ func (gss *GameServerSpec) Validate(devAddress string) ([]metav1.StatusCause, bo
}
}

if !runtime.FeatureEnabled(runtime.FeatureSafeToEvict) {
if gss.Eviction.Safe != "" {
causes = append(causes, metav1.StatusCause{
Type: metav1.CauseTypeFieldValueNotSupported,
Field: "eviction.safe",
Message: fmt.Sprintf("Value cannot be set unless feature flag %s is enabled", runtime.FeatureSafeToEvict),
})
}
}

if devAddress != "" {
// verify that the value is a valid IP address.
if net.ParseIP(devAddress) == nil {
Expand Down Expand Up @@ -631,6 +698,9 @@ func (gs *GameServer) Pod(sidecars ...corev1.Container) (*corev1.Pod, error) {
if err := apiHooks.MutateGameServerPodSpec(&gs.Spec, &pod.Spec); err != nil {
return nil, err
}
if err := apiHooks.SetEviction(gs.Status.Eviction.Safe, pod); err != nil {
return nil, err
}

return pod, nil
}
Expand Down Expand Up @@ -660,13 +730,16 @@ func (gs *GameServer) podObjectMeta(pod *corev1.Pod) {
ref := metav1.NewControllerRef(gs, SchemeGroupVersion.WithKind("GameServer"))
pod.ObjectMeta.OwnerReferences = append(pod.ObjectMeta.OwnerReferences, *ref)

// This means that the autoscaler cannot remove the Node that this Pod is on.
// (and evict the Pod in the process). Only set the value if it has not already
// been configured in the pod template (to not override user specified behavior).
// We only set this for packed game servers, under the assumption that if
// game servers are distributed then the cluster autoscaler isn't likely running.
if _, exists := pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation]; !exists && gs.Spec.Scheduling == apis.Packed {
pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = "false"
// When SafeToEvict=true, apiHooks.SetEviction manages disruption controls.
if !runtime.FeatureEnabled(runtime.FeatureSafeToEvict) {
// This means that the autoscaler cannot remove the Node that this Pod is on.
// (and evict the Pod in the process). Only set the value if it has not already
// been configured in the pod template (to not override user specified behavior).
// We only set this for packed game servers, under the assumption that if
// game servers are distributed then the cluster autoscaler isn't likely running.
if _, exists := pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation]; !exists && gs.Spec.Scheduling == apis.Packed {
pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = "false"
}
}

// Add Agones version into Pod Annotations
Expand Down
Loading

0 comments on commit 7329fee

Please sign in to comment.