Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GameServer container restart before Ready, move to Unhealthy state After (v2) #1099

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions pkg/apis/agones/v1/gameserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ const (
// DevAddressAnnotation is an annotation to indicate that a GameServer hosted outside of Agones.
// A locally hosted GameServer is not managed by Agones it is just simply registered.
DevAddressAnnotation = "agones.dev/dev-address"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(aside) Is there a reason that this one is prefixed with "agones.dev" instead of being agones.GroupName + "/dev-address"?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, we should fix this.

// GameServerReadyContainerIDAnnotation is an annotation that is set on the GameServer
// becomes ready, so we can track when restarts should occur and when a GameServer
// should be moved to Unhealthy.
GameServerReadyContainerIDAnnotation = agones.GroupName + "/ready-container-id"
)

var (
Expand Down Expand Up @@ -425,6 +429,25 @@ func (gs *GameServer) IsBeingDeleted() bool {
return !gs.ObjectMeta.DeletionTimestamp.IsZero() || gs.Status.State == GameServerStateShutdown
}

// IsBeforeReady returns true if the GameServer Status has yet to move to or past the Ready
// state in its lifecycle, such as Allocated or Reserved, or any of the Error/Unhealthy states
func (gs *GameServer) IsBeforeReady() bool {
switch gs.Status.State {
case GameServerStatePortAllocation:
return true
case GameServerStateCreating:
return true
case GameServerStateStarting:
return true
case GameServerStateScheduled:
return true
case GameServerStateRequestReady:
return true
}

return false
}

// FindGameServerContainer returns the container that is specified in
// gameServer.Spec.Container. Returns the index and the value.
// Returns an error if not found
Expand Down
27 changes: 27 additions & 0 deletions pkg/apis/agones/v1/gameserver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,33 @@ func TestGameServerIsDeletable(t *testing.T) {
assert.True(t, gs.IsDeletable())
}

func TestGameServerIsBeforeReady(t *testing.T) {
fixtures := []struct {
state GameServerState
expected bool
}{
{GameServerStatePortAllocation, true},
{GameServerStateCreating, true},
{GameServerStateStarting, true},
{GameServerStateScheduled, true},
{GameServerStateRequestReady, true},
{GameServerStateReady, false},
{GameServerStateShutdown, false},
{GameServerStateError, false},
{GameServerStateUnhealthy, false},
{GameServerStateReserved, false},
{GameServerStateAllocated, false},
}

for _, test := range fixtures {
t.Run(string(test.state), func(t *testing.T) {
gs := &GameServer{Status: GameServerStatus{State: test.state}}
assert.Equal(t, test.expected, gs.IsBeforeReady())
})
}

}

func TestGameServerApplyToPodGameServerContainer(t *testing.T) {
t.Parallel()

Expand Down
26 changes: 19 additions & 7 deletions pkg/gameservers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -762,26 +762,38 @@ func (c *Controller) syncGameServerRequestReadyState(gs *agonesv1.GameServer) (*

gsCopy := gs.DeepCopy()

pod, err := c.gameServerPod(gs)
// NotFound should never happen, and if it does -- something bad happened,
// so go into workerqueue backoff.
if err != nil {
return nil, err
}

// if the address hasn't been populated, and the Ready request comes
// before the controller has had a chance to do it, then
// do it here instead
addressPopulated := false
if gs.Status.NodeName == "" {
addressPopulated = true
pod, err := c.gameServerPod(gs)
// NotFound should never happen, and if it does -- something bad happened,
// so go into workerqueue backoff.
if err != nil {
return nil, err
}
gsCopy, err = c.applyGameServerAddressAndPort(gsCopy, pod)
if err != nil {
return gs, err
}
}

// track the ready gameserver container, so we can determine that after this point, we should move to Unhealthy
// if there is a container crash/restart after we move to Ready
for _, cs := range pod.Status.ContainerStatuses {
if cs.Name == gs.Spec.Container {
if _, ok := gs.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation]; !ok {
gsCopy.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation] = cs.ContainerID
}
break
}
}

gsCopy.Status.State = agonesv1.GameServerStateReady
gs, err := c.gameServerGetter.GameServers(gs.ObjectMeta.Namespace).Update(gsCopy)
gs, err = c.gameServerGetter.GameServers(gs.ObjectMeta.Namespace).Update(gsCopy)
if err != nil {
return gs, errors.Wrapf(err, "error setting Ready, Port and address on GameServer %s Status", gs.ObjectMeta.Name)
}
Expand Down
49 changes: 48 additions & 1 deletion pkg/gameservers/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -919,6 +919,7 @@ func TestControllerApplyGameServerAddressAndPort(t *testing.T) {

func TestControllerSyncGameServerRequestReadyState(t *testing.T) {
t.Parallel()
containerID := "1234"

t.Run("GameServer with ReadyRequest State", func(t *testing.T) {
c, m := newFakeController()
Expand All @@ -928,6 +929,9 @@ func TestControllerSyncGameServerRequestReadyState(t *testing.T) {
gsFixture.ApplyDefaults()
gsFixture.Status.NodeName = "node"
pod, err := gsFixture.Pod()
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
{Name: gsFixture.Spec.Container, ContainerID: containerID},
}
assert.Nil(t, err)
gsUpdated := false

Expand All @@ -939,14 +943,15 @@ func TestControllerSyncGameServerRequestReadyState(t *testing.T) {
ua := action.(k8stesting.UpdateAction)
gs := ua.GetObject().(*agonesv1.GameServer)
assert.Equal(t, agonesv1.GameServerStateReady, gs.Status.State)
assert.Equal(t, containerID, gs.Annotations[agonesv1.GameServerReadyContainerIDAnnotation])
return true, gs, nil
})

_, cancel := agtesting.StartInformers(m, c.podSynced)
defer cancel()

gs, err := c.syncGameServerRequestReadyState(gsFixture)
assert.Nil(t, err, "should not error")
assert.NoError(t, err, "should not error")
assert.True(t, gsUpdated, "GameServer wasn't updated")
assert.Equal(t, agonesv1.GameServerStateReady, gs.Status.State)
agtesting.AssertEventContains(t, m.FakeRecorder.Events, "SDK.Ready() complete")
Expand All @@ -961,6 +966,9 @@ func TestControllerSyncGameServerRequestReadyState(t *testing.T) {
pod, err := gsFixture.Pod()
assert.Nil(t, err)
pod.Spec.NodeName = nodeFixtureName
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
{Name: gsFixture.Spec.Container, ContainerID: containerID},
}
gsUpdated := false

ipFixture := "12.12.12.12"
Expand All @@ -978,6 +986,7 @@ func TestControllerSyncGameServerRequestReadyState(t *testing.T) {
ua := action.(k8stesting.UpdateAction)
gs := ua.GetObject().(*agonesv1.GameServer)
assert.Equal(t, agonesv1.GameServerStateReady, gs.Status.State)
assert.Equal(t, containerID, gs.Annotations[agonesv1.GameServerReadyContainerIDAnnotation])
return true, gs, nil
})

Expand All @@ -996,6 +1005,44 @@ func TestControllerSyncGameServerRequestReadyState(t *testing.T) {
agtesting.AssertEventContains(t, m.FakeRecorder.Events, "SDK.Ready() complete")
})

t.Run("GameServer with a GameServerReadyContainerIDAnnotation already", func(t *testing.T) {
c, m := newFakeController()

gsFixture := &agonesv1.GameServer{ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "default"},
Spec: newSingleContainerSpec(), Status: agonesv1.GameServerStatus{State: agonesv1.GameServerStateRequestReady}}
gsFixture.ApplyDefaults()
gsFixture.Status.NodeName = "node"
gsFixture.Annotations[agonesv1.GameServerReadyContainerIDAnnotation] = "4321"
pod, err := gsFixture.Pod()
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
{Name: gsFixture.Spec.Container, ContainerID: containerID},
}
assert.Nil(t, err)
gsUpdated := false

m.KubeClient.AddReactor("list", "pods", func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, &corev1.PodList{Items: []corev1.Pod{*pod}}, nil
})
m.AgonesClient.AddReactor("update", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
gsUpdated = true
ua := action.(k8stesting.UpdateAction)
gs := ua.GetObject().(*agonesv1.GameServer)
assert.Equal(t, agonesv1.GameServerStateReady, gs.Status.State)
assert.NotEqual(t, containerID, gs.Annotations[agonesv1.GameServerReadyContainerIDAnnotation])

return true, gs, nil
})

_, cancel := agtesting.StartInformers(m, c.podSynced)
defer cancel()

gs, err := c.syncGameServerRequestReadyState(gsFixture)
assert.NoError(t, err, "should not error")
assert.True(t, gsUpdated, "GameServer wasn't updated")
assert.Equal(t, agonesv1.GameServerStateReady, gs.Status.State)
agtesting.AssertEventContains(t, m.FakeRecorder.Events, "SDK.Ready() complete")
})

for _, s := range []agonesv1.GameServerState{"Unknown", agonesv1.GameServerStateUnhealthy} {
name := fmt.Sprintf("GameServer with %s state", s)
t.Run(name, func(t *testing.T) {
Expand Down
48 changes: 48 additions & 0 deletions pkg/gameservers/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,10 @@ func (hc *HealthController) syncGameServer(key string) error {
return nil
}

if skip, err := hc.skipUnhealthy(gs); err != nil || skip {
return err
}

hc.loggerForGameServer(gs).Info("Issue with GameServer pod, marking as GameServerStateUnhealthy")
gsCopy := gs.DeepCopy()
gsCopy.Status.State = agonesv1.GameServerStateUnhealthy
Expand All @@ -202,3 +206,47 @@ func (hc *HealthController) syncGameServer(key string) error {

return nil
}

// skipUnhealthy determines if it's appropriate to not move to Unhealthy when a Pod's
// gameserver container has crashed, or let it restart as per usual K8s operations.
// It does this by checking a combination of the current GameServer state and annotation data that stores
// which container instance was live if the GameServer has been marked as Ready.
// The logic is as follows:
// - If the GameServer is not yet Ready, allow to restart (return true)
// - If the GameServer is in a state past Ready, move to Unhealthy
func (hc *HealthController) skipUnhealthy(gs *agonesv1.GameServer) (bool, error) {
pod, err := hc.podLister.Pods(gs.ObjectMeta.Namespace).Get(gs.ObjectMeta.Name)
if err != nil {
// Pod doesn't exist, so the GameServer is definitely not healthy
if k8serrors.IsNotFound(err) {
return false, nil
}
// if it's something else, go back into the queue
return false, errors.Wrapf(err, "error retrieving Pod %s for GameServer to check status", gs.ObjectMeta.Name)
}
if !metav1.IsControlledBy(pod, gs) {
// This is not the Pod we are looking for 🤖
return false, nil
}
if gs.IsBeforeReady() {
return hc.failedContainer(pod), nil
}

// finally, we need to check if there a failed container happened after the gameserver was ready or before.
for _, cs := range pod.Status.ContainerStatuses {
if cs.Name == gs.Spec.Container {
if cs.State.Terminated != nil {
return false, nil
}
if cs.LastTerminationState.Terminated != nil {
// if the current container is running, and is the ready container, then we know this is some
// other pod update, and we previously had a restart before we got to being Ready, and therefore
// shouldn't move to Unhealthy.
return cs.ContainerID == gs.Annotations[agonesv1.GameServerReadyContainerIDAnnotation], nil
}
break
}
}

return false, nil
}
Loading