Skip to content

Commit

Permalink
Fix for Pod deletion during unavailable controller
Browse files Browse the repository at this point in the history
If a Pod gets deleted, especially during GameServer Ready or Allocated
state, and the controller is either crashed, missing or unable to access
master, when the controller comes back up, the GameServer is left in a
zombie state in which it could be Allocated, but there is no Pod process
to back it.

Ideally, scenarios like this shouldn't happen, but it is possible,
depending on user interaction with Kubernetes, so we should cover the
scenario, as it requires manual intervention to fix otherwise.

This PR implements a controller that periodically checks GameServers to
ensure they have backing Pods, such that if this happens the GameServer
is marked as Unhealthy, and a Fleet can eventually return to a healed,
stable state, and not require manual intervention.

Closes googleforgames#1170
Closes googleforgames#398 (especially combined with fix for googleforgames#1245)
  • Loading branch information
markmandel committed Jan 16, 2020
1 parent b4f933b commit 62a7147
Show file tree
Hide file tree
Showing 4 changed files with 427 additions and 0 deletions.
9 changes: 9 additions & 0 deletions pkg/gameservers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ type Controller struct {
portAllocator *PortAllocator
healthController *HealthController
migrationController *MigrationController
missingPodController *MissingPodController
workerqueue *workerqueue.WorkerQueue
creationWorkerQueue *workerqueue.WorkerQueue // handles creation only
deletionWorkerQueue *workerqueue.WorkerQueue // handles deletion only
Expand Down Expand Up @@ -125,6 +126,7 @@ func NewController(
portAllocator: NewPortAllocator(minPort, maxPort, kubeInformerFactory, agonesInformerFactory),
healthController: NewHealthController(health, kubeClient, agonesClient, kubeInformerFactory, agonesInformerFactory),
migrationController: NewMigrationController(health, kubeClient, agonesClient, kubeInformerFactory, agonesInformerFactory),
missingPodController: NewMissingPodController(health, kubeClient, agonesClient, kubeInformerFactory, agonesInformerFactory),
}

c.baseLogger = runtime.NewLoggerWithType(c)
Expand Down Expand Up @@ -328,6 +330,13 @@ func (c *Controller) Run(workers int, stop <-chan struct{}) error {
}
}()

// Run the Missing Pod Controller
go func() {
if err := c.missingPodController.Run(stop); err != nil {
c.baseLogger.WithError(err).Error("error running missing pod controller")
}
}()

// start work queues
var wg sync.WaitGroup

Expand Down
8 changes: 8 additions & 0 deletions pkg/gameservers/gameservers.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,11 @@ func applyGameServerAddressAndPort(gs *agonesv1.GameServer, node *corev1.Node, p

return gs, nil
}

// isBeforePodCreated checks to see if the GameServer is in a state in which the pod could not have been
// created yet. This includes "Starting" in which a pod MAY exist, but may not yet be available, depending on when the
// informer cache updates
func isBeforePodCreated(gs *agonesv1.GameServer) bool {
state := gs.Status.State
return state == agonesv1.GameServerStatePortAllocation || state == agonesv1.GameServerStateCreating || state == agonesv1.GameServerStateStarting
}
158 changes: 158 additions & 0 deletions pkg/gameservers/missing.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
// Copyright 2020 Google LLC All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package gameservers

import (
"agones.dev/agones/pkg/apis/agones"
agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
"agones.dev/agones/pkg/client/clientset/versioned"
"agones.dev/agones/pkg/client/clientset/versioned/scheme"
getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1"
"agones.dev/agones/pkg/client/informers/externalversions"
listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
"agones.dev/agones/pkg/util/logfields"
"agones.dev/agones/pkg/util/runtime"
"agones.dev/agones/pkg/util/workerqueue"
"github.com/heptiolabs/healthcheck"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
corelisterv1 "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
)

// MissingPodController watches makes sure that any GameServer
// that isn't in Scheduled, or Unhealthy/Error state, that is missing a Pod
// moved to Unhealthy. This can sometimes happen if the controller has downtime
// for a unexpected reason, or if there is no Delete event for a Pod for any reason.
// Since resync is every 30 seconds, even if there is some time in which a GameServer
// is in a broken state, it will eventually move to Unhealthy, and get replaced (if in a Fleet).
type MissingPodController struct {
baseLogger *logrus.Entry
podSynced cache.InformerSynced
podLister corelisterv1.PodLister
gameServerSynced cache.InformerSynced
gameServerGetter getterv1.GameServersGetter
gameServerLister listerv1.GameServerLister
workerqueue *workerqueue.WorkerQueue
recorder record.EventRecorder
}

// NewMissingPodController returns a MissingPodController
func NewMissingPodController(health healthcheck.Handler,
kubeClient kubernetes.Interface,
agonesClient versioned.Interface,
kubeInformerFactory informers.SharedInformerFactory,
agonesInformerFactory externalversions.SharedInformerFactory) *MissingPodController {

podInformer := kubeInformerFactory.Core().V1().Pods().Informer()
gameServers := agonesInformerFactory.Agones().V1().GameServers()
c := &MissingPodController{
podSynced: podInformer.HasSynced,
podLister: kubeInformerFactory.Core().V1().Pods().Lister(),
gameServerSynced: gameServers.Informer().HasSynced,
gameServerGetter: agonesClient.AgonesV1(),
gameServerLister: gameServers.Lister(),
}

c.baseLogger = runtime.NewLoggerWithType(c)
c.workerqueue = workerqueue.NewWorkerQueue(c.syncGameServer, c.baseLogger, logfields.GameServerKey, agones.GroupName+".MissingPodController")
health.AddLivenessCheck("gameserver-missing-pod-workerqueue", healthcheck.Check(c.workerqueue.Healthy))

eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartLogging(c.baseLogger.Debugf)
eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
c.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "missing-pod-controller"})

gameServers.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
UpdateFunc: func(_, newObj interface{}) {
gs := newObj.(*agonesv1.GameServer)
if !isBeforePodCreated(gs) && !gs.IsBeingDeleted() &&
gs.Status.State != agonesv1.GameServerStateUnhealthy &&
gs.Status.State != agonesv1.GameServerStateError {
c.workerqueue.Enqueue(gs)
}
},
})

return c
}

// Run processes the rate limited queue.
// Will block until stop is closed
func (c *MissingPodController) Run(stop <-chan struct{}) error {
c.baseLogger.Debug("Wait for cache sync")
if !cache.WaitForCacheSync(stop, c.gameServerSynced, c.podSynced) {
return errors.New("failed to wait for caches to sync")
}

c.workerqueue.Run(1, stop)
return nil
}

func (c *MissingPodController) loggerForGameServerKey(key string) *logrus.Entry {
return logfields.AugmentLogEntry(c.baseLogger, logfields.GameServerKey, key)
}

// syncGameServer checks if a GameServer has a backing Pod, and if not,
// moves it to Unhealthy
func (c MissingPodController) syncGameServer(key string) error {
namespace, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
// don't return an error, as we don't want this retried
runtime.HandleError(c.loggerForGameServerKey(key), errors.Wrapf(err, "invalid resource key"))
return nil
}

// check if the pod exists
if pod, err := c.podLister.Pods(namespace).Get(name); err != nil {
if !k8serrors.IsNotFound(err) {
return errors.Wrapf(err, "error retrieving Pod %s from namespace %s", name, namespace)
}
c.loggerForGameServerKey(key).Debug("Pod is missing. Moving GameServer to Unhealthy.")
} else if isGameServerPod(pod) {
// if the pod exists, all is well, and we can continue on our merry way.
return nil
}

gs, err := c.gameServerLister.GameServers(namespace).Get(name)
if err != nil {
if k8serrors.IsNotFound(err) {
c.loggerForGameServerKey(key).Debug("GameServer is no longer available for syncing")
return nil
}
return errors.Wrapf(err, "error retrieving GameServer %s from namespace %s", name, namespace)
}

// already on the way out, so no need to do anything.
if gs.IsBeingDeleted() {
return nil
}

gsCopy := gs.DeepCopy()
gsCopy.Status.State = agonesv1.GameServerStateUnhealthy
if _, err = c.gameServerGetter.GameServers(gsCopy.ObjectMeta.Namespace).Update(gsCopy); err != nil {
return err
}

c.recorder.Event(gs, corev1.EventTypeWarning, string(gsCopy.Status.State), "Pod is missing")

return nil
}
Loading

0 comments on commit 62a7147

Please sign in to comment.