forked from googleforgames/agones
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix for Pod deletion during unavailable controller
If a Pod gets deleted, especially during GameServer Ready or Allocated state, and the controller is either crashed, missing or unable to access master, when the controller comes back up, the GameServer is left in a zombie state in which it could be Allocated, but there is no Pod process to back it. Ideally, scenarios like this shouldn't happen, but it is possible, depending on user interaction with Kubernetes, so we should cover the scenario, as it requires manual intervention to fix otherwise. This PR implements a controller that periodically checks GameServers to ensure they have backing Pods, such that if this happens the GameServer is marked as Unhealthy, and a Fleet can eventually return to a healed, stable state, and not require manual intervention. There is no e2e test, as I couldn't work out a viable way to break the Agones controller, and then bring is back reliably. Closes googleforgames#1170 Closes googleforgames#398 (especially combined with fix for googleforgames#1245)
- Loading branch information
1 parent
a07d3cf
commit 46afcac
Showing
6 changed files
with
472 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
// Copyright 2020 Google LLC All Rights Reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package gameservers | ||
|
||
import ( | ||
"agones.dev/agones/pkg/apis/agones" | ||
agonesv1 "agones.dev/agones/pkg/apis/agones/v1" | ||
"agones.dev/agones/pkg/client/clientset/versioned" | ||
"agones.dev/agones/pkg/client/clientset/versioned/scheme" | ||
getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1" | ||
"agones.dev/agones/pkg/client/informers/externalversions" | ||
listerv1 "agones.dev/agones/pkg/client/listers/agones/v1" | ||
"agones.dev/agones/pkg/util/logfields" | ||
"agones.dev/agones/pkg/util/runtime" | ||
"agones.dev/agones/pkg/util/workerqueue" | ||
"github.com/heptiolabs/healthcheck" | ||
"github.com/pkg/errors" | ||
"github.com/sirupsen/logrus" | ||
corev1 "k8s.io/api/core/v1" | ||
k8serrors "k8s.io/apimachinery/pkg/api/errors" | ||
"k8s.io/client-go/informers" | ||
"k8s.io/client-go/kubernetes" | ||
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" | ||
corelisterv1 "k8s.io/client-go/listers/core/v1" | ||
"k8s.io/client-go/tools/cache" | ||
"k8s.io/client-go/tools/record" | ||
) | ||
|
||
// MissingPodController watches makes sure that any GameServer | ||
// that isn't in Scheduled, or Unhealthy/Error state, that is missing a Pod | ||
// moved to Unhealthy. This can sometimes happen if the controller has downtime | ||
// for a unexpected reason, or if there is no Delete event for a Pod for any reason. | ||
// Since resync is every 30 seconds, even if there is some time in which a GameServer | ||
// is in a broken state, it will eventually move to Unhealthy, and get replaced (if in a Fleet). | ||
type MissingPodController struct { | ||
baseLogger *logrus.Entry | ||
podSynced cache.InformerSynced | ||
podLister corelisterv1.PodLister | ||
gameServerSynced cache.InformerSynced | ||
gameServerGetter getterv1.GameServersGetter | ||
gameServerLister listerv1.GameServerLister | ||
workerqueue *workerqueue.WorkerQueue | ||
recorder record.EventRecorder | ||
} | ||
|
||
// NewMissingPodController returns a MissingPodController | ||
func NewMissingPodController(health healthcheck.Handler, | ||
kubeClient kubernetes.Interface, | ||
agonesClient versioned.Interface, | ||
kubeInformerFactory informers.SharedInformerFactory, | ||
agonesInformerFactory externalversions.SharedInformerFactory) *MissingPodController { | ||
|
||
podInformer := kubeInformerFactory.Core().V1().Pods().Informer() | ||
gameServers := agonesInformerFactory.Agones().V1().GameServers() | ||
c := &MissingPodController{ | ||
podSynced: podInformer.HasSynced, | ||
podLister: kubeInformerFactory.Core().V1().Pods().Lister(), | ||
gameServerSynced: gameServers.Informer().HasSynced, | ||
gameServerGetter: agonesClient.AgonesV1(), | ||
gameServerLister: gameServers.Lister(), | ||
} | ||
|
||
c.baseLogger = runtime.NewLoggerWithType(c) | ||
c.workerqueue = workerqueue.NewWorkerQueue(c.syncGameServer, c.baseLogger, logfields.GameServerKey, agones.GroupName+".MissingPodController") | ||
health.AddLivenessCheck("gameserver-missing-pod-workerqueue", healthcheck.Check(c.workerqueue.Healthy)) | ||
|
||
eventBroadcaster := record.NewBroadcaster() | ||
eventBroadcaster.StartLogging(c.baseLogger.Debugf) | ||
eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) | ||
c.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "missing-pod-controller"}) | ||
|
||
gameServers.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ | ||
UpdateFunc: func(_, newObj interface{}) { | ||
gs := newObj.(*agonesv1.GameServer) | ||
_, isDev := gs.GetDevAddress() | ||
// TOXO: test for dev gameserver, which won't have a pod | ||
if !isDev && !isBeforePodCreated(gs) && !gs.IsBeingDeleted() && !gs.IsUnhealthy() { | ||
c.workerqueue.Enqueue(gs) | ||
} | ||
}, | ||
}) | ||
|
||
return c | ||
} | ||
|
||
// Run processes the rate limited queue. | ||
// Will block until stop is closed | ||
func (c *MissingPodController) Run(stop <-chan struct{}) error { | ||
c.baseLogger.Debug("Wait for cache sync") | ||
if !cache.WaitForCacheSync(stop, c.gameServerSynced, c.podSynced) { | ||
return errors.New("failed to wait for caches to sync") | ||
} | ||
|
||
c.workerqueue.Run(1, stop) | ||
return nil | ||
} | ||
|
||
func (c *MissingPodController) loggerForGameServerKey(key string) *logrus.Entry { | ||
return logfields.AugmentLogEntry(c.baseLogger, logfields.GameServerKey, key) | ||
} | ||
|
||
// syncGameServer checks if a GameServer has a backing Pod, and if not, | ||
// moves it to Unhealthy | ||
func (c *MissingPodController) syncGameServer(key string) error { | ||
namespace, name, err := cache.SplitMetaNamespaceKey(key) | ||
if err != nil { | ||
// don't return an error, as we don't want this retried | ||
runtime.HandleError(c.loggerForGameServerKey(key), errors.Wrapf(err, "invalid resource key")) | ||
return nil | ||
} | ||
|
||
// check if the pod exists | ||
if pod, err := c.podLister.Pods(namespace).Get(name); err != nil { | ||
if !k8serrors.IsNotFound(err) { | ||
return errors.Wrapf(err, "error retrieving Pod %s from namespace %s", name, namespace) | ||
} | ||
c.loggerForGameServerKey(key).Debug("Pod is missing. Moving GameServer to Unhealthy.") | ||
} else if isGameServerPod(pod) { | ||
// if the pod exists, all is well, and we can continue on our merry way. | ||
return nil | ||
} | ||
|
||
gs, err := c.gameServerLister.GameServers(namespace).Get(name) | ||
if err != nil { | ||
if k8serrors.IsNotFound(err) { | ||
c.loggerForGameServerKey(key).Debug("GameServer is no longer available for syncing") | ||
return nil | ||
} | ||
return errors.Wrapf(err, "error retrieving GameServer %s from namespace %s", name, namespace) | ||
} | ||
|
||
// already on the way out, so no need to do anything. | ||
if gs.IsBeingDeleted() || gs.IsUnhealthy() { | ||
return nil | ||
} | ||
|
||
gsCopy := gs.DeepCopy() | ||
gsCopy.Status.State = agonesv1.GameServerStateUnhealthy | ||
gs, err = c.gameServerGetter.GameServers(gsCopy.ObjectMeta.Namespace).Update(gsCopy) | ||
if err != nil { | ||
return errors.Wrap(err, "error updating GameServer to Unhealthy") | ||
} | ||
|
||
c.recorder.Event(gs, corev1.EventTypeWarning, string(gs.Status.State), "Pod is missing") | ||
return nil | ||
} |
Oops, something went wrong.