diff --git a/CHANGELOG.md b/CHANGELOG.md index 893ff52f6..c1e650484 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,8 @@ Dependencies are updated to the latest available version during each release. Th - Return a status code of 500 instead of 403 for server-side errors during login. - Errors in querying an external source of user information, such as Firestore or LDAP, are now caught in the `/auth` route and only logged, not reported to Slack as uncaught exceptions. The `/auth` route may receive multiple requests per second and should not report every error due to a possible external outage to Slack. - Errors in querying an external source of user information in the `/auth/api/v1/user-info` route are now caught, reported to Slack, and result in an orderly error message instead of an uncaught exception. +- Set a timeout on Kubernetes watches in the Kubernetes operator to work around a Kubernetes server bug where watches of unlimited duration will sometimes go silent and stop receiving events. +- Mark Kubernetes object parsing failures as Kopf permanent failures so that the same version of the object will not be retried. Mark Kubernetes API failures as temporary failures so that the retry schedule is configurable. ### Other changes diff --git a/src/gafaelfawr/constants.py b/src/gafaelfawr/constants.py index 85a64af43..e5d67a745 100644 --- a/src/gafaelfawr/constants.py +++ b/src/gafaelfawr/constants.py @@ -43,6 +43,20 @@ HTTP_TIMEOUT = 20.0 """Timeout (in seconds) for outbound HTTP requests to auth providers.""" +KUBERNETES_WATCH_TIMEOUT = 10 * 60 +"""Timeout (in seconds) for the Kubernetes operator watch operation. + +If this is not set, Kopf attempts to connect without a timeout. This sometimes +triggers a bug in Kubernetes where the server stops responding without closing +the connection (see https://github.com/nolar/kopf/issues/585). Instead, set an +explicit timeout. 
+ +This is the timeout sent to the Kubernetes server and is supposed to be +handled on the server side. A client-side timeout will be set for one minute +longer than this timeout in case the server doesn't handle its timeout +properly. +""" + KUBERNETES_TIMER_DELAY = 5 """How long (in seconds) to delay timers after startup and changes. diff --git a/src/gafaelfawr/exceptions.py b/src/gafaelfawr/exceptions.py index 902d8963f..b43dfe7b6 100644 --- a/src/gafaelfawr/exceptions.py +++ b/src/gafaelfawr/exceptions.py @@ -357,11 +357,11 @@ class LDAPError(ExternalUserInfoError): """User or group information in LDAP was invalid or LDAP calls failed.""" -class KubernetesError(Exception): - """An error occurred during Kubernetes secret processing.""" +class KubernetesError(kopf.TemporaryError): + """An error occurred performing a Kubernetes operation.""" -class KubernetesObjectError(KubernetesError): +class KubernetesObjectError(kopf.PermanentError): """A Kubernetes object could not be parsed. Parameters diff --git a/src/gafaelfawr/operator/startup.py b/src/gafaelfawr/operator/startup.py index 076279d21..8275bbd8c 100644 --- a/src/gafaelfawr/operator/startup.py +++ b/src/gafaelfawr/operator/startup.py @@ -9,6 +9,7 @@ from safir.database import create_database_engine from safir.kubernetes import initialize_kubernetes +from ..constants import KUBERNETES_WATCH_TIMEOUT from ..dependencies.config import config_dependency from ..factory import Factory @@ -16,13 +17,15 @@ @kopf.on.startup() -async def startup(memo: kopf.Memo, **_: Any) -> None: +async def startup( + memo: kopf.Memo, settings: kopf.OperatorSettings, **_: Any +) -> None: """Initialize global data for Kubernetes operators. Anything stored in the provided ``memo`` argument will be made available, - via shallow copy, in the ``memo`` argument to any other handler. Use this + via shallow copy, in the ``memo`` argument to any other handler. 
Use this to initialize the database and Redis pools, create service objects, and so - forth. + forth. Also configure the Kopf watch timeouts. Parameters ---------- @@ -30,7 +33,12 @@ async def startup(memo: kopf.Memo, **_: Any) -> None: Holds global state, used to store the service objects and the various infrastructure used to create them, and which needs to be freed cleanly during shutdown. + settings + Kopf operator settings; the watch timeouts are set on this object. """ + settings.watching.server_timeout = KUBERNETES_WATCH_TIMEOUT + settings.watching.client_timeout = KUBERNETES_WATCH_TIMEOUT + 60 + config = await config_dependency() await initialize_kubernetes()