Skip to content

Commit

Permalink
♻️ Refactors webserver's healthcheck (#2910)
Browse files Browse the repository at this point in the history
  • Loading branch information
pcrespov authored Mar 24, 2022
1 parent 5c5f2b7 commit a920852
Show file tree
Hide file tree
Showing 15 changed files with 249 additions and 89 deletions.
2 changes: 1 addition & 1 deletion .env-wb-garbage-collector
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ WEBSERVER_LOGIN=null
#WEBSERVER_RESOURCE_MANAGER from .env
WEBSERVER_SCICRUNCH=null
WEBSERVER_STATICWEB=null
# WEBSERVER_STORAGE=null
# WEBSERVER_STORAGE needed to delete data when removing anonymous service
WEBSERVER_STUDIES_DISPATCHER=null
WEBSERVER_TRACING=null
# --------
Expand Down
8 changes: 4 additions & 4 deletions api/specs/webserver/openapi-diagnostics.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ paths:
get:
tags:
- maintenance
summary: run check
operationId: check_running
summary: readiness probe for
operationId: healthcheck_readiness_probe
responses:
"200":
description: Service information
Expand All @@ -18,8 +18,8 @@ paths:
get:
tags:
- maintenance
summary: health check
operationId: check_health
summary: liveliness probe
operationId: healthcheck_liveness_probe
responses:
"200":
description: Service information
Expand Down
5 changes: 3 additions & 2 deletions services/web/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,6 @@ ARG VCS_REF

ENV SC_BUILD_TARGET=production \
SC_BOOT_MODE=production \
SC_HEALTHCHECK_INTERVAL=30 \
SC_HEALTHCHECK_RETRY=3 \
SC_BUILD_DATE=${BUILD_DATE} \
SC_VCS_URL=${VCS_URL} \
SC_VCS_REF=${VCS_REF}
Expand All @@ -145,6 +143,9 @@ RUN apt-get update \
COPY --chown=scu:scu services/web/server/docker services/web/server/docker
RUN chmod +x services/web/server/docker/*.sh


# healthcheck. NOTE: do not forget to update variable
ENV SC_HEALTHCHECK_TIMEOUT=120s
HEALTHCHECK --interval=30s \
--timeout=120s \
--start-period=30s \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ paths:
get:
tags:
- maintenance
summary: run check
operationId: check_running
summary: readiness probe for
operationId: healthcheck_readiness_probe
responses:
'200':
description: Service information
Expand Down Expand Up @@ -92,8 +92,8 @@ paths:
get:
tags:
- maintenance
summary: health check
operationId: check_health
summary: liveliness probe
operationId: healthcheck_liveness_probe
responses:
'200':
description: Service information
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
)
from pydantic import validator
from pydantic.fields import Field, ModelField
from pydantic.types import PositiveInt
from settings_library.base import BaseCustomSettings
from settings_library.email import SMTPSettings
from settings_library.postgres import PostgresSettings
Expand Down Expand Up @@ -58,8 +59,12 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings):

# @Dockerfile
SC_BOOT_MODE: Optional[BootModeEnum] = None
SC_HEALTHCHECK_INTERVAL: Optional[int] = None
SC_HEALTHCHECK_RETRY: Optional[int] = None
SC_HEALTHCHECK_TIMEOUT: Optional[PositiveInt] = Field(
None,
description="If a single run of the check takes longer than timeout seconds "
"then the check is considered to have failed."
"It takes retries consecutive failures of the health check for the container to be considered unhealthy.",
)
SC_USER_ID: Optional[int] = None
SC_USER_NAME: Optional[str] = None

Expand Down Expand Up @@ -226,6 +231,19 @@ def log_level(self) -> int:
def valid_log_level(cls, value) -> str:
return cls.validate_log_level(value)

@validator("SC_HEALTHCHECK_TIMEOUT", pre=True)
@classmethod
def get_healthcheck_timeout_in_seconds(cls, v):
# Ex. HEALTHCHECK --interval=5m --timeout=3s
if isinstance(v, str):
if v.endswith("s"):
factor = 1
elif v.endswith("m"):
factor = 60
v = v.rstrip("ms")
return int(v) * factor
return v

# HELPERS --------------------------------------------------------

def is_enabled(self, field_name: str) -> bool:
Expand Down
16 changes: 15 additions & 1 deletion services/web/server/src/simcore_service_webserver/diagnostics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,15 @@
from servicelib.aiohttp.application_setup import ModuleCategory, app_module_setup

from . import diagnostics_handlers
from .diagnostics_core import IncidentsRegistry, kINCIDENTS_REGISTRY, kPLUGIN_START_TIME
from .diagnostics_healthcheck import (
IncidentsRegistry,
assert_healthy_app,
kINCIDENTS_REGISTRY,
kPLUGIN_START_TIME,
)
from .diagnostics_monitoring import setup_monitoring
from .diagnostics_settings import DiagnosticsSettings, get_plugin_settings
from .rest import HealthCheck

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -37,6 +43,14 @@ def setup_diagnostics(
# adds middleware and /metrics
setup_monitoring(app)

# injects healthcheck
healthcheck: HealthCheck = app[HealthCheck.__name__]

async def _on_healthcheck_async_adapter(app: web.Application):
assert_healthy_app(app)

healthcheck.on_healthcheck.append(_on_healthcheck_async_adapter)

# adds other diagnostic routes: healthcheck, etc
app.router.add_routes(diagnostics_handlers.routes)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
from servicelib.utils import logged_gather

from . import catalog_client, db, director_v2_api, storage_api
from ._meta import API_VERSION, APP_NAME, __version__, api_version_prefix
from .diagnostics_core import HealthError, assert_healthy_app
from ._meta import API_VERSION, APP_NAME, api_version_prefix
from .login.decorators import login_required
from .security_decorators import permission_required
from .utils import get_task_info, get_tracemalloc_info
Expand All @@ -23,23 +22,6 @@
routes = web.RouteTableDef()


@routes.get(f"/{api_version_prefix}/health", name="check_health")
async def get_app_health(request: web.Request):
# diagnostics of incidents
try:
assert_healthy_app(request.app)
except HealthError as err:
log.error("Unhealthy application: %s", err)
raise web.HTTPServiceUnavailable()

data = {
"name": APP_NAME,
"version": __version__,
"api_version": API_VERSION,
}
return data


@routes.get(f"/{api_version_prefix}/status/diagnostics", name="get_app_diagnostics")
@login_required
@permission_required("diagnostics.read")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from servicelib.aiohttp.incidents import LimitedOrderedStack, SlowCallback

from .diagnostics_settings import get_plugin_settings
from .rest_healthcheck import HealthCheckFailed

log = logging.getLogger(__name__)

Expand All @@ -23,10 +24,6 @@
kSTART_SENSING_DELAY_SECS = f"{__name__}.start_sensing_delay"


class HealthError(Exception):
"""Service is set as unhealty"""


class IncidentsRegistry(LimitedOrderedStack[SlowCallback]):
def max_delay(self) -> float:
return self.max_item.delay_secs if self else 0
Expand Down Expand Up @@ -80,11 +77,11 @@ def is_sensing_enabled(app: web.Application):


def assert_healthy_app(app: web.Application) -> None:
"""Diagnostics function that determins whether
current application is healthy based on incidents
occured up to now
"""Diagnostics function that determines whether the current
application is healthy based on incidents probed since it was started
until now.
raises DiagnosticError if any incient detected
raises HealthCheckFailed if any incient detected
"""
settings = get_plugin_settings(app)

Expand All @@ -111,7 +108,7 @@ def assert_healthy_app(app: web.Application) -> None:
max_delay,
max_delay_allowed,
)
raise HealthError(msg)
raise HealthCheckFailed(msg)

# CRITERIA 2: Mean latency of the last N request slower than 1 sec
probe: Optional[DelayWindowProbe] = app.get(kLATENCY_PROBE)
Expand All @@ -126,6 +123,6 @@ def assert_healthy_app(app: web.Application) -> None:
)

if max_latency_allowed < latency:
raise HealthError(
raise HealthCheckFailed(
f"Last requests average latency is {latency} secs and surpasses {max_latency_allowed} secs"
)
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@
from servicelib.aiohttp.monitoring import setup_monitoring as service_lib_setup

from . import _meta
from .diagnostics_core import DelayWindowProbe, is_sensing_enabled, kLATENCY_PROBE
from .diagnostics_healthcheck import (
DelayWindowProbe,
is_sensing_enabled,
kLATENCY_PROBE,
)

log = logging.getLogger(__name__)

Expand Down
16 changes: 4 additions & 12 deletions services/web/server/src/simcore_service_webserver/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from . import rest_handlers
from ._constants import APP_OPENAPI_SPECS_KEY, APP_SETTINGS_KEY
from ._meta import API_VTAG, api_version_prefix
from .rest_healthcheck import HealthCheck
from .rest_settings import RestSettings, get_plugin_settings
from .rest_utils import get_openapi_specs_path, load_openapi_specs

Expand Down Expand Up @@ -58,20 +59,11 @@ def setup_rest(app: web.Application):
f"__version__.api_version_prefix {api_version_prefix} does not fit openapi.yml version {specs.info.version}"
)

app[HealthCheck.__name__] = HealthCheck(app)
log.debug("Setup %s", f"{app[HealthCheck.__name__]=}")

# basic routes
app.add_routes(rest_handlers.routes)
if not is_diagnostics_enabled:
# NOTE: the healthcheck route is normally in diagnostics, but
# if disabled, this plugin adds a simple version of it
app.add_routes(
[
web.get(
path=f"/{api_version_prefix}/health",
handler=rest_handlers.check_running,
name="check_health",
)
]
)

# middlewares
# NOTE: using safe get here since some tests use incomplete configs
Expand Down
58 changes: 41 additions & 17 deletions services/web/server/src/simcore_service_webserver/rest_handlers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Basic diagnostic handles to the rest API for operations
""" Basic healthckeck and configuration handles to the rest API
"""
Expand All @@ -7,29 +7,53 @@
from aiohttp import web
from servicelib.aiohttp.rest_utils import extract_and_validate

from ._meta import __version__, api_version_prefix
from ._meta import api_version_prefix
from .application_settings import APP_SETTINGS_KEY
from .rest_healthcheck import HealthCheck, HealthCheckFailed

log = logging.getLogger(__name__)

routes = web.RouteTableDef()


@routes.get(f"/{api_version_prefix}/", name="check_running")
async def check_running(_request: web.Request):
#
# - This entry point is used as a fast way
# to check that the service is still running
# - Do not do add any expensive computatio here
# - Healthcheck has been moved to diagnostics module
#
data = {
"name": __name__.split(".")[0],
"version": f"{__version__}",
"status": "SERVICE_RUNNING",
"api_version": f"{__version__}",
}
return web.json_response(data={"data": data})
@routes.get(f"/{api_version_prefix}/health", name="healthcheck_liveness_probe")
async def healthcheck_liveness_probe(request: web.Request):
"""Liveness probe: "Check if the container is alive"
This is checked by the containers orchestrator (docker swarm). When the service
is unhealthy, it will restart it so it can recover a working state.
SEE doc in rest_healthcheck.py
"""
healthcheck: HealthCheck = request.app[HealthCheck.__name__]

try:
# if slots append get too delayed, just timeout
health_report = await healthcheck.run(request.app)
except HealthCheckFailed as err:
log.warning("%s", err)
raise web.HTTPServiceUnavailable(reason="unhealthy")

return web.json_response(data={"data": health_report})


@routes.get(f"/{api_version_prefix}/", name="healthcheck_readiness_probe")
async def healthcheck_readiness_probe(request: web.Request):
"""Readiness probe: "Check if the container is ready to receive traffic"
When the target service is unhealthy, no traffic should be sent to it. Service discovery
services and load balancers (e.g. traefik) typically cut traffic from targets
in one way or another.
SEE doc in rest_healthcheck.py
"""

healthcheck: HealthCheck = request.app[HealthCheck.__name__]
health_report = healthcheck.get_app_info(request.app)
# NOTE: do NOT run healthcheck here, just return info fast.
health_report["status"] = "SERVICE_RUNNING"

return web.json_response(data={"data": health_report})


@routes.get(f"/{api_version_prefix}/config", name="get_config")
Expand Down
Loading

0 comments on commit a920852

Please sign in to comment.