Skip to content

Commit

Permalink
Use monitors for health check instead of custom script: (#22951)
Browse files Browse the repository at this point in the history
- Updated `docker-compose.yml` to improve health checks for worker and web services, ensuring they are monitored effectively.
- Introduced a new management command `monitors.py` to check the health of various services, including database and web.
- Updated `Makefile-docker` to replace healthcheck with monitors
- Updated `initialize.py` to ensure the database is running before proceeding with initialization.
- Removed the obsolete `healthcheck.py` script in favor of the new monitoring approach.
- Added tests for the new monitoring functionality to ensure reliability.

TMP: better tests
  • Loading branch information
KevinMind authored Dec 17, 2024
1 parent 072ce3f commit a9aa528
Show file tree
Hide file tree
Showing 14 changed files with 377 additions and 95 deletions.
5 changes: 5 additions & 0 deletions .github/actions/run-docker/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ runs:
COMPOSE_FILE: ${{ inputs.compose_file }}
HOST_UID: ${{ steps.id.outputs.id }}
DATA_BACKUP_SKIP: ${{ inputs.data_backup_skip }}
# In CI, we should use the docker-compose wait flag to ensure
# healthchecks are passing before running any commands on the containers.
# This comes at a performance cost, but ensures containers are ready
# to accept commands before CI continues to execute.
DOCKER_WAIT: true
run: |
# Start the specified services
make up
Expand Down
1 change: 0 additions & 1 deletion Makefile-docker
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ initialize: ## ensure database exists
@echo "Initializing data..."
@echo "args: $(ARGS)"
$(PYTHON_COMMAND) ./manage.py initialize $(ARGS)
./scripts/healthcheck.py

PYTEST_SRC := src/olympia/

Expand Down
6 changes: 5 additions & 1 deletion Makefile-os
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
DOCKER_PROGRESS ?= auto
DOCKER_METADATA_FILE ?= buildx-bake-metadata.json
DOCKER_PUSH ?=
DOCKER_WAIT ?=
# Not in dot env saved,
# Docker needs these values set,
# Static, cache preserved.
Expand Down Expand Up @@ -40,11 +41,14 @@ endif

DOCKER_COMPOSE_ARGS := \
-d \
--wait \
--remove-orphans \
--no-build \
--quiet-pull \

ifneq ($(DOCKER_WAIT),)
DOCKER_COMPOSE_ARGS += --wait
endif

# Paths should be cleaned before mounting .:/data/olympia
# These are files which should be sourced from the container
# or should be fresh on every run of the project
Expand Down
11 changes: 11 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ services:
"--directory=/data/olympia/src",
"--pattern=*.py",
"--recursive",
"--no-restart-on-command-exit",
"--",
"celery -A olympia.amo.celery:app worker -E -c 2 --loglevel=INFO",
]
Expand All @@ -67,6 +68,11 @@ services:
extra_hosts:
- "olympia.test:127.0.0.1"
restart: on-failure:5
healthcheck:
test: ["CMD-SHELL", "./manage.py monitors --services celery_worker --skip-checks"]
interval: 30s
retries: 3
start_interval: 1s
depends_on:
- mysqld
- elasticsearch
Expand All @@ -80,6 +86,11 @@ services:
service: worker
command:
- uwsgi --ini /data/olympia/docker/uwsgi.ini
healthcheck:
test: ["CMD-SHELL", "./manage.py monitors --services localdev_web --skip-checks"]
interval: 30s
retries: 3
start_interval: 1s
volumes:
# Don't mount generated files. They only exist in the container
# and would otherwise be deleted by mounting the cwd volume above
Expand Down
3 changes: 3 additions & 0 deletions docker/uwsgi.ini
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ module = olympia.wsgi:application

# process-related settings
master = true
need-app = true
no-default-app = true
reload-on-exception = true
# maximum number of worker processes
processes = 4
vaccum = true
Expand Down
50 changes: 0 additions & 50 deletions scripts/healthcheck.py

This file was deleted.

57 changes: 33 additions & 24 deletions src/olympia/amo/management/commands/initialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,30 +38,39 @@ def handle(self, *args, **options):
Create the database.
"""
logging.info(f'options: {options}')
# We need to support skipping loading/seeding when desired.
# Like in CI environments where you don't want to load data every time.
if settings.DATA_BACKUP_SKIP:
logging.info(
'Skipping seeding and loading data because DATA_BACKUP_SKIP is set'
)
return
# Always ensure "olympia" database exists and is accessible.
call_command('monitors', services=['olympia_database'])

# If DB empty or we are explicitly cleaning, then bail with data_seed.
if options.get('clean') or not self.local_admin_exists():
call_command('data_seed')
return
# If we are not skipping data backup
# then run the logic to ensure the DB is ready.
if not settings.DATA_BACKUP_SKIP:
# If DB empty or we are explicitly cleaning, then bail with data_seed.
if options.get('clean') or not self.local_admin_exists():
call_command('data_seed')
# Otherwise, we're working with a pre-existing DB.
else:
load = options.get('load')
# We always migrate the DB.
logging.info('Migrating...')
call_command('migrate', '--noinput')

load = options.get('load')
# We always migrate the DB.
logging.info('Migrating...')
call_command('migrate', '--noinput')
# If we specify a specific backup, simply load that.
if load:
call_command('data_load', '--name', load)
# We should reindex even if no data is loaded/modified
# because we might have a fresh instance of elasticsearch
else:
call_command(
'reindex', '--wipe', '--force', '--noinput', '--skip-if-exists'
)

# If we specify a specific backup, simply load that.
if load:
call_command('data_load', '--name', load)
# We should reindex even if no data is loaded/modified
# because we might have a fresh instance of elasticsearch
else:
call_command(
'reindex', '--wipe', '--force', '--noinput', '--skip-if-exists'
)
# By now, we expect the database to exist, and to be migrated
# so our database tables should be accessible
call_command('monitors', services=['database'])

# Ensure any additional required dependencies are available before proceeding.
call_command(
'monitors',
services=['localdev_web', 'celery_worker', 'elastic', 'rabbitmq', 'signer'],
attempts=10,
)
60 changes: 60 additions & 0 deletions src/olympia/amo/management/commands/monitors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import time

from django.core.management.base import CommandError

import olympia.amo.monitors as monitors

from .. import BaseDataCommand


class Command(BaseDataCommand):
    """Management command that polls a set of AMO service monitors.

    The requested services are checked via ``monitors.execute_checks`` and
    re-checked with an increasing delay between attempts until either every
    service reports a healthy state or the attempts are exhausted.
    """

    help = 'Check a set of AMO service monitors.'

    def add_arguments(self, parser):
        """Register the ``--services`` and ``--attempts`` options."""
        parser.add_argument(
            '--services',
            nargs='+',
            help='List of services to check',
        )
        parser.add_argument(
            '--attempts',
            type=int,
            default=5,
            help='Number of attempts to check the services',
        )

    def handle(self, *args, **options):
        """Check the requested services, retrying while any of them fail.

        Raises:
            CommandError: if no services were specified, or if at least one
                service is still failing once all attempts have been used.
        """
        attempts = options.get('attempts')
        services = options.get('services')

        self.logger.info(f'services: {services}')

        if not services:
            raise CommandError('No services specified')

        # Until a check has actually run, every service counts as failing.
        # This also covers the case where `attempts` is zero or negative.
        failing_services = services.copy()

        current = 0

        while current < attempts:
            current += 1
            self.logger.info(f'Checking services {services} for the {current} time')
            status_summary = monitors.execute_checks(services)
            # A service is failing only when its 'state' is explicitly False.
            failing_services = [
                service
                for service, result in status_summary.items()
                if result['state'] is False
            ]

            if not failing_services:
                break

            self.logger.error('Some services are failing: %s', failing_services)

            # Back off only when another attempt will follow. Sleeping after
            # the final attempt would just delay the failure report.
            if current < attempts:
                sleep_time = round(1.618**current)
                self.logger.info(f'Sleeping for {sleep_time} seconds')
                time.sleep(sleep_time)

        if failing_services:
            raise CommandError(f'Some services are failing: {failing_services}')

        self.logger.info(f'All services are healthy {services}')
70 changes: 70 additions & 0 deletions src/olympia/amo/monitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from django.conf import settings

import celery
import MySQLdb
import requests
from django_statsd.clients import statsd
from kombu import Connection
Expand All @@ -31,6 +32,38 @@ def execute_checks(checks: list[str]):
return status_summary


def localdev_web():
    """
    Used in local development environments to determine if the web container
    is able to serve requests. The version endpoint returns a 200 status code
    and some json via the uwsgi http server.

    Returns a ``(status, None)`` tuple where ``status`` is an empty string on
    success and a human readable error message otherwise.
    """
    status = ''

    try:
        # Bound the request so an unresponsive server cannot stall the check.
        response = requests.get('http://127.0.0.1:8002/__version__', timeout=5)
    except requests.RequestException as e:
        # A refused connection or timeout (e.g. the server is still starting)
        # is reported as a failing check rather than raised to the caller.
        status = f'Failed to ping web: {e}'
        monitor_log.critical(status)
        return status, None

    if response.status_code != 200:
        status = f'Failed to ping web with status code: {response.status_code}'
        monitor_log.critical(status)
    return status, None


def celery_worker():
    """
    Used in local development environments to determine if a celery worker
    is connected: the current celery app's inspector pings the workers and
    a falsy reply is reported as a failure.

    Returns a ``(status, None)`` tuple where ``status`` is an empty string
    when the ping got a reply and an error message otherwise.
    """
    ping_replies = celery.current_app.control.inspect().ping()

    # Guard clause: any truthy reply means the check passes.
    if ping_replies:
        return '', None

    failure = 'Celery worker is not connected'
    monitor_log.critical(failure)
    return failure, None


def memcache():
memcache = getattr(settings, 'CACHES', {}).get('default')
memcache_results = []
Expand Down Expand Up @@ -190,6 +223,43 @@ def signer():
return status, signer_results


def olympia_database():
    """Check database connection by verifying the olympia database exists.

    Connects to the MySQL server described by ``settings.DATABASES['default']``
    without selecting a database, then looks the configured database name up
    with ``SHOW DATABASES``.

    Returns a ``(status, None)`` tuple where ``status`` is an empty string on
    success and an error message otherwise.

    Raises:
        ValueError: if the configured database engine is not mysql.
    """

    status = ''

    db_info = settings.DATABASES.get('default')

    # The engine is a dotted path; only its last component is compared.
    engine = db_info.get('ENGINE').split('.')[-1]

    if engine != 'mysql':
        raise ValueError('expecting mysql database engine, recieved %s' % engine)

    mysql_args = {
        'user': db_info.get('USER'),
        'passwd': db_info.get('PASSWORD'),
        'host': db_info.get('HOST'),
    }
    if db_info.get('PORT'):
        mysql_args['port'] = int(db_info.get('PORT'))

    connection = None
    try:
        connection = MySQLdb.connect(**mysql_args)

        expected_db_name = db_info.get('NAME')
        # NOTE(review): the name comes from settings, not user input, but it
        # is interpolated unescaped and LIKE treats `_`/`%` as wildcards.
        connection.query(f'SHOW DATABASES LIKE "{expected_db_name}"')
        result = connection.store_result()

        if result.num_rows() == 0:
            status = f'Database {expected_db_name} does not exist'
            monitor_log.critical(status)
    except Exception as e:
        # Any connection or query failure is reported as a failing check.
        status = f'Failed to connect to database: {e}'
        monitor_log.critical(status)
    finally:
        # Always release the connection so repeated checks do not leak it.
        if connection is not None:
            connection.close()

    return status, None


def database():
# check database connection
from olympia.addons.models import Addon
Expand Down
Loading

0 comments on commit a9aa528

Please sign in to comment.