From a9aa5286eb34e9edb0bcd42fdb113d19fc0a0683 Mon Sep 17 00:00:00 2001 From: Kevin Meinhardt Date: Tue, 17 Dec 2024 19:29:24 +0100 Subject: [PATCH] Use monitors for health check instead of custom script: (#22951) - Updated `docker-compose.yml` to improve health checks for worker and web services, ensuring they are monitored effectively. - Introduced a new management command `monitors.py` to check the health of various services, including database and web. - Updated `Makefile-docker` to replace healthcheck with monitors - Updated `initialize.py` to ensure the database is running before proceeding with initialization. - Removed the obsolete `healthcheck.py` script in favor of the new monitoring approach. - Added tests for the new monitoring functionality to ensure reliability. TMP: better tests --- .github/actions/run-docker/action.yml | 5 + Makefile-docker | 1 - Makefile-os | 6 +- docker-compose.yml | 11 +++ docker/uwsgi.ini | 3 + scripts/healthcheck.py | 50 ---------- .../amo/management/commands/initialize.py | 57 ++++++----- .../amo/management/commands/monitors.py | 60 ++++++++++++ src/olympia/amo/monitors.py | 70 ++++++++++++++ src/olympia/amo/tests/test_commands.py | 96 +++++++++++++++++-- src/olympia/amo/tests/test_monitor.py | 71 +++++++++++++- src/olympia/core/apps.py | 28 ++++-- src/olympia/core/tests/test_apps.py | 9 ++ tests/make/make.spec.js | 5 +- 14 files changed, 377 insertions(+), 95 deletions(-) delete mode 100755 scripts/healthcheck.py create mode 100644 src/olympia/amo/management/commands/monitors.py diff --git a/.github/actions/run-docker/action.yml b/.github/actions/run-docker/action.yml index cf7bafaa3179..c0ccd77c1de2 100644 --- a/.github/actions/run-docker/action.yml +++ b/.github/actions/run-docker/action.yml @@ -40,6 +40,11 @@ runs: COMPOSE_FILE: ${{ inputs.compose_file }} HOST_UID: ${{ steps.id.outputs.id }} DATA_BACKUP_SKIP: ${{ inputs.data_backup_skip }} + # In CI, we should use the docker-compose wait flag to ensure + # healthchecks are passing before running any commands on the containers. + # This comes at a performance cost, but ensures containers are ready + # to accept commands before CI continues to execute. + DOCKER_WAIT: true run: | # Start the specified services make up diff --git a/Makefile-docker b/Makefile-docker index 93caecb2141b..00d3cee0ba71 100644 --- a/Makefile-docker +++ b/Makefile-docker @@ -108,7 +108,6 @@ initialize: ## ensure database exists @echo "Initializing data..." @echo "args: $(ARGS)" $(PYTHON_COMMAND) ./manage.py initialize $(ARGS) - ./scripts/healthcheck.py PYTEST_SRC := src/olympia/ diff --git a/Makefile-os b/Makefile-os index ccb332febf18..98c7e6d89085 100644 --- a/Makefile-os +++ b/Makefile-os @@ -7,6 +7,7 @@ DOCKER_PROGRESS ?= auto DOCKER_METADATA_FILE ?= buildx-bake-metadata.json DOCKER_PUSH ?= +DOCKER_WAIT ?= # Not in dot env saved, # Docker needs these values set, # Static, cache preserved. @@ -40,11 +41,14 @@ endif DOCKER_COMPOSE_ARGS := \ -d \ - --wait \ --remove-orphans \ --no-build \ --quiet-pull \ +ifneq ($(DOCKER_WAIT),) + DOCKER_COMPOSE_ARGS += --wait +endif + # Paths should be cleaned before mounting .:/data/olympia # These are files which should be sourced from the container # or should be fresh on every run of the project diff --git a/docker-compose.yml b/docker-compose.yml index 6424258c4489..d3efb78d21e5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -59,6 +59,7 @@ services: "--directory=/data/olympia/src", "--pattern=*.py", "--recursive", + "--no-restart-on-command-exit", "--", "celery -A olympia.amo.celery:app worker -E -c 2 --loglevel=INFO", ] @@ -67,6 +68,11 @@ services: extra_hosts: - "olympia.test:127.0.0.1" restart: on-failure:5 + healthcheck: + test: ["CMD-SHELL", "./manage.py monitors --services celery_worker --skip-checks"] + interval: 30s + retries: 3 + start_interval: 1s depends_on: - mysqld - elasticsearch @@ -80,6 +86,11 @@ services: service: worker command: - uwsgi --ini /data/olympia/docker/uwsgi.ini + healthcheck: + test: ["CMD-SHELL", "./manage.py monitors --services localdev_web --skip-checks"] + interval: 30s + retries: 3 + start_interval: 1s volumes: # Don't mount generated files. They only exist in the container # and would otherwiser be deleted by mounting the cwd volume above diff --git a/docker/uwsgi.ini b/docker/uwsgi.ini index 9efabb8a85a4..a8f12b6b6d4f 100644 --- a/docker/uwsgi.ini +++ b/docker/uwsgi.ini @@ -6,6 +6,9 @@ module = olympia.wsgi:application # process-related settings master = true +need-app = true +no-default-app = true +reload-on-exception = true # maximum number of worker processes processes = 4 vaccum = true diff --git a/scripts/healthcheck.py b/scripts/healthcheck.py deleted file mode 100755 index 82c9e7c8114e..000000000000 --- a/scripts/healthcheck.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 - -import os -import subprocess -import sys -import time - - -env = os.environ.copy() - -env['DJANGO_SETTINGS_MODULE'] = 'olympia' - - -def worker_healthcheck(): - subprocess.run( - ['celery', '-A', 'olympia.amo.celery', 'status'], - env=env, - stdout=subprocess.DEVNULL, - ) - - -def web_healthcheck(): - subprocess.run( - [ - 'curl', - '--fail', - '--show-error', - '--include', - '--location', - '--silent', - 'http://127.0.0.1:8002/__version__', - ], - stdout=subprocess.DEVNULL, - ) - - -TIME = time.time() -TIMEOUT = 60 -SLEEP = 1 - -while time.time() - TIME < TIMEOUT: - try: - worker_healthcheck() - web_healthcheck() - print('OK') - sys.exit(0) - except Exception as e: - print(f'Error: {e}') - time.sleep(SLEEP) - SLEEP *= 2 diff --git a/src/olympia/amo/management/commands/initialize.py b/src/olympia/amo/management/commands/initialize.py index d68ac5938ec2..0f4bf276538d 100644 --- a/src/olympia/amo/management/commands/initialize.py +++ b/src/olympia/amo/management/commands/initialize.py @@ -38,30 +38,39 @@ def handle(self, *args, **options): Create the database. """ logging.info(f'options: {options}') - # We need to support skipping loading/seeding when desired. - # Like in CI environments where you don't want to load data every time. - if settings.DATA_BACKUP_SKIP: - logging.info( - 'Skipping seeding and loading data because DATA_BACKUP_SKIP is set' - ) - return + # Always ensure "olympia" database exists and is accessible. + call_command('monitors', services=['olympia_database']) - # If DB empty or we are explicitly cleaning, then bail with data_seed. - if options.get('clean') or not self.local_admin_exists(): - call_command('data_seed') - return + # If we are not skipping data backup + # then run the logic to ensure the DB is ready. + if not settings.DATA_BACKUP_SKIP: + # If DB empty or we are explicitly cleaning, then bail with data_seed. + if options.get('clean') or not self.local_admin_exists(): + call_command('data_seed') + # Otherwise, we're working with a pre-existing DB. + else: + load = options.get('load') + # We always migrate the DB. + logging.info('Migrating...') + call_command('migrate', '--noinput') - load = options.get('load') - # We always migrate the DB. - logging.info('Migrating...') - call_command('migrate', '--noinput') + # If we specify a specific backup, simply load that. + if load: + call_command('data_load', '--name', load) + # We should reindex even if no data is loaded/modified + # because we might have a fresh instance of elasticsearch + else: + call_command( + 'reindex', '--wipe', '--force', '--noinput', '--skip-if-exists' + ) - # If we specify a specifi backup, simply load that. - if load: - call_command('data_load', '--name', load) - # We should reindex even if no data is loaded/modified - # because we might have a fresh instance of elasticsearch - else: - call_command( - 'reindex', '--wipe', '--force', '--noinput', '--skip-if-exists' - ) + # By now, we excpect the database to exist, and to be migrated + # so our database tables should be accessible + call_command('monitors', services=['database']) + + # Ensure any additional required dependencies are available before proceeding. + call_command( + 'monitors', + services=['localdev_web', 'celery_worker', 'elastic', 'rabbitmq', 'signer'], + attempts=10, + ) diff --git a/src/olympia/amo/management/commands/monitors.py b/src/olympia/amo/management/commands/monitors.py new file mode 100644 index 000000000000..614cbfee9fc6 --- /dev/null +++ b/src/olympia/amo/management/commands/monitors.py @@ -0,0 +1,60 @@ +import time + +from django.core.management.base import CommandError + +import olympia.amo.monitors as monitors + +from .. import BaseDataCommand + + +class Command(BaseDataCommand): + help = 'Check a set of AMO service monitors.' + + def add_arguments(self, parser): + parser.add_argument( + '--services', + nargs='+', + help='List of services to check', + ) + parser.add_argument( + '--attempts', + type=int, + default=5, + help='Number of attempts to check the services', + ) + + def handle(self, *args, **options): + attempts = options.get('attempts') + services = options.get('services') + + self.logger.info(f'services: {services}') + + if not services: + raise CommandError('No services specified') + + failing_services = services.copy() + + current = 0 + + while current < attempts: + current += 1 + self.logger.info(f'Checking services {services} for the {current} time') + status_summary = monitors.execute_checks(services) + failing_services = [ + service + for service, result in status_summary.items() + if result['state'] is False + ] + + if len(failing_services) > 0: + self.logger.error('Some services are failing: %s', failing_services) + sleep_time = round(1.618**current) + self.logger.info(f'Sleeping for {sleep_time} seconds') + time.sleep(sleep_time) + else: + break + + if len(failing_services) > 0: + raise CommandError(f'Some services are failing: {failing_services}') + else: + self.logger.info(f'All services are healthy {services}') diff --git a/src/olympia/amo/monitors.py b/src/olympia/amo/monitors.py index 68f60829bb3e..dfa9f2bbbcb2 100644 --- a/src/olympia/amo/monitors.py +++ b/src/olympia/amo/monitors.py @@ -7,6 +7,7 @@ from django.conf import settings import celery +import MySQLdb import requests from django_statsd.clients import statsd from kombu import Connection @@ -31,6 +32,38 @@ def execute_checks(checks: list[str]): return status_summary +def localdev_web(): + """ + Used in local development environments to determine if the web container + is able to serve requests. The version endpoint returns a 200 status code + and some json via the uwsgi http server. + """ + status = '' + response = requests.get('http://127.0.0.1:8002/__version__') + + if response.status_code != 200: + status = f'Failed to ping web with status code: {response.status_code}' + monitor_log.critical(status) + return status, None + + +def celery_worker(): + """ + Used in local development environments to determine if the celery worker + is able to execute tasks in the web worker. + """ + status = '' + app = celery.current_app + + inspector = app.control.inspect() + + if not inspector.ping(): + status = 'Celery worker is not connected' + monitor_log.critical(status) + + return status, None + + def memcache(): memcache = getattr(settings, 'CACHES', {}).get('default') memcache_results = [] @@ -190,6 +223,43 @@ def signer(): return status, signer_results +def olympia_database(): + """Check database connection by verifying the olympia database exists.""" + + status = '' + + db_info = settings.DATABASES.get('default') + + engine = db_info.get('ENGINE').split('.')[-1] + + if engine != 'mysql': + raise ValueError('expecting mysql database engine, recieved %s' % engine) + + mysql_args = { + 'user': db_info.get('USER'), + 'passwd': db_info.get('PASSWORD'), + 'host': db_info.get('HOST'), + } + if db_info.get('PORT'): + mysql_args['port'] = int(db_info.get('PORT')) + + try: + connection = MySQLdb.connect(**mysql_args) + + expected_db_name = db_info.get('NAME') + connection.query(f'SHOW DATABASES LIKE "{expected_db_name}"') + result = connection.store_result() + + if result.num_rows() == 0: + status = f'Database {expected_db_name} does not exist' + monitor_log.critical(status) + except Exception as e: + status = f'Failed to connect to database: {e}' + monitor_log.critical(status) + + return status, None + + def database(): # check database connection from olympia.addons.models import Addon diff --git a/src/olympia/amo/tests/test_commands.py b/src/olympia/amo/tests/test_commands.py index 136385ffa49c..ed1a70adb0dd 100644 --- a/src/olympia/amo/tests/test_commands.py +++ b/src/olympia/amo/tests/test_commands.py @@ -153,6 +153,13 @@ def test_generate_jsi18n_files(): class BaseTestDataCommand(TestCase): class Commands: reset_db = mock.call('reset_db', '--no-utf8', '--noinput') + monitors_olympia_database = mock.call('monitors', services=['olympia_database']) + monitors_database = mock.call('monitors', services=['database']) + monitors = mock.call( + 'monitors', + services=['localdev_web', 'celery_worker', 'elastic', 'rabbitmq', 'signer'], + attempts=10, + ) migrate = mock.call('migrate', '--noinput') data_seed = mock.call('data_seed') @@ -256,24 +263,31 @@ def with_local_admin(self): def test_handle_with_skip_data_initialize(self): """ Test running the 'initialize' command with the DATA_BACKUP_SKIP flag set. - Expected: nothing happens. + Expected: nothing happens except verifying the dependencies. """ call_command('initialize') self._assert_commands_called_in_order( self.mocks['mock_call_command'], - [], + [ + self.mock_commands.monitors_olympia_database, + self.mock_commands.monitors, + ], ) @override_settings(DATA_BACKUP_SKIP=True) def test_handle_with_load_argument_and_skip_data_initialize(self): """ Test running the 'initialize' command with both '--load' argument - and DATA_BACKUP_SKIP flag. Expected: nothing happens. + and DATA_BACKUP_SKIP flag. Expected: + nothing happens except verifying the dependencies. """ call_command('initialize', load='test') self._assert_commands_called_in_order( self.mocks['mock_call_command'], - [], + [ + self.mock_commands.monitors_olympia_database, + self.mock_commands.monitors, + ], ) def test_handle_with_clean_and_load_arguments(self): @@ -287,7 +301,10 @@ def test_handle_with_clean_and_load_arguments(self): self._assert_commands_called_in_order( self.mocks['mock_call_command'], [ + self.mock_commands.monitors_olympia_database, self.mock_commands.data_seed, + self.mock_commands.monitors_database, + self.mock_commands.monitors, ], ) @@ -301,7 +318,10 @@ def test_handle_with_clean_argument_no_local_admin(self): self._assert_commands_called_in_order( self.mocks['mock_call_command'], [ + self.mock_commands.monitors_olympia_database, self.mock_commands.data_seed, + self.mock_commands.monitors_database, + self.mock_commands.monitors, ], ) @@ -316,8 +336,11 @@ def test_handle_without_clean_or_load_with_local_admin(self): self._assert_commands_called_in_order( self.mocks['mock_call_command'], [ + self.mock_commands.monitors_olympia_database, self.mock_commands.migrate, self.mock_commands.reindex_skip_if_exists, + self.mock_commands.monitors_database, + self.mock_commands.monitors, ], ) @@ -331,13 +354,16 @@ def test_handle_without_clean_or_load_without_local_admin(self): self._assert_commands_called_in_order( self.mocks['mock_call_command'], [ + self.mock_commands.monitors_olympia_database, self.mock_commands.data_seed, + self.mock_commands.monitors_database, + self.mock_commands.monitors, ], ) def test_handle_migration_failure(self): """ - Test running the 'initialize' command when the 'migrate' command fails. + Test running the 'initialize' command when the 'call_command' command fails. Expected: The command exits with an error and does not proceed to seeding or loading data. """ @@ -349,7 +375,7 @@ def test_handle_migration_failure(self): self._assert_commands_called_in_order( self.mocks['mock_call_command'], [ - self.mock_commands.migrate, + self.mock_commands.monitors_olympia_database, ], ) @@ -360,7 +386,12 @@ def test_handle_mysql_exception(self, mock_filter): call_command('initialize') self._assert_commands_called_in_order( self.mocks['mock_call_command'], - [self.mock_commands.data_seed], + [ + self.mock_commands.monitors_olympia_database, + self.mock_commands.data_seed, + self.mock_commands.monitors_database, + self.mock_commands.monitors, + ], ) @@ -646,3 +677,54 @@ def test_default(self): self.mock_commands.data_load(self.base_data_command.data_backup_init), ], ) + + +class TestMonitorsCommand(BaseTestDataCommand): + def setUp(self): + mock_sleep = mock.patch('olympia.amo.management.commands.monitors.time.sleep') + self.mock_sleep = mock_sleep.start() + self.addCleanup(mock_sleep.stop) + + mock_execute_checks = mock.patch( + 'olympia.amo.management.commands.monitors.monitors.execute_checks', + ) + self.mock_execute_checks = mock_execute_checks.start() + self.addCleanup(mock_execute_checks.stop) + self.mock_execute_checks.return_value = {} + + def test_monitors_no_services_raises(self): + with self.assertRaises(CommandError) as context: + call_command('monitors') + assert 'No services specified' in str(context.exception) + + def test_monitors_fails_after_specified_attempts(self): + self.mock_execute_checks.return_value = {'database': {'state': False}} + with self.assertRaises(CommandError) as context: + call_command('monitors', services=['database'], attempts=1) + + assert "Some services are failing: ['database']" in str(context.exception) + + def test_monitors_succeeds_after_specified_attempts(self): + succeed_after = 3 + + def mock_handler(services): + state = self.mock_execute_checks.call_count >= succeed_after + return {service: {'state': state} for service in services} + + self.mock_execute_checks.side_effect = mock_handler + call_command('monitors', services=['database'], attempts=succeed_after) + + def test_monitors_succeeds_after_all_services_are_healthy(self): + succeed_after = 3 + + def mock_handler(_): + state = self.mock_execute_checks.call_count >= succeed_after + return { + 'database': {'state': state}, + 'success': {'state': True}, + } + + self.mock_execute_checks.side_effect = mock_handler + call_command( + 'monitors', services=['database', 'success'], attempts=succeed_after + ) diff --git a/src/olympia/amo/tests/test_monitor.py b/src/olympia/amo/tests/test_monitor.py index baee476a9e45..853d91c43700 100644 --- a/src/olympia/amo/tests/test_monitor.py +++ b/src/olympia/amo/tests/test_monitor.py @@ -1,5 +1,6 @@ import json -from unittest.mock import Mock, patch +import re +from unittest.mock import MagicMock, Mock, patch from django.conf import settings from django.test.utils import override_settings @@ -145,3 +146,71 @@ def test_cinder_fail(self): 'Failed to chat with cinder: ' f'500 Server Error: Internal Server Error for url: {url}' ) + + def test_localdev_web_fail(self): + responses.add( + responses.GET, + 'http://127.0.0.1:8002/__version__', + status=500, + ) + status, _ = monitors.localdev_web() + assert status == 'Failed to ping web with status code: 500' + + def test_localdev_web_success(self): + responses.add( + responses.GET, + 'http://127.0.0.1:8002/__version__', + status=200, + ) + status, _ = monitors.localdev_web() + assert status == '' + + def test_celery_worker_failed(self): + # Create a mock ping object + mock_ping = MagicMock(return_value=None) + mock_inspect = MagicMock() + mock_inspect.ping = mock_ping + + with patch( + 'olympia.amo.monitors.celery.current_app.control.inspect', + return_value=mock_inspect, + ): + status, result = monitors.celery_worker() + assert result is None + assert status == 'Celery worker is not connected' + + def test_celery_worker_success(self): + # Create a mock ping object + mock_ping = MagicMock(return_value={'celery@localhost': []}) + mock_inspect = MagicMock() + mock_inspect.ping = mock_ping + + with patch( + 'olympia.amo.monitors.celery.current_app.control.inspect', + return_value=mock_inspect, + ): + status, result = monitors.celery_worker() + assert status == '' + assert result is None + + @patch('olympia.amo.monitors.MySQLdb.connect') + def test_olympia_db_unavailable(self, mock_connect): + mock_connect.side_effect = Exception('Database is unavailable') + status, result = monitors.olympia_database() + assert status == 'Failed to connect to database: Database is unavailable' + assert result is None + + @patch('olympia.amo.monitors.MySQLdb.connect') + def test_olympia_db_database_does_not_exist(self, mock_connect): + # Create a mock connection object + mock_connection = MagicMock() + mock_connect.return_value = mock_connection + + # Create a mock result object with num_rows returning 0 + mock_result = MagicMock() + mock_result.num_rows.return_value = 0 + mock_connection.store_result.return_value = mock_result + + status, result = monitors.olympia_database() + assert re.match(r'^Database .+ does not exist$', status) + assert result is None diff --git a/src/olympia/core/apps.py b/src/olympia/core/apps.py index 0b45f3b63dcf..2a32723f5214 100644 --- a/src/olympia/core/apps.py +++ b/src/olympia/core/apps.py @@ -125,18 +125,26 @@ def static_check(app_configs, **kwargs): def db_charset_check(app_configs, **kwargs): errors = [] - with connection.cursor() as cursor: - cursor.execute("SHOW VARIABLES LIKE 'character_set_database';") - result = cursor.fetchone() - if result[1] != settings.DB_CHARSET: - errors.append( - Error( - 'Database charset invalid. ' - f'Expected {settings.DB_CHARSET}, ' - f'recieved {result[1]}', - id='setup.E005', + try: + with connection.cursor() as cursor: + cursor.execute("SHOW VARIABLES LIKE 'character_set_database';") + result = cursor.fetchone() + if result[1] != settings.DB_CHARSET: + errors.append( + Error( + 'Database charset invalid. ' + f'Expected {settings.DB_CHARSET}, ' + f'recieved {result[1]}', + id='setup.E005', + ) ) + except Exception as e: + errors.append( + Error( + f'Failed to connect to database: {e}', + id='setup.E006', ) + ) return errors diff --git a/src/olympia/core/tests/test_apps.py b/src/olympia/core/tests/test_apps.py index 4073387ade55..8264c125e7e4 100644 --- a/src/olympia/core/tests/test_apps.py +++ b/src/olympia/core/tests/test_apps.py @@ -36,6 +36,15 @@ def test_db_charset_check(self, mock_cursor): ): call_command('check') + @mock.patch('olympia.core.apps.connection.cursor') + def test_db_unavailable_check(self, mock_cursor): + mock_cursor.side_effect = Exception('Database is unavailable') + with self.assertRaisesMessage( + SystemCheckError, + 'Failed to connect to database: Database is unavailable', + ): + call_command('check') + def test_uwsgi_check(self): call_command('check') diff --git a/tests/make/make.spec.js b/tests/make/make.spec.js index 82946131b347..72db7cd24ebb 100644 --- a/tests/make/make.spec.js +++ b/tests/make/make.spec.js @@ -76,7 +76,10 @@ describe('docker-compose.yml', () => { expect(service.extra_hosts).toStrictEqual(['olympia.test=127.0.0.1']); expect(service.restart).toStrictEqual('on-failure:5'); // Each service should have a healthcheck - expect(service).not.toHaveProperty('healthcheck'); + expect(service.healthcheck).toHaveProperty('test'); + expect(service.healthcheck.interval).toStrictEqual('30s'); + expect(service.healthcheck.retries).toStrictEqual(3); + expect(service.healthcheck.start_interval).toStrictEqual('1s'); // each service should have a command expect(service.command).not.toBeUndefined(); // each service should have the same dependencies