From b871fe4c0014eda6c1f15e4653dd1f15c71362f0 Mon Sep 17 00:00:00 2001 From: Jon Massey Date: Mon, 24 Jun 2024 20:37:36 +0100 Subject: [PATCH 1/6] Add Sentry SDK to requirements --- requirements.prod.in | 1 + requirements.prod.txt | 12 ++++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/requirements.prod.in b/requirements.prod.in index 7ff887ca..230eded1 100644 --- a/requirements.prod.in +++ b/requirements.prod.in @@ -3,3 +3,4 @@ slack-bolt sqlalchemy[postgresql_psycopgbinary] structlog + sentry-sdk diff --git a/requirements.prod.txt b/requirements.prod.txt index 9209f8de..a8cee471 100644 --- a/requirements.prod.txt +++ b/requirements.prod.txt @@ -7,7 +7,9 @@ certifi==2024.2.2 \ --hash=sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f \ --hash=sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1 - # via requests + # via + # requests + # sentry-sdk charset-normalizer==3.3.2 \ --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ --hash=sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087 \ @@ -241,6 +243,10 @@ requests==2.31.0 \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 # via -r requirements.prod.in +sentry-sdk==2.6.0 \ + --hash=sha256:422b91cb49378b97e7e8d0e8d5a1069df23689d45262b86f54988a7db264e874 \ + --hash=sha256:65cc07e9c6995c5e316109f138570b32da3bd7ff8d0d0ee4aaf2628c3dd8127d + # via -r requirements.prod.in slack-bolt==1.19.0 \ --hash=sha256:45135b8a1dea40abeb20b9b9d1973953f6755b76388156ee218d6c61d96f992a \ --hash=sha256:810891cc110e0fb3948f26c044302ed90abda2a25e9ec1689e179da8bb2747cf @@ -313,4 +319,6 @@ typing-extensions==4.11.0 \ urllib3==2.2.1 \ --hash=sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d \ --hash=sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19 - # via requests + # via + # requests + # sentry-sdk From 545f616c1c2fa27b99596ef351e5e60891042e36 Mon Sep 17 00:00:00 2001 From: Jon Massey Date: Mon, 24 Jun 2024 20:58:35 +0100 Subject: [PATCH 2/6] Add Sentry initialisation and configuration Based on configuration example from Sentry docs. Timezone and schedule match current config on dokku3 Previous runs of the cron job seem to take <10mins in total. Therefore 10 minutes runtime for each individual task should be plenty. The first task in the list is expected to start shortly after midnight, but the later ones might not start until nearly 10 past. Therefore 30 mins chosen to give a bit of leeway on the checkin time. --- metrics/sentry/__init__.py | 0 metrics/sentry/sentry.py | 27 +++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 metrics/sentry/__init__.py create mode 100644 metrics/sentry/sentry.py diff --git a/metrics/sentry/__init__.py b/metrics/sentry/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/metrics/sentry/sentry.py b/metrics/sentry/sentry.py new file mode 100644 index 00000000..6a713bc1 --- /dev/null +++ b/metrics/sentry/sentry.py @@ -0,0 +1,27 @@ +import os + +import sentry_sdk + + +def init(): + sentry_sdk.init( + dsn=os.environ.get("SENTRY_DSN"), + ) + + +monitor_config = { + "schedule": {"type": "crontab", "value": "@daily"}, + "timezone": "Etc/UTC", + # If an expected check-in doesn't come in `checkin_margin` + # minutes, it'll be considered missed + "checkin_margin": 30, + # The check-in is allowed to run for `max_runtime` minutes + # before it's considered failed + "max_runtime": 10, + # It'll take `failure_issue_threshold` consecutive failed + # check-ins to create an issue + "failure_issue_threshold": 1, + # It'll take `recovery_threshold` OK check-ins to resolve + # an issue + "recovery_threshold": 1, +} From 5c52c08bc1524f98a19fc0fc23e98c91fe583ea0 Mon Sep 17 00:00:00 2001 From: Jon Massey Date: Mon, 24 Jun 2024 20:59:53 +0100 Subject: [PATCH 3/6] Add cron checkins to tasks main method This method is the entry point for the cron job, which is what we want to be monitoring Running the tasks in isolation (e.g. for testing) would not be monitored --- metrics/tasks/__main__.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/metrics/tasks/__main__.py b/metrics/tasks/__main__.py index a366de92..74b288f9 100644 --- a/metrics/tasks/__main__.py +++ b/metrics/tasks/__main__.py @@ -1,19 +1,45 @@ import pkgutil import structlog +from sentry_sdk.crons import capture_checkin +from sentry_sdk.crons.consts import MonitorStatus import metrics.tasks +from metrics.sentry import sentry log = structlog.get_logger() +sentry.init() for _, modname, _ in pkgutil.iter_modules(metrics.tasks.__path__): if modname != "__main__": log.info(f"Found {modname}") + monitor_slug = f"metrics-{modname}" + check_in_id = capture_checkin( + monitor_slug=monitor_slug, + status=MonitorStatus.IN_PROGRESS, + monitor_config=sentry.monitor_config, + ) try: pkgutil.resolve_name(f"metrics.tasks.{modname}").main() + capture_checkin( + monitor_slug=monitor_slug, + check_in_id=check_in_id, + status=MonitorStatus.OK, + monitor_config=sentry.monitor_config, + ) except AttributeError as error: log.error(f"Skipping {modname} because {error}") + capture_checkin( + monitor_slug=monitor_slug, + status=MonitorStatus.ERROR, + monitor_config=sentry.monitor_config, + ) except Exception as exc: log.error(f"Failed to run {modname} because because an error occurred.") log.exception(exc) + capture_checkin( + monitor_slug=monitor_slug, + status=MonitorStatus.ERROR, + monitor_config=sentry.monitor_config, + ) From a68aa1018ab06f727ff4925a066d769914d7ab29 Mon Sep 17 00:00:00 2001 From: Jon Massey Date: Tue, 25 Jun 2024 16:24:23 +0100 Subject: [PATCH 4/6] Add Sentry metrics DSN to dotenv and install instructions --- INSTALL.md | 1 + dotenv-sample | 3 +++ metrics/tasks/__main__.py | 20 +++++++------------- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 4a6dd5df..38f183e5 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -14,6 +14,7 @@ dokku config:set metrics SLACK_SIGNING_SECRET='xxx' dokku config:set metrics SLACK_TECH_SUPPORT_CHANNEL_ID='xxx' dokku config:set metrics SLACK_TOKEN='xxx' dokku config:set metrics TIMESCALEDB_URL='xxx' +dokku config:set metrics SENTRY_DSN='xxx' ``` The `GITHUB_EBMDATALAB_TOKEN` and `GITHUB_OS_CORE_TOKEN` are fine-grained GitHub personal access tokens that are used for authenticating with the GitHub GraphQL API. diff --git a/dotenv-sample b/dotenv-sample index 7032b7ba..51c9576b 100644 --- a/dotenv-sample +++ b/dotenv-sample @@ -17,3 +17,6 @@ SLACK_TOKEN= # Slack channel ID for tech-support-channel SLACK_TECH_SUPPORT_CHANNEL_ID=C0270Q313H7 + +# Sentry DSN for the Metrics project +SENTRY_DSN= diff --git a/metrics/tasks/__main__.py b/metrics/tasks/__main__.py index 74b288f9..bb9f7d4f 100644 --- a/metrics/tasks/__main__.py +++ b/metrics/tasks/__main__.py @@ -15,31 +15,25 @@ if modname != "__main__": log.info(f"Found {modname}") monitor_slug = f"metrics-{modname}" + status = MonitorStatus.IN_PROGRESS check_in_id = capture_checkin( monitor_slug=monitor_slug, - status=MonitorStatus.IN_PROGRESS, + status=status, monitor_config=sentry.monitor_config, ) try: pkgutil.resolve_name(f"metrics.tasks.{modname}").main() - capture_checkin( - monitor_slug=monitor_slug, - check_in_id=check_in_id, - status=MonitorStatus.OK, - monitor_config=sentry.monitor_config, - ) + status = MonitorStatus.OK except AttributeError as error: log.error(f"Skipping {modname} because {error}") - capture_checkin( - monitor_slug=monitor_slug, - status=MonitorStatus.ERROR, - monitor_config=sentry.monitor_config, - ) + status = MonitorStatus.ERROR except Exception as exc: log.error(f"Failed to run {modname} because because an error occurred.") log.exception(exc) + status = MonitorStatus.ERROR + finally: capture_checkin( monitor_slug=monitor_slug, - status=MonitorStatus.ERROR, + status=status, monitor_config=sentry.monitor_config, ) From 57f630467878bcecac6da73bd0c9a626c5fee1d8 Mon Sep 17 00:00:00 2001 From: Jon Massey Date: Wed, 26 Jun 2024 16:55:42 +0100 Subject: [PATCH 5/6] Add ADR describing decision to add Sentry monitoring --- docs/adr/0003-use-sentry-cron-monitoring.md | 38 +++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 docs/adr/0003-use-sentry-cron-monitoring.md diff --git a/docs/adr/0003-use-sentry-cron-monitoring.md b/docs/adr/0003-use-sentry-cron-monitoring.md new file mode 100644 index 00000000..984626fe --- /dev/null +++ b/docs/adr/0003-use-sentry-cron-monitoring.md @@ -0,0 +1,38 @@ +# 2. Use Sentry Cron monitoring + +Date: 2024-06-26 + +## Status + +Accepted + +## Context + +The metrics tasks run as a dokku-controlled cron job. + +There have been instances when this cron job has failed silently due to software or configuration errors. +We would like to know if there have been errors or if the cron job has not run on schedule. + +The `dokku logs metrics` command did not reveal any trace of these silent errors. + +Sentry is in use for other monitoring of systems within the Bennett Institute. +Sentry [offers](https://docs.sentry.io/platforms/python/crons/) the ability to monitor scheduled cron jobs. +Sentry Cron monitoring allows [check-ins](https://docs.sentry.io/platforms/python/crons/#manual-check-ins) +to be sent at various points in the job's execution with status codes, but not full error messages. + +Sentry also offers [error capture](https://docs.sentry.io/product/sentry-basics/integrate-backend/capturing-errors/) +and reporting functionality, which is in use in other Bennett Institute projects. + +## Decision + +We will use Sentry Cron monitoring to monitor the execution of the metrics cron job on dokku3. + +We will not implement Sentry error capture at this point in time. + +## Consequences + +We will know if the metrics cron job is executing according to schedule, and if there are any errors or not. + +We will not know via Sentry what these errors are (if any). +Further manual investigation will be required. +If this becomes a problem in future, we can implement Sentry error capture. From e1394f4dcf110c29c348e38918c0533fa1594452 Mon Sep 17 00:00:00 2001 From: Jon Massey Date: Thu, 27 Jun 2024 12:01:41 +0100 Subject: [PATCH 6/6] Refactor sentry cron monitoring --- metrics/sentry/cron.py | 58 +++++++++++++++++++++++++++++++++++++++ metrics/sentry/sentry.py | 27 ------------------ metrics/tasks/__main__.py | 27 +++++------------- 3 files changed, 65 insertions(+), 47 deletions(-) create mode 100644 metrics/sentry/cron.py delete mode 100644 metrics/sentry/sentry.py diff --git a/metrics/sentry/cron.py b/metrics/sentry/cron.py new file mode 100644 index 00000000..f08b92b4 --- /dev/null +++ b/metrics/sentry/cron.py @@ -0,0 +1,58 @@ +import os + +import sentry_sdk +from sentry_sdk.crons import capture_checkin +from sentry_sdk.crons.consts import MonitorStatus + + +class Cron: + def __init__(self): + sentry_sdk.init( + dsn=os.environ.get("SENTRY_DSN"), + ) + + _monitor_config = { + "schedule": {"type": "crontab", "value": "@daily"}, + "timezone": "Etc/UTC", + # If an expected check-in doesn't come in `checkin_margin` + # minutes, it'll be considered missed + "checkin_margin": 30, + # The check-in is allowed to run for `max_runtime` minutes + # before it's considered failed + "max_runtime": 10, + # It'll take `failure_issue_threshold` consecutive failed + # check-ins to create an issue + "failure_issue_threshold": 1, + # It'll take `recovery_threshold` OK check-ins to resolve + # an issue + "recovery_threshold": 1, + } + + def get_monitor(self, modname): + monitor_slug = f"metrics-{modname}" + return self.Monitor(monitor_slug, self._monitor_config) + + class Monitor: + def __init__(self, monitor_slug, monitor_config): + self.monitor_slug = monitor_slug + self.monitor_config = monitor_config + self.check_in_id = None + + def _checkin(self, status): + check_in_id = capture_checkin( + monitor_slug=self.monitor_slug, + monitor_config=self.monitor_config, + status=status, + check_in_id=self.check_in_id, + ) + if not self.check_in_id: + self.check_in_id = check_in_id + + def in_progress(self): + self._checkin(status=MonitorStatus.IN_PROGRESS) + + def ok(self): + self._checkin(status=MonitorStatus.OK) + + def error(self): + self._checkin(status=MonitorStatus.ERROR) diff --git a/metrics/sentry/sentry.py b/metrics/sentry/sentry.py deleted file mode 100644 index 6a713bc1..00000000 --- a/metrics/sentry/sentry.py +++ /dev/null @@ -1,27 +0,0 @@ -import os - -import sentry_sdk - - -def init(): - sentry_sdk.init( - dsn=os.environ.get("SENTRY_DSN"), - ) - - -monitor_config = { - "schedule": {"type": "crontab", "value": "@daily"}, - "timezone": "Etc/UTC", - # If an expected check-in doesn't come in `checkin_margin` - # minutes, it'll be considered missed - "checkin_margin": 30, - # The check-in is allowed to run for `max_runtime` minutes - # before it's considered failed - "max_runtime": 10, - # It'll take `failure_issue_threshold` consecutive failed - # check-ins to create an issue - "failure_issue_threshold": 1, - # It'll take `recovery_threshold` OK check-ins to resolve - # an issue - "recovery_threshold": 1, -} diff --git a/metrics/tasks/__main__.py b/metrics/tasks/__main__.py index bb9f7d4f..43256aad 100644 --- a/metrics/tasks/__main__.py +++ b/metrics/tasks/__main__.py @@ -1,39 +1,26 @@ import pkgutil import structlog -from sentry_sdk.crons import capture_checkin -from sentry_sdk.crons.consts import MonitorStatus import metrics.tasks -from metrics.sentry import sentry +from metrics.sentry.cron import Cron log = structlog.get_logger() -sentry.init() +sentry_cron = Cron() for _, modname, _ in pkgutil.iter_modules(metrics.tasks.__path__): if modname != "__main__": log.info(f"Found {modname}") - monitor_slug = f"metrics-{modname}" - status = MonitorStatus.IN_PROGRESS - check_in_id = capture_checkin( - monitor_slug=monitor_slug, - status=status, - monitor_config=sentry.monitor_config, - ) + monitor = sentry_cron.get_monitor(modname) + monitor.in_progress() try: pkgutil.resolve_name(f"metrics.tasks.{modname}").main() - status = MonitorStatus.OK + monitor.ok() except AttributeError as error: log.error(f"Skipping {modname} because {error}") - status = MonitorStatus.ERROR + monitor.error() except Exception as exc: log.error(f"Failed to run {modname} because because an error occurred.") log.exception(exc) - status = MonitorStatus.ERROR - finally: - capture_checkin( - monitor_slug=monitor_slug, - status=status, - monitor_config=sentry.monitor_config, - ) + monitor.error()