diff --git a/INSTALL.md b/INSTALL.md index 4a6dd5df..38f183e5 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -14,6 +14,7 @@ dokku config:set metrics SLACK_SIGNING_SECRET='xxx' dokku config:set metrics SLACK_TECH_SUPPORT_CHANNEL_ID='xxx' dokku config:set metrics SLACK_TOKEN='xxx' dokku config:set metrics TIMESCALEDB_URL='xxx' +dokku config:set metrics SENTRY_DSN='xxx' ``` The `GITHUB_EBMDATALAB_TOKEN` and `GITHUB_OS_CORE_TOKEN` are fine-grained GitHub personal access tokens that are used for authenticating with the GitHub GraphQL API. diff --git a/docs/adr/0003-use-sentry-cron-monitoring.md b/docs/adr/0003-use-sentry-cron-monitoring.md new file mode 100644 index 00000000..984626fe --- /dev/null +++ b/docs/adr/0003-use-sentry-cron-monitoring.md @@ -0,0 +1,38 @@ +# 2. Use Sentry Cron monitoring + +Date: 2024-06-26 + +## Status + +Accepted + +## Context + +The metrics tasks run as a dokku-controlled cron job. + +There have been instances when this cron job has failed silently due to software or configuration errors. +We would like to know if there have been errors or if the cron job has not run on schedule. + +The `dokku logs metrics` command did not reveal any trace of these silent errors. + +Sentry is in use for other monitoring of systems within the Bennett Institute. +Sentry [offers](https://docs.sentry.io/platforms/python/crons/) the ability to monitor scheduled cron jobs. +Sentry Cron monitoring allows [check-ins](https://docs.sentry.io/platforms/python/crons/#manual-check-ins) +to be sent at various points in the job's execution with status codes, but not full error messages. + +Sentry also offers [error capture](https://docs.sentry.io/product/sentry-basics/integrate-backend/capturing-errors/) +and reporting functionality, which is in use in other Bennett Institute projects. + +## Decision + +We will use Sentry Cron monitoring to monitor the execution of the metrics cron job on dokku3. + +We will not implement Sentry error capture at this point in time. + +## Consequences + +We will know if the metrics cron job is executing according to schedule, and if there are any errors or not. + +We will not know via Sentry what these errors are (if any). +Further manual investigation will be required. +If this becomes a problem in future, we can implement Sentry error capture. diff --git a/dotenv-sample b/dotenv-sample index 7032b7ba..51c9576b 100644 --- a/dotenv-sample +++ b/dotenv-sample @@ -17,3 +17,6 @@ SLACK_TOKEN= # Slack channel ID for tech-support-channel SLACK_TECH_SUPPORT_CHANNEL_ID=C0270Q313H7 + +# Sentry DSN for the Metrics project +SENTRY_DSN= diff --git a/metrics/sentry/__init__.py b/metrics/sentry/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/metrics/sentry/cron.py b/metrics/sentry/cron.py new file mode 100644 index 00000000..f08b92b4 --- /dev/null +++ b/metrics/sentry/cron.py @@ -0,0 +1,58 @@ +import os + +import sentry_sdk +from sentry_sdk.crons import capture_checkin +from sentry_sdk.crons.consts import MonitorStatus + + +class Cron: + def __init__(self): + sentry_sdk.init( + dsn=os.environ.get("SENTRY_DSN"), + ) + + _monitor_config = { + "schedule": {"type": "crontab", "value": "@daily"}, + "timezone": "Etc/UTC", + # If an expected check-in doesn't come in `checkin_margin` + # minutes, it'll be considered missed + "checkin_margin": 30, + # The check-in is allowed to run for `max_runtime` minutes + # before it's considered failed + "max_runtime": 10, + # It'll take `failure_issue_threshold` consecutive failed + # check-ins to create an issue + "failure_issue_threshold": 1, + # It'll take `recovery_threshold` OK check-ins to resolve + # an issue + "recovery_threshold": 1, + } + + def get_monitor(self, modname): + monitor_slug = f"metrics-{modname}" + return self.Monitor(monitor_slug, self._monitor_config) + + class Monitor: + def __init__(self, monitor_slug, monitor_config): + self.monitor_slug = monitor_slug + self.monitor_config = monitor_config + self.check_in_id = None + + def _checkin(self, status): + check_in_id = capture_checkin( + monitor_slug=self.monitor_slug, + monitor_config=self.monitor_config, + status=status, + check_in_id=self.check_in_id, + ) + if not self.check_in_id: + self.check_in_id = check_in_id + + def in_progress(self): + self._checkin(status=MonitorStatus.IN_PROGRESS) + + def ok(self): + self._checkin(status=MonitorStatus.OK) + + def error(self): + self._checkin(status=MonitorStatus.ERROR) diff --git a/metrics/tasks/__main__.py b/metrics/tasks/__main__.py index a366de92..43256aad 100644 --- a/metrics/tasks/__main__.py +++ b/metrics/tasks/__main__.py @@ -3,17 +3,24 @@ import structlog import metrics.tasks +from metrics.sentry.cron import Cron log = structlog.get_logger() +sentry_cron = Cron() for _, modname, _ in pkgutil.iter_modules(metrics.tasks.__path__): if modname != "__main__": log.info(f"Found {modname}") + monitor = sentry_cron.get_monitor(modname) + monitor.in_progress() try: pkgutil.resolve_name(f"metrics.tasks.{modname}").main() + monitor.ok() except AttributeError as error: log.error(f"Skipping {modname} because {error}") + monitor.error() except Exception as exc: log.error(f"Failed to run {modname} because because an error occurred.") log.exception(exc) + monitor.error() diff --git a/requirements.prod.in b/requirements.prod.in index 7ff887ca..230eded1 100644 --- a/requirements.prod.in +++ b/requirements.prod.in @@ -3,3 +3,4 @@ slack-bolt sqlalchemy[postgresql_psycopgbinary] structlog + sentry-sdk diff --git a/requirements.prod.txt b/requirements.prod.txt index 9209f8de..a8cee471 100644 --- a/requirements.prod.txt +++ b/requirements.prod.txt @@ -7,7 +7,9 @@ certifi==2024.2.2 \ --hash=sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f \ --hash=sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1 - # via requests + # via + # requests + # sentry-sdk charset-normalizer==3.3.2 \ --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ --hash=sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087 \ @@ -241,6 +243,10 @@ requests==2.31.0 \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 # via -r requirements.prod.in +sentry-sdk==2.6.0 \ + --hash=sha256:422b91cb49378b97e7e8d0e8d5a1069df23689d45262b86f54988a7db264e874 \ + --hash=sha256:65cc07e9c6995c5e316109f138570b32da3bd7ff8d0d0ee4aaf2628c3dd8127d + # via -r requirements.prod.in slack-bolt==1.19.0 \ --hash=sha256:45135b8a1dea40abeb20b9b9d1973953f6755b76388156ee218d6c61d96f992a \ --hash=sha256:810891cc110e0fb3948f26c044302ed90abda2a25e9ec1689e179da8bb2747cf @@ -313,4 +319,6 @@ typing-extensions==4.11.0 \ urllib3==2.2.1 \ --hash=sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d \ --hash=sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19 - # via requests + # via + # requests + # sentry-sdk