Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch storage over to timescaledb #9

Merged
merged 1 commit into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions backfill.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

import structlog

from metrics import influxdb
from metrics.github.prs import process_prs
from metrics.logs import setup_logging
from metrics.timescaledb import TimescaleDBWriter
from metrics.tools.dates import date_from_iso, datetime_from_iso, iter_days


Expand All @@ -17,9 +17,6 @@
log = structlog.get_logger()


writer = influxdb.write


def get_data(db, orgs):
subprocess.check_call(["github-to-sqlite", "repos", db, *orgs])

Expand Down Expand Up @@ -83,7 +80,8 @@ def pr_queue(prs, org, start, days_threshold=None):
key = f"queue{suffix}"

log.info("%s | %s | %s | Processing %s PRs", key, day, org, len(prs_on_day))
process_prs(writer, key, prs_on_day, day)
with TimescaleDBWriter("github_pull_requests", f"queue{suffix}") as writer:
process_prs(writer, prs_on_day, day)


def pr_throughput(prs, org, start):
Expand Down Expand Up @@ -119,7 +117,8 @@ def next_weekday(d, weekday):

key = "throughput"
log.info("%s | %s | %s | Processing %s PRs", key, day, org, len(prs_in_range))
process_prs(writer, key, prs_in_range, day)
with TimescaleDBWriter("github_pull_requests", "throughput") as writer:
process_prs(writer, prs_in_range, day)


if __name__ == "__main__":
Expand Down
16 changes: 1 addition & 15 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,12 @@ services:
GF_DATABASE_SSL_MODE: disable
depends_on:
- db
- influxdb
- timescaledb
ports:
- 3000:3000
volumes:
- grafana:/var/lib/grafana

influxdb:
image: influxdb:latest
environment:
DOCKER_INFLUXDB_INIT_MODE: setup
DOCKER_INFLUXDB_INIT_USERNAME: admin
DOCKER_INFLUXDB_INIT_PASSWORD: admin
DOCKER_INFLUXDB_INIT_ORG: bennett
DOCKER_INFLUXDB_INIT_BUCKET: data
ports:
- 8086:8086
volumes:
- influxdb:/var/lib/influxdb2

timescaledb:
image: timescale/timescaledb-ha:pg14-latest
environment:
Expand All @@ -52,5 +39,4 @@ services:
volumes:
postgres:
grafana:
influxdb:
timescaledb:
4 changes: 3 additions & 1 deletion metrics/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@

@click.group()
@click.option("--debug", default=False, is_flag=True)
@click.option("--database-url", required=True, envvar="DATABASE_URL")
@click.pass_context
def cli(ctx, debug):
def cli(ctx, debug, database_url):
ctx.ensure_object(dict)

setup_logging(debug)

ctx.obj["DEBUG"] = debug
ctx.obj["DATABASE_URL"] = database_url


cli.add_command(github)
Expand Down
9 changes: 5 additions & 4 deletions metrics/github/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,12 @@
import click
import structlog

from .. import influxdb
from ..timescaledb import TimescaleDBWriter
from . import api
from .prs import process_prs


log = structlog.get_logger()
writer = influxdb.write


@click.group()
Expand Down Expand Up @@ -42,7 +41,8 @@ def pr_queue(ctx, org, date, days_threshold):
suffix = f"_older_than_{days_threshold}_days" if days_threshold else ""

log.info("%s | %s | Processing %s PRs", date, org, len(prs))
process_prs(writer, f"queue{suffix}", prs, date)
with TimescaleDBWriter("github_pull_requests", f"queue{suffix}") as writer:
process_prs(writer, prs, date)


@github.command()
Expand All @@ -58,4 +58,5 @@ def pr_throughput(ctx, org, date, days):
prs = api.prs_opened_in_the_last_N_days(org, start, end)

log.info("%s | %s | Processing %s PRs", date, org, len(prs))
process_prs(writer, "throughput", prs, date)
with TimescaleDBWriter("github_pull_requests", "throughput") as writer:
process_prs(writer, prs, date)
13 changes: 5 additions & 8 deletions metrics/github/prs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
def process_prs(writer, key, prs, date):
def process_prs(writer, prs, date):
"""
Given a list of PRs, break them down in series for writing

Expand All @@ -20,13 +20,10 @@ def process_prs(writer, key, prs, date):

org = list(orgs)[0]

writer(
f"github_pull_requests_{key}",
writer.write(
date,
len(prs_by_author_and_repo),
tags={
"author": author,
"organisation": org,
"repo": repo,
},
author=author,
organisation=org,
repo=repo,
)
65 changes: 0 additions & 65 deletions metrics/influxdb.py

This file was deleted.

18 changes: 6 additions & 12 deletions metrics/slack/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,10 @@

import click

from .. import influxdb
from ..timescaledb import TimescaleDBWriter
from .api import get_app, iter_messages


writer = influxdb.write


@click.group()
@click.option("--signing-secret", required=True, envvar="SLACK_SIGNING_SECRET")
@click.option("--token", required=True, envvar="SLACK_TOKEN")
Expand Down Expand Up @@ -38,11 +35,8 @@ def tech_support(ctx, date, tech_support_channel_id, backfill):

messages = iter_messages(app, tech_support_channel_id, date=day)

for date, messages in itertools.groupby(
messages, lambda m: datetime.fromtimestamp(float(m["ts"])).date()
):
writer(
"slack_tech_support_requests",
date,
len(list(messages)),
)
with TimescaleDBWriter("slack_tech_support", "requests") as writer:
for date, messages in itertools.groupby(
messages, lambda m: datetime.fromtimestamp(float(m["ts"])).date()
):
writer.write(date, len(list(messages)))
39 changes: 0 additions & 39 deletions metrics/timescaledb.py

This file was deleted.

6 changes: 6 additions & 0 deletions metrics/timescaledb/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Public package interface for metrics.timescaledb: expose only the writer.
from .writer import TimescaleDBWriter


__all__ = ["TimescaleDBWriter"]
19 changes: 19 additions & 0 deletions metrics/timescaledb/tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
github_pull_requests = """
CREATE TABLE IF NOT EXISTS github_pull_requests (
time TIMESTAMP WITH TIME ZONE NOT NULL,
name TEXT NOT NULL,
value INTEGER NOT NULL,
author TEXT NOT NULL,
organisation TEXT NOT NULL,
repo TEXT NOT NULL,
CONSTRAINT github_pull_requests_must_be_different UNIQUE (time, name, author, repo)
);
"""
slack_tech_support = """
CREATE TABLE IF NOT EXISTS slack_tech_support (
time TIMESTAMP WITH TIME ZONE NOT NULL,
name TEXT NOT NULL,
value INTEGER NOT NULL,
CONSTRAINT slack_tech_support_must_be_different UNIQUE (time, name)
);
"""
76 changes: 76 additions & 0 deletions metrics/timescaledb/writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import os
from datetime import datetime, time

import psycopg
import structlog

from . import tables


log = structlog.get_logger()

# Read at import time: importing this module raises KeyError if DATABASE_URL
# is unset.  NOTE(review): metrics/cli.py also accepts a --database-url
# option (envvar DATABASE_URL) and stores it on the click context, but that
# value is not threaded through to here — confirm which is authoritative.
DATABASE_URL = os.environ["DATABASE_URL"]


def ensure_table(name):
    """Create the named table (if missing) and register it as a hypertable.

    The DDL for each table lives as a module-level string in ``tables``,
    keyed by the table name; ``create_hypertable`` is a no-op when the
    hypertable already exists.
    """
    ddl = getattr(tables, name)
    run(ddl)

    hypertable_sql = "SELECT create_hypertable(%s, 'time', if_not_exists => TRUE);"
    run(hypertable_sql, [name])


def run(sql, *args):
    """Execute ``sql`` against the configured database in a fresh connection.

    A new connection is opened per statement; the ``with`` block commits on
    success (rolls back on error) and closes the connection.  Because the
    connection is closed before this function returns, the returned cursor
    cannot be used to fetch rows — callers in this module run DDL/DML only
    and ignore the return value.
    """
    with psycopg.connect(DATABASE_URL) as conn:
        # Connection.execute creates a cursor, runs the statement and returns
        # that cursor — the original created a cursor it never closed.
        return conn.execute(sql, *args)


class TimescaleDBWriter:
    """Context manager that upserts metric rows into a TimescaleDB table.

    On ``__enter__`` the target table (and its hypertable registration) is
    created if missing; ``write`` then inserts one ``(time, name, value,
    *extra columns)`` row per call, bumping ``value`` on conflict with the
    table's ``{table}_must_be_different`` unique constraint.
    """

    def __init__(self, table, key):
        # The table name is interpolated into SQL statements, so refuse
        # anything that is not a plain identifier.
        if not table.isidentifier():
            raise ValueError(f"invalid table name: {table!r}")

        self.key = key
        self.table = table

    def __enter__(self):
        # make sure both the table and its hypertable config exist before
        # any writes happen
        ensure_table(self.table)

        return self

    def __exit__(self, *args):
        # run() opens and closes a connection per statement, so there is no
        # connection state to clean up here
        pass

    def write(self, date, value, **kwargs):
        """Upsert a single row at midnight of ``date`` for this writer's key.

        Extra keyword arguments become extra column values (e.g. author,
        organisation, repo for github_pull_requests); their names must be
        valid identifiers because they are interpolated into the SQL.
        """
        # convert date to a timestamp
        # TODO: do we need to do any checking to make sure this is tz-aware and in
        # UTC?
        dt = datetime.combine(date, time())

        # Column names cannot be parameterized, so they are interpolated into
        # the statement below — validate them first to avoid broken (or
        # injectable) SQL.
        for name in kwargs:
            if not name.isidentifier():
                raise ValueError(f"invalid column name: {name!r}")

        extra_fields = "".join(f", {k}" for k in kwargs)
        placeholders = ", %s" * len(kwargs)

        # rows are unique per the table's `{name}_must_be_different`
        # constraint and we always want to bump the value on conflict
        sql = f"""
        INSERT INTO {self.table} (time, name, value {extra_fields})
        VALUES (%s, %s, %s {placeholders})
        ON CONFLICT ON CONSTRAINT {self.table}_must_be_different DO UPDATE SET value = EXCLUDED.value;
        """

        run(sql, (dt, self.key, value, *kwargs.values()))

        log.debug(
            self.key,
            date=dt.isoformat(),
            value=value,
            **kwargs,
        )
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ requires-python = ">=3.11"
dependencies = [
"click",
"github-to-sqlite",
"influxdb-client",
"requests",
"psycopg[binary]",
"slack-bolt",
Expand Down
Loading