From fd862edea07ee1c2049d07a6c5bd76bfd3af9dea Mon Sep 17 00:00:00 2001 From: George Hickman Date: Tue, 7 Nov 2023 15:21:31 +0000 Subject: [PATCH] Add the backfill.py module for GitHub to the GitHub subcommand --- backfill.py => metrics/github/backfill.py | 58 +++++++++++------------ metrics/github/cli.py | 4 ++ 2 files changed, 33 insertions(+), 29 deletions(-) rename backfill.py => metrics/github/backfill.py (74%) diff --git a/backfill.py b/metrics/github/backfill.py similarity index 74% rename from backfill.py rename to metrics/github/backfill.py index ff8ccd38..b2566627 100644 --- a/backfill.py +++ b/metrics/github/backfill.py @@ -1,9 +1,9 @@ import sqlite3 import subprocess -import sys from datetime import date, timedelta from pathlib import Path +import click import structlog from metrics.github.prs import process_prs @@ -17,25 +17,24 @@ log = structlog.get_logger() -def get_data(db, orgs): - subprocess.check_call(["github-to-sqlite", "repos", db, *orgs]) +def get_data(db, org): + subprocess.check_call(["github-to-sqlite", "repos", db, org]) con = sqlite3.connect(db) cur = con.cursor() - for org in orgs: - result = cur.execute( - "SELECT name FROM repos WHERE full_name LIKE ?", (f"{org}%",) - ).fetchall() - repo_names = [r[0] for r in result] + result = cur.execute( + "SELECT name FROM repos WHERE full_name LIKE ?", (f"{org}%",) + ).fetchall() + repo_names = [r[0] for r in result] - for repo in repo_names: - subprocess.check_call( - ["github-to-sqlite", "pull-requests", db, f"{org}/{repo}"] - ) + for repo in repo_names: + subprocess.check_call( + ["github-to-sqlite", "pull-requests", db, f"{org}/{repo}"] + ) -def get_prs(): +def get_prs(db): sql = """ SELECT date(pull_requests.created_at) as created, @@ -121,30 +120,31 @@ def next_weekday(d, weekday): process_prs(writer, prs_in_range, day) -if __name__ == "__main__": +@click.command() +@click.argument("org") +@click.option("--pull-data", is_flag=True, default=False) +@click.pass_context +def backfill(ctx, pull_data, org): + """Backfill GitHub data for the given GitHub ORG""" db = "github.db" - orgs = ["ebmdatalab", "opensafely-core"] - # hacky switch for [re-]building the local SQLite - args = sys.argv[1:] - if args and args[0] == "--pull-data": + if pull_data: # clean up existing db Path(db).unlink(missing_ok=True) # pull all data down to make backfilling quicker - get_data(db, orgs) + get_data(db, org) - prs = get_prs() + prs = get_prs(db) - for org in orgs: - org_prs = [pr for pr in prs if pr["org"] == org] - log.info("Backfilling with %s PRs for %s", len(org_prs), org) - start_date = min([pr["created"] for pr in org_prs]) - start_date = datetime_from_iso(start_date).date() + org_prs = [pr for pr in prs if pr["org"] == org] + log.info("Backfilling with %s PRs for %s", len(org_prs), org) + start_date = min([pr["created"] for pr in org_prs]) + start_date = datetime_from_iso(start_date).date() - pr_queue(org_prs, org, start_date) + pr_queue(org_prs, org, start_date) - for day in [2, 10, 30, 60]: - pr_queue(org_prs, org, start_date, days_threshold=day) + for day in [2, 10, 30, 60]: + pr_queue(org_prs, org, start_date, days_threshold=day) - pr_throughput(org_prs, org, start_date) + pr_throughput(org_prs, org, start_date) diff --git a/metrics/github/cli.py b/metrics/github/cli.py index bdd8c4c6..a18c48d3 100644 --- a/metrics/github/cli.py +++ b/metrics/github/cli.py @@ -5,6 +5,7 @@ from ..timescaledb import TimescaleDBWriter from . import api +from .backfill import backfill from .prs import process_prs @@ -60,3 +61,6 @@ def pr_throughput(ctx, org, date, days): log.info("%s | %s | Processing %s PRs", date, org, len(prs)) with TimescaleDBWriter("github_pull_requests", "throughput") as writer: process_prs(writer, prs, date) + + +github.add_command(backfill)