Skip to content

Commit

Permalink
Add the backfill.py module for GitHub to the GitHub subcommand
Browse files (browse the repository at this point in the history)
  • Loading branch information
ghickman committed Nov 7, 2023
1 parent 37ebaf2 commit 7e31bd8
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 29 deletions.
58 changes: 29 additions & 29 deletions backfill.py → metrics/github/backfill.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import sqlite3
import subprocess
import sys
from datetime import date, timedelta
from pathlib import Path

import click
import structlog

from metrics.github.prs import process_prs
Expand All @@ -17,25 +17,24 @@
log = structlog.get_logger()


def get_data(db, orgs):
subprocess.check_call(["github-to-sqlite", "repos", db, *orgs])
def get_data(db, org):
subprocess.check_call(["github-to-sqlite", "repos", db, org])

con = sqlite3.connect(db)
cur = con.cursor()

for org in orgs:
result = cur.execute(
"SELECT name FROM repos WHERE full_name LIKE ?", (f"{org}%",)
).fetchall()
repo_names = [r[0] for r in result]
result = cur.execute(
"SELECT name FROM repos WHERE full_name LIKE ?", (f"{org}%",)
).fetchall()
repo_names = [r[0] for r in result]

for repo in repo_names:
subprocess.check_call(
["github-to-sqlite", "pull-requests", db, f"{org}/{repo}"]
)
for repo in repo_names:
subprocess.check_call(
["github-to-sqlite", "pull-requests", db, f"{org}/{repo}"]
)


def get_prs():
def get_prs(db):
sql = """
SELECT
date(pull_requests.created_at) as created,
Expand Down Expand Up @@ -121,30 +120,31 @@ def next_weekday(d, weekday):
process_prs(writer, prs_in_range, day)


if __name__ == "__main__":
@click.command()
@click.argument("org")
@click.option("--pull-data", is_flag=True, default=False)
@click.pass_context
def backfill(ctx, pull_data, org):
"""Backfill GitHub data for the given GitHub ORG"""
db = "github.db"
orgs = ["ebmdatalab", "opensafely-core"]

# hacky switch for [re-]building the local SQLite
args = sys.argv[1:]
if args and args[0] == "--pull-data":
if pull_data:
# clean up existing db
Path(db).unlink(missing_ok=True)

# pull all data down to make backfilling quicker
get_data(db, orgs)
get_data(db, org)

prs = get_prs()
prs = get_prs(db)

for org in orgs:
org_prs = [pr for pr in prs if pr["org"] == org]
log.info("Backfilling with %s PRs for %s", len(org_prs), org)
start_date = min([pr["created"] for pr in org_prs])
start_date = datetime_from_iso(start_date).date()
org_prs = [pr for pr in prs if pr["org"] == org]
log.info("Backfilling with %s PRs for %s", len(org_prs), org)
start_date = min([pr["created"] for pr in org_prs])
start_date = datetime_from_iso(start_date).date()

pr_queue(org_prs, org, start_date)
pr_queue(org_prs, org, start_date)

for day in [2, 10, 30, 60]:
pr_queue(org_prs, org, start_date, days_threshold=day)
for day in [2, 10, 30, 60]:
pr_queue(org_prs, org, start_date, days_threshold=day)

pr_throughput(org_prs, org, start_date)
pr_throughput(org_prs, org, start_date)
4 changes: 4 additions & 0 deletions metrics/github/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from ..timescaledb import TimescaleDBWriter
from . import api
from .backfill import backfill
from .prs import process_prs


Expand Down Expand Up @@ -60,3 +61,6 @@ def pr_throughput(ctx, org, date, days):
log.info("%s | %s | Processing %s PRs", date, org, len(prs))
with TimescaleDBWriter("github_pull_requests", "throughput") as writer:
process_prs(writer, prs, date)


github.add_command(backfill)

0 comments on commit 7e31bd8

Please sign in to comment.