Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignore PRs once repo is archived #45

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions metrics/github/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def _iter_pull_requests(org, date_range):
... on PullRequest {
createdAt
closedAt
mergedAt
author {
login
}
Expand All @@ -122,6 +123,7 @@ def _iter_pull_requests(org, date_range):
owner {
login
}
archivedAt
}
}
}
Expand All @@ -139,8 +141,10 @@ def _iter_pull_requests(org, date_range):
yield {
"created": date_from_iso(pr["createdAt"]),
"closed": date_from_iso(pr["closedAt"]),
"merged": date_from_iso(pr["mergedAt"]),
"author": pr["author"]["login"],
"repo": pr["repository"]["name"],
"repo_archived_at": date_from_iso(pr["repository"]["archivedAt"]),
"org": pr["repository"]["owner"]["login"],
}

Expand Down
208 changes: 149 additions & 59 deletions metrics/github/backfill.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import sqlite3
import subprocess
import json
import textwrap
from datetime import date, timedelta
from pathlib import Path

import click
import structlog
Expand All @@ -10,56 +9,152 @@
from ..timescaledb import TimescaleDBWriter
from ..timescaledb.tables import GitHubPullRequests
from ..tools.dates import date_from_iso, iter_days, previous_weekday
from .prs import process_prs
from .api import session
from .prs import drop_archived_prs, process_prs


setup_logging()

log = structlog.get_logger()


def get_data(db, org):
subprocess.check_call(["github-to-sqlite", "repos", db, org])

con = sqlite3.connect(db)
cur = con.cursor()
def get_query_page(*, query, session, cursor, **kwargs):
benbc marked this conversation as resolved.
Show resolved Hide resolved
"""
Get a page of the given query

result = cur.execute(
"SELECT name FROM repos WHERE full_name LIKE ?", (f"{org}%",)
).fetchall()
repo_names = [r[0] for r in result]
This uses the GraphQL API to avoid making O(N) calls to GitHub's (v3) REST
API. The passed cursor is a GraphQL cursor [1] allowing us to call this
function in a loop, passing in the responses cursor to advance our view of
the data.

for repo in repo_names:
subprocess.check_call(
["github-to-sqlite", "pull-requests", db, f"{org}/{repo}"]
[1]: https://graphql.org/learn/pagination/#end-of-list-counts-and-connections
"""
# use GraphQL variables to avoid string interpolation
variables = {"cursor": cursor, **kwargs}
payload = {"query": query, "variables": variables}

log.debug(query=query, **variables)
r = session.post("https://api.github.com/graphql", json=payload)

if not r.ok: # pragma: no cover
print(r.headers)
print(r.content)

r.raise_for_status()
results = r.json()

# In some cases graphql will return a 200 response when there are errors.
# https://sachee.medium.com/200-ok-error-handling-in-graphql-7ec869aec9bc
# Handling things robustly is complex and query specific, so here we simply
# take the absence of 'data' as an error, rather than the presence of
# 'errors' key.
if "data" not in results:
msg = textwrap.dedent(
f"""
graphql query failed

query:
{query}

response:
{json.dumps(results, indent=2)}
"""
)
raise RuntimeError(msg)

return results["data"]


def get_query(query, path, **kwargs):
def extract(data):
result = data
for key in path:
result = result[key]
return result

def get_prs(db):
sql = """
SELECT
date(pull_requests.created_at) as created,
date(pull_requests.closed_at) as closed,
date(pull_requests.merged_at) as merged,
authors.login as author,
repos.name as repo,
owners.login as org
FROM
pull_requests
LEFT OUTER JOIN repos ON (pull_requests.repo = repos.id)
LEFT OUTER JOIN users owners ON (repos.owner = owners.id)
LEFT OUTER JOIN users authors ON (pull_requests.user = authors.id)
WHERE
draft = 0
more_pages = True
cursor = None
while more_pages:
page = extract(
get_query_page(query=query, session=session, cursor=cursor, **kwargs)
)
yield from page["nodes"]
more_pages = page["pageInfo"]["hasNextPage"]
cursor = page["pageInfo"]["endCursor"]


def iter_repos(org):
query = """
query repos($cursor: String, $org: String!) {
organization(login: $org) {
repositories(first: 100, after: $cursor) {
nodes {
name
archivedAt
}
pageInfo {
endCursor
hasNextPage
}
}
}
}
"""
for repo in get_query(query, path=["organization", "repositories"], org=org):
yield {
"name": repo["name"],
"archived": repo["archivedAt"],
}


def iter_repo_prs(org, repo):
query = """
query prs($cursor: String, $org: String!, $repo: String!) {
organization(login: $org) {
repository(name: $repo) {
pullRequests(first: 100, after: $cursor) {
nodes {
author {
login
}
number
createdAt
closedAt
mergedAt
}
pageInfo {
endCursor
hasNextPage
}
}
}
}
}
"""
con = sqlite3.connect(db)
con.row_factory = sqlite3.Row
cur = con.cursor()
return list(cur.execute(sql))
for pr in get_query(
query,
path=["organization", "repository", "pullRequests"],
org=org,
repo=repo["name"],
):
yield {
"org": org,
"repo": repo["name"],
"repo_archived_at": date_from_iso(repo["archived"]),
"author": pr["author"]["login"],
"created": date_from_iso(pr["createdAt"]),
"closed": date_from_iso(pr["closedAt"]),
"merged": date_from_iso(pr["mergedAt"]),
}


def iter_prs(org):
for repo in iter_repos(org):
yield from iter_repo_prs(org, repo)


def open_prs(prs, org, days_threshold):
earliest = date_from_iso(min([pr["created"] for pr in prs]))
earliest = min([pr["created"] for pr in prs])
start = previous_weekday(earliest, 0) # Monday
mondays = list(iter_days(start, date.today(), step=timedelta(days=7)))

Expand All @@ -73,8 +168,11 @@ def open_on_day(pr, start, end):
Checks whether a PR is open today and if it's been open for greater or
equal to the threshold of days.
"""
closed = date_from_iso(pr["closed"]) or today
opened = date_from_iso(pr["created"])
closed = pr["closed"] or today
opened = pr["created"]

if pr["repo_archived_at"] and start > pr["repo_archived_at"]:
return False

open_today = (opened <= start) and (closed >= end)
if not open_today:
Expand All @@ -100,40 +198,32 @@ def open_on_day(pr, start, end):


def pr_throughput(prs, org):
start = date_from_iso(min([pr["created"] for pr in prs]))
start = min([pr["created"] for pr in prs])
days = list(iter_days(start, date.today()))

with TimescaleDBWriter(GitHubPullRequests) as writer:
for day in days:
opened_prs = [pr for pr in prs if date_from_iso(pr["created"]) == day]
opened_prs = [pr for pr in prs if pr["created"] == day]
log.info("%s | %s | Processing %s opened PRs", day, org, len(opened_prs))
process_prs(writer, opened_prs, day, name="prs_opened")

merged_prs = [
pr for pr in prs if pr["merged"] and date_from_iso(pr["merged"]) == day
]
merged_prs = [pr for pr in prs if pr["merged"] and pr["merged"] == day]
merged_prs = drop_archived_prs(merged_prs, key="merged")
log.info("%s | %s | Processing %s merged PRs", day, org, len(merged_prs))
process_prs(writer, merged_prs, day, name="prs_merged")


@click.command()
@click.argument("org")
@click.option("--pull-data", is_flag=True, default=False)
@click.option("--db-path", type=str, default="github.db")
@click.pass_context
def backfill(ctx, org, pull_data, db_path):
def backfill(ctx, org):
"""Backfill GitHub data for the given GitHub ORG"""
if pull_data:
# clean up existing db
Path(db_path).unlink(missing_ok=True)

# pull all data down to make backfilling quicker
get_data(db_path, org)

prs = get_prs(db_path)
prs = list(iter_prs(org))

org_prs = [pr for pr in prs if pr["org"] == org]
log.info("Backfilling with %s PRs for %s", len(org_prs), org)
# remove all PRs opened after a repo was archived, we don't expect to ever
# want these.
prs = drop_archived_prs(prs)
log.info("Backfilling with %s PRs for %s", len(prs), org)

open_prs(org_prs, org, days_threshold=7)
pr_throughput(org_prs, org)
open_prs(prs, org, days_threshold=7)
pr_throughput(prs, org)
5 changes: 4 additions & 1 deletion metrics/github/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from ..tools.dates import previous_weekday
from . import api
from .backfill import backfill
from .prs import process_prs
from .prs import drop_archived_prs, process_prs


log = structlog.get_logger()
Expand Down Expand Up @@ -51,6 +51,7 @@ def open_prs(ctx, org, date, days_threshold):
for pr in prs
if (pr["closed"] - pr["created"]) >= timedelta(days=days_threshold)
]
open_prs = drop_archived_prs(open_prs)

log.info("%s | %s | Processing %s PRs", date, org, len(open_prs))
with TimescaleDBWriter(GitHubPullRequests) as writer:
Expand All @@ -69,10 +70,12 @@ def pr_throughput(ctx, org, date):

with TimescaleDBWriter(GitHubPullRequests) as writer:
opened_prs = api.prs_opened_on_date(org, date)
opened_prs = drop_archived_prs(opened_prs)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see the need for this filter. There won't be any newly-opened PRs for an archived repo, will there?

log.info("%s | %s | Processing %s opened PRs", date, org, len(opened_prs))
process_prs(writer, opened_prs, date, name="prs_opened")

merged_prs = api.prs_merged_on_date(org, date)
merged_prs = drop_archived_prs(merged_prs, key="merged")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see the need for this filter. There won't be any newly-merged PRs for an archived repo, will there?

log.info("%s | %s | Processing %s merged PRs", date, org, len(merged_prs))
process_prs(writer, merged_prs, date, name="prs_merged")

Expand Down
20 changes: 20 additions & 0 deletions metrics/github/prs.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,23 @@
def drop_archived_prs(prs, key="created"):
"""
Drop PRs where the given key happened before the repo's archival date

By default this removes PRs opened after the point at which a repo was
archived, but can be configured for other checks, eg merging.
"""

def predicate(pr, key):
if not pr["repo_archived_at"]:
return True

if pr[key] < pr["repo_archived_at"]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this can be right. It's keeping all PRs that were created/merged before the repo was archived, which will be all of them. Don't we need instead to compare the archive date with the "current" date (where "current" scans through time in back-filling)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

argh, yep, you're completely right

return True

return False

return [pr for pr in prs if predicate(pr, key)]


def process_prs(writer, prs, date, name=""):
"""
Given a list of PRs, break them down in series for writing
Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ classifiers = [
requires-python = ">=3.11"
dependencies = [
"click",
"github-to-sqlite",
"greenlet",
"requests",
"slack-bolt",
Expand All @@ -40,7 +39,7 @@ source = [
]

[tool.coverage.report]
fail_under = 69
fail_under = 67
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what the point is of asserting on coverage if we're just going to keep bumping the threshold down. It may not be worth writing tests for full coverage, but in that case we don't get any value from measuring it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this is the worst part of <100% coverage. My next PR bumps it up to 73 if that helps…?

skip_covered = true
show_missing = true

Expand Down
Loading