Skip to content

Commit

Permalink
Merge pull request #159 from ebmdatalab/benbc/content-prs
Browse files Browse the repository at this point in the history
Distinguish content PRs
  • Loading branch information
benbc authored Apr 9, 2024
2 parents 2faf47d + 094b718 commit 0330763
Show file tree
Hide file tree
Showing 11 changed files with 280 additions and 43 deletions.
45 changes: 42 additions & 3 deletions metrics/github/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,22 @@
# Slugs (not names!) of the GitHub entities we're interested in
_TECH_TEAMS = ["team-rap", "team-rex", "tech-shared"]
_ORGS = ["ebmdatalab", "opensafely-core"]
# Some repos (for example the websites) frequently have PRs created by people
# outside the tech teams. We don't want our delivery metrics to be skewed by these
# and we don't necessarily want to hold people in other teams to the same hygiene
# standards as we hold ourselves. So we treat them differently. (We don't want to
# exclude such PRs from other repos because it's definitely interesting if there are
# old PRs (or lots of PRs) created by non-tech-team members in those repos.)
_CONTENT_REPOS = {
"ebmdatalab": [
"opensafely.org",
"team-manual",
"bennett.ox.ac.uk",
"openprescribing",
]
}
_MANAGERS = {"sebbacon", "benbc"}
_EX_DEVELOPERS = {"ghickman", "milanwiedemann", "CarolineMorton"}


@dataclass(frozen=True)
Expand All @@ -23,6 +39,10 @@ class Repo:
def is_tech_owned(self):
return self.team in _TECH_TEAMS

@property
def is_content_repo(self):
    """Whether this repo is a designated "content" repo — one whose PRs are
    frequently raised by people outside the tech teams (see _CONTENT_REPOS).
    """
    # Single lookup with a default instead of a membership test followed by
    # indexing; behaviour is unchanged for orgs absent from _CONTENT_REPOS.
    return self.name in _CONTENT_REPOS.get(self.org, ())

@classmethod
def from_dict(cls, data, org, team):
return cls(
Expand All @@ -42,6 +62,7 @@ class PR:
created_on: datetime.date
merged_on: datetime.date
closed_on: datetime.date
is_content: bool

def was_old_on(self, date):
opened = self.created_on
Expand All @@ -56,13 +77,17 @@ def was_merged_on(self, date):
return self.merged_on and date == self.merged_on

@classmethod
def from_dict(cls, data, repo):
def from_dict(cls, data, repo, tech_team_members):
author = data["author"]["login"]
is_content = repo.is_content_repo and author not in tech_team_members

return cls(
repo,
data["author"]["login"],
author,
date_from_iso(data["createdAt"]),
date_from_iso(data["mergedAt"]),
date_from_iso(data["closedAt"]),
is_content,
)


Expand All @@ -84,8 +109,9 @@ def from_dict(cls, data, repo):


def tech_prs():
tech_team_members = _tech_team_members()
return [
PR.from_dict(pr, repo)
PR.from_dict(pr, repo, tech_team_members)
for repo in tech_repos()
for pr in query.prs(repo.org, repo.name)
]
Expand Down Expand Up @@ -119,3 +145,16 @@ def _get_repos():

def _repo_owners(org):
    """Map each repo name owned by a tech team in `org` to its owning team."""
    owners = {}
    for team in _TECH_TEAMS:
        for repo in query.team_repos(org, team):
            owners[repo] = team
    return owners


def _tech_team_members():
    """Everyone we count as a tech-team member: the managers, the
    ex-developers and the current members of every tech team in each org."""
    members = set(_MANAGERS)
    members |= _EX_DEVELOPERS
    for org in _ORGS:
        for team in _TECH_TEAMS:
            members.update(query.team_members(org, team))
    return members
5 changes: 3 additions & 2 deletions metrics/github/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ def calculate_pr_counts(prs, predicate):
end = pr.closed_on if pr.closed_on else datetime.date.today()
for day in iter_days(start, end):
if predicate(pr, day):
counts[(pr.repo.org, pr.repo.name, pr.author, day)] += 1
counts[(pr.repo.org, pr.repo.name, pr.author, pr.is_content, day)] += 1
return dict(counts)


def convert_pr_counts_to_metrics(counts, name):
metrics = []
for coord, count in counts.items():
org, repo, author, date_ = coord
org, repo, author, is_content, date_ = coord
timestamp = datetime.datetime.combine(date_, datetime.time())
metrics.append(
{
Expand All @@ -38,6 +38,7 @@ def convert_pr_counts_to_metrics(counts, name):
"organisation": org,
"repo": repo,
"author": author,
"is_content": is_content,
"value": count,
}
)
Expand Down
12 changes: 10 additions & 2 deletions metrics/github/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,19 @@ def team_repos(org, team):
"""The API doesn't make it easy for us to get all the information we need about repos in
one place, so we just return a list of repos here and join that to the richer repo objects
in the caller."""
results = _client().rest_query("/orgs/{org}/teams/{team}/repos", org=org, team=team)
for repo in results:
repos = _client().rest_query("/orgs/{org}/teams/{team}/repos", org=org, team=team)
for repo in repos:
yield repo["name"]


def team_members(org, team):
    """Yield the GitHub login of every member of the given team."""
    path = "/orgs/{org}/teams/{team}/members"
    for member in _client().rest_query(path, org=org, team=team):
        yield member["login"]


def vulnerabilities(org, repo):
query = """
query vulnerabilities($cursor: String, $org: String!, $repo: String!) {
Expand Down
1 change: 1 addition & 0 deletions metrics/timescaledb/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
Column("name", Text, primary_key=True),
Column("value", Integer),
Column("author", Text, primary_key=True),
Column("is_content", Boolean, primary_key=True),
Column("organisation", Text, primary_key=True),
Column("repo", Text, primary_key=True),
)
Expand Down
61 changes: 61 additions & 0 deletions queries/current-old-prs-content.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
WITH
last_saturday AS (
    -- most recent Saturday; extract(dow ...) is 0 for Sunday, hence the +1 shift
    SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
old_prs_only AS (
    SELECT time, organisation, repo, author, is_content, value
    FROM github_pull_requests
    WHERE name = 'queue_older_than_7_days'
),
in_timeframe AS (
    SELECT time, organisation, repo, author, is_content, value
    FROM old_prs_only
    WHERE $__timeFilter(time)
),
fields_munged AS (
    -- the time field is a timestamp, but we only ever write midnight;
    -- we need to keep it as a timestamp type for bucketing below
    SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs
    FROM in_timeframe
),
dependabot_removed AS (
    SELECT day, repo, author, is_content, num_prs
    FROM fields_munged
    WHERE author NOT LIKE 'dependabot%'
),
content_only AS (
    -- keep just the content PRs; this CTE was previously misnamed
    -- "content_ignored" (copied from the humans variant, which drops them)
    SELECT day, repo, author, num_prs
    FROM dependabot_removed
    WHERE is_content
),
authors_aggregated AS (
    SELECT day, repo, sum(num_prs) as num_prs
    FROM content_only
    GROUP BY day, repo
),
partial_week_ignored AS (
    SELECT day, repo, num_prs
    FROM authors_aggregated, last_saturday
    WHERE day < last_saturday.the_date
),
last_week_only AS (
    SELECT day, repo, num_prs
    FROM partial_week_ignored, last_saturday
    WHERE day >= last_saturday.the_date - 7
),
bucketed_in_weeks AS (
    SELECT
        -- label buckets with the (exclusive) end date;
        -- the 'origin' argument can be _any_ Saturday
        time_bucket('1 week', day, last_saturday.the_date) + '7 days' as bucket,
        repo,
        -- aggregate by taking the last value because this is a gauge, not a count
        last(num_prs, day) as num_prs
    FROM
        last_week_only, last_saturday
    GROUP BY bucket, repo
)
SELECT repo as "Repo", num_prs AS "Old PRs"
FROM bucketed_in_weeks
ORDER BY bucket DESC, num_prs DESC
LIMIT 5
14 changes: 7 additions & 7 deletions queries/current-old-prs-humans.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,34 @@ WITH
SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
old_prs_only AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM github_pull_requests
WHERE name = 'queue_older_than_7_days'
),
in_timeframe AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM old_prs_only
WHERE $__timeFilter(time)
),
fields_munged AS (
-- the time field is a timestamp, but we only ever write midnight;
-- we need to keep it as a timestamp type for bucketing below
SELECT time as day, organisation||'/'||repo AS repo, author, value as num_prs
SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs
FROM in_timeframe
),
dependabot_removed AS (
SELECT day, repo, author, num_prs
SELECT day, repo, author, is_content, num_prs
FROM fields_munged
WHERE author NOT LIKE 'dependabot%'
),
non_dev_op_ignored AS (
content_ignored AS (
SELECT day, repo, author, num_prs
FROM dependabot_removed
WHERE NOT (repo = 'ebmdatalab/openprescribing' AND author IN ('richiecroker', 'chrisjwood16'))
WHERE NOT is_content
),
authors_aggregated AS (
SELECT day, repo, sum(num_prs) as num_prs
FROM non_dev_op_ignored
FROM content_ignored
GROUP BY day, repo
),
partial_week_ignored AS (
Expand Down
55 changes: 55 additions & 0 deletions queries/old-prs-content.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
WITH
last_saturday AS (
    -- most recent Saturday; extract(dow ...) is 0 for Sunday, hence the +1 shift
    SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
relevant_rows AS (
    -- old-PR gauge rows inside the dashboard's selected time range
    SELECT time, organisation, repo, author, is_content, value
    FROM github_pull_requests
    WHERE name = 'queue_older_than_7_days'
      AND $__timeFilter(time)
),
renamed AS (
    -- the time field is a timestamp, but we only ever write midnight;
    -- keep it as a timestamp type so it can be bucketed below
    SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs
    FROM relevant_rows
),
content_prs AS (
    -- drop bot-authored PRs and keep only the content ones
    SELECT day, repo, author, num_prs
    FROM renamed
    WHERE author NOT LIKE 'dependabot%' AND is_content
),
per_repo AS (
    SELECT day, repo, sum(num_prs) as num_prs
    FROM content_prs
    GROUP BY day, repo
),
full_weeks AS (
    -- ignore the current (partial) week
    SELECT day, repo, num_prs
    FROM per_repo, last_saturday
    WHERE day < last_saturday.the_date
),
weekly AS (
    SELECT
        -- label buckets with the (exclusive) end date;
        -- the 'origin' argument can be _any_ Saturday
        time_bucket('1 week', day, last_saturday.the_date) + '7 days' as bucket,
        repo,
        -- take the last value because this is a gauge, not a count
        last(num_prs, day) as num_prs
    FROM full_weeks, last_saturday
    GROUP BY bucket, repo
)
SELECT bucket, repo, num_prs
FROM weekly
ORDER BY bucket DESC, repo
14 changes: 7 additions & 7 deletions queries/old-prs-humans.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,34 @@ WITH
SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
old_prs_only AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM github_pull_requests
WHERE name = 'queue_older_than_7_days'
),
in_timeframe AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM old_prs_only
WHERE $__timeFilter(time)
),
fields_munged AS (
-- the time field is a timestamp, but we only ever write midnight;
-- we need to keep it as a timestamp type for bucketing below
SELECT time as day, organisation||'/'||repo AS repo, author, value as num_prs
SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs
FROM in_timeframe
),
dependabot_removed AS (
SELECT day, repo, author, num_prs
SELECT day, repo, author, is_content, num_prs
FROM fields_munged
WHERE author NOT LIKE 'dependabot%'
),
non_dev_op_ignored AS (
content_ignored AS (
SELECT day, repo, author, num_prs
FROM dependabot_removed
WHERE NOT (repo = 'ebmdatalab/openprescribing' AND author IN ('richiecroker', 'chrisjwood16'))
WHERE NOT is_content
),
authors_aggregated AS (
SELECT day, repo, sum(num_prs) as num_prs
FROM non_dev_op_ignored
FROM content_ignored
GROUP BY day, repo
),
partial_week_ignored AS (
Expand Down
14 changes: 7 additions & 7 deletions queries/throughput-humals.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,34 @@ WITH
SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
throughputs_only AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM github_pull_requests
WHERE name = 'prs_merged'
),
in_timeframe AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM throughputs_only
WHERE $__timeFilter(time)
),
fields_munged AS (
-- the time field is a timestamp, but we only ever write midnight;
-- we need to keep it as a timestamp type for bucketing below
SELECT time as day, organisation||'/'||repo AS repo, author, value as throughput
SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as throughput
FROM in_timeframe
),
dependabot_removed AS (
SELECT day, repo, author, throughput
SELECT day, repo, author, is_content, throughput
FROM fields_munged
WHERE author NOT LIKE 'dependabot%'
),
non_dev_op_ignored AS (
content_ignored AS (
SELECT day, repo, author, throughput
FROM dependabot_removed
WHERE NOT (repo = 'ebmdatalab/openprescribing' AND author IN ('richiecroker', 'chrisjwood16'))
WHERE NOT is_content
),
authors_aggregated AS (
SELECT day, repo, sum(throughput) as throughput
FROM non_dev_op_ignored
FROM content_ignored
GROUP BY day, repo
),
partial_week_ignored AS (
Expand Down
Loading

0 comments on commit 0330763

Please sign in to comment.