Skip to content

Commit

Permalink
Merge pull request #159 from ebmdatalab/benbc/content-prs
Browse files Browse the repository at this point in the history
Distinguish content PRs
  • Loading branch information
benbc authored Apr 9, 2024
2 parents 2faf47d + 094b718 commit 0330763
Show file tree
Hide file tree
Showing 11 changed files with 280 additions and 43 deletions.
45 changes: 42 additions & 3 deletions metrics/github/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,22 @@
# Slugs (not names!) of the GitHub entities we're interested in
_TECH_TEAMS = ["team-rap", "team-rex", "tech-shared"]
_ORGS = ["ebmdatalab", "opensafely-core"]
# Some repos (for example the websites) frequently have PRs created by people
# outside the tech teams. We don't want our delivery metrics to be skewed by these
# and we don't necessarily want to hold people in other teams to the same hygiene
# standards as we hold ourselves. So we treat them differently. (We don't want to
# exclude such PRs from other repos because it's definitely interesting if there are
# old PRs (or lots of PRs) created by non-tech-team members in those repos.)
_CONTENT_REPOS = {
"ebmdatalab": [
"opensafely.org",
"team-manual",
"bennett.ox.ac.uk",
"openprescribing",
]
}
_MANAGERS = {"sebbacon", "benbc"}
_EX_DEVELOPERS = {"ghickman", "milanwiedemann", "CarolineMorton"}


@dataclass(frozen=True)
Expand All @@ -23,6 +39,10 @@ class Repo:
def is_tech_owned(self):
return self.team in _TECH_TEAMS

@property
def is_content_repo(self):
    """Whether this repo is a designated "content" repo — one whose PRs are
    frequently raised by people outside the tech teams (see _CONTENT_REPOS).
    """
    # Single lookup with a default instead of a membership test followed by
    # indexing; behaviour is unchanged for orgs absent from _CONTENT_REPOS.
    return self.name in _CONTENT_REPOS.get(self.org, ())

@classmethod
def from_dict(cls, data, org, team):
return cls(
Expand All @@ -42,6 +62,7 @@ class PR:
created_on: datetime.date
merged_on: datetime.date
closed_on: datetime.date
is_content: bool

def was_old_on(self, date):
opened = self.created_on
Expand All @@ -56,13 +77,17 @@ def was_merged_on(self, date):
return self.merged_on and date == self.merged_on

@classmethod
def from_dict(cls, data, repo):
def from_dict(cls, data, repo, tech_team_members):
author = data["author"]["login"]
is_content = repo.is_content_repo and author not in tech_team_members

return cls(
repo,
data["author"]["login"],
author,
date_from_iso(data["createdAt"]),
date_from_iso(data["mergedAt"]),
date_from_iso(data["closedAt"]),
is_content,
)


Expand All @@ -84,8 +109,9 @@ def from_dict(cls, data, repo):


def tech_prs():
tech_team_members = _tech_team_members()
return [
PR.from_dict(pr, repo)
PR.from_dict(pr, repo, tech_team_members)
for repo in tech_repos()
for pr in query.prs(repo.org, repo.name)
]
Expand Down Expand Up @@ -119,3 +145,16 @@ def _get_repos():

def _repo_owners(org):
    """Map each repo name owned by a tech team in `org` to its owning team."""
    owners = {}
    for team in _TECH_TEAMS:
        for repo in query.team_repos(org, team):
            owners[repo] = team
    return owners


def _tech_team_members():
    """Everyone we count as a tech-team member: the managers, the
    ex-developers and the current members of every tech team in each org."""
    members = set(_MANAGERS)
    members |= _EX_DEVELOPERS
    for org in _ORGS:
        for team in _TECH_TEAMS:
            members.update(query.team_members(org, team))
    return members
5 changes: 3 additions & 2 deletions metrics/github/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ def calculate_pr_counts(prs, predicate):
end = pr.closed_on if pr.closed_on else datetime.date.today()
for day in iter_days(start, end):
if predicate(pr, day):
counts[(pr.repo.org, pr.repo.name, pr.author, day)] += 1
counts[(pr.repo.org, pr.repo.name, pr.author, pr.is_content, day)] += 1
return dict(counts)


def convert_pr_counts_to_metrics(counts, name):
metrics = []
for coord, count in counts.items():
org, repo, author, date_ = coord
org, repo, author, is_content, date_ = coord
timestamp = datetime.datetime.combine(date_, datetime.time())
metrics.append(
{
Expand All @@ -38,6 +38,7 @@ def convert_pr_counts_to_metrics(counts, name):
"organisation": org,
"repo": repo,
"author": author,
"is_content": is_content,
"value": count,
}
)
Expand Down
12 changes: 10 additions & 2 deletions metrics/github/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,19 @@ def team_repos(org, team):
"""The API doesn't make it easy for us to get all the information we need about repos in
one place, so we just return a list of repos here and join that to the richer repo objects
in the caller."""
results = _client().rest_query("/orgs/{org}/teams/{team}/repos", org=org, team=team)
for repo in results:
repos = _client().rest_query("/orgs/{org}/teams/{team}/repos", org=org, team=team)
for repo in repos:
yield repo["name"]


def team_members(org, team):
    """Yield the GitHub login of every member of the given team."""
    path = "/orgs/{org}/teams/{team}/members"
    for member in _client().rest_query(path, org=org, team=team):
        yield member["login"]


def vulnerabilities(org, repo):
query = """
query vulnerabilities($cursor: String, $org: String!, $repo: String!) {
Expand Down
1 change: 1 addition & 0 deletions metrics/timescaledb/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
Column("name", Text, primary_key=True),
Column("value", Integer),
Column("author", Text, primary_key=True),
Column("is_content", Boolean, primary_key=True),
Column("organisation", Text, primary_key=True),
Column("repo", Text, primary_key=True),
)
Expand Down
61 changes: 61 additions & 0 deletions queries/current-old-prs-content.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
WITH
last_saturday AS (
    -- most recent Saturday; extract(dow ...) is 0 for Sunday, hence the +1 shift
    SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
old_prs_only AS (
    SELECT time, organisation, repo, author, is_content, value
    FROM github_pull_requests
    WHERE name = 'queue_older_than_7_days'
),
in_timeframe AS (
    SELECT time, organisation, repo, author, is_content, value
    FROM old_prs_only
    WHERE $__timeFilter(time)
),
fields_munged AS (
    -- the time field is a timestamp, but we only ever write midnight;
    -- we need to keep it as a timestamp type for bucketing below
    SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs
    FROM in_timeframe
),
dependabot_removed AS (
    SELECT day, repo, author, is_content, num_prs
    FROM fields_munged
    WHERE author NOT LIKE 'dependabot%'
),
content_only AS (
    -- keep just the content PRs; this CTE was previously misnamed
    -- "content_ignored" (copied from the humans variant, which drops them)
    SELECT day, repo, author, num_prs
    FROM dependabot_removed
    WHERE is_content
),
authors_aggregated AS (
    SELECT day, repo, sum(num_prs) as num_prs
    FROM content_only
    GROUP BY day, repo
),
partial_week_ignored AS (
    SELECT day, repo, num_prs
    FROM authors_aggregated, last_saturday
    WHERE day < last_saturday.the_date
),
last_week_only AS (
    SELECT day, repo, num_prs
    FROM partial_week_ignored, last_saturday
    WHERE day >= last_saturday.the_date - 7
),
bucketed_in_weeks AS (
    SELECT
        -- label buckets with the (exclusive) end date;
        -- the 'origin' argument can be _any_ Saturday
        time_bucket('1 week', day, last_saturday.the_date) + '7 days' as bucket,
        repo,
        -- aggregate by taking the last value because this is a gauge, not a count
        last(num_prs, day) as num_prs
    FROM
        last_week_only, last_saturday
    GROUP BY bucket, repo
)
SELECT repo as "Repo", num_prs AS "Old PRs"
FROM bucketed_in_weeks
ORDER BY bucket DESC, num_prs DESC
LIMIT 5
14 changes: 7 additions & 7 deletions queries/current-old-prs-humans.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,34 @@ WITH
SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
old_prs_only AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM github_pull_requests
WHERE name = 'queue_older_than_7_days'
),
in_timeframe AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM old_prs_only
WHERE $__timeFilter(time)
),
fields_munged AS (
-- the time field is a timestamp, but we only ever write midnight;
-- we need to keep it as a timestamp type for bucketing below
SELECT time as day, organisation||'/'||repo AS repo, author, value as num_prs
SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs
FROM in_timeframe
),
dependabot_removed AS (
SELECT day, repo, author, num_prs
SELECT day, repo, author, is_content, num_prs
FROM fields_munged
WHERE author NOT LIKE 'dependabot%'
),
non_dev_op_ignored AS (
content_ignored AS (
SELECT day, repo, author, num_prs
FROM dependabot_removed
WHERE NOT (repo = 'ebmdatalab/openprescribing' AND author IN ('richiecroker', 'chrisjwood16'))
WHERE NOT is_content
),
authors_aggregated AS (
SELECT day, repo, sum(num_prs) as num_prs
FROM non_dev_op_ignored
FROM content_ignored
GROUP BY day, repo
),
partial_week_ignored AS (
Expand Down
55 changes: 55 additions & 0 deletions queries/old-prs-content.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
WITH
last_saturday AS (
    -- most recent Saturday; extract(dow ...) is 0 for Sunday, hence the +1 shift
    SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
relevant_rows AS (
    -- old-PR gauge rows inside the dashboard's selected time range
    SELECT time, organisation, repo, author, is_content, value
    FROM github_pull_requests
    WHERE name = 'queue_older_than_7_days'
      AND $__timeFilter(time)
),
renamed AS (
    -- the time field is a timestamp, but we only ever write midnight;
    -- keep it as a timestamp type so it can be bucketed below
    SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs
    FROM relevant_rows
),
content_prs AS (
    -- drop bot-authored PRs and keep only the content ones
    SELECT day, repo, author, num_prs
    FROM renamed
    WHERE author NOT LIKE 'dependabot%' AND is_content
),
per_repo AS (
    SELECT day, repo, sum(num_prs) as num_prs
    FROM content_prs
    GROUP BY day, repo
),
full_weeks AS (
    -- ignore the current (partial) week
    SELECT day, repo, num_prs
    FROM per_repo, last_saturday
    WHERE day < last_saturday.the_date
),
weekly AS (
    SELECT
        -- label buckets with the (exclusive) end date;
        -- the 'origin' argument can be _any_ Saturday
        time_bucket('1 week', day, last_saturday.the_date) + '7 days' as bucket,
        repo,
        -- take the last value because this is a gauge, not a count
        last(num_prs, day) as num_prs
    FROM full_weeks, last_saturday
    GROUP BY bucket, repo
)
SELECT bucket, repo, num_prs
FROM weekly
ORDER BY bucket DESC, repo
14 changes: 7 additions & 7 deletions queries/old-prs-humans.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,34 @@ WITH
SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
old_prs_only AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM github_pull_requests
WHERE name = 'queue_older_than_7_days'
),
in_timeframe AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM old_prs_only
WHERE $__timeFilter(time)
),
fields_munged AS (
-- the time field is a timestamp, but we only ever write midnight;
-- we need to keep it as a timestamp type for bucketing below
SELECT time as day, organisation||'/'||repo AS repo, author, value as num_prs
SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs
FROM in_timeframe
),
dependabot_removed AS (
SELECT day, repo, author, num_prs
SELECT day, repo, author, is_content, num_prs
FROM fields_munged
WHERE author NOT LIKE 'dependabot%'
),
non_dev_op_ignored AS (
content_ignored AS (
SELECT day, repo, author, num_prs
FROM dependabot_removed
WHERE NOT (repo = 'ebmdatalab/openprescribing' AND author IN ('richiecroker', 'chrisjwood16'))
WHERE NOT is_content
),
authors_aggregated AS (
SELECT day, repo, sum(num_prs) as num_prs
FROM non_dev_op_ignored
FROM content_ignored
GROUP BY day, repo
),
partial_week_ignored AS (
Expand Down
14 changes: 7 additions & 7 deletions queries/throughput-humals.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,34 @@ WITH
SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
throughputs_only AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM github_pull_requests
WHERE name = 'prs_merged'
),
in_timeframe AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM throughputs_only
WHERE $__timeFilter(time)
),
fields_munged AS (
-- the time field is a timestamp, but we only ever write midnight;
-- we need to keep it as a timestamp type for bucketing below
SELECT time as day, organisation||'/'||repo AS repo, author, value as throughput
SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as throughput
FROM in_timeframe
),
dependabot_removed AS (
SELECT day, repo, author, throughput
SELECT day, repo, author, is_content, throughput
FROM fields_munged
WHERE author NOT LIKE 'dependabot%'
),
non_dev_op_ignored AS (
content_ignored AS (
SELECT day, repo, author, throughput
FROM dependabot_removed
WHERE NOT (repo = 'ebmdatalab/openprescribing' AND author IN ('richiecroker', 'chrisjwood16'))
WHERE NOT is_content
),
authors_aggregated AS (
SELECT day, repo, sum(throughput) as throughput
FROM non_dev_op_ignored
FROM content_ignored
GROUP BY day, repo
),
partial_week_ignored AS (
Expand Down
Loading

0 comments on commit 0330763

Please sign in to comment.