From a5714e29de0fa8ec1b15b8c7e56c5200c6f06c53 Mon Sep 17 00:00:00 2001 From: George Hickman Date: Wed, 22 Nov 2023 16:49:56 +0000 Subject: [PATCH] Switch GitHub backfill to use the GraphQL API --- metrics/github/backfill.py | 172 +++++++++++++++++++++++++++---------- pyproject.toml | 1 - requirements.prod.txt | 91 +------------------- 3 files changed, 127 insertions(+), 137 deletions(-) diff --git a/metrics/github/backfill.py b/metrics/github/backfill.py index a468aa76..bc777705 100644 --- a/metrics/github/backfill.py +++ b/metrics/github/backfill.py @@ -1,7 +1,6 @@ -import sqlite3 -import subprocess +import json +import textwrap from datetime import date, timedelta -from pathlib import Path import click import structlog @@ -10,6 +9,7 @@ from ..timescaledb import TimescaleDBWriter from ..timescaledb.tables import GitHubPullRequests from ..tools.dates import date_from_iso, iter_days, previous_weekday +from .api import session from .prs import process_prs @@ -18,44 +18,133 @@ log = structlog.get_logger() -def get_data(db, org): - subprocess.check_call(["github-to-sqlite", "repos", db, org]) - - con = sqlite3.connect(db) - cur = con.cursor() +def get_query_page(*, query, session, cursor, **kwargs): + """ + Get a page of the given query - result = cur.execute( - "SELECT name FROM repos WHERE full_name LIKE ?", (f"{org}%",) - ).fetchall() - repo_names = [r[0] for r in result] + This uses the GraphQL API to avoid making O(N) calls to GitHub's (v3) REST + API. The passed cursor is a GraphQL cursor [1] allowing us to call this + function in a loop, passing in the response's cursor to advance our view of + the data. 
- for repo in repo_names: - subprocess.check_call( - ["github-to-sqlite", "pull-requests", db, f"{org}/{repo}"] + [1]: https://graphql.org/learn/pagination/#end-of-list-counts-and-connections + """ + # use GraphQL variables to avoid string interpolation + variables = {"cursor": cursor, **kwargs} + payload = {"query": query, "variables": variables} + + log.debug(query=query, **variables) + r = session.post("https://api.github.com/graphql", json=payload) + + if not r.ok: # pragma: no cover + print(r.headers) + print(r.content) + + r.raise_for_status() + results = r.json() + + # In some cases graphql will return a 200 response when there are errors. + # https://sachee.medium.com/200-ok-error-handling-in-graphql-7ec869aec9bc + # Handling things robustly is complex and query specific, so here we simply + # take the absence of 'data' as an error, rather than the presence of + # 'errors' key. + if "data" not in results: + msg = textwrap.dedent( + f""" + graphql query failed + + query: + {query} + + response: + {json.dumps(results, indent=2)} + """ ) + raise RuntimeError(msg) + + return results["data"] + +def get_query(query, path, **kwargs): + def extract(data): + result = data + for key in path: + result = result[key] + return result -def get_prs(db): - sql = """ - SELECT - date(pull_requests.created_at) as created, - date(pull_requests.closed_at) as closed, - date(pull_requests.merged_at) as merged, - authors.login as author, - repos.name as repo, - owners.login as org - FROM - pull_requests - LEFT OUTER JOIN repos ON (pull_requests.repo = repos.id) - LEFT OUTER JOIN users owners ON (repos.owner = owners.id) - LEFT OUTER JOIN users authors ON (pull_requests.user = authors.id) - WHERE - draft = 0 + more_pages = True + cursor = None + while more_pages: + page = extract( + get_query_page(query=query, session=session, cursor=cursor, **kwargs) + ) + yield from page["nodes"] + more_pages = page["pageInfo"]["hasNextPage"] + cursor = page["pageInfo"]["endCursor"] + + +def 
iter_repos(org): + query = """ + query repos($cursor: String, $org: String!) { + organization(login: $org) { + repositories(first: 100, after: $cursor) { + nodes { + name + } + pageInfo { + endCursor + hasNextPage + } + } + } + } + """ + for repo in get_query(query, path=["organization", "repositories"], org=org): + yield { + "name": repo["name"], + } + + +def iter_repo_prs(org, repo): + query = """ + query prs($cursor: String, $org: String!, $repo: String!) { + organization(login: $org) { + repository(name: $repo) { + pullRequests(first: 100, after: $cursor) { + nodes { + author { + login + } + number + createdAt + closedAt + mergedAt + } + pageInfo { + endCursor + hasNextPage + } + } + } + } + } """ - con = sqlite3.connect(db) - con.row_factory = sqlite3.Row - cur = con.cursor() - return list(cur.execute(sql)) + for pr in get_query( + query, path=["organization", "repository", "pullRequests"], org=org, repo=repo + ): + yield { + "org": org, + "repo": repo, + "author": pr["author"]["login"], + "created": pr["createdAt"], + "closed": pr["closedAt"], + "merged": pr["mergedAt"], + } + + +def iter_prs(org): + for r in iter_repos(org): + yield from iter_repo_prs(org, r["name"]) def open_prs(prs, org, days_threshold): @@ -118,19 +207,10 @@ def pr_throughput(prs, org): @click.command() @click.argument("org") -@click.option("--pull-data", is_flag=True, default=False) -@click.option("--db-path", type=str, default="github.db") @click.pass_context -def backfill(ctx, org, pull_data, db_path): +def backfill(ctx, org): """Backfill GitHub data for the given GitHub ORG""" - if pull_data: - # clean up existing db - Path(db_path).unlink(missing_ok=True) - - # pull all data down to make backfilling quicker - get_data(db_path, org) - - prs = get_prs(db_path) + prs = list(iter_prs(org)) org_prs = [pr for pr in prs if pr["org"] == org] log.info("Backfilling with %s PRs for %s", len(org_prs), org) diff --git a/pyproject.toml b/pyproject.toml index 4952f409..c41040f9 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,6 @@ classifiers = [ requires-python = ">=3.11" dependencies = [ "click", - "github-to-sqlite", "greenlet", "requests", "slack-bolt", diff --git a/requirements.prod.txt b/requirements.prod.txt index da867bcc..13367dd3 100644 --- a/requirements.prod.txt +++ b/requirements.prod.txt @@ -103,17 +103,6 @@ charset-normalizer==3.3.2 \ click==8.1.7 \ --hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de - # via - # click-default-group - # metrics (pyproject.toml) - # sqlite-utils -click-default-group==1.2.4 \ - --hash=sha256:9b60486923720e7fc61731bdb32b617039aba820e22e1c88766b1125592eaa5f \ - --hash=sha256:eb3f3c99ec0d456ca6cd2a7f08f7d4e91771bef51b01bdd9580cc6450fe1251e - # via sqlite-utils -github-to-sqlite==2.8.3 \ - --hash=sha256:16af0e18e4c1002e973b585333207b061137f509a9c42fc2ad48eae797ac3f0b \ - --hash=sha256:f5f28b9144bb758f99a923dadd7c3904c84c5786f34cc70c47ba64ee6f7dbe41 # via metrics (pyproject.toml) greenlet==3.0.1 \ --hash=sha256:0a02d259510b3630f330c86557331a3b0e0c79dac3d166e449a39363beaae174 \ @@ -178,10 +167,6 @@ idna==3.4 \ --hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \ --hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2 # via requests -pluggy==1.3.0 \ - --hash=sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12 \ - --hash=sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7 - # via sqlite-utils psycopg[binary]==3.1.12 \ --hash=sha256:8ec5230d6a7eb654b4fb3cf2d3eda8871d68f24807b934790504467f1deee9f8 \ --hash=sha256:cec7ad2bc6a8510e56c45746c631cf9394148bdc8a9a11fd8cf8554ce129ae78 @@ -255,72 +240,10 @@ psycopg-binary==3.1.12 \ --hash=sha256:f6f55979804853efa5ce84d7ef59ff3772e0823247497f7d4a6870e6527fd791 \ --hash=sha256:f93749f0fe69cfbfec22af690bb4b241f1a4347c57be26fe2e5b70588f7d602f # via psycopg 
-python-dateutil==2.8.2 \ - --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ - --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 - # via sqlite-utils -pyyaml==6.0.1 \ - --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ - --hash=sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc \ - --hash=sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df \ - --hash=sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741 \ - --hash=sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206 \ - --hash=sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27 \ - --hash=sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595 \ - --hash=sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62 \ - --hash=sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98 \ - --hash=sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696 \ - --hash=sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290 \ - --hash=sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9 \ - --hash=sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d \ - --hash=sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6 \ - --hash=sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867 \ - --hash=sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47 \ - --hash=sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486 \ - --hash=sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6 \ - --hash=sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3 \ - --hash=sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007 \ - --hash=sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938 \ - 
--hash=sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0 \ - --hash=sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c \ - --hash=sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735 \ - --hash=sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d \ - --hash=sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28 \ - --hash=sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4 \ - --hash=sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba \ - --hash=sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8 \ - --hash=sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5 \ - --hash=sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd \ - --hash=sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3 \ - --hash=sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0 \ - --hash=sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515 \ - --hash=sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c \ - --hash=sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c \ - --hash=sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924 \ - --hash=sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34 \ - --hash=sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43 \ - --hash=sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859 \ - --hash=sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673 \ - --hash=sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54 \ - --hash=sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a \ - --hash=sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b \ - --hash=sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab \ - 
--hash=sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa \ - --hash=sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c \ - --hash=sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585 \ - --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ - --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f - # via github-to-sqlite requests==2.31.0 \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 - # via - # github-to-sqlite - # metrics (pyproject.toml) -six==1.16.0 \ - --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ - --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 - # via python-dateutil + # via metrics (pyproject.toml) slack-bolt==1.18.0 \ --hash=sha256:43b121acf78440303ce5129e53be36bdfe5d926a193daef7daf2860688e65dd3 \ --hash=sha256:63089a401ae3900c37698890249acd008a4651d06e86194edc7b72a00819bbac @@ -380,22 +303,10 @@ sqlalchemy[postgresql-psycopgbinary,postgresql_psycopgbinary]==2.0.23 \ --hash=sha256:f508ba8f89e0a5ecdfd3761f82dda2a3d7b678a626967608f4273e0dba8f07ac \ --hash=sha256:fd54601ef9cc455a0c61e5245f690c8a3ad67ddb03d3b91c361d076def0b4c60 # via metrics (pyproject.toml) -sqlite-fts4==1.0.3 \ - --hash=sha256:0359edd8dea6fd73c848989e1e2b1f31a50fe5f9d7272299ff0e8dbaa62d035f \ - --hash=sha256:78b05eeaf6680e9dbed8986bde011e9c086a06cb0c931b3cf7da94c214e8930c - # via sqlite-utils -sqlite-utils==3.35.1 \ - --hash=sha256:58da19f64b37fd47e33158ac4dadf2616701cd17d825a1625866d04647f72805 \ - --hash=sha256:e0f03e6976b05bdb7a5c56454971a0e980fc16dbfd3512bbd3bdcac4f0e4370e - # via github-to-sqlite structlog==23.2.0 \ --hash=sha256:16a167e87b9fa7fae9a972d5d12805ef90e04857a93eba479d4be3801a6a1482 \ --hash=sha256:334666b94707f89dbc4c81a22a8ccd34449f0201d5b1ee097a030b577fa8c858 # via metrics 
(pyproject.toml) -tabulate==0.9.0 \ - --hash=sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c \ - --hash=sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f - # via sqlite-utils typing-extensions==4.8.0 \ --hash=sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0 \ --hash=sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef