Safer pagination and refresh for site cache (#1239)
* refactor: retrieve repo pages using github's pagination link system

* refactor: clear all references to ISOMERPAGES_REPO_PAGE_COUNT

* refactor: retrieve site's lastUpdated in O(1)

* feat(SiteCache): optimize cache refresh for load distribution rather than time

* fix(SiteCache): fix invalid page number injection 🤦

* feat(SiteCache): instrument renewCache()

* fix(test): update tests

* fix: more test fixes

* fix: reset axios mock properly

* fix: remove hardcoded value (was used for tests)

* fix: add explicit return type

* feat: introduce new type LinkRelation

* refactor: using string.matchAll() instead of RegExp.exec()

* refactor: better use of types
timotheeg authored Apr 24, 2024
1 parent 17ed20f commit c26c016
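
The first two bullets replace the hardcoded ISOMERPAGES_REPO_PAGE_COUNT with GitHub's pagination Link response header, which advertises first/prev/next/last URLs for a paged collection. The sketch below is a minimal, standalone illustration of that technique, not the project's code (parseLinkHeader and fetchAllPages are hypothetical names, and auth is omitted; the real implementation is in src/services/identity/SitesCacheService.ts in the diff below):

type LinkRelation = "first" | "last" | "prev" | "next"

// Parse a Link header such as:
//   <https://api.github.com/organizations/123/repos?page=2>; rel="next", <...?page=34>; rel="last"
function parseLinkHeader(header: string): Partial<Record<LinkRelation, string>> {
  const links: Partial<Record<LinkRelation, string>> = {}
  for (const m of header.matchAll(/<(?<url>[^>]+)>; rel="(?<rel>[a-z]+)"/g)) {
    links[m.groups!.rel as LinkRelation] = m.groups!.url
  }
  return links
}

// Follow rel="next" until the server stops sending one (requires Node 18+ for
// the global fetch; an Authorization header would be needed for private orgs)
async function fetchAllPages(firstUrl: string): Promise<unknown[]> {
  const results: unknown[] = []
  let url: string | undefined = firstUrl
  while (url) {
    const res = await fetch(url)
    results.push(...((await res.json()) as unknown[]))
    url = parseLinkHeader(res.headers.get("link") ?? "").next
  }
  return results
}

For example, fetchAllPages("https://api.github.com/orgs/isomerpages/repos?per_page=100") would keep requesting pages until no rel="next" link remains, with no page count configured up front.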
Showing 10 changed files with 263 additions and 110 deletions.
4 changes: 0 additions & 4 deletions .aws/deploy/backend-task-definition.prod.json
@@ -101,10 +101,6 @@
"name": "INCOMING_QUEUE_URL",
"valueFrom": "PROD_INCOMING_QUEUE_URL"
},
- {
-   "name": "ISOMERPAGES_REPO_PAGE_COUNT",
-   "valueFrom": "PROD_ISOMERPAGES_REPO_PAGE_COUNT"
- },
{ "name": "JWT_SECRET", "valueFrom": "PROD_JWT_SECRET" },
{
"name": "MAX_NUM_OTP_ATTEMPTS",
4 changes: 0 additions & 4 deletions .aws/deploy/backend-task-definition.staging.json
@@ -100,10 +100,6 @@
"name": "INCOMING_QUEUE_URL",
"valueFrom": "STAGING_INCOMING_QUEUE_URL"
},
- {
-   "name": "ISOMERPAGES_REPO_PAGE_COUNT",
-   "valueFrom": "STAGING_ISOMERPAGES_REPO_PAGE_COUNT"
- },
{ "name": "JWT_SECRET", "valueFrom": "STAGING_JWT_SECRET" },
{
"name": "MAX_NUM_OTP_ATTEMPTS",
3 changes: 1 addition & 2 deletions .env-example
@@ -11,7 +11,6 @@ export GITHUB_ORG_NAME="isomerpages"
export GITHUB_BUILD_ORG_NAME="opengovsg"
export GITHUB_BUILD_REPO_NAME="isomer-build"
export BRANCH_REF="staging"
- export ISOMERPAGES_REPO_PAGE_COUNT=10

# Required to run end-to-end tests
export E2E_TEST_REPO="e2e-test-repo"
@@ -81,4 +80,4 @@ export UPTIME_ROBOT_API_KEY=""
# Git SSH Keys
export ENV_TYPE="LOCAL"
export LOCAL_SSH_PUBLIC_KEY=""
- export LOCAL_SSH_PRIVATE_KEY=""
+ export LOCAL_SSH_PRIVATE_KEY=""
1 change: 0 additions & 1 deletion .env.test
@@ -11,7 +11,6 @@ export GITHUB_ORG_NAME="isomerpages"
export GITHUB_BUILD_ORG_NAME="opengovsg"
export GITHUB_BUILD_REPO_NAME="isomer-build"
export MUTEX_TABLE_NAME="mutex-table"
- export ISOMERPAGES_REPO_PAGE_COUNT=3
export MAX_NUM_OTP_ATTEMPTS=5
export OTP_EXPIRY=900000
export SESSION_SECRET=blahblah
1 change: 0 additions & 1 deletion .platform/hooks/predeploy/06_fetch_ssm_parameters.sh
@@ -55,7 +55,6 @@ ENV_VARS=(
"GITHUB_ORG_NAME"
"GROWTHBOOK_CLIENT_KEY"
"INCOMING_QUEUE_URL"
"ISOMERPAGES_REPO_PAGE_COUNT"
"JWT_SECRET"
"MAX_NUM_OTP_ATTEMPTS"
"MOCK_AMPLIFY_DOMAIN_ASSOCIATION_CALLS"
8 changes: 0 additions & 8 deletions src/config/config.ts
@@ -81,14 +81,6 @@ const config = convict({
format: "required-string",
default: "isomer-mutexes",
},
- sites: {
-   pageCount: {
-     doc: "Number of pages of repos to retrieve from GitHub API",
-     env: "ISOMERPAGES_REPO_PAGE_COUNT",
-     format: "required-positive-number",
-     default: 10,
-   },
- },
auth: {
cookieDomain: {
doc: "Domain to set for auth cookie",
1 change: 0 additions & 1 deletion src/constants/constants.ts
@@ -42,7 +42,6 @@ export const E2E_TEST_EMAIL = "test@e2e"
export const E2E_TEST_CONTACT = "12345678"

export const GH_MAX_REPO_COUNT = 100
- export const ISOMERPAGES_REPO_PAGE_COUNT = config.get("sites.pageCount")
export const ISOMER_GITHUB_ORG_NAME = config.get("github.orgName")
export const ISOMER_ADMIN_REPOS = [
"isomercms-backend",
1 change: 1 addition & 0 deletions src/integration/Sites.spec.ts
@@ -347,6 +347,7 @@ describe("Sites Router", () => {
],
}
mockGenericAxios.get.mockResolvedValueOnce({
+ headers: {},
data: [
{
pushed_at: mockUpdatedAt,
258 changes: 202 additions & 56 deletions src/services/identity/SitesCacheService.ts
@@ -1,7 +1,9 @@
+ import { HeadersDefaults } from "axios"
import _ from "lodash"

+ import tracer from "@utils/tracer"
+
import {
- ISOMERPAGES_REPO_PAGE_COUNT,
GH_MAX_REPO_COUNT,
ISOMER_ADMIN_REPOS,
GITHUB_ORG_REPOS_ENDPOINT,
@@ -14,81 +16,225 @@ import type { GitHubRepositoryData, RepositoryData } from "@root/types/repoInfo"
// Changes are polled at fixed intervals, stored in cache, and served
// to avoid long load time from live querying.

+ const DEFAULT_PAGE_PARAMS = {
+   per_page: GH_MAX_REPO_COUNT,
+   sort: "full_name",
+   page: 1,
+ }
+
+ type GitHubResponseHeaders = HeadersDefaults & {
+   link?: string
+ }
+
+ type LinkRelation = "first" | "last" | "prev" | "next"
+
+ type Link = {
+   relid: LinkRelation
+   url: string
+   pagenum: number
+ }
+
+ type LinkSet = {
+   [relid in LinkRelation]?: Link
+ }
+
+ type LinkMatch = RegExpExecArray & {
+   groups: {
+     relid: LinkRelation
+     url: string
+   }
+ }
+
+ type FetchRepoPageResult = {
+   repos: RepositoryData[]
+   links?: LinkSet
+ }
+
+ type CacheStore = {
+   [key: string]: RepositoryData
+ }
+
+ function parseGitHubLinkHeader(linkheader: string): LinkSet {
+   // example value: link: <https://api.github.com/organizations/40887764/repos?page=2>; rel="next", <https://api.github.com/organizations/40887764/repos?page=34>; rel="last"
+   const links: LinkSet = {}
+
+   const linkRe = /<(?<url>[^>]+)>; rel="(?<relid>[a-z]+)"(, )?/g
+
+   // we expect a small input so it's OK to loop on the matchAll() iterator
+   /* eslint-disable no-restricted-syntax */
+   for (const regRes of linkheader.matchAll(
+     linkRe
+   ) as IterableIterator<LinkMatch>) {
+     const url = new URL(regRes.groups.url)
+     const pageNum = url.searchParams.get("page") as string // Per specifications, we KNOW the github link urls WILL include a page number in the url
+     links[regRes.groups.relid] = {
+       relid: regRes.groups.relid,
+       url: regRes.groups.url,
+       pagenum: parseInt(pageNum, 10),
+     }
+   }
+   /* eslint-enable no-restricted-syntax */
+
+   return links
+ }
+
+ function getRepositoryData(source: GitHubRepositoryData): RepositoryData {
+   const {
+     pushed_at: lastUpdated,
+     permissions,
+     name: repoName,
+     private: isPrivate,
+   } = source
+
+   return {
+     lastUpdated,
+     permissions,
+     repoName,
+     isPrivate,
+   }
+ }
+
+ function processAndFilterPageOfRepositories(
+   repos: GitHubRepositoryData[]
+ ): RepositoryData[] {
+   return repos.map(getRepositoryData).filter((repoData) => {
+     if (!repoData || !repoData.permissions) {
+       return false
+     }
+     return (
+       repoData.permissions.push === true &&
+       !ISOMER_ADMIN_REPOS.includes(repoData.repoName)
+     )
+   })
+ }
+
+ async function fetchPageOfRepositories({
+   accessToken,
+   page,
+   getLinks,
+ }: {
+   accessToken?: string
+   page: number
+   getLinks?: boolean
+ }): Promise<FetchRepoPageResult> {
+   const params = { ...DEFAULT_PAGE_PARAMS, page }
+
+   const {
+     data,
+     headers,
+   }: {
+     data: GitHubRepositoryData[]
+     headers: GitHubResponseHeaders
+   } = await genericGitHubAxiosInstance.get(
+     GITHUB_ORG_REPOS_ENDPOINT,
+     accessToken
+       ? {
+           headers: { Authorization: `token ${accessToken}` },
+           params,
+         }
+       : { params }
+   )
+
+   const res: FetchRepoPageResult = {
+     repos: processAndFilterPageOfRepositories(data),
+   }
+
+   if (getLinks && headers.link) {
+     res.links = parseGitHubLinkHeader(headers.link)
+   }
+
+   return res
+ }

export async function getAllRepoData(
accessToken: string | undefined
): Promise<RepositoryData[]> {
-   // Simultaneously retrieve all isomerpages repos
-   const paramsArr = _.fill(Array(ISOMERPAGES_REPO_PAGE_COUNT), null).map(
-     (__, idx) => ({
-       per_page: GH_MAX_REPO_COUNT,
-       sort: "full_name",
-       page: idx + 1,
-     })
+   const { repos: firstPageRepos, links } = await fetchPageOfRepositories({
+     accessToken,
+     page: 1,
+     getLinks: true,
+   })
+
+   if (!links?.last || links.last.pagenum <= 1) {
+     // There are no links, or no last link specifically, which is the behaviour when the page returned is the last page
+     // (in the same manner, the links for the first page do not include prev and first links)
+     return firstPageRepos
+   }
+
+   // There are more pages to retrieve! Fetch all pages 2 to N in parallel, as was done before
+   const pageNums = _.range(2, links.last.pagenum + 1)
+
+   const pages2ToLast = await Promise.all(
+     pageNums.map((page) => fetchPageOfRepositories({ accessToken, page }))
+   )

-   const allSites = await Promise.all(
-     paramsArr.map(async (params) => {
-       const {
-         data: respData,
-       }: {
-         data: GitHubRepositoryData[]
-       } = await genericGitHubAxiosInstance.get(
-         GITHUB_ORG_REPOS_ENDPOINT,
-         accessToken
-           ? {
-               headers: { Authorization: `token ${accessToken}` },
-               params,
-             }
-           : { params }
-       )
-       return respData
-         .map((gitHubRepoData) => {
-           const {
-             pushed_at: updatedAt,
-             permissions,
-             name,
-             private: isPrivate,
-           } = gitHubRepoData
-
-           return {
-             lastUpdated: updatedAt,
-             permissions,
-             repoName: name,
-             isPrivate,
-           } as RepositoryData
-         })
-         .filter((repoData) => {
-           if (!repoData || !repoData.permissions) {
-             return false
-           }
-           return (
-             repoData.permissions.push === true &&
-             !ISOMER_ADMIN_REPOS.includes(repoData.repoName)
-           )
-         })
+   return firstPageRepos.concat(...pages2ToLast.map((res) => res.repos))
}

+ async function getAllRepoDataSequentially() {
+   // This function exists because when we refresh the cache on interval, we want to optimize for load
+   // distribution in the process over time, rather than optimize for time to refresh
+
+   const allRepos = []
+   let page = 1
+
+   // We disable the eslint rule no-await-in-loop, because we specifically want sequence here to distribute load
+   /* eslint-disable no-await-in-loop */
+   do {
+     const { repos, links } = await fetchPageOfRepositories({
+       page,
+       getLinks: true,
+     })
})
-   )
-   return _.flatten(allSites)

+     allRepos.push(...repos)
+
+     if (!links?.next) {
+       break
+     }
+
+     page = links.next.pagenum // Strictly speaking, we should follow the next link's URL rather than extract the page number from it
+   } while (true) // eslint-disable-line no-constant-condition
+   /* eslint-enable no-await-in-loop */
+
+   return allRepos
+ }

export class SitesCacheService {
-   private repoDataCache: RepositoryData[]
+   private repoDataCache: CacheStore

private refreshInterval: number

constructor(refreshInterval: number) {
-     this.repoDataCache = []
+     this.repoDataCache = {} as CacheStore
this.refreshInterval = refreshInterval
-     this.renewCache()
+
+     this.startCache()
+
setInterval(() => this.renewCache(), this.refreshInterval)
}

+   private transformRepoList(repos: RepositoryData[]) {
+     // Since the only cache API we expose is to retrieve repo info by repo name, we store the repos as a map in cache
+     // to have O(1) retrieval later
+     return repos.reduce((acc, repo) => {
+       acc[repo.repoName] = repo
+       return acc
+     }, {} as CacheStore)
+   }
+
+   private async startCache() {
+     const repos = await getAllRepoData(undefined)
+     this.repoDataCache = this.transformRepoList(repos)
+   }
+
private async renewCache() {
-     this.repoDataCache = await getAllRepoData(undefined)
+     tracer.trace("SiteCache.renewCache", async () => {
+       const repos = await getAllRepoDataSequentially()
+       this.repoDataCache = this.transformRepoList(repos)
+     })
}

getLastUpdated(repoName: string) {
-     return this.repoDataCache.find((siteData) => siteData.repoName === repoName)
-       ?.lastUpdated
+     return this.repoDataCache[repoName]?.lastUpdated
}
}
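
Taken together: getAllRepoData fans out pages 2..N in parallel for a fast first load, getAllRepoDataSequentially walks the next links one page at a time so interval refreshes spread load over time, and transformRepoList keys the cache by repo name so getLastUpdated is an O(1) map lookup rather than an array scan. A hypothetical usage sketch follows (the import path and interval value are assumptions, not taken from this diff):

import { SitesCacheService } from "@services/identity/SitesCacheService"

// Refresh every 15 minutes (illustrative value; the real interval is
// supplied by the caller through the constructor)
const cache = new SitesCacheService(15 * 60 * 1000)

// Constant-time lookup by repo name, served from the in-memory map
const lastUpdated = cache.getLastUpdated("my-isomer-site")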