Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Safer pagination and refresh for site cache #1239

Merged
merged 14 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .aws/deploy/backend-task-definition.prod.json
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,6 @@
"name": "INCOMING_QUEUE_URL",
"valueFrom": "PROD_INCOMING_QUEUE_URL"
},
{
"name": "ISOMERPAGES_REPO_PAGE_COUNT",
"valueFrom": "PROD_ISOMERPAGES_REPO_PAGE_COUNT"
},
{ "name": "JWT_SECRET", "valueFrom": "PROD_JWT_SECRET" },
{
"name": "MAX_NUM_OTP_ATTEMPTS",
Expand Down
4 changes: 0 additions & 4 deletions .aws/deploy/backend-task-definition.staging.json
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,6 @@
"name": "INCOMING_QUEUE_URL",
"valueFrom": "STAGING_INCOMING_QUEUE_URL"
},
{
"name": "ISOMERPAGES_REPO_PAGE_COUNT",
"valueFrom": "STAGING_ISOMERPAGES_REPO_PAGE_COUNT"
},
{ "name": "JWT_SECRET", "valueFrom": "STAGING_JWT_SECRET" },
{
"name": "MAX_NUM_OTP_ATTEMPTS",
Expand Down
3 changes: 1 addition & 2 deletions .env-example
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ export GITHUB_ORG_NAME="isomerpages"
export GITHUB_BUILD_ORG_NAME="opengovsg"
export GITHUB_BUILD_REPO_NAME="isomer-build"
export BRANCH_REF="staging"
export ISOMERPAGES_REPO_PAGE_COUNT=10

# Required to run end-to-end tests
export E2E_TEST_REPO="e2e-test-repo"
Expand Down Expand Up @@ -81,4 +80,4 @@ export UPTIME_ROBOT_API_KEY=""
# Git SSH Keys
export ENV_TYPE="LOCAL"
export LOCAL_SSH_PUBLIC_KEY=""
export LOCAL_SSH_PRIVATE_KEY=""
export LOCAL_SSH_PRIVATE_KEY=""
1 change: 0 additions & 1 deletion .env.test
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ export GITHUB_ORG_NAME="isomerpages"
export GITHUB_BUILD_ORG_NAME="opengovsg"
export GITHUB_BUILD_REPO_NAME="isomer-build"
export MUTEX_TABLE_NAME="mutex-table"
export ISOMERPAGES_REPO_PAGE_COUNT=3
export MAX_NUM_OTP_ATTEMPTS=5
export OTP_EXPIRY=900000
export SESSION_SECRET=blahblah
Expand Down
1 change: 0 additions & 1 deletion .platform/hooks/predeploy/06_fetch_ssm_parameters.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ ENV_VARS=(
"GITHUB_ORG_NAME"
"GROWTHBOOK_CLIENT_KEY"
"INCOMING_QUEUE_URL"
"ISOMERPAGES_REPO_PAGE_COUNT"
"JWT_SECRET"
"MAX_NUM_OTP_ATTEMPTS"
"MOCK_AMPLIFY_DOMAIN_ASSOCIATION_CALLS"
Expand Down
8 changes: 0 additions & 8 deletions src/config/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

convict.addFormat({
name: "required-string",
validate: (val: any) => {

Check warning on line 5 in src/config/config.ts

View workflow job for this annotation

GitHub Actions / lint

Unexpected any. Specify a different type

Check warning on line 5 in src/config/config.ts

View workflow job for this annotation

GitHub Actions / lint

Unexpected any. Specify a different type
if (!val) throw new Error("value cannot be empty, null or undefined")
if (typeof val !== "string") throw new Error("value must be a string")
},
Expand All @@ -10,14 +10,14 @@

convict.addFormat({
name: "required-positive-number",
validate: (val: any) => {

Check warning on line 13 in src/config/config.ts

View workflow job for this annotation

GitHub Actions / lint

Unexpected any. Specify a different type

Check warning on line 13 in src/config/config.ts

View workflow job for this annotation

GitHub Actions / lint

Unexpected any. Specify a different type
if (val === null || val === undefined || val === "")
throw new Error("value cannot be empty, null or undefined")
if (typeof val !== "number") throw new Error("value must be a number")
},
coerce: (val: string) => {
const coercedVal = Number(val)
if (isNaN(coercedVal)) {

Check warning on line 20 in src/config/config.ts

View workflow job for this annotation

GitHub Actions / lint

Unexpected use of 'isNaN'. Use Number.isNaN instead https://github.com/airbnb/javascript#standard-library--isnan

Check warning on line 20 in src/config/config.ts

View workflow job for this annotation

GitHub Actions / lint

Unexpected use of 'isNaN'. Use Number.isNaN instead https://github.com/airbnb/javascript#standard-library--isnan
throw new Error(
"value provided is not a positive number. please provide a valid positive number"
)
Expand All @@ -31,7 +31,7 @@

convict.addFormat({
name: "required-boolean",
validate: (val: any) => {

Check warning on line 34 in src/config/config.ts

View workflow job for this annotation

GitHub Actions / lint

Unexpected any. Specify a different type

Check warning on line 34 in src/config/config.ts

View workflow job for this annotation

GitHub Actions / lint

Unexpected any. Specify a different type
if (val === null || val === undefined)
throw new Error("value cannot be empty, null or undefined")
if (typeof val !== "boolean") throw new Error("value must be a boolean")
Expand Down Expand Up @@ -81,14 +81,6 @@
format: "required-string",
default: "isomer-mutexes",
},
sites: {
pageCount: {
doc: "Number of pages of repos to retrieve from GitHub API",
env: "ISOMERPAGES_REPO_PAGE_COUNT",
format: "required-positive-number",
default: 10,
},
},
auth: {
cookieDomain: {
doc: "Domain to set for auth cookie",
Expand Down
1 change: 0 additions & 1 deletion src/constants/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,24 @@

import { config } from "@config/config"

export enum JobStatus {

Check warning on line 5 in src/constants/constants.ts

View workflow job for this annotation

GitHub Actions / lint

'JobStatus' is already declared in the upper scope on line 5 column 13

Check warning on line 5 in src/constants/constants.ts

View workflow job for this annotation

GitHub Actions / lint

'JobStatus' is already declared in the upper scope on line 5 column 13
Ready = "READY", // Ready to run jobs
Running = "RUNNING", // A job is running
Failed = "FAILED", // A job has failed and recovery is needed
}

export enum SiteStatus {

Check warning on line 11 in src/constants/constants.ts

View workflow job for this annotation

GitHub Actions / lint

'SiteStatus' is already declared in the upper scope on line 11 column 13

Check warning on line 11 in src/constants/constants.ts

View workflow job for this annotation

GitHub Actions / lint

'SiteStatus' is already declared in the upper scope on line 11 column 13
Empty = "EMPTY", // A site record site is being initialized
Initialized = "INITIALIZED",
Launched = "LAUNCHED",
}

export enum RedirectionTypes {

Check warning on line 17 in src/constants/constants.ts

View workflow job for this annotation

GitHub Actions / lint

'RedirectionTypes' is already declared in the upper scope on line 17 column 13

Check warning on line 17 in src/constants/constants.ts

View workflow job for this annotation

GitHub Actions / lint

'RedirectionTypes' is already declared in the upper scope on line 17 column 13
CNAME = "CNAME",
A = "A",
}

export enum CollaboratorRoles {

Check warning on line 22 in src/constants/constants.ts

View workflow job for this annotation

GitHub Actions / lint

'CollaboratorRoles' is already declared in the upper scope on line 22 column 13

Check warning on line 22 in src/constants/constants.ts

View workflow job for this annotation

GitHub Actions / lint

'CollaboratorRoles' is already declared in the upper scope on line 22 column 13
Admin = "ADMIN",
Contributor = "CONTRIBUTOR",
IsomerAdmin = "ISOMERADMIN",
Expand All @@ -30,7 +30,7 @@
CollaboratorRoles.IsomerAdmin
>

export enum ReviewRequestStatus {

Check warning on line 33 in src/constants/constants.ts

View workflow job for this annotation

GitHub Actions / lint

'ReviewRequestStatus' is already declared in the upper scope on line 33 column 13

Check warning on line 33 in src/constants/constants.ts

View workflow job for this annotation

GitHub Actions / lint

'ReviewRequestStatus' is already declared in the upper scope on line 33 column 13
Approved = "APPROVED",
Open = "OPEN",
Merged = "MERGED",
Expand All @@ -42,7 +42,6 @@
export const E2E_TEST_CONTACT = "12345678"

export const GH_MAX_REPO_COUNT = 100
export const ISOMERPAGES_REPO_PAGE_COUNT = config.get("sites.pageCount")
export const ISOMER_GITHUB_ORG_NAME = config.get("github.orgName")
export const ISOMER_ADMIN_REPOS = [
"isomercms-backend",
Expand Down
1 change: 1 addition & 0 deletions src/integration/Sites.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@ describe("Sites Router", () => {
],
}
mockGenericAxios.get.mockResolvedValueOnce({
headers: {},
data: [
{
pushed_at: mockUpdatedAt,
Expand Down
269 changes: 212 additions & 57 deletions src/services/identity/SitesCacheService.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import { HeadersDefaults } from "axios"
import _ from "lodash"

import logger from "@logger/logger"

import tracer from "@utils/tracer"

import {
ISOMERPAGES_REPO_PAGE_COUNT,
GH_MAX_REPO_COUNT,
ISOMER_ADMIN_REPOS,
GITHUB_ORG_REPOS_ENDPOINT,
Expand All @@ -14,81 +18,232 @@ import type { GitHubRepositoryData, RepositoryData } from "@root/types/repoInfo"
// Changes are polled at fixed intervals, stored in cache, and served
// to avoid long load time from live querying.

const DEFAULT_PAGE_PARAMS = {
per_page: GH_MAX_REPO_COUNT,
sort: "full_name",
page: 1,
}

type GitHubResponseHeaders = HeadersDefaults & {
link?: string
}

type Link = {
relid: "first" | "last" | "prev" | "next"
timotheeg marked this conversation as resolved.
Show resolved Hide resolved
url: string
pagenum: number
}

type LinkSet = {
first?: Link
last?: Link
prev?: Link
next?: Link
}
timotheeg marked this conversation as resolved.
Show resolved Hide resolved

type LinkMatch = RegExpExecArray & {
groups: {
relid: "first" | "last" | "prev" | "next"
url: string
}
}

type FetchRepoPageResult = {
repos: RepositoryData[]
links?: LinkSet
}

type CacheStore = {
[key: string]: RepositoryData
}

function parseGitHubLinkHeader(linkheader: string) {
timotheeg marked this conversation as resolved.
Show resolved Hide resolved
// example value: link: <https://api.github.com/organizations/40887764/repos?page=2>; rel="next", <https://api.github.com/organizations/40887764/repos?page=34>; rel="last"
const links: LinkSet = {}

const linkRe = /<(?<url>[^>]+)>; rel="(?<relid>[a-z]+)"(, )?/g
Copy link
Contributor Author

@timotheeg timotheeg Apr 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: this regex is not actually validating anything on the urls, or that relid should be one of "first" | "last" | "prev" | "next", even thought we do assert it in the types below with as IterableIterator<LinkMatch>

Because we are the one calling github, we trust their APIs, but we could add paranoia checks, and warning logging here 🤔

Or the regexp could be made more "validating" in that sense, like so for example, which would at least guarantee that relid is correct according to the type:

Suggested change
const linkRe = /<(?<url>[^>]+)>; rel="(?<relid>[a-z]+)"(, )?/g
const linkRe = /<(?<url>[^>]+)>; rel="(?<relid>first|last|prev|next)"(, )?/g

Copy link
Contributor Author

@timotheeg timotheeg Apr 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather leave the regexp loose tbh, in case github introduces more link relations in the future, they'd already be captured properly as part of the parsing routine.

let regRes: LinkMatch | null = null

// eslint-disable-next-line no-cond-assign
while ((regRes = linkRe.exec(linkheader) as LinkMatch) !== null) {
timotheeg marked this conversation as resolved.
Show resolved Hide resolved
const url = new URL(regRes.groups.url)
const pageNum = url.searchParams.get("page") as string // Per specifications, we KNOW the github link urls WILL include a page number in the url
links[regRes.groups.relid] = {
relid: regRes.groups.relid,
url: regRes.groups.url,
pagenum: parseInt(pageNum, 10),
}
}

return links
}

function getRepositoryData(source: GitHubRepositoryData): RepositoryData {
const {
pushed_at: lastUpdated,
permissions,
name: repoName,
private: isPrivate,
} = source

return {
lastUpdated,
permissions,
repoName,
isPrivate,
}
}

function processAndFilterPageOfRepositories(
repos: GitHubRepositoryData[]
): RepositoryData[] {
return repos.map(getRepositoryData).filter((repoData) => {
if (!repoData || !repoData.permissions) {
return false
}
return (
repoData.permissions.push === true &&
!ISOMER_ADMIN_REPOS.includes(repoData.repoName)
)
})
}

async function fetchPageOfRepositories({
accessToken,
page,
getLinks,
}: {
accessToken?: string
page: number
getLinks?: boolean
}): Promise<FetchRepoPageResult> {
const params = { ...DEFAULT_PAGE_PARAMS, page }

const {
data,
headers,
}: {
data: GitHubRepositoryData[]
headers: GitHubResponseHeaders
} = await genericGitHubAxiosInstance.get(
GITHUB_ORG_REPOS_ENDPOINT,
accessToken
? {
headers: { Authorization: `token ${accessToken}` },
params,
}
: { params }
)

logger.info({
message: "Fetched one page of repo",
meta: {
page,
headers,
data,
},
})

const res: FetchRepoPageResult = {
repos: processAndFilterPageOfRepositories(data),
}

if (getLinks && headers.link) {
res.links = parseGitHubLinkHeader(headers.link)
}

return res
}

export async function getAllRepoData(
accessToken: string | undefined
): Promise<RepositoryData[]> {
// Simultaneously retrieve all isomerpages repos
const paramsArr = _.fill(Array(ISOMERPAGES_REPO_PAGE_COUNT), null).map(
(__, idx) => ({
per_page: GH_MAX_REPO_COUNT,
sort: "full_name",
page: idx + 1,
})
const { repos: firstPageRepos, links } = await fetchPageOfRepositories({
accessToken,
page: 1,
getLinks: true,
})

if (!links?.last || links.last.pagenum <= 1) {
// There are no links, or no last link specifically, which is the behaviour when the page returned is the last page
// (In the same manner has the links for the first page do not prev and first links)
return firstPageRepos
}

// There are more pages to retrieve! Fetch all pages 2 to N in parallel, as was done before
const pageNums = _.range(2, links.last.pagenum + 1)

const pages2ToLast = await Promise.all(
pageNums.map((page) => fetchPageOfRepositories({ accessToken, page }))
)

const allSites = await Promise.all(
paramsArr.map(async (params) => {
const {
data: respData,
}: {
data: GitHubRepositoryData[]
} = await genericGitHubAxiosInstance.get(
GITHUB_ORG_REPOS_ENDPOINT,
accessToken
? {
headers: { Authorization: `token ${accessToken}` },
params,
}
: { params }
)
return respData
.map((gitHubRepoData) => {
const {
pushed_at: updatedAt,
permissions,
name,
private: isPrivate,
} = gitHubRepoData

return {
lastUpdated: updatedAt,
permissions,
repoName: name,
isPrivate,
} as RepositoryData
})
.filter((repoData) => {
if (!repoData || !repoData.permissions) {
return false
}
return (
repoData.permissions.push === true &&
!ISOMER_ADMIN_REPOS.includes(repoData.repoName)
)
})
return firstPageRepos.concat(...pages2ToLast.map((res) => res.repos))
}

async function getAllRepoDataSequentially() {
// This function exists because when we refresh the cache on interval, we want to optimize for load
// distribution in the process over time, rather than optimize for time to refresh

const allRepos = []
let page = 1

// We disable the eslint rule no-await-in-loop, because we specifically want sequence here to distribute load
/* eslint-disable no-await-in-loop */
do {
const { repos, links } = await fetchPageOfRepositories({
page,
getLinks: true,
})
)
return _.flatten(allSites)

allRepos.push(...repos)

if (!links?.next) {
break
}

page = links.next.pagenum // Note that by right, we should follow the next link, rather than extract the page number from the link
} while (true) // eslint-disable-line no-constant-condition
/* eslint-enable no-await-in-loop */

return allRepos
}

export class SitesCacheService {
private repoDataCache: RepositoryData[]
private repoDataCache: CacheStore

private refreshInterval: number

constructor(refreshInterval: number) {
this.repoDataCache = []
this.refreshInterval = refreshInterval
this.renewCache()
this.repoDataCache = {} as CacheStore
this.refreshInterval = 90000
timotheeg marked this conversation as resolved.
Show resolved Hide resolved

this.startCache()

setInterval(() => this.renewCache(), this.refreshInterval)
}

private transformRepoList(repos: RepositoryData[]) {
// Since the only cache API we expose is to retrieve repo info by repo name, we store the repos as a map in cache
// to have a O(1) retrieval later
return repos.reduce((acc, repo) => {
acc[repo.repoName] = repo
return acc
}, {} as CacheStore)
}

private async startCache() {
const repos = await getAllRepoData(undefined)
this.repoDataCache = this.transformRepoList(repos)
}

private async renewCache() {
this.repoDataCache = await getAllRepoData(undefined)
tracer.trace("SiteCache.renewCache", async () => {
const repos = await getAllRepoDataSequentially()
this.repoDataCache = this.transformRepoList(repos)
})
}

getLastUpdated(repoName: string) {
return this.repoDataCache.find((siteData) => siteData.repoName === repoName)
?.lastUpdated
return this.repoDataCache[repoName]?.lastUpdated
}
}
Loading
Loading