From 1aab649b989cfb517232bfd9333ef194af20b511 Mon Sep 17 00:00:00 2001 From: Holly Cummins Date: Tue, 1 Aug 2023 18:41:41 +0100 Subject: [PATCH] Cache github information about the same repository (except issue counts, in the case where we have labels, and subfolder queries to find extension-yamls) --- plugins/github-enricher/gatsby-node.js | 72 ++- plugins/github-enricher/gatsby-node.test.js | 482 +++++++++++++++++--- 2 files changed, 482 insertions(+), 72 deletions(-) diff --git a/plugins/github-enricher/gatsby-node.js b/plugins/github-enricher/gatsby-node.js index e249093cbe39..020037f04076 100644 --- a/plugins/github-enricher/gatsby-node.js +++ b/plugins/github-enricher/gatsby-node.js @@ -11,6 +11,9 @@ const defaultOptions = { nodeType: "Extension", } +// To avoid hitting the git rate limiter retrieving information we already know, cache what we can +let repoCache = {} + let getLabels // TODO ideally we would cache between builds, too @@ -76,6 +79,11 @@ exports.onPreBootstrap = async ({}) => { return yaml } +// Mostly useful for testing +exports.clearCache = () => { + repoCache = {} +} + exports.onCreateNode = async ( { node, actions, createNodeId, createContentDigest }, pluginOptions @@ -145,6 +153,8 @@ async function fetchScmLabel(scmUrl, artifactId) { } const fetchScmInfo = async (scmUrl, artifactId, labels) => { + const cachedScmInfo = repoCache[scmUrl] + // TODO we can just treat label as an array, almost const labelFilterString = labels ? `, filterBy: { labels: [${labels.map(label => `"${label}"`).join()}] }` @@ -165,7 +175,12 @@ const fetchScmInfo = async (scmUrl, artifactId, labels) => { ) : scmUrl + "/issues" - const scmInfo = { url: scmUrl, project, issuesUrl, labels } + // Take what we have cached as a base, if we have it + const scmInfo = cachedScmInfo ? cachedScmInfo : { url: scmUrl, project } + + // Always set the issuesUrl and labels since the cached one might be invalid + scmInfo.issuesUrl = issuesUrl + scmInfo.labels = labels const accessToken = process.env.GITHUB_TOKEN @@ -173,34 +188,51 @@ const fetchScmInfo = async (scmUrl, artifactId, labels) => { // Batching this may not help that much because rate limits are done on query complexity and cost, // not the number of actual http calls; see https://docs.github.com/en/graphql/overview/resource-limitations if (accessToken) { - const query = ` - query { - repository(owner:"${coords.owner}", name:"${coords.name}") { - issues(states:OPEN, ${labelFilterString}) { + const issuesQuery = `issues(states:OPEN, ${labelFilterString}) { totalCount - } - - defaultBranchRef { - name - } - - metaInfs: object(expression: "HEAD:runtime/src/main/resources/META-INF/") { + }` + + const subfoldersQuery = `subfolderMetaInfs: object(expression: "HEAD:${artifactId}/runtime/src/main/resources/META-INF/") { ... on Tree { entries { path } } } - - subfolderMetaInfs: object(expression: "HEAD:${artifactId}/runtime/src/main/resources/META-INF/") { + + shortenedSubfolderMetaInfs: object(expression: "HEAD:${shortArtifactId}/runtime/src/main/resources/META-INF/") { ... on Tree { entries { path } } + }` + + let query + if (cachedScmInfo) { + if (labels) { + query = `query { + repository(owner:"${coords.owner}", name:"${coords.name}") { + ${issuesQuery} + + ${subfoldersQuery} + }` + } else { + query = `query { + repository(owner:"${coords.owner}", name:"${coords.name}") { + ${subfoldersQuery} + }` } - - shortenedSubfolderMetaInfs: object(expression: "HEAD:${shortArtifactId}/runtime/src/main/resources/META-INF/") { + } else { + query = `query { + repository(owner:"${coords.owner}", name:"${coords.name}") { + ${issuesQuery} + + defaultBranchRef { + name + } + + metaInfs: object(expression: "HEAD:runtime/src/main/resources/META-INF/") { ... on Tree { entries { path @@ -208,6 +240,8 @@ const fetchScmInfo = async (scmUrl, artifactId, labels) => { } } + ${subfoldersQuery} + openGraphImageUrl } @@ -215,6 +249,7 @@ const fetchScmInfo = async (scmUrl, artifactId, labels) => { avatarUrl } }` + } // We sometimes get bad results back from the git API where json() is null, so do a bit of retrying const body = await promiseRetry( @@ -300,10 +335,13 @@ const fetchScmInfo = async (scmUrl, artifactId, labels) => { scmInfo.socialImage = openGraphImageUrl } + // Save this information for the next time + repoCache[scmUrl] = scmInfo + return scmInfo } else { console.warn( - `Cannot read GitHub information ofr ${artifactId}, because the API did not return any data.` + `Cannot read GitHub information for ${artifactId}, because the API did not return any data.` ) return scmInfo } diff --git a/plugins/github-enricher/gatsby-node.test.js b/plugins/github-enricher/gatsby-node.test.js index 4207218a8486..a7093474736a 100644 --- a/plugins/github-enricher/gatsby-node.test.js +++ b/plugins/github-enricher/gatsby-node.test.js @@ -2,7 +2,7 @@ * @jest-environment node */ -const { onCreateNode, onPreBootstrap } = require("./gatsby-node") +const { onCreateNode, onPreBootstrap, clearCache } = require("./gatsby-node") const { createRemoteFileNode } = require("gatsby-source-filesystem") jest.mock("gatsby-source-filesystem") @@ -57,35 +57,38 @@ describe("the github data handler", () => { const projectName = "somerepo" const ownerName = "someuser" const url = "http://gitsomething.com/someuser/" + projectName + const otherUrl = "http://gitsomething.com/someuser/other" + projectName const issuesUrl = url + "/issues" const avatarUrl = "http://something.com/someuser.png" const socialMediaPreviewUrl = "https://testopengraph.githubassets.com/3096043220541a8ea73deb5cb6baddf0f01d50244737d22402ba12d665e9aec2/quarkiverse/quarkus-some-extension" - const gitHubData = { - json: jest.fn().mockResolvedValue({ - data: { - repository: { - issues: { - totalCount: 16, - }, - defaultBranchRef: { name: "unusual" }, - metaInfs: null, - subfolderMetaInfs: null, - shortenedSubfolderMetaInfs: { - entries: [ - { path: "runtime/src/main/resources/META-INF/beans.xml" }, - { - path: "some-folder-name/runtime/src/main/resources/META-INF/quarkus-extension.yaml", - }, - { path: "runtime/src/main/resources/META-INF/services" }, - ], - }, - openGraphImageUrl: socialMediaPreviewUrl, + const response = { + data: { + repository: { + issues: { + totalCount: 16, }, - repositoryOwner: { avatarUrl: avatarUrl }, + defaultBranchRef: { name: "unusual" }, + metaInfs: null, + subfolderMetaInfs: null, + shortenedSubfolderMetaInfs: { + entries: [ + { path: "runtime/src/main/resources/META-INF/beans.xml" }, + { + path: "some-folder-name/runtime/src/main/resources/META-INF/quarkus-extension.yaml", + }, + { path: "runtime/src/main/resources/META-INF/services" }, + ], + }, + openGraphImageUrl: socialMediaPreviewUrl, }, - }), + repositoryOwner: { avatarUrl: avatarUrl }, + }, + } + + const gitHubApi = { + json: jest.fn().mockResolvedValue(response), } const metadata = { maven: { artifactId: "something", groupId: "grouper" }, @@ -97,11 +100,27 @@ describe("the github data handler", () => { internal, } + const otherMetadata = { + maven: { artifactId: "something", groupId: "grouper" }, + sourceControl: `${otherUrl},mavenstuff`, + } + + const otherNode = { + metadata: otherMetadata, + internal, + } + beforeAll(async () => { // Needed so that we do not short circuit the git path process.env.GITHUB_TOKEN = "test_value" - fetch.mockResolvedValue(gitHubData) + fetch.mockResolvedValue(gitHubApi) await onPreBootstrap({ actions: {} }) + }) + + beforeEach(async () => { + clearCache() + + // Needed so that we do not short circuit the git path return onCreateNode({ node, createContentDigest, @@ -115,6 +134,10 @@ describe("the github data handler", () => { fetch.resetMocks() }) + afterEach(() => { + jest.clearAllMocks() + }) + it("creates an scm node", async () => { expect(createNode).toHaveBeenCalled() }) @@ -208,56 +231,219 @@ describe("the github data handler", () => { ) }) - it("does not create any nodes", async () => { + it("caches the issue count", async () => { + expect(fetch).toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + body: expect.stringMatching(/issues\(states:OPEN/), + }) + ) + + const callCount = fetch.mock.calls.length + + // Now re-trigger the invocation + await onCreateNode( + { + node, + createContentDigest, + createNodeId, + actions, + }, + {} + ) + + // It should call the GitHub API again ... + expect(fetch).toHaveBeenCalledTimes(callCount + 1) + + // But it should not ask for the issues + expect(fetch).not.toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + body: expect.stringMatching(/issues\(states:OPEN/), + }) + ) + + // It should set an issue count, even though it didn't ask for one + expect(createNode).toHaveBeenLastCalledWith( + expect.objectContaining({ issues: 16 }) + ) + }) + + it("caches the top-level quarkus-extension.yaml", async () => { + expect(fetch).toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + body: expect.stringMatching( + /HEAD:runtime\/src\/main\/resources\/META-INF/ + ), + }) + ) + + // Now re-trigger the invocation + await onCreateNode( + { + node, + createContentDigest, + createNodeId, + actions, + }, + {} + ) + + // But it should not ask for the top-level meta-inf listing + expect(fetch).not.toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + body: expect.stringMatching( + /HEAD:runtime\/src\/main\/resources\/META-INF/ + ), + }) + ) + + // It should set an extension descriptor path, even though it didn't ask for one + expect(createNode).toHaveBeenLastCalledWith( + expect.objectContaining({ + extensionYamlUrl: + "http://gitsomething.com/someuser/somerepo/blob/unusual/some-folder-name/runtime/src/main/resources/META-INF/quarkus-extension.yaml", + }) + ) + + // It should fill in the cached information for everything else + expect(createNode).toHaveBeenCalledWith( + expect.objectContaining({ + ownerImageUrl: "http://something.com/someuser.png", + }) + ) + }) + + it("does not cache the quarkus-extension.yaml in subfolders", async () => { + // Sense check + expect(fetch).toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + body: expect.stringMatching( + /HEAD:something\/runtime\/src\/main\/resources\/META-INF/ + ), + }) + ) + + // Now re-trigger the invocation + await onCreateNode( + { + node, + createContentDigest, + createNodeId, + actions, + }, + {} + ) + + // And it should still ask for the subfolder meta-inf listing + expect(fetch).toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + body: expect.stringMatching( + /HEAD:something\/runtime\/src\/main\/resources\/META-INF/ + ), + }) + ) + + // It should set an extension descriptor path + expect(createNode).toHaveBeenLastCalledWith( + expect.objectContaining({ + extensionYamlUrl: + "http://gitsomething.com/someuser/somerepo/blob/unusual/some-folder-name/runtime/src/main/resources/META-INF/quarkus-extension.yaml", + }) + ) + }) + + it("does not cache the issue count on a subsequent call for a different project", async () => { + // Re-trigger the invocation + const newIssueCount = 56 + response.data.repository.issues.totalCount = newIssueCount + + await onCreateNode( + { + node: otherNode, + createContentDigest, + createNodeId, + actions, + }, + {} + ) + + expect(fetch).toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + body: expect.stringMatching(/issues\(states:OPEN/), + }) + ) + + expect(createNode).toHaveBeenLastCalledWith( + expect.objectContaining({ issues: newIssueCount }) + ) + }) + + it("does not create any remote file nodes", async () => { expect(createRemoteFileNode).not.toHaveBeenCalled() }) }) describe("where a label should be used", () => { - const projectName = "somerepo" - const ownerName = "someuser" + const projectName = "quarkus" + const ownerName = "quarkusio" const url = "https://github.com/quarkusio/quarkus" const issuesUrl = url + "/issues?q=is%3Aopen+is%3Aissue+label%3Aarea%2Falabel,area%2Fanotherlabel" + const secondIssuesUrl = + url + + "/issues?q=is%3Aopen+is%3Aissue+label%3Aarea%2Fdifferent,area%2Funique" const avatarUrl = "http://something.com/someuser.png" const socialMediaPreviewUrl = "https://testopengraph.githubassets.com/3096043220541a8ea73deb5cb6baddf0f01d50244737d22402ba12d665e9aec2/quarkiverse/quarkus-some-extension" const labels = ["area/alabel", "area/anotherlabel"] + const secondLabels = ["area/different", "area/unique"] const yaml = `triage: rules: - id: amazon-lambda labels: [ area/alabel, area/anotherlabel ] directories: - extensions/something - - integration-tests/amazon-lambda` + - integration-tests/amazon-lambda + - id: second + labels: [ area/different, area/unique ] + directories: + - extensions/second` - const gitHubData = { - text: jest.fn().mockResolvedValue(yaml), - json: jest.fn().mockResolvedValue({ - data: { - repository: { - issues: { - totalCount: 16, - }, - defaultBranchRef: { name: "unusual" }, - metaInfs: null, - subfolderMetaInfs: null, - shortenedSubfolderMetaInfs: { - entries: [ - { path: "runtime/src/main/resources/META-INF/beans.xml" }, - { - path: "some-folder-name/runtime/src/main/resources/META-INF/quarkus-extension.yaml", - }, - { path: "runtime/src/main/resources/META-INF/services" }, - ], - }, - openGraphImageUrl: socialMediaPreviewUrl, + const response = { + data: { + repository: { + issues: { + totalCount: 16, }, - repositoryOwner: { avatarUrl: avatarUrl }, + defaultBranchRef: { name: "unusual" }, + metaInfs: null, + subfolderMetaInfs: null, + shortenedSubfolderMetaInfs: { + entries: [ + { path: "runtime/src/main/resources/META-INF/beans.xml" }, + { + path: "some-folder-name/runtime/src/main/resources/META-INF/quarkus-extension.yaml", + }, + { path: "runtime/src/main/resources/META-INF/services" }, + ], + }, + openGraphImageUrl: socialMediaPreviewUrl, }, - }), + repositoryOwner: { avatarUrl: avatarUrl }, + }, + } + + const gitHubApi = { + text: jest.fn().mockResolvedValue(yaml), + json: jest.fn().mockResolvedValue(response), } const metadata = { maven: { artifactId: "something", groupId: "group" }, @@ -272,8 +458,14 @@ describe("the github data handler", () => { beforeAll(async () => { // Needed so that we do not short circuit the git path process.env.GITHUB_TOKEN = "test_value" - fetch.mockResolvedValue(gitHubData) + fetch.mockResolvedValue(gitHubApi) await onPreBootstrap({ actions: {} }) + }) + + beforeEach(async () => { + clearCache() + + // Needed so that we do not short circuit the git path return onCreateNode({ node, createContentDigest, @@ -287,6 +479,10 @@ describe("the github data handler", () => { fetch.resetMocks() }) + afterEach(() => { + jest.clearAllMocks() + }) + it("creates an scm node", async () => { expect(createNode).toHaveBeenCalled() }) @@ -361,7 +557,8 @@ describe("the github data handler", () => { expect(createNode).toHaveBeenCalledWith( expect.objectContaining({ extensionYamlUrl: - "http://gitsomething.com/someuser/somerepo/blob/unusual/some-folder-name/runtime/src/main/resources/META-INF/quarkus-extension.yaml", + url + + "/blob/unusual/some-folder-name/runtime/src/main/resources/META-INF/quarkus-extension.yaml", }) ) }) @@ -376,6 +573,181 @@ describe("the github data handler", () => { }) ) }) + + it("does not cache the issue count", async () => { + expect(fetch).toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + // This is a bit fragile with the assumptions about whitespace and a bit fiddly with the slashes, but it checks we did a query and got the project name right + body: expect.stringMatching(/issues\(states:OPEN/), + }) + ) + + // Now re-trigger the invocation + const newIssueCount = 4 + response.data.repository.issues.totalCount = newIssueCount + + await onCreateNode( + { + node, + createContentDigest, + createNodeId, + actions, + }, + {} + ) + + expect(fetch).toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + body: expect.stringMatching(/issues\(states:OPEN/), + }) + ) + + expect(createNode).toHaveBeenLastCalledWith( + expect.objectContaining({ issues: newIssueCount }) + ) + }) + + it("does not cache the labels and issue url", async () => { + expect(fetch).toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + // This is a bit fragile with the assumptions about whitespace and a bit fiddly with the slashes, but it checks we did a query and got the project name right + body: expect.stringMatching(/issues\(states:OPEN/), + }) + ) + + // Now re-trigger the invocation + response.data.repository.issues.totalCount = 4 + + node.metadata.maven.artifactId = "second" + + await onCreateNode( + { + node, + createContentDigest, + createNodeId, + actions, + }, + {} + ) + + expect(createNode).toHaveBeenLastCalledWith( + expect.objectContaining({ issuesUrl: secondIssuesUrl }) + ) + + // It should return the labels for this extension + expect(createNode).toHaveBeenLastCalledWith( + expect.objectContaining({ labels: secondLabels }) + ) + }) + + it("caches the top-level quarkus-extension.yaml", async () => { + // Now re-trigger the invocation + await onCreateNode( + { + node, + createContentDigest, + createNodeId, + actions, + }, + {} + ) + + // But it should not ask for the top-level meta-inf listing + expect(fetch).not.toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + body: expect.stringMatching( + /HEAD:runtime\/src\/main\/resources\/META-INF/ + ), + }) + ) + + // It should set an extension descriptor path, even though it didn't ask for one + expect(createNode).toHaveBeenLastCalledWith( + expect.objectContaining({ + extensionYamlUrl: + url + + "/blob/unusual/some-folder-name/runtime/src/main/resources/META-INF/quarkus-extension.yaml", + }) + ) + }) + + it("does not cache the quarkus-extension.yaml in subfolders", async () => { + // Now re-trigger the invocation + await onCreateNode( + { + node, + createContentDigest, + createNodeId, + actions, + }, + {} + ) + + // And it should still ask for the subfolder meta-inf listing + expect(fetch).toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + body: expect.stringMatching( + /HEAD:second\/runtime\/src\/main\/resources\/META-INF/ + ), + }) + ) + + // It should set an extension descriptor path + expect(createNode).toHaveBeenLastCalledWith( + expect.objectContaining({ + extensionYamlUrl: + url + + "/blob/unusual/some-folder-name/runtime/src/main/resources/META-INF/quarkus-extension.yaml", + }) + ) + }) + + it("caches the image location", async () => { + expect(fetch).toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + // This is a bit fragile with the assumptions about whitespace and a bit fiddly with the slashes, but it checks we did a query and got the project name right + body: expect.stringMatching(/issues\(states:OPEN/), + }) + ) + + response.data.repository.issues.totalCount = 7 + + const callCount = fetch.mock.calls.length + + // Now re-trigger the invocation + await onCreateNode( + { + node, + createContentDigest, + createNodeId, + actions, + }, + {} + ) + + expect(fetch).toHaveBeenCalledTimes(callCount + 1) + + // We shouldn't be asking for image urls or file paths, since those are totally cacheable + expect(fetch).not.toHaveBeenLastCalledWith( + "https://api.github.com/graphql", + expect.objectContaining({ + body: expect.stringMatching(/openGraphImageUrl/), + }) + ) + + // It should fill in the cached information for everything but the issue count and issue url + expect(createNode).toHaveBeenCalledWith( + expect.objectContaining({ + ownerImageUrl: "http://something.com/someuser.png", + }) + ) + }) }) describe("where the social media preview has been set by the user", () => { @@ -385,7 +757,7 @@ describe("the github data handler", () => { const socialMediaPreviewUrl = "https://repository-images.githubusercontent.com/437045322/39ad4dec-e606-4b21-bb24-4c09a4790b58" - const gitHubData = { + const gitHubApi = { json: jest.fn().mockResolvedValue({ data: { repository: { @@ -411,7 +783,7 @@ describe("the github data handler", () => { beforeAll(async () => { // Needed so that we do not short circuit the git path process.env.GITHUB_TOKEN = "social-preview-test_value" - fetch.mockResolvedValue(gitHubData) + fetch.mockResolvedValue(gitHubApi) await onPreBootstrap({ actions: {} }) return onCreateNode({ node,