From 1ce22881c657becf0397b83ac393fb5d2399104c Mon Sep 17 00:00:00 2001 From: Bjorn Lu Date: Tue, 16 Apr 2024 19:48:49 +0800 Subject: [PATCH] Improve sitemap generate performance (#10795) --- .changeset/famous-seals-camp.md | 5 + .../sitemap/src/generate-sitemap.ts | 94 +++++++---- .../sitemap/src/utils/parse-i18n-url.ts | 42 +++++ .../sitemap/src/utils/parse-url.ts | 39 ----- .../test/units/generate-sitemap.test.js | 147 ++++++++++++++++++ 5 files changed, 254 insertions(+), 73 deletions(-) create mode 100644 .changeset/famous-seals-camp.md create mode 100644 packages/integrations/sitemap/src/utils/parse-i18n-url.ts delete mode 100644 packages/integrations/sitemap/src/utils/parse-url.ts create mode 100644 packages/integrations/sitemap/test/units/generate-sitemap.test.js diff --git a/.changeset/famous-seals-camp.md b/.changeset/famous-seals-camp.md new file mode 100644 index 000000000000..b1703ee45ee0 --- /dev/null +++ b/.changeset/famous-seals-camp.md @@ -0,0 +1,5 @@ +--- +"@astrojs/sitemap": patch +--- + +Improves performance when generating the sitemap data diff --git a/packages/integrations/sitemap/src/generate-sitemap.ts b/packages/integrations/sitemap/src/generate-sitemap.ts index b10771ce481a..abd5d2c6ff49 100644 --- a/packages/integrations/sitemap/src/generate-sitemap.ts +++ b/packages/integrations/sitemap/src/generate-sitemap.ts @@ -1,51 +1,77 @@ import type { EnumChangefreq } from 'sitemap'; import type { SitemapItem, SitemapOptions } from './index.js'; -import { parseUrl } from './utils/parse-url.js'; +import { parseI18nUrl } from './utils/parse-i18n-url.js'; /** Construct sitemap.xml given a set of URLs */ -export function generateSitemap(pages: string[], finalSiteUrl: string, opts: SitemapOptions) { - const { changefreq, priority, lastmod: lastmodSrc, i18n } = opts!; +export function generateSitemap(pages: string[], finalSiteUrl: string, opts?: SitemapOptions) { + const { changefreq, priority, lastmod: lastmodSrc, i18n } = opts ?? {}; // TODO: find way to respect URLs here const urls = [...pages]; urls.sort((a, b) => a.localeCompare(b, 'en', { numeric: true })); // sort alphabetically so sitemap is same each time const lastmod = lastmodSrc?.toISOString(); - const { locales, defaultLocale } = i18n || {}; - const localeCodes = Object.keys(locales || {}); + // Parse URLs for i18n matching later + const { defaultLocale, locales } = i18n ?? {}; + let getI18nLinks: GetI18nLinks | undefined; + if (defaultLocale && locales) { + getI18nLinks = createGetI18nLinks(urls, defaultLocale, locales, finalSiteUrl); + } - const getPath = (url: string) => { - const result = parseUrl(url, i18n?.defaultLocale || '', localeCodes, finalSiteUrl); - return result?.path; - }; - const getLocale = (url: string) => { - const result = parseUrl(url, i18n?.defaultLocale || '', localeCodes, finalSiteUrl); - return result?.locale; - }; + const urlData: SitemapItem[] = urls.map((url, i) => ({ + url, + links: getI18nLinks?.(i), + lastmod, + priority, + changefreq: changefreq as EnumChangefreq, + })); + + return urlData; +} + +type GetI18nLinks = (urlIndex: number) => SitemapItem['links'] | undefined; + +function createGetI18nLinks( + urls: string[], + defaultLocale: string, + locales: Record, + finalSiteUrl: string +): GetI18nLinks { + // `parsedI18nUrls` will have the same length as `urls`, matching correspondingly + const parsedI18nUrls = urls.map((url) => parseI18nUrl(url, defaultLocale, locales, finalSiteUrl)); + // Cache as multiple i18n URLs with the same path will have the same links + const i18nPathToLinksCache = new Map(); - const urlData: SitemapItem[] = urls.map((url) => { - let links; - if (defaultLocale && locales) { - const currentPath = getPath(url); - if (currentPath) { - const filtered = urls.filter((subUrl) => getPath(subUrl) === currentPath); - if (filtered.length > 1) { - links = filtered.map((subUrl) => ({ - url: subUrl, - lang: locales[getLocale(subUrl)!], - })); - } + return (urlIndex) => { + const i18nUrl = parsedI18nUrls[urlIndex]; + if (!i18nUrl) { + return undefined; + } + + const cached = i18nPathToLinksCache.get(i18nUrl.path); + if (cached) { + return cached; + } + + // Find all URLs with the same path (without the locale part), e.g. /en/foo and /es/foo + const links: NonNullable = []; + for (let i = 0; i < parsedI18nUrls.length; i++) { + const parsed = parsedI18nUrls[i]; + if (parsed?.path === i18nUrl.path) { + links.push({ + url: urls[i], + lang: locales[parsed.locale], + }); } } - return { - url, - links, - lastmod, - priority, - changefreq: changefreq as EnumChangefreq, - }; - }); + // If 0 or 1 (which is itself), return undefined to not create any links. + // We also don't need to cache this as we know there's no other URLs that would've match this. + if (links.length <= 1) { + return undefined; + } - return urlData; + i18nPathToLinksCache.set(i18nUrl.path, links); + return links; + }; } diff --git a/packages/integrations/sitemap/src/utils/parse-i18n-url.ts b/packages/integrations/sitemap/src/utils/parse-i18n-url.ts new file mode 100644 index 000000000000..5b7ebf78550d --- /dev/null +++ b/packages/integrations/sitemap/src/utils/parse-i18n-url.ts @@ -0,0 +1,42 @@ +interface ParsedI18nUrl { + locale: string; + path: string; +} + +// NOTE: The parameters have been schema-validated with Zod +export function parseI18nUrl( + url: string, + defaultLocale: string, + locales: Record, + base: string +): ParsedI18nUrl | undefined { + if (!url.startsWith(base)) { + return undefined; + } + + let s = url.slice(base.length); + + // Handle root URL + if (!s || s === '/') { + return { locale: defaultLocale, path: '/' }; + } + + if (s[0] !== '/') { + s = '/' + s; + } + + // Get locale from path, e.g. + // "/en-US/" -> "en-US" + // "/en-US/foo" -> "en-US" + const locale = s.split('/')[1]; + if (locale in locales) { + // "/en-US/foo" -> "/foo" + let path = s.slice(1 + locale.length); + if (!path) { + path = '/'; + } + return { locale, path }; + } + + return { locale: defaultLocale, path: s }; +} diff --git a/packages/integrations/sitemap/src/utils/parse-url.ts b/packages/integrations/sitemap/src/utils/parse-url.ts deleted file mode 100644 index f9189cf7d60d..000000000000 --- a/packages/integrations/sitemap/src/utils/parse-url.ts +++ /dev/null @@ -1,39 +0,0 @@ -export const parseUrl = ( - url: string, - defaultLocale: string, - localeCodes: string[], - base: string -) => { - if ( - !url || - !defaultLocale || - localeCodes.length === 0 || - localeCodes.some((key) => !key) || - !base - ) { - throw new Error('parseUrl: some parameters are empty'); - } - if (url.indexOf(base) !== 0) { - return undefined; - } - let s = url.replace(base, ''); - if (!s || s === '/') { - return { locale: defaultLocale, path: '/' }; - } - if (!s.startsWith('/')) { - s = '/' + s; - } - const a = s.split('/'); - const locale = a[1]; - if (localeCodes.some((key) => key === locale)) { - let path = a.slice(2).join('/'); - if (path === '//') { - path = '/'; - } - if (path !== '/' && !path.startsWith('/')) { - path = '/' + path; - } - return { locale, path }; - } - return { locale: defaultLocale, path: s }; -}; diff --git a/packages/integrations/sitemap/test/units/generate-sitemap.test.js b/packages/integrations/sitemap/test/units/generate-sitemap.test.js new file mode 100644 index 000000000000..2af0e42ab8a4 --- /dev/null +++ b/packages/integrations/sitemap/test/units/generate-sitemap.test.js @@ -0,0 +1,147 @@ +import assert from 'node:assert/strict'; +import { describe, it } from 'node:test'; +import { generateSitemap } from '../../dist/generate-sitemap.js'; + +const site = 'http://example.com'; + +describe('generateSitemap', () => { + describe('basic', () => { + it('works', () => { + const items = generateSitemap( + [ + // All pages + `${site}/a`, + `${site}/b`, + `${site}/c`, + ], + site + ); + assert.equal(items.length, 3); + assert.equal(items[0].url, `${site}/a`); + assert.equal(items[1].url, `${site}/b`); + assert.equal(items[2].url, `${site}/c`); + }); + + it('sorts the items', () => { + const items = generateSitemap( + [ + // All pages + `${site}/c`, + `${site}/a`, + `${site}/b`, + ], + site + ); + assert.equal(items.length, 3); + assert.equal(items[0].url, `${site}/a`); + assert.equal(items[1].url, `${site}/b`); + assert.equal(items[2].url, `${site}/c`); + }); + + it('sitemap props are passed to items', () => { + const now = new Date(); + const items = generateSitemap( + [ + // All pages + `${site}/a`, + `${site}/b`, + `${site}/c`, + ], + site, + { + changefreq: 'monthly', + lastmod: now, + priority: 0.5, + } + ); + + assert.equal(items.length, 3); + + assert.equal(items[0].url, `${site}/a`); + assert.equal(items[0].changefreq, 'monthly'); + assert.equal(items[0].lastmod, now.toISOString()); + assert.equal(items[0].priority, 0.5); + + assert.equal(items[1].url, `${site}/b`); + assert.equal(items[1].changefreq, 'monthly'); + assert.equal(items[1].lastmod, now.toISOString()); + assert.equal(items[1].priority, 0.5); + + assert.equal(items[2].url, `${site}/c`); + assert.equal(items[2].changefreq, 'monthly'); + assert.equal(items[2].lastmod, now.toISOString()); + assert.equal(items[2].priority, 0.5); + }); + }); + + describe('i18n', () => { + it('works', () => { + const items = generateSitemap( + [ + // All pages + `${site}/a`, + `${site}/b`, + `${site}/c`, + `${site}/es/a`, + `${site}/es/b`, + `${site}/es/c`, + `${site}/fr/a`, + `${site}/fr/b`, + // `${site}/fr-CA/c`, (intentionally missing for testing) + ], + site, + { + i18n: { + defaultLocale: 'en', + locales: { + en: 'en-US', + es: 'es-ES', + fr: 'fr-CA', + }, + }, + } + ); + + assert.equal(items.length, 8); + + const aLinks = [ + { url: `${site}/a`, lang: 'en-US' }, + { url: `${site}/es/a`, lang: 'es-ES' }, + { url: `${site}/fr/a`, lang: 'fr-CA' }, + ]; + const bLinks = [ + { url: `${site}/b`, lang: 'en-US' }, + { url: `${site}/es/b`, lang: 'es-ES' }, + { url: `${site}/fr/b`, lang: 'fr-CA' }, + ]; + const cLinks = [ + { url: `${site}/c`, lang: 'en-US' }, + { url: `${site}/es/c`, lang: 'es-ES' }, + ]; + + assert.equal(items[0].url, `${site}/a`); + assert.deepEqual(items[0].links, aLinks); + + assert.equal(items[1].url, `${site}/b`); + assert.deepEqual(items[1].links, bLinks); + + assert.equal(items[2].url, `${site}/c`); + assert.deepEqual(items[2].links, cLinks); + + assert.equal(items[3].url, `${site}/es/a`); + assert.deepEqual(items[3].links, aLinks); + + assert.equal(items[4].url, `${site}/es/b`); + assert.deepEqual(items[4].links, bLinks); + + assert.equal(items[5].url, `${site}/es/c`); + assert.deepEqual(items[5].links, cLinks); + + assert.equal(items[6].url, `${site}/fr/a`); + assert.deepEqual(items[6].links, aLinks); + + assert.equal(items[7].url, `${site}/fr/b`); + assert.deepEqual(items[7].links, bLinks); + }); + }); +});