From 2c13e268da3b06d31df447777e6b40e759aece89 Mon Sep 17 00:00:00 2001 From: Thomas Woodward Date: Fri, 9 Aug 2024 14:04:34 -0400 Subject: [PATCH] update prerender to include a content manifest csv output (#2268) * update prerender to include a content manifest csv output * add toc node types * Add language and book slug * :shirt: * remove debug * :shirt: * :pencil: * fix import --------- Co-authored-by: staxly[bot] <35789409+staxly[bot]@users.noreply.github.com> --- script/prerender/contentManifest.ts | 60 +++++++++++++++++++++++++++++ script/prerender/fleet.ts | 30 +++++---------- script/prerender/local.ts | 4 +- script/prerender/sitemap.ts | 35 +++++------------ script/prerender/thread.ts | 23 +---------- src/app/content/types.ts | 2 + 6 files changed, 85 insertions(+), 69 deletions(-) create mode 100644 script/prerender/contentManifest.ts diff --git a/script/prerender/contentManifest.ts b/script/prerender/contentManifest.ts new file mode 100644 index 0000000000..f57766b577 --- /dev/null +++ b/script/prerender/contentManifest.ts @@ -0,0 +1,60 @@ +import { BookWithOSWebData, ArchiveTreeNode, ArchiveTree } from '../../src/app/content/types'; +import { content } from '../../src/app/content/routes'; +import { writeAssetFile } from './fileUtils'; +import { stripIdVersion } from '../../src/app/content/utils/idUtils'; +import { splitTitleParts } from '../../src/app/content/utils/archiveTreeUtils'; + +const quoteValue = (value?: string) => value ? 
`"${value.replace(/"/g, '""')}"` : '""'; + +export const renderAndSaveContentManifest = async( + saveFile: (path: string, contents: string) => Promise, + books: BookWithOSWebData[] +) => { + + const rows = books.map(book => getContentsRows(book, book.tree)) + .reduce((result, item) => ([...result, ...item]), [] as string[][]); + + const manifestText = [ + ['id', 'title', 'text title', 'language', 'slug', 'url', 'toc type', 'toc target type'], + ...rows, + ].map(row => row.map(quoteValue).join(',')).join('\n'); + + await saveFile('/rex/content-metadata.csv', manifestText); +}; + +function getContentsRows( + book: BookWithOSWebData, + node: ArchiveTree | ArchiveTreeNode, + chapterNumber?: string +): string[][] { + const {title, toc_target_type} = node; + const [titleNumber, titleString] = splitTitleParts(node.title); + const textTitle = `${titleNumber || chapterNumber || ''} ${titleString}`.replace(/\s+/, ' ').trim(); + const id = stripIdVersion(node.id); + const tocType = node.toc_type ?? (id === book.id ? 'book' : ''); + + const urlParams = tocType === 'book' + ? [node.slug, ''] + : 'contents' in node + ? ['', ''] + : [node.slug, content.getUrl({book: {slug: book.slug}, page: {slug: node.slug}})]; + + const contents = 'contents' in node + ? node.contents.map(child => getContentsRows(book, child, titleNumber || chapterNumber)) + .reduce((result, item) => ([...result, ...item]), [] as string[][]) + : []; + + return [ + [stripIdVersion(id), title, textTitle, book.language, ...urlParams, tocType, toc_target_type ?? 
''], + ...contents, + ]; +} + + +// simple helper for local +const writeAssetFileAsync = async(filepath: string, contents: string) => { + return writeAssetFile(filepath, contents); +}; +export const renderContentManifest = async(books: BookWithOSWebData[]) => { + return renderAndSaveContentManifest(writeAssetFileAsync, books); +}; diff --git a/script/prerender/fleet.ts b/script/prerender/fleet.ts index 969b9dbaf6..9e5173110c 100644 --- a/script/prerender/fleet.ts +++ b/script/prerender/fleet.ts @@ -43,11 +43,13 @@ import { getBooksConfigSync } from '../../src/gateways/createBookConfigLoader'; import createOSWebLoader from '../../src/gateways/createOSWebLoader'; import { readFile } from '../../src/helpers/fileUtils'; import { globalMinuteCounter, prepareBookPages } from './contentPages'; -import { SerializedBookMatch, SerializedPageMatch } from './contentRoutes'; +import { SerializedPageMatch } from './contentRoutes'; import createRedirects from './createRedirects'; import './logUnhandledRejectionsAndExit'; import renderManifest from './renderManifest'; -import { SitemapPayload } from './sitemap'; +import { SitemapPayload, renderAndSaveSitemapIndex } from './sitemap'; +import { writeS3ReleaseXmlFile } from './fileUtils'; +import { renderAndSaveContentManifest } from './contentManifest'; const { ARCHIVE_URL, @@ -86,7 +88,6 @@ const sqsClient = new SQSClient({ region: WORK_REGION }); type PageTask = { payload: SerializedPageMatch, type: 'page' }; type SitemapTask = { payload: SitemapPayload, type: 'sitemap' }; -type SitemapIndexTask = { payload: SerializedBookMatch[], type: 'sitemapIndex' }; const booksConfig = getBooksConfigSync(); const archiveLoader = createArchiveLoader({ @@ -288,8 +289,7 @@ async function getQueueUrls(workersStackName: string) { class Stats { public pages = 0; public sitemaps = 0; - public sitemapIndexes = 0; - get total() { return this.pages + this.sitemaps + this.sitemapIndexes; } + get total() { return this.pages + this.sitemaps; } } function 
makePrepareAndQueueBook(workQueueUrl: string, stats: Stats) { @@ -347,11 +347,7 @@ function makePrepareAndQueueBook(workQueueUrl: string, stats: Stats) { console.log(`[${book.title}] Sitemap queued`); - // Used in the sitemap index - return { - params: { book: { slug: book.slug } }, - state: { bookUid: book.id, bookVersion: book.version }, - }; + return book; }; } @@ -371,14 +367,8 @@ async function queueWork(workQueueUrl: string) { `All ${stats.pages} page prerendering jobs and all ${stats.sitemaps} sitemap jobs queued` ); - await sendWithRetries(sqsClient, new SendMessageCommand({ - MessageBody: JSON.stringify({ payload: books, type: 'sitemapIndex' } as SitemapIndexTask), - QueueUrl: workQueueUrl, - })); - - stats.sitemapIndexes = 1; - - console.log('1 sitemap index job queued'); + renderAndSaveSitemapIndex(writeS3ReleaseXmlFile, books); + renderAndSaveContentManifest(writeS3ReleaseXmlFile, books); return stats; } @@ -463,8 +453,8 @@ async function finishRendering(stats: Stats) { const elapsedMinutes = globalMinuteCounter(); console.log( - `Prerender complete in ${elapsedMinutes} minutes. Rendered ${stats.pages} pages, ${ - stats.sitemaps} sitemaps and ${stats.sitemapIndexes} sitemap index. ${ + `Prerender complete in ${elapsedMinutes} minutes. Rendered ${stats.pages} pages, and ${ + stats.sitemaps} sitemaps. 
${ stats.total / elapsedMinutes}ppm` ); } diff --git a/script/prerender/local.ts b/script/prerender/local.ts index 45f53a7f74..a419347bd7 100644 --- a/script/prerender/local.ts +++ b/script/prerender/local.ts @@ -25,6 +25,7 @@ import { createDiskCache } from './fileUtils'; import renderManifest from './renderManifest'; import { renderSitemap, renderSitemapIndex } from './sitemap'; import userLoader from './stubbedUserLoader'; +import { renderContentManifest } from './contentManifest'; const { REACT_APP_HIGHLIGHTS_URL, @@ -81,7 +82,8 @@ async function render() { await renderSitemap(book.slug, sitemap); } - await renderSitemapIndex(); + await renderSitemapIndex(books); + await renderContentManifest(books); await renderManifest(); await createRedirects(archiveLoader, osWebLoader); diff --git a/script/prerender/sitemap.ts b/script/prerender/sitemap.ts index b46d1c0b75..7538d59066 100644 --- a/script/prerender/sitemap.ts +++ b/script/prerender/sitemap.ts @@ -1,12 +1,8 @@ -import filter from 'lodash/fp/filter'; -import flow from 'lodash/fp/flow'; -import get from 'lodash/fp/get'; -import identity from 'lodash/fp/identity'; -import map from 'lodash/fp/map'; -import max from 'lodash/fp/max'; import sitemap, { SitemapItemOptions } from 'sitemap'; import { SerializedPageMatch } from './contentRoutes'; import { writeAssetFile } from './fileUtils'; +import { BookWithOSWebData } from '../../src/app/content/types'; +import { getSitemapItemOptions } from './contentPages'; export const sitemapPath = (pathName: string) => `/rex/sitemaps/${pathName}.xml`; @@ -28,40 +24,27 @@ export const renderAndSaveSitemap = async( export const renderAndSaveSitemapIndex = async( saveFile: (path: string, contents: string) => Promise, - urls: SitemapItemOptions[] + books: BookWithOSWebData[] ) => { - const sitemapIndex = sitemap.buildSitemapIndex({ urls }); + const sitemapIndex = sitemap.buildSitemapIndex({urls: books.map(book => + getSitemapItemOptions(book, 
`https://openstax.org${sitemapPath(book.slug)}`) + )}); const filePath = sitemapPath('index'); await saveFile(filePath, sitemapIndex.toString()); - - return filePath; }; // renderSitemap() and renderSitemapIndex() are used only by single-instance prerender code -// Multi-instance code cannot store an array of sitemaps in memory and then use it across instances -const sitemaps: SitemapItemOptions[] = []; - const writeAssetFileAsync = async(filepath: string, contents: string) => { return writeAssetFile(filepath, contents); }; export const renderSitemap = async(filename: string, urls: SitemapItemOptions[]) => { - const lastmod = flow( - map(get('lastmod')), - filter(identity), - max - )(urls); - - const filePath = await renderAndSaveSitemap(writeAssetFileAsync, filename, urls); - - const url = `https://openstax.org${filePath}`; - - sitemaps.push({url, lastmod}); + await renderAndSaveSitemap(writeAssetFileAsync, filename, urls); }; -export const renderSitemapIndex = async() => { - return renderAndSaveSitemapIndex(writeAssetFileAsync, sitemaps); +export const renderSitemapIndex = async(books: BookWithOSWebData[]) => { + return renderAndSaveSitemapIndex(writeAssetFileAsync, books); }; diff --git a/script/prerender/thread.ts b/script/prerender/thread.ts index 5106d28c94..4b821a29c2 100644 --- a/script/prerender/thread.ts +++ b/script/prerender/thread.ts @@ -24,17 +24,13 @@ import createImageCDNUtils from '../../src/gateways/createImageCDNUtils'; import { getSitemapItemOptions, renderAndSavePage } from './contentPages'; import { deserializePageMatch, - getArchiveBook, getArchivePage, - SerializedBookMatch, SerializedPageMatch, } from './contentRoutes'; import { writeS3ReleaseHtmlFile, writeS3ReleaseXmlFile } from './fileUtils'; import './logUnhandledRejectionsAndExit'; import { renderAndSaveSitemap, - renderAndSaveSitemapIndex, - sitemapPath, SitemapPayload, } from './sitemap'; import userLoader from './stubbedUserLoader'; @@ -90,24 +86,8 @@ function 
makeSitemapTask(services: AppOptions['services']) { }; } -function makeSitemapIndexTask(services: AppOptions['services']) { - return async(payload: SerializedBookMatch[]) => { - const books = payload.map( - (book: SerializedBookMatch, index: number) => assertObject( - book, `Sitemap Index task payload[${index}] is not an object: ${payload}` - ) - ); - const items = await asyncPool(MAX_CONCURRENT_CONNECTIONS, books, async(book) => { - const archiveBook = await getArchiveBook(services, book); - return getSitemapItemOptions(archiveBook, `https://openstax.org${sitemapPath(book.params.book.slug)}`); - }); - return renderAndSaveSitemapIndex(writeS3ReleaseXmlFile, items); - }; -} - type AnyTaskFunction = ((payload: SerializedPageMatch) => void) | - ((payload: SitemapPayload) => void) | - ((payload: SerializedBookMatch[]) => void); + ((payload: SitemapPayload) => void); type TaskFunctionsMap = { [key: string]: AnyTaskFunction | undefined }; @@ -141,7 +121,6 @@ async function makeTaskFunctionsMap() { return { page: makePageTask(services), sitemap: makeSitemapTask(services), - sitemapIndex: makeSitemapIndexTask(services), } as TaskFunctionsMap; } diff --git a/src/app/content/types.ts b/src/app/content/types.ts index 2462f7e830..d770dfb4f1 100644 --- a/src/app/content/types.ts +++ b/src/app/content/types.ts @@ -112,6 +112,8 @@ export interface ArchiveTreeNode { id: string; title: string; slug: string; + toc_type?: string; + toc_target_type?: string; } export type ArchiveTreeSectionType = 'book' | 'unit' | 'chapter' | 'page' | 'eoc-dropdown' | 'eob-dropdown';