Skip to content

Commit

Permalink
update prerender to include a content manifest csv output (#2268)
Browse files Browse the repository at this point in the history
* update prerender to include a content manifest csv output

* add toc node types

* Add language and book slug

* 👕

* remove debug

* 👕

* 📝

* fix import

---------

Co-authored-by: staxly[bot] <35789409+staxly[bot]@users.noreply.github.com>
  • Loading branch information
TomWoodward and staxly[bot] authored Aug 9, 2024
1 parent 27a755a commit 2c13e26
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 69 deletions.
60 changes: 60 additions & 0 deletions script/prerender/contentManifest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import { BookWithOSWebData, ArchiveTreeNode, ArchiveTree } from '../../src/app/content/types';
import { content } from '../../src/app/content/routes';
import { writeAssetFile } from './fileUtils';
import { stripIdVersion } from '../../src/app/content/utils/idUtils';
import { splitTitleParts } from '../../src/app/content/utils/archiveTreeUtils';

/**
 * Formats a value as a quoted CSV field: the value is wrapped in double
 * quotes and every embedded double quote is doubled. A missing or empty
 * value yields an empty quoted field.
 */
const quoteValue = (value?: string) => {
  if (!value) {
    return '""';
  }

  const escaped = value.split('"').join('""');
  return '"' + escaped + '"';
};

/**
 * Builds the content manifest CSV for the given books and passes it to
 * `saveFile` under the path `/rex/content-metadata.csv`.
 *
 * The first line is the column header; it is followed by one line per row
 * returned by `getContentsRows` for each book's tree, in book order. Every
 * field (header included) is quoted with `quoteValue`, fields are joined
 * with commas and lines with `\n`.
 *
 * @param saveFile writer that receives the destination path and the CSV text
 * @param books books whose trees are listed in the manifest
 */
export const renderAndSaveContentManifest = async(
  saveFile: (path: string, contents: string) => Promise<unknown>,
  books: BookWithOSWebData[]
) => {
  const header = ['id', 'title', 'text title', 'language', 'slug', 'url', 'toc type', 'toc target type'];
  const csvLines: string[] = [header.map(quoteValue).join(',')];

  for (const book of books) {
    for (const row of getContentsRows(book, book.tree)) {
      csvLines.push(row.map(quoteValue).join(','));
    }
  }

  await saveFile('/rex/content-metadata.csv', csvLines.join('\n'));
};

/**
 * Produces the manifest rows for `node` followed by the rows of all of its
 * descendants, in tree order.
 *
 * Each row holds, in order: id (version stripped), raw title, text title,
 * book language, slug, url, toc type, toc target type.
 *
 * @param book the book `node` belongs to; supplies the id used to detect the
 *   book root, the language column, and the book slug used in page urls
 * @param node the tree node to describe; nodes that have `contents` are
 *   recursed into
 * @param chapterNumber title number inherited from the nearest ancestor that
 *   had one; used as a prefix when this node's own title yields none
 * @returns the row for `node`, then the rows of its descendants
 */
function getContentsRows(
  book: BookWithOSWebData,
  node: ArchiveTree | ArchiveTreeNode,
  chapterNumber?: string
): string[][] {
  const {title, toc_target_type} = node;
  // presumably [number, text] parsed out of the title markup — confirm against splitTitleParts
  const [titleNumber, titleString] = splitTitleParts(node.title);
  // the g flag collapses every run of whitespace, not only the first one
  const textTitle = `${titleNumber || chapterNumber || ''} ${titleString}`.replace(/\s+/g, ' ').trim();
  const id = stripIdVersion(node.id);
  // nodes without an explicit toc_type are labelled 'book' only when they are the book root
  const tocType = node.toc_type ?? (id === book.id ? 'book' : '');

  // [slug, url]: the book row has a slug but no url, other container nodes
  // have neither, and leaf nodes have both
  const urlParams = tocType === 'book'
    ? [node.slug, '']
    : 'contents' in node
      ? ['', '']
      : [node.slug, content.getUrl({book: {slug: book.slug}, page: {slug: node.slug}})];

  // gather descendant rows with plain pushes rather than re-spreading the
  // accumulated array once per child
  const contents: string[][] = [];
  if ('contents' in node) {
    for (const child of node.contents) {
      for (const row of getContentsRows(book, child, titleNumber || chapterNumber)) {
        contents.push(row);
      }
    }
  }

  return [
    // `id` is already version-stripped above, so it is used as-is
    [id, title, textTitle, book.language, ...urlParams, tocType, toc_target_type ?? ''],
    ...contents,
  ];
}


// local prerender helper: exposes writeAssetFile through an async function
// so it can be passed where a promise-returning saveFile callback is expected
const writeAssetFileAsync = async(filepath: string, contents: string) =>
  writeAssetFile(filepath, contents);
/**
 * Renders the content manifest for `books` and saves it through
 * `writeAssetFileAsync`.
 */
export const renderContentManifest = async(books: BookWithOSWebData[]) =>
  renderAndSaveContentManifest(writeAssetFileAsync, books);
30 changes: 10 additions & 20 deletions script/prerender/fleet.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,13 @@ import { getBooksConfigSync } from '../../src/gateways/createBookConfigLoader';
import createOSWebLoader from '../../src/gateways/createOSWebLoader';
import { readFile } from '../../src/helpers/fileUtils';
import { globalMinuteCounter, prepareBookPages } from './contentPages';
import { SerializedBookMatch, SerializedPageMatch } from './contentRoutes';
import { SerializedPageMatch } from './contentRoutes';
import createRedirects from './createRedirects';
import './logUnhandledRejectionsAndExit';
import renderManifest from './renderManifest';
import { SitemapPayload } from './sitemap';
import { SitemapPayload, renderAndSaveSitemapIndex } from './sitemap';
import { writeS3ReleaseXmlFile } from './fileUtils';
import { renderAndSaveContentManifest } from './contentManifest';

const {
ARCHIVE_URL,
Expand Down Expand Up @@ -86,7 +88,6 @@ const sqsClient = new SQSClient({ region: WORK_REGION });

type PageTask = { payload: SerializedPageMatch, type: 'page' };
type SitemapTask = { payload: SitemapPayload, type: 'sitemap' };
type SitemapIndexTask = { payload: SerializedBookMatch[], type: 'sitemapIndex' };

const booksConfig = getBooksConfigSync();
const archiveLoader = createArchiveLoader({
Expand Down Expand Up @@ -288,8 +289,7 @@ async function getQueueUrls(workersStackName: string) {
class Stats {
public pages = 0;
public sitemaps = 0;
public sitemapIndexes = 0;
get total() { return this.pages + this.sitemaps + this.sitemapIndexes; }
get total() { return this.pages + this.sitemaps; }
}

function makePrepareAndQueueBook(workQueueUrl: string, stats: Stats) {
Expand Down Expand Up @@ -347,11 +347,7 @@ function makePrepareAndQueueBook(workQueueUrl: string, stats: Stats) {

console.log(`[${book.title}] Sitemap queued`);

// Used in the sitemap index
return {
params: { book: { slug: book.slug } },
state: { bookUid: book.id, bookVersion: book.version },
};
return book;
};
}

Expand All @@ -371,14 +367,8 @@ async function queueWork(workQueueUrl: string) {
`All ${stats.pages} page prerendering jobs and all ${stats.sitemaps} sitemap jobs queued`
);

await sendWithRetries(sqsClient, new SendMessageCommand({
MessageBody: JSON.stringify({ payload: books, type: 'sitemapIndex' } as SitemapIndexTask),
QueueUrl: workQueueUrl,
}));

stats.sitemapIndexes = 1;

console.log('1 sitemap index job queued');
renderAndSaveSitemapIndex(writeS3ReleaseXmlFile, books);
renderAndSaveContentManifest(writeS3ReleaseXmlFile, books);

return stats;
}
Expand Down Expand Up @@ -463,8 +453,8 @@ async function finishRendering(stats: Stats) {
const elapsedMinutes = globalMinuteCounter();

console.log(
`Prerender complete in ${elapsedMinutes} minutes. Rendered ${stats.pages} pages, ${
stats.sitemaps} sitemaps and ${stats.sitemapIndexes} sitemap index. ${
`Prerender complete in ${elapsedMinutes} minutes. Rendered ${stats.pages} pages, and ${
stats.sitemaps} sitemaps. ${
stats.total / elapsedMinutes}ppm`
);
}
Expand Down
4 changes: 3 additions & 1 deletion script/prerender/local.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import { createDiskCache } from './fileUtils';
import renderManifest from './renderManifest';
import { renderSitemap, renderSitemapIndex } from './sitemap';
import userLoader from './stubbedUserLoader';
import { renderContentManifest } from './contentManifest';

const {
REACT_APP_HIGHLIGHTS_URL,
Expand Down Expand Up @@ -81,7 +82,8 @@ async function render() {
await renderSitemap(book.slug, sitemap);
}

await renderSitemapIndex();
await renderSitemapIndex(books);
await renderContentManifest(books);
await renderManifest();
await createRedirects(archiveLoader, osWebLoader);

Expand Down
35 changes: 9 additions & 26 deletions script/prerender/sitemap.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import filter from 'lodash/fp/filter';
import flow from 'lodash/fp/flow';
import get from 'lodash/fp/get';
import identity from 'lodash/fp/identity';
import map from 'lodash/fp/map';
import max from 'lodash/fp/max';
import sitemap, { SitemapItemOptions } from 'sitemap';
import { SerializedPageMatch } from './contentRoutes';
import { writeAssetFile } from './fileUtils';
import { BookWithOSWebData } from '../../src/app/content/types';
import { getSitemapItemOptions } from './contentPages';

export const sitemapPath = (pathName: string) => `/rex/sitemaps/${pathName}.xml`;

Expand All @@ -28,40 +24,27 @@ export const renderAndSaveSitemap = async(

export const renderAndSaveSitemapIndex = async(
saveFile: (path: string, contents: string) => Promise<unknown>,
urls: SitemapItemOptions[]
books: BookWithOSWebData[]
) => {
const sitemapIndex = sitemap.buildSitemapIndex({ urls });
const sitemapIndex = sitemap.buildSitemapIndex({urls: books.map(book =>
getSitemapItemOptions(book, `https://openstax.org${sitemapPath(book.slug)}`)
)});

const filePath = sitemapPath('index');

await saveFile(filePath, sitemapIndex.toString());

return filePath;
};

// renderSitemap() and renderSitemapIndex() are used only by single-instance prerender code

// Multi-instance code cannot store an array of sitemaps in memory and then use it across instances
const sitemaps: SitemapItemOptions[] = [];

const writeAssetFileAsync = async(filepath: string, contents: string) => {
return writeAssetFile(filepath, contents);
};

export const renderSitemap = async(filename: string, urls: SitemapItemOptions[]) => {
const lastmod = flow(
map<SitemapItemOptions, (string | undefined)>(get('lastmod')),
filter<string | undefined>(identity),
max
)(urls);

const filePath = await renderAndSaveSitemap(writeAssetFileAsync, filename, urls);

const url = `https://openstax.org${filePath}`;

sitemaps.push({url, lastmod});
await renderAndSaveSitemap(writeAssetFileAsync, filename, urls);
};

export const renderSitemapIndex = async() => {
return renderAndSaveSitemapIndex(writeAssetFileAsync, sitemaps);
export const renderSitemapIndex = async(books: BookWithOSWebData[]) => {
return renderAndSaveSitemapIndex(writeAssetFileAsync, books);
};
23 changes: 1 addition & 22 deletions script/prerender/thread.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,13 @@ import createImageCDNUtils from '../../src/gateways/createImageCDNUtils';
import { getSitemapItemOptions, renderAndSavePage } from './contentPages';
import {
deserializePageMatch,
getArchiveBook,
getArchivePage,
SerializedBookMatch,
SerializedPageMatch,
} from './contentRoutes';
import { writeS3ReleaseHtmlFile, writeS3ReleaseXmlFile } from './fileUtils';
import './logUnhandledRejectionsAndExit';
import {
renderAndSaveSitemap,
renderAndSaveSitemapIndex,
sitemapPath,
SitemapPayload,
} from './sitemap';
import userLoader from './stubbedUserLoader';
Expand Down Expand Up @@ -90,24 +86,8 @@ function makeSitemapTask(services: AppOptions['services']) {
};
}

function makeSitemapIndexTask(services: AppOptions['services']) {
return async(payload: SerializedBookMatch[]) => {
const books = payload.map(
(book: SerializedBookMatch, index: number) => assertObject(
book, `Sitemap Index task payload[${index}] is not an object: ${payload}`
)
);
const items = await asyncPool(MAX_CONCURRENT_CONNECTIONS, books, async(book) => {
const archiveBook = await getArchiveBook(services, book);
return getSitemapItemOptions(archiveBook, `https://openstax.org${sitemapPath(book.params.book.slug)}`);
});
return renderAndSaveSitemapIndex(writeS3ReleaseXmlFile, items);
};
}

type AnyTaskFunction = ((payload: SerializedPageMatch) => void) |
((payload: SitemapPayload) => void) |
((payload: SerializedBookMatch[]) => void);
((payload: SitemapPayload) => void);

type TaskFunctionsMap = { [key: string]: AnyTaskFunction | undefined };

Expand Down Expand Up @@ -141,7 +121,6 @@ async function makeTaskFunctionsMap() {
return {
page: makePageTask(services),
sitemap: makeSitemapTask(services),
sitemapIndex: makeSitemapIndexTask(services),
} as TaskFunctionsMap;
}

Expand Down
2 changes: 2 additions & 0 deletions src/app/content/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ export interface ArchiveTreeNode {
id: string;
title: string;
slug: string;
toc_type?: string;
toc_target_type?: string;
}

export type ArchiveTreeSectionType = 'book' | 'unit' | 'chapter' | 'page' | 'eoc-dropdown' | 'eob-dropdown';
Expand Down

0 comments on commit 2c13e26

Please sign in to comment.