Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Added pagination to sitemap.xml to avoid max 50,000 entries limit #13698

Merged
merged 1 commit into from
Jan 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 21 additions & 18 deletions core/frontend/services/sitemap/base-generator.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ class BaseSiteMapGenerator {
constructor() {
this.nodeLookup = {};
this.nodeTimeLookup = {};
this.siteMapContent = null;
this.siteMapContent = new Map();
this.lastModified = 0;
this.maxNodes = 50000;
this.maxPerPage = 50000;
}

generateXmlFromNodes() {
generateXmlFromNodes(page) {
// Get a mapping of node to timestamp
let nodesToProcess = _.map(this.nodeLookup, (node, id) => {
return {
Expand All @@ -33,20 +33,23 @@ class BaseSiteMapGenerator {
};
});

// Limit to 50k nodes - this is a quick fix to prevent errors in google console
if (this.maxNodes) {
nodesToProcess = nodesToProcess.slice(0, this.maxNodes);
}

// Sort nodes by timestamp
nodesToProcess = _.sortBy(nodesToProcess, 'ts');

// Get the page of nodes that was requested
nodesToProcess = nodesToProcess.slice((page - 1) * this.maxPerPage, page * this.maxPerPage);

// Do not generate empty sitemaps
if (nodesToProcess.length === 0) {
return null;
}

// Grab just the nodes
nodesToProcess = _.map(nodesToProcess, 'node');
const nodes = _.map(nodesToProcess, 'node');

const data = {
// Concat the elements to the _attr declaration
urlset: [XMLNS_DECLS].concat(nodesToProcess)
urlset: [XMLNS_DECLS].concat(nodes)
};

// Generate full xml
Expand All @@ -67,15 +70,15 @@ class BaseSiteMapGenerator {
this.updateLastModified(datum);
this.updateLookups(datum, node);
// force regeneration of xml
this.siteMapContent = null;
this.siteMapContent.clear();
}
}

removeUrl(url, datum) {
this.removeFromLookups(datum);

// force regeneration of xml
this.siteMapContent = null;
this.siteMapContent.clear();
this.lastModified = Date.now();
}

Expand Down Expand Up @@ -152,13 +155,13 @@ class BaseSiteMapGenerator {
return !!imageUrl;
}

getXml() {
if (this.siteMapContent) {
return this.siteMapContent;
getXml(page = 1) {
if (this.siteMapContent.has(page)) {
return this.siteMapContent.get(page);
}

const content = this.generateXmlFromNodes();
this.siteMapContent = content;
const content = this.generateXmlFromNodes(page);
this.siteMapContent.set(page, content);
return content;
}

Expand All @@ -181,7 +184,7 @@ class BaseSiteMapGenerator {
reset() {
this.nodeLookup = {};
this.nodeTimeLookup = {};
this.siteMapContent = null;
this.siteMapContent.clear();
}
}

Expand Down
17 changes: 13 additions & 4 deletions core/frontend/services/sitemap/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ const manager = new Manager();
// Responsible for handling requests for sitemap files
module.exports = function handler(siteApp) {
const verifyResourceType = function verifyResourceType(req, res, next) {
if (!Object.prototype.hasOwnProperty.call(manager, req.params.resource)) {
const resourceWithoutPage = req.params.resource.replace(/-\d+$/, '');
if (!Object.prototype.hasOwnProperty.call(manager, resourceWithoutPage)) {
return res.sendStatus(404);
}

Expand All @@ -22,14 +23,22 @@ module.exports = function handler(siteApp) {
});

siteApp.get('/sitemap-:resource.xml', verifyResourceType, function sitemapResourceXML(req, res) {
const type = req.params.resource;
const page = 1;
const type = req.params.resource.replace(/-\d+$/, '');
const pageParam = (req.params.resource.match(/-(\d+)$/) || [null, null])[1];
const page = pageParam ? parseInt(pageParam, 10) : 1;

const content = manager.getSiteMapXml(type, page);
// Prevent x-1.xml as it is a duplicate of x.xml and empty sitemaps
// (except for the first page so that at least one sitemap exists per type)
if (pageParam === '1' || (!content && page !== 1)) {
return res.sendStatus(404);
}

res.set({
'Cache-Control': 'public, max-age=' + config.get('caching:sitemap:maxAge'),
'Content-Type': 'text/xml'
});

res.send(manager.getSiteMapXml(type, page));
res.send(content);
});
};
30 changes: 20 additions & 10 deletions core/frontend/services/sitemap/index-generator.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class SiteMapIndexGenerator {
constructor(options) {
options = options || {};
this.types = options.types;
this.maxPerPage = options.maxPerPage;
}

getXml() {
Expand All @@ -30,16 +31,25 @@ class SiteMapIndexGenerator {

generateSiteMapUrlElements() {
return _.map(this.types, (resourceType) => {
const url = urlUtils.urlFor({relativeUrl: '/sitemap-' + resourceType.name + '.xml'}, true);
const lastModified = resourceType.lastModified;

return {
sitemap: [
{loc: url},
{lastmod: moment(lastModified).toISOString()}
]
};
});
// `|| 1` = even if there are no items we still have an empty sitemap file
const noOfPages = Math.ceil(Object.keys(resourceType.nodeLookup).length / this.maxPerPage) || 1;
const pages = [];

for (let i = 0; i < noOfPages; i++) {
const page = i === 0 ? '' : `-${i + 1}`;
const url = urlUtils.urlFor({relativeUrl: '/sitemap-' + resourceType.name + page + '.xml'}, true);
const lastModified = resourceType.lastModified;

pages.push({
sitemap: [
{loc: url},
{lastmod: moment(lastModified).toISOString()}
]
});
}

return pages;
}).flat();
}
}

Expand Down
13 changes: 8 additions & 5 deletions core/frontend/services/sitemap/manager.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@ class SiteMapManager {
constructor(options) {
options = options || {};

options.maxPerPage = options.maxPerPage || 50000;

this.pages = options.pages || this.createPagesGenerator(options);
this.posts = options.posts || this.createPostsGenerator(options);
this.users = this.authors = options.authors || this.createUsersGenerator(options);
this.tags = options.tags || this.createTagsGenerator(options);
this.index = options.index || this.createIndexGenerator();
this.index = options.index || this.createIndexGenerator(options);

events.on('router.created', (router) => {
if (router.name === 'StaticRoutesRouter') {
Expand Down Expand Up @@ -43,14 +45,15 @@ class SiteMapManager {
});
}

createIndexGenerator() {
createIndexGenerator(options) {
return new IndexMapGenerator({
types: {
pages: this.pages,
posts: this.posts,
authors: this.authors,
tags: this.tags
}
},
maxPerPage: options.maxPerPage
});
}

Expand All @@ -74,8 +77,8 @@ class SiteMapManager {
return this.index.getXml();
}

getSiteMapXml(type) {
return this[type].getXml();
getSiteMapXml(type, page) {
return this[type].getXml(page);
}
}

Expand Down
62 changes: 52 additions & 10 deletions test/unit/frontend/services/sitemap/generator.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ describe('Generators', function () {
});

it('max node setting results in the right number of nodes', function () {
generator = new PostGenerator({maxNodes: 5});
generator = new PostGenerator({maxPerPage: 5});

for (let i = 0; i < 10; i++) {
generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
Expand All @@ -70,12 +70,12 @@ describe('Generators', function () {
Object.keys(generator.nodeLookup).should.be.Array().with.lengthOf(10);

// But only 5 are output in the xml
generator.siteMapContent.match(/<loc>/g).should.be.Array().with.lengthOf(5);
generator.siteMapContent.get(1).match(/<loc>/g).should.be.Array().with.lengthOf(5);
});

it('default is 50k', function () {
generator = new PostGenerator();
generator.maxNodes.should.eql(50000);
generator.maxPerPage.should.eql(50000);
});

describe('IndexGenerator', function () {
Expand All @@ -86,7 +86,8 @@ describe('Generators', function () {
pages: new PageGenerator(),
tags: new TagGenerator(),
authors: new UserGenerator()
}
},
maxPerPage: 5
});
});

Expand All @@ -99,6 +100,21 @@ describe('Generators', function () {
xml.should.match(/sitemap-pages.xml/);
xml.should.match(/sitemap-authors.xml/);
});

it('creates multiple pages when there are too many posts', function () {
for (let i = 0; i < 10; i++) {
generator.types.posts.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
created_at: (Date.UTC(2014, 11, 22, 12) - 360000) + 200,
updated_at: null,
published_at: null,
slug: `episode-${i}`
}));
}
const xml = generator.getXml();

xml.should.match(/sitemap-posts.xml/);
xml.should.match(/sitemap-posts-2.xml/);
});
});
});

Expand Down Expand Up @@ -126,9 +142,9 @@ describe('Generators', function () {

it('get cached xml', function () {
sinon.spy(generator, 'generateXmlFromNodes');
generator.siteMapContent = 'something';
generator.siteMapContent.set(1, 'something');
generator.getXml().should.eql('something');
generator.siteMapContent = null;
generator.siteMapContent.clear();
generator.generateXmlFromNodes.called.should.eql(false);
});

Expand Down Expand Up @@ -184,6 +200,32 @@ describe('Generators', function () {
idxFirst.should.be.below(idxSecond);
idxSecond.should.be.below(idxThird);
});

it('creates multiple pages when there are too many posts', function () {
generator.maxPerPage = 5;
urlUtils.urlFor.withArgs('sitemap_xsl', true).returns('http://my-ghost-blog.com/sitemap.xsl');
for (let i = 0; i < 10; i++) {
generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
created_at: (Date.UTC(2014, 11, 22, 12) - 360000) + 200,
updated_at: null,
published_at: null,
slug: `episode-${i}`
}));
}

const pages = [generator.getXml(), generator.getXml(2)];

for (let i = 0; i < 10; i++) {
const pageIndex = Math.floor(i / 5);
pages[pageIndex].should.containEql(`<loc>http://my-ghost-blog.com/episode-${i}/</loc>`);
}
});

it('shouldn\'t break with out of bounds pages', function () {
should.not.exist(generator.getXml(-1));
should.not.exist(generator.getXml(99999));
should.not.exist(generator.getXml(0));
});
});

describe('fn: removeUrl', function () {
Expand Down Expand Up @@ -224,12 +266,12 @@ describe('Generators', function () {

generator.getXml();

generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/home/</loc>');
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/magic/</loc>');
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/subscribe/</loc>');
generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/home/</loc>');
generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/magic/</loc>');
generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/subscribe/</loc>');

// <loc> should exist exactly one time
generator.siteMapContent.match(/<loc>/g).length.should.eql(3);
generator.siteMapContent.get(1).match(/<loc>/g).length.should.eql(3);
});
});
});
Expand Down