From f2ae9588148fd2033e899e88c186db92bfd93289 Mon Sep 17 00:00:00 2001 From: Eakam <67077705+Eakam1007@users.noreply.github.com> Date: Wed, 14 Dec 2022 15:50:15 -0500 Subject: [PATCH] Filter out irrelevant feed URLs (#3780) --- src/api/feed-discovery/src/util.js | 22 ++++++++-- src/api/feed-discovery/test/router.test.js | 24 ++++++++--- src/api/feed-discovery/test/util.test.js | 49 ++++++++++++++++++++++ 3 files changed, 85 insertions(+), 10 deletions(-) diff --git a/src/api/feed-discovery/src/util.js b/src/api/feed-discovery/src/util.js index 8c49b85b4f..3cad274899 100644 --- a/src/api/feed-discovery/src/util.js +++ b/src/api/feed-discovery/src/util.js @@ -76,6 +76,17 @@ const getFeedUrlType = (feedUrl) => { } }; +// return true if the feedURL is a relevant URL that we want to keep +const relevantFeedUrl = (feedUrl) => { + const invalidPathMatchers = [ + /\/comments\/feed\/$/, // wordpress.com comments feed + /^https:\/\/public-api.wordpress.com\/oembed/, + /^https:\/\/www.blogger.com\/feeds\/[0-9]*\/posts\/default$/, + /\/feeds\/posts\/default\?alt=rss$/, // blogspot.com alternate rss feed + ]; + return invalidPathMatchers.every((matcher) => !matcher.test(feedUrl)); +}; + // Helper function to return the feed url of a given blog url const getFeedUrls = (document) => { try { @@ -103,10 +114,12 @@ const getFeedUrls = (document) => { if (links.length > 0) { for (let i = 0; i < links.length; i += 1) { const feedUrl = links[i].attribs.href; - feedUrls.push({ - feedUrl, - type: getFeedUrlType(feedUrl), - }); + if (relevantFeedUrl(feedUrl)) { + feedUrls.push({ + feedUrl, + type: getFeedUrlType(feedUrl), + }); + } } } @@ -123,3 +136,4 @@ module.exports.getFeedUrls = getFeedUrls; module.exports.isTwitchUrl = isTwitchUrl; module.exports.toTwitchFeedUrl = toTwitchFeedUrl; module.exports.isFeedUrl = isFeedUrl; +module.exports.relevantFeedUrl = relevantFeedUrl; diff --git a/src/api/feed-discovery/test/router.test.js b/src/api/feed-discovery/test/router.test.js index 21a241bbe5..7a5d5a51a0 100644 --- a/src/api/feed-discovery/test/router.test.js +++ b/src/api/feed-discovery/test/router.test.js @@ -267,17 +267,25 @@ describe('POST /', () => { expect(res.body).toEqual(result); }); - it('should return 200 and all feed urls if there are multiple link elements that could contain a feed url', async () => { + it('should return 200 and all relevant feed urls if there are multiple link elements that could contain a feed url', async () => { const blogUrl = 'https://test321.blogspot.com/'; const mockBlogUrlResponseBody = `
- - - - - + + + + + + + + + + + + + @@ -290,6 +298,10 @@ describe('POST /', () => { 'https://test321.blogspot.com/json', 'https://test321.blogspot.com/oembed/?format=json', 'https://test321.blogspot.com/oembed/?format=xml', + 'https://test321.blogspot.com/feeds/posts/default', + 'https://test321.wordpress.com/feed/', + 'https://medium.com/feed/@test321', + 'https://dev.to/feed/test321', ].map((feedUrl) => ({ feedUrl, type: 'blog' })), }; diff --git a/src/api/feed-discovery/test/util.test.js b/src/api/feed-discovery/test/util.test.js index e0a72ed442..a599166e12 100644 --- a/src/api/feed-discovery/test/util.test.js +++ b/src/api/feed-discovery/test/util.test.js @@ -6,6 +6,7 @@ const { isFeedUrl, getBlogBody, getFeedUrlType, + relevantFeedUrl, getFeedUrls, } = require('../src/util'); @@ -127,6 +128,27 @@ describe('util.js', () => { expect(getFeedUrlType('not-valid')).toBe('blog'); }); + test('relevantFeedUrl returns true for relevant feed URLs', () => { + [ + 'https://test321.com/feed/user', + 'https://test321.workpress.com/feed/', + 'https://test321.blogspot.com/feeds/posts/default', + ].forEach((feedUrl) => { + expect(relevantFeedUrl(feedUrl)).toBe(true); + }); + }); + + test('relevantFeedUrl returns false for irrelevant feed URLs', () => { + [ + 'https://test321.workpress.com/comments/feed/', + 'https://public-api.wordpress.com/oembed/?format=json&url=https%3A%2F%2Ftest321.wordpress.com%2F&for=wpcom-auto-discovery', + 'https://www.blogger.com/feeds/123/posts/default', + 'https://test321.blogspot.com/feeds/posts/default?alt=rss', + ].forEach((feedUrl) => { + expect(relevantFeedUrl(feedUrl)).toBe(false); + }); + }); + test('getFeedUrls returns expected atom+xml feed URL for a given document', () => { const html = (type) => ` @@ -158,4 +180,31 @@ describe('util.js', () => { test('getFeedUrls returns null if document cannot be parsed', () => { expect(getFeedUrls(null)).toBe(null); }); + + test('getFeedUrls filters irrelevant feed URLs', () => { + const html = ` + + + + + + + + + + + + + + `; + + const expectedFeedUrls = [ + 'https://test321.blogspot.com/feeds/posts/default', + 'https://test321.wordpress.com/feed/', + 'https://medium.com/feed/@test321', + 'https://dev.to/feed/test321', + ].map((feedUrl) => ({ feedUrl, type: 'blog' })); + + expect(getFeedUrls(html)).toEqual(expectedFeedUrls); + }); });