Skip to content

Commit

Permalink
Making parser service work:
Browse files Browse the repository at this point in the history
- Update README.md
- Revert some docker, config changes
- Removed unnecessary files (hash.js, redis.js, elastic.js)
- Change imports to use Satellite's (hash, redis, elastic, logger)
- Add remove-empty-anchor to html/index.js
- Remove some dependencies in package.json
- Update lock file
  • Loading branch information
manekenpix authored and aserputov committed Feb 22, 2022
1 parent 8f04242 commit 387350f
Show file tree
Hide file tree
Showing 33 changed files with 2,644 additions and 1,702 deletions.
3,015 changes: 1,623 additions & 1,392 deletions pnpm-lock.yaml

Large diffs are not rendered by default.

Empty file added src/api/parser/.gitignore
Empty file.
29 changes: 29 additions & 0 deletions src/api/parser/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Parser Service: (To be updated when Parser service is dockerized and live)

The Parser service parses posts from users' feeds to populate Redis.

## Install

```
pnpm install
```

## Usage

### Normal mode

```
pnpm start
```

### Dev mode with automatic restarts

```
pnpm dev
```

By default the server is running on <http://localhost:10000/>.

### Examples

## Docker
34 changes: 34 additions & 0 deletions src/api/parser/env.local
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
NODE_ENV=development
LOG_LEVEL=debug

REDIS_URL=redis://localhost

ELASTIC_URL=http://localhost
ELASTIC_PORT=9200
# Max number of results per query
ELASTIC_MAX_RESULTS_PER_PAGE=5
# Delay to check connectivity with Elasticsearch in ms
ELASTIC_DELAY_MS=10000

# FEED_URL url used to access feed list
FEED_URL=https://wiki.cdot.senecacollege.ca/wiki/Planet_CDOT_Feed_List

# Milliseconds to wait after attempting to fetch the feed list when the server is not available
FEED_URL_INTERVAL_MS=30000

# Period of time (seconds) that an unprocessed feed must wait before its
# next processing attempt (due to previous attempt responding with HTTP 429)
FEED_PROCESSING_DELAY_SEC=3600

# Feed job queue attempts
FEED_QUEUE_ATTEMPTS=5

# Feed job queue delay(ms)
FEED_QUEUE_DELAY_MS=60000

# Number of concurrent worker processors to run. Use * if you want to run
# one per CPU. Use a number if you want to set it manually, up to a max
# of the CPU count. If not set, we'll assume 1.
FEED_QUEUE_PARALLEL_WORKERS=1

PARSER_PORT=10000
9 changes: 0 additions & 9 deletions src/api/parser/jest.config.e2e.js

This file was deleted.

42 changes: 26 additions & 16 deletions src/api/parser/package.json
Original file line number Diff line number Diff line change
@@ -1,28 +1,38 @@
{
"name": "parser",
"name": "@senecacdot/parser-service",
"private": true,
"version": "1.0.0",
"description": "A service for parsing posts",
"description": "A service for parsing feeds",
"scripts": {
"start": "node src/server.js"
"dev": "env-cmd -f env.local node src/index.js",
"start": "node src/index.js"
},
"repository": {
"type": "git",
"url": "git+https://github.com/Seneca-CDOT/telescope.git"
},
"author": "",
"repository": "Seneca-CDOT/telescope",
"license": "BSD-2-Clause",
"bugs": {
"url": "https://github.com/Seneca-CDOT/telescope/issues"
},
"homepage": "https://github.com/Seneca-CDOT/telescope#readme",
"dependencies": {
"@elastic/elasticsearch": "7.16.0",
"@elastic/elasticsearch-mock": "0.3.1",
"@senecacdot/satellite": "1.25.0",
"bull": "3.29.3",
"bull-board": "2.1.3",
"jsdom": "18.1.1",
"normalize-url": "6.1.0",
"rss-parser": "3.12.0"
"@bull-board/api": "3.7.0",
"@bull-board/express": "3.7.0",
"@senecacdot/satellite": "latest",
"bull": "3.22.0",
"clean-whitespace": "0.1.2",
"highlight.js": "11.3.1",
"jsdom": "18.0.1",
"linkify-html": "3.0.5",
"linkifyjs": "3.0.5",
"normalize-url": "6.0.1",
"rss-parser": "3.12.0",
"sanitize-html": "2.5.3"
},
"engines": {
"node": ">=14.0.0"
},
"devDependencies": {
"babel-jest": "27.3.1",
"env-cmd": "10.1.0",
"nodemon": "2.0.7"
}
}
29 changes: 0 additions & 29 deletions src/api/parser/readme.md

This file was deleted.

12 changes: 9 additions & 3 deletions src/api/parser/src/data/feed.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ const {
getFlaggedFeeds,
setFlaggedFeed,
unsetFlaggedFeed,
} = require('../storage');
} = require('../utils/storage');
const { deletePost } = require('../utils/indexer');

const urlToId = (url) => hash(normalizeUrl(url));

Expand Down Expand Up @@ -61,7 +62,12 @@ class Feed {
posts = posts.filter((post) => post.feed === this.id);

// Remove the post from Redis + ElasticSearch
await Promise.all([].concat(posts.map((post) => removePost(post.id))));
await Promise.all(
[].concat(
posts.map((post) => removePost(post.id)),
posts.map((post) => deletePost(post.id))
)
);
}

/**
Expand Down Expand Up @@ -103,7 +109,7 @@ class Feed {
* Returns a Promise<Boolean>
*/
async isDelayed() {
return (await isDelayed(this.id)) === '1';
return (await isDelayed(this.id)) === 1;
}

/**
Expand Down
124 changes: 116 additions & 8 deletions src/api/parser/src/data/post.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
const { hash } = require('@senecacdot/satellite');
const { getPost, addPost } = require('../storage');
const linkifyHtml = require('linkify-html');
const { hash, logger } = require('@senecacdot/satellite');
const { getPost, addPost } = require('../utils/storage');
const processHTML = require('../utils/html');
const textParser = require('../utils/text-parser');
const Feed = require('./feed');
const textParser = require('../text-parser');
const ArticleError = require('./article-error');
const { indexPost } = require('../utils/indexer');

/**
* Makes sure that a given date can be constructed as a Date object
Expand All @@ -10,7 +14,7 @@ const textParser = require('../text-parser');
* @param {Object} date an Object to construct as a Date object
* @param {Date} [fallbackDate] an optional second Date to construct in case the first fails to do so
*/
function ensureDate(date, fallbackDate) {
const ensureDate = (date, fallbackDate) => {
if (
date &&
(Object.prototype.toString.call(date) === '[object String]' ||
Expand All @@ -23,18 +27,34 @@ function ensureDate(date, fallbackDate) {
}

throw new Error(`post has an invalid date: ${date}'`);
}
};

/**
* Makes sure that the given feed is a Feed and not just an id. If the latter
* it gets the full feed.
* @param {Feed|String} feed a Feed object or feed id
* Returns a Promise<Feed>
*/
function ensureFeed(feed) {
return feed instanceof Feed ? Promise.resolve(feed) : Feed.byId(feed);
}
const ensureFeed = (feed) => (feed instanceof Feed ? Promise.resolve(feed) : Feed.byId(feed));

/**
* @param {string} url
* @returns {"video" | "blogpost"} the post's type
*/
const determinePostType = (url) => {
try {
const associatedLink = new URL(url);

if (associatedLink.hostname.includes('youtube.com')) {
return 'video';
}
// Assume that we are dealing with a blogpost if we
// are not dealing with videos
return 'blogpost';
} catch {
return 'blogpost';
}
};
class Post {
constructor(title, html, datePublished, dateUpdated, postUrl, guid, feed) {
// Use the post's guid as our unique identifier
Expand All @@ -46,7 +66,9 @@ class Post {
// create an absolute url if postURL is relative
this.url = new URL(postUrl, feed.url).href;
this.guid = guid;
this.type = determinePostType(this.url);

// We expect to get a real Feed vs. a feed id
if (!(feed instanceof Feed)) {
throw new Error(`expected feed to be a Feed Object, got '${feed}'`);
}
Expand Down Expand Up @@ -76,6 +98,92 @@ class Post {
return this.feed.author;
}

/**
* Parse an article object into a Post object.
* @param {Object} article parsed via feedparser, see:
* https://www.npmjs.com/package/feedparser#what-is-the-parsed-output-produced-by-feedparser
*
* If data is missing, throws an error.
*/
static async createFromArticle(article, feed) {
// Validate the properties we get, and if we don't have them all, throw
if (!article) {
throw new Error('unable to parse, missing article');
}

if (article.contentEncoded) article.content = article.contentEncoded;

if (article.mediaGroup) article.content = article.mediaGroup['media:description'];

// A valid RSS/Atom feed can have missing fields that we care about.
// Keep track of any that are missing, and throw if necessary.
const missing = [];
// article.content is the content of the post
if (!article.content) missing.push('content');
// link is the URL of the post
if (!article.link) missing.push('link');
// guid is the unique identifier of the post
if (!article.guid) missing.push('guid');
// pubdate is the publication date of the post
if (!article.pubdate) missing.push('pubdate');

if (missing.length) {
const message = `invalid article: missing ${missing.join(', ')}`;
logger.debug(message);
throw new ArticleError(message);
}

// Allow for missing title, but give it one
if (!article.title) {
logger.debug('article missing title, substituting "Untitled"');
article.title = 'Untitled';
}

// Allow for missing date of most recent update, use original publication date instead
if (!article.date) {
logger.debug('article missing date of last update, substituting publication date');
article.date = article.pubdate;
}

// All the Youtube feed return an array off html so we will need to convert it to a string so as to process and sanitize it
if (Array.isArray(article.content)) {
article.content = article.content.join(' ');
}

// Wrap an <a> tag on any link inside our content
article.content = linkifyHtml(article.content);

let html;
try {
// The article.content is frequently the full HTML article content.
// Sanitize it of any scripts or other dangerous attributes/elements,
// add lazy loading for <img> and <iframe>, and syntax highlight all
// <pre><code>...</code></pre> blocks.
html = processHTML(article.content);
} catch (error) {
logger.error({ error }, 'Unable to process HTML for feed');
throw error;
}

// NOTE: feedparser article properties are documented here:
// https://www.npmjs.com/package/feedparser#list-of-article-properties
const post = new Post(
article.title,
// processed HTML version of the post
html,
// pubdate (original published date)
article.pubdate,
// date (most recent update)
article.date,
// link is the url to the post
article.link,
article.guid,
feed
);
await Promise.all([post.save(), indexPost(post)]);
return post.id;
}

/**
* Creates a new Post object by extracting data from the given post-like object.
* @param {Object} postData - an Object containing the necessary fields. The
Expand Down
Loading

0 comments on commit 387350f

Please sign in to comment.