Skip to content

Commit

Permalink
Making parser service work:
Browse files Browse the repository at this point in the history
- Update README.md
- Revert some docker, config changes
- Removed unnecessary files (hash.js, redis.js, elastic.js)
- Change imports to use Satellite's (hash, redis, elastic, logger)
- Add remove-empty-anchor to html/index.js
- Remove some dependencies in package.json
- Update lock file
  • Loading branch information
manekenpix authored and aserputov committed Feb 22, 2022
1 parent 8f04242 commit 387350f
Show file tree
Hide file tree
Showing 33 changed files with 2,644 additions and 1,702 deletions.
3,015 changes: 1,623 additions & 1,392 deletions pnpm-lock.yaml

Large diffs are not rendered by default.

Empty file added src/api/parser/.gitignore
Empty file.
29 changes: 29 additions & 0 deletions src/api/parser/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Parser Service: (To be updated when Parser service is dockerized and live)

The Parser service parses posts from users' feeds to populate Redis.

## Install

```
pnpm install
```

## Usage

### Normal mode

```
pnpm start
```

### Dev mode with automatic restarts

```
pnpm dev
```

By default the server is running on <http://localhost:10000/>.

### Examples

## Docker
34 changes: 34 additions & 0 deletions src/api/parser/env.local
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
NODE_ENV=development
LOG_LEVEL=debug

REDIS_URL=redis://localhost

ELASTIC_URL=http://localhost
ELASTIC_PORT=9200
# Max number of results per query
ELASTIC_MAX_RESULTS_PER_PAGE=5
# Delay to check connectivity with Elasticsearch in ms
ELASTIC_DELAY_MS=10000

# FEED_URL url used to access feed list
FEED_URL=https://wiki.cdot.senecacollege.ca/wiki/Planet_CDOT_Feed_List

# Milliseconds to wait after attempting to fetch the feed list when the server is not available
FEED_URL_INTERVAL_MS=30000

# Period of time (seconds) that an unprocessed feed must wait before its
# next processing attempt (due to previous attempt responding with HTTP 429)
FEED_PROCESSING_DELAY_SEC=3600

# Feed job queue attempts
FEED_QUEUE_ATTEMPTS=5

# Feed job queue delay(ms)
FEED_QUEUE_DELAY_MS=60000

# Number of concurrent worker processors to run. Use * if you want to run
# one per CPU. Use a number if you want to set it manually, up to a max
# of the CPU count. If not set, we'll assume 1.
FEED_QUEUE_PARALLEL_WORKERS=1

PARSER_PORT=10000
9 changes: 0 additions & 9 deletions src/api/parser/jest.config.e2e.js

This file was deleted.

42 changes: 26 additions & 16 deletions src/api/parser/package.json
Original file line number Diff line number Diff line change
@@ -1,28 +1,38 @@
{
"name": "parser",
"name": "@senecacdot/parser-service",
"private": true,
"version": "1.0.0",
"description": "A service for parsing posts",
"description": "A service for parsing feeds",
"scripts": {
"start": "node src/server.js"
"dev": "env-cmd -f env.local node src/index.js",
"start": "node src/index.js"
},
"repository": {
"type": "git",
"url": "git+https://github.com/Seneca-CDOT/telescope.git"
},
"author": "",
"repository": "Seneca-CDOT/telescope",
"license": "BSD-2-Clause",
"bugs": {
"url": "https://github.com/Seneca-CDOT/telescope/issues"
},
"homepage": "https://github.com/Seneca-CDOT/telescope#readme",
"dependencies": {
"@elastic/elasticsearch": "7.16.0",
"@elastic/elasticsearch-mock": "0.3.1",
"@senecacdot/satellite": "1.25.0",
"bull": "3.29.3",
"bull-board": "2.1.3",
"jsdom": "18.1.1",
"normalize-url": "6.1.0",
"rss-parser": "3.12.0"
"@bull-board/api": "3.7.0",
"@bull-board/express": "3.7.0",
"@senecacdot/satellite": "latest",
"bull": "3.22.0",
"clean-whitespace": "0.1.2",
"highlight.js": "11.3.1",
"jsdom": "18.0.1",
"linkify-html": "3.0.5",
"linkifyjs": "3.0.5",
"normalize-url": "6.0.1",
"rss-parser": "3.12.0",
"sanitize-html": "2.5.3"
},
"engines": {
"node": ">=14.0.0"
},
"devDependencies": {
"babel-jest": "27.3.1",
"env-cmd": "10.1.0",
"nodemon": "2.0.7"
}
}
29 changes: 0 additions & 29 deletions src/api/parser/readme.md

This file was deleted.

12 changes: 9 additions & 3 deletions src/api/parser/src/data/feed.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ const {
getFlaggedFeeds,
setFlaggedFeed,
unsetFlaggedFeed,
} = require('../storage');
} = require('../utils/storage');
const { deletePost } = require('../utils/indexer');

const urlToId = (url) => hash(normalizeUrl(url));

Expand Down Expand Up @@ -61,7 +62,12 @@ class Feed {
posts = posts.filter((post) => post.feed === this.id);

// Remove the post from Redis + ElasticSearch
await Promise.all([].concat(posts.map((post) => removePost(post.id))));
await Promise.all(
[].concat(
posts.map((post) => removePost(post.id)),
posts.map((post) => deletePost(post.id))
)
);
}

/**
Expand Down Expand Up @@ -103,7 +109,7 @@ class Feed {
* Returns a Promise<Boolean>
*/
async isDelayed() {
return (await isDelayed(this.id)) === '1';
return (await isDelayed(this.id)) === 1;
}

/**
Expand Down
124 changes: 116 additions & 8 deletions src/api/parser/src/data/post.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
const { hash } = require('@senecacdot/satellite');
const { getPost, addPost } = require('../storage');
const linkifyHtml = require('linkify-html');
const { hash, logger } = require('@senecacdot/satellite');
const { getPost, addPost } = require('../utils/storage');
const processHTML = require('../utils/html');
const textParser = require('../utils/text-parser');
const Feed = require('./feed');
const textParser = require('../text-parser');
const ArticleError = require('./article-error');
const { indexPost } = require('../utils/indexer');

/**
* Makes sure that a given date can be constructed as a Date object
Expand All @@ -10,7 +14,7 @@ const textParser = require('../text-parser');
* @param {Object} date an Object to construct as a Date object
* @param {Date} [fallbackDate] an optional second Date to construct in case the first fails to do so
*/
function ensureDate(date, fallbackDate) {
const ensureDate = (date, fallbackDate) => {
if (
date &&
(Object.prototype.toString.call(date) === '[object String]' ||
Expand All @@ -23,18 +27,34 @@ function ensureDate(date, fallbackDate) {
}

throw new Error(`post has an invalid date: ${date}'`);
}
};

/**
* Makes sure that the given feed is a Feed and not just an id. If the latter
* it gets the full feed.
* @param {Feed|String} feed a Feed object or feed id
* Returns a Promise<Feed>
*/
function ensureFeed(feed) {
return feed instanceof Feed ? Promise.resolve(feed) : Feed.byId(feed);
}
const ensureFeed = (feed) => (feed instanceof Feed ? Promise.resolve(feed) : Feed.byId(feed));

/**
* @param {string} url
* @returns {"video" | "blogpost"} the post's type
*/
const determinePostType = (url) => {
try {
const associatedLink = new URL(url);

if (associatedLink.hostname.includes('youtube.com')) {
return 'video';
}
// Assume that we are dealing with a blogpost if we
// are not dealing with videos
return 'blogpost';
} catch {
return 'blogpost';
}
};
class Post {
constructor(title, html, datePublished, dateUpdated, postUrl, guid, feed) {
// Use the post's guid as our unique identifier
Expand All @@ -46,7 +66,9 @@ class Post {
// create an absolute url if postURL is relative
this.url = new URL(postUrl, feed.url).href;
this.guid = guid;
this.type = determinePostType(this.url);

// We expect to get a real Feed vs. a feed id
if (!(feed instanceof Feed)) {
throw new Error(`expected feed to be a Feed Object, got '${feed}'`);
}
Expand Down Expand Up @@ -76,6 +98,92 @@ class Post {
return this.feed.author;
}

/**
* Parse an article object into a Post object.
* @param {Object} article parsed via feedparser, see:
* https://www.npmjs.com/package/feedparser#what-is-the-parsed-output-produced-by-feedparser
*
* If data is missing, throws an error.
*/
static async createFromArticle(article, feed) {
// Validate the properties we get, and if we don't have them all, throw
if (!article) {
throw new Error('unable to parse, missing article');
}

if (article.contentEncoded) article.content = article.contentEncoded;

if (article.mediaGroup) article.content = article.mediaGroup['media:description'];

// A valid RSS/Atom feed can have missing fields that we care about.
// Keep track of any that are missing, and throw if necessary.
const missing = [];
// article.content is the content of the post
if (!article.content) missing.push('content');
// link is the URL of the post
if (!article.link) missing.push('link');
// guid is the unique identifier of the post
if (!article.guid) missing.push('guid');
// pubdate is the publication date of the post
if (!article.pubdate) missing.push('pubdate');

if (missing.length) {
const message = `invalid article: missing ${missing.join(', ')}`;
logger.debug(message);
throw new ArticleError(message);
}

// Allow for missing title, but give it one
if (!article.title) {
logger.debug('article missing title, substituting "Untitled"');
article.title = 'Untitled';
}

// Allow for missing date of most recent update, use original publication date instead
if (!article.date) {
logger.debug('article missing date of last update, substituting publication date');
article.date = article.pubdate;
}

// All the Youtube feed return an array off html so we will need to convert it to a string so as to process and sanitize it
if (Array.isArray(article.content)) {
article.content = article.content.join(' ');
}

// Wrap an <a> tag on any link inside our content
article.content = linkifyHtml(article.content);

let html;
try {
// The article.content is frequently the full HTML article content.
// Sanitize it of any scripts or other dangerous attributes/elements,
// add lazy loading for <img> and <iframe>, and syntax highlight all
// <pre><code>...</code></pre> blocks.
html = processHTML(article.content);
} catch (error) {
logger.error({ error }, 'Unable to process HTML for feed');
throw error;
}

// NOTE: feedparser article properties are documented here:
// https://www.npmjs.com/package/feedparser#list-of-article-properties
const post = new Post(
article.title,
// processed HTML version of the post
html,
// pubdate (original published date)
article.pubdate,
// date (most recent update)
article.date,
// link is the url to the post
article.link,
article.guid,
feed
);
await Promise.all([post.save(), indexPost(post)]);
return post.id;
}

/**
* Creates a new Post object by extracting data from the given post-like object.
* @param {Object} postData - an Object containing the necessary fields. The
Expand Down
Loading

0 comments on commit 387350f

Please sign in to comment.