isomerpages · seaerchin · Apr 20, 2023 · Apr 19, 2023 · Apr 19, 2023 · Apr 19, 2023
diff --git a/src/fixtures/markdown-fixtures.ts b/src/fixtures/markdown-fixtures.ts
@@ -52,3 +52,105 @@ export const rawInstagramEmbedScript =
 
 export const sanitizedInstagramEmbedScript =
   '<script async="" src="//www.instagram.com/embed.js"></script>'
+
+export const frontMatterWithSymbolAndHtmlBody = `---
+title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses &amp; Workshops
+description: ""
+---
+<b>&something</b>`
+
+export const frontMatterWithSymbolWithoutBodyAndHtml = `---
+title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses &amp; Workshops
+description: ""
+---`
+
+export const frontMatterWithSymbolAndBodyWithoutHtml = `---
+title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses &amp; Workshops
+description: ""
+---
+something`
+
+export const frontMatterWithSymbolAndHtml = `---
+title: <b>Digital Strategy & the 101 of Search Engine Optimisation (SEO)</b>
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses &amp; Workshops
+description: ""
+---`
+
+export const frontMatterWithSymbolAndHtmlAndBody = `---
+title: <b>Digital Strategy & the 101 of Search Engine Optimisation (SEO)</b>
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses &amp; Workshops
+description: ""
+---
+<b>&something</b>`
+
+export const escapedFrontMatterWithSymbolAndHtml = `---
+title: <b>Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)</b>
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses &amp; Workshops
+description: ""
+---`
+
+export const escapedFrontMatterWithSymbolAndHtmlBody = `---
+title: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses &amp; Workshops
+description: ""
+---
+<b>&amp;something</b>`
+
+export const safeEscapedJson = {
+  frontMatter: {
+    breadcrumb:
+      "Digital Strategy & the 101 of Search Engine Optimisation (SEO)",
+    description: "",
+    permalink:
+      "/digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/",
+    third_nav_title: "Masterclasses & Workshops",
+    // NOTE: **NOT** escaped even when enclosed in <b/> tag
+    title:
+      "<b>Digital Strategy & the 101 of Search Engine Optimisation (SEO)</b>",
+  },
+  // NOTE: Properly escaped
+  pageContent: "<b>&amp;something</b>",
+}
+
+export const encodedFrontmatterJson = {
+  frontMatter: {
+    breadcrumb:
+      "Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)",
+    description: "",
+    permalink:
+      "/digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/",
+    third_nav_title: "Masterclasses & Workshops",
+    // NOTE: The special character is escaped within a html tag
+    title:
+      "<b>Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)</b>",
+  },
+  pageContent: "<b>&amp;something</b>",
+}
+
+export const frontMatterWithSymbolAndEscapedBody = `---
+breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+description: ""
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+third_nav_title: Masterclasses & Workshops
+title: <b>Digital Strategy & the 101 of Search Engine Optimisation (SEO)</b>
+---
+<b>&amp;something</b>`
+
+export const HTML_COMMENT_TAG = "<!---->"
diff --git a/src/utils/__tests__/markdown-utils.spec.ts b/src/utils/__tests__/markdown-utils.spec.ts
@@ -10,42 +10,101 @@ import {
   maliciousJsonObject,
   rawInstagramEmbedScript,
   sanitizedInstagramEmbedScript,
+  frontMatterWithSymbolAndHtmlBody,
+  frontMatterWithSymbolAndHtml,
+  frontMatterWithSymbolWithoutBodyAndHtml,
+  escapedFrontMatterWithSymbolAndHtml,
+  escapedFrontMatterWithSymbolAndHtmlBody,
+  frontMatterWithSymbolAndBodyWithoutHtml,
+  frontMatterWithSymbolAndHtmlAndBody,
+  safeEscapedJson,
+  frontMatterWithSymbolAndEscapedBody,
+  encodedFrontmatterJson,
+  HTML_COMMENT_TAG,
 } from "@fixtures/markdown-fixtures"
 import { sanitizer } from "@root/services/utilServices/Sanitizer"
 
 describe("Sanitized markdown utils test", () => {
-  it("should parse normal markdown content into an object successfully", () => {
-    expect(retrieveDataFromMarkdown(normalMarkdownContent)).toStrictEqual(
-      normalJsonObject
-    )
-  })
+  describe("retrieveDataFromMarkdown", () => {
+    it("should parse normal markdown content into an object successfully", () => {
+      expect(retrieveDataFromMarkdown(normalMarkdownContent)).toStrictEqual(
+        normalJsonObject
+      )
+    })
 
-  it("should parse malicious markdown content into a sanitized object successfully", () => {
-    expect(retrieveDataFromMarkdown(maliciousMarkdownContent)).toStrictEqual(
-      normalJsonObject
-    )
-  })
+    it("should parse malicious markdown content into a sanitized object successfully", () => {
+      expect(retrieveDataFromMarkdown(maliciousMarkdownContent)).toStrictEqual(
+        normalJsonObject
+      )
+    })
 
-  it("should stringify a normal JSON object into markdown content successfully", () => {
-    const { frontMatter, pageContent } = normalJsonObject
-    expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
-      normalMarkdownContent
-    )
+    it("should not html encode special characters in the frontmatter even when there is html but it should encode special characters in body", () => {
+      expect(
+        retrieveDataFromMarkdown(frontMatterWithSymbolAndHtmlAndBody)
+      ).toStrictEqual(safeEscapedJson)
+    })
   })
 
-  it("should stringify a malicious JSON object into sanitized markdown content successfully", () => {
-    const { frontMatter, pageContent } = maliciousJsonObject
-    expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
-      normalMarkdownContent
-    )
+  describe("convertDataToMarkdown", () => {
+    it("should stringify a normal JSON object into markdown content successfully", () => {
+      const { frontMatter, pageContent } = normalJsonObject
+      expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
+        normalMarkdownContent
+      )
+    })
+
+    it("should stringify a malicious JSON object into sanitized markdown content successfully", () => {
+      const { frontMatter, pageContent } = maliciousJsonObject
+      expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
+        normalMarkdownContent
+      )
+    })
+
+    it("should stringify a JSON object containing encoded frontmatter into markdown content without encoded frontmatter", () => {
+      const { frontMatter, pageContent } = encodedFrontmatterJson
+
+      expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
+        frontMatterWithSymbolAndEscapedBody
+      )
+    })
   })
 
-  it("should sanitize boolean tags with an empty string", () => {
-    // NOTE: Setting a boolean attr to an empty string is equivalent
-    // to it being true.
-    // See the HTML spec: https://html.spec.whatwg.org/#boolean-attributes
-    expect(sanitizer.sanitize(rawInstagramEmbedScript)).toBe(
-      sanitizedInstagramEmbedScript
-    )
+  describe("sanitize", () => {
+    it("should sanitize boolean tags with an empty string", () => {
+      // NOTE: Setting a boolean attr to an empty string is equivalent
+      // to it being true.
+      // See the HTML spec: https://html.spec.whatwg.org/#boolean-attributes
+      expect(sanitizer.sanitize(rawInstagramEmbedScript)).toBe(
+        sanitizedInstagramEmbedScript
+      )
+    })
+
+    it("should escape special characters in our frontmatter if there is html in the frontmatter", () => {
+      expect(sanitizer.sanitize(frontMatterWithSymbolAndHtml)).toBe(
+        escapedFrontMatterWithSymbolAndHtml
+      )
+    })
+
+    it("should escape special characters in our frontmatter if there is html in the body", () => {
+      expect(sanitizer.sanitize(frontMatterWithSymbolAndHtmlBody)).toBe(
+        escapedFrontMatterWithSymbolAndHtmlBody
+      )
+    })
+
+    it("should not escape special characters in our frontmatter if there is no html and no body", () => {
+      expect(sanitizer.sanitize(frontMatterWithSymbolWithoutBodyAndHtml)).toBe(
+        frontMatterWithSymbolWithoutBodyAndHtml
+      )
+    })
+
+    it("should not escape special characters in our frontmatter if there is no html but there is body", () => {
+      expect(sanitizer.sanitize(frontMatterWithSymbolAndBodyWithoutHtml)).toBe(
+        frontMatterWithSymbolAndBodyWithoutHtml
+      )
+    })
+
+    it("should inject a html comment tag when the string is empty", () => {
+      expect(sanitizer.sanitize("")).toBe(HTML_COMMENT_TAG)
+    })
   })
 })
diff --git a/src/utils/markdown-utils.js b/src/utils/markdown-utils.js
@@ -12,11 +12,33 @@ const getTrailingSlashWithPermalink = (permalink) =>
 
 const retrieveDataFromMarkdown = (fileContent) => {
   // eslint-disable-next-line no-unused-vars
-  const [unused, encodedFrontMatter, ...pageContent] = sanitizer
-    .sanitize(fileContent)
-    .split("---")
-  const frontMatter = sanitizedYamlParse(encodedFrontMatter)
-  return { frontMatter, pageContent: pageContent.join("---").trim() }
+  const [unused, encodedFrontMatter, ...pageContent] = fileContent.split("---")
+  // NOTE: We separate the sanitization into 2 steps.
+  // This is because DOMPurify does URL encoding when it detects html in the string.
+  // For example, `<b>&something</b>` will get HTML encoded to `<b>&amp;something</b>`.
+  // To prevent this behaviour from affecting our frontmatter, we do the sanitization separately
+  // on the frontmatter and the content
+  const frontMatter = _.mapValues(
+    sanitizedYamlParse(encodedFrontMatter),
+    // NOTE: We call `unescape` here to transform `&amp` into `&`.
+    // Because of the above property, where DOMPurify does html encoding on detection of a html tag,
+    // there might be special characters that are encoded into their html form.
+    // This is a safe transformation to run because the original value was already a special character
+    // so this does not do anything destructive.
+    // Do note that frontmatter containing pre-existing html encoded characters (&amp;)
+    // will get transformed regardless.
+    (val) => _.unescape(val)
+  )
+  const originalPageContent = pageContent.join("---")
+  // NOTE: We don't sanitize if there is no page content.
+  // This is to avoid injection of a HTML comment by the sanitize function.
+  const sanitizedPageContent = originalPageContent
+    ? sanitizer.sanitize(originalPageContent).trim()
+    : ""
+  return {
+    frontMatter,
+    pageContent: sanitizedPageContent,
+  }
 }
 
 const isResourceFileOrLink = (frontMatter) => {
@@ -33,9 +55,20 @@ const convertDataToMarkdown = (originalFrontMatter, pageContent) => {
   if (permalink) {
     frontMatter.permalink = getTrailingSlashWithPermalink(permalink)
   }
-  const newFrontMatter = sanitizedYamlStringify(frontMatter)
-  const newContent = ["---\n", newFrontMatter, "---\n", pageContent].join("")
-  return sanitizer.sanitize(newContent)
+  // NOTE: We don't sanitize if there is no page content.
+  // This is to avoid injection of a HTML comment by the sanitize function.
+  const sanitizedPageContent = pageContent
+    ? sanitizer.sanitize(pageContent)
+    : ""
+  // NOTE: See above on why we call `unescape`
+  const newFrontMatter = _.unescape(sanitizedYamlStringify(frontMatter))
+  const newContent = [
+    "---\n",
+    newFrontMatter,
+    "---\n",
+    sanitizedPageContent,
+  ].join("")
+  return newContent
 }
 
 module.exports = {