Skip to content

Commit

Permalink
[LS-81] fix(markdown-utils): change sanitization process + add unesca…
Browse files Browse the repository at this point in the history
…pe (#718)

* chore(markdown): add spec for bug

* chore(markdown): add more test

* fix(markdown-utils): change sanitization order

* test(markdown): add more test cases

* fix(markdown-utils): change so that frontmatter is ALWAYS unescaped

* test(markdown): add new spec for empty strings

* chore(markdown-utils): skip sanitisation if content is empty
  • Loading branch information
seaerchin authored Apr 20, 2023
1 parent bd577d7 commit 0f10b20
Show file tree
Hide file tree
Showing 3 changed files with 229 additions and 35 deletions.
102 changes: 102 additions & 0 deletions src/fixtures/markdown-fixtures.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,105 @@ export const rawInstagramEmbedScript =

export const sanitizedInstagramEmbedScript =
'<script async="" src="//www.instagram.com/embed.js"></script>'

export const frontMatterWithSymbolAndHtmlBody = `---
title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
third_nav_title: Masterclasses &amp; Workshops
description: ""
---
<b>&something</b>`

export const frontMatterWithSymbolWithoutBodyAndHtml = `---
title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
third_nav_title: Masterclasses &amp; Workshops
description: ""
---`

export const frontMatterWithSymbolAndBodyWithoutHtml = `---
title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
third_nav_title: Masterclasses &amp; Workshops
description: ""
---
something`

export const frontMatterWithSymbolAndHtml = `---
title: <b>Digital Strategy & the 101 of Search Engine Optimisation (SEO)</b>
permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
third_nav_title: Masterclasses &amp; Workshops
description: ""
---`

export const frontMatterWithSymbolAndHtmlAndBody = `---
title: <b>Digital Strategy & the 101 of Search Engine Optimisation (SEO)</b>
permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
third_nav_title: Masterclasses &amp; Workshops
description: ""
---
<b>&something</b>`

export const escapedFrontMatterWithSymbolAndHtml = `---
title: <b>Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)</b>
permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
third_nav_title: Masterclasses &amp; Workshops
description: ""
---`

export const escapedFrontMatterWithSymbolAndHtmlBody = `---
title: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
breadcrumb: Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)
third_nav_title: Masterclasses &amp; Workshops
description: ""
---
<b>&amp;something</b>`

export const safeEscapedJson = {
frontMatter: {
breadcrumb:
"Digital Strategy & the 101 of Search Engine Optimisation (SEO)",
description: "",
permalink:
"/digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/",
third_nav_title: "Masterclasses & Workshops",
// NOTE: **NOT** escaped even when enclosed in <b/> tag
title:
"<b>Digital Strategy & the 101 of Search Engine Optimisation (SEO)</b>",
},
// NOTE: Properly escaped
pageContent: "<b>&amp;something</b>",
}

export const encodedFrontmatterJson = {
frontMatter: {
breadcrumb:
"Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)",
description: "",
permalink:
"/digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/",
third_nav_title: "Masterclasses & Workshops",
// NOTE: The special character is escaped within a html tag
title:
"<b>Digital Strategy &amp; the 101 of Search Engine Optimisation (SEO)</b>",
},
pageContent: "<b>&amp;something</b>",
}

export const frontMatterWithSymbolAndEscapedBody = `---
breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
description: ""
permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
third_nav_title: Masterclasses & Workshops
title: <b>Digital Strategy & the 101 of Search Engine Optimisation (SEO)</b>
---
<b>&amp;something</b>`

export const HTML_COMMENT_TAG = "<!---->"
113 changes: 86 additions & 27 deletions src/utils/__tests__/markdown-utils.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,42 +10,101 @@ import {
maliciousJsonObject,
rawInstagramEmbedScript,
sanitizedInstagramEmbedScript,
frontMatterWithSymbolAndHtmlBody,
frontMatterWithSymbolAndHtml,
frontMatterWithSymbolWithoutBodyAndHtml,
escapedFrontMatterWithSymbolAndHtml,
escapedFrontMatterWithSymbolAndHtmlBody,
frontMatterWithSymbolAndBodyWithoutHtml,
frontMatterWithSymbolAndHtmlAndBody,
safeEscapedJson,
frontMatterWithSymbolAndEscapedBody,
encodedFrontmatterJson,
HTML_COMMENT_TAG,
} from "@fixtures/markdown-fixtures"
import { sanitizer } from "@root/services/utilServices/Sanitizer"

describe("Sanitized markdown utils test", () => {
it("should parse normal markdown content into an object successfully", () => {
expect(retrieveDataFromMarkdown(normalMarkdownContent)).toStrictEqual(
normalJsonObject
)
})
describe("retrieveDataFromMarkdown", () => {
it("should parse normal markdown content into an object successfully", () => {
expect(retrieveDataFromMarkdown(normalMarkdownContent)).toStrictEqual(
normalJsonObject
)
})

it("should parse malicious markdown content into a sanitized object successfully", () => {
expect(retrieveDataFromMarkdown(maliciousMarkdownContent)).toStrictEqual(
normalJsonObject
)
})
it("should parse malicious markdown content into a sanitized object successfully", () => {
expect(retrieveDataFromMarkdown(maliciousMarkdownContent)).toStrictEqual(
normalJsonObject
)
})

it("should stringify a normal JSON object into markdown content successfully", () => {
const { frontMatter, pageContent } = normalJsonObject
expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
normalMarkdownContent
)
it("should not html encode special characters in the frontmatter even when there is html but it should encode special characters in body", () => {
expect(
retrieveDataFromMarkdown(frontMatterWithSymbolAndHtmlAndBody)
).toStrictEqual(safeEscapedJson)
})
})

it("should stringify a malicious JSON object into sanitized markdown content successfully", () => {
const { frontMatter, pageContent } = maliciousJsonObject
expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
normalMarkdownContent
)
describe("convertDataToMarkdown", () => {
it("should stringify a normal JSON object into markdown content successfully", () => {
const { frontMatter, pageContent } = normalJsonObject
expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
normalMarkdownContent
)
})

it("should stringify a malicious JSON object into sanitized markdown content successfully", () => {
const { frontMatter, pageContent } = maliciousJsonObject
expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
normalMarkdownContent
)
})

it("should stringify a JSON object containing encoded frontmatter into markdown content without encoded frontmatter", () => {
const { frontMatter, pageContent } = encodedFrontmatterJson

expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
frontMatterWithSymbolAndEscapedBody
)
})
})

it("should sanitize boolean tags with an empty string", () => {
// NOTE: Setting a boolean attr to an empty string is equivalent
// to it being true.
// See the HTML spec: https://html.spec.whatwg.org/#boolean-attributes
expect(sanitizer.sanitize(rawInstagramEmbedScript)).toBe(
sanitizedInstagramEmbedScript
)
describe("sanitize", () => {
it("should sanitize boolean tags with an empty string", () => {
// NOTE: Setting a boolean attr to an empty string is equivalent
// to it being true.
// See the HTML spec: https://html.spec.whatwg.org/#boolean-attributes
expect(sanitizer.sanitize(rawInstagramEmbedScript)).toBe(
sanitizedInstagramEmbedScript
)
})

it("should escape special characters in our frontmatter if there is html in the frontmatter", () => {
expect(sanitizer.sanitize(frontMatterWithSymbolAndHtml)).toBe(
escapedFrontMatterWithSymbolAndHtml
)
})

it("should escape special characters in our frontmatter if there is html in the body", () => {
expect(sanitizer.sanitize(frontMatterWithSymbolAndHtmlBody)).toBe(
escapedFrontMatterWithSymbolAndHtmlBody
)
})

it("should not escape special characters in our frontmatter if there is no html and no body", () => {
expect(sanitizer.sanitize(frontMatterWithSymbolWithoutBodyAndHtml)).toBe(
frontMatterWithSymbolWithoutBodyAndHtml
)
})

it("should not escape special characters in our frontmatter if there is no html but there is body", () => {
expect(sanitizer.sanitize(frontMatterWithSymbolAndBodyWithoutHtml)).toBe(
frontMatterWithSymbolAndBodyWithoutHtml
)
})

it("should inject a html comment tag when the string is empty", () => {
expect(sanitizer.sanitize("")).toBe(HTML_COMMENT_TAG)
})
})
})
49 changes: 41 additions & 8 deletions src/utils/markdown-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,33 @@ const getTrailingSlashWithPermalink = (permalink) =>

const retrieveDataFromMarkdown = (fileContent) => {
// eslint-disable-next-line no-unused-vars
const [unused, encodedFrontMatter, ...pageContent] = sanitizer
.sanitize(fileContent)
.split("---")
const frontMatter = sanitizedYamlParse(encodedFrontMatter)
return { frontMatter, pageContent: pageContent.join("---").trim() }
const [unused, encodedFrontMatter, ...pageContent] = fileContent.split("---")
// NOTE: We separate the sanitization into 2 steps.
// This is because DOMPurify does URL encoding when it detects html in the string.
// For example, `<b>&something</b>` will get HTML encoded to `<b>&amp;something</b>`.
// To prevent this behaviour from affecting our frontmatter, we do the sanitization separately
// on the frontmatter and the content
const frontMatter = _.mapValues(
sanitizedYamlParse(encodedFrontMatter),
// NOTE: We call `unescape` here to transform `&amp` into `&`.
// Because of the above property, where DOMPurify does html encoding on detection of a html tag,
// there might be special characters that are encoded into their html form.
// This is a safe transformation to run because the original value was already a special character
// so this does not do anything destructive.
// Do note that frontmatter containing pre-existing html encoded characters (&amp;)
// will get transformed regardless.
(val) => _.unescape(val)
)
const originalPageContent = pageContent.join("---")
// NOTE: We don't sanitize if there is no page content.
// This is to avoid injection of a HTML comment by the sanitize function.
const sanitizedPageContent = originalPageContent
? sanitizer.sanitize(originalPageContent).trim()
: ""
return {
frontMatter,
pageContent: sanitizedPageContent,
}
}

const isResourceFileOrLink = (frontMatter) => {
Expand All @@ -33,9 +55,20 @@ const convertDataToMarkdown = (originalFrontMatter, pageContent) => {
if (permalink) {
frontMatter.permalink = getTrailingSlashWithPermalink(permalink)
}
const newFrontMatter = sanitizedYamlStringify(frontMatter)
const newContent = ["---\n", newFrontMatter, "---\n", pageContent].join("")
return sanitizer.sanitize(newContent)
// NOTE: We don't sanitize if there is no page content.
// This is to avoid injection of a HTML comment by the sanitize function.
const sanitizedPageContent = pageContent
? sanitizer.sanitize(pageContent)
: ""
// NOTE: See above on why we call `unescape`
const newFrontMatter = _.unescape(sanitizedYamlStringify(frontMatter))
const newContent = [
"---\n",
newFrontMatter,
"---\n",
sanitizedPageContent,
].join("")
return newContent
}

module.exports = {
Expand Down

0 comments on commit 0f10b20

Please sign in to comment.