diff --git a/src/fixtures/markdown-fixtures.ts b/src/fixtures/markdown-fixtures.ts
index ddc9203a1..04ebae212 100644
--- a/src/fixtures/markdown-fixtures.ts
+++ b/src/fixtures/markdown-fixtures.ts
@@ -52,3 +52,105 @@ export const rawInstagramEmbedScript =
export const sanitizedInstagramEmbedScript =
''
+
+export const frontMatterWithSymbolAndHtmlBody = `---
+title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses & Workshops
+description: ""
+---
+&something`
+
+export const frontMatterWithSymbolWithoutBodyAndHtml = `---
+title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses & Workshops
+description: ""
+---`
+
+export const frontMatterWithSymbolAndBodyWithoutHtml = `---
+title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses & Workshops
+description: ""
+---
+something`
+
+export const frontMatterWithSymbolAndHtml = `---
+title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses & Workshops
+description: ""
+---`
+
+export const frontMatterWithSymbolAndHtmlAndBody = `---
+title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses & Workshops
+description: ""
+---
+&something`
+
+export const escapedFrontMatterWithSymbolAndHtml = `---
+title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses & Workshops
+description: ""
+---`
+
+export const escapedFrontMatterWithSymbolAndHtmlBody = `---
+title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+third_nav_title: Masterclasses & Workshops
+description: ""
+---
+&something`
+
+export const safeEscapedJson = {
+ frontMatter: {
+ breadcrumb:
+ "Digital Strategy & the 101 of Search Engine Optimisation (SEO)",
+ description: "",
+ permalink:
+ "/digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/",
+ third_nav_title: "Masterclasses & Workshops",
+ // NOTE: **NOT** escaped even when enclosed in tag
+ title:
+ "Digital Strategy & the 101 of Search Engine Optimisation (SEO)",
+ },
+ // NOTE: Properly escaped
+ pageContent: "&something",
+}
+
+export const encodedFrontmatterJson = {
+ frontMatter: {
+ breadcrumb:
+ "Digital Strategy & the 101 of Search Engine Optimisation (SEO)",
+ description: "",
+ permalink:
+ "/digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/",
+ third_nav_title: "Masterclasses & Workshops",
+ // NOTE: The special character is escaped within a html tag
+ title:
+ "Digital Strategy & the 101 of Search Engine Optimisation (SEO)",
+ },
+ pageContent: "&something",
+}
+
+export const frontMatterWithSymbolAndEscapedBody = `---
+breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+description: ""
+permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/
+third_nav_title: Masterclasses & Workshops
+title: Digital Strategy & the 101 of Search Engine Optimisation (SEO)
+---
+&something`
+
+export const HTML_COMMENT_TAG = ""
diff --git a/src/utils/__tests__/markdown-utils.spec.ts b/src/utils/__tests__/markdown-utils.spec.ts
index 9aaa3acfd..85101269c 100644
--- a/src/utils/__tests__/markdown-utils.spec.ts
+++ b/src/utils/__tests__/markdown-utils.spec.ts
@@ -10,42 +10,101 @@ import {
maliciousJsonObject,
rawInstagramEmbedScript,
sanitizedInstagramEmbedScript,
+ frontMatterWithSymbolAndHtmlBody,
+ frontMatterWithSymbolAndHtml,
+ frontMatterWithSymbolWithoutBodyAndHtml,
+ escapedFrontMatterWithSymbolAndHtml,
+ escapedFrontMatterWithSymbolAndHtmlBody,
+ frontMatterWithSymbolAndBodyWithoutHtml,
+ frontMatterWithSymbolAndHtmlAndBody,
+ safeEscapedJson,
+ frontMatterWithSymbolAndEscapedBody,
+ encodedFrontmatterJson,
+ HTML_COMMENT_TAG,
} from "@fixtures/markdown-fixtures"
import { sanitizer } from "@root/services/utilServices/Sanitizer"
describe("Sanitized markdown utils test", () => {
- it("should parse normal markdown content into an object successfully", () => {
- expect(retrieveDataFromMarkdown(normalMarkdownContent)).toStrictEqual(
- normalJsonObject
- )
- })
+ describe("retrieveDataFromMarkdown", () => {
+ it("should parse normal markdown content into an object successfully", () => {
+ expect(retrieveDataFromMarkdown(normalMarkdownContent)).toStrictEqual(
+ normalJsonObject
+ )
+ })
- it("should parse malicious markdown content into a sanitized object successfully", () => {
- expect(retrieveDataFromMarkdown(maliciousMarkdownContent)).toStrictEqual(
- normalJsonObject
- )
- })
+ it("should parse malicious markdown content into a sanitized object successfully", () => {
+ expect(retrieveDataFromMarkdown(maliciousMarkdownContent)).toStrictEqual(
+ normalJsonObject
+ )
+ })
- it("should stringify a normal JSON object into markdown content successfully", () => {
- const { frontMatter, pageContent } = normalJsonObject
- expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
- normalMarkdownContent
- )
+ it("should not html encode special characters in the frontmatter even when there is html but it should encode special characters in body", () => {
+ expect(
+ retrieveDataFromMarkdown(frontMatterWithSymbolAndHtmlAndBody)
+ ).toStrictEqual(safeEscapedJson)
+ })
})
- it("should stringify a malicious JSON object into sanitized markdown content successfully", () => {
- const { frontMatter, pageContent } = maliciousJsonObject
- expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
- normalMarkdownContent
- )
+ describe("convertDataToMarkdown", () => {
+ it("should stringify a normal JSON object into markdown content successfully", () => {
+ const { frontMatter, pageContent } = normalJsonObject
+ expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
+ normalMarkdownContent
+ )
+ })
+
+ it("should stringify a malicious JSON object into sanitized markdown content successfully", () => {
+ const { frontMatter, pageContent } = maliciousJsonObject
+ expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
+ normalMarkdownContent
+ )
+ })
+
+ it("should stringify a JSON object containing encoded frontmatter into markdown content without encoded frontmatter", () => {
+ const { frontMatter, pageContent } = encodedFrontmatterJson
+
+ expect(convertDataToMarkdown(frontMatter, pageContent)).toBe(
+ frontMatterWithSymbolAndEscapedBody
+ )
+ })
})
- it("should sanitize boolean tags with an empty string", () => {
- // NOTE: Setting a boolean attr to an empty string is equivalent
- // to it being true.
- // See the HTML spec: https://html.spec.whatwg.org/#boolean-attributes
- expect(sanitizer.sanitize(rawInstagramEmbedScript)).toBe(
- sanitizedInstagramEmbedScript
- )
+ describe("sanitize", () => {
+ it("should sanitize boolean tags with an empty string", () => {
+ // NOTE: Setting a boolean attr to an empty string is equivalent
+ // to it being true.
+ // See the HTML spec: https://html.spec.whatwg.org/#boolean-attributes
+ expect(sanitizer.sanitize(rawInstagramEmbedScript)).toBe(
+ sanitizedInstagramEmbedScript
+ )
+ })
+
+ it("should escape special characters in our frontmatter if there is html in the frontmatter", () => {
+ expect(sanitizer.sanitize(frontMatterWithSymbolAndHtml)).toBe(
+ escapedFrontMatterWithSymbolAndHtml
+ )
+ })
+
+ it("should escape special characters in our frontmatter if there is html in the body", () => {
+ expect(sanitizer.sanitize(frontMatterWithSymbolAndHtmlBody)).toBe(
+ escapedFrontMatterWithSymbolAndHtmlBody
+ )
+ })
+
+ it("should not escape special characters in our frontmatter if there is no html and no body", () => {
+ expect(sanitizer.sanitize(frontMatterWithSymbolWithoutBodyAndHtml)).toBe(
+ frontMatterWithSymbolWithoutBodyAndHtml
+ )
+ })
+
+ it("should not escape special characters in our frontmatter if there is no html but there is body", () => {
+ expect(sanitizer.sanitize(frontMatterWithSymbolAndBodyWithoutHtml)).toBe(
+ frontMatterWithSymbolAndBodyWithoutHtml
+ )
+ })
+
+ it("should inject a html comment tag when the string is empty", () => {
+ expect(sanitizer.sanitize("")).toBe(HTML_COMMENT_TAG)
+ })
})
})
diff --git a/src/utils/markdown-utils.js b/src/utils/markdown-utils.js
index a293efecc..2bda31df7 100644
--- a/src/utils/markdown-utils.js
+++ b/src/utils/markdown-utils.js
@@ -12,11 +12,33 @@ const getTrailingSlashWithPermalink = (permalink) =>
const retrieveDataFromMarkdown = (fileContent) => {
// eslint-disable-next-line no-unused-vars
- const [unused, encodedFrontMatter, ...pageContent] = sanitizer
- .sanitize(fileContent)
- .split("---")
- const frontMatter = sanitizedYamlParse(encodedFrontMatter)
- return { frontMatter, pageContent: pageContent.join("---").trim() }
+ const [unused, encodedFrontMatter, ...pageContent] = fileContent.split("---")
+ // NOTE: We separate the sanitization into 2 steps.
+ // This is because DOMPurify does URL encoding when it detects html in the string.
+ // For example, `&something` will get HTML encoded to `&something`.
+ // To prevent this behaviour from affecting our frontmatter, we do the sanitization separately
+ // on the frontmatter and the content
+ const frontMatter = _.mapValues(
+ sanitizedYamlParse(encodedFrontMatter),
+ // NOTE: We call `unescape` here to transform `&` into `&`.
+ // Because of the above property, where DOMPurify does html encoding on detection of a html tag,
+ // there might be special characters that are encoded into their html form.
+ // This is a safe transformation to run because the original value was already a special character
+ // so this does not do anything destructive.
+ // Do note that frontmatter containing pre-existing html encoded characters (&)
+ // will get transformed regardless.
+ (val) => _.unescape(val)
+ )
+ const originalPageContent = pageContent.join("---")
+ // NOTE: We don't sanitize if there is no page content.
+ // This is to avoid injection of a HTML comment by the sanitize function.
+ const sanitizedPageContent = originalPageContent
+ ? sanitizer.sanitize(originalPageContent).trim()
+ : ""
+ return {
+ frontMatter,
+ pageContent: sanitizedPageContent,
+ }
}
const isResourceFileOrLink = (frontMatter) => {
@@ -33,9 +55,20 @@ const convertDataToMarkdown = (originalFrontMatter, pageContent) => {
if (permalink) {
frontMatter.permalink = getTrailingSlashWithPermalink(permalink)
}
- const newFrontMatter = sanitizedYamlStringify(frontMatter)
- const newContent = ["---\n", newFrontMatter, "---\n", pageContent].join("")
- return sanitizer.sanitize(newContent)
+ // NOTE: We don't sanitize if there is no page content.
+ // This is to avoid injection of a HTML comment by the sanitize function.
+ const sanitizedPageContent = pageContent
+ ? sanitizer.sanitize(pageContent)
+ : ""
+ // NOTE: See above on why we call `unescape`
+ const newFrontMatter = _.unescape(sanitizedYamlStringify(frontMatter))
+ const newContent = [
+ "---\n",
+ newFrontMatter,
+ "---\n",
+ sanitizedPageContent,
+ ].join("")
+ return newContent
}
module.exports = {