From 0f10b202572cd73d01cd6203cd1306d6b19ebf19 Mon Sep 17 00:00:00 2001 From: seaerchin <44049504+seaerchin@users.noreply.github.com> Date: Thu, 20 Apr 2023 14:36:39 +0800 Subject: [PATCH] [LS-81] fix(markdown-utils): change sanitization process + add unescape (#718) * chore(markdown): add spec for bug * chore(markdown): add more test * fix(markdown-utils): change sanitization order * test(markdown): add more test cases * fix(markdown-utils): change so that frontmatter is ALWAYS unescaped * test(markdown): add new spec for empty strings * chore(markdown-utils): skip sanitisation if content is empty --- src/fixtures/markdown-fixtures.ts | 102 +++++++++++++++++++ src/utils/__tests__/markdown-utils.spec.ts | 113 ++++++++++++++++----- src/utils/markdown-utils.js | 49 +++++++-- 3 files changed, 229 insertions(+), 35 deletions(-) diff --git a/src/fixtures/markdown-fixtures.ts b/src/fixtures/markdown-fixtures.ts index ddc9203a1..04ebae212 100644 --- a/src/fixtures/markdown-fixtures.ts +++ b/src/fixtures/markdown-fixtures.ts @@ -52,3 +52,105 @@ export const rawInstagramEmbedScript = export const sanitizedInstagramEmbedScript = '' + +export const frontMatterWithSymbolAndHtmlBody = `--- +title: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/ +breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +third_nav_title: Masterclasses & Workshops +description: "" +--- +&something` + +export const frontMatterWithSymbolWithoutBodyAndHtml = `--- +title: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/ +breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +third_nav_title: Masterclasses & Workshops +description: "" +---` + +export const frontMatterWithSymbolAndBodyWithoutHtml = `--- +title: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/ +breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +third_nav_title: Masterclasses & Workshops +description: "" +--- +something` + +export const frontMatterWithSymbolAndHtml = `--- +title: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/ +breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +third_nav_title: Masterclasses & Workshops +description: "" +---` + +export const frontMatterWithSymbolAndHtmlAndBody = `--- +title: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/ +breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +third_nav_title: Masterclasses & Workshops +description: "" +--- +&something` + +export const escapedFrontMatterWithSymbolAndHtml = `--- +title: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/ +breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +third_nav_title: Masterclasses & Workshops +description: "" +---` + +export const escapedFrontMatterWithSymbolAndHtmlBody = `--- +title: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/ +breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +third_nav_title: Masterclasses & Workshops +description: "" +--- +&something` + +export const safeEscapedJson = { + frontMatter: { + breadcrumb: + "Digital Strategy & the 101 of Search Engine Optimisation (SEO)", + description: "", + permalink: + "/digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/", + third_nav_title: "Masterclasses & Workshops", + // NOTE: **NOT** escaped even when enclosed in tag + title: + "Digital Strategy & the 101 of Search Engine Optimisation (SEO)", + }, + // NOTE: Properly escaped + pageContent: "&something", +} + +export const encodedFrontmatterJson = { + frontMatter: { + breadcrumb: + "Digital Strategy & the 101 of Search Engine Optimisation (SEO)", + description: "", + permalink: + "/digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/", + third_nav_title: "Masterclasses & Workshops", + // NOTE: The special character is escaped within a html tag + title: + "Digital Strategy & the 101 of Search Engine Optimisation (SEO)", + }, + pageContent: "&something", +} + +export const frontMatterWithSymbolAndEscapedBody = `--- +breadcrumb: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +description: "" +permalink: /digital-programmes/masterclasses-and-workshops/digital-strategy-and-the-101-of-seo/ +third_nav_title: Masterclasses & Workshops +title: Digital Strategy & the 101 of Search Engine Optimisation (SEO) +--- +&something` + +export const HTML_COMMENT_TAG = "" diff --git a/src/utils/__tests__/markdown-utils.spec.ts b/src/utils/__tests__/markdown-utils.spec.ts index 9aaa3acfd..85101269c 100644 --- a/src/utils/__tests__/markdown-utils.spec.ts +++ b/src/utils/__tests__/markdown-utils.spec.ts @@ -10,42 +10,101 @@ import { maliciousJsonObject, rawInstagramEmbedScript, sanitizedInstagramEmbedScript, + frontMatterWithSymbolAndHtmlBody, + frontMatterWithSymbolAndHtml, + frontMatterWithSymbolWithoutBodyAndHtml, + escapedFrontMatterWithSymbolAndHtml, + escapedFrontMatterWithSymbolAndHtmlBody, + frontMatterWithSymbolAndBodyWithoutHtml, + frontMatterWithSymbolAndHtmlAndBody, + safeEscapedJson, + frontMatterWithSymbolAndEscapedBody, + encodedFrontmatterJson, + HTML_COMMENT_TAG, } from "@fixtures/markdown-fixtures" import { sanitizer } from "@root/services/utilServices/Sanitizer" describe("Sanitized markdown utils test", () => { - it("should parse normal markdown content into an object successfully", () => { - expect(retrieveDataFromMarkdown(normalMarkdownContent)).toStrictEqual( - normalJsonObject - ) - }) + describe("retrieveDataFromMarkdown", () => { + it("should parse normal markdown content into an object successfully", () => { + expect(retrieveDataFromMarkdown(normalMarkdownContent)).toStrictEqual( + normalJsonObject + ) + }) - it("should parse malicious markdown content into a sanitized object successfully", () => { - expect(retrieveDataFromMarkdown(maliciousMarkdownContent)).toStrictEqual( - normalJsonObject - ) - }) + it("should parse malicious markdown content into a sanitized object successfully", () => { + expect(retrieveDataFromMarkdown(maliciousMarkdownContent)).toStrictEqual( + normalJsonObject + ) + }) - it("should stringify a normal JSON object into markdown content successfully", () => { - const { frontMatter, pageContent } = normalJsonObject - expect(convertDataToMarkdown(frontMatter, pageContent)).toBe( - normalMarkdownContent - ) + it("should not html encode special characters in the frontmatter even when there is html but it should encode special characters in body", () => { + expect( + retrieveDataFromMarkdown(frontMatterWithSymbolAndHtmlAndBody) + ).toStrictEqual(safeEscapedJson) + }) }) - it("should stringify a malicious JSON object into sanitized markdown content successfully", () => { - const { frontMatter, pageContent } = maliciousJsonObject - expect(convertDataToMarkdown(frontMatter, pageContent)).toBe( - normalMarkdownContent - ) + describe("convertDataToMarkdown", () => { + it("should stringify a normal JSON object into markdown content successfully", () => { + const { frontMatter, pageContent } = normalJsonObject + expect(convertDataToMarkdown(frontMatter, pageContent)).toBe( + normalMarkdownContent + ) + }) + + it("should stringify a malicious JSON object into sanitized markdown content successfully", () => { + const { frontMatter, pageContent } = maliciousJsonObject + expect(convertDataToMarkdown(frontMatter, pageContent)).toBe( + normalMarkdownContent + ) + }) + + it("should stringify a JSON object containing encoded frontmatter into markdown content without encoded frontmatter", () => { + const { frontMatter, pageContent } = encodedFrontmatterJson + + expect(convertDataToMarkdown(frontMatter, pageContent)).toBe( + frontMatterWithSymbolAndEscapedBody + ) + }) }) - it("should sanitize boolean tags with an empty string", () => { - // NOTE: Setting a boolean attr to an empty string is equivalent - // to it being true. - // See the HTML spec: https://html.spec.whatwg.org/#boolean-attributes - expect(sanitizer.sanitize(rawInstagramEmbedScript)).toBe( - sanitizedInstagramEmbedScript - ) + describe("sanitize", () => { + it("should sanitize boolean tags with an empty string", () => { + // NOTE: Setting a boolean attr to an empty string is equivalent + // to it being true. + // See the HTML spec: https://html.spec.whatwg.org/#boolean-attributes + expect(sanitizer.sanitize(rawInstagramEmbedScript)).toBe( + sanitizedInstagramEmbedScript + ) + }) + + it("should escape special characters in our frontmatter if there is html in the frontmatter", () => { + expect(sanitizer.sanitize(frontMatterWithSymbolAndHtml)).toBe( + escapedFrontMatterWithSymbolAndHtml + ) + }) + + it("should escape special characters in our frontmatter if there is html in the body", () => { + expect(sanitizer.sanitize(frontMatterWithSymbolAndHtmlBody)).toBe( + escapedFrontMatterWithSymbolAndHtmlBody + ) + }) + + it("should not escape special characters in our frontmatter if there is no html and no body", () => { + expect(sanitizer.sanitize(frontMatterWithSymbolWithoutBodyAndHtml)).toBe( + frontMatterWithSymbolWithoutBodyAndHtml + ) + }) + + it("should not escape special characters in our frontmatter if there is no html but there is body", () => { + expect(sanitizer.sanitize(frontMatterWithSymbolAndBodyWithoutHtml)).toBe( + frontMatterWithSymbolAndBodyWithoutHtml + ) + }) + + it("should inject a html comment tag when the string is empty", () => { + expect(sanitizer.sanitize("")).toBe(HTML_COMMENT_TAG) + }) }) }) diff --git a/src/utils/markdown-utils.js b/src/utils/markdown-utils.js index a293efecc..2bda31df7 100644 --- a/src/utils/markdown-utils.js +++ b/src/utils/markdown-utils.js @@ -12,11 +12,33 @@ const getTrailingSlashWithPermalink = (permalink) => const retrieveDataFromMarkdown = (fileContent) => { // eslint-disable-next-line no-unused-vars - const [unused, encodedFrontMatter, ...pageContent] = sanitizer - .sanitize(fileContent) - .split("---") - const frontMatter = sanitizedYamlParse(encodedFrontMatter) - return { frontMatter, pageContent: pageContent.join("---").trim() } + const [unused, encodedFrontMatter, ...pageContent] = fileContent.split("---") + // NOTE: We separate the sanitization into 2 steps. + // This is because DOMPurify does URL encoding when it detects html in the string. + // For example, `&something` will get HTML encoded to `&something`. + // To prevent this behaviour from affecting our frontmatter, we do the sanitization separately + // on the frontmatter and the content + const frontMatter = _.mapValues( + sanitizedYamlParse(encodedFrontMatter), + // NOTE: We call `unescape` here to transform `&` into `&`. + // Because of the above property, where DOMPurify does html encoding on detection of a html tag, + // there might be special characters that are encoded into their html form. + // This is a safe transformation to run because the original value was already a special character + // so this does not do anything destructive. + // Do note that frontmatter containing pre-existing html encoded characters (&) + // will get transformed regardless. + (val) => _.unescape(val) + ) + const originalPageContent = pageContent.join("---") + // NOTE: We don't sanitize if there is no page content. + // This is to avoid injection of a HTML comment by the sanitize function. + const sanitizedPageContent = originalPageContent + ? sanitizer.sanitize(originalPageContent).trim() + : "" + return { + frontMatter, + pageContent: sanitizedPageContent, + } } const isResourceFileOrLink = (frontMatter) => { @@ -33,9 +55,20 @@ const convertDataToMarkdown = (originalFrontMatter, pageContent) => { if (permalink) { frontMatter.permalink = getTrailingSlashWithPermalink(permalink) } - const newFrontMatter = sanitizedYamlStringify(frontMatter) - const newContent = ["---\n", newFrontMatter, "---\n", pageContent].join("") - return sanitizer.sanitize(newContent) + // NOTE: We don't sanitize if there is no page content. + // This is to avoid injection of a HTML comment by the sanitize function. + const sanitizedPageContent = pageContent + ? sanitizer.sanitize(pageContent) + : "" + // NOTE: See above on why we call `unescape` + const newFrontMatter = _.unescape(sanitizedYamlStringify(frontMatter)) + const newContent = [ + "---\n", + newFrontMatter, + "---\n", + sanitizedPageContent, + ].join("") + return newContent } module.exports = {