From 6d25d04fdb3caf8be5101150ed7b83f087c7612f Mon Sep 17 00:00:00 2001 From: Robin Date: Wed, 5 Jun 2024 16:19:22 -0400 Subject: [PATCH] Don't consider textual characters to be emoji MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We were using emojibase-regex to match emoji within messages. However, the docs (https://emojibase.dev/docs/regex/) state that this regex matches both emoji and text presentation characters. This is not what we want, and will result in false positives for characters like 'โ†”' that could turn into an emoji if paired with a variation selector. Unfortunately, none of the other regexes provided by Emojibase do what we want either (https://github.com/milesj/emojibase/issues/174). In the meantime, browser support for the RGI_Emoji character sequence class has made it feasible to write an emoji regex by hand, so that's what I've done. --- .eslintrc.js | 10 +++++ src/HtmlUtils.tsx | 27 +++++++++--- .../views/rooms/SendMessageComposer.tsx | 2 +- src/editor/parts.ts | 11 +++-- test/HtmlUtils-test.tsx | 12 ++++++ test/__snapshots__/HtmlUtils-test.tsx.snap | 41 +++++++++++++++++++ 6 files changed, 91 insertions(+), 12 deletions(-) diff --git a/.eslintrc.js b/.eslintrc.js index 0f1335b586d..c6251ef722d 100644 --- a/.eslintrc.js +++ b/.eslintrc.js @@ -78,6 +78,11 @@ module.exports = { name: "matrix-react-sdk/", message: "Please use matrix-react-sdk/src/index instead", }, + { + name: "emojibase-regex", + message: + "This regex doesn't actually test for emoji. See the docs at https://emojibase.dev/docs/regex/ and prefer our own EMOJI_REGEX from HtmlUtils.", + }, ], patterns: [ { @@ -141,6 +146,11 @@ module.exports = { ], message: "Please use matrix-js-sdk/src/matrix instead", }, + { + group: ["emojibase-regex/emoji*"], + message: + "This regex doesn't actually test for emoji. See the docs at https://emojibase.dev/docs/regex/ and prefer our own EMOJI_REGEX from HtmlUtils.", + }, ], }, ], diff --git a/src/HtmlUtils.tsx b/src/HtmlUtils.tsx index d8c154440bd..58646043945 100644 --- a/src/HtmlUtils.tsx +++ b/src/HtmlUtils.tsx @@ -20,7 +20,6 @@ limitations under the License. import React, { LegacyRef, ReactNode } from "react"; import sanitizeHtml from "sanitize-html"; import classNames from "classnames"; -import EMOJIBASE_REGEX from "emojibase-regex"; import katex from "katex"; import { decode } from "html-entities"; import { IContent } from "matrix-js-sdk/src/matrix"; @@ -46,10 +45,28 @@ const SURROGATE_PAIR_PATTERN = /([\ud800-\udbff])([\udc00-\udfff])/; const SYMBOL_PATTERN = /([\u2100-\u2bff])/; // Regex pattern for non-emoji characters that can appear in an "all-emoji" message -// (Zero-Width Joiner, Zero-Width Space, Emoji presentation character, other whitespace) -const EMOJI_SEPARATOR_REGEX = /[\u200D\u200B\s]|\uFE0F/g; +// (Zero-Width Space, other whitespace) +const EMOJI_SEPARATOR_REGEX = /[\u200B\s]/g; + +// Regex for emoji. This includes any RGI_Emoji sequence followed by an optional +// emoji presentation VS (U+FE0F), but not those sequences that are followed by +// a text presentation VS (U+FE0E). We also count lone regional indicators +// (U+1F1E6-U+1F1FF). Technically this regex produces false negatives for emoji +// followed by U+FE0E when the emoji doesn't have a text variant, but in +// practice this doesn't matter. +export const EMOJI_REGEX = (() => { + try { + // Per our support policy, v mode is available to us, but we still don't + // want the app to completely crash on older platforms. We use the + // constructor here to avoid a syntax error on such platforms. + return new RegExp("\\p{RGI_Emoji}(?!\\uFE0E)(?:(? { expect(html).toMatchInlineSnapshot(`"test foo <b>bar"`); }); + it("generates big emoji for emoji made of multiple characters", () => { + const { asFragment } = render(bodyToHtml({ body: "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ โ†”๏ธ ๐Ÿ‡ฎ๐Ÿ‡ธ", msgtype: "m.text" }, [], {}) as ReactElement); + + expect(asFragment()).toMatchSnapshot(); + }); + it("should generate big emoji for an emoji-only reply to a message", () => { const { asFragment } = render( bodyToHtml( @@ -132,6 +138,12 @@ describe("bodyToHtml", () => { expect(asFragment()).toMatchSnapshot(); }); + it("does not mistake characters in text presentation mode for emoji", () => { + const { asFragment } = render(bodyToHtml({ body: "โ†” โ—๏ธŽ", msgtype: "m.text" }, [], {}) as ReactElement); + + expect(asFragment()).toMatchSnapshot(); + }); + describe("feature_latex_maths", () => { beforeEach(() => { jest.spyOn(SettingsStore, "getValue").mockImplementation((feature) => feature === "feature_latex_maths"); diff --git a/test/__snapshots__/HtmlUtils-test.tsx.snap b/test/__snapshots__/HtmlUtils-test.tsx.snap index c33cc46433d..c69eaa7d952 100644 --- a/test/__snapshots__/HtmlUtils-test.tsx.snap +++ b/test/__snapshots__/HtmlUtils-test.tsx.snap @@ -1,5 +1,16 @@ // Jest Snapshot v1, https://goo.gl/fbAQLP +exports[`bodyToHtml does not mistake characters in text presentation mode for emoji 1`] = ` + + + โ†” โ—๏ธŽ + + +`; + exports[`bodyToHtml feature_latex_maths should not mangle code blocks 1`] = `"

hello

$\\xi$

world

"`; exports[`bodyToHtml feature_latex_maths should not mangle divs 1`] = `"

hello

world
"`; @@ -8,6 +19,36 @@ exports[`bodyToHtml feature_latex_maths should render block katex 1`] = `"

hel exports[`bodyToHtml feature_latex_maths should render inline katex 1`] = `"hello ฮพ\\xi world"`; +exports[`bodyToHtml generates big emoji for emoji made of multiple characters 1`] = ` + + + + ๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ + + + + โ†”๏ธ + + + + ๐Ÿ‡ฎ๐Ÿ‡ธ + + + +`; + exports[`bodyToHtml should generate big emoji for an emoji-only reply to a message 1`] = `