Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Strip all variation selectors on emoji #3814

Merged
merged 3 commits into from
Jan 8, 2020
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 28 additions & 11 deletions src/emoji.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,12 @@ limitations under the License.

import EMOJIBASE from 'emojibase-data/en/compact.json';

export const VARIATION_SELECTOR = String.fromCharCode(0xFE0F);

// The unicode is stored without the variant selector
const UNICODE_TO_EMOJI = new Map(); // not exported as gets for it are handled by getEmojiFromUnicode
export const EMOTICON_TO_EMOJI = new Map();
export const SHORTCODE_TO_EMOJI = new Map();

export const getEmojiFromUnicode = unicode => UNICODE_TO_EMOJI.get(unicode.replace(VARIATION_SELECTOR, ""));
/**
 * Looks up the Emoji object for a unicode string.
 *
 * Variation selectors are stripped from the input before the lookup, because
 * the keys of UNICODE_TO_EMOJI are stored with variation selectors stripped.
 *
 * @param {string} unicode the emoji character sequence to look up
 * @returns {Object|undefined} the matching entry, or undefined if the map has none
 */
export const getEmojiFromUnicode = unicode => UNICODE_TO_EMOJI.get(stripVariation(unicode));

const EMOJIBASE_GROUP_ID_TO_CATEGORY = [
"people", // smileys
Expand Down Expand Up @@ -51,13 +49,6 @@ export const DATA_BY_CATEGORY = {

// Store various mappings from unicode/emoticon/shortcode to the Emoji objects
EMOJIBASE.forEach(emoji => {
if (emoji.unicode.includes(VARIATION_SELECTOR)) {
// Clone data into variation-less version
emoji = Object.assign({}, emoji, {
unicode: emoji.unicode.replace(VARIATION_SELECTOR, ""),
});
}

const categoryId = EMOJIBASE_GROUP_ID_TO_CATEGORY[emoji.group];
if (DATA_BY_CATEGORY.hasOwnProperty(categoryId)) {
DATA_BY_CATEGORY[categoryId].push(emoji);
Expand All @@ -66,7 +57,13 @@ EMOJIBASE.forEach(emoji => {
// Lower-cased, newline-separated concatenation of the annotation, every
// shortcode and the emoticon (empty when the emoji has none). No literal
// '}' may follow the shortcodes, or it becomes part of every filter string.
emoji.filterString = `${emoji.annotation}\n${emoji.shortcodes.join('\n')}\n${emoji.emoticon || ''}`.toLowerCase();

// Add mapping from unicode to Emoji object
UNICODE_TO_EMOJI.set(emoji.unicode, emoji);
// The 'unicode' field that we use in emojibase has either
// VS15 or VS16 appended to any characters that can take
// variation selectors. Which one it appends depends
// on whether emojibase considers their type to be 'text' or
// 'emoji'. We therefore strip any variation chars from strings
// both when building the map and when looking up.
UNICODE_TO_EMOJI.set(stripVariation(emoji.unicode), emoji);

if (emoji.emoticon) {
// Add mapping from emoticon to Emoji object
Expand All @@ -80,3 +77,23 @@ EMOJIBASE.forEach(emoji => {
});
}
});

/**
 * Strips variation selectors (U+FE00 to U+FE0F, i.e. VS1 to VS16) from a string.
 *
 * All of these code points are in the Basic Multilingual Plane, so each one is
 * a single UTF-16 code unit and can never be half of a surrogate pair. Matching
 * them with a plain (non-'u') regex character range is therefore safe.
 *
 * NB. Skin tone modifiers are not variation selectors:
 * this function does not touch them. (Should it?)
 *
 * @param {string} str string to strip
 * @returns {string} stripped string
 */
function stripVariation(str) {
    // The 'g' flag is required: a sequence may carry more than one selector.
    return str.replace(/[\uFE00-\uFE0F]/g, '');
}