Skip to content

Commit

Permalink
Combine segments separated by double-width diacritical mark
Browse files Browse the repository at this point in the history
  • Loading branch information
1ec5 committed Aug 20, 2024
1 parent 304acf5 commit 2c4bdb9
Showing 1 changed file with 170 additions and 1 deletion.
171 changes: 170 additions & 1 deletion src/util/script_detection.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,175 @@ import {unicodeBlockLookup as isChar} from './is_char_in_unicode_block';

const segmenter = new Intl.Segmenter();

/**
 * Code points that mark a segment as needing to be joined with the segment
 * that follows it.
 *
 * In the visible portion of `splitByGraphemeCluster`, the last UTF-16 code
 * unit of the current segment is looked up in this list with `indexOf`; on a
 * match the next segment is appended to the current one instead of starting
 * a new one.
 *
 * Each entry is annotated with its Unicode character name. The list is kept
 * in ascending code point order.
 *
 * NOTE(review): only U+035C through U+0362 are named "Combining Double ...".
 * The remaining entries are named as script-specific vowel signs, viramas,
 * tone marks and subjoined letters, so the constant's name is narrower than
 * its contents. Confirm that joining after every one of these is intended.
 *
 * NOTE(review): every entry is in the Basic Multilingual Plane, which is what
 * makes a lookup by single UTF-16 code unit sufficient. Revisit the caller if
 * a supplementary-plane code point is ever added here.
 */
const doubleWidthDiacritics = [
0x035C, // Combining Double Breve Below
0x035D, // Combining Double Breve
0x035E, // Combining Double Macron
0x035F, // Combining Double Macron Below
0x0360, // Combining Double Tilde
0x0361, // Combining Double Inverted Breve
0x0362, // Combining Double Rightwards Arrow Below
0x0955, // Devanagari Vowel Sign Candra Long E
0x0956, // Devanagari Vowel Sign Ue
0x0957, // Devanagari Vowel Sign Uue
0x0A01, // Gurmukhi Sign Adak Bindi
0x0A51, // Gurmukhi Sign Udaat
0x0A75, // Gurmukhi Sign Yakash
0x0AE2, // Gujarati Vowel Sign Vocalic L
0x0AE3, // Gujarati Vowel Sign Vocalic Ll
0x0B3F, // Oriya Vowel Sign I
0x0B41, // Oriya Vowel Sign U
0x0B42, // Oriya Vowel Sign Uu
0x0B43, // Oriya Vowel Sign Vocalic R
0x0B44, // Oriya Vowel Sign Vocalic Rr
0x0B4D, // Oriya Sign Virama
0x0B55, // Oriya Sign Overline
0x0B56, // Oriya Ai Length Mark
0x0B62, // Oriya Vowel Sign Vocalic L
0x0B63, // Oriya Vowel Sign Vocalic Ll
0x0C48, // Telugu Vowel Sign Ai
0x0C81, // Kannada Sign Candrabindu
0x0CBC, // Kannada Sign Nukta
0x0CBF, // Kannada Vowel Sign I
0x0CC6, // Kannada Vowel Sign E
0x0CCC, // Kannada Vowel Sign Au
0x0CCD, // Kannada Sign Virama
0x0CE2, // Kannada Vowel Sign Vocalic L
0x0CE3, // Kannada Vowel Sign Vocalic Ll
0x0D41, // Malayalam Vowel Sign U
0x0D42, // Malayalam Vowel Sign Uu
0x0D43, // Malayalam Vowel Sign Vocalic R
0x0D44, // Malayalam Vowel Sign Vocalic Rr
0x0D4D, // Malayalam Sign Virama
0x0DCA, // Sinhala Sign Al-Lakuna
0x0DD2, // Sinhala Vowel Sign Ketti Is-Pilla
0x0DD3, // Sinhala Vowel Sign Diga Is-Pilla
0x0DD4, // Sinhala Vowel Sign Ketti Paa-Pilla
0x0DD6, // Sinhala Vowel Sign Diga Paa-Pilla
0x0E31, // Thai Character Mai Han-Akat
0x0E34, // Thai Character Sara I
0x0E35, // Thai Character Sara Ii
0x0E36, // Thai Character Sara Ue
0x0E37, // Thai Character Sara Uee
0x0E38, // Thai Character Sara U
0x0E39, // Thai Character Sara Uu
0x0E3A, // Thai Character Phinthu
0x0E47, // Thai Character Maitaikhu
0x0E48, // Thai Character Mai Ek
0x0E49, // Thai Character Mai Tho
0x0E4A, // Thai Character Mai Tri
0x0E4B, // Thai Character Mai Chattawa
0x0E4C, // Thai Character Thanthakhat
0x0E4D, // Thai Character Nikhahit
0x0E4E, // Thai Character Yamakkan
0x0F71, // Tibetan Vowel Sign Aa
0x0F73, // Tibetan Vowel Sign Ii
0x0F74, // Tibetan Vowel Sign U
0x0F75, // Tibetan Vowel Sign Uu
0x0F76, // Tibetan Vowel Sign Vocalic R
0x0F77, // Tibetan Vowel Sign Vocalic Rr
0x0F78, // Tibetan Vowel Sign Vocalic L
0x0F79, // Tibetan Vowel Sign Vocalic Ll
0x0F81, // Tibetan Vowel Sign Reversed Ii
0x0F8D, // Tibetan Subjoined Sign Lce Tsa Can
0x0F8E, // Tibetan Subjoined Sign Mchu Can
0x0F8F, // Tibetan Subjoined Sign Inverted Mchu Can
0x0F90, // Tibetan Subjoined Letter Ka
0x0F91, // Tibetan Subjoined Letter Kha
0x0F92, // Tibetan Subjoined Letter Ga
0x0F93, // Tibetan Subjoined Letter Gha
0x0F94, // Tibetan Subjoined Letter Nga
0x0F95, // Tibetan Subjoined Letter Ca
0x0F96, // Tibetan Subjoined Letter Cha
0x0F97, // Tibetan Subjoined Letter Ja
0x0F99, // Tibetan Subjoined Letter Nya
0x0F9A, // Tibetan Subjoined Letter Tta
0x0F9B, // Tibetan Subjoined Letter Ttha
0x0F9C, // Tibetan Subjoined Letter Dda
0x0F9D, // Tibetan Subjoined Letter Ddha
0x0F9E, // Tibetan Subjoined Letter Nna
0x0F9F, // Tibetan Subjoined Letter Ta
0x0FA0, // Tibetan Subjoined Letter Tha
0x0FA1, // Tibetan Subjoined Letter Da
0x0FA2, // Tibetan Subjoined Letter Dha
0x0FA3, // Tibetan Subjoined Letter Na
0x0FA4, // Tibetan Subjoined Letter Pa
0x0FA5, // Tibetan Subjoined Letter Pha
0x0FA6, // Tibetan Subjoined Letter Ba
0x0FA7, // Tibetan Subjoined Letter Bha
0x0FA8, // Tibetan Subjoined Letter Ma
0x0FA9, // Tibetan Subjoined Letter Tsa
0x0FAA, // Tibetan Subjoined Letter Tsha
0x0FAB, // Tibetan Subjoined Letter Dza
0x0FAC, // Tibetan Subjoined Letter Dzha
0x0FAD, // Tibetan Subjoined Letter Wa
0x0FAE, // Tibetan Subjoined Letter Zha
0x0FAF, // Tibetan Subjoined Letter Za
0x0FB0, // Tibetan Subjoined Letter -A
0x0FB1, // Tibetan Subjoined Letter Ya
0x0FB2, // Tibetan Subjoined Letter Ra
0x0FB3, // Tibetan Subjoined Letter La
0x0FB4, // Tibetan Subjoined Letter Sha
0x0FB5, // Tibetan Subjoined Letter Ssa
0x0FB6, // Tibetan Subjoined Letter Sa
0x0FB7, // Tibetan Subjoined Letter Ha
0x0FB8, // Tibetan Subjoined Letter A
0x0FB9, // Tibetan Subjoined Letter Kssa
0x0FBA, // Tibetan Subjoined Letter Fixed-Form Wa
0x0FBB, // Tibetan Subjoined Letter Fixed-Form Ya
0x0FBC, // Tibetan Subjoined Letter Fixed-Form Ra
0x102D, // Myanmar Vowel Sign I
0x102E, // Myanmar Vowel Sign Ii
0x102F, // Myanmar Vowel Sign U
0x1030, // Myanmar Vowel Sign Uu
0x1032, // Myanmar Vowel Sign Ai
0x1033, // Myanmar Vowel Sign Mon Ii
0x1034, // Myanmar Vowel Sign Mon O
0x1035, // Myanmar Vowel Sign E Above
0x1036, // Myanmar Sign Anusvara
0x1037, // Myanmar Sign Dot Below
0x1039, // Myanmar Sign Virama
0x103A, // Myanmar Sign Asat
0x103D, // Myanmar Consonant Sign Medial Wa
0x103E, // Myanmar Consonant Sign Medial Ha
0x1058, // Myanmar Vowel Sign Vocalic L
0x1059, // Myanmar Vowel Sign Vocalic Ll
0x105E, // Myanmar Consonant Sign Mon Medial Na
0x105F, // Myanmar Consonant Sign Mon Medial Ma
0x1060, // Myanmar Consonant Sign Mon Medial La
0x1071, // Myanmar Vowel Sign Geba Karen I
0x1072, // Myanmar Vowel Sign Kayah Oe
0x1073, // Myanmar Vowel Sign Kayah U
0x1074, // Myanmar Vowel Sign Kayah Ee
0x1082, // Myanmar Consonant Sign Shan Medial Wa
0x1085, // Myanmar Vowel Sign Shan E Above
0x1086, // Myanmar Vowel Sign Shan Final Y
0x108D, // Myanmar Sign Shan Council Emphatic Tone
0x109D, // Myanmar Vowel Sign Aiton Ai
0x1732, // Hanunoo Vowel Sign I
0x1733, // Hanunoo Vowel Sign U
0x1734, // Hanunoo Sign Pamudpod
0x1772, // Tagbanwa Vowel Sign I
0x1773, // Tagbanwa Vowel Sign U
0x17B7, // Khmer Vowel Sign I
0x17B8, // Khmer Vowel Sign Ii
0x17B9, // Khmer Vowel Sign Y
0x17BA, // Khmer Vowel Sign Yy
0x17BB, // Khmer Vowel Sign U
0x17BC, // Khmer Vowel Sign Uu
0x17BD, // Khmer Vowel Sign Ua
0x17C6, // Khmer Sign Nikahit
0x17CB, // Khmer Sign Bantoc
0x17CD, // Khmer Sign Toandakhiat
0x17CE, // Khmer Sign Kakabat
0x17CF, // Khmer Sign Ahsda
0x17D0, // Khmer Sign Samyok Sannya
0x17D1, // Khmer Sign Viriam
0x17D2, // Khmer Sign Coeng
0x17D3, // Khmer Sign Bathamasat
0x17DD, // Khmer Sign Atthacan
];

export function splitByGraphemeCluster(text: string) {
const segments = segmenter.segment(text)[Symbol.iterator]();
let segment = segments.next();
Expand All @@ -14,7 +183,7 @@ export function splitByGraphemeCluster(text: string) {
const baseSegments = [];
while (!segment.done) {
const baseSegment = segment;
while (!nextSegment.done && /^\p{Mc}/u.test(nextSegment.value.segment)) {
while (!nextSegment.done && (/^\p{Mc}/u.test(nextSegment.value.segment) || doubleWidthDiacritics.indexOf(baseSegment.value.segment.at(-1).codePointAt(0)) !== -1)) {
baseSegment.value.segment += nextSegment.value.segment;
segment = segments.next();
nextSegment = nextSegments.next();
Expand Down

0 comments on commit 2c4bdb9

Please sign in to comment.