From 2c4bdb9f3d19bf337f271dfbdefe02b5bf543788 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Minh=20Nguy=E1=BB=85n?= Date: Mon, 19 Aug 2024 17:43:57 -0700 Subject: [PATCH] Combine segments separated by double-width diacritical mark --- src/util/script_detection.ts | 171 ++++++++++++++++++++++++++++++++++- 1 file changed, 170 insertions(+), 1 deletion(-) diff --git a/src/util/script_detection.ts b/src/util/script_detection.ts index 15ed0ae6cea..80e32ce3b64 100644 --- a/src/util/script_detection.ts +++ b/src/util/script_detection.ts @@ -4,6 +4,175 @@ import {unicodeBlockLookup as isChar} from './is_char_in_unicode_block'; const segmenter = new Intl.Segmenter(); +const doubleWidthDiacritics = [ + 0x035C, // Combining Double Breve Below + 0x035D, // Combining Double Breve + 0x035E, // Combining Double Macron + 0x035F, // Combining Double Macron Below + 0x0360, // Combining Double Tilde + 0x0361, // Combining Double Inverted Breve + 0x0362, // Combining Double Rightwards Arrow Below /* NOTE(review): only the entries up to here have "Double" in their names; the rest are vowel signs, viramas, tone marks and subjoined letters - confirm they belong in a list named doubleWidthDiacritics. */ + 0x0955, // Devanagari Vowel Sign Candra Long E + 0x0956, // Devanagari Vowel Sign Ue + 0x0957, // Devanagari Vowel Sign Uue + 0x0A01, // Gurmukhi Sign Adak Bindi + 0x0A51, // Gurmukhi Sign Udaat + 0x0A75, // Gurmukhi Sign Yakash + 0x0AE2, // Gujarati Vowel Sign Vocalic L + 0x0AE3, // Gujarati Vowel Sign Vocalic Ll + 0x0B3F, // Oriya Vowel Sign I + 0x0B41, // Oriya Vowel Sign U + 0x0B42, // Oriya Vowel Sign Uu + 0x0B43, // Oriya Vowel Sign Vocalic R + 0x0B44, // Oriya Vowel Sign Vocalic Rr + 0x0B4D, // Oriya Sign Virama + 0x0B55, // Oriya Sign Overline + 0x0B56, // Oriya Ai Length Mark + 0x0B62, // Oriya Vowel Sign Vocalic L + 0x0B63, // Oriya Vowel Sign Vocalic Ll + 0x0C48, // Telugu Vowel Sign Ai + 0x0C81, // Kannada Sign Candrabindu + 0x0CBC, // Kannada Sign Nukta + 0x0CBF, // Kannada Vowel Sign I + 0x0CC6, // Kannada Vowel Sign E + 0x0CCC, // Kannada Vowel Sign Au + 0x0CCD, // Kannada Sign Virama + 0x0CE2, // Kannada Vowel Sign Vocalic L + 0x0CE3, // Kannada Vowel Sign Vocalic Ll + 0x0D41, // 
Malayalam Vowel Sign U + 0x0D42, // Malayalam Vowel Sign Uu + 0x0D43, // Malayalam Vowel Sign Vocalic R + 0x0D44, // Malayalam Vowel Sign Vocalic Rr + 0x0D4D, // Malayalam Sign Virama + 0x0DCA, // Sinhala Sign Al-Lakuna + 0x0DD2, // Sinhala Vowel Sign Ketti Is-Pilla + 0x0DD3, // Sinhala Vowel Sign Diga Is-Pilla + 0x0DD4, // Sinhala Vowel Sign Ketti Paa-Pilla + 0x0DD6, // Sinhala Vowel Sign Diga Paa-Pilla + 0x0E31, // Thai Character Mai Han-Akat + 0x0E34, // Thai Character Sara I + 0x0E35, // Thai Character Sara Ii + 0x0E36, // Thai Character Sara Ue + 0x0E37, // Thai Character Sara Uee + 0x0E38, // Thai Character Sara U + 0x0E39, // Thai Character Sara Uu + 0x0E3A, // Thai Character Phinthu + 0x0E47, // Thai Character Maitaikhu + 0x0E48, // Thai Character Mai Ek + 0x0E49, // Thai Character Mai Tho + 0x0E4A, // Thai Character Mai Tri + 0x0E4B, // Thai Character Mai Chattawa + 0x0E4C, // Thai Character Thanthakhat + 0x0E4D, // Thai Character Nikhahit + 0x0E4E, // Thai Character Yamakkan + 0x0F71, // Tibetan Vowel Sign Aa + 0x0F73, // Tibetan Vowel Sign Ii + 0x0F74, // Tibetan Vowel Sign U + 0x0F75, // Tibetan Vowel Sign Uu + 0x0F76, // Tibetan Vowel Sign Vocalic R + 0x0F77, // Tibetan Vowel Sign Vocalic Rr + 0x0F78, // Tibetan Vowel Sign Vocalic L + 0x0F79, // Tibetan Vowel Sign Vocalic Ll + 0x0F81, // Tibetan Vowel Sign Reversed Ii + 0x0F8D, // Tibetan Subjoined Sign Lce Tsa Can + 0x0F8E, // Tibetan Subjoined Sign Mchu Can + 0x0F8F, // Tibetan Subjoined Sign Inverted Mchu Can + 0x0F90, // Tibetan Subjoined Letter Ka + 0x0F91, // Tibetan Subjoined Letter Kha + 0x0F92, // Tibetan Subjoined Letter Ga + 0x0F93, // Tibetan Subjoined Letter Gha + 0x0F94, // Tibetan Subjoined Letter Nga + 0x0F95, // Tibetan Subjoined Letter Ca + 0x0F96, // Tibetan Subjoined Letter Cha + 0x0F97, // Tibetan Subjoined Letter Ja + 0x0F99, // Tibetan Subjoined Letter Nya + 0x0F9A, // Tibetan Subjoined Letter Tta + 0x0F9B, // Tibetan Subjoined Letter Ttha + 0x0F9C, // Tibetan Subjoined Letter 
Dda + 0x0F9D, // Tibetan Subjoined Letter Ddha + 0x0F9E, // Tibetan Subjoined Letter Nna + 0x0F9F, // Tibetan Subjoined Letter Ta + 0x0FA0, // Tibetan Subjoined Letter Tha + 0x0FA1, // Tibetan Subjoined Letter Da + 0x0FA2, // Tibetan Subjoined Letter Dha + 0x0FA3, // Tibetan Subjoined Letter Na + 0x0FA4, // Tibetan Subjoined Letter Pa + 0x0FA5, // Tibetan Subjoined Letter Pha + 0x0FA6, // Tibetan Subjoined Letter Ba + 0x0FA7, // Tibetan Subjoined Letter Bha + 0x0FA8, // Tibetan Subjoined Letter Ma + 0x0FA9, // Tibetan Subjoined Letter Tsa + 0x0FAA, // Tibetan Subjoined Letter Tsha + 0x0FAB, // Tibetan Subjoined Letter Dza + 0x0FAC, // Tibetan Subjoined Letter Dzha + 0x0FAD, // Tibetan Subjoined Letter Wa + 0x0FAE, // Tibetan Subjoined Letter Zha + 0x0FAF, // Tibetan Subjoined Letter Za + 0x0FB0, // Tibetan Subjoined Letter -A + 0x0FB1, // Tibetan Subjoined Letter Ya + 0x0FB2, // Tibetan Subjoined Letter Ra + 0x0FB3, // Tibetan Subjoined Letter La + 0x0FB4, // Tibetan Subjoined Letter Sha + 0x0FB5, // Tibetan Subjoined Letter Ssa + 0x0FB6, // Tibetan Subjoined Letter Sa + 0x0FB7, // Tibetan Subjoined Letter Ha + 0x0FB8, // Tibetan Subjoined Letter A + 0x0FB9, // Tibetan Subjoined Letter Kssa + 0x0FBA, // Tibetan Subjoined Letter Fixed-Form Wa + 0x0FBB, // Tibetan Subjoined Letter Fixed-Form Ya + 0x0FBC, // Tibetan Subjoined Letter Fixed-Form Ra + 0x102D, // Myanmar Vowel Sign I + 0x102E, // Myanmar Vowel Sign Ii + 0x102F, // Myanmar Vowel Sign U + 0x1030, // Myanmar Vowel Sign Uu + 0x1032, // Myanmar Vowel Sign Ai + 0x1033, // Myanmar Vowel Sign Mon Ii + 0x1034, // Myanmar Vowel Sign Mon O + 0x1035, // Myanmar Vowel Sign E Above + 0x1036, // Myanmar Sign Anusvara + 0x1037, // Myanmar Sign Dot Below + 0x1039, // Myanmar Sign Virama + 0x103A, // Myanmar Sign Asat + 0x103D, // Myanmar Consonant Sign Medial Wa + 0x103E, // Myanmar Consonant Sign Medial Ha + 0x1058, // Myanmar Vowel Sign Vocalic L + 0x1059, // Myanmar Vowel Sign Vocalic Ll + 0x105E, // Myanmar 
Consonant Sign Mon Medial Na + 0x105F, // Myanmar Consonant Sign Mon Medial Ma + 0x1060, // Myanmar Consonant Sign Mon Medial La + 0x1071, // Myanmar Vowel Sign Geba Karen I + 0x1072, // Myanmar Vowel Sign Kayah Oe + 0x1073, // Myanmar Vowel Sign Kayah U + 0x1074, // Myanmar Vowel Sign Kayah Ee + 0x1082, // Myanmar Consonant Sign Shan Medial Wa + 0x1085, // Myanmar Vowel Sign Shan E Above + 0x1086, // Myanmar Vowel Sign Shan Final Y + 0x108D, // Myanmar Sign Shan Council Emphatic Tone + 0x109D, // Myanmar Vowel Sign Aiton Ai + 0x1732, // Hanunoo Vowel Sign I + 0x1733, // Hanunoo Vowel Sign U + 0x1734, // Hanunoo Sign Pamudpod + 0x1772, // Tagbanwa Vowel Sign I + 0x1773, // Tagbanwa Vowel Sign U + 0x17B7, // Khmer Vowel Sign I + 0x17B8, // Khmer Vowel Sign Ii + 0x17B9, // Khmer Vowel Sign Y + 0x17BA, // Khmer Vowel Sign Yy + 0x17BB, // Khmer Vowel Sign U + 0x17BC, // Khmer Vowel Sign Uu + 0x17BD, // Khmer Vowel Sign Ua + 0x17C6, // Khmer Sign Nikahit + 0x17CB, // Khmer Sign Bantoc + 0x17CD, // Khmer Sign Toandakhiat + 0x17CE, // Khmer Sign Kakabat + 0x17CF, // Khmer Sign Ahsda + 0x17D0, // Khmer Sign Samyok Sannya + 0x17D1, // Khmer Sign Viriam + 0x17D2, // Khmer Sign Coeng + 0x17D3, // Khmer Sign Bathamasat + 0x17DD, // Khmer Sign Atthacan +]; + export function splitByGraphemeCluster(text: string) { const segments = segmenter.segment(text)[Symbol.iterator](); let segment = segments.next(); @@ -14,7 +183,7 @@ export function splitByGraphemeCluster(text: string) { const baseSegments = []; while (!segment.done) { const baseSegment = segment; - while (!nextSegment.done && /^\p{Mc}/u.test(nextSegment.value.segment)) { + while (!nextSegment.done && (/^\p{Mc}/u.test(nextSegment.value.segment) || doubleWidthDiacritics.indexOf(baseSegment.value.segment.at(-1).codePointAt(0)) !== -1)) { /* Besides a next segment that starts with a spacing mark (\p{Mc}), keep merging while the accumulated base segment ends in a listed mark: .at(-1) takes its last UTF-16 code unit and codePointAt(0) that unit's value, so this clause does not inspect the next segment. */ baseSegment.value.segment += nextSegment.value.segment; segment = segments.next(); nextSegment = nextSegments.next();