Decomposition tests for icu4x (#658)

unicode-org · Jan 18, 2024 · b5347b9
1 parent 77e3ce6
commit b5347b9
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 1 deletion.
diff --git a/unicodetools/data/ucd/dev/NormalizationTest.txt b/unicodetools/data/ucd/dev/NormalizationTest.txt
@@ -1,5 +1,5 @@
 # NormalizationTest-16.0.0.txt
-# Date: 2023-11-10, 20:57:38 GMT
+# Date: 2024-01-18, 12:57:29 GMT
 # © 2023 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
 # For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -66,6 +66,8 @@
 0592 05B7 05BC 05A5 05B0 05C0 05C4 05AD;05B0 05B7 05BC 05A5 0592 05C0 05AD 05C4;05B0 05B7 05BC 05A5 0592 05C0 05AD 05C4;05B0 05B7 05BC 05A5 0592 05C0 05AD 05C4;05B0 05B7 05BC 05A5 0592 05C0 05AD 05C4; # (◌֒◌ַ◌ּ◌֥◌ְ׀◌ׄ◌֭; ◌ְ◌ַ◌ּ◌֥◌֒׀◌֭◌ׄ; ◌ְ◌ַ◌ּ◌֥◌֒׀◌֭◌ׄ; ◌ְ◌ַ◌ּ◌֥◌֒׀◌֭◌ׄ; ◌ְ◌ַ◌ּ◌֥◌֒׀◌֭◌ׄ; ) HEBREW ACCENT SEGOL, HEBREW POINT PATAH, HEBREW POINT DAGESH OR MAPIQ, HEBREW ACCENT MERKHA, HEBREW POINT SHEVA, HEBREW PUNCTUATION PASEQ, HEBREW MARK UPPER DOT, HEBREW ACCENT DEHI
 1100 AC00 11A8;1100 AC01;1100 1100 1161 11A8;1100 AC01;1100 1100 1161 11A8; # (ᄀ각; ᄀ각; ᄀ각; ᄀ각; ᄀ각; ) HANGUL CHOSEONG KIYEOK, HANGUL SYLLABLE GA, HANGUL JONGSEONG KIYEOK
 1100 AC00 11A8 11A8;1100 AC01 11A8;1100 1100 1161 11A8 11A8;1100 AC01 11A8;1100 1100 1161 11A8 11A8; # (ᄀ각ᆨ; ᄀ각ᆨ; ᄀ각ᆨ; ᄀ각ᆨ; ᄀ각ᆨ; ) HANGUL CHOSEONG KIYEOK, HANGUL SYLLABLE GA, HANGUL JONGSEONG KIYEOK, HANGUL JONGSEONG KIYEOK
+01C4 0323;01C4 0323;01C4 0323;0044 1E92 030C;0044 005A 0323 030C; # (Ǆ◌̣; Ǆ◌̣; Ǆ◌̣; DẒ◌̌; DZ◌̣◌̌; ) LATIN CAPITAL LETTER DZ WITH CARON, COMBINING DOT BELOW
+0DDD 0334;0DDD 0334;0DD9 0DCF 0334 0DCA;0DDD 0334;0DD9 0DCF 0334 0DCA; # (ෝ◌̴; ෝ◌̴; ො◌̴◌්; ෝ◌̴; ො◌̴◌්; ) SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA, COMBINING TILDE OVERLAY
 #
 @Part1 # Character by character test
 # All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.

diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java
@@ -1051,6 +1051,13 @@ static final String comma(String s) {
         "\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD",
         "\u1100\uAC00\u11A8",
         "\u1100\uAC00\u11A8\u11A8",
+        // Some implementations mishandle the case where a character whose
+        // decomposition contains multiple starters and ends with a non-starter
+        // is followed by a non-starter of lower CCC (canonical combining class).
+        // See https://github.com/unicode-org/unicodetools/issues/656
+        // and https://github.com/unicode-org/icu4x/pull/4530.
+        "\u01C4\u0323",
+        "\u0DDD\u0334",
     };
     /*
     static final void backwardsCompat(String directory, String filename, int[] list) throws IOException {