diff --git a/unicodetools/data/ucd/dev/NormalizationTest.txt b/unicodetools/data/ucd/dev/NormalizationTest.txt index f8445b262..c1021b150 100644 --- a/unicodetools/data/ucd/dev/NormalizationTest.txt +++ b/unicodetools/data/ucd/dev/NormalizationTest.txt @@ -1,5 +1,5 @@ # NormalizationTest-16.0.0.txt -# Date: 2024-01-20, 01:49:31 GMT +# Date: 2024-01-21, 18:36:20 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -67,7 +67,25 @@ 1100 AC00 11A8;1100 AC01;1100 1100 1161 11A8;1100 AC01;1100 1100 1161 11A8; # (ᄀ각; ᄀ각; ᄀ각; ᄀ각; ᄀ각; ) HANGUL CHOSEONG KIYEOK, HANGUL SYLLABLE GA, HANGUL JONGSEONG KIYEOK 1100 AC00 11A8 11A8;1100 AC01 11A8;1100 1100 1161 11A8 11A8;1100 AC01 11A8;1100 1100 1161 11A8 11A8; # (ᄀ각ᆨ; ᄀ각ᆨ; ᄀ각ᆨ; ᄀ각ᆨ; ᄀ각ᆨ; ) HANGUL CHOSEONG KIYEOK, HANGUL SYLLABLE GA, HANGUL JONGSEONG KIYEOK, HANGUL JONGSEONG KIYEOK 01C4 0323;01C4 0323;01C4 0323;0044 1E92 030C;0044 005A 0323 030C; # (DŽ◌̣; DŽ◌̣; DŽ◌̣; DẒ◌̌; DZ◌̣◌̌; ) LATIN CAPITAL LETTER DZ WITH CARON, COMBINING DOT BELOW +01C5 0323;01C5 0323;01C5 0323;0044 1E93 030C;0044 007A 0323 030C; # (Dž◌̣; Dž◌̣; Dž◌̣; Dẓ◌̌; Dz◌̣◌̌; ) LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON, COMBINING DOT BELOW +01C6 0323;01C6 0323;01C6 0323;0064 1E93 030C;0064 007A 0323 030C; # (dž◌̣; dž◌̣; dž◌̣; dẓ◌̌; dz◌̣◌̌; ) LATIN SMALL LETTER DZ WITH CARON, COMBINING DOT BELOW 0DDD 0334;0DDD 0334;0DD9 0DCF 0334 0DCA;0DDD 0334;0DD9 0DCF 0334 0DCA; # (ෝ◌̴; ෝ◌̴; ො◌̴◌්; ෝ◌̴; ො◌̴◌්; ) SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA, COMBINING TILDE OVERLAY +3304 0334;3304 0334;3304 0334;30A4 30CB 30F3 30B0 0334;30A4 30CB 30F3 30AF 0334 3099; # (㌄◌̴; ㌄◌̴; ㌄◌̴; イニング◌̴; イニンク◌̴◌゙; ) SQUARE ININGU, COMBINING TILDE OVERLAY +3307 0334;3307 0334;3307 0334;30A8 30B9 30AF 30FC 30C9 0334;30A8 30B9 30AF 30FC 30C8 0334 3099; # (㌇◌̴; ㌇◌̴; ㌇◌̴; エスクード◌̴; エスクート◌̴◌゙; ) SQUARE ESUKUUDO, COMBINING TILDE OVERLAY +3310 0334;3310 0334;3310 0334;30AE 30AC 0334;30AD 3099 30AB 0334 3099; # (㌐◌̴; ㌐◌̴; ㌐◌̴; ギガ◌̴; キ◌゙カ◌̴◌゙; ) SQUARE GIGA, COMBINING TILDE OVERLAY +331E 0334;331E 0334;331E 0334;30B3 30FC 30DD 0334;30B3 30FC 30DB 0334 309A; # (㌞◌̴; ㌞◌̴; ㌞◌̴; コーポ◌̴; コーホ◌̴◌゚; ) SQUARE KOOPO, COMBINING TILDE OVERLAY +3321 0334;3321 0334;3321 0334;30B7 30EA 30F3 30B0 0334;30B7 30EA 30F3 30AF 0334 3099; # (㌡◌̴; ㌡◌̴; ㌡◌̴; シリング◌̴; シリンク◌̴◌゙; ) SQUARE SIRINGU, COMBINING TILDE OVERLAY +3332 0334;3332 0334;3332 0334;30D5 30A1 30E9 30C3 30C9 0334;30D5 30A1 30E9 30C3 30C8 0334 3099; # (㌲◌̴; ㌲◌̴; ㌲◌̴; ファラッド◌̴; ファラット◌̴◌゙; ) SQUARE HUARADDO, COMBINING TILDE OVERLAY +333B 0334;333B 0334;333B 0334;30DA 30FC 30B8 0334;30D8 309A 30FC 30B7 0334 3099; # (㌻◌̴; ㌻◌̴; ㌻◌̴; ページ◌̴; ヘ◌゚ーシ◌̴◌゙; ) SQUARE PEEZI, COMBINING TILDE OVERLAY +3340 0334;3340 0334;3340 0334;30DD 30F3 30C9 0334;30DB 309A 30F3 30C8 0334 3099; # (㍀◌̴; ㍀◌̴; ㍀◌̴; ポンド◌̴; ホ◌゚ント◌̴◌゙; ) SQUARE PONDO, COMBINING TILDE OVERLAY +334B 0334;334B 0334;334B 0334;30E1 30AC 0334;30E1 30AB 0334 3099; # (㍋◌̴; ㍋◌̴; ㍋◌̴; メガ◌̴; メカ◌̴◌゙; ) SQUARE MEGA, COMBINING TILDE OVERLAY +334E 0334;334E 0334;334E 0334;30E4 30FC 30C9 0334;30E4 30FC 30C8 0334 3099; # (㍎◌̴; ㍎◌̴; ㍎◌̴; ヤード◌̴; ヤート◌̴◌゙; ) SQUARE YAADO, COMBINING TILDE OVERLAY +FEF5 0656;FEF5 0656;FEF5 0656;0644 0622 0656;0644 0627 0656 0653; # (ﻵ◌ٖ; ﻵ◌ٖ; ﻵ◌ٖ; لآ◌ٖ; لا◌ٖ◌ٓ; ) ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE ISOLATED FORM, ARABIC SUBSCRIPT ALEF +FEF6 0656;FEF6 0656;FEF6 0656;0644 0622 0656;0644 0627 0656 0653; # (ﻶ◌ٖ; ﻶ◌ٖ; ﻶ◌ٖ; لآ◌ٖ; لا◌ٖ◌ٓ; ) ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE FINAL FORM, ARABIC SUBSCRIPT ALEF +FEF7 0656;FEF7 0656;FEF7 0656;0644 0623 0656;0644 0627 0656 0654; # (ﻷ◌ٖ; ﻷ◌ٖ; ﻷ◌ٖ; لأ◌ٖ; لا◌ٖ◌ٔ; ) ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE ISOLATED FORM, ARABIC SUBSCRIPT ALEF +FEF8 0656;FEF8 0656;FEF8 0656;0644 0623 0656;0644 0627 0656 0654; # (ﻸ◌ٖ; ﻸ◌ٖ; ﻸ◌ٖ; لأ◌ٖ; لا◌ٖ◌ٔ; ) ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE FINAL FORM, ARABIC SUBSCRIPT ALEF +FEF9 0334;FEF9 0334;FEF9 0334;0644 0625 0334;0644 0627 0334 0655; # (ﻹ◌̴; ﻹ◌̴; ﻹ◌̴; لإ◌̴; لا◌̴◌ٕ; ) ARABIC LIGATURE LAM WITH ALEF WITH HAMZA BELOW ISOLATED FORM, COMBINING TILDE OVERLAY +FEFA 0334;FEFA 0334;FEFA 0334;0644 0625 0334;0644 0627 0334 0655; # (ﻺ◌̴; ﻺ◌̴; ﻺ◌̴; لإ◌̴; لا◌̴◌ٕ; ) ARABIC LIGATURE LAM WITH ALEF WITH HAMZA BELOW FINAL FORM, COMBINING TILDE OVERLAY # @Part1 # Character by character test # All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms. diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java index caf581ebb..19f57e3c1 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java @@ -13,6 +13,7 @@ import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; @@ -23,6 +24,8 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.function.Consumer; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.UcdProperty; import org.unicode.text.utility.Settings; import org.unicode.text.utility.UTF32; import org.unicode.text.utility.UnicodeDataFile; @@ -811,6 +814,54 @@ public static void writeNormalizerTestSuite(String directory, String fileName) for (final String testSuiteCase : testSuiteCases) { writeLine(testSuiteCase, log, false); } + // At least one implementation (ICU4X) has an edge case when a character + // whose decomposition contains multiple starters and ends with a + // non-starter is followed by a non-starter of lower CCC. + // See https://github.com/unicode-org/unicodetools/issues/656 + // and https://github.com/unicode-org/icu4x/pull/4530. + // That implementation also has separate code paths for the BMP and + // higher planes. No such decompositions currently exist outside the + // BMP, but by generating these test cases we ensure that this would be + // covered. + // We stick them in Part 0, which is in principle for handcrafted test + // cases, because there are not many of them, and the edge case feels a + // tad too weird to describe in the title of a new part. + final org.unicode.props.UnicodeProperty sc = + IndexUnicodeProperties.make().getProperty(UcdProperty.Script); + for (final String cp : UnicodeSet.ALL_CODE_POINTS) { + final String[] decompositions = + new String[] {Default.nfd().normalize(cp), Default.nfkd().normalize(cp)}; + for (final String decomposition : decompositions) { + final int lastCCC = + Default.ucd() + .getCombiningClass( + decomposition.codePointBefore(decomposition.length())); + final long nonStarterCount = + decomposition + .codePoints() + .filter(c -> (Default.ucd().getCombiningClass(c) == 0)) + .count(); + final String script = sc.getValue(cp.codePointAt(0)); + if (lastCCC > 1 && nonStarterCount > 1) { + // Try to pick a trailing nonstarter that might have a + // chance of combining with the character if possible, + // both for æsthetic reasons and to reproduce the example + // ICU4X came across. If all else fails, use a character + // with CCC=1, as low as it gets. + if (script.equals("Arabic") && lastCCC > 220) { + // ARABIC SUBSCRIPT ALEF. + writeLine(cp + "\u0656", log, false); + } else if (lastCCC > 220) { + // COMBINING DOT BELOW. + writeLine(cp + "\u0323", log, false); + } else { + // COMBINING TILDE OVERLAY. + writeLine(cp + "\u0334", log, false); + } + break; + } + } + } System.out.println("Writing Part 2"); @@ -1318,13 +1369,6 @@ static final String comma(String s) { "\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD", "\u1100\uAC00\u11A8", "\u1100\uAC00\u11A8\u11A8", - // Some implementations have an edge case when a character whose - // decomposition contains multiple starters and ends with a non-starter - // is followed by a non-starter of lower CCC. - // See https://github.com/unicode-org/unicodetools/issues/656 - // and https://github.com/unicode-org/icu4x/pull/4530. - "\u01C4\u0323", - "\u0DDD\u0334", }; /* static final void backwardsCompat(String directory, String filename, int[] list) throws IOException {