From 8566bc6302181f1a4b870fa6972abe599068c7a8 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 16 Aug 2024 19:18:34 +0200 Subject: [PATCH] [SPARK-49265][SQL][TESTS] Add collation support unit tests for Upper, Lower, and InitCap ### What changes were proposed in this pull request? Add collation support unit tests for: - Upper - Lower - InitCap This PR contains test-only changes, providing additional test coverage for cases such as: - case and accent variation - one-to-many case mapping - conditional case mapping - surrogate pairs - etc. ### Why are the changes needed? Improve collation support testing. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New unit tests in `CollationSupportSuite`. ### Was this patch authored or co-authored using generative AI tooling? Yes. Closes #47727 from uros-db/unit-tests-3. Authored-by: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Signed-off-by: Max Gekk --- .../unsafe/types/CollationSupportSuite.java | 316 +++++++++++------- 1 file changed, 193 insertions(+), 123 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 12f5e0eea0e61..fda6664915ec8 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -821,8 +821,12 @@ public void testStringSplitSQL() throws SparkException { assertStringSplitSQL("A𐐭B", "𐐅", "UNICODE_CI", array_A_B); } + /** + * Verify the behaviour of the `Upper` collation support class. 
+ */ + private void assertUpper(String target, String collationName, String expected) - throws SparkException { + throws SparkException { UTF8String target_utf8 = UTF8String.fromString(target); UTF8String expected_utf8 = UTF8String.fromString(expected); int collationId = CollationFactory.collationNameToId(collationName); @@ -835,52 +839,57 @@ private void assertUpper(String target, String collationName, String expected) @Test public void testUpper() throws SparkException { - // Edge cases - assertUpper("", "UTF8_BINARY", ""); - assertUpper("", "UTF8_LCASE", ""); - assertUpper("", "UNICODE", ""); - assertUpper("", "UNICODE_CI", ""); - // Basic tests - assertUpper("abcde", "UTF8_BINARY", "ABCDE"); - assertUpper("abcde", "UTF8_LCASE", "ABCDE"); - assertUpper("abcde", "UNICODE", "ABCDE"); - assertUpper("abcde", "UNICODE_CI", "ABCDE"); - // Uppercase present - assertUpper("AbCdE", "UTF8_BINARY", "ABCDE"); - assertUpper("aBcDe", "UTF8_BINARY", "ABCDE"); - assertUpper("AbCdE", "UTF8_LCASE", "ABCDE"); - assertUpper("aBcDe", "UTF8_LCASE", "ABCDE"); - assertUpper("AbCdE", "UNICODE", "ABCDE"); - assertUpper("aBcDe", "UNICODE", "ABCDE"); - assertUpper("AbCdE", "UNICODE_CI", "ABCDE"); - assertUpper("aBcDe", "UNICODE_CI", "ABCDE"); - // Accent letters - assertUpper("aBćDe","UTF8_BINARY", "ABĆDE"); - assertUpper("aBćDe","UTF8_LCASE", "ABĆDE"); - assertUpper("aBćDe","UNICODE", "ABĆDE"); - assertUpper("aBćDe","UNICODE_CI", "ABĆDE"); - // Variable byte length characters - assertUpper("ab世De", "UTF8_BINARY", "AB世DE"); - assertUpper("äbćδe", "UTF8_BINARY", "ÄBĆΔE"); - assertUpper("ab世De", "UTF8_LCASE", "AB世DE"); - assertUpper("äbćδe", "UTF8_LCASE", "ÄBĆΔE"); - assertUpper("ab世De", "UNICODE", "AB世DE"); - assertUpper("äbćδe", "UNICODE", "ÄBĆΔE"); - assertUpper("ab世De", "UNICODE_CI", "AB世DE"); - assertUpper("äbćδe", "UNICODE_CI", "ÄBĆΔE"); - // Case-variable character length - assertUpper("i\u0307o", "UTF8_BINARY","I\u0307O"); - assertUpper("i\u0307o", "UTF8_LCASE","I\u0307O"); - 
assertUpper("i\u0307o", "UNICODE","I\u0307O"); - assertUpper("i\u0307o", "UNICODE_CI","I\u0307O"); - assertUpper("ß fi ffi ff st ῗ", "UTF8_BINARY","SS FI FFI FF ST \u0399\u0308\u0342"); - assertUpper("ß fi ffi ff st ῗ", "UTF8_LCASE","SS FI FFI FF ST \u0399\u0308\u0342"); - assertUpper("ß fi ffi ff st ῗ", "UNICODE","SS FI FFI FF ST \u0399\u0308\u0342"); - assertUpper("ß fi ffi ff st ῗ", "UNICODE","SS FI FFI FF ST \u0399\u0308\u0342"); + for (String collationName: testSupportedCollations) { + // Empty strings. + assertUpper("", collationName, ""); + // Basic tests. + assertUpper("abcde", collationName, "ABCDE"); + assertUpper("AbCdE", collationName, "ABCDE"); + assertUpper("aBcDe", collationName, "ABCDE"); + assertUpper("ABCDE", collationName, "ABCDE"); + // Advanced tests. + assertUpper("aBćDe", collationName, "ABĆDE"); + assertUpper("ab世De", collationName, "AB世DE"); + assertUpper("äbćδe", collationName, "ÄBĆΔE"); + assertUpper("AbĆdE", collationName, "ABĆDE"); + assertUpper("aB世De", collationName, "AB世DE"); + assertUpper("ÄBĆΔE", collationName, "ÄBĆΔE"); + // One-to-many case mapping (e.g. Turkish dotted I). + assertUpper("İ", collationName, "İ"); + assertUpper("i\u0307", collationName,"I\u0307"); + assertUpper("İonic", collationName, "İONIC"); + assertUpper("i\u0307onic", collationName,"I\u0307ONIC"); + assertUpper("FIDELİO", collationName, "FIDELİO"); + // Conditional case mapping (e.g. Greek sigmas). + assertUpper("σ", collationName, "Σ"); + assertUpper("σ", collationName, "Σ"); + assertUpper("ς", collationName, "Σ"); + assertUpper("Σ", collationName, "Σ"); + assertUpper("ΣΑΛΑΤΑ", collationName, "ΣΑΛΑΤΑ"); + assertUpper("σαλατα", collationName, "ΣΑΛΑΤΑ"); + assertUpper("ςαλατα", collationName, "ΣΑΛΑΤΑ"); + assertUpper("ΘΑΛΑΣΣΙΝΟΣ", collationName, "ΘΑΛΑΣΣΙΝΟΣ"); + assertUpper("θαλασσινοσ", collationName, "ΘΑΛΑΣΣΙΝΟΣ"); + assertUpper("θαλασσινος", collationName, "ΘΑΛΑΣΣΙΝΟΣ"); + // Surrogate pairs. 
+ assertUpper("a🙃B🙃c", collationName, "A🙃B🙃C"); + assertUpper("😄 😆", collationName, "😄 😆"); + assertUpper("😀😆😃😄", collationName, "😀😆😃😄"); + assertUpper("𝔸", collationName, "𝔸"); + assertUpper("𐐅", collationName, "𐐅"); + assertUpper("𐐭", collationName, "𐐅"); + assertUpper("𐐭𝔸", collationName, "𐐅𝔸"); + // Ligatures. + assertUpper("ß fi ffi ff st ῗ", collationName,"SS FI FFI FF ST \u0399\u0308\u0342"); + } } + /** + * Verify the behaviour of the `Lower` collation support class. + */ + private void assertLower(String target, String collationName, String expected) - throws SparkException { + throws SparkException { UTF8String target_utf8 = UTF8String.fromString(target); UTF8String expected_utf8 = UTF8String.fromString(expected); int collationId = CollationFactory.collationNameToId(collationName); @@ -893,48 +902,56 @@ private void assertLower(String target, String collationName, String expected) @Test public void testLower() throws SparkException { - // Edge cases - assertLower("", "UTF8_BINARY", ""); - assertLower("", "UTF8_LCASE", ""); - assertLower("", "UNICODE", ""); - assertLower("", "UNICODE_CI", ""); - // Basic tests - assertLower("ABCDE", "UTF8_BINARY", "abcde"); - assertLower("ABCDE", "UTF8_LCASE", "abcde"); - assertLower("ABCDE", "UNICODE", "abcde"); - assertLower("ABCDE", "UNICODE_CI", "abcde"); - // Uppercase present - assertLower("AbCdE", "UTF8_BINARY", "abcde"); - assertLower("aBcDe", "UTF8_BINARY", "abcde"); - assertLower("AbCdE", "UTF8_LCASE", "abcde"); - assertLower("aBcDe", "UTF8_LCASE", "abcde"); - assertLower("AbCdE", "UNICODE", "abcde"); - assertLower("aBcDe", "UNICODE", "abcde"); - assertLower("AbCdE", "UNICODE_CI", "abcde"); - assertLower("aBcDe", "UNICODE_CI", "abcde"); - // Accent letters - assertLower("AbĆdE","UTF8_BINARY", "abćde"); - assertLower("AbĆdE","UTF8_LCASE", "abćde"); - assertLower("AbĆdE","UNICODE", "abćde"); - assertLower("AbĆdE","UNICODE_CI", "abćde"); - // Variable byte length characters - assertLower("aB世De", "UTF8_BINARY", 
"ab世de"); - assertLower("ÄBĆΔE", "UTF8_BINARY", "äbćδe"); - assertLower("aB世De", "UTF8_LCASE", "ab世de"); - assertLower("ÄBĆΔE", "UTF8_LCASE", "äbćδe"); - assertLower("aB世De", "UNICODE", "ab世de"); - assertLower("ÄBĆΔE", "UNICODE", "äbćδe"); - assertLower("aB世De", "UNICODE_CI", "ab世de"); - assertLower("ÄBĆΔE", "UNICODE_CI", "äbćδe"); - // Case-variable character length - assertLower("İo", "UTF8_BINARY","i\u0307o"); - assertLower("İo", "UTF8_LCASE","i\u0307o"); - assertLower("İo", "UNICODE","i\u0307o"); - assertLower("İo", "UNICODE_CI","i\u0307o"); + for (String collationName: testSupportedCollations) { + // Empty strings. + assertLower("", collationName, ""); + // Basic tests. + assertLower("abcde", collationName, "abcde"); + assertLower("AbCdE", collationName, "abcde"); + assertLower("aBcDe", collationName, "abcde"); + assertLower("ABCDE", collationName, "abcde"); + // Advanced tests. + assertLower("aBćDe", collationName, "abćde"); + assertLower("ab世De", collationName, "ab世de"); + assertLower("äbćδe", collationName, "äbćδe"); + assertLower("AbĆdE", collationName, "abćde"); + assertLower("aB世De", collationName, "ab世de"); + assertLower("ÄBĆΔE", collationName, "äbćδe"); + // One-to-many case mapping (e.g. Turkish dotted I). + assertLower("İ", collationName, "i\u0307"); + assertLower("I\u0307", collationName,"i\u0307"); + assertLower("İonic", collationName, "i\u0307onic"); + assertLower("i\u0307onic", collationName,"i\u0307onic"); + assertLower("FIDELİO", collationName, "fideli\u0307o"); + // Conditional case mapping (e.g. Greek sigmas). 
+ assertLower("σ", collationName, "σ"); + assertLower("ς", collationName, "ς"); + assertLower("Σ", collationName, "σ"); + assertLower("ΣΑΛΑΤΑ", collationName, "σαλατα"); + assertLower("σαλατα", collationName, "σαλατα"); + assertLower("ςαλατα", collationName, "ςαλατα"); + assertLower("ΘΑΛΑΣΣΙΝΟΣ", collationName, "θαλασσινος"); + assertLower("θαλασσινοσ", collationName, "θαλασσινοσ"); + assertLower("θαλασσινος", collationName, "θαλασσινος"); + // Surrogate pairs. + assertLower("a🙃B🙃c", collationName, "a🙃b🙃c"); + assertLower("😄 😆", collationName, "😄 😆"); + assertLower("😀😆😃😄", collationName, "😀😆😃😄"); + assertLower("𝔸", collationName, "𝔸"); + assertLower("𐐅", collationName, "𐐭"); + assertLower("𐐭", collationName, "𐐭"); + assertLower("𐐭𝔸", collationName, "𐐭𝔸"); + // Ligatures. + assertLower("ß fi ffi ff st ῗ", collationName,"ß fi ffi ff st ῗ"); + } } + /** + * Verify the behaviour of the `InitCap` collation support class. + */ + private void assertInitCap(String target, String collationName, String expected) - throws SparkException { + throws SparkException { UTF8String target_utf8 = UTF8String.fromString(target); UTF8String expected_utf8 = UTF8String.fromString(expected); int collationId = CollationFactory.collationNameToId(collationName); @@ -947,49 +964,102 @@ private void assertInitCap(String target, String collationName, String expected) @Test public void testInitCap() throws SparkException { - // Edge cases - assertInitCap("", "UTF8_BINARY", ""); - assertInitCap("", "UTF8_LCASE", ""); - assertInitCap("", "UNICODE", ""); - assertInitCap("", "UNICODE_CI", ""); - // Basic tests - assertInitCap("ABCDE", "UTF8_BINARY", "Abcde"); - assertInitCap("ABCDE", "UTF8_LCASE", "Abcde"); - assertInitCap("ABCDE", "UNICODE", "Abcde"); - assertInitCap("ABCDE", "UNICODE_CI", "Abcde"); - // Uppercase present - assertInitCap("AbCdE", "UTF8_BINARY", "Abcde"); - assertInitCap("aBcDe", "UTF8_BINARY", "Abcde"); - assertInitCap("AbCdE", "UTF8_LCASE", "Abcde"); - assertInitCap("aBcDe", 
"UTF8_LCASE", "Abcde"); - assertInitCap("AbCdE", "UNICODE", "Abcde"); - assertInitCap("aBcDe", "UNICODE", "Abcde"); - assertInitCap("AbCdE", "UNICODE_CI", "Abcde"); - assertInitCap("aBcDe", "UNICODE_CI", "Abcde"); - // Accent letters - assertInitCap("AbĆdE", "UTF8_BINARY", "Abćde"); - assertInitCap("AbĆdE", "UTF8_LCASE", "Abćde"); - assertInitCap("AbĆdE", "UNICODE", "Abćde"); - assertInitCap("AbĆdE", "UNICODE_CI", "Abćde"); - // Variable byte length characters - assertInitCap("aB 世 De", "UTF8_BINARY", "Ab 世 De"); + for (String collationName: testSupportedCollations) { + // Empty strings. + assertInitCap("", collationName, ""); + // Basic tests. + assertInitCap("abcde", collationName, "Abcde"); + assertInitCap("AbCdE", collationName, "Abcde"); + assertInitCap("aBcDe", collationName, "Abcde"); + assertInitCap("ABCDE", collationName, "Abcde"); + // Conditional case mapping (e.g. Greek sigmas). + assertInitCap("σ", collationName, "Σ"); + assertInitCap("ς", collationName, "Σ"); + assertInitCap("Σ", collationName, "Σ"); + assertInitCap("ΣΑΛΑΤΑ", collationName, "Σαλατα"); + assertInitCap("σαλατα", collationName, "Σαλατα"); + assertInitCap("ςαλατα", collationName, "Σαλατα"); + assertInitCap("ΘΑΛΑΣΣΙΝΟΣ", collationName, "Θαλασσινος"); + assertInitCap("θαλασσινοσ", collationName, "Θαλασσινοσ"); + assertInitCap("θαλασσινος", collationName, "Θαλασσινος"); + } + // Advanced tests. 
+ assertInitCap("aBćDe", "UTF8_BINARY", "Abćde"); + assertInitCap("aBćDe", "UTF8_LCASE", "Abćde"); + assertInitCap("aBćDe", "UNICODE", "Abćde"); + assertInitCap("aBćDe", "UNICODE_CI", "Abćde"); + assertInitCap("ab世De", "UTF8_BINARY", "Ab世de"); + assertInitCap("ab世De", "UTF8_LCASE", "Ab世De"); + assertInitCap("ab世De", "UNICODE", "Ab世De"); + assertInitCap("ab世De", "UNICODE_CI", "Ab世De"); + assertInitCap("äbćδe", "UTF8_BINARY", "Äbćδe"); + assertInitCap("äbćδe", "UTF8_LCASE", "Äbćδe"); + assertInitCap("äbćδe", "UNICODE", "Äbćδe"); + assertInitCap("äbćδe", "UNICODE_CI", "Äbćδe"); assertInitCap("ÄBĆΔE", "UTF8_BINARY", "Äbćδe"); - assertInitCap("aB 世 De", "UTF8_LCASE", "Ab 世 De"); assertInitCap("ÄBĆΔE", "UTF8_LCASE", "Äbćδe"); - assertInitCap("aB 世 De", "UNICODE", "Ab 世 De"); assertInitCap("ÄBĆΔE", "UNICODE", "Äbćδe"); - assertInitCap("aB 世 de", "UNICODE_CI", "Ab 世 De"); assertInitCap("ÄBĆΔE", "UNICODE_CI", "Äbćδe"); - // Case-variable character length - assertInitCap("İo", "UTF8_BINARY", "I\u0307o"); - assertInitCap("İo", "UTF8_LCASE", "İo"); - assertInitCap("İo", "UNICODE", "İo"); - assertInitCap("İo", "UNICODE_CI", "İo"); - assertInitCap("i\u0307o", "UTF8_BINARY", "I\u0307o"); - assertInitCap("i\u0307o", "UTF8_LCASE", "I\u0307o"); - assertInitCap("i\u0307o", "UNICODE", "I\u0307o"); - assertInitCap("i\u0307o", "UNICODE_CI", "I\u0307o"); - // Different possible word boundaries + assertInitCap("aB 世 de", "UTF8_BINARY", "Ab 世 De"); + assertInitCap("aB 世 de", "UTF8_LCASE", "Ab 世 De"); + assertInitCap("aB 世 de", "UNICODE", "Ab 世 De"); + assertInitCap("aB 世 de", "UNICODE_CI", "Ab 世 De"); + // One-to-many case mapping (e.g. Turkish dotted I). 
+ assertInitCap("İ", "UTF8_BINARY", "I\u0307"); + assertInitCap("İ", "UTF8_LCASE", "İ"); + assertInitCap("İ", "UNICODE", "İ"); + assertInitCap("İ", "UNICODE_CI", "İ"); + assertInitCap("I\u0307", "UTF8_BINARY","I\u0307"); + assertInitCap("I\u0307", "UTF8_LCASE","I\u0307"); + assertInitCap("I\u0307", "UNICODE","I\u0307"); + assertInitCap("I\u0307", "UNICODE_CI","I\u0307"); + assertInitCap("İonic", "UTF8_BINARY", "I\u0307onic"); + assertInitCap("İonic", "UTF8_LCASE", "İonic"); + assertInitCap("İonic", "UNICODE", "İonic"); + assertInitCap("İonic", "UNICODE_CI", "İonic"); + assertInitCap("i\u0307onic", "UTF8_BINARY","I\u0307onic"); + assertInitCap("i\u0307onic", "UTF8_LCASE","I\u0307onic"); + assertInitCap("i\u0307onic", "UNICODE","I\u0307onic"); + assertInitCap("i\u0307onic", "UNICODE_CI","I\u0307onic"); + assertInitCap("FIDELİO", "UTF8_BINARY", "Fideli\u0307o"); + assertInitCap("FIDELİO", "UTF8_LCASE", "Fideli\u0307o"); + assertInitCap("FIDELİO", "UNICODE", "Fideli\u0307o"); + assertInitCap("FIDELİO", "UNICODE_CI", "Fideli\u0307o"); + // Surrogate pairs. 
+ assertInitCap("a🙃B🙃c", "UTF8_BINARY", "A🙃b🙃c"); + assertInitCap("a🙃B🙃c", "UTF8_LCASE", "A🙃B🙃C"); + assertInitCap("a🙃B🙃c", "UNICODE", "A🙃B🙃C"); + assertInitCap("a🙃B🙃c", "UNICODE_CI", "A🙃B🙃C"); + assertInitCap("😄 😆", "UTF8_BINARY", "😄 😆"); + assertInitCap("😄 😆", "UTF8_LCASE", "😄 😆"); + assertInitCap("😄 😆", "UNICODE", "😄 😆"); + assertInitCap("😄 😆", "UNICODE_CI", "😄 😆"); + assertInitCap("😀😆😃😄", "UTF8_BINARY", "😀😆😃😄"); + assertInitCap("😀😆😃😄", "UTF8_LCASE", "😀😆😃😄"); + assertInitCap("😀😆😃😄", "UNICODE", "😀😆😃😄"); + assertInitCap("😀😆😃😄", "UNICODE_CI", "😀😆😃😄"); + assertInitCap("𝔸", "UTF8_BINARY", "𝔸"); + assertInitCap("𝔸", "UTF8_LCASE", "𝔸"); + assertInitCap("𝔸", "UNICODE", "𝔸"); + assertInitCap("𝔸", "UNICODE_CI", "𝔸"); + assertInitCap("𐐅", "UTF8_BINARY", "𐐭"); + assertInitCap("𐐅", "UTF8_LCASE", "𐐅"); + assertInitCap("𐐅", "UNICODE", "𐐅"); + assertInitCap("𐐅", "UNICODE_CI", "𐐅"); + assertInitCap("𐐭", "UTF8_BINARY", "𐐭"); + assertInitCap("𐐭", "UTF8_LCASE", "𐐅"); + assertInitCap("𐐭", "UNICODE", "𐐅"); + assertInitCap("𐐭", "UNICODE_CI", "𐐅"); + assertInitCap("𐐭𝔸", "UTF8_BINARY", "𐐭𝔸"); + assertInitCap("𐐭𝔸", "UTF8_LCASE", "𐐅𝔸"); + assertInitCap("𐐭𝔸", "UNICODE", "𐐅𝔸"); + assertInitCap("𐐭𝔸", "UNICODE_CI", "𐐅𝔸"); + // Ligatures. + assertInitCap("ß fi ffi ff st ῗ", "UTF8_BINARY","ß fi ffi ff st ῗ"); + assertInitCap("ß fi ffi ff st ῗ", "UTF8_LCASE","Ss Fi Ffi Ff St \u0399\u0308\u0342"); + assertInitCap("ß fi ffi ff st ῗ", "UNICODE","Ss Fi Ffi Ff St \u0399\u0308\u0342"); + assertInitCap("ß fi ffi ff st ῗ", "UNICODE_CI","Ss Fi Ffi Ff St \u0399\u0308\u0342"); + // Different possible word boundaries. 
assertInitCap("a b c", "UTF8_BINARY", "A B C"); assertInitCap("a b c", "UNICODE", "A B C"); assertInitCap("a b c", "UTF8_LCASE", "A B C"); @@ -1006,7 +1076,7 @@ public void testInitCap() throws SparkException { assertInitCap("a?b世c", "UNICODE", "A?B世C"); assertInitCap("a?b世c", "UTF8_LCASE", "A?B世C"); assertInitCap("a?b世c", "UNICODE_CI", "A?B世C"); - // Titlecase characters that are different from uppercase characters + // Titlecase characters that are different from uppercase characters. assertInitCap("dzDZDz", "UTF8_BINARY", "Dzdzdz"); assertInitCap("dzDZDz", "UNICODE", "Dzdzdz"); assertInitCap("dzDZDz", "UTF8_LCASE", "Dzdzdz");