diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index d5dbca7eb89bc..e368e2479a3a1 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -99,7 +99,8 @@ public record CollationMeta( String icuVersion, String padAttribute, boolean accentSensitivity, - boolean caseSensitivity) { } + boolean caseSensitivity, + String spaceTrimming) { } /** * Entry encapsulating all information about a collation. @@ -200,6 +201,7 @@ public Collation( * bit 28-24: Reserved. * bit 23-22: Reserved for version. * bit 21-18: Reserved for space trimming. + * 0000 = none, 0001 = left trim, 0010 = right trim, 0011 = trim. * bit 17-0: Depend on collation family. * --- * INDETERMINATE collation ID binary layout: @@ -214,7 +216,8 @@ public Collation( * UTF8_BINARY collation ID binary layout: * bit 31-24: Zeroes. * bit 23-22: Zeroes, reserved for version. - * bit 21-18: Zeroes, reserved for space trimming. + * bit 21-18: Reserved for space trimming. + * 0000 = none, 0001 = left trim, 0010 = right trim, 0011 = trim. * bit 17-3: Zeroes. * bit 2: 0, reserved for accent sensitivity. * bit 1: 0, reserved for uppercase and case-insensitive. @@ -225,7 +228,8 @@ public Collation( * bit 29: 1 * bit 28-24: Zeroes. * bit 23-22: Zeroes, reserved for version. - * bit 21-18: Zeroes, reserved for space trimming. + * bit 21-18: Reserved for space trimming. + * 0000 = none, 0001 = left trim, 0010 = right trim, 0011 = trim. * bit 17: 0 = case-sensitive, 1 = case-insensitive. * bit 16: 0 = accent-sensitive, 1 = accent-insensitive. * bit 15-14: Zeroes, reserved for punctuation sensitivity. 
@@ -238,7 +242,13 @@ public Collation( * - UNICODE -> 0x20000000 * - UNICODE_AI -> 0x20010000 * - UNICODE_CI -> 0x20020000 + * - UNICODE_LTRIM -> 0x20040000 + * - UNICODE_RTRIM -> 0x20080000 + * - UNICODE_TRIM -> 0x200C0000 * - UNICODE_CI_AI -> 0x20030000 + * - UNICODE_CI_TRIM -> 0x200E0000 + * - UNICODE_AI_TRIM -> 0x200D0000 + * - UNICODE_CI_AI_TRIM-> 0x200F0000 * - af -> 0x20000001 * - af_CI_AI -> 0x20030001 */ @@ -259,6 +269,15 @@ protected enum ImplementationProvider { UTF8_BINARY, ICU } + /** + * Bits 19-18 having value 00 for no space trimming, 01 for left space trimming + * 10 for right space trimming and 11 for both sides space trimming. Bits 21, 20 + * remained reserved (and fixed to 0) for future use. + */ + protected enum SpaceTrimming { + NONE, LTRIM, RTRIM, TRIM + } + /** * Offset in binary collation ID layout. */ @@ -279,6 +298,17 @@ protected enum ImplementationProvider { */ protected static final int IMPLEMENTATION_PROVIDER_MASK = 0b1; + + /** + * Offset in binary collation ID layout. + */ + protected static final int SPACE_TRIMMING_OFFSET = 18; + + /** + * Bitmask corresponding to width in bits in binary collation ID layout. + */ + protected static final int SPACE_TRIMMING_MASK = 0b11; + private static final int INDETERMINATE_COLLATION_ID = -1; /** @@ -303,6 +333,14 @@ private static DefinitionOrigin getDefinitionOrigin(int collationId) { DEFINITION_ORIGIN_OFFSET, DEFINITION_ORIGIN_MASK)]; } + /** + * Utility function to retrieve `SpaceTrimming` enum instance from collation ID. + */ + protected static SpaceTrimming getSpaceTrimming(int collationId) { + return SpaceTrimming.values()[SpecifierUtils.getSpecValue(collationId, + SPACE_TRIMMING_OFFSET, SPACE_TRIMMING_MASK)]; + } + /** * Main entry point for retrieving `Collation` instance from collation ID. 
*/ @@ -358,6 +396,8 @@ private static int collationNameToId(String collationName) throws SparkException protected abstract CollationMeta buildCollationMeta(); + protected abstract String normalizedCollationName(); + static List listCollations() { return Stream.concat( CollationSpecUTF8.listCollations().stream(), @@ -398,48 +438,99 @@ private enum CaseSensitivity { private static final String UTF8_LCASE_COLLATION_NAME = "UTF8_LCASE"; private static final int UTF8_BINARY_COLLATION_ID = - new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED).collationId; + new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED, SpaceTrimming.NONE).collationId; private static final int UTF8_LCASE_COLLATION_ID = - new CollationSpecUTF8(CaseSensitivity.LCASE).collationId; + new CollationSpecUTF8(CaseSensitivity.LCASE, SpaceTrimming.NONE).collationId; protected static Collation UTF8_BINARY_COLLATION = - new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED).buildCollation(); + new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED, SpaceTrimming.NONE).buildCollation(); protected static Collation UTF8_LCASE_COLLATION = - new CollationSpecUTF8(CaseSensitivity.LCASE).buildCollation(); + new CollationSpecUTF8(CaseSensitivity.LCASE, SpaceTrimming.NONE).buildCollation(); + private final CaseSensitivity caseSensitivity; + private final SpaceTrimming spaceTrimming; private final int collationId; - private CollationSpecUTF8(CaseSensitivity caseSensitivity) { - this.collationId = + private CollationSpecUTF8( + CaseSensitivity caseSensitivity, + SpaceTrimming spaceTrimming) { + this.caseSensitivity = caseSensitivity; + this.spaceTrimming = spaceTrimming; + + int collationId = SpecifierUtils.setSpecValue(0, CASE_SENSITIVITY_OFFSET, caseSensitivity); + this.collationId = + SpecifierUtils.setSpecValue(collationId, SPACE_TRIMMING_OFFSET, spaceTrimming); } private static int collationNameToId(String originalName, String collationName) throws SparkException { - if 
(UTF8_BINARY_COLLATION.collationName.equals(collationName)) { - return UTF8_BINARY_COLLATION_ID; - } else if (UTF8_LCASE_COLLATION.collationName.equals(collationName)) { - return UTF8_LCASE_COLLATION_ID; + + int baseId; + String collationNamePrefix; + + if (collationName.startsWith(UTF8_BINARY_COLLATION.collationName)) { + baseId = UTF8_BINARY_COLLATION_ID; + collationNamePrefix = UTF8_BINARY_COLLATION.collationName; + } else if (collationName.startsWith(UTF8_LCASE_COLLATION.collationName)) { + baseId = UTF8_LCASE_COLLATION_ID; + collationNamePrefix = UTF8_LCASE_COLLATION.collationName; } else { // Throw exception with original (before case conversion) collation name. throw collationInvalidNameException(originalName); } + + String remainingSpecifiers = collationName.substring(collationNamePrefix.length()); + if(remainingSpecifiers.isEmpty()) { + return baseId; + } + if(!remainingSpecifiers.startsWith("_")){ + throw collationInvalidNameException(originalName); + } + + SpaceTrimming spaceTrimming = SpaceTrimming.NONE; + String remainingSpec = remainingSpecifiers.substring(1); + if (remainingSpec.equals("LTRIM")) { + spaceTrimming = SpaceTrimming.LTRIM; + } else if (remainingSpec.equals("RTRIM")) { + spaceTrimming = SpaceTrimming.RTRIM; + } else if(remainingSpec.equals("TRIM")) { + spaceTrimming = SpaceTrimming.TRIM; + } else { + throw collationInvalidNameException(originalName); + } + + return SpecifierUtils.setSpecValue(baseId, SPACE_TRIMMING_OFFSET, spaceTrimming); } private static CollationSpecUTF8 fromCollationId(int collationId) { // Extract case sensitivity from collation ID. int caseConversionOrdinal = SpecifierUtils.getSpecValue(collationId, CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); - // Verify only case sensitivity bits were set settable in UTF8_BINARY family of collations. 
- assert (SpecifierUtils.removeSpec(collationId, - CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK) == 0); - return new CollationSpecUTF8(CaseSensitivity.values()[caseConversionOrdinal]); + // Extract space trimming from collation ID. + int spaceTrimmingOrdinal = getSpaceTrimming(collationId).ordinal(); + assert(isValidCollationId(collationId)); + return new CollationSpecUTF8( + CaseSensitivity.values()[caseConversionOrdinal], + SpaceTrimming.values()[spaceTrimmingOrdinal]); + } + + private static boolean isValidCollationId(int collationId) { + collationId = SpecifierUtils.removeSpec( + collationId, + SPACE_TRIMMING_OFFSET, + SPACE_TRIMMING_MASK); + collationId = SpecifierUtils.removeSpec( + collationId, + CASE_SENSITIVITY_OFFSET, + CASE_SENSITIVITY_MASK); + return collationId == 0; } @Override protected Collation buildCollation() { - if (collationId == UTF8_BINARY_COLLATION_ID) { + if (caseSensitivity == CaseSensitivity.UNSPECIFIED) { return new Collation( - UTF8_BINARY_COLLATION_NAME, + normalizedCollationName(), PROVIDER_SPARK, null, UTF8String::binaryCompare, @@ -450,7 +541,7 @@ protected Collation buildCollation() { /* supportsLowercaseEquality = */ false); } else { return new Collation( - UTF8_LCASE_COLLATION_NAME, + normalizedCollationName(), PROVIDER_SPARK, null, CollationAwareUTF8String::compareLowerCase, @@ -464,29 +555,52 @@ protected Collation buildCollation() { @Override protected CollationMeta buildCollationMeta() { - if (collationId == UTF8_BINARY_COLLATION_ID) { + if (caseSensitivity == CaseSensitivity.UNSPECIFIED) { return new CollationMeta( CATALOG, SCHEMA, - UTF8_BINARY_COLLATION_NAME, + normalizedCollationName(), /* language = */ null, /* country = */ null, /* icuVersion = */ null, COLLATION_PAD_ATTRIBUTE, /* accentSensitivity = */ true, - /* caseSensitivity = */ true); + /* caseSensitivity = */ true, + spaceTrimming.toString()); } else { return new CollationMeta( CATALOG, SCHEMA, - UTF8_LCASE_COLLATION_NAME, + normalizedCollationName(), /* 
language = */ null, /* country = */ null, /* icuVersion = */ null, COLLATION_PAD_ATTRIBUTE, /* accentSensitivity = */ true, - /* caseSensitivity = */ false); + /* caseSensitivity = */ false, + spaceTrimming.toString()); + } + } + + /** + * Compute normalized collation name. Components of collation name are given in order: + * - Base collation name (UTF8_BINARY or UTF8_LCASE) + * - Optional space trimming when non-default preceded by underscore + * Examples: UTF8_BINARY, UTF8_LCASE_LTRIM, UTF8_BINARY_TRIM. + */ + @Override + protected String normalizedCollationName() { + StringBuilder builder = new StringBuilder(); + if(caseSensitivity == CaseSensitivity.UNSPECIFIED){ + builder.append(UTF8_BINARY_COLLATION_NAME); + } else{ + builder.append(UTF8_LCASE_COLLATION_NAME); } + if (spaceTrimming != SpaceTrimming.NONE) { + builder.append('_'); + builder.append(spaceTrimming.toString()); + } + return builder.toString(); } static List listCollations() { @@ -620,21 +734,33 @@ private enum AccentSensitivity { } } - private static final int UNICODE_COLLATION_ID = - new CollationSpecICU("UNICODE", CaseSensitivity.CS, AccentSensitivity.AS).collationId; - private static final int UNICODE_CI_COLLATION_ID = - new CollationSpecICU("UNICODE", CaseSensitivity.CI, AccentSensitivity.AS).collationId; + private static final int UNICODE_COLLATION_ID = new CollationSpecICU( + "UNICODE", + CaseSensitivity.CS, + AccentSensitivity.AS, + SpaceTrimming.NONE).collationId; + + private static final int UNICODE_CI_COLLATION_ID = new CollationSpecICU( + "UNICODE", + CaseSensitivity.CI, + AccentSensitivity.AS, + SpaceTrimming.NONE).collationId; private final CaseSensitivity caseSensitivity; private final AccentSensitivity accentSensitivity; + private final SpaceTrimming spaceTrimming; private final String locale; private final int collationId; - private CollationSpecICU(String locale, CaseSensitivity caseSensitivity, - AccentSensitivity accentSensitivity) { + private CollationSpecICU( + String
locale, + CaseSensitivity caseSensitivity, + AccentSensitivity accentSensitivity, + SpaceTrimming spaceTrimming) { this.locale = locale; this.caseSensitivity = caseSensitivity; this.accentSensitivity = accentSensitivity; + this.spaceTrimming = spaceTrimming; // Construct collation ID from locale, case-sensitivity and accent-sensitivity specifiers. int collationId = ICULocaleToId.get(locale); // Mandatory ICU implementation provider. @@ -644,6 +770,8 @@ private CollationSpecICU(String locale, CaseSensitivity caseSensitivity, caseSensitivity); collationId = SpecifierUtils.setSpecValue(collationId, ACCENT_SENSITIVITY_OFFSET, accentSensitivity); + collationId = SpecifierUtils.setSpecValue(collationId, SPACE_TRIMMING_OFFSET, + spaceTrimming); this.collationId = collationId; } @@ -661,58 +789,88 @@ private static int collationNameToId( } if (lastPos == -1) { throw collationInvalidNameException(originalName); - } else { - String locale = collationName.substring(0, lastPos); - int collationId = ICULocaleToId.get(ICULocaleMapUppercase.get(locale)); - - // Try all combinations of AS/AI and CS/CI. 
- CaseSensitivity caseSensitivity; - AccentSensitivity accentSensitivity; - if (collationName.equals(locale) || - collationName.equals(locale + "_AS") || - collationName.equals(locale + "_CS") || - collationName.equals(locale + "_AS_CS") || - collationName.equals(locale + "_CS_AS") - ) { - caseSensitivity = CaseSensitivity.CS; - accentSensitivity = AccentSensitivity.AS; - } else if (collationName.equals(locale + "_CI") || - collationName.equals(locale + "_AS_CI") || - collationName.equals(locale + "_CI_AS")) { - caseSensitivity = CaseSensitivity.CI; - accentSensitivity = AccentSensitivity.AS; - } else if (collationName.equals(locale + "_AI") || - collationName.equals(locale + "_CS_AI") || - collationName.equals(locale + "_AI_CS")) { - caseSensitivity = CaseSensitivity.CS; - accentSensitivity = AccentSensitivity.AI; - } else if (collationName.equals(locale + "_AI_CI") || - collationName.equals(locale + "_CI_AI")) { - caseSensitivity = CaseSensitivity.CI; - accentSensitivity = AccentSensitivity.AI; - } else { - throw collationInvalidNameException(originalName); - } + } + String locale = collationName.substring(0, lastPos); + int collationId = ICULocaleToId.get(ICULocaleMapUppercase.get(locale)); + collationId = SpecifierUtils.setSpecValue(collationId, + IMPLEMENTATION_PROVIDER_OFFSET, ImplementationProvider.ICU); - // Build collation ID from computed specifiers. - collationId = SpecifierUtils.setSpecValue(collationId, - IMPLEMENTATION_PROVIDER_OFFSET, ImplementationProvider.ICU); - collationId = SpecifierUtils.setSpecValue(collationId, - CASE_SENSITIVITY_OFFSET, caseSensitivity); - collationId = SpecifierUtils.setSpecValue(collationId, - ACCENT_SENSITIVITY_OFFSET, accentSensitivity); + // No other specifiers present. + if(collationName.equals(locale)){ return collationId; } + if(collationName.charAt(locale.length()) != '_'){ + throw collationInvalidNameException(originalName); + } + // Extract remaining specifiers and trim "_" separator. 
+ String remainingSpecifiers = collationName.substring(lastPos + 1); + + // Initialize default specifier flags. + // Case sensitive, accent sensitive, no space trimming. + boolean isCaseSpecifierSet = false; + boolean isAccentSpecifierSet = false; + boolean isSpaceTrimmingSpecifierSet = false; + CaseSensitivity caseSensitivity = CaseSensitivity.CS; + AccentSensitivity accentSensitivity = AccentSensitivity.AS; + SpaceTrimming spaceTrimming = SpaceTrimming.NONE; + + String[] specifiers = remainingSpecifiers.split("_", -1); + + // Set flags per specifier; split limit -1 keeps trailing empty ones so they are rejected. + for (String specifier : specifiers) { + switch (specifier) { + case "CI": + case "CS": + if (isCaseSpecifierSet) { + throw collationInvalidNameException(originalName); + } + caseSensitivity = CaseSensitivity.valueOf(specifier); + isCaseSpecifierSet = true; + break; + case "AI": + case "AS": + if (isAccentSpecifierSet) { + throw collationInvalidNameException(originalName); + } + accentSensitivity = AccentSensitivity.valueOf(specifier); + isAccentSpecifierSet = true; + break; + case "LTRIM": + case "RTRIM": + case "TRIM": + if (isSpaceTrimmingSpecifierSet) { + throw collationInvalidNameException(originalName); + } + spaceTrimming = SpaceTrimming.valueOf(specifier); + isSpaceTrimmingSpecifierSet = true; + break; + default: + throw collationInvalidNameException(originalName); + } + } + + // Build collation ID from computed specifiers. + collationId = SpecifierUtils.setSpecValue(collationId, + CASE_SENSITIVITY_OFFSET, caseSensitivity); + collationId = SpecifierUtils.setSpecValue(collationId, + ACCENT_SENSITIVITY_OFFSET, accentSensitivity); + collationId = SpecifierUtils.setSpecValue(collationId, + SPACE_TRIMMING_OFFSET, spaceTrimming); + return collationId; } private static CollationSpecICU fromCollationId(int collationId) { // Parse specifiers from collation ID.
+ int spaceTrimmingOrdinal = SpecifierUtils.getSpecValue(collationId, + SPACE_TRIMMING_OFFSET, SPACE_TRIMMING_MASK); int caseSensitivityOrdinal = SpecifierUtils.getSpecValue(collationId, CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); int accentSensitivityOrdinal = SpecifierUtils.getSpecValue(collationId, ACCENT_SENSITIVITY_OFFSET, ACCENT_SENSITIVITY_MASK); collationId = SpecifierUtils.removeSpec(collationId, IMPLEMENTATION_PROVIDER_OFFSET, IMPLEMENTATION_PROVIDER_MASK); + collationId = SpecifierUtils.removeSpec(collationId, + SPACE_TRIMMING_OFFSET, SPACE_TRIMMING_MASK); collationId = SpecifierUtils.removeSpec(collationId, CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); collationId = SpecifierUtils.removeSpec(collationId, @@ -723,8 +881,9 @@ private static CollationSpecICU fromCollationId(int collationId) { assert(localeId >= 0 && localeId < ICULocaleNames.length); CaseSensitivity caseSensitivity = CaseSensitivity.values()[caseSensitivityOrdinal]; AccentSensitivity accentSensitivity = AccentSensitivity.values()[accentSensitivityOrdinal]; + SpaceTrimming spaceTrimming = SpaceTrimming.values()[spaceTrimmingOrdinal]; String locale = ICULocaleNames[localeId]; - return new CollationSpecICU(locale, caseSensitivity, accentSensitivity); + return new CollationSpecICU(locale, caseSensitivity, accentSensitivity, spaceTrimming); } @Override @@ -752,7 +911,7 @@ protected Collation buildCollation() { // Freeze ICU collator to ensure thread safety. 
collator.freeze(); return new Collation( - collationName(), + normalizedCollationName(), PROVIDER_ICU, collator, (s1, s2) -> collator.compare(s1.toValidString(), s2.toValidString()), @@ -768,13 +927,14 @@ protected CollationMeta buildCollationMeta() { return new CollationMeta( CATALOG, SCHEMA, - collationName(), + normalizedCollationName(), ICULocaleMap.get(locale).getDisplayLanguage(), ICULocaleMap.get(locale).getDisplayCountry(), VersionInfo.ICU_VERSION.toString(), COLLATION_PAD_ATTRIBUTE, accentSensitivity == AccentSensitivity.AS, - caseSensitivity == CaseSensitivity.CS); + caseSensitivity == CaseSensitivity.CS, + spaceTrimming.toString()); } /** @@ -782,9 +942,11 @@ protected CollationMeta buildCollationMeta() { * - Locale name * - Optional case sensitivity when non-default preceded by underscore * - Optional accent sensitivity when non-default preceded by underscore - * Examples: en, en_USA_CI_AI, sr_Cyrl_SRB_AI. + * - Optional space trimming when non-default preceded by underscore + * Examples: en, en_USA_CI_LTRIM, en_USA_CI_AI, en_USA_CI_AI_TRIM, sr_Cyrl_SRB_AI. 
*/ - private String collationName() { + @Override + protected String normalizedCollationName() { StringBuilder builder = new StringBuilder(); builder.append(locale); if (caseSensitivity != CaseSensitivity.CS) { @@ -795,20 +957,21 @@ private String collationName() { builder.append('_'); builder.append(accentSensitivity.toString()); } + if(spaceTrimming != SpaceTrimming.NONE) { + builder.append('_'); + builder.append(spaceTrimming.toString()); + } return builder.toString(); } private static List allCollationNames() { List collationNames = new ArrayList<>(); - for (String locale: ICULocaleToId.keySet()) { - // CaseSensitivity.CS + AccentSensitivity.AS - collationNames.add(locale); - // CaseSensitivity.CS + AccentSensitivity.AI - collationNames.add(locale + "_AI"); - // CaseSensitivity.CI + AccentSensitivity.AS - collationNames.add(locale + "_CI"); - // CaseSensitivity.CI + AccentSensitivity.AI - collationNames.add(locale + "_CI_AI"); + List caseAccentSpecifiers = Arrays.asList("", "_AI", "_CI", "_CI_AI"); + for (String locale : ICULocaleToId.keySet()) { + for (String caseAccent : caseAccentSpecifiers) { + String collationName = locale + caseAccent; + collationNames.add(collationName); + } } return collationNames.stream().sorted().toList(); } @@ -933,6 +1096,14 @@ public static boolean isCaseSensitiveAndAccentInsensitive(int collationId) { Collation.CollationSpecICU.AccentSensitivity.AI; } + /** + * Returns whether the collation uses trim collation for the given collation id. 
+ */ + public static boolean usesTrimCollation(int collationId) { + return Collation.CollationSpec.getSpaceTrimming(collationId) != + Collation.CollationSpec.SpaceTrimming.NONE; + } + public static void assertValidProvider(String provider) throws SparkException { if (!SUPPORTED_PROVIDERS.contains(provider.toLowerCase())) { Map params = Map.of( diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala index 321d1ccd700f2..054c44f7286b7 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala @@ -369,9 +369,8 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig 1 << 15, // UTF8_BINARY mandatory zero bit 15 breach. 1 << 16, // UTF8_BINARY mandatory zero bit 16 breach. 1 << 17, // UTF8_BINARY mandatory zero bit 17 breach. - 1 << 18, // UTF8_BINARY mandatory zero bit 18 breach. - 1 << 19, // UTF8_BINARY mandatory zero bit 19 breach. 1 << 20, // UTF8_BINARY mandatory zero bit 20 breach. + 1 << 21, // UTF8_BINARY mandatory zero bit 21 breach. 1 << 23, // UTF8_BINARY mandatory zero bit 23 breach. 1 << 24, // UTF8_BINARY mandatory zero bit 24 breach. 1 << 25, // UTF8_BINARY mandatory zero bit 25 breach. @@ -382,8 +381,6 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig (1 << 29) | (1 << 13), // ICU mandatory zero bit 13 breach. (1 << 29) | (1 << 14), // ICU mandatory zero bit 14 breach. (1 << 29) | (1 << 15), // ICU mandatory zero bit 15 breach. - (1 << 29) | (1 << 18), // ICU mandatory zero bit 18 breach. - (1 << 29) | (1 << 19), // ICU mandatory zero bit 19 breach. (1 << 29) | (1 << 20), // ICU mandatory zero bit 20 breach. (1 << 29) | (1 << 21), // ICU mandatory zero bit 21 breach. (1 << 29) | (1 << 22), // ICU mandatory zero bit 22 breach. 
diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index e83202d9e5ee3..9a78c29dc9593 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -4886,11 +4886,6 @@ "Catalog does not support ." ] }, - "COLLATION" : { - "message" : [ - "Collation is not yet supported." - ] - }, "COMBINATION_QUERY_RESULT_CLAUSES" : { "message" : [ "Combination of ORDER BY/SORT BY/DISTRIBUTE BY/CLUSTER BY." @@ -5117,6 +5112,11 @@ "message" : [ "TRANSFORM with SERDE is only supported in hive mode." ] + }, + "TRIM_COLLATION" : { + "message" : [ + "TRIM specifier in the collation." + ] } }, "sqlState" : "0A000" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala index d45ca533f9392..baace414fd9b0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala @@ -52,6 +52,10 @@ object CollateExpressionBuilder extends ExpressionBuilder { if (evalCollation == null) { throw QueryCompilationErrors.unexpectedNullError("collation", collationExpr) } else { + if (!SQLConf.get.trimCollationEnabled && + evalCollation.toString.toUpperCase(java.util.Locale.ROOT).contains("TRIM")) { + throw QueryCompilationErrors.trimCollationNotEnabledError() + } Collate(e, evalCollation.toString) } case (_: StringType, false) => throw QueryCompilationErrors.nonFoldableArgumentError( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 52529bb4b789b..2bed131341893 100644 ---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -2557,6 +2557,10 @@ class AstBuilder extends DataTypeAstBuilder } override def visitCollateClause(ctx: CollateClauseContext): String = withOrigin(ctx) { + val collationName = ctx.collationName.getText.toUpperCase(java.util.Locale.ROOT) + if (!SQLConf.get.trimCollationEnabled && collationName.contains("TRIM")) { + throw QueryCompilationErrors.trimCollationNotEnabledError() + } ctx.identifier.getText } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 0b5255e95f073..0d27f7bedbd3e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -351,6 +351,13 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat ) } + def trimCollationNotEnabledError(): Throwable = { + new AnalysisException( + errorClass = "UNSUPPORTED_FEATURE.TRIM_COLLATION", + messageParameters = Map.empty + ) + } + def unresolvedUsingColForJoinError( colName: String, suggestion: String, side: String): Throwable = { new AnalysisException( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 2eaafde52228b..560c48f580986 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -759,6 +759,18 @@ object SQLConf { .checkValue(_ > 0, "The initial number of partitions must be positive.") .createOptional + lazy val TRIM_COLLATION_ENABLED = + buildConf("spark.sql.collation.trim.enabled") + .internal() + .doc( + "Trim collation feature is under
development and its use should be done under this " + + "feature flag. Trim collation trims leading, trailing or both spaces depending on " + + "specifier (LTRIM, RTRIM, TRIM)." + ) + .version("4.0.0") + .booleanConf + .createWithDefault(Utils.isTesting) + val DEFAULT_COLLATION = buildConf(SqlApiConfHelper.DEFAULT_COLLATION) .doc("Sets default collation to use for string literals, parameter markers or the string" + @@ -5456,6 +5468,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { } } + def trimCollationEnabled: Boolean = getConf(TRIM_COLLATION_ENABLED) + override def defaultStringType: StringType = { if (getConf(DEFAULT_COLLATION).toUpperCase(Locale.ROOT) == "UTF8_BINARY") { StringType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index a8261e5d98ba0..ee646f7180624 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -149,6 +149,10 @@ class SparkSqlAstBuilder extends AstBuilder { * }}} */ override def visitSetCollation(ctx: SetCollationContext): LogicalPlan = withOrigin(ctx) { + val collationName = ctx.collationName.getText.toUpperCase(Locale.ROOT) + if (!SQLConf.get.trimCollationEnabled && collationName.contains("TRIM")) { + throw QueryCompilationErrors.trimCollationNotEnabledError() + } val key = SQLConf.DEFAULT_COLLATION.key SetCommand(Some(key -> Some(ctx.identifier.getText.toUpperCase(Locale.ROOT)))) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala index 73fd897e91f53..e15b8cfbf3f08 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala @@ -44,27 +44,57 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper { private val allFileBasedDataSources = collationPreservingSources ++ collationNonPreservingSources test("collate returns proper type") { - Seq("utf8_binary", "utf8_lcase", "unicode", "unicode_ci").foreach { collationName => + Seq( + "utf8_binary", + "utf8_lcase", + "unicode", + "unicode_ci", + "unicode_ltrim_ci", + "utf8_lcase_trim", + "utf8_binary_rtrim" + ).foreach { collationName => checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) val collationId = CollationFactory.collationNameToId(collationName) - assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType - == StringType(collationId)) + assert( + sql(s"select 'aaa' collate $collationName").schema(0).dataType + == StringType(collationId) + ) } } test("collation name is case insensitive") { - Seq("uTf8_BiNaRy", "utf8_lcase", "uNicOde", "UNICODE_ci").foreach { collationName => + Seq( + "uTf8_BiNaRy", + "utf8_lcase", + "uNicOde", + "UNICODE_ci", + "uNiCoDE_ltRIm_cI", + "UtF8_lCaSE_tRIM", + "utf8_biNAry_RtRiM" + ).foreach { collationName => checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) val collationId = CollationFactory.collationNameToId(collationName) - assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType - == StringType(collationId)) + assert( + sql(s"select 'aaa' collate $collationName").schema(0).dataType + == StringType(collationId) + ) } } test("collation expression returns name of collation") { - Seq("utf8_binary", "utf8_lcase", "unicode", "unicode_ci").foreach { collationName => + Seq( + "utf8_binary", + "utf8_lcase", + "unicode", + "unicode_ci", + "unicode_ci_ltrim", + "utf8_lcase_trim", + "utf8_binary_rtrim" + ).foreach { collationName => checkAnswer( - sql(s"select collation('aaa' collate $collationName)"), Row(collationName.toUpperCase())) + sql(s"select collation('aaa' collate $collationName)"), + Row(collationName.toUpperCase()) + ) } } @@ -77,9 +107,15 @@ class CollationSuite extends DatasourceV2SQLBase 
with AdaptiveSparkPlanHelper { test("collate function syntax with default collation set") { withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UTF8_LCASE") { - assert(sql(s"select collate('aaa', 'utf8_lcase')").schema(0).dataType == - StringType("UTF8_LCASE")) + assert( + sql(s"select collate('aaa', 'utf8_lcase')").schema(0).dataType == + StringType("UTF8_LCASE") + ) assert(sql(s"select collate('aaa', 'UNICODE')").schema(0).dataType == StringType("UNICODE")) + assert( + sql(s"select collate('aaa', 'UNICODE_TRIM')").schema(0).dataType == + StringType("UNICODE_TRIM") + ) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala index 832e1873af6a4..5abdca326f2fd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala @@ -868,6 +868,39 @@ class QueryCompilationErrorsSuite "inputTypes" -> "[\"INT\", \"STRING\", \"STRING\"]")) } + test("SPARK-49666: the trim collation feature is off without collate builder call") { + withSQLConf(SQLConf.TRIM_COLLATION_ENABLED.key -> "false") { + Seq( + "CREATE TABLE t(col STRING COLLATE EN_TRIM_CI) USING parquet", + "CREATE TABLE t(col STRING COLLATE UTF8_LCASE_TRIM) USING parquet", + "SELECT 'aaa' COLLATE UNICODE_LTRIM_CI" + ).foreach { sqlText => + checkError( + exception = intercept[AnalysisException](sql(sqlText)), + condition = "UNSUPPORTED_FEATURE.TRIM_COLLATION" + ) + } + } + } + + test("SPARK-49666: the trim collation feature is off with collate builder call") { + withSQLConf(SQLConf.TRIM_COLLATION_ENABLED.key -> "false") { + Seq( + "SELECT collate('aaa', 'UNICODE_TRIM')", + "SELECT collate('aaa', 'UTF8_BINARY_TRIM')", + "SELECT collate('aaa', 'EN_AI_RTRIM')" + ).foreach { sqlText => + checkError( + exception = intercept[AnalysisException](sql(sqlText)), + condition = 
"UNSUPPORTED_FEATURE.TRIM_COLLATION", + parameters = Map.empty, + context = + ExpectedContext(fragment = sqlText.substring(7), start = 7, stop = sqlText.length - 1) + ) + } + } + } + test("UNSUPPORTED_CALL: call the unsupported method update()") { checkError( exception = intercept[SparkUnsupportedOperationException] { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index 82795e551b6bf..094c65c63bfdc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -517,6 +517,13 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { "confName" -> "spark.sql.session.collation.default", "proposals" -> "UNICODE" )) + + withSQLConf(SQLConf.TRIM_COLLATION_ENABLED.key -> "false") { + checkError( + exception = intercept[AnalysisException](sql(s"SET COLLATION UNICODE_CI_TRIM")), + condition = "UNSUPPORTED_FEATURE.TRIM_COLLATION" + ) + } } test("SPARK-43028: config not found error") {