From 72351636e62e69a15b6e3f6694127bf7d4907d32 Mon Sep 17 00:00:00 2001 From: Edward Slavin Date: Mon, 11 Sep 2023 13:15:58 -0400 Subject: [PATCH] Support for UTF-8 chars in TfDictionaryLessThan - Added support for UTF-8 characters in sorting algorithm - Added additional tests with strings containing UTF-8 characters --- pxr/base/tf/stringUtils.cpp | 24 ++++++++++++++--- pxr/base/tf/stringUtils.h | 18 ++++++++++--- pxr/base/tf/testenv/stringUtils.cpp | 34 +++++++++++++++++++++++- pxr/base/tf/testenv/testTfStringUtils.py | 10 +++++++ 4 files changed, 78 insertions(+), 8 deletions(-) diff --git a/pxr/base/tf/stringUtils.cpp b/pxr/base/tf/stringUtils.cpp index 044c55aefb..2eae3ea6cd 100644 --- a/pxr/base/tf/stringUtils.cpp +++ b/pxr/base/tf/stringUtils.cpp @@ -62,6 +62,13 @@ using std::vector; PXR_NAMESPACE_OPEN_SCOPE +namespace { + bool _IsASCIIValue(char c) + { + return (c & (1<<7)) == 0; + } +} + string TfVStringPrintf(const std::string& fmt, va_list ap) { @@ -802,7 +809,8 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const } l = *lcur, r = *rcur; // If they are letters that differ disregarding case, we're done. - if (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f)) { + // but only if they are ASCII (i.e., the high bit is not set) + if (_IsASCIIValue(l) && _IsASCIIValue(r) && (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f))) { // Add 5 mod 32 makes '_' sort before all letters. return ((l + 5) & 31) < ((r + 5) & 31); } @@ -871,7 +879,10 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const } else if (IsDigit(l) | IsDigit(r)) { if (lcur == lstr.c_str()) { - return l < r; + // special case could occur where one is a digit, but the other + // might be a UTF-8 character, so we have to treat them + // as unsigned in the comparison + return static_cast<unsigned char>(l) < static_cast<unsigned char>(r); } // If one is a digit (but not both), then we have to check the // preceding character to determine the outcome. 
If the @@ -884,7 +895,12 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const } else if (!IsAlpha(l) || !IsAlpha(r)) { // At least one isn't a letter. - return l < r; + // this either means it's an ASCII symbol (digit was checked above) + // or a utf-8 encoded unicode character + // we want ASCII symbols to sort first, so we can't treat the + // utf-8 characters as signed (and all ASCII values remain the same + // after the cast) + return static_cast<unsigned char>(l) < static_cast<unsigned char>(r); } else { // Both letters, differ by case, continue. @@ -909,7 +925,7 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const rstr.c_str()); l = *lcur, r = *rcur; - return (r == '0') | ((l != '0') & (l < r)); + return (r == '0') | ((l != '0') & (static_cast<unsigned char>(l) < static_cast<unsigned char>(r))); } diff --git a/pxr/base/tf/stringUtils.h b/pxr/base/tf/stringUtils.h index ae0a01f5ab..2e5eabb5a6 100644 --- a/pxr/base/tf/stringUtils.h +++ b/pxr/base/tf/stringUtils.h @@ -496,6 +496,17 @@ TfMatchedStringTokenize(const std::string& source, /// Characters whose ASCII value are inbetween upper- and lowercase letters, /// such as underscore, are sorted to come after all letters. /// +/// \note This comparison is used for the runtime to give a deterministic ordering to strings. +/// ASCII strings will lexicographically sort according to the rules above. +/// Strings with Unicode characters will follow these same rules until a +/// Unicode character is encountered in which case it will be byte compared +/// with the character in the other string. Multi-byte encoded characters +/// will operate this way for each of the bytes. +/// Note that this results in a non-lexicographic ordering of strings that +/// contain Unicode characters. Clients interested in sorting strings +/// lexicographically should not rely on this function for doing so and should +/// instead use a custom sort function (or use one provided by an already +/// existing library such as Qt or ICU). 
struct TfDictionaryLessThan { /// Return true if \p lhs is less than \p rhs in dictionary order. /// @@ -512,14 +523,15 @@ struct TfDictionaryLessThan { inline bool operator()(const std::string &lhs, const std::string &rhs) const { // Check first chars first. By far, it is most common that these - // characters are letters of the same case that differ, or of different + // characters are ASCII letters of the same case that differ, or of different // case that differ. It is very rare that we have to account for - // different cases, or numerical comparisons, so we special-case this + // different cases, or numerical comparisons, or UTF-8 characters so we special-case this // first. char l = lhs.c_str()[0], r = rhs.c_str()[0]; - if (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f)) { + if ((((l & (1<<7)) == 0) && ((r & (1<<7)) == 0)) && (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f))) { // This bit about add 5 mod 32 makes it so that '_' sorts less than // all letters, which preserves existing behavior. 
+ // but only when both characters are ASCII characters return ((l + 5) & 31) < ((r + 5) & 31); } else { diff --git a/pxr/base/tf/testenv/stringUtils.cpp b/pxr/base/tf/testenv/stringUtils.cpp index e404a687c4..90d112ce44 100644 --- a/pxr/base/tf/testenv/stringUtils.cpp +++ b/pxr/base/tf/testenv/stringUtils.cpp @@ -195,7 +195,39 @@ TestPreds() "primvars:curveHierarchy:id")); TF_AXIOM(DictLessThan("primvars:curveHierarchy:id", "primvars:curveHierarchy__id")); - + + // basic UTF-8 character tests + // U+00FC (C3 BC) U+0061 (61) + // U+1300A (F0 93 80 8A) U+0041 (41) + // U+222B (E2 88 AB) U+003D (3D) + // U+0F22 (E0 BC A2) U+0036 (36) + // U+0F22 (E0 BC A2) U+0F28 (E0 BC A8) + TF_AXIOM(!DictLessThan("ü", "a")); + TF_AXIOM(!DictLessThan("𓀊", "A")); + TF_AXIOM(!DictLessThan("∫", "=")); + TF_AXIOM(!DictLessThan("༢", "6")); + TF_AXIOM(DictLessThan("༢", "༨")); + TF_AXIOM(DictLessThan("_", "㤼")); + TF_AXIOM(DictLessThan("_a", "_a㤼")); + TF_AXIOM(DictLessThan("6", "_a")); + TF_AXIOM(!DictLessThan("2_༢1", "2_༢")); + TF_AXIOM(!DictLessThan("∫∫", "∫=")); + + // U+03C7 (CF 87) U+03C0 (CF 80) + TF_AXIOM(!DictLessThan("a00χ", "a0π")); + TF_AXIOM(!DictLessThan("00χ", "0π")); + + // additional tests for UTF-8 characters in the loop + // U+393B (E3 A4 BB) U+393C (E3 A4 BC) + // U+393B (E3 A4 BB) U+393A (E3 A4 BA) + // U+393B (E3 A4 BB) U+393B (E3 A4 BB) + // U+00FC (C3 BC) U+0061 (61) + TF_AXIOM(DictLessThan("foo001bar001abc㤻", "foo001bar001abc㤼")); + TF_AXIOM(!DictLessThan("foo001㤻bar01abc", "foo001㤺bar001abc")); + TF_AXIOM(!DictLessThan("foo001㤻bar001abc", "foo001㤻bar001abc")); + TF_AXIOM(!DictLessThan("foo00001bar0002ü", "foo001bar002abc")); + TF_AXIOM(DictLessThan("üfoo", "㤻foo")); + TF_AXIOM(TfIsValidIdentifier("f")); TF_AXIOM(TfIsValidIdentifier("foo")); TF_AXIOM(TfIsValidIdentifier("foo1")); diff --git a/pxr/base/tf/testenv/testTfStringUtils.py b/pxr/base/tf/testenv/testTfStringUtils.py index fba6946e54..9f82db0364 100644 --- a/pxr/base/tf/testenv/testTfStringUtils.py +++ 
b/pxr/base/tf/testenv/testTfStringUtils.py @@ -72,6 +72,16 @@ def test_Unicode(self): self.assertEqual(Tf.DictionaryStrcmp(u'apple', 'banana'), -1) self.assertEqual(Tf.DictionaryStrcmp(u'apple', u'banana'), -1) + # U+393B < U+393C + # U+393B > U+393A + # U+393B == U+393B + # U+00FC > U+0030 because 0 is a digit and hence the word + # prefix is less than that of `aü` + self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤼'), -1) + self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤺'), 1) + self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤻'), 0) + self.assertEqual(Tf.DictionaryStrcmp('aü', 'a0'), 1) + def test_StringToLong(self): def checks(val):