Merge pull request #2673 from erslavin/utf8

Support for UTF-8 chars in TfDictionaryLessThan (Internal change: 2301635)
PixarAnimationStudios · Nov 1, 2023 · 9708720 · 9708720
2 parents 9770341 + 7235163
commit 9708720
Show file tree

Hide file tree

Showing 4 changed files with 72 additions and 9 deletions.
diff --git a/pxr/base/tf/stringUtils.cpp b/pxr/base/tf/stringUtils.cpp
@@ -793,7 +793,7 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
         return false;
     }
 
-    char l, r;
+    unsigned char l, r;
 
     while (true) {
         if (lcur == curEnd) {
@@ -802,7 +802,11 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
         }
         l = *lcur, r = *rcur;
         // If they are letters that differ disregarding case, we're done.
-        if (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f)) {
+        // but only if they are ASCII (i.e., the high bit is not set)
+        const bool bothAscii = l < 0x80 && r < 0x80;
+        const bool differsIgnoringCase = (l & ~0x20) != (r & ~0x20);
+        const bool inLetterZone = (l >= 0x40) && (r >= 0x40);
+        if (bothAscii && differsIgnoringCase && inLetterZone) {
             // Add 5 mod 32 makes '_' sort before all letters.
             return ((l + 5) & 31) < ((r + 5) & 31);
         }

diff --git a/pxr/base/tf/stringUtils.h b/pxr/base/tf/stringUtils.h
@@ -31,6 +31,7 @@
 #include "pxr/pxr.h"
 
 #include "pxr/base/arch/attributes.h"
+#include "pxr/base/arch/hints.h"
 #include "pxr/base/arch/inttypes.h"
 #include "pxr/base/tf/api.h"
 #include "pxr/base/tf/enum.h"
@@ -510,6 +511,20 @@ TfMatchedStringTokenize(const std::string& source,
 /// Characters whose ASCII values fall between the upper- and lowercase
 /// letters, such as underscore, are sorted to come before all letters.
 ///
+/// \note This comparison is used for the runtime to give a deterministic
+/// ordering to strings.
+///
+/// ASCII strings will sort lexicographically according to the rules above.
+/// Strings containing non-ASCII Unicode characters follow these same rules
+/// until a multi-byte UTF-8 sequence is encountered, at which point its
+/// bytes are compared, one at a time, with the bytes at the same positions
+/// in the other string.
+///
+/// Note that this results in a non-lexicographic ordering of strings that
+/// contain non-ASCII characters.  Clients interested in sorting strings
+/// lexicographically should not rely on this function for doing so and should
+/// instead use a custom sort function (or use one provided by an already
+/// existing library such as Qt or ICU).
 struct TfDictionaryLessThan {
     /// Return true if \p lhs is less than \p rhs in dictionary order.
     ///
@@ -526,12 +541,14 @@ struct TfDictionaryLessThan {
     inline bool operator()(const std::string &lhs,
                            const std::string &rhs) const {
         // Check first chars first.  By far, it is most common that these
-        // characters are letters of the same case that differ, or of different
-        // case that differ.  It is very rare that we have to account for
-        // different cases, or numerical comparisons, so we special-case this
-        // first.
-        char l = lhs.c_str()[0], r = rhs.c_str()[0];
-        if (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f)) {
+        // characters are ASCII letters that differ.  It is very rare that we
+        // have to account for different cases, numerical comparisons, or
+        // UTF-8 characters, so we handle the common case first.
+        const unsigned char l = lhs.c_str()[0], r = rhs.c_str()[0];
+        const bool bothAscii = l < 0x80 && r < 0x80;
+        const bool differsIgnoringCase = (l & ~0x20) != (r & ~0x20);
+        const bool inLetterZone = (l >= 0x40) && (r >= 0x40);
+        if (ARCH_LIKELY(bothAscii && differsIgnoringCase && inLetterZone)) {
             // This bit about add 5 mod 32 makes it so that '_' sorts less than
             // all letters, which preserves existing behavior.
             return ((l + 5) & 31) < ((r + 5) & 31);

diff --git a/pxr/base/tf/testenv/stringUtils.cpp b/pxr/base/tf/testenv/stringUtils.cpp
@@ -195,7 +195,39 @@ TestPreds()
                            "primvars:curveHierarchy:id"));
     TF_AXIOM(DictLessThan("primvars:curveHierarchy:id",
                           "primvars:curveHierarchy__id"));
-
+
+    // basic UTF-8 character tests
+    // U+00FC (C3 BC)           U+0061 (61)
+    // U+1300A (F0 93 80 8A)    U+0041 (41)
+    // U+222B (E2 88 AB)        U+003D (3D)
+    // U+0F22 (E0 BC A2)        U+0036 (36)
+    // U+0F22 (E0 BC A2)        U+0F28 (E0 BC A8)
+    TF_AXIOM(!DictLessThan("ü", "a"));
+    TF_AXIOM(!DictLessThan("𓀊", "A"));
+    TF_AXIOM(!DictLessThan("∫", "="));
+    TF_AXIOM(!DictLessThan("༢", "6"));
+    TF_AXIOM(DictLessThan("༢", "༨"));
+    TF_AXIOM(DictLessThan("_", "㤼"));
+    TF_AXIOM(DictLessThan("_a", "_a㤼"));
+    TF_AXIOM(DictLessThan("6", "_a"));
+    TF_AXIOM(!DictLessThan("2_༢1", "2_༢"));
+    TF_AXIOM(!DictLessThan("∫∫", "∫="));
+
+    // U+03C7 (CF 87)  U+03C0 (CF 80)
+    TF_AXIOM(!DictLessThan("a00χ", "a0π"));
+    TF_AXIOM(!DictLessThan("00χ", "0π"));
+
+    // additional tests for UTF-8 characters in the loop
+    // U+393B (E3 A4 BB)        U+393C (E3 A4 BC)
+    // U+393B (E3 A4 BB)        U+393A (E3 A4 BA)
+    // U+393B (E3 A4 BB)        U+393B (E3 A4 BB)
+    // U+00FC (C3 BC)           U+0061 (61)
+    TF_AXIOM(DictLessThan("foo001bar001abc㤻", "foo001bar001abc㤼"));
+    TF_AXIOM(!DictLessThan("foo001㤻bar01abc", "foo001㤺bar001abc"));
+    TF_AXIOM(!DictLessThan("foo001㤻bar001abc", "foo001㤻bar001abc"));
+    TF_AXIOM(!DictLessThan("foo00001bar0002ü", "foo001bar002abc"));
+    TF_AXIOM(DictLessThan("üfoo", "㤻foo"));
+
     TF_AXIOM(TfIsValidIdentifier("f"));
     TF_AXIOM(TfIsValidIdentifier("foo"));
     TF_AXIOM(TfIsValidIdentifier("foo1"));

diff --git a/pxr/base/tf/testenv/testTfStringUtils.py b/pxr/base/tf/testenv/testTfStringUtils.py
@@ -72,6 +72,16 @@ def test_Unicode(self):
         self.assertEqual(Tf.DictionaryStrcmp(u'apple', 'banana'), -1)
         self.assertEqual(Tf.DictionaryStrcmp(u'apple', u'banana'), -1)
 
+        # U+393B < U+393C
+        # U+393B > U+393A
+        # U+393B == U+393B
+        # U+00FC > U+0030 because 0 is a digit and hence the word
+        # prefix is less than that of `aü`
+        self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤼'), -1)
+        self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤺'), 1)
+        self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤻'), 0)
+        self.assertEqual(Tf.DictionaryStrcmp('aü', 'a0'), 1)
+
     def test_StringToLong(self):
 
         def checks(val):