From 72351636e62e69a15b6e3f6694127bf7d4907d32 Mon Sep 17 00:00:00 2001
From: Edward Slavin <eslavin@nvidia.com>
Date: Mon, 11 Sep 2023 13:15:58 -0400
Subject: [PATCH] Support for UTF-8 chars in TfDictionaryLessThan

- Added support for UTF-8 characters in sorting algorithm
- Added additional tests with strings containing UTF-8 characters
---
 pxr/base/tf/stringUtils.cpp              | 24 ++++++++++++++---
 pxr/base/tf/stringUtils.h                | 18 ++++++++++---
 pxr/base/tf/testenv/stringUtils.cpp      | 34 +++++++++++++++++++++++-
 pxr/base/tf/testenv/testTfStringUtils.py | 10 +++++++
 4 files changed, 78 insertions(+), 8 deletions(-)
diff --git a/pxr/base/tf/stringUtils.cpp b/pxr/base/tf/stringUtils.cpp
index 044c55aefb..2eae3ea6cd 100644
--- a/pxr/base/tf/stringUtils.cpp
+++ b/pxr/base/tf/stringUtils.cpp
@@ -62,6 +62,13 @@ using std::vector;
 
 PXR_NAMESPACE_OPEN_SCOPE
 
+namespace {
+    bool _IsASCIIValue(char c)  // true when the high bit of c is clear
+    {
+        return (c & (1<<7)) == 0;  // bit 7 unset => 7-bit ASCII byte
+    }
+}
+
 string
 TfVStringPrintf(const std::string& fmt, va_list ap)
 {
@@ -802,7 +809,8 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
         }
         l = *lcur, r = *rcur;
         // If they are letters that differ disregarding case, we're done.
-        if (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f)) {
+        // but only if they are ASCII (i.e., the high bit is not set)
+        if (_IsASCIIValue(l) && _IsASCIIValue(r) && (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f))) {
             // Add 5 mod 32 makes '_' sort before all letters.
             return ((l + 5) & 31) < ((r + 5) & 31);
         }
@@ -871,7 +879,10 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
             }
             else if (IsDigit(l) | IsDigit(r)) {
                 if (lcur == lstr.c_str()) {
-                    return l < r;
+                    // special case could occur where one is a digit, but the other
+                    // might be a UTF-8 character, so we have to treat them
+                    // as unsigned in the comparison
+                    return static_cast<unsigned char>(l) < static_cast<unsigned char>(r);
                 }
                 // If one is a digit (but not both), then we have to check the
                 // preceding character to determine the outcome.  If the
@@ -884,7 +895,12 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
         }
         else if (!IsAlpha(l) || !IsAlpha(r)) {
             // At least one isn't a letter.
-            return l < r;
+            // This either means it's an ASCII symbol (digit was checked above)
+            // or a byte of a UTF-8 encoded Unicode character.
+            // We want ASCII symbols to sort first, so we must compare the
+            // bytes as unsigned: UTF-8 bytes have the high bit set and would be
+            // negative where char is signed (ASCII values are unchanged by the cast).
+            return static_cast<unsigned char>(l) < static_cast<unsigned char>(r);
         }
         else {
             // Both letters, differ by case, continue.
@@ -909,7 +925,7 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
         rstr.c_str());
     
     l = *lcur, r = *rcur;
-    return (r == '0') | ((l != '0') & (l < r));
+    return (r == '0') | ((l != '0') & (static_cast<unsigned char>(l) < static_cast<unsigned char>(r)));
 
 }
 
diff --git a/pxr/base/tf/stringUtils.h b/pxr/base/tf/stringUtils.h
index ae0a01f5ab..2e5eabb5a6 100644
--- a/pxr/base/tf/stringUtils.h
+++ b/pxr/base/tf/stringUtils.h
@@ -496,6 +496,17 @@ TfMatchedStringTokenize(const std::string& source,
 /// Characters whose ASCII value are inbetween upper- and lowercase letters,
 /// such as underscore, are sorted to come after all letters.
 ///
+/// \note The runtime uses this comparison to order strings deterministically.
+/// ASCII strings will sort lexicographically according to the rules above.
+/// Strings with Unicode characters will follow these same rules until a
+/// Unicode character is encountered, in which case it will be byte compared
+/// with the character in the other string.  Multi-byte encoded characters
+/// will operate this way for each of the bytes.
+/// Note that this results in a non-lexicographic ordering of strings that
+/// contain Unicode characters.  Clients interested in sorting strings
+/// lexicographically should not rely on this function for doing so and should
+/// instead use a custom sort function (or use one provided by an already
+/// existing library such as Qt or ICU).
 struct TfDictionaryLessThan {
     /// Return true if \p lhs is less than \p rhs in dictionary order.
     ///
@@ -512,14 +523,15 @@ struct TfDictionaryLessThan {
     inline bool operator()(const std::string &lhs,
                            const std::string &rhs) const {
         // Check first chars first.  By far, it is most common that these
-        // characters are letters of the same case that differ, or of different
+        // characters are ASCII letters of the same case that differ, or of different
         // case that differ.  It is very rare that we have to account for
-        // different cases, or numerical comparisons, so we special-case this
+        // different cases, or numerical comparisons, or UTF-8 characters so we special-case this
         // first.
         char l = lhs.c_str()[0], r = rhs.c_str()[0];
-        if (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f)) {
+        if ((((l & (1<<7)) == 0) && ((r & (1<<7)) == 0)) && (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f))) {
             // This bit about add 5 mod 32 makes it so that '_' sorts less than
             // all letters, which preserves existing behavior.
+            // but only when both characters are ASCII characters
             return ((l + 5) & 31) < ((r + 5) & 31);
         }
         else {
diff --git a/pxr/base/tf/testenv/stringUtils.cpp b/pxr/base/tf/testenv/stringUtils.cpp
index e404a687c4..90d112ce44 100644
--- a/pxr/base/tf/testenv/stringUtils.cpp
+++ b/pxr/base/tf/testenv/stringUtils.cpp
@@ -195,7 +195,39 @@ TestPreds()
                            "primvars:curveHierarchy:id"));
     TF_AXIOM(DictLessThan("primvars:curveHierarchy:id",
                           "primvars:curveHierarchy__id"));
-        
+    
+    // basic UTF-8 character tests
+    // U+00FC (C3 BC)           U+0061 (61)
+    // U+1300A (F0 93 80 8A)    U+0041 (41)
+    // U+222B (E2 88 AB)        U+003D (3D)
+    // U+0F22 (E0 BC A2)        U+0036 (36)
+    // U+0F22 (E0 BC A2)        U+0F28 (E0 BC A8)
+    TF_AXIOM(!DictLessThan("ü", "a"));
+    TF_AXIOM(!DictLessThan("𓀊", "A"));
+    TF_AXIOM(!DictLessThan("∫", "="));
+    TF_AXIOM(!DictLessThan("༢", "6"));
+    TF_AXIOM(DictLessThan("༢", "༨"));
+    TF_AXIOM(DictLessThan("_", "㤼"));
+    TF_AXIOM(DictLessThan("_a", "_a㤼"));
+    TF_AXIOM(DictLessThan("6", "_a"));
+    TF_AXIOM(!DictLessThan("2_༢1", "2_༢"));
+    TF_AXIOM(!DictLessThan("∫∫", "∫="));
+
+    // U+03C7 (CF 87)  U+03C0 (CF 80)
+    TF_AXIOM(!DictLessThan("a00χ", "a0π"));
+    TF_AXIOM(!DictLessThan("00χ", "0π"));
+
+    // additional tests for UTF-8 characters in the loop
+    // U+393B (E3 A4 BB)        U+393C (E3 A4 BC)
+    // U+393B (E3 A4 BB)        U+393A (E3 A4 BA)
+    // U+393B (E3 A4 BB)        U+393B (E3 A4 BB)
+    // U+00FC (C3 BC)           U+0061 (61)
+    TF_AXIOM(DictLessThan("foo001bar001abc㤻", "foo001bar001abc㤼"));
+    TF_AXIOM(!DictLessThan("foo001㤻bar01abc", "foo001㤺bar001abc"));
+    TF_AXIOM(!DictLessThan("foo001㤻bar001abc", "foo001㤻bar001abc"));
+    TF_AXIOM(!DictLessThan("foo00001bar0002ü", "foo001bar002abc"));
+    TF_AXIOM(DictLessThan("üfoo", "㤻foo"));
+
     TF_AXIOM(TfIsValidIdentifier("f"));
     TF_AXIOM(TfIsValidIdentifier("foo"));
     TF_AXIOM(TfIsValidIdentifier("foo1"));
diff --git a/pxr/base/tf/testenv/testTfStringUtils.py b/pxr/base/tf/testenv/testTfStringUtils.py
index fba6946e54..9f82db0364 100644
--- a/pxr/base/tf/testenv/testTfStringUtils.py
+++ b/pxr/base/tf/testenv/testTfStringUtils.py
@@ -72,6 +72,16 @@ def test_Unicode(self):
         self.assertEqual(Tf.DictionaryStrcmp(u'apple', 'banana'), -1)
         self.assertEqual(Tf.DictionaryStrcmp(u'apple', u'banana'), -1)
 
+        # U+393B < U+393C
+        # U+393B > U+393A
+        # U+393B == U+393B
+        # U+00FC > U+0030 because `0` is a digit, so the string `a0`
+        # sorts before `aü`
+        self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤼'), -1)
+        self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤺'), 1)
+        self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤻'), 0)
+        self.assertEqual(Tf.DictionaryStrcmp('aü', 'a0'), 1)
+
     def test_StringToLong(self):
 
         def checks(val):