Skip to content

Commit

Permalink
Support for UTF-8 chars in TfDictionaryLessThan
Browse files Browse the repository at this point in the history
- Added support for UTF-8 characters in sorting algorithm
- Added additional tests with strings containing UTF-8 characters
  • Loading branch information
erslavin committed Sep 13, 2023
1 parent 6988a51 commit 7235163
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 8 deletions.
24 changes: 20 additions & 4 deletions pxr/base/tf/stringUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,13 @@ using std::vector;

PXR_NAMESPACE_OPEN_SCOPE

namespace {

/// Reports whether \p c is a 7-bit ASCII code unit, i.e. whether its most
/// significant bit is clear. Any byte with the high bit set is treated as
/// non-ASCII.
bool _IsASCIIValue(char c)
{
    // Work on the unsigned byte so the check does not depend on whether
    // plain `char` is signed on the target platform.
    constexpr unsigned char highBitMask = 0x80;
    const unsigned char byte = static_cast<unsigned char>(c);
    return (byte & highBitMask) == 0;
}

} // anonymous namespace

string
TfVStringPrintf(const std::string& fmt, va_list ap)
{
Expand Down Expand Up @@ -802,7 +809,8 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
}
l = *lcur, r = *rcur;
// If they are letters that differ disregarding case, we're done.
if (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f)) {
// but only if they are ASCII (i.e., the high bit is not set)
if (_IsASCIIValue(l) && _IsASCIIValue(r) && (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f))) {
// Add 5 mod 32 makes '_' sort before all letters.
return ((l + 5) & 31) < ((r + 5) & 31);
}
Expand Down Expand Up @@ -871,7 +879,10 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
}
else if (IsDigit(l) | IsDigit(r)) {
if (lcur == lstr.c_str()) {
return l < r;
// special case could occur where one is a digit, but the other
// might be a UTF-8 character, so we have to treat them
// as unsigned in the comparison
return static_cast<unsigned char>(l) < static_cast<unsigned char>(r);
}
// If one is a digit (but not both), then we have to check the
// preceding character to determine the outcome. If the
Expand All @@ -884,7 +895,12 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
}
else if (!IsAlpha(l) || !IsAlpha(r)) {
// At least one isn't a letter.
return l < r;
// this either means it's an ASCII symbol (digit was checked above)
// or a byte of a UTF-8 encoded Unicode character.
// we want ASCII symbols to sort first, so we can't compare the
// UTF-8 bytes as signed chars; we compare them as unsigned instead
// (all ASCII values remain the same after the cast)
return static_cast<unsigned char>(l) < static_cast<unsigned char>(r);
}
else {
// Both letters, differ by case, continue.
Expand All @@ -909,7 +925,7 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
rstr.c_str());

l = *lcur, r = *rcur;
return (r == '0') | ((l != '0') & (l < r));
return (r == '0') | ((l != '0') & (static_cast<unsigned char>(l) < static_cast<unsigned char>(r)));

}

Expand Down
18 changes: 15 additions & 3 deletions pxr/base/tf/stringUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,17 @@ TfMatchedStringTokenize(const std::string& source,
/// Characters whose ASCII values are in between upper- and lowercase letters,
/// such as underscore, are sorted to come after all letters.
///
/// \note This comparison is used for the runtime to give a deterministic ordering to strings.
/// ASCII strings will lexicographically sort according to the rules below.
/// Strings with Unicode characters will follow these same rules until a
/// Unicode character is encountered in which case it will be byte compared
/// with the character in the other string. Multi-byte encoded characters
/// will operate this way for each of the bytes.
/// Note that this results in a non-lexicographic ordering of strings that
/// contain Unicode characters. Clients interested in sorting strings
/// lexicographically should not rely on this function for doing so and should
/// instead use a custom sort function (or use one provided by an already
/// existing library such as Qt or ICU).
struct TfDictionaryLessThan {
/// Return true if \p lhs is less than \p rhs in dictionary order.
///
Expand All @@ -512,14 +523,15 @@ struct TfDictionaryLessThan {
inline bool operator()(const std::string &lhs,
const std::string &rhs) const {
// Check first chars first. By far, it is most common that these
// characters are letters of the same case that differ, or of different
// characters are ASCII letters of the same case that differ, or of different
// case that differ. It is very rare that we have to account for
// different cases, or numerical comparisons, so we special-case this
// different cases, or numerical comparisons, or UTF-8 characters so we special-case this
// first.
char l = lhs.c_str()[0], r = rhs.c_str()[0];
if (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f)) {
if ((((l & (1<<7)) == 0) && ((r & (1<<7)) == 0)) && (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f))) {
// This bit about add 5 mod 32 makes it so that '_' sorts less than
// all letters, which preserves existing behavior.
// but only when both characters are ASCII characters
return ((l + 5) & 31) < ((r + 5) & 31);
}
else {
Expand Down
34 changes: 33 additions & 1 deletion pxr/base/tf/testenv/stringUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,39 @@ TestPreds()
"primvars:curveHierarchy:id"));
TF_AXIOM(DictLessThan("primvars:curveHierarchy:id",
"primvars:curveHierarchy__id"));


// basic UTF-8 character tests
// U+00FC (C3 BC) U+0061 (61)
// U+1300A (F0 93 80 8A) U+0041 (41)
// U+222B (E2 88 AB) U+003D (3D)
// U+0F22 (E0 BC A2) U+0036 (36)
// U+0F22 (E0 BC A2) U+0F28 (E0 BC A8)
TF_AXIOM(!DictLessThan("ü", "a"));
TF_AXIOM(!DictLessThan("𓀊", "A"));
TF_AXIOM(!DictLessThan("", "="));
TF_AXIOM(!DictLessThan("", "6"));
TF_AXIOM(DictLessThan("", ""));
TF_AXIOM(DictLessThan("_", ""));
TF_AXIOM(DictLessThan("_a", "_a㤼"));
TF_AXIOM(DictLessThan("6", "_a"));
TF_AXIOM(!DictLessThan("2_༢1", "2_༢"));
TF_AXIOM(!DictLessThan("∫∫", "∫="));

// U+03C7 (CF 87) U+03C0 (CF 80)
TF_AXIOM(!DictLessThan("a00χ", "a0π"));
TF_AXIOM(!DictLessThan("00χ", ""));

// additional tests for UTF-8 characters in the loop
// U+393B (E3 A4 BB) U+393C (E3 A4 BC)
// U+393B (E3 A4 BB) U+393A (E3 A4 BA)
// U+393B (E3 A4 BB) U+393B (E3 A4 BB)
// U+00FC (C3 BC) U+0061 (61)
TF_AXIOM(DictLessThan("foo001bar001abc㤻", "foo001bar001abc㤼"));
TF_AXIOM(!DictLessThan("foo001㤻bar01abc", "foo001㤺bar001abc"));
TF_AXIOM(!DictLessThan("foo001㤻bar001abc", "foo001㤻bar001abc"));
TF_AXIOM(!DictLessThan("foo00001bar0002ü", "foo001bar002abc"));
TF_AXIOM(DictLessThan("üfoo", "㤻foo"));

TF_AXIOM(TfIsValidIdentifier("f"));
TF_AXIOM(TfIsValidIdentifier("foo"));
TF_AXIOM(TfIsValidIdentifier("foo1"));
Expand Down
10 changes: 10 additions & 0 deletions pxr/base/tf/testenv/testTfStringUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,16 @@ def test_Unicode(self):
self.assertEqual(Tf.DictionaryStrcmp(u'apple', 'banana'), -1)
self.assertEqual(Tf.DictionaryStrcmp(u'apple', u'banana'), -1)

# U+393B < U+393C
# U+393B > U+393A
# U+393B == U+393B
# U+00FC > U+0030 because 0 is a digit and hence the word
# prefix is less than that of `aü`
self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤼'), -1)
self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤺'), 1)
self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤻'), 0)
self.assertEqual(Tf.DictionaryStrcmp('aü', 'a0'), 1)

def test_StringToLong(self):

def checks(val):
Expand Down

0 comments on commit 7235163

Please sign in to comment.