Skip to content

Commit

Permalink
Merge pull request #2673 from erslavin/utf8
Browse files Browse the repository at this point in the history
Support for UTF-8 chars in TfDictionaryLessThan

(Internal change: 2301635)
  • Loading branch information
pixar-oss committed Nov 1, 2023
2 parents 9770341 + 7235163 commit 9708720
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 9 deletions.
8 changes: 6 additions & 2 deletions pxr/base/tf/stringUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -793,7 +793,7 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
return false;
}

char l, r;
unsigned char l, r;

while (true) {
if (lcur == curEnd) {
Expand All @@ -802,7 +802,11 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
}
l = *lcur, r = *rcur;
// If they are letters that differ disregarding case, we're done.
if (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f)) {
// but only if they are ASCII (i.e., the high bit is not set)
const bool bothAscii = l < 0x80 && r < 0x80;
const bool differsIgnoringCase = (l & ~0x20) != (r & ~0x20);
const bool inLetterZone = (l >= 0x40) && (r >= 0x40);
if (bothAscii && differsIgnoringCase && inLetterZone) {
// Add 5 mod 32 makes '_' sort before all letters.
return ((l + 5) & 31) < ((r + 5) & 31);
}
Expand Down
29 changes: 23 additions & 6 deletions pxr/base/tf/stringUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "pxr/pxr.h"

#include "pxr/base/arch/attributes.h"
#include "pxr/base/arch/hints.h"
#include "pxr/base/arch/inttypes.h"
#include "pxr/base/tf/api.h"
#include "pxr/base/tf/enum.h"
Expand Down Expand Up @@ -510,6 +511,20 @@ TfMatchedStringTokenize(const std::string& source,
/// Characters whose ASCII values are in between upper- and lowercase letters,
/// such as underscore, are sorted to come before all letters.
///
/// \note This comparison is used for the runtime to give a deterministic
/// ordering to strings.
///
/// ASCII strings will sort lexicographically according to the rules below.
/// Strings with other Unicode characters will follow these same rules until a
/// multi-byte codepoint is encountered, in which case it will be byte-compared
/// with the bytes in the other string. Multi-byte encoded characters will
/// operate this way for each of the bytes.
///
/// Note that this results in a non-lexicographic ordering of strings that
/// contain non-ASCII characters. Clients interested in sorting strings
/// lexicographically should not rely on this function for doing so and should
/// instead use a custom sort function (or use one provided by an already
/// existing library such as Qt or ICU).
struct TfDictionaryLessThan {
/// Return true if \p lhs is less than \p rhs in dictionary order.
///
Expand All @@ -526,12 +541,14 @@ struct TfDictionaryLessThan {
inline bool operator()(const std::string &lhs,
const std::string &rhs) const {
// Check first chars first. By far, it is most common that these
// characters are letters of the same case that differ, or of different
// case that differ. It is very rare that we have to account for
// different cases, or numerical comparisons, so we special-case this
// first.
char l = lhs.c_str()[0], r = rhs.c_str()[0];
if (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f)) {
// characters are ASCII letters that differ. It is very rare that we
// have to account for different cases, or numerical comparisons, or
// UTF-8 characters so we special-case this first.
const unsigned char l = lhs.c_str()[0], r = rhs.c_str()[0];
const bool bothAscii = l < 0x80 && r < 0x80;
const bool differsIgnoringCase = (l & ~0x20) != (r & ~0x20);
const bool inLetterZone = (l >= 0x40) && (r >= 0x40);
if (ARCH_LIKELY(bothAscii && differsIgnoringCase && inLetterZone)) {
// This bit about add 5 mod 32 makes it so that '_' sorts less than
// all letters, which preserves existing behavior.
return ((l + 5) & 31) < ((r + 5) & 31);
Expand Down
34 changes: 33 additions & 1 deletion pxr/base/tf/testenv/stringUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,39 @@ TestPreds()
"primvars:curveHierarchy:id"));
TF_AXIOM(DictLessThan("primvars:curveHierarchy:id",
"primvars:curveHierarchy__id"));


// basic UTF-8 character tests
// U+00FC (C3 BC) U+0061 (61)
// U+1300A (F0 93 80 8A) U+0041 (41)
// U+222B (E2 88 AB) U+003D (3D)
// U+0F22 (E0 BC A2) U+0036 (36)
// U+0F22 (E0 BC A2) U+0F28 (E0 BC A8)
TF_AXIOM(!DictLessThan("ü", "a"));
TF_AXIOM(!DictLessThan("𓀊", "A"));
TF_AXIOM(!DictLessThan("∫", "="));
TF_AXIOM(!DictLessThan("༢", "6"));
TF_AXIOM(DictLessThan("༢", "༨"));
TF_AXIOM(DictLessThan("_", ""));
TF_AXIOM(DictLessThan("_a", "_a㤼"));
TF_AXIOM(DictLessThan("6", "_a"));
TF_AXIOM(!DictLessThan("2_༢1", "2_༢"));
TF_AXIOM(!DictLessThan("∫∫", "∫="));

// U+03C7 (CF 87) U+03C0 (CF 80)
TF_AXIOM(!DictLessThan("a00χ", "a0π"));
TF_AXIOM(!DictLessThan("00χ", ""));

// additional tests for UTF-8 characters in the loop
// U+393B (E3 A4 BB) U+393C (E3 A4 BC)
// U+393B (E3 A4 BB) U+393A (E3 A4 BA)
// U+393B (E3 A4 BB) U+393B (E3 A4 BB)
// U+00FC (C3 BC) U+0061 (61)
TF_AXIOM(DictLessThan("foo001bar001abc㤻", "foo001bar001abc㤼"));
TF_AXIOM(!DictLessThan("foo001㤻bar01abc", "foo001㤺bar001abc"));
TF_AXIOM(!DictLessThan("foo001㤻bar001abc", "foo001㤻bar001abc"));
TF_AXIOM(!DictLessThan("foo00001bar0002ü", "foo001bar002abc"));
TF_AXIOM(DictLessThan("üfoo", "㤻foo"));

TF_AXIOM(TfIsValidIdentifier("f"));
TF_AXIOM(TfIsValidIdentifier("foo"));
TF_AXIOM(TfIsValidIdentifier("foo1"));
Expand Down
10 changes: 10 additions & 0 deletions pxr/base/tf/testenv/testTfStringUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,16 @@ def test_Unicode(self):
self.assertEqual(Tf.DictionaryStrcmp(u'apple', 'banana'), -1)
self.assertEqual(Tf.DictionaryStrcmp(u'apple', u'banana'), -1)

# U+393B < U+393C
# U+393B > U+393A
# U+393B == U+393B
# U+00FC > U+0030 because 0 is a digit and hence the word
# prefix of `a0` is less than that of `aü`
self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤼'), -1)
self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤺'), 1)
self.assertEqual(Tf.DictionaryStrcmp('apple㤻', 'apple㤻'), 0)
self.assertEqual(Tf.DictionaryStrcmp('aü', 'a0'), 1)

def test_StringToLong(self):

def checks(val):
Expand Down

0 comments on commit 9708720

Please sign in to comment.