PixarAnimationStudios · erslavin · Dec 2, 2022 · Dec 2, 2022 · Mar 30, 2023 · Mar 31, 2023
diff --git a/pxr/base/tf/CMakeLists.txt b/pxr/base/tf/CMakeLists.txt
@@ -81,6 +81,7 @@ pxr_library(tf
         type
         typeFunctions
         typeNotice
+        unicodeUtils
         warning
         weakBase
         weakPtr
@@ -172,6 +173,7 @@ pxr_library(tf
     CPPFILES
         initConfig.cpp
         preprocessorUtils.cpp
+        unicodeCharacterClasses.cpp
         pxrDoubleConversion/double-conversion.cc
         pxrDoubleConversion/bignum-dtoa.cc
         pxrDoubleConversion/bignum.cc

diff --git a/pxr/base/tf/stringUtils.cpp b/pxr/base/tf/stringUtils.cpp
@@ -63,6 +63,32 @@ using std::vector;
 
 PXR_NAMESPACE_OPEN_SCOPE
 
+TF_DEFINE_ENV_SETTING(TF_UTF8_IDENTIFIERS, 
+    true, // value is read once and cached by UseUTF8Identifiers()
+    "Allow UTF8 strings as identifiers and prim names");
+
+namespace {
+    // True when c is a 7-bit ASCII code unit (0..127); works whether plain
+    // char is signed or unsigned.
+    bool _IsASCIIValue(char c)
+    {
+        return static_cast<unsigned char>(c) <= 127;
+    }
+
+    // True when every byte of str is 7-bit ASCII (vacuously so when empty).
+    bool _IsInASCIIValueRange(const std::string& str)
+    {
+        for (const char c : str)
+        {
+            if (!_IsASCIIValue(c))
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+}
+
 string
 TfVStringPrintf(const std::string& fmt, va_list ap)
 {
@@ -269,10 +295,6 @@ TfStringToUpper(const string &source)
 string
 TfStringCapitalize(const string& source)
 {
-    if (source.empty()) {
-        return source;
-    }
-
     string result(source);
     result[0] = toupper(result[0]);
 
@@ -294,6 +316,8 @@ TfStringGetCommonPrefix(string a, string b)
 string
 TfStringGetSuffix(const string& name, char delimiter)
 {
+    TF_DEV_AXIOM(_IsASCIIValue(delimiter));
+
     size_t i = name.rfind(delimiter);
     if (i == string::npos)
         return "";
@@ -304,6 +328,8 @@ TfStringGetSuffix(const string& name, char delimiter)
 string
 TfStringGetBeforeSuffix(const string& name, char delimiter)
 {
+    TF_DEV_AXIOM(_IsASCIIValue(delimiter));
+
     size_t i = name.rfind(delimiter);
     if (i == string::npos)
         return name;
@@ -484,6 +510,8 @@ TfStringSplit(string const &src, string const &separator)
 vector<string>
 TfStringTokenize(string const &src, const char* delimiters)
 {
+    TF_DEV_AXIOM(_IsInASCIIValueRange(delimiters));
+
     vector<pair<char const *, char const *> > segments;
     _TokenizeToSegments(src, delimiters, segments);
 
@@ -497,6 +525,8 @@ TfStringTokenize(string const &src, const char* delimiters)
 set<string>
 TfStringTokenizeToSet(string const &src, const char* delimiters)
 {
+    TF_DEV_AXIOM(_IsInASCIIValueRange(delimiters));
+
     vector<pair<char const *, char const *> > segments;
     _TokenizeToSegments(src, delimiters, segments);
 
@@ -523,7 +553,9 @@ _FindFirstOfNotEscaped(const string &source, const char *toFind, size_t offset)
 vector<string>
 TfQuotedStringTokenize(const string &source, const char *delimiters, 
                        string *errors)
-{    
+{   
+    TF_DEV_AXIOM(_IsInASCIIValueRange(delimiters));
+
     vector<string> resultVec;
     size_t j, quoteIndex, delimIndex;
     const char *quotes = "\"\'`";
@@ -801,7 +833,8 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
         }
         l = *lcur, r = *rcur;
         // If they are letters that differ disregarding case, we're done.
-        if (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f)) {
+        // but only if they are ASCII (i.e., the high bit is not set)
+        if ((((l & (1<<7)) == 0) && ((r & (1<<7)) == 0)) && (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f))) {
             // Add 5 mod 32 makes '_' sort before all letters.
             return ((l + 5) & 31) < ((r + 5) & 31);
         }
@@ -883,7 +916,12 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
         }
         else if (!IsAlpha(l) || !IsAlpha(r)) {
             // At least one isn't a letter.
-            return l < r;
+            // This means at least one is either an ASCII symbol (digits were
+            // checked above) or a byte of a UTF-8 encoded Unicode character.
+            // We want ASCII symbols to sort first, so the bytes must be
+            // compared as unsigned: a signed char would make UTF-8 bytes
+            // (>= 0x80) negative. ASCII values are unchanged by the cast.
+            return static_cast<unsigned char>(l) < static_cast<unsigned char>(r);
         }
         else {
             // Both letters, differ by case, continue.
@@ -1150,37 +1188,52 @@ TfStringCatPaths( const string &prefix, const string &suffix )
     return TfNormPath(prefix + "/" + suffix);
 }
 
+bool UseUTF8Identifiers()
+{
+    // Query TF_UTF8_IDENTIFIERS a single time; the function-local static
+    // keeps the answer for every later call.
+    static const bool enabled = TfGetEnvSetting(TF_UTF8_IDENTIFIERS);
+    return enabled;
+}
+
 std::string
 TfMakeValidIdentifier(const std::string &in)
 {
-    std::string result;
-
-    if (in.empty()) {
-        result.push_back('_');
-        return result;
+    if (UseUTF8Identifiers()) // UTF-8 mode: hand the whole string to TfUnicodeUtils
+    {
+        return TfUnicodeUtils::MakeValidUTF8Identifier(in);
     }
+    else // ASCII mode: keep [A-Za-z0-9_], replace every other byte with '_'
+    {
+        std::string result;
 
-    result.reserve(in.size());
-    char const *p = in.c_str();
-    if (!(('a' <= *p && *p <= 'z') || 
-          ('A' <= *p && *p <= 'Z') || 
-          *p == '_')) {
-        result.push_back('_');
-    } else {
-        result.push_back(*p);
-    }
+        if (in.empty()) { // empty input yields "_"
+            result.push_back('_');
+            return result;
+        }
 
-    for (++p; *p; ++p) {
-        if (!(('a' <= *p && *p <= 'z') ||    
-              ('A' <= *p && *p <= 'Z') ||  
-              ('0' <= *p && *p <= '9') ||  
-              *p == '_')) {
+        result.reserve(in.size());
+        char const *p = in.c_str();
+        if (!(('a' <= *p && *p <= 'z') || 
+            ('A' <= *p && *p <= 'Z') || 
+            *p == '_')) { // first char must be a letter or '_' (no digits)
             result.push_back('_');
         } else {
             result.push_back(*p);
         }
+
+        for (++p; *p; ++p) { // stops at the first NUL byte of c_str()
+            if (!(('a' <= *p && *p <= 'z') ||    
+                ('A' <= *p && *p <= 'Z') ||  
+                ('0' <= *p && *p <= '9') ||  
+                *p == '_')) { // later chars may also be digits
+                result.push_back('_');
+            } else {
+                result.push_back(*p);
+            }
+        }
+        return result;
     }
-    return result;
 }
 
 std::string