Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for Unicode UTF-8 Identifiers #2120

Closed
2 changes: 2 additions & 0 deletions pxr/base/tf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ pxr_library(tf
type
typeFunctions
typeNotice
unicodeUtils
warning
weakBase
weakPtr
Expand Down Expand Up @@ -172,6 +173,7 @@ pxr_library(tf
CPPFILES
initConfig.cpp
preprocessorUtils.cpp
unicodeCharacterClasses.cpp
pxrDoubleConversion/double-conversion.cc
pxrDoubleConversion/bignum-dtoa.cc
pxrDoubleConversion/bignum.cc
Expand Down
107 changes: 80 additions & 27 deletions pxr/base/tf/stringUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,32 @@ using std::vector;

PXR_NAMESPACE_OPEN_SCOPE

// Environment setting TF_UTF8_IDENTIFIERS (default: true).
// UseUTF8Identifiers() in this file reads it, and TfMakeValidIdentifier()
// uses that result to choose between the UTF-8 aware path and the
// ASCII-only path.
TF_DEFINE_ENV_SETTING(TF_UTF8_IDENTIFIERS,
true,
"Allow UTF8 strings as identifiers and prim names");

namespace {

// Returns true if \p c is a 7-bit ASCII value (0..127), i.e. its high bit is
// clear.  Testing the bit on the unsigned representation gives the same
// answer whether plain char is signed or unsigned on the platform.
bool _IsASCIIValue(const char& c)
{
    return (static_cast<unsigned char>(c) & 0x80u) == 0;
}

// Returns true if every char of \p str is a 7-bit ASCII value.  An empty
// string trivially satisfies this.
bool _IsInASCIIValueRange(const std::string& str)
{
    for (const char c : str) {
        if (!_IsASCIIValue(c)) {
            return false;
        }
    }

    return true;
}

} // anonymous namespace

string
TfVStringPrintf(const std::string& fmt, va_list ap)
{
Expand Down Expand Up @@ -269,10 +295,6 @@ TfStringToUpper(const string &source)
string
TfStringCapitalize(const string& source)
{
if (source.empty()) {
return source;
}

string result(source);
result[0] = toupper(result[0]);

Expand All @@ -294,6 +316,8 @@ TfStringGetCommonPrefix(string a, string b)
string
TfStringGetSuffix(const string& name, char delimiter)
{
TF_DEV_AXIOM(_IsASCIIValue(delimiter));

size_t i = name.rfind(delimiter);
if (i == string::npos)
return "";
Expand All @@ -304,6 +328,8 @@ TfStringGetSuffix(const string& name, char delimiter)
string
TfStringGetBeforeSuffix(const string& name, char delimiter)
{
TF_DEV_AXIOM(_IsASCIIValue(delimiter));

size_t i = name.rfind(delimiter);
if (i == string::npos)
return name;
Expand Down Expand Up @@ -484,6 +510,8 @@ TfStringSplit(string const &src, string const &separator)
vector<string>
TfStringTokenize(string const &src, const char* delimiters)
{
TF_DEV_AXIOM(_IsInASCIIValueRange(delimiters));

vector<pair<char const *, char const *> > segments;
_TokenizeToSegments(src, delimiters, segments);

Expand All @@ -497,6 +525,8 @@ TfStringTokenize(string const &src, const char* delimiters)
set<string>
TfStringTokenizeToSet(string const &src, const char* delimiters)
{
TF_DEV_AXIOM(_IsInASCIIValueRange(delimiters));

vector<pair<char const *, char const *> > segments;
_TokenizeToSegments(src, delimiters, segments);

Expand All @@ -523,7 +553,9 @@ _FindFirstOfNotEscaped(const string &source, const char *toFind, size_t offset)
vector<string>
TfQuotedStringTokenize(const string &source, const char *delimiters,
string *errors)
{
{
TF_DEV_AXIOM(_IsInASCIIValueRange(delimiters));

vector<string> resultVec;
size_t j, quoteIndex, delimIndex;
const char *quotes = "\"\'`";
Expand Down Expand Up @@ -801,7 +833,8 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
}
l = *lcur, r = *rcur;
// If they are letters that differ disregarding case, we're done.
if (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f)) {
// but only if they are ASCII (i.e., the high bit is not set)
if ((((l & (1<<7)) == 0) && ((r & (1<<7)) == 0)) && (((l & ~0x20) != (r & ~0x20)) & bool(l & r & ~0x3f))) {
erslavin marked this conversation as resolved.
Show resolved Hide resolved
// Add 5 mod 32 makes '_' sort before all letters.
return ((l + 5) & 31) < ((r + 5) & 31);
}
Expand Down Expand Up @@ -883,7 +916,12 @@ TfDictionaryLessThan::_LessImpl(const string& lstr, const string& rstr) const
}
else if (!IsAlpha(l) || !IsAlpha(r)) {
// At least one isn't a letter.
return l < r;
// This means it is either an ASCII symbol (digits were handled above)
// or a byte of a UTF-8 encoded Unicode character.
// We want ASCII symbols to sort first, so we must not compare the
// UTF-8 bytes as (possibly signed) chars, where they could be negative;
// compare them as unsigned instead. ASCII values are unchanged by the
// cast.
return static_cast<unsigned char>(l) < static_cast<unsigned char>(r);
}
else {
// Both letters, differ by case, continue.
Expand Down Expand Up @@ -1150,37 +1188,52 @@ TfStringCatPaths( const string &prefix, const string &suffix )
return TfNormPath(prefix + "/" + suffix);
}

// Reports whether the TF_UTF8_IDENTIFIERS env setting is enabled.
// The setting is queried once, when the function-local static is first
// initialized; later calls return the cached value.
bool UseUTF8Identifiers()
{
    static const bool enabled =
        (TfGetEnvSetting(TF_UTF8_IDENTIFIERS) == true);
    return enabled;
}

// Returns a valid identifier derived from \p in.
//
// When UseUTF8Identifiers() is true, the work is delegated to
// TfUnicodeUtils::MakeValidUTF8Identifier().
//
// Otherwise the ASCII-only rules apply:
//   - an empty input yields "_";
//   - the first character is kept only if it is [A-Za-z_], else it becomes
//     '_';
//   - each following character is kept only if it is [A-Za-z0-9_], else it
//     becomes '_'.
// The ASCII path walks the C string, so it stops at the first embedded NUL.
std::string
TfMakeValidIdentifier(const std::string &in)
{
    if (UseUTF8Identifiers()) {
        return TfUnicodeUtils::MakeValidUTF8Identifier(in);
    }

    std::string result;

    if (in.empty()) {
        result.push_back('_');
        return result;
    }

    result.reserve(in.size());
    char const *p = in.c_str();

    // Leading character: letters and underscore only (no digits).
    if (!(('a' <= *p && *p <= 'z') ||
          ('A' <= *p && *p <= 'Z') ||
          *p == '_')) {
        result.push_back('_');
    } else {
        result.push_back(*p);
    }

    // Remaining characters: letters, digits and underscore.
    for (++p; *p; ++p) {
        if (!(('a' <= *p && *p <= 'z') ||
              ('A' <= *p && *p <= 'Z') ||
              ('0' <= *p && *p <= '9') ||
              *p == '_')) {
            result.push_back('_');
        } else {
            result.push_back(*p);
        }
    }
    return result;
}

std::string
Expand Down
Loading