Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for Unicode UTF-8 Identifiers #2120

Closed
34 changes: 33 additions & 1 deletion pxr/base/tf/stringUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,28 @@ TF_DEFINE_ENV_SETTING(TF_UTF8_IDENTIFIERS,
false,
"Allow UTF8 strings as identifiers and prim names");

namespace {

/// Returns true if \p c is a 7-bit ASCII code unit, i.e. its value lies in
/// the range [0, 127].
///
/// The comparison is done on the unsigned representation of the byte so the
/// result does not depend on whether plain \c char is signed or unsigned on
/// the target platform.
bool _IsASCIIValue(char c)
{
    return static_cast<unsigned char>(c) <= 127;
}

/// Returns true if every byte of \p str is a 7-bit ASCII code unit.
///
/// An empty string contains no offending byte and therefore yields true.
bool _IsInASCIIValueRange(const std::string& str)
{
    for (const char c : str) {
        // Share the single-character predicate so both checks always agree.
        if (!_IsASCIIValue(c)) {
            return false;
        }
    }

    return true;
}

} // anonymous namespace

string
TfVStringPrintf(const std::string& fmt, va_list ap)
{
Expand Down Expand Up @@ -294,6 +316,8 @@ TfStringGetCommonPrefix(string a, string b)
string
TfStringGetSuffix(const string& name, char delimiter)
{
TF_DEV_AXIOM(_IsASCIIValue(delimiter));

size_t i = name.rfind(delimiter);
if (i == string::npos)
return "";
Expand All @@ -304,6 +328,8 @@ TfStringGetSuffix(const string& name, char delimiter)
string
TfStringGetBeforeSuffix(const string& name, char delimiter)
{
TF_DEV_AXIOM(_IsASCIIValue(delimiter));

size_t i = name.rfind(delimiter);
if (i == string::npos)
return name;
Expand Down Expand Up @@ -484,6 +510,8 @@ TfStringSplit(string const &src, string const &separator)
vector<string>
TfStringTokenize(string const &src, const char* delimiters)
{
TF_DEV_AXIOM(_IsInASCIIValueRange(delimiters));

vector<pair<char const *, char const *> > segments;
_TokenizeToSegments(src, delimiters, segments);

Expand All @@ -497,6 +525,8 @@ TfStringTokenize(string const &src, const char* delimiters)
set<string>
TfStringTokenizeToSet(string const &src, const char* delimiters)
{
TF_DEV_AXIOM(_IsInASCIIValueRange(delimiters));

vector<pair<char const *, char const *> > segments;
_TokenizeToSegments(src, delimiters, segments);

Expand All @@ -523,7 +553,9 @@ _FindFirstOfNotEscaped(const string &source, const char *toFind, size_t offset)
vector<string>
TfQuotedStringTokenize(const string &source, const char *delimiters,
string *errors)
{
{
TF_DEV_AXIOM(_IsInASCIIValueRange(delimiters));

vector<string> resultVec;
size_t j, quoteIndex, delimIndex;
const char *quotes = "\"\'`";
Expand Down
43 changes: 25 additions & 18 deletions pxr/base/tf/stringUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -337,8 +337,8 @@ std::string TfStringGetCommonPrefix(std::string a, std::string b);
/// a string. Thus suffix of "abc.def" is "def" using "." as the delimiter.
/// If the delimiter does not occur, the empty string is returned.
///
/// \note This method is intended to work only for ASCII strings. Using this
/// method on Unicode strings results in undefined behavior.
/// \note \c delimiter must be an ASCII character. If this condition is met,
/// \c name can be either an ASCII string or a UTF-8 encoded Unicode string.
TF_API
std::string TfStringGetSuffix(const std::string& name, char delimiter = '.');

Expand All @@ -349,8 +349,8 @@ std::string TfStringGetSuffix(const std::string& name, char delimiter = '.');
/// delimiter. If the delimiter does not occur, the original string is
/// returned.
///
/// \note This method is intended to work only for ASCII strings. Using this
/// method on Unicode strings results in undefined behavior.
/// \note \c delimiter must be an ASCII character. If this condition is met,
/// \c name can be either an ASCII string or a UTF-8 encoded Unicode string.
TF_API
std::string TfStringGetBeforeSuffix(const std::string& name, char delimiter = '.');

Expand Down Expand Up @@ -383,8 +383,8 @@ std::string TfStringReplace(const std::string& source, const std::string& from,
/// with \p separator (by default, a space) added between each successive pair
/// of strings.
///
/// \note This method is intended to work only for ASCII strings. Using this
/// method on Unicode strings results in undefined behavior.
/// \note \c separator must be either an ASCII string or a UTF-8 encoded
/// Unicode string.
template <class ForwardIterator>
std::string TfStringJoin(
ForwardIterator begin, ForwardIterator end,
Expand Down Expand Up @@ -420,8 +420,8 @@ std::string TfStringJoin(
/// Returns the concatenation of the strings in \p strings, with \p separator
/// (by default, a space) added between each successive pair of strings.
///
/// \note This method is intended to work only for ASCII strings. Using this
/// method on Unicode strings results in undefined behavior.
/// \note \c separator must be either an ASCII string or a UTF-8 encoded
/// Unicode string.
TF_API
std::string TfStringJoin(const std::vector<std::string>& strings,
const char* separator = " ");
Expand All @@ -431,8 +431,8 @@ std::string TfStringJoin(const std::vector<std::string>& strings,
/// Returns the concatenation of the strings in \p strings, with \p separator
/// (by default, a space) added between each successive pair of strings.
///
/// \note This method is intended to work only for ASCII strings. Using this
/// method on Unicode strings results in undefined behavior.
/// \note \c separator must be either an ASCII string or a UTF-8 encoded
/// Unicode string.
TF_API
std::string TfStringJoin(const std::set<std::string>& strings,
const char* separator = " ");
Expand All @@ -442,6 +442,9 @@ std::string TfStringJoin(const std::set<std::string>& strings,
/// The string \p source is broken apart into individual words, where a word
/// is delimited by the string \p separator. This function behaves like
/// Python's string split method.
///
/// \note \c separator must be either an ASCII string or a UTF-8 encoded
/// Unicode string.
TF_API
std::vector<std::string> TfStringSplit(std::string const &src,
std::string const &separator);
Expand All @@ -456,8 +459,9 @@ std::vector<std::string> TfStringSplit(std::string const &src,
/// consecutive delimiters are treated as though they were one, and an empty
/// input will result in an empty return vector.
///
/// \note This method is intended to work only for ASCII strings. Using this
/// method on Unicode strings results in undefined behavior.
/// \note \c delimiters must be ASCII characters. If this condition is
/// met, \c source can be either an ASCII string or a UTF-8 encoded Unicode
/// string.
TF_API
std::vector<std::string> TfStringTokenize(const std::string& source,
const char* delimiters = " \t\n");
Expand All @@ -466,8 +470,9 @@ std::vector<std::string> TfStringTokenize(const std::string& source,
///
/// Same as TfStringTokenize, except this one returns a set.
///
/// \note This method is intended to work only for ASCII strings. Using this
/// method on Unicode strings results in undefined behavior.
/// \note \c delimiters must be ASCII characters. If this condition is
/// met, \c source can be either an ASCII string or a UTF-8 encoded Unicode
/// string.
TF_API
std::set<std::string> TfStringTokenizeToSet(const std::string& source,
const char* delimiters = " \t\n");
Expand All @@ -482,8 +487,9 @@ std::set<std::string> TfStringTokenizeToSet(const std::string& source,
/// contains any error messages. Delimiters default to white space (space,
/// tab, and newline).
///
/// \note This method is intended to work only for ASCII strings. Using this
/// method on Unicode strings results in undefined behavior.
/// \note \c delimiters must be ASCII characters. If this condition is
/// met, \c source can be either an ASCII string or a UTF-8 encoded Unicode
/// string.
TF_API
std::vector<std::string>
TfQuotedStringTokenize(const std::string& source,
Expand All @@ -501,8 +507,9 @@ TfQuotedStringTokenize(const std::string& source,
/// \p closeDelimiter cannot be the same. \p errors, if provided, contains any
/// error messages.
///
/// \note This method is intended to work only for ASCII strings. Using this
/// method on Unicode strings results in undefined behavior.
/// \note \c openDelimiter, \c closeDelimiter, and \c escapeCharacter must
/// be ASCII characters. If these conditions are met, \c source can be either
/// an ASCII string or a UTF-8 encoded Unicode string.
TF_API
std::vector<std::string>
TfMatchedStringTokenize(const std::string& source,
Expand Down