Skip to content

Commit

Permalink
Support for UTF-8 Identifiers
Browse files Browse the repository at this point in the history
- Added UTF-8 character class support to Path parser
- Added UTF-8 based validation rules for SDF identifier methods
- Added UTF-8 support to lex-based text file format lexer
- Added tests for UTF-8 based identifiers
  • Loading branch information
erslavin committed Dec 8, 2023
1 parent 4cd3eee commit 1261a11
Show file tree
Hide file tree
Showing 11 changed files with 3,284 additions and 2,173 deletions.
123 changes: 79 additions & 44 deletions pxr/usd/sdf/path.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "pxr/base/tf/staticData.h"
#include "pxr/base/tf/stl.h"
#include "pxr/base/tf/stringUtils.h"
#include "pxr/base/tf/unicodeUtils.h"
#include "pxr/base/tf/type.h"

#include "pxr/base/trace/trace.h"
Expand Down Expand Up @@ -117,6 +118,9 @@ class _DeferredDiagnostics

} // anon

constexpr uint32_t SDF_UNDERSCORE_CODE_POINT = 0x005Fu;

static inline bool _IsValidIdentifier(const std::string& name);
static inline bool _IsValidIdentifier(TfToken const &name);

// XXX: Enable this define to make bad path strings
Expand Down Expand Up @@ -1787,101 +1791,132 @@ SdfPath::MakeRelativePath(const SdfPath & anchor) const
return result;
}

static inline bool _IsValidIdentifier(const std::string& name)
{
// empty strings are not valid identifiers
if (name.empty())
{
return false;
}

TfUtf8CodePointView view {name};
bool first = true;
for (const uint32_t codePoint : view)
{
bool result = first ?
((codePoint == SDF_UNDERSCORE_CODE_POINT) ||
TfIsUtf8CodePointXidStart(codePoint))
: TfIsUtf8CodePointXidContinue(codePoint);

if (!result)
{
return false;
}

first = false;
}

return true;
}

static inline bool _IsValidIdentifier(TfToken const &name)
{
return TfIsValidIdentifier(name.GetString());
return _IsValidIdentifier(name.GetString());
}

bool
SdfPath::IsValidIdentifier(const std::string &name)
{
return TfIsValidIdentifier(name);
}

// We use our own _IsAlpha and _IsAlnum here for two reasons. One, we want to
// ensure that they follow C/Python identifier rules and are not subject to
// various locale differences. And two, since we are not consulting a locale,
// it is faster.
static constexpr bool _IsAlpha(int x) {
return ('a' <= (x|32)) && ((x|32) <= 'z');
}
static constexpr bool _IsAlnum(int x) {
return _IsAlpha(x) || (('0' <= x) && (x <= '9'));
return _IsValidIdentifier(name);
}

bool
SdfPath::IsValidNamespacedIdentifier(const std::string &name)
{
// empty strings are not valid identifiers
if (name.empty())
{
return false;
}

// A valid C/Python identifier except we also allow the namespace delimiter
// and if we tokenize on that delimiter then all tokens are valid C/Python
// identifiers. That means following a delimiter there must be an '_' or
// alphabetic character.
constexpr char delim = SDF_PATH_NS_DELIMITER_CHAR;
for (char const *p = name.c_str(); *p; ++p) {
if (!_IsAlpha(*p) && *p != '_') {
// identifiers.
std::vector<std::string> tokens = SdfPath::TokenizeIdentifier(name);
if (tokens.empty())
{
return false;
}

for (const std::string& token : tokens)
{
if (!SdfPath::IsValidIdentifier(token))
{
return false;
}
for (++p; _IsAlnum(*p) ||*p == '_'; ++p) {
/* consume identifier */
}
if (*p != delim) {
return !*p;
}
}
return false;

return true;
}

std::vector<std::string>
SdfPath::TokenizeIdentifier(const std::string &name)
{
constexpr uint32_t SDF_NAMESPACE_DELIMITER_CODE_POINT = 0x003Au;

std::vector<std::string> result;

// This code currently assumes the namespace delimiter is one character.
const char namespaceDelimiter =
SdfPathTokens->namespaceDelimiter.GetText()[0];

std::string::const_iterator first = name.begin();
std::string::const_iterator last = name.end();

// Not empty and first character is alpha or '_'.
if (first == last || !(isalpha(*first) || (*first == '_')))
return result;
// Last character is not the namespace delimiter.
if (*(last - 1) == namespaceDelimiter)
// Empty or last character is namespace delimiter
if (name.empty() || name.back() == namespaceDelimiter)
{
return result;
}

// Count delimiters and reserve space in result.
result.reserve(1 + std::count(first, last, namespaceDelimiter));
result.reserve(1 + std::count(name.begin(), name.end(),
namespaceDelimiter));

std::string::const_iterator anchor = first;
for (++first; first != last; ++first) {
TfUtf8CodePointView view {name};
TfUtf8CodePointIterator iterator = view.begin();
TfUtf8CodePointIterator anchor = iterator;
for (; iterator != view.end(); ++iterator)
{
// Allow a namespace delimiter.
if (*first == namespaceDelimiter) {
if (*iterator == SDF_NAMESPACE_DELIMITER_CODE_POINT)
{
// Record token.
result.push_back(std::string(anchor, first));
result.push_back(std::string(anchor.GetBase(), iterator.GetBase()));

// Skip delimiter. We know we will not go beyond the end of
// the string because we checked before the loop that the
// last character was not the delimiter.
anchor = ++first;
anchor = ++iterator;

// First character.
if (!(isalpha(*first) || (*first == '_'))) {
if (!TfIsUtf8CodePointXidStart(*iterator) &&
*iterator != SDF_UNDERSCORE_CODE_POINT)
{
TfReset(result);
return result;
}
}
else {
// Next character
if (!(isalnum(*first) || (*first == '_'))) {
else
{
// Next character
if (!TfIsUtf8CodePointXidContinue(*iterator))
{
TfReset(result);
return result;
}
}
}

// Record the last token.
result.push_back(std::string(anchor, first));
result.push_back(std::string(anchor.GetBase(), iterator.GetBase()));

return result;
}
Expand Down
77 changes: 71 additions & 6 deletions pxr/usd/sdf/pathParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

#include "pxr/pxr.h"
#include "pxr/base/tf/stringUtils.h"
#include "pxr/base/tf/unicodeUtils.h"
#include "pxr/usd/sdf/path.h"

#include "pxr/base/tf/pxrPEGTL/pegtl.h"
Expand All @@ -39,6 +40,60 @@ namespace Sdf_PathParser {

namespace PEGTL_NS = tao::TAO_PEGTL_NAMESPACE;

////////////////////////////////////////////////////////////////////////
// Helper rules for parsing UTF8 content
struct XidStart
{
template <typename ParseInput>
static bool match(ParseInput& in)
{
if (!in.empty())
{
// peek at the next character in the input
// if the size is not 0, it was a valid code point
auto utf8_char = tao::TAO_PEGTL_NAMESPACE::internal::peek_utf8::peek(in);
if (utf8_char.size != 0)
{
// valid utf8_char, data has the code point
if (TfIsUtf8CodePointXidStart(static_cast<uint32_t>(utf8_char.data)))
{
// it has the property we want, consume the input
in.bump(utf8_char.size);
return true;
}
}
}

return false;
}
};

struct XidContinue
{
template <typename ParseInput>
static bool match(ParseInput& in)
{
if (!in.empty())
{
// peek at the next character in the input
// if the size is not 0, it was a valid code point
auto utf8_char = tao::TAO_PEGTL_NAMESPACE::internal::peek_utf8::peek(in);
if (utf8_char.size != 0)
{
// valid utf8_char, data has the code point
if (TfIsUtf8CodePointXidContinue(static_cast<uint32_t>(utf8_char.data)))
{
// it has the property we want, consume the input
in.bump(utf8_char.size);
return true;
}
}
}

return false;
}
};

////////////////////////////////////////////////////////////////////////
// SdfPath grammar:

Expand All @@ -51,21 +106,31 @@ struct ReflexiveRelative : Dot {};

struct DotDots : PEGTL_NS::list<DotDot, Slash> {};

struct PrimName : PEGTL_NS::identifier {};
// valid identifiers start with an '_' character or anything in the XidStart
// character class, then continue with zero or more characters in the
// XidContinue character class
struct Utf8IdentifierStart : PEGTL_NS::sor<
PEGTL_NS::one<'_'>,
XidStart> {};
struct Utf8Identifier : PEGTL_NS::seq<
Utf8IdentifierStart,
PEGTL_NS::star<XidContinue>> {};

struct PrimName : Utf8Identifier {};

// XXX This replicates old behavior where '-' chars are allowed in variant set
// names in SdfPaths, but variant sets in layers cannot have '-' in their names.
// For now we preserve the behavior. Internal bug USD-8321 tracks removing
// support for '-' characters in variant set names in SdfPath.
struct VariantSetName :
PEGTL_NS::seq<PEGTL_NS::identifier_first,
PEGTL_NS::seq<Utf8IdentifierStart,
PEGTL_NS::star<PEGTL_NS::sor<
PEGTL_NS::identifier_other, PEGTL_NS::one<'-'>>>> {};
XidContinue, PEGTL_NS::one<'-'>>>> {};

struct VariantName :
PEGTL_NS::seq<PEGTL_NS::opt<
PEGTL_NS::one<'.'>>, PEGTL_NS::star<
PEGTL_NS::sor<PEGTL_NS::identifier_other,
PEGTL_NS::sor<XidContinue,
PEGTL_NS::one<'|', '-'>>>> {};

struct VarSelOpen : PEGTL_NS::pad<PEGTL_NS::one<'{'>, PEGTL_NS::blank> {};
Expand All @@ -88,7 +153,7 @@ struct PrimElts : PEGTL_NS::seq<
LookaheadList<PrimName, PEGTL_NS::sor<Slash, VariantSelections>>,
PEGTL_NS::opt<VariantSelections>> {};

struct PropertyName : PEGTL_NS::list<PEGTL_NS::identifier, PEGTL_NS::one<':'>> {};
struct PropertyName : PEGTL_NS::list<Utf8Identifier, PEGTL_NS::one<':'>> {};

struct MapperPath;
struct TargetPath;
Expand All @@ -103,7 +168,7 @@ struct RelationalAttributeName : PropertyName {};

struct MapperKW : TAO_PEGTL_KEYWORD("mapper") {};

struct MapperArg : PEGTL_NS::identifier {};
struct MapperArg : Utf8Identifier {};

struct MapperPathSeq : PEGTL_NS::if_must<
PEGTL_NS::seq<Dot, MapperKW>, BracketPath<MapperPath>,
Expand Down
42 changes: 25 additions & 17 deletions pxr/usd/sdf/schema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "pxr/usd/sdf/layerOffset.h"
#include "pxr/usd/sdf/parserValueContext.h"
#include "pxr/usd/sdf/pathExpression.h"
#include "pxr/usd/sdf/pathParser.h"
#include "pxr/usd/sdf/payload.h"
#include "pxr/usd/sdf/reference.h"
#include "pxr/usd/sdf/schemaTypeRegistration.h"
Expand All @@ -41,6 +42,7 @@
#include "pxr/base/tf/diagnostic.h"
#include "pxr/base/tf/envSetting.h"
#include "pxr/base/tf/instantiateSingleton.h"
#include "pxr/base/tf/unicodeUtils.h"
#include "pxr/base/trace/trace.h"
#include "pxr/base/vt/dictionary.h"

Expand Down Expand Up @@ -1287,28 +1289,34 @@ SdfSchemaBase::IsValidNamespacedIdentifier(const std::string& identifier)
SdfAllowed
SdfSchemaBase::IsValidVariantIdentifier(const std::string& identifier)
{
// Allow [[:alnum:]_|\-]+ with an optional leading dot.

std::string::const_iterator first = identifier.begin();
std::string::const_iterator last = identifier.end();

// Allow optional leading dot.
if (first != last && *first == '.') {
++first;
}
// use the path parser rules to determine validity of variant name
Sdf_PathParser::PPContext context;
bool result = false;
try
{
result = Sdf_PathParser::PEGTL_NS::parse<
Sdf_PathParser::PEGTL_NS::must<Sdf_PathParser::VariantName,
Sdf_PathParser::PEGTL_NS::eof>>(
Sdf_PathParser::PEGTL_NS::string_input<> {identifier, ""}, context);

for (; first != last; ++first) {
char c = *first;
if (!(isalnum(c) || (c == '_') || (c == '|') || (c == '-'))) {
if (!result)
{
return SdfAllowed(TfStringPrintf(
"\"%s\" is not a valid variant "
"name due to '%c' at index %d",
identifier.c_str(),
c,
(int)(first - identifier.begin())));
"\"%s\" is not a valid variant name",
identifier.c_str()));
}
}
catch(const Sdf_PathParser::PEGTL_NS::parse_error& e)
{
return SdfAllowed(TfStringPrintf(
"\"%s\" is not a valid variant "
"name due to '%s'",
identifier.c_str(),
e.what()));

return false;
}

return true;
}

Expand Down
1 change: 1 addition & 0 deletions pxr/usd/sdf/testenv/testSdfParsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def test_Basic(self):
# This will mean that your new test runs first and you can spot
# failures much quicker.
testFiles = '''
217_utf8_identifiers.sdf
216_bad_variant_in_relocates_path.sdf
215_bad_variant_in_specializes_path.sdf
214_bad_variant_in_inherits_path.sdf
Expand Down
Loading

0 comments on commit 1261a11

Please sign in to comment.