Skip to content

Commit

Permalink
Specify invalid UTF-8 characters in URI scheme registration errors
Browse files Browse the repository at this point in the history
  • Loading branch information
nvmkuruc committed Dec 12, 2023
1 parent 013d65a commit 925525c
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 27 deletions.
22 changes: 13 additions & 9 deletions pxr/base/tf/unicodeUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,19 @@ class TfUtf8CodePointIterator final {
using pointer = void;
using reference = uint32_t;

/// Builds an iterator that decodes UTF-8 character sequences beginning at
/// the string_view iterator \a it. The \a end iterator bounds decoding so
/// that no byte sequence past the end of the source string is read.
///
/// If the range is a view of a substring, \a end must not point to a
/// continuation byte of a valid UTF-8 character sequence; otherwise
/// decoding errors may result.
TfUtf8CodePointIterator(
    const std::string_view::const_iterator& it,
    const std::string_view::const_iterator& end)
    : _it(it)
    , _end(end)
{
    // The starting position must never lie beyond the end guard.
    TF_DEV_AXIOM(_it <= _end);
}

/// Retrieves the next UTF-8 character in the sequence as its Unicode
/// code point value. Returns TfUtf8InvalidCodePoint.AsUInt32() when the
/// byte sequence pointed to by the iterator cannot be decoded.
Expand Down Expand Up @@ -282,15 +295,6 @@ class TfUtf8CodePointIterator final {
}

private:
// Constructs an iterator that can read UTF-8 character sequences from
// the given starting string_view iterator \a it. \a end is used as a
// guard against reading byte sequences past the end of the source string.
// \a end must not be in the middle of a UTF-8 character sequence.
//
// This declaration sits under the private: label, so it is not part of the
// class's public interface.
TfUtf8CodePointIterator(
const std::string_view::const_iterator& it,
const std::string_view::const_iterator& end) : _it(it), _end(end) {
// The starting position must not lie beyond the end guard.
// NOTE(review): TF_DEV_AXIOM is presumably a developer-build-only
// assertion -- confirm against its definition.
TF_DEV_AXIOM(_it <= _end);
}

using _EncodingLength = unsigned char;

Expand Down
30 changes: 12 additions & 18 deletions pxr/usd/ar/resolver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#include "pxr/base/tf/stl.h"
#include "pxr/base/tf/stringUtils.h"
#include "pxr/base/tf/type.h"
#include "pxr/base/tf/unicodeUtils.h"

#include <tbb/concurrent_hash_map.h>

Expand Down Expand Up @@ -139,36 +140,29 @@ class _ResolverInfo
// with an ASCII alpha character, followed by any number of ASCII alphanumeric
// or the hyphen, period, and plus characters.
std::pair<bool, std::string>
_ValidateResourceIdentifierScheme(const std::string& caseFoldedScheme) {
_ValidateResourceIdentifierScheme(const std::string_view& caseFoldedScheme) {
if (caseFoldedScheme.empty()) {
return std::make_pair(false, "Scheme cannot be empty");
}
if (caseFoldedScheme[0] > 'z' || caseFoldedScheme[0] < 'a') {
if (caseFoldedScheme.front() > 'z' || caseFoldedScheme.front() < 'a') {
return std::make_pair(false, "Scheme must start with ASCII 'a-z'");
}
const auto it = std::find_if(caseFoldedScheme.begin() + 1,
const auto it = std::find_if(std::next(caseFoldedScheme.begin()),
caseFoldedScheme.end(),
[](const char c) {
return !((c >= '0' && c <= '9') ||
(c >= 'a' && c <= 'z') ||
(c == '-') || (c== '.') || (c=='+'));
});
if (it != caseFoldedScheme.end()) {
if ((((*it) & (1<<7)) == 0)) {
// TODO: Once the UTF-8 character iterator lands, it would be
// helpful to include the invalid UTF-8 character in the error
// message output. As invalid UTF-8 characters may span multiple
// bytes, it can't be trivially identified by the character
// iterator.
return std::make_pair(
false, "Non-ASCII UTF-8 characters not allowed in scheme");
}
else {
return std::make_pair(
- false, TfStringPrintf("Character '%c' not allowed in scheme. "
"Must be ASCII 'a-z', '-', '+', or '.'",
*it));
}
TfUtf8CodePointIterator codePointIt(it, caseFoldedScheme.end());
return std::make_pair(
false,
TfStringPrintf(
"Character '%s' not allowed in scheme. "
"Must be ASCII 'a-z', '-', '+', or '.'",
TfStringify(TfUtf8CodePoint{*codePointIt}).c_str())
);
}
return std::make_pair(true, "");
}
Expand Down

0 comments on commit 925525c

Please sign in to comment.