diff --git a/pxr/base/tf/testenv/unicodeUtils.cpp b/pxr/base/tf/testenv/unicodeUtils.cpp index 423128c28c..1e821a45a7 100644 --- a/pxr/base/tf/testenv/unicodeUtils.cpp +++ b/pxr/base/tf/testenv/unicodeUtils.cpp @@ -26,6 +26,8 @@ #include "pxr/base/tf/regTest.h" #include "pxr/base/tf/unicodeUtils.h" +#include +#include #include PXR_NAMESPACE_USING_DIRECTIVE @@ -36,6 +38,8 @@ TestUtf8CodePointView() { TF_AXIOM(TfUtf8CodePointView{}.empty()); + TF_AXIOM(TfUtf8CodePointView{}.cbegin() == TfUtf8CodePointView{}.EndAsIterator()); + TF_AXIOM(std::cbegin(TfUtf8CodePointView{}) == std::cend(TfUtf8CodePointView{})); } // Exercise the iterator converting from UTF-8 char to code point @@ -98,6 +102,38 @@ TestUtf8CodePointView() } } + { + // Unexpected continuations (\x80 and \x81) should be decoded as + // invalid code points. + const std::string_view sv{"\x80\x61\x62\x81\x63"}; + const TfUtf8CodePointView uv{sv}; + TF_AXIOM(std::next(std::begin(uv), 5) == std::end(uv)); + TF_AXIOM(std::next(std::begin(uv), 5).GetBase() == std::end(sv)); + + std::array codePoints{0}; + const std::array expectedCodePoints{{ + TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x61, 0x62, + TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x63}}; + std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints)); + TF_AXIOM(codePoints == expectedCodePoints); + } + { + // Incomplete UTF-8 sequences should not consume valid character + // sequences + const std::string_view sv{"\xc0\x61\xe0\x85\x62\xf0\x83\x84\x63\xf1"}; + const TfUtf8CodePointView uv{sv}; + TF_AXIOM(std::next(std::begin(uv), 7) == std::end(uv)); + TF_AXIOM(std::next(std::begin(uv), 7).GetBase() == std::end(sv)); + + std::array codePoints{0}; + const std::array expectedCodePoints{{ + TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x61, + TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x62, + TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x63, + TfUtf8CodePointIterator::INVALID_CODE_POINT}}; + std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints)); + TF_AXIOM(codePoints == expectedCodePoints); + } return true; } diff --git a/pxr/base/tf/unicodeUtils.h b/pxr/base/tf/unicodeUtils.h index b051c7779b..8bce30c349 100644 --- a/pxr/base/tf/unicodeUtils.h +++ b/pxr/base/tf/unicodeUtils.h @@ -32,7 +32,6 @@ #include "pxr/base/tf/diagnostic.h" #include "pxr/base/tf/unicodeCharacterClasses.h" -#include #include #include @@ -94,6 +93,18 @@ class TfUtf8CodePointView final { return _view.empty(); } + /// Returns an iterator of the same type as begin that identifies the end + /// of the string. + /// + /// As the end iterator is stored three times, this is slightly heavier + /// than using the PastTheEndSentinel and should be avoided in performance + /// critical code paths. It is provided for convenience when an algorithm + /// restricts the iterators to have the same type. + /// + /// As C++20 ranges exposes more sentinel friendly algorithms, this can + /// likely be deprecated in the future. + inline const_iterator EndAsIterator() const; + private: std::string_view _view; }; @@ -129,13 +140,6 @@ class TfUtf8CodePointIterator final { // throwing an exception, _GetCodePoint signals this is // bad by setting the code point to 0xFFFD (this mostly happens // when a high / low private surrogate is used) - // TODO: note that this isn't precisely conformant, as we - // likely consumed an entire sequence when a subset would have - // been invalid e.g. the byte sequence C2 41 would be detected - // as a 2-byte sequence, but it is invalid because of the - // second byte signature by the standard, this should process - // the C2 as invalid but consume 41 as a valid 1-byte UTF-8 - // character return _GetCodePoint(); } @@ -164,31 +168,42 @@ class TfUtf8CodePointIterator final { /// Advances the iterator logically one UTF-8 character sequence in /// the string. The underlying string iterator will be advanced /// according to the variable length encoding of the next UTF-8 - /// character. Invalid leading bytes will increment until the end - /// of the range is reached. + /// character, but will never consume non-continuation bytes after + /// the current one. TfUtf8CodePointIterator& operator++ () { + // The increment operator should never be called if it's past + // the end. The user is expected to have already checked this + // condition. + TF_DEV_AXIOM(!_IsPastTheEnd()); + _EncodingLength increment = _GetEncodingLength(); // note that in cases where the encoding is invalid, we move to the // next byte this is necessary because otherwise the iterator would // never advanced and the end condition of == iterator::end() would - // never be satisfied - _EncodingLength encodingLength = _GetEncodingLength(); - std::advance(_it, (encodingLength != 0) ? encodingLength : 1); - if (_IsPastTheEnd()) { - _it = _end; + // never be satisfied. This means that we increment, even if the + // encoding length is 0. + ++_it; + // Only continuation bytes will be consumed after the the first byte. + // This avoid consumption of ASCII characters or other starting bytes. + auto isContinuation = [](const char c) { + const auto uc = static_cast(c); + return (uc >= static_cast('\x80')) && + (uc < static_cast('\xc0')); + }; + while ((increment > 1) && !_IsPastTheEnd() && isContinuation(*_it)) { + ++_it; + --increment; } return *this; } - /// Advances the iterator logically one UTF-8 character sequence in the - /// string. The underlying string iterator will be advanced according - /// to the variable length encoding of the next UTF-8 character. + /// Advances the iterator logically one UTF-8 character sequence in + /// the string. The underlying string iterator will be advanced + /// according to the variable length encoding of the next UTF-8 + /// character, but will never consume non-continuation bytes after + /// the current one. TfUtf8CodePointIterator operator++ (int) { - // note that in cases where the encoding is invalid, we move to the - // next byte this is necessary because otherwise the iterator would - // never advanced and the end condition of == iterator::end() would - // never be satisfied auto temp = *this; ++(*this); return temp; @@ -223,6 +238,7 @@ class TfUtf8CodePointIterator final { // Constructs an iterator that can read UTF-8 character sequences from // the given starting string_view iterator \a it. \a end is used as a // guard against reading byte sequences past the end of the source string. + // \a end must not be in the middle of a UTF-8 character sequence. TfUtf8CodePointIterator( const std::string_view::const_iterator& it, const std::string_view::const_iterator& end) : _it(it), _end(end) { @@ -252,15 +268,15 @@ class TfUtf8CodePointIterator final { { return 1; } - else if ((x >> 5) == 0x6) + else if ((x >= 0xc0) && (x < 0xe0)) { return 2; } - else if ((x >> 4) == 0xe) + else if ((x >= 0xe0) && (x < 0xf0)) { return 3; } - else if ((x >> 3) == 0x1e) + else if ((x >= 0xf0) && (x < 0xf8)) { return 4; } @@ -302,6 +318,12 @@ inline TfUtf8CodePointView::const_iterator TfUtf8CodePointView::cbegin() const return begin(); } +inline TfUtf8CodePointView::const_iterator +TfUtf8CodePointView::EndAsIterator() const +{ + return const_iterator(std::cend(_view), std::cend(_view)); +} + /// Determines whether the given Unicode \a codePoint is in the XID_Start /// character class. ///