Skip to content

Commit

Permalink
Validate continuation bytes during code point iteration.
Browse files Browse the repository at this point in the history
  • Loading branch information
nvmkuruc committed Dec 12, 2023
1 parent f728d7f commit 4a0caef
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 25 deletions.
36 changes: 36 additions & 0 deletions pxr/base/tf/testenv/unicodeUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
#include "pxr/base/tf/regTest.h"
#include "pxr/base/tf/unicodeUtils.h"

#include <algorithm>
#include <array>
#include <string_view>

PXR_NAMESPACE_USING_DIRECTIVE
Expand All @@ -36,6 +38,8 @@ TestUtf8CodePointView()

{
TF_AXIOM(TfUtf8CodePointView{}.empty());
TF_AXIOM(TfUtf8CodePointView{}.cbegin() == TfUtf8CodePointView{}.EndAsIterator());
TF_AXIOM(std::cbegin(TfUtf8CodePointView{}) == std::cend(TfUtf8CodePointView{}));
}

// Exercise the iterator converting from UTF-8 char to code point
Expand Down Expand Up @@ -98,6 +102,38 @@ TestUtf8CodePointView()
}

}
{
// Unexpected continuations (\x80 and \x81) should be decoded as
// invalid code points.
const std::string_view sv{"\x80\x61\x62\x81\x63"};
const TfUtf8CodePointView uv{sv};
TF_AXIOM(std::next(std::begin(uv), 5) == std::end(uv));
TF_AXIOM(std::next(std::begin(uv), 5).GetBase() == std::end(sv));

std::array<uint32_t, 5> codePoints{0};
const std::array<uint32_t, 5> expectedCodePoints{{
TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x61, 0x62,
TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x63}};
std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints));
TF_AXIOM(codePoints == expectedCodePoints);
}
{
// Incomplete UTF-8 sequences should not consume valid character
// sequences
const std::string_view sv{"\xc0\x61\xe0\x85\x62\xf0\x83\x84\x63\xf1"};
const TfUtf8CodePointView uv{sv};
TF_AXIOM(std::next(std::begin(uv), 7) == std::end(uv));
TF_AXIOM(std::next(std::begin(uv), 7).GetBase() == std::end(sv));

std::array<uint32_t, 7> codePoints{0};
const std::array<uint32_t, 7> expectedCodePoints{{
TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x61,
TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x62,
TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x63,
TfUtf8CodePointIterator::INVALID_CODE_POINT}};
std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints));
TF_AXIOM(codePoints == expectedCodePoints);
}
return true;
}

Expand Down
72 changes: 47 additions & 25 deletions pxr/base/tf/unicodeUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
#include "pxr/base/tf/diagnostic.h"
#include "pxr/base/tf/unicodeCharacterClasses.h"

#include <optional>
#include <string>
#include <string_view>

Expand Down Expand Up @@ -94,6 +93,18 @@ class TfUtf8CodePointView final {
return _view.empty();
}

/// Returns an iterator of the same type as begin that identifies the end
/// of the string.
///
/// As the end iterator is stored three times, this is slightly heavier
/// than using the PastTheEndSentinel and should be avoided in performance
/// critical code paths. It is provided for convenience when an algorithm
/// restricts the iterators to have the same type.
///
/// As C++20 ranges exposes more sentinel friendly algorithms, this can
/// likely be deprecated in the future.
inline const_iterator EndAsIterator() const;

private:
std::string_view _view;
};
Expand Down Expand Up @@ -129,13 +140,6 @@ class TfUtf8CodePointIterator final {
// throwing an exception, _GetCodePoint signals this is
// bad by setting the code point to 0xFFFD (this mostly happens
// when a high / low private surrogate is used)
// TODO: note that this isn't precisely conformant, as we
// likely consumed an entire sequence when a subset would have
// been invalid e.g. the byte sequence C2 41 would be detected
// as a 2-byte sequence, but it is invalid because of the
// second byte signature by the standard, this should process
// the C2 as invalid but consume 41 as a valid 1-byte UTF-8
// character
return _GetCodePoint();
}

Expand Down Expand Up @@ -164,31 +168,42 @@ class TfUtf8CodePointIterator final {
/// Advances the iterator logically one UTF-8 character sequence in
/// the string. The underlying string iterator will be advanced
/// according to the variable length encoding of the next UTF-8
/// character. Invalid leading bytes will increment until the end
/// of the range is reached.
/// character, but will never consume non-continuation bytes after
/// the current one.
TfUtf8CodePointIterator& operator++ ()
{
// The increment operator should never be called if it's past
// the end. The user is expected to have already checked this
// condition.
TF_DEV_AXIOM(!_IsPastTheEnd());
_EncodingLength increment = _GetEncodingLength();
// note that in cases where the encoding is invalid, we move to the
// next byte this is necessary because otherwise the iterator would
// never advanced and the end condition of == iterator::end() would
// never be satisfied
_EncodingLength encodingLength = _GetEncodingLength();
std::advance(_it, (encodingLength != 0) ? encodingLength : 1);
if (_IsPastTheEnd()) {
_it = _end;
// never be satisfied. This means that we increment, even if the
// encoding length is 0.
++_it;
// Only continuation bytes will be consumed after the the first byte.
// This avoid consumption of ASCII characters or other starting bytes.
auto isContinuation = [](const char c) {
const auto uc = static_cast<unsigned char>(c);
return (uc >= static_cast<unsigned char>('\x80')) &&
(uc < static_cast<unsigned char>('\xc0'));
};
while ((increment > 1) && !_IsPastTheEnd() && isContinuation(*_it)) {
++_it;
--increment;
}
return *this;
}

/// Advances the iterator logically one UTF-8 character sequence in the
/// string. The underlying string iterator will be advanced according
/// to the variable length encoding of the next UTF-8 character.
/// Advances the iterator logically one UTF-8 character sequence in
/// the string. The underlying string iterator will be advanced
/// according to the variable length encoding of the next UTF-8
/// character, but will never consume non-continuation bytes after
/// the current one.
TfUtf8CodePointIterator operator++ (int)
{
// note that in cases where the encoding is invalid, we move to the
// next byte this is necessary because otherwise the iterator would
// never advanced and the end condition of == iterator::end() would
// never be satisfied
auto temp = *this;
++(*this);
return temp;
Expand Down Expand Up @@ -223,6 +238,7 @@ class TfUtf8CodePointIterator final {
// Constructs an iterator that can read UTF-8 character sequences from
// the given starting string_view iterator \a it. \a end is used as a
// guard against reading byte sequences past the end of the source string.
// \a end must not be in the middle of a UTF-8 character sequence.
TfUtf8CodePointIterator(
const std::string_view::const_iterator& it,
const std::string_view::const_iterator& end) : _it(it), _end(end) {
Expand Down Expand Up @@ -252,15 +268,15 @@ class TfUtf8CodePointIterator final {
{
return 1;
}
else if ((x >> 5) == 0x6)
else if ((x >= 0xc0) && (x < 0xe0))
{
return 2;
}
else if ((x >> 4) == 0xe)
else if ((x >= 0xe0) && (x < 0xf0))
{
return 3;
}
else if ((x >> 3) == 0x1e)
else if ((x >= 0xf0) && (x < 0xf8))
{
return 4;
}
Expand Down Expand Up @@ -302,6 +318,12 @@ inline TfUtf8CodePointView::const_iterator TfUtf8CodePointView::cbegin() const
return begin();
}

inline TfUtf8CodePointView::const_iterator
TfUtf8CodePointView::EndAsIterator() const
{
return const_iterator(std::cend(_view), std::cend(_view));
}

/// Determines whether the given Unicode \a codePoint is in the XID_Start
/// character class.
///
Expand Down

0 comments on commit 4a0caef

Please sign in to comment.