Validate continuation bytes during code point iteration.

PixarAnimationStudios · Dec 12, 2023 · 4a0caef · 4a0caef
1 parent f728d7f
commit 4a0caef
Show file tree

Hide file tree

Showing 2 changed files with 83 additions and 25 deletions.
diff --git a/pxr/base/tf/testenv/unicodeUtils.cpp b/pxr/base/tf/testenv/unicodeUtils.cpp
@@ -26,6 +26,8 @@
 #include "pxr/base/tf/regTest.h"
 #include "pxr/base/tf/unicodeUtils.h"
 
+#include <algorithm>
+#include <array>
 #include <string_view>
 
 PXR_NAMESPACE_USING_DIRECTIVE
@@ -36,6 +38,8 @@ TestUtf8CodePointView()
 
     {
         TF_AXIOM(TfUtf8CodePointView{}.empty());
+        TF_AXIOM(TfUtf8CodePointView{}.cbegin() == TfUtf8CodePointView{}.EndAsIterator());
+        TF_AXIOM(std::cbegin(TfUtf8CodePointView{}) == std::cend(TfUtf8CodePointView{}));
     }
 
     // Exercise the iterator converting from UTF-8 char to code point
@@ -98,6 +102,38 @@ TestUtf8CodePointView()
         }
 
     }
+    {
+        // Unexpected continuations (\x80 and \x81) should be decoded as
+        // invalid code points.
+        const std::string_view sv{"\x80\x61\x62\x81\x63"};
+        const TfUtf8CodePointView uv{sv};
+        TF_AXIOM(std::next(std::begin(uv), 5) == std::end(uv));
+        TF_AXIOM(std::next(std::begin(uv), 5).GetBase() == std::end(sv));
+
+        std::array<uint32_t, 5> codePoints{0};
+        const std::array<uint32_t, 5> expectedCodePoints{{
+            TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x61, 0x62,
+            TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x63}};
+        std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints));
+        TF_AXIOM(codePoints == expectedCodePoints);
+    }
+    {
+        // Incomplete UTF-8 sequences should not consume valid character
+        // sequences
+        const std::string_view sv{"\xc0\x61\xe0\x85\x62\xf0\x83\x84\x63\xf1"};
+        const TfUtf8CodePointView uv{sv};
+        TF_AXIOM(std::next(std::begin(uv), 7) == std::end(uv));
+        TF_AXIOM(std::next(std::begin(uv), 7).GetBase() == std::end(sv));
+
+        std::array<uint32_t, 7> codePoints{0};
+        const std::array<uint32_t, 7> expectedCodePoints{{
+            TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x61,
+            TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x62,
+            TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x63,
+            TfUtf8CodePointIterator::INVALID_CODE_POINT}};
+        std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints));
+        TF_AXIOM(codePoints == expectedCodePoints);
+    }
     return true;
 }
 

diff --git a/pxr/base/tf/unicodeUtils.h b/pxr/base/tf/unicodeUtils.h
@@ -32,7 +32,6 @@
 #include "pxr/base/tf/diagnostic.h"
 #include "pxr/base/tf/unicodeCharacterClasses.h"
 
-#include <optional>
 #include <string>
 #include <string_view>
 
@@ -94,6 +93,18 @@ class TfUtf8CodePointView final {
         return _view.empty();
     }
 
+    /// Returns an iterator of the same type as begin that identifies the end
+    /// of the string.
+    ///
+    /// As the end iterator is stored three times, this is slightly heavier
+    /// than using the PastTheEndSentinel and should be avoided in performance
+    /// critical code paths. It is provided for convenience when an algorithm
+    /// restricts the iterators to have the same type.
+    ///
+    /// As C++20 ranges exposes more sentinel friendly algorithms, this can
+    /// likely be deprecated in the future.
+    inline const_iterator EndAsIterator() const;
+
 private:
     std::string_view _view;
 };
@@ -129,13 +140,6 @@ class TfUtf8CodePointIterator final {
         // throwing an exception, _GetCodePoint signals this is
         // bad by setting the code point to 0xFFFD (this mostly happens
         // when a high / low private surrogate is used)
-        // TODO: note that this isn't precisely conformant, as we
-        // likely consumed an entire sequence when a subset would have
-        // been invalid e.g. the byte sequence C2 41 would be detected
-        // as a 2-byte  sequence, but it is invalid because of the
-        // second byte signature by the standard, this should process
-        // the C2 as invalid but consume 41 as a valid 1-byte UTF-8
-        // character
         return _GetCodePoint();
     }
 
@@ -164,31 +168,42 @@ class TfUtf8CodePointIterator final {
     /// Advances the iterator logically one UTF-8 character sequence in
     /// the string. The underlying string iterator will be advanced
     /// according to the variable length encoding of the next UTF-8
-    /// character. Invalid leading bytes will increment until the end
-    /// of the range is reached.
+    /// character, but will never consume non-continuation bytes after
+    /// the current one.
     TfUtf8CodePointIterator& operator++ ()
     {
+        // The increment operator should never be called if it's past
+        // the end. The user is expected to have already checked this
+        // condition.
+        TF_DEV_AXIOM(!_IsPastTheEnd());
+        _EncodingLength increment = _GetEncodingLength();
         // note that in cases where the encoding is invalid, we move to the
         // next byte this is necessary because otherwise the iterator would
         // never advanced and the end condition of == iterator::end() would
-        // never be satisfied
-        _EncodingLength encodingLength = _GetEncodingLength();
-        std::advance(_it, (encodingLength != 0) ? encodingLength : 1);
-        if (_IsPastTheEnd()) {
-            _it = _end;
+        // never be satisfied. This means that we increment, even if the
+        // encoding length is 0.
+        ++_it;
+        // Only continuation bytes will be consumed after the the first byte.
+        // This avoid consumption of ASCII characters or other starting bytes.
+        auto isContinuation = [](const char c) {
+            const auto uc = static_cast<unsigned char>(c);
+            return (uc >= static_cast<unsigned char>('\x80')) &&
+                   (uc < static_cast<unsigned char>('\xc0'));
+        };
+        while ((increment > 1) && !_IsPastTheEnd() && isContinuation(*_it)) {
+            ++_it;
+            --increment;
         }
         return *this;
     }
 
-    /// Advances the iterator logically one UTF-8 character sequence in the
-    /// string. The underlying string iterator will be advanced according
-    /// to the variable length encoding of the next UTF-8 character.
+    /// Advances the iterator logically one UTF-8 character sequence in
+    /// the string. The underlying string iterator will be advanced
+    /// according to the variable length encoding of the next UTF-8
+    /// character, but will never consume non-continuation bytes after
+    /// the current one.
     TfUtf8CodePointIterator operator++ (int)
     {
-        // note that in cases where the encoding is invalid, we move to the
-        // next byte this is necessary because otherwise the iterator would
-        // never advanced and the end condition of == iterator::end() would
-        // never be satisfied
         auto temp = *this;
         ++(*this);
         return temp;
@@ -223,6 +238,7 @@ class TfUtf8CodePointIterator final {
     // Constructs an iterator that can read UTF-8 character sequences from
     // the given starting string_view iterator \a it. \a end is used as a
     // guard against reading byte sequences past the end of the source string.
+    // \a end must not be in the middle of a UTF-8 character sequence.
     TfUtf8CodePointIterator(
         const std::string_view::const_iterator& it,
         const std::string_view::const_iterator& end) : _it(it), _end(end) {
@@ -252,15 +268,15 @@ class TfUtf8CodePointIterator final {
         {
             return 1;
         }
-        else if ((x >> 5) == 0x6)
+        else if ((x >= 0xc0) && (x < 0xe0))
         {
             return 2;
         }
-        else if ((x >> 4) == 0xe)
+        else if ((x >= 0xe0) && (x < 0xf0))
         {
             return 3;
         }
-        else if ((x >> 3) == 0x1e)
+        else if ((x >= 0xf0) && (x < 0xf8))
         {
             return 4;
         }
@@ -302,6 +318,12 @@ inline TfUtf8CodePointView::const_iterator TfUtf8CodePointView::cbegin() const
     return begin();
 }
 
+inline TfUtf8CodePointView::const_iterator
+TfUtf8CodePointView::EndAsIterator() const
+{
+    return const_iterator(std::cend(_view), std::cend(_view));
+}
+
 /// Determines whether the given Unicode \a codePoint is in the XID_Start
 /// character class.
 ///