From a69226293e20862113e09f0e9f472a3e8e699736 Mon Sep 17 00:00:00 2001 From: Matt Kuruc Date: Tue, 26 Dec 2023 07:46:38 -0800 Subject: [PATCH] Update documentation for UTF-8 utilities --- pxr/base/tf/overview.dox | 2 +- pxr/base/tf/unicodeUtils.h | 102 +++++++++++++++++++++++-------------- 2 files changed, 65 insertions(+), 39 deletions(-) diff --git a/pxr/base/tf/overview.dox b/pxr/base/tf/overview.dox index 319aa06fc2..f463d23a00 100644 --- a/pxr/base/tf/overview.dox +++ b/pxr/base/tf/overview.dox @@ -94,7 +94,7 @@ The high-level grouping of C++ classes and functions is as follows:
  • \link group_tf_DebuggingOutput \b Output \b For \b Debugging - \endlink TfDebug, TF_DEBUG(), TF_FUNC_NAME()
  • \link group_tf_String \b String \b Utilities - \endlink - TfStringPrintf(), TfHash, (and a large number of miscellaneous free functions) + TfStringPrintf(), TfHash, TfUtf8CodePointView, (and a large number of miscellaneous free functions)
  • \link group_tf_Containers \b Containers - \endlink TfByteData, TfArray2, TfArray3, TfTypeInfoMap
  • \link group_tf_Stl \b STL \b Utilities - \endlink diff --git a/pxr/base/tf/unicodeUtils.h b/pxr/base/tf/unicodeUtils.h index 4f9e242be5..7d78e3f2e5 100644 --- a/pxr/base/tf/unicodeUtils.h +++ b/pxr/base/tf/unicodeUtils.h @@ -26,6 +26,7 @@ /// \file tf/unicodeUtils.h /// \ingroup group_tf_String +/// Definitions of basic UTF-8 utilities in tf. #include "pxr/pxr.h" #include "pxr/base/tf/api.h" @@ -38,11 +39,22 @@ PXR_NAMESPACE_OPEN_SCOPE -/// Wrapper for a code point value that can be encoded as UTF-8 +/// \class TfUtf8CodePoint +/// \ingroup group_tf_String +/// +/// Wrapper for a 32-bit code point value that can be encoded as UTF-8. +/// +/// \code{.cpp} +/// // Stream operator overload encodes each code point as UTF-8. +/// std::stringstream s; +/// s << TfUtf8CodePoint(8747) << " " << TfUtf8CodePoint(120); +/// \endcode +/// A single `TfUtf8CodePoint` may be converted to a string using +/// `TfStringify` as well. class TfUtf8CodePoint { public: - /// Code points that cannot be decoded or outside of the valid range are - /// may be replaced with this value. + /// Code points that cannot be decoded or are outside of the valid range + /// will be replaced with this value. static constexpr uint32_t ReplacementValue = 0xFFFD; /// Values higher than this will be replaced with the replacement @@ -89,31 +101,46 @@ constexpr TfUtf8CodePoint TfUtf8InvalidCodePoint{ class TfUtf8CodePointIterator; +/// \class TfUtf8CodePointView +/// \ingroup group_tf_String +/// /// Wrapper for a UTF-8 encoded `std::string_view` that can be iterated over /// as code points instead of bytes. /// -/// Because of the variable length encoding, the `Utf8StringView` iterator is -/// a ForwardIterator and is read only. +/// Because of the variable length encoding, the `TfUtf8CodePointView` iterator +/// is a ForwardIterator and is read only. 
/// /// \code{.cpp} /// std::string value{"∫dx"}; -/// TfUtf8CodePointView view{value}; -/// for (const uint32_t codePoint : view) { +/// for (const uint32_t codePoint : TfUtf8CodePointView{value}) { /// if (codePoint == TfUtf8InvalidCodePoint.AsUInt32()) { /// TF_WARN("String cannot be decoded."); +/// break; /// } /// } -/// (The TfUtf8CodePointView's sentinel end() will make it compatible with -/// the STL ranges library). +/// \endcode +/// +/// The `TfUtf8CodePointView`'s sentinel `end()` is compatible with range +/// based for loops and the forthcoming STL ranges library; it avoids +/// triplicating the storage for the end iterator. `EndAsIterator()` +/// can be used for algorithms that require the begin and end iterators to be +/// of the same type but necessarily stores redundant copies of the endpoint. +/// +/// \code{.cpp} +/// if (std::any_of(std::cbegin(codePointView), codePointView.EndAsIterator(), +/// [](const auto c) { return c == TfUtf8InvalidCodePoint.AsUInt32(); })) +/// { +/// TF_WARN("String cannot be decoded"); +/// } /// \endcode class TfUtf8CodePointView final { public: using const_iterator = TfUtf8CodePointIterator; - /// Model iteration ending when the underlying string_view's end iterator + /// Model iteration ending when the underlying `string_view`'s end iterator /// has been exceeded. This guards against strings whose variable length /// encoding pushes the iterator past the end of the underlying - /// string_view. + /// `string_view`. 
class PastTheEndSentinel final {}; TfUtf8CodePointView() = default; @@ -121,8 +148,8 @@ class TfUtf8CodePointView final { inline const_iterator begin() const; - /// The sentinel will compare as equal with any iterator at or past the end - /// of the underlying string_view + /// The sentinel will compare as equal with any iterator at the end + /// of the underlying `string_view` PastTheEndSentinel end() const { return PastTheEndSentinel{}; @@ -130,8 +157,8 @@ class TfUtf8CodePointView final { inline const_iterator cbegin() const; - /// The out of range sentinel will compare as equal with any iterator - /// at or past the end of the underlying string_view's + /// The sentinel will compare as equal with any iterator at the end + /// of the underlying `string_view` PastTheEndSentinel cend() const { return end(); @@ -143,11 +170,11 @@ class TfUtf8CodePointView final { return _view.empty(); } - /// Returns an iterator of the same type as begin that identifies the end + /// Returns an iterator of the same type as `begin` that identifies the end /// of the string. /// /// As the end iterator is stored three times, this is slightly heavier - /// than using the PastTheEndSentinel and should be avoided in performance + /// than using the `PastTheEndSentinel` and should be avoided in performance /// critical code paths. It is provided for convenience when an algorithm /// restricts the iterators to have the same type. /// @@ -163,10 +190,9 @@ class TfUtf8CodePointView final { /// code point values. /// /// UTF-8 is a variable length encoding, meaning that one Unicode -/// character can be encoded in UTF-8 as 1, 2, 3, or 4 bytes. This -/// iterator takes care of iterating the necessary characters in a string -/// and extracing the Unicode code point of each UTF-8 encoded character -/// in the sequence. +/// code point can be encoded in UTF-8 as 1, 2, 3, or 4 bytes. This +/// iterator takes care of consuming the valid UTF-8 bytes for a +/// code point while incrementing. 
class TfUtf8CodePointIterator final { public: using iterator_category = std::forward_iterator_tag; @@ -175,19 +201,14 @@ class TfUtf8CodePointIterator final { using pointer = void; using reference = uint32_t; - /// Retrieves the next UTF-8 character in the sequence as its Unicode - /// code point value. Returns TfUtf8InvalidCodePoint.AsUInt32() when the + /// Retrieves the current UTF-8 character in the sequence as its Unicode + /// code point value. Returns `TfUtf8InvalidCodePoint.AsUInt32()` when the /// byte sequence pointed to by the iterator cannot be decoded. /// - /// If during read of the UTF-8 character sequence the underlying - /// string iterator would go beyond \a end defined at construction - /// time, a std::out_of_range exception will be thrown. + /// A code point might be invalid because it's incorrectly encoded, exceeds + /// the maximum allowed value, or is in the disallowed surrogate range. uint32_t operator* () const { - // If the current UTF-8 character is invalid, instead of - // throwing an exception, _GetCodePoint signals this is - // bad by setting the code point to 0xFFFD (this mostly happens - // when a high / low private surrogate is used) return _GetCodePoint(); } @@ -199,7 +220,8 @@ class TfUtf8CodePointIterator final { /// Determines if two iterators are equal. /// This intentionally does not consider the end iterator to allow for - /// comparison of iterators between substring views. + /// comparison of iterators between different substring views of the + /// same underlying string. bool operator== (const TfUtf8CodePointIterator& rhs) const { return (this->_it == rhs._it); @@ -207,7 +229,8 @@ class TfUtf8CodePointIterator final { /// Determines if two iterators are unequal. /// This intentionally does not consider the end iterator to allow for - /// comparison of iterators between substring views. + /// comparison of iterators between different substring views of the + /// same underlying string. 
bool operator!= (const TfUtf8CodePointIterator& rhs) const { return (this->_it != rhs._it); @@ -225,14 +248,14 @@ class TfUtf8CodePointIterator final { // condition. TF_DEV_AXIOM(!_IsPastTheEnd()); _EncodingLength increment = _GetEncodingLength(); - // note that in cases where the encoding is invalid, we move to the - // next byte this is necessary because otherwise the iterator would - // never advanced and the end condition of == iterator::end() would + // Note that in cases where the encoding is invalid, we move to the + // next byte. This is necessary because otherwise the iterator would + // never advance and the end condition of == iterator::end() would // never be satisfied. This means that we increment, even if the // encoding length is 0. ++_it; // Only continuation bytes will be consumed after the the first byte. - // This avoid consumption of ASCII characters or other starting bytes. + // This avoids consumption of ASCII characters or other starting bytes. auto isContinuation = [](const char c) { const auto uc = static_cast<unsigned char>(c); return (uc >= static_cast<unsigned char>('\x80')) && @@ -258,7 +281,7 @@ class TfUtf8CodePointIterator final { } /// Checks if the `lhs` iterator is at or past the end for the - /// underlying string_view + /// underlying `string_view` friend bool operator==(const TfUtf8CodePointIterator& lhs, TfUtf8CodePointView::PastTheEndSentinel) { @@ -286,7 +309,10 @@ class TfUtf8CodePointIterator final { // Constructs an iterator that can read UTF-8 character sequences from // the given starting string_view iterator \a it. \a end is used as a // guard against reading byte sequences past the end of the source string. - // \a end must not be in the middle of a UTF-8 character sequence. + // + // When working with views of substrings, \a end must not point to a + // continuation byte in a valid UTF-8 byte sequence to avoid decoding + // errors. 
TfUtf8CodePointIterator( const std::string_view::const_iterator& it, const std::string_view::const_iterator& end) : _it(it), _end(end) {