Update documentation for UTF-8 utilities

PixarAnimationStudios · Jan 3, 2024 · a692262 · a692262
1 parent da57a76
commit a692262
Show file tree

Hide file tree

Showing 2 changed files with 65 additions and 39 deletions.
diff --git a/pxr/base/tf/overview.dox b/pxr/base/tf/overview.dox
@@ -94,7 +94,7 @@ The high-level grouping of C++ classes and functions is as follows:
 	<li> \link group_tf_DebuggingOutput \b Output \b For \b Debugging - \endlink 
 		TfDebug,  TF_DEBUG(),  TF_FUNC_NAME() 
 	<li> \link group_tf_String \b String \b Utilities - \endlink 
-		TfStringPrintf(),  TfHash, (and a large number of miscellaneous free functions) 
+		TfStringPrintf(), TfHash, TfUtf8CodePointView, (and a large number of miscellaneous free functions)
 	<li> \link group_tf_Containers \b Containers - \endlink 
 		TfByteData,  TfArray2,  TfArray3, TfTypeInfoMap
 	<li> \link group_tf_Stl \b STL \b Utilities - \endlink 

diff --git a/pxr/base/tf/unicodeUtils.h b/pxr/base/tf/unicodeUtils.h
@@ -26,6 +26,7 @@
 
 /// \file tf/unicodeUtils.h
 /// \ingroup group_tf_String
+/// Definitions of basic UTF-8 utilities in tf.
 
 #include "pxr/pxr.h"
 #include "pxr/base/tf/api.h"
@@ -38,11 +39,22 @@
 
 PXR_NAMESPACE_OPEN_SCOPE
 
-/// Wrapper for a code point value that can be encoded as UTF-8
+/// \class TfUtf8CodePoint
+/// \ingroup group_tf_String
+///
+/// Wrapper for a 32-bit code point value that can be encoded as UTF-8.
+///
+/// \code{.cpp}
+/// // Stream operator overload encodes each code point as UTF-8.
+/// std::stringstream s;
+/// s << TfUtf8CodePoint(8747) << " " << TfUtf8CodePoint(120);
+/// \endcode
+/// A single `TfUtf8CodePoint` may be converted to a string using
+/// `TfStringify` as well.
 class TfUtf8CodePoint {
 public:
-    /// Code points that cannot be decoded or outside of the valid range are
-    /// may be replaced with this value.
+    /// Code points that cannot be decoded or are outside of the valid range
+    /// will be replaced with this value.
     static constexpr uint32_t ReplacementValue = 0xFFFD;
 
     /// Values higher than this will be replaced with the replacement
@@ -89,49 +101,64 @@ constexpr TfUtf8CodePoint TfUtf8InvalidCodePoint{
 
 class TfUtf8CodePointIterator;
 
+/// \class TfUtf8CodePointView
+/// \ingroup group_tf_String
+///
 /// Wrapper for a UTF-8 encoded `std::string_view` that can be iterated over
 /// as code points instead of bytes.
 ///
-/// Because of the variable length encoding, the `Utf8StringView` iterator is
-/// a ForwardIterator and is read only.
+/// Because of the variable length encoding, the `TfUtf8CodePointView` iterator
+/// is a ForwardIterator and is read only.
 ///
 /// \code{.cpp}
 /// std::string value{"∫dx"};
-/// TfUtf8CodePointView view{value};
-/// for (const uint32_t codePoint : view) {
+/// for (const uint32_t codePoint : TfUtf8CodePointView{value}) {
 ///     if (codePoint == TfUtf8InvalidCodePoint.AsUInt32()) {
 ///         TF_WARN("String cannot be decoded.");
+///         break;
 ///     }
 /// }
-/// (The TfUtf8CodePointView's sentinel end() will make it compatible with
-///  the STL ranges library).
+/// \endcode
+///
+/// The `TfUtf8CodePointView`'s sentinel `end()` is compatible with
+/// range-based for loops and the forthcoming STL ranges library; it avoids
+/// triplicating the storage for the end iterator. `EndAsIterator()` can be
+/// used for algorithms that require the begin and end iterators to be of the
+/// same type, but it necessarily stores redundant copies of the endpoint.
+///
+/// \code{.cpp}
+/// if (std::any_of(std::cbegin(codePointView), codePointView.EndAsIterator(),
+///     [](const auto c) { return c == TfUtf8InvalidCodePoint.AsUInt32(); }))
+/// {
+///     TF_WARN("String cannot be decoded");
+/// }
 /// \endcode
 class TfUtf8CodePointView final {
 public:
     using const_iterator = TfUtf8CodePointIterator;
 
-    /// Model iteration ending when the underlying string_view's end iterator
+    /// Model iteration ending when the underlying `string_view`'s end iterator
     /// has been exceeded. This guards against strings whose variable length
     /// encoding pushes the iterator past the end of the underlying
-    /// string_view.
+    /// `string_view`.
     class PastTheEndSentinel final {};
 
     TfUtf8CodePointView() = default;
     explicit TfUtf8CodePointView(const std::string_view& view) : _view(view) {}
 
     inline const_iterator begin() const;
 
-    /// The sentinel will compare as equal with any iterator at or past the end
-    /// of the underlying string_view
+    /// The sentinel will compare as equal with any iterator at the end
+    /// of the underlying `string_view`.
     PastTheEndSentinel end() const
     {
         return PastTheEndSentinel{};
     }
 
     inline const_iterator cbegin() const;
 
-    /// The out of range sentinel will compare as equal with any iterator
-    /// at or past the end of the underlying string_view's
+    /// The sentinel will compare as equal with any iterator at the end
+    /// of the underlying `string_view`.
     PastTheEndSentinel cend() const
     {
         return end();
@@ -143,11 +170,11 @@ class TfUtf8CodePointView final {
         return _view.empty();
     }
 
-    /// Returns an iterator of the same type as begin that identifies the end
+    /// Returns an iterator of the same type as `begin` that identifies the end
     /// of the string.
     ///
     /// As the end iterator is stored three times, this is slightly heavier
-    /// than using the PastTheEndSentinel and should be avoided in performance
+    /// than using the `PastTheEndSentinel` and should be avoided in performance
     /// critical code paths. It is provided for convenience when an algorithm
     /// restricts the iterators to have the same type.
     ///
@@ -163,10 +190,9 @@ class TfUtf8CodePointView final {
 /// code point values.
 ///
 /// UTF-8 is a variable length encoding, meaning that one Unicode
-/// character can be encoded in UTF-8 as 1, 2, 3, or 4 bytes.  This
-/// iterator takes care of iterating the necessary characters in a string
-/// and extracing the Unicode code point of each UTF-8 encoded character
-/// in the sequence.
+/// code point can be encoded in UTF-8 as 1, 2, 3, or 4 bytes.  This
+/// iterator takes care of consuming the valid UTF-8 bytes for a
+/// code point while incrementing.
 class TfUtf8CodePointIterator final {
 public:
     using iterator_category = std::forward_iterator_tag;
@@ -175,19 +201,14 @@ class TfUtf8CodePointIterator final {
     using pointer = void;
     using reference = uint32_t;
 
-    /// Retrieves the next UTF-8 character in the sequence as its Unicode
-    /// code point value. Returns TfUtf8InvalidCodePoint.AsUInt32() when the
+    /// Retrieves the current UTF-8 character in the sequence as its Unicode
+    /// code point value. Returns `TfUtf8InvalidCodePoint.AsUInt32()` when the
     /// byte sequence pointed to by the iterator cannot be decoded.
     ///
-    /// If during read of the UTF-8 character sequence the underlying
-    /// string iterator would go beyond \a end defined at construction
-    /// time, a std::out_of_range exception will be thrown.
+    /// A code point might be invalid because it's incorrectly encoded, exceeds
+    /// the maximum allowed value, or is in the disallowed surrogate range.
     uint32_t operator* () const
     {
-        // If the current UTF-8 character is invalid, instead of
-        // throwing an exception, _GetCodePoint signals this is
-        // bad by setting the code point to 0xFFFD (this mostly happens
-        // when a high / low private surrogate is used)
         return _GetCodePoint();
     }
 
@@ -199,15 +220,17 @@ class TfUtf8CodePointIterator final {
 
     /// Determines if two iterators are equal.
     /// This intentionally does not consider the end iterator to allow for
-    /// comparison of iterators between substring views.
+    /// comparison of iterators between different substring views of the
+    /// same underlying string.
     bool operator== (const TfUtf8CodePointIterator& rhs) const
     {
         return (this->_it == rhs._it);
     }
 
     /// Determines if two iterators are unequal.
     /// This intentionally does not consider the end iterator to allow for
-    /// comparison of iterators between substring views.
+    /// comparison of iterators between different substring views of the
+    /// same underlying string.
     bool operator!= (const TfUtf8CodePointIterator& rhs) const
     {
         return (this->_it != rhs._it);
@@ -225,14 +248,14 @@ class TfUtf8CodePointIterator final {
         // condition.
         TF_DEV_AXIOM(!_IsPastTheEnd());
         _EncodingLength increment = _GetEncodingLength();
-        // note that in cases where the encoding is invalid, we move to the
-        // next byte this is necessary because otherwise the iterator would
-        // never advanced and the end condition of == iterator::end() would
+        // Note that in cases where the encoding is invalid, we move to the
+        // next byte. This is necessary because otherwise the iterator would
+        // never advance and the end condition of == iterator::end() would
         // never be satisfied. This means that we increment, even if the
         // encoding length is 0.
         ++_it;
         // Only continuation bytes will be consumed after the first byte.
-        // This avoid consumption of ASCII characters or other starting bytes.
+        // This avoids consumption of ASCII characters or other starting bytes.
         auto isContinuation = [](const char c) {
             const auto uc = static_cast<unsigned char>(c);
             return (uc >= static_cast<unsigned char>('\x80')) &&
@@ -258,7 +281,7 @@ class TfUtf8CodePointIterator final {
     }
 
     /// Checks if the `lhs` iterator is at or past the end for the
-    /// underlying string_view
+    /// underlying `string_view`
     friend bool operator==(const TfUtf8CodePointIterator& lhs,
                            TfUtf8CodePointView::PastTheEndSentinel)
     {
@@ -286,7 +309,10 @@ class TfUtf8CodePointIterator final {
     // Constructs an iterator that can read UTF-8 character sequences from
     // the given starting string_view iterator \a it. \a end is used as a
     // guard against reading byte sequences past the end of the source string.
-    // \a end must not be in the middle of a UTF-8 character sequence.
+    //
+    // When working with views of substrings, \a end must not point to a
+    // continuation byte in a valid UTF-8 byte sequence to avoid decoding
+    // errors.
     TfUtf8CodePointIterator(
         const std::string_view::const_iterator& it,
         const std::string_view::const_iterator& end) : _it(it), _end(end) {