diff --git a/pxr/base/tf/overview.dox b/pxr/base/tf/overview.dox
index 319aa06fc2..f463d23a00 100644
--- a/pxr/base/tf/overview.dox
+++ b/pxr/base/tf/overview.dox
@@ -94,7 +94,7 @@ The high-level grouping of C++ classes and functions is as follows:
\link group_tf_DebuggingOutput \b Output \b For \b Debugging - \endlink
TfDebug, TF_DEBUG(), TF_FUNC_NAME()
\link group_tf_String \b String \b Utilities - \endlink
- TfStringPrintf(), TfHash, (and a large number of miscellaneous free functions)
+ TfStringPrintf(), TfHash, TfUtf8CodePointView, (and a large number of miscellaneous free functions)
\link group_tf_Containers \b Containers - \endlink
TfByteData, TfArray2, TfArray3, TfTypeInfoMap
\link group_tf_Stl \b STL \b Utilities - \endlink
diff --git a/pxr/base/tf/unicodeUtils.h b/pxr/base/tf/unicodeUtils.h
index 4f9e242be5..7d78e3f2e5 100644
--- a/pxr/base/tf/unicodeUtils.h
+++ b/pxr/base/tf/unicodeUtils.h
@@ -26,6 +26,7 @@
/// \file tf/unicodeUtils.h
/// \ingroup group_tf_String
+/// Definitions of basic UTF-8 utilities in tf.
#include "pxr/pxr.h"
#include "pxr/base/tf/api.h"
@@ -38,11 +39,22 @@
PXR_NAMESPACE_OPEN_SCOPE
-/// Wrapper for a code point value that can be encoded as UTF-8
+/// \class TfUtf8CodePoint
+/// \ingroup group_tf_String
+///
+/// Wrapper for a 32-bit code point value that can be encoded as UTF-8.
+///
+/// \code{.cpp}
+/// // Stream operator overload encodes each code point as UTF-8.
+/// std::stringstream s;
+/// s << TfUtf8CodePoint(8747) << " " << TfUtf8CodePoint(120);
+/// \endcode
+/// A single `TfUtf8CodePoint` may be converted to a string using
+/// `TfStringify` as well.
class TfUtf8CodePoint {
public:
- /// Code points that cannot be decoded or outside of the valid range are
- /// may be replaced with this value.
+ /// Code points that cannot be decoded or are outside of the valid range
+ /// will be replaced with this value.
static constexpr uint32_t ReplacementValue = 0xFFFD;
/// Values higher than this will be replaced with the replacement
@@ -89,31 +101,46 @@ constexpr TfUtf8CodePoint TfUtf8InvalidCodePoint{
class TfUtf8CodePointIterator;
+/// \class TfUtf8CodePointView
+/// \ingroup group_tf_String
+///
/// Wrapper for a UTF-8 encoded `std::string_view` that can be iterated over
/// as code points instead of bytes.
///
-/// Because of the variable length encoding, the `Utf8StringView` iterator is
-/// a ForwardIterator and is read only.
+/// Because of the variable length encoding, the `TfUtf8CodePointView` iterator
+/// is a ForwardIterator and is read only.
///
/// \code{.cpp}
/// std::string value{"∫dx"};
-/// TfUtf8CodePointView view{value};
-/// for (const uint32_t codePoint : view) {
+/// for (const uint32_t codePoint : TfUtf8CodePointView{value}) {
/// if (codePoint == TfUtf8InvalidCodePoint.AsUInt32()) {
/// TF_WARN("String cannot be decoded.");
+/// break;
/// }
/// }
-/// (The TfUtf8CodePointView's sentinel end() will make it compatible with
-/// the STL ranges library).
+/// \endcode
+///
+/// The `TfUtf8CodePointView`'s sentinel `end()` is compatible with
+/// range-based for loops and the forthcoming STL ranges library; it avoids
+/// triplicating the storage for the end iterator. `EndAsIterator()` can be
+/// used for algorithms that require the begin and end iterators to be of the
+/// same type, but it necessarily stores redundant copies of the endpoint.
+///
+/// \code{.cpp}
+/// if (std::any_of(std::cbegin(codePointView), codePointView.EndAsIterator(),
+/// [](const auto c) { return c == TfUtf8InvalidCodePoint.AsUInt32(); }))
+/// {
+/// TF_WARN("String cannot be decoded.");
+/// }
/// \endcode
class TfUtf8CodePointView final {
public:
using const_iterator = TfUtf8CodePointIterator;
- /// Model iteration ending when the underlying string_view's end iterator
+ /// Model iteration ending when the underlying `string_view`'s end iterator
/// has been exceeded. This guards against strings whose variable length
/// encoding pushes the iterator past the end of the underlying
- /// string_view.
+ /// `string_view`.
class PastTheEndSentinel final {};
TfUtf8CodePointView() = default;
@@ -121,8 +148,8 @@ class TfUtf8CodePointView final {
inline const_iterator begin() const;
- /// The sentinel will compare as equal with any iterator at or past the end
- /// of the underlying string_view
+ /// The sentinel will compare as equal with any iterator at the end
+ /// of the underlying `string_view`.
PastTheEndSentinel end() const
{
return PastTheEndSentinel{};
@@ -130,8 +157,8 @@ class TfUtf8CodePointView final {
inline const_iterator cbegin() const;
- /// The out of range sentinel will compare as equal with any iterator
- /// at or past the end of the underlying string_view's
+ /// The sentinel will compare as equal with any iterator at the end
+ /// of the underlying `string_view`.
PastTheEndSentinel cend() const
{
return end();
@@ -143,11 +170,11 @@ class TfUtf8CodePointView final {
return _view.empty();
}
- /// Returns an iterator of the same type as begin that identifies the end
+ /// Returns an iterator of the same type as `begin` that identifies the end
/// of the string.
///
/// As the end iterator is stored three times, this is slightly heavier
- /// than using the PastTheEndSentinel and should be avoided in performance
+ /// than using the `PastTheEndSentinel` and should be avoided in performance
/// critical code paths. It is provided for convenience when an algorithm
/// restricts the iterators to have the same type.
///
@@ -163,10 +190,9 @@ class TfUtf8CodePointView final {
/// code point values.
///
/// UTF-8 is a variable length encoding, meaning that one Unicode
-/// character can be encoded in UTF-8 as 1, 2, 3, or 4 bytes. This
-/// iterator takes care of iterating the necessary characters in a string
-/// and extracing the Unicode code point of each UTF-8 encoded character
-/// in the sequence.
+/// code point can be encoded in UTF-8 as 1, 2, 3, or 4 bytes. This
+/// iterator takes care of consuming the valid UTF-8 bytes for a
+/// code point while incrementing.
class TfUtf8CodePointIterator final {
public:
using iterator_category = std::forward_iterator_tag;
@@ -175,19 +201,14 @@ class TfUtf8CodePointIterator final {
using pointer = void;
using reference = uint32_t;
- /// Retrieves the next UTF-8 character in the sequence as its Unicode
- /// code point value. Returns TfUtf8InvalidCodePoint.AsUInt32() when the
+ /// Retrieves the current UTF-8 character in the sequence as its Unicode
+ /// code point value. Returns `TfUtf8InvalidCodePoint.AsUInt32()` when the
/// byte sequence pointed to by the iterator cannot be decoded.
///
- /// If during read of the UTF-8 character sequence the underlying
- /// string iterator would go beyond \a end defined at construction
- /// time, a std::out_of_range exception will be thrown.
+ /// A code point might be invalid because it's incorrectly encoded, exceeds
+ /// the maximum allowed value, or is in the disallowed surrogate range.
uint32_t operator* () const
{
- // If the current UTF-8 character is invalid, instead of
- // throwing an exception, _GetCodePoint signals this is
- // bad by setting the code point to 0xFFFD (this mostly happens
- // when a high / low private surrogate is used)
return _GetCodePoint();
}
@@ -199,7 +220,8 @@ class TfUtf8CodePointIterator final {
/// Determines if two iterators are equal.
/// This intentionally does not consider the end iterator to allow for
- /// comparison of iterators between substring views.
+ /// comparison of iterators between different substring views of the
+ /// same underlying string.
bool operator== (const TfUtf8CodePointIterator& rhs) const
{
return (this->_it == rhs._it);
@@ -207,7 +229,8 @@ class TfUtf8CodePointIterator final {
/// Determines if two iterators are unequal.
/// This intentionally does not consider the end iterator to allow for
- /// comparison of iterators between substring views.
+ /// comparison of iterators between different substring views of the
+ /// same underlying string.
bool operator!= (const TfUtf8CodePointIterator& rhs) const
{
return (this->_it != rhs._it);
@@ -225,14 +248,14 @@ class TfUtf8CodePointIterator final {
// condition.
TF_DEV_AXIOM(!_IsPastTheEnd());
_EncodingLength increment = _GetEncodingLength();
- // note that in cases where the encoding is invalid, we move to the
- // next byte this is necessary because otherwise the iterator would
- // never advanced and the end condition of == iterator::end() would
+ // Note that in cases where the encoding is invalid, we move to the
+ // next byte. This is necessary because otherwise the iterator would
+ // never advance and the end condition of == iterator::end() would
// never be satisfied. This means that we increment, even if the
// encoding length is 0.
++_it;
// Only continuation bytes will be consumed after the the first byte.
- // This avoid consumption of ASCII characters or other starting bytes.
+ // This avoids consumption of ASCII characters or other starting bytes.
auto isContinuation = [](const char c) {
const auto uc = static_cast(c);
return (uc >= static_cast('\x80')) &&
@@ -258,7 +281,7 @@ class TfUtf8CodePointIterator final {
}
/// Checks if the `lhs` iterator is at or past the end for the
- /// underlying string_view
+ /// underlying `string_view`
friend bool operator==(const TfUtf8CodePointIterator& lhs,
TfUtf8CodePointView::PastTheEndSentinel)
{
@@ -286,7 +309,10 @@ class TfUtf8CodePointIterator final {
// Constructs an iterator that can read UTF-8 character sequences from
// the given starting string_view iterator \a it. \a end is used as a
// guard against reading byte sequences past the end of the source string.
- // \a end must not be in the middle of a UTF-8 character sequence.
+ //
+ // When working with views of substrings, \a end must not point to a
+ // continuation byte in a valid UTF-8 byte sequence to avoid decoding
+ // errors.
TfUtf8CodePointIterator(
const std::string_view::const_iterator& it,
const std::string_view::const_iterator& end) : _it(it), _end(end) {