Skip to content

Commit

Permalink
Update documentation for UTF-8 utilities
Browse files Browse the repository at this point in the history
  • Loading branch information
nvmkuruc committed Jan 3, 2024
1 parent da57a76 commit a692262
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 39 deletions.
2 changes: 1 addition & 1 deletion pxr/base/tf/overview.dox
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ The high-level grouping of C++ classes and functions is as follows:
<li> \link group_tf_DebuggingOutput \b Output \b For \b Debugging - \endlink
TfDebug, TF_DEBUG(), TF_FUNC_NAME()
<li> \link group_tf_String \b String \b Utilities - \endlink
TfStringPrintf(), TfHash, (and a large number of miscellaneous free functions)
TfStringPrintf(), TfHash, TfUtf8CodePointView, (and a large number of miscellaneous free functions)
<li> \link group_tf_Containers \b Containers - \endlink
TfByteData, TfArray2, TfArray3, TfTypeInfoMap
<li> \link group_tf_Stl \b STL \b Utilities - \endlink
Expand Down
102 changes: 64 additions & 38 deletions pxr/base/tf/unicodeUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

/// \file tf/unicodeUtils.h
/// \ingroup group_tf_String
/// Definitions of basic UTF-8 utilities in tf.

#include "pxr/pxr.h"
#include "pxr/base/tf/api.h"
Expand All @@ -38,11 +39,22 @@

PXR_NAMESPACE_OPEN_SCOPE

/// Wrapper for a code point value that can be encoded as UTF-8
/// \class TfUtf8CodePoint
/// \ingroup group_tf_String
///
/// Wrapper for a 32-bit code point value that can be encoded as UTF-8.
///
/// \code{.cpp}
/// // Stream operator overload encodes each code point as UTF-8.
/// std::stringstream s;
/// s << TfUtf8CodePoint(8747) << " " << TfUtf8CodePoint(120);
/// \endcode
/// A single `TfUtf8CodePoint` may be converted to a string using
/// `TfStringify` as well.
class TfUtf8CodePoint {
public:
/// Code points that cannot be decoded or outside of the valid range are
/// may be replaced with this value.
/// Code points that cannot be decoded or are outside of the valid range
/// will be replaced with this value.
static constexpr uint32_t ReplacementValue = 0xFFFD;

/// Values higher than this will be replaced with the replacement
Expand Down Expand Up @@ -89,49 +101,64 @@ constexpr TfUtf8CodePoint TfUtf8InvalidCodePoint{

class TfUtf8CodePointIterator;

/// \class TfUtf8CodePointView
/// \ingroup group_tf_String
///
/// Wrapper for a UTF-8 encoded `std::string_view` that can be iterated over
/// as code points instead of bytes.
///
/// Because of the variable length encoding, the `Utf8StringView` iterator is
/// a ForwardIterator and is read only.
/// Because of the variable length encoding, the `TfUtf8CodePointView` iterator
/// is a ForwardIterator and is read only.
///
/// \code{.cpp}
/// std::string value{"∫dx"};
/// TfUtf8CodePointView view{value};
/// for (const uint32_t codePoint : view) {
/// for (const uint32_t codePoint : TfUtf8CodePointView{value}) {
/// if (codePoint == TfUtf8InvalidCodePoint.AsUInt32()) {
/// TF_WARN("String cannot be decoded.");
/// break;
/// }
/// }
/// (The TfUtf8CodePointView's sentinel end() will make it compatible with
/// the STL ranges library).
/// \endcode
///
/// The `TfUtf8CodePointView`'s sentinel `end()` is compatible with range
/// based for loops and the forthcoming STL ranges library; it avoids
/// triplicating the storage for the end iterator. `EndAsIterator()`
/// can be used for algorithms that require the begin and end iterators to be
/// of the same type but necessarily stores redundant copies of the endpoint.
///
/// \code{.cpp}
/// if (std::any_of(std::cbegin(codePointView), codePointView.EndAsIterator(),
/// [](const auto c) { return c == TfUtf8InvalidCodePoint.AsUInt32(); }))
/// {
/// TF_WARN("String cannot be decoded");
/// }
/// \endcode
class TfUtf8CodePointView final {
public:
using const_iterator = TfUtf8CodePointIterator;

/// Model iteration ending when the underlying string_view's end iterator
/// Model iteration ending when the underlying `string_view`'s end iterator
/// has been exceeded. This guards against strings whose variable length
/// encoding pushes the iterator past the end of the underlying
/// string_view.
/// `string_view`.
class PastTheEndSentinel final {};

TfUtf8CodePointView() = default;
explicit TfUtf8CodePointView(const std::string_view& view) : _view(view) {}

inline const_iterator begin() const;

/// The sentinel will compare as equal with any iterator at or past the end
/// of the underlying string_view
/// The sentinel will compare as equal with any iterator at the end
/// of the underlying `string_view`
PastTheEndSentinel end() const
{
return PastTheEndSentinel{};
}

inline const_iterator cbegin() const;

/// The out of range sentinel will compare as equal with any iterator
/// at or past the end of the underlying string_view's
/// The sentinel will compare as equal with any iterator at the end
/// of the underlying `string_view`
PastTheEndSentinel cend() const
{
return end();
Expand All @@ -143,11 +170,11 @@ class TfUtf8CodePointView final {
return _view.empty();
}

/// Returns an iterator of the same type as begin that identifies the end
/// Returns an iterator of the same type as `begin` that identifies the end
/// of the string.
///
/// As the end iterator is stored three times, this is slightly heavier
/// than using the PastTheEndSentinel and should be avoided in performance
/// than using the `PastTheEndSentinel` and should be avoided in performance
/// critical code paths. It is provided for convenience when an algorithm
/// restricts the iterators to have the same type.
///
Expand All @@ -163,10 +190,9 @@ class TfUtf8CodePointView final {
/// code point values.
///
/// UTF-8 is a variable length encoding, meaning that one Unicode
/// character can be encoded in UTF-8 as 1, 2, 3, or 4 bytes. This
/// iterator takes care of iterating the necessary characters in a string
/// and extracing the Unicode code point of each UTF-8 encoded character
/// in the sequence.
/// code point can be encoded in UTF-8 as 1, 2, 3, or 4 bytes. This
/// iterator takes care of consuming the valid UTF-8 bytes for a
/// code point while incrementing.
class TfUtf8CodePointIterator final {
public:
using iterator_category = std::forward_iterator_tag;
Expand All @@ -175,19 +201,14 @@ class TfUtf8CodePointIterator final {
using pointer = void;
using reference = uint32_t;

/// Retrieves the next UTF-8 character in the sequence as its Unicode
/// code point value. Returns TfUtf8InvalidCodePoint.AsUInt32() when the
/// Retrieves the current UTF-8 character in the sequence as its Unicode
/// code point value. Returns `TfUtf8InvalidCodePoint.AsUInt32()` when the
/// byte sequence pointed to by the iterator cannot be decoded.
///
/// If during read of the UTF-8 character sequence the underlying
/// string iterator would go beyond \a end defined at construction
/// time, a std::out_of_range exception will be thrown.
/// A code point might be invalid because it's incorrectly encoded, exceeds
/// the maximum allowed value, or is in the disallowed surrogate range.
uint32_t operator* () const
{
// If the current UTF-8 character is invalid, instead of
// throwing an exception, _GetCodePoint signals this is
// bad by setting the code point to 0xFFFD (this mostly happens
// when a high / low private surrogate is used)
return _GetCodePoint();
}

Expand All @@ -199,15 +220,17 @@ class TfUtf8CodePointIterator final {

/// Determines if two iterators are equal.
/// This intentionally does not consider the end iterator to allow for
/// comparison of iterators between substring views.
/// comparison of iterators between different substring views of the
/// same underlying string.
bool operator== (const TfUtf8CodePointIterator& rhs) const
{
return (this->_it == rhs._it);
}

/// Determines if two iterators are unequal.
/// This intentionally does not consider the end iterator to allow for
/// comparison of iterators between substring views.
/// comparison of iterators between different substring views of the
/// same underlying string.
bool operator!= (const TfUtf8CodePointIterator& rhs) const
{
return (this->_it != rhs._it);
Expand All @@ -225,14 +248,14 @@ class TfUtf8CodePointIterator final {
// condition.
TF_DEV_AXIOM(!_IsPastTheEnd());
_EncodingLength increment = _GetEncodingLength();
// note that in cases where the encoding is invalid, we move to the
// next byte this is necessary because otherwise the iterator would
// never advanced and the end condition of == iterator::end() would
// Note that in cases where the encoding is invalid, we move to the
// next byte. This is necessary because otherwise the iterator would
// never advance and the end condition of == iterator::end() would
// never be satisfied. This means that we increment, even if the
// encoding length is 0.
++_it;
// Only continuation bytes will be consumed after the the first byte.
// This avoid consumption of ASCII characters or other starting bytes.
// This avoids consumption of ASCII characters or other starting bytes.
auto isContinuation = [](const char c) {
const auto uc = static_cast<unsigned char>(c);
return (uc >= static_cast<unsigned char>('\x80')) &&
Expand All @@ -258,7 +281,7 @@ class TfUtf8CodePointIterator final {
}

/// Checks if the `lhs` iterator is at or past the end for the
/// underlying string_view
/// underlying `string_view`
friend bool operator==(const TfUtf8CodePointIterator& lhs,
TfUtf8CodePointView::PastTheEndSentinel)
{
Expand Down Expand Up @@ -286,7 +309,10 @@ class TfUtf8CodePointIterator final {
// Constructs an iterator that can read UTF-8 character sequences from
// the given starting string_view iterator \a it. \a end is used as a
// guard against reading byte sequences past the end of the source string.
// \a end must not be in the middle of a UTF-8 character sequence.
//
// When working with views of substrings, \a end must not point to a
// continuation byte in a valid UTF-8 byte sequence to avoid decoding
// errors.
TfUtf8CodePointIterator(
const std::string_view::const_iterator& it,
const std::string_view::const_iterator& end) : _it(it), _end(end) {
Expand Down

0 comments on commit a692262

Please sign in to comment.