From 37b41495d7161cf25daa72ee842d02258bafbbff Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Tue, 10 Sep 2024 23:26:45 +0000 Subject: [PATCH] ICU-22876 C++ UnicodeSet/USet easy item iteration See #3120 --- icu4c/source/common/unicode/uniset.h | 114 ++++- icu4c/source/common/unicode/uset.h | 614 +++++++++++++++++++++++- icu4c/source/common/unicode/uversion.h | 43 ++ icu4c/source/common/uniset.cpp | 102 ++-- icu4c/source/common/uniset_closure.cpp | 10 +- icu4c/source/common/uset.cpp | 21 + icu4c/source/common/usetiter.cpp | 4 +- icu4c/source/test/intltest/usettest.cpp | 232 ++++++++- icu4c/source/test/intltest/usettest.h | 9 + 9 files changed, 1087 insertions(+), 62 deletions(-) diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index de385d18f31e..c45f93d19674 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -313,7 +313,7 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { char16_t *pat = nullptr; int32_t patLen = 0; - UVector* strings = nullptr; // maintained in sorted order + UVector* strings_ = nullptr; // maintained in sorted order UnicodeSetStringSpan *stringSpan = nullptr; /** @@ -1102,6 +1102,118 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { */ UChar32 charAt(int32_t index) const; +#ifndef U_HIDE_DRAFT_API + /** + * Returns a C++ "range" for iterating over the code points of this set. + * + * \code + * UnicodeSet set(u"[abcçカ🚴]", errorCode); + * for (UChar32 c : set.codePoints()) { + * printf("set.codePoint U+%04lx\n", (long)c); + * } + * \endcode + * + * @return a "range" object for iterating over the code points of this set. + * @draft ICU 76 + * @see ranges + * @see strings + * @see begin + * @see end + */ + inline U_HEADER_NESTED_NAMESPACE::USetCodePoints codePoints() const { + return U_HEADER_NESTED_NAMESPACE::USetCodePoints(toUSet()); + } + + /** + * Returns a C++ "range" for iterating over the code point ranges of this set. + * + * \code + * UnicodeSet set(u"[abcçカ🚴]", errorCode); + * for (auto [start, end] : set.ranges()) { + * printf("set.range U+%04lx..U+%04lx\n", (long)start, (long)end); + * } + * for (auto range : set.ranges()) { + * for (UChar32 c : range) { + * printf("set.range.c U+%04lx\n", (long)c); + * } + * } + * \endcode + * + * @return a "range" object for iterating over the code point ranges of this set. + * @draft ICU 76 + * @see codePoints + * @see strings + * @see begin + * @see end + */ + inline U_HEADER_NESTED_NAMESPACE::USetRanges ranges() const { + return U_HEADER_NESTED_NAMESPACE::USetRanges(toUSet()); + } + + /** + * Returns a C++ "range" for iterating over the empty and multi-character strings of this set. + * Returns each string as a std::u16string_view without copying its contents. + * + * \code + * UnicodeSet set(u"[abcçカ🚴{}{abc}{de}]", errorCode); + * for (auto s : set.strings()) { + * UnicodeString us(s); + * std::string u8; + * printf("set.string length %ld \"%s\"\n", (long)s.length(), us.toUTF8String(u8).c_str()); + * } + * \endcode + * + * @return a "range" object for iterating over the strings of this set. + * @draft ICU 76 + * @see codePoints + * @see ranges + * @see begin + * @see end + */ + inline U_HEADER_NESTED_NAMESPACE::USetStrings strings() const { + return U_HEADER_NESTED_NAMESPACE::USetStrings(toUSet()); + } + + /** + * Returns a C++ iterator for iterating over all of the elements of this set. + * Convenient all-in one iteration, but creates a UnicodeString for each + * code point or string. + * (Similar to how Java UnicodeSet *is an* Iterable<String>.) + * + * Code points are returned first, then empty and multi-character strings. + * + * \code + * UnicodeSet set(u"[abcçカ🚴{}{abc}{de}]", errorCode); + * for (auto el : set) { + * std::string u8; + * printf("set.string length %ld \"%s\"\n", (long)el.length(), el.toUTF8String(u8).c_str()); + * } + * \endcode + * + * @return an all-elements iterator. + * @draft ICU 76 + * @see end + * @see codePoints + * @see ranges + * @see strings + */ + inline U_HEADER_NESTED_NAMESPACE::USetElementIterator begin() const { + return U_HEADER_NESTED_NAMESPACE::USetElements(toUSet()).begin(); + } + + /** + * @return an exclusive-end sentinel for iterating over all of the elements of this set. + * @draft ICU 76 + * @see begin + * @see codePoints + * @see ranges + * @see strings + */ + inline U_HEADER_NESTED_NAMESPACE::USetElementIterator end() const { + return U_HEADER_NESTED_NAMESPACE::USetElements(toUSet()).end(); + } +#endif // U_HIDE_DRAFT_API + /** * Adds the specified range to this set if it is not already * present. If this set already contains the specified range, diff --git a/icu4c/source/common/unicode/uset.h b/icu4c/source/common/unicode/uset.h index cce95ce92120..7cf0565f485f 100644 --- a/icu4c/source/common/unicode/uset.h +++ b/icu4c/source/common/unicode/uset.h @@ -33,7 +33,10 @@ #include "unicode/uchar.h" #if U_SHOW_CPLUSPLUS_API +#include +#include "unicode/char16ptr.h" #include "unicode/localpointer.h" +#include "unicode/unistr.h" #endif // U_SHOW_CPLUSPLUS_API #ifndef USET_DEFINED @@ -955,7 +958,7 @@ uset_charAt(const USet* set, int32_t charIndex); /** * Returns the number of characters and strings contained in this set. - * The last (uset_getItemCount() - uset_getRangeCount()) items are strings. + * The last uset_getStringCount() == (uset_getItemCount() - uset_getRangeCount()) items are strings. * * This is slower than uset_getRangeCount() and uset_getItemCount() because * it counts the code points of all ranges. @@ -965,6 +968,8 @@ uset_charAt(const USet* set, int32_t charIndex); * contained in set * @stable ICU 2.4 * @see uset_getRangeCount + * @see uset_getStringCount + * @see uset_getItemCount */ U_CAPI int32_t U_EXPORT2 uset_size(const USet* set); @@ -975,11 +980,42 @@ uset_size(const USet* set); * @stable ICU 70 * @see uset_getItemCount * @see uset_getItem + * @see uset_getStringCount * @see uset_size */ U_CAPI int32_t U_EXPORT2 uset_getRangeCount(const USet *set); +#ifndef U_HIDE_DRAFT_API + +/** + * @param set the set + * @return the number of strings in this set. + * @draft ICU 76 + * @see uset_getRangeCount + * @see uset_getItemCount + * @see uset_size + */ +U_CAPI int32_t U_EXPORT2 +uset_getStringCount(const USet *set); + +/** + * Returns the index-th string (empty or multi-character) in the set. + * The string may not be NUL-terminated. + * The output length must be used, and the caller must not read more than that many UChars. + * + * @param set the set + * @param index the string index, 0 .. uset_getStringCount() - 1 + * @param pLength the output string length; must not be NULL + * @return the pointer to the string; NULL if the index is out of range or pLength is NULL + * @draft ICU 76 + * @see uset_getStringCount + */ +U_CAPI const UChar* U_EXPORT2 +uset_getString(const USet *set, int32_t index, int32_t *pLength); + +#endif // U_HIDE_DRAFT_API + /** * Returns the number of items in this set. An item is either a range * of characters or a single multicharacter string. @@ -987,6 +1023,8 @@ uset_getRangeCount(const USet *set); * @return a non-negative integer counting the character ranges * and/or strings contained in set * @stable ICU 2.4 + * @see uset_getRangeCount + * @see uset_getStringCount */ U_CAPI int32_t U_EXPORT2 uset_getItemCount(const USet* set); @@ -1001,6 +1039,7 @@ uset_getItemCount(const USet* set); * If itemIndex is at least uset_getRangeCount() and less than uset_getItemCount(), then * this function copies the string into str[strCapacity] and * returns the length of the string (0 for the empty string). + * See uset_getString() for a function that does not copy the string contents. * * If itemIndex is out of range, then this function returns -1. * @@ -1018,6 +1057,7 @@ uset_getItemCount(const USet* set); * @return the length of the string (0 or >= 2), or 0 if the item is a range, * or -1 if the itemIndex is out of range * @stable ICU 2.4 + * @see uset_getString */ U_CAPI int32_t U_EXPORT2 uset_getItem(const USet* set, int32_t itemIndex, @@ -1285,4 +1325,574 @@ U_CAPI UBool U_EXPORT2 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, UChar32* pStart, UChar32* pEnd); -#endif +#if U_SHOW_CPLUSPLUS_API +#ifndef U_HIDE_DRAFT_API + +namespace U_HEADER_ONLY_NAMESPACE { + +// Note: Not U_COMMON_API, and not a subclass of UMemory, because this is a header-only class, +// not intended to be used via export from the ICU DLL. + +/** + * Iterator returned by USetCodePoints. + * @draft ICU 76 + */ +class USetCodePointIterator { +public: + /** @draft ICU 76 */ + USetCodePointIterator(const USetCodePointIterator &other) = default; + + /** @draft ICU 76 */ + bool operator==(const USetCodePointIterator &other) const { + // No need to compare rangeCount & end given private constructor + // and assuming we don't compare iterators across the set being modified. + // And comparing rangeIndex is redundant with comparing c. + // We might even skip comparing uset. + // Unless we want operator==() to be "correct" for more than iteration. + return uset == other.uset && c == other.c; + } + + /** @draft ICU 76 */ + bool operator!=(const USetCodePointIterator &other) const { return !operator==(other); } + + /** @draft ICU 76 */ + UChar32 operator*() const { return c; } + + /** + * Pre-increment. + * @draft ICU 76 + */ + USetCodePointIterator &operator++() { + if (c < end) { + ++c; + } else if (rangeIndex < rangeCount) { + UErrorCode errorCode = U_ZERO_ERROR; + int32_t result = uset_getItem(uset, rangeIndex, &c, &end, nullptr, 0, &errorCode); + if (U_SUCCESS(errorCode) && result == 0) { + ++rangeIndex; + } else { + c = end = U_SENTINEL; + } + } else { + c = end = U_SENTINEL; + } + return *this; + } + + /** + * Post-increment. + * @draft ICU 76 + */ + USetCodePointIterator operator++(int) { + USetCodePointIterator result(*this); + operator++(); + return result; + } + +private: + friend class USetCodePoints; + + USetCodePointIterator(const USet *uset, int32_t rangeIndex, int32_t rangeCount) + : uset(uset), rangeIndex(rangeIndex), rangeCount(rangeCount), + c(U_SENTINEL), end(U_SENTINEL) { + // Fetch the first range. + operator++(); + } + + const USet *uset; + int32_t rangeIndex; + int32_t rangeCount; + UChar32 c, end; +}; + +/** + * C++ "range" for iterating over the code points of a USet. + * + * \code + * using U_HEADER_NESTED_NAMESPACE::USetCodePoints; + * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, &errorCode)); + * for (UChar32 c : USetCodePoints(uset.getAlias())) { + * printf("uset.codePoint U+%04lx\n", (long)c); + * } + * \endcode + * + * C++ UnicodeSet has member functions for iteration, including codePoints(). + * + * @draft ICU 76 + * @see USetRanges + * @see USetStrings + * @see USetElements + */ +class USetCodePoints { +public: + /** + * Constructs a C++ "range" object over the code points of the USet. + * @draft ICU 76 + */ + USetCodePoints(const USet *uset) : uset(uset), rangeCount(uset_getRangeCount(uset)) {} + + /** @draft ICU 76 */ + USetCodePoints(const USetCodePoints &other) = default; + + /** @draft ICU 76 */ + USetCodePointIterator begin() const { + return USetCodePointIterator(uset, 0, rangeCount); + } + + /** @draft ICU 76 */ + USetCodePointIterator end() const { + return USetCodePointIterator(uset, rangeCount, rangeCount); + } + +private: + const USet *uset; + int32_t rangeCount; +}; + +/** + * A contiguous range of code points in a USet/UnicodeSet. + * Returned by USetRangeIterator which is returned by USetRanges. + * Both the rangeStart and rangeEnd are in the range. + * (end() returns an iterator corresponding to rangeEnd+1.) + * @draft ICU 76 + */ +struct CodePointRange { + /** @draft ICU 76 */ + struct iterator { + /** @draft ICU 76 */ + iterator(UChar32 c) : c(c) {} + + /** @draft ICU 76 */ + bool operator==(const iterator &other) const { return c == other.c; } + /** @draft ICU 76 */ + bool operator!=(const iterator &other) const { return !operator==(other); } + + /** @draft ICU 76 */ + UChar32 operator*() const { return c; } + + /** + * Pre-increment. + * @draft ICU 76 + */ + iterator &operator++() { + ++c; + return *this; + } + + /** + * Post-increment. + * @draft ICU 76 + */ + iterator operator++(int) { + return c++; + } + + /** + * The current code point in the range. + * @draft ICU 76 + */ + UChar32 c; + }; + + /** @draft ICU 76 */ + CodePointRange(UChar32 start, UChar32 end) : rangeStart(start), rangeEnd(end) {} + /** @draft ICU 76 */ + CodePointRange(const CodePointRange &other) = default; + /** @draft ICU 76 */ + size_t size() const { return (rangeEnd + 1) - rangeStart; } + /** @draft ICU 76 */ + iterator begin() const { return rangeStart; } + /** @draft ICU 76 */ + iterator end() const { return rangeEnd + 1; } + + /** + * Start of a USet/UnicodeSet range of code points. + * @draft ICU 76 + */ + UChar32 rangeStart; + /** + * Inclusive end of a USet/UnicodeSet range of code points. + * @draft ICU 76 + */ + UChar32 rangeEnd; +}; + +/** + * Iterator returned by USetRanges. + * @draft ICU 76 + */ +class USetRangeIterator { +public: + /** @draft ICU 76 */ + USetRangeIterator(const USetRangeIterator &other) = default; + + /** @draft ICU 76 */ + bool operator==(const USetRangeIterator &other) const { + // No need to compare rangeCount given private constructor + // and assuming we don't compare iterators across the set being modified. + // We might even skip comparing uset. + // Unless we want operator==() to be "correct" for more than iteration. + return uset == other.uset && rangeIndex == other.rangeIndex; + } + + /** @draft ICU 76 */ + bool operator!=(const USetRangeIterator &other) const { return !operator==(other); } + + /** @draft ICU 76 */ + CodePointRange operator*() const { + if (rangeIndex < rangeCount) { + UChar32 start, end; + UErrorCode errorCode = U_ZERO_ERROR; + int32_t result = uset_getItem(uset, rangeIndex, &start, &end, nullptr, 0, &errorCode); + if (U_SUCCESS(errorCode) && result == 0) { + return CodePointRange(start, end); + } + } + return CodePointRange(U_SENTINEL, U_SENTINEL); + } + + /** + * Pre-increment. + * @draft ICU 76 + */ + USetRangeIterator &operator++() { + ++rangeIndex; + return *this; + } + + /** + * Post-increment. + * @draft ICU 76 + */ + USetRangeIterator operator++(int) { + USetRangeIterator result(*this); + ++rangeIndex; + return result; + } + +private: + friend class USetRanges; + + USetRangeIterator(const USet *uset, int32_t rangeIndex, int32_t rangeCount) + : uset(uset), rangeIndex(rangeIndex), rangeCount(rangeCount) {} + + const USet *uset; + int32_t rangeIndex; + int32_t rangeCount; +}; + +/** + * C++ "range" for iterating over the code point ranges of a USet. + * + * \code + * using U_HEADER_NESTED_NAMESPACE::USetRanges; + * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, &errorCode)); + * for (auto [start, end] : USetRanges(uset.getAlias())) { + * printf("uset.range U+%04lx..U+%04lx\n", (long)start, (long)end); + * } + * for (auto range : USetRanges(uset.getAlias())) { + * for (UChar32 c : range) { + * printf("uset.range.c U+%04lx\n", (long)c); + * } + * } + * \endcode + * + * C++ UnicodeSet has member functions for iteration, including ranges(). + * + * @draft ICU 76 + * @see USetCodePoints + * @see USetStrings + * @see USetElements + */ +class USetRanges { +public: + /** + * Constructs a C++ "range" object over the code point ranges of the USet. + * @draft ICU 76 + */ + USetRanges(const USet *uset) : uset(uset), rangeCount(uset_getRangeCount(uset)) {} + + /** @draft ICU 76 */ + USetRanges(const USetRanges &other) = default; + + /** @draft ICU 76 */ + USetRangeIterator begin() const { + return USetRangeIterator(uset, 0, rangeCount); + } + + /** @draft ICU 76 */ + USetRangeIterator end() const { + return USetRangeIterator(uset, rangeCount, rangeCount); + } + +private: + const USet *uset; + int32_t rangeCount; +}; + +/** + * Iterator returned by USetStrings. + * @draft ICU 76 + */ +class USetStringIterator { +public: + /** @draft ICU 76 */ + USetStringIterator(const USetStringIterator &other) = default; + + /** @draft ICU 76 */ + bool operator==(const USetStringIterator &other) const { + // No need to compare count given private constructor + // and assuming we don't compare iterators across the set being modified. + // We might even skip comparing uset. + // Unless we want operator==() to be "correct" for more than iteration. + return uset == other.uset && index == other.index; + } + + /** @draft ICU 76 */ + bool operator!=(const USetStringIterator &other) const { return !operator==(other); } + + /** @draft ICU 76 */ + std::u16string_view operator*() const { + if (index < count) { + int32_t length; + const UChar *uchars = uset_getString(uset, index, &length); + // assert uchars != nullptr; + return { ConstChar16Ptr(uchars), (uint32_t)length }; + } + return {}; + } + + /** + * Pre-increment. + * @draft ICU 76 + */ + USetStringIterator &operator++() { + ++index; + return *this; + } + + /** + * Post-increment. + * @draft ICU 76 + */ + USetStringIterator operator++(int) { + USetStringIterator result(*this); + ++index; + return result; + } + +private: + friend class USetStrings; + + USetStringIterator(const USet *uset, int32_t index, int32_t count) + : uset(uset), index(index), count(count) {} + + const USet *uset; + int32_t index; + int32_t count; +}; + +/** + * C++ "range" for iterating over the empty and multi-character strings of a USet. + * + * \code + * using U_HEADER_NESTED_NAMESPACE::USetStrings; + * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, &errorCode)); + * for (auto s : USetStrings(uset.getAlias())) { + * UnicodeString us(s); + * std::string u8; + * printf("uset.string length %ld \"%s\"\n", (long)s.length(), us.toUTF8String(u8).c_str()); + * } + * \endcode + * + * C++ UnicodeSet has member functions for iteration, including strings(). + * + * @draft ICU 76 + * @see USetCodePoints + * @see USetRanges + * @see USetElements + */ +class USetStrings { +public: + /** + * Constructs a C++ "range" object over the strings of the USet. + * @draft ICU 76 + */ + USetStrings(const USet *uset) : uset(uset), count(uset_getStringCount(uset)) {} + + /** @draft ICU 76 */ + USetStrings(const USetStrings &other) = default; + + /** @draft ICU 76 */ + USetStringIterator begin() const { + return USetStringIterator(uset, 0, count); + } + + /** @draft ICU 76 */ + USetStringIterator end() const { + return USetStringIterator(uset, count, count); + } + +private: + const USet *uset; + int32_t count; +}; + +/** + * Iterator returned by USetElements. + * @draft ICU 76 + */ +class USetElementIterator { +public: + /** @draft ICU 76 */ + USetElementIterator(const USetElementIterator &other) = default; + + /** @draft ICU 76 */ + bool operator==(const USetElementIterator &other) const { + // No need to compare rangeCount & end given private constructor + // and assuming we don't compare iterators across the set being modified. + // We might even skip comparing uset. + // Unless we want operator==() to be "correct" for more than iteration. + return uset == other.uset && c == other.c && index == other.index; + } + + /** @draft ICU 76 */ + bool operator!=(const USetElementIterator &other) const { return !operator==(other); } + + /** @draft ICU 76 */ + UnicodeString operator*() const { + if (c >= 0) { + return UnicodeString(c); + } else if (index < totalCount) { + int32_t length; + const UChar *uchars = uset_getString(uset, index - rangeCount, &length); + // assert uchars != nullptr; + return UnicodeString(uchars, length); + } else { + return UnicodeString(); + } + } + + /** + * Pre-increment. + * @draft ICU 76 + */ + USetElementIterator &operator++() { + if (c < end) { + ++c; + } else if (index < rangeCount) { + UErrorCode errorCode = U_ZERO_ERROR; + int32_t result = uset_getItem(uset, index, &c, &end, nullptr, 0, &errorCode); + if (U_SUCCESS(errorCode) && result == 0) { + ++index; + } else { + c = end = U_SENTINEL; + } + } else if (c >= 0) { + // assert index == rangeCount; + // Switch from the last range to the first string. + c = end = U_SENTINEL; + } else { + ++index; + } + return *this; + } + + /** + * Post-increment. + * @draft ICU 76 + */ + USetElementIterator operator++(int) { + USetElementIterator result(*this); + operator++(); + return result; + } + +private: + friend class USetElements; + + USetElementIterator(const USet *uset, int32_t index, int32_t rangeCount, int32_t totalCount) + : uset(uset), index(index), rangeCount(rangeCount), totalCount(totalCount), + c(U_SENTINEL), end(U_SENTINEL) { + if (index < rangeCount) { + // Fetch the first range. + operator++(); + } + // Otherwise don't move beyond the (index - rangeCount)-th string. + } + + const USet *uset; + int32_t index; + /** Number of UnicodeSet/USet code point ranges. */ + int32_t rangeCount; + /** + * Number of code point ranges plus number of strings. + * index starts from 0, counts ranges while less than rangeCount, + * then counts strings while at least rangeCount and less than totalCount. + * + * Note that totalCount is the same as uset_getItemCount(), but usually + * smaller than the number of elements returned by this iterator + * because we return each code point of each range. + */ + int32_t totalCount; + UChar32 c, end; +}; + +/** + * A C++ "range" for iterating over all of the elements of a USet. + * Convenient all-in one iteration, but creates a UnicodeString for each + * code point or string. + * + * Code points are returned first, then empty and multi-character strings. + * + * \code + * using U_HEADER_NESTED_NAMESPACE::USetElements; + * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, &errorCode)); + * for (auto el : USetElements(uset.getAlias())) { + * std::string u8; + * printf("uset.string length %ld \"%s\"\n", (long)el.length(), el.toUTF8String(u8).c_str()); + * } + * \endcode + * + * C++ UnicodeSet has member functions for iteration, including begin() and end(). + * + * @return an all-elements iterator. + * @draft ICU 76 + * @see USetCodePoints + * @see USetRanges + * @see USetStrings + */ +class USetElements { +public: + /** + * Constructs a C++ "range" object over all of the elements of the USet. + * @draft ICU 76 + */ + USetElements(const USet *uset) + : uset(uset), rangeCount(uset_getRangeCount(uset)), + stringCount(uset_getStringCount(uset)) {} + + /** @draft ICU 76 */ + USetElements(const USetElements &other) = default; + + /** @draft ICU 76 */ + USetElementIterator begin() const { + return USetElementIterator(uset, 0, rangeCount, rangeCount + stringCount); + } + + /** @draft ICU 76 */ + USetElementIterator end() const { + return USetElementIterator(uset, rangeCount + stringCount, rangeCount, rangeCount + stringCount); + } + +private: + const USet *uset; + int32_t rangeCount, stringCount; +}; + +} // namespace U_HEADER_ONLY_NAMESPACE + +#endif // U_HIDE_DRAFT_API +#endif // U_SHOW_CPLUSPLUS_API + +#endif // __USET_H__ diff --git a/icu4c/source/common/unicode/uversion.h b/icu4c/source/common/unicode/uversion.h index 113568df8c12..25d73a3aeb54 100644 --- a/icu4c/source/common/unicode/uversion.h +++ b/icu4c/source/common/unicode/uversion.h @@ -124,6 +124,49 @@ typedef uint8_t UVersionInfo[U_MAX_VERSION_LENGTH]; # if U_USING_ICU_NAMESPACE U_NAMESPACE_USE # endif + +#ifndef U_HIDE_DRAFT_API +/** + * \def U_HEADER_NESTED_NAMESPACE + * Nested namespace used inside U_ICU_NAMESPACE for header-only APIs. + * Different when used inside ICU to prevent public use of internal instantiations: + * "header" when compiling calling code; "internal" when compiling ICU library code. + * + * When compiling for Windows, where DLL exports of APIs are explicit, + * this is always "header". Header-only types are not marked for export, + * which on Windows already avoids callers linking with library instantiations. + * + * @draft ICU 76 + * @see U_HEADER_ONLY_NAMESPACE + */ + +/** + * \def U_HEADER_ONLY_NAMESPACE + * Namespace used for header-only APIs. + * Different when used inside ICU to prevent public use of internal instantiations. + * "U_ICU_NAMESPACE::header" or "U_ICU_NAMESPACE::internal", + * see U_HEADER_NESTED_NAMESPACE for details. + * + * @draft ICU 76 + */ + +// The first test is the same as for defining U_EXPORT for Windows. +#if defined(_MSC_VER) || (UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllexport__) && \ + UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllimport__)) +# define U_HEADER_NESTED_NAMESPACE header +#elif defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \ + defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION) || \ + defined(U_LAYOUTEX_IMPLEMENTATION) || defined(U_TOOLUTIL_IMPLEMENTATION) +# define U_HEADER_NESTED_NAMESPACE internal +#else +# define U_HEADER_NESTED_NAMESPACE header +#endif + +#define U_HEADER_ONLY_NAMESPACE U_ICU_NAMESPACE::U_HEADER_NESTED_NAMESPACE + +namespace U_HEADER_ONLY_NAMESPACE {} +#endif // U_HIDE_DRAFT_API + #endif /* __cplusplus */ /*===========================================================================*/ diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp index 6481c8aa1ed9..ef78ce33b195 100644 --- a/icu4c/source/common/uniset.cpp +++ b/icu4c/source/common/uniset.cpp @@ -118,15 +118,15 @@ static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { } UBool UnicodeSet::hasStrings() const { - return strings != nullptr && !strings->isEmpty(); + return strings_ != nullptr && !strings_->isEmpty(); } int32_t UnicodeSet::stringsSize() const { - return strings == nullptr ? 0 : strings->size(); + return strings_ == nullptr ? 0 : strings_->size(); } UBool UnicodeSet::stringsContains(const UnicodeString &s) const { - return strings != nullptr && strings->contains((void*) &s); + return strings_ != nullptr && strings_->contains((void*) &s); } //---------------------------------------------------------------- @@ -171,7 +171,7 @@ UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilte if (o.hasStrings()) { UErrorCode status = U_ZERO_ERROR; if (!allocateStrings(status) || - (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { + (strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) { setToBogus(); return; } @@ -195,7 +195,7 @@ UnicodeSet::~UnicodeSet() { if (buffer != stackList) { uprv_free(buffer); } - delete strings; + delete strings_; delete stringSpan; releasePattern(); } @@ -233,16 +233,16 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { } if (o.hasStrings()) { UErrorCode status = U_ZERO_ERROR; - if ((strings == nullptr && !allocateStrings(status)) || - (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { + if ((strings_ == nullptr && !allocateStrings(status)) || + (strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) { setToBogus(); return *this; } } else if (hasStrings()) { - strings->removeAllElements(); + strings_->removeAllElements(); } if (o.stringSpan != nullptr && !asThawed) { - stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); + stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings_); if (stringSpan == nullptr) { // Check for memory allocation error. setToBogus(); return *this; @@ -284,7 +284,7 @@ bool UnicodeSet::operator==(const UnicodeSet& o) const { if (list[i] != o.list[i]) return false; } if (hasStrings() != o.hasStrings()) { return false; } - if (hasStrings() && *strings != *o.strings) return false; + if (hasStrings() && *strings_ != *o.strings_) return false; return true; } @@ -450,7 +450,7 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const { return false; } } - return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings)); + return !c.hasStrings() || (strings_ != nullptr && strings_->containsAll(*c.strings_)); } /** @@ -495,7 +495,7 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const { return false; } } - return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings); + return strings_ == nullptr || !c.hasStrings() || strings_->containsNone(*c.strings_); } /** @@ -536,8 +536,8 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const { } } if (hasStrings()) { - for (i=0; isize(); ++i) { - const UnicodeString& s = *static_cast(strings->elementAt(i)); + for (i=0; isize(); ++i) { + const UnicodeString& s = *static_cast(strings_->elementAt(i)); if (s.isEmpty()) { continue; // skip the empty string } @@ -586,8 +586,8 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text, // return the longest match. int32_t highWaterLength = 0; - for (i=0; isize(); ++i) { - const UnicodeString& trial = *static_cast(strings->elementAt(i)); + for (i=0; isize(); ++i) { + const UnicodeString& trial = *static_cast(strings_->elementAt(i)); if (trial.isEmpty()) { continue; // skip the empty string } @@ -962,15 +962,15 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) { } /** - * Adds the given string, in order, to 'strings'. The given string - * must have been checked by the caller to not already be in 'strings'. + * Adds the given string, in order, to 'strings_'. The given string + * must have been checked by the caller to not already be in 'strings_'. */ void UnicodeSet::_add(const UnicodeString& s) { if (isFrozen() || isBogus()) { return; } UErrorCode ec = U_ZERO_ERROR; - if (strings == nullptr && !allocateStrings(ec)) { + if (strings_ == nullptr && !allocateStrings(ec)) { setToBogus(); return; } @@ -979,7 +979,7 @@ void UnicodeSet::_add(const UnicodeString& s) { setToBogus(); return; } - strings->sortedInsert(t, compareUnicodeString, ec); + strings_->sortedInsert(t, compareUnicodeString, ec); if (U_FAILURE(ec)) { setToBogus(); } @@ -1058,7 +1058,7 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { UnicodeSet& UnicodeSet::removeAllStrings() { if (!isFrozen() && hasStrings()) { - strings->removeAllElements(); + strings_->removeAllElements(); releasePattern(); } return *this; @@ -1176,7 +1176,7 @@ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { if (isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - if (strings != nullptr && strings->removeElement((void*) &s)) { + if (strings_ != nullptr && strings_->removeElement((void*) &s)) { releasePattern(); } } else { @@ -1248,7 +1248,7 @@ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { int32_t cp = getSingleCP(s); if (cp < 0) { if (stringsContains(s)) { - strings->removeElement((void*) &s); + strings_->removeElement((void*) &s); } else { _add(s); } @@ -1275,9 +1275,9 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { } // Add strings in order - if ( c.strings!=nullptr ) { - for (int32_t i=0; isize(); ++i) { - const UnicodeString* s = static_cast(c.strings->elementAt(i)); + if ( c.strings_!=nullptr ) { + for (int32_t i=0; isize(); ++i) { + const UnicodeString* s = static_cast(c.strings_->elementAt(i)); if (!stringsContains(*s)) { _add(*s); } @@ -1302,9 +1302,9 @@ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { retain(c.list, c.len, 0); if (hasStrings()) { if (!c.hasStrings()) { - strings->removeAllElements(); + strings_->removeAllElements(); } else { - strings->retainAll(*c.strings); + strings_->retainAll(*c.strings_); } } return *this; @@ -1325,7 +1325,7 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { } retain(c.list, c.len, 2); if (hasStrings() && c.hasStrings()) { - strings->removeAll(*c.strings); + strings_->removeAll(*c.strings_); } return *this; } @@ -1344,10 +1344,10 @@ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { } exclusiveOr(c.list, c.len, 0); - if (c.strings != nullptr) { - for (int32_t i=0; isize(); ++i) { - void* e = c.strings->elementAt(i); - if (strings == nullptr || !strings->removeElement(e)) { + if (c.strings_ != nullptr) { + for (int32_t i=0; isize(); ++i) { + void* e = c.strings_->elementAt(i); + if (strings_ == nullptr || !strings_->removeElement(e)) { _add(*static_cast(e)); } } @@ -1366,8 +1366,8 @@ UnicodeSet& UnicodeSet::clear() { list[0] = UNICODESET_HIGH; len = 1; releasePattern(); - if (strings != nullptr) { - strings->removeAllElements(); + if (strings_ != nullptr) { + strings_->removeAllElements(); } // Remove bogus fFlags = 0; @@ -1405,7 +1405,7 @@ UChar32 UnicodeSet::getRangeEnd(int32_t index) const { } const UnicodeString* UnicodeSet::getString(int32_t index) const { - return static_cast(strings->elementAt(index)); + return static_cast(strings_->elementAt(index)); } /** @@ -1439,9 +1439,9 @@ UnicodeSet& UnicodeSet::compact() { // else what the heck happened?! We allocated less memory! // Oh well. We'll keep our original array. } - if (strings != nullptr && strings->isEmpty()) { - delete strings; - strings = nullptr; + if (strings_ != nullptr && strings_->isEmpty()) { + delete strings_; + strings_ = nullptr; } return *this; } @@ -1607,15 +1607,15 @@ UBool UnicodeSet::allocateStrings(UErrorCode &status) { if (U_FAILURE(status)) { return false; } - strings = new UVector(uprv_deleteUObject, + strings_ = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, 1, status); - if (strings == nullptr) { // Check for memory allocation error. + if (strings_ == nullptr) { // Check for memory allocation error. status = U_MEMORY_ALLOCATION_ERROR; return false; } if (U_FAILURE(status)) { - delete strings; - strings = nullptr; + delete strings_; + strings_ = nullptr; return false; } return true; @@ -2131,11 +2131,11 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, } } - if (strings != nullptr) { - for (int32_t i = 0; isize(); ++i) { + if (strings_ != nullptr) { + for (int32_t i = 0; isize(); ++i) { result.append(u'{'); _appendToPat(result, - *static_cast(strings->elementAt(i)), + *static_cast(strings_->elementAt(i)), escapeUnprintable); result.append(u'}'); } @@ -2175,7 +2175,7 @@ UnicodeSet *UnicodeSet::freeze() { // Optimize contains() and span() and similar functions. if (hasStrings()) { - stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); + stringSpan = new UnicodeSetStringSpan(*this, *strings_, UnicodeSetStringSpan::ALL); if (stringSpan == nullptr) { setToBogus(); return this; @@ -2216,7 +2216,7 @@ int32_t UnicodeSet::span(const char16_t *s, int32_t length, USetSpanCondition sp uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : UnicodeSetStringSpan::FWD_UTF16_CONTAINED; - UnicodeSetStringSpan strSpan(*this, *strings, which); + UnicodeSetStringSpan strSpan(*this, *strings_, which); if(strSpan.needsStringSpanUTF16()) { return strSpan.span(s, length, spanCondition); } @@ -2253,7 +2253,7 @@ int32_t UnicodeSet::spanBack(const char16_t *s, int32_t length, USetSpanConditio uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : UnicodeSetStringSpan::BACK_UTF16_CONTAINED; - UnicodeSetStringSpan strSpan(*this, *strings, which); + UnicodeSetStringSpan strSpan(*this, *strings_, which); if(strSpan.needsStringSpanUTF16()) { return strSpan.spanBack(s, length, spanCondition); } @@ -2291,7 +2291,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : UnicodeSetStringSpan::FWD_UTF8_CONTAINED; - UnicodeSetStringSpan strSpan(*this, *strings, which); + UnicodeSetStringSpan strSpan(*this, *strings_, which); if(strSpan.needsStringSpanUTF8()) { return strSpan.spanUTF8(reinterpret_cast(s), length, spanCondition); } @@ -2329,7 +2329,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : UnicodeSetStringSpan::BACK_UTF8_CONTAINED; - UnicodeSetStringSpan strSpan(*this, *strings, which); + UnicodeSetStringSpan strSpan(*this, *strings_, which); if(strSpan.needsStringSpanUTF8()) { return strSpan.spanBackUTF8(reinterpret_cast(s), length, spanCondition); } diff --git a/icu4c/source/common/uniset_closure.cpp b/icu4c/source/common/uniset_closure.cpp index 061ae35717ae..ae777c5facdf 100644 --- a/icu4c/source/common/uniset_closure.cpp +++ b/icu4c/source/common/uniset_closure.cpp @@ -242,7 +242,7 @@ void UnicodeSet::closeOverCaseInsensitive(bool simple) { // therefore, start with no strings and add only those needed. // Do this before processing code points, because they may add strings. if (!simple && foldSet.hasStrings()) { - foldSet.strings->removeAllElements(); + foldSet.strings_->removeAllElements(); } USetAdder sa = { @@ -276,8 +276,8 @@ void UnicodeSet::closeOverCaseInsensitive(bool simple) { } if (hasStrings()) { UnicodeString str; - for (int32_t j=0; jsize(); ++j) { - const UnicodeString* pStr = static_cast(strings->elementAt(j)); + for (int32_t j=0; jsize(); ++j) { + const UnicodeString* pStr = static_cast(strings_->elementAt(j)); if (simple) { if (scfString(*pStr, str)) { foldSet.remove(*pStr).add(str); @@ -334,8 +334,8 @@ void UnicodeSet::closeOverAddCaseMappings() { BreakIterator *bi = BreakIterator::createWordInstance(root, status); if (U_SUCCESS(status)) { #endif - for (int32_t j=0; jsize(); ++j) { - const UnicodeString* pStr = static_cast(strings->elementAt(j)); + for (int32_t j=0; jsize(); ++j) { + const UnicodeString* pStr = static_cast(strings_->elementAt(j)); (str = *pStr).toLower(root); foldSet.add(str); #if !UCONFIG_NO_BREAK_ITERATION diff --git a/icu4c/source/common/uset.cpp b/icu4c/source/common/uset.cpp index b2d0b91d4b6a..8712da8b07fe 100644 --- a/icu4c/source/common/uset.cpp +++ b/icu4c/source/common/uset.cpp @@ -21,6 +21,7 @@ */ #include "unicode/utypes.h" +#include "unicode/char16ptr.h" #include "unicode/uobject.h" #include "unicode/uset.h" #include "unicode/uniset.h" @@ -306,12 +307,32 @@ uset_getRangeCount(const USet *set) { return ((const UnicodeSet *)set)->UnicodeSet::getRangeCount(); } +U_CAPI int32_t U_EXPORT2 +uset_getStringCount(const USet *uset) { + const UnicodeSet &set = *(const UnicodeSet *)uset; + return USetAccess::getStringCount(set); +} + U_CAPI int32_t U_EXPORT2 uset_getItemCount(const USet* uset) { const UnicodeSet& set = *(const UnicodeSet*)uset; return set.getRangeCount() + USetAccess::getStringCount(set); } +U_CAPI const UChar* U_EXPORT2 +uset_getString(const USet *uset, int32_t index, int32_t *pLength) { + if (pLength == nullptr) { return nullptr; } + const UnicodeSet &set = *(const UnicodeSet *)uset; + int32_t count = USetAccess::getStringCount(set); + if (index < 0 || count <= index) { + *pLength = 0; + return nullptr; + } + const UnicodeString *s = USetAccess::getString(set, index); + *pLength = s->length(); + return toUCharPtr(s->getBuffer()); +} + U_CAPI int32_t U_EXPORT2 uset_getItem(const USet* uset, int32_t itemIndex, UChar32* start, UChar32* end, diff --git a/icu4c/source/common/usetiter.cpp b/icu4c/source/common/usetiter.cpp index 703f9d63db63..a79d1a26f100 100644 --- a/icu4c/source/common/usetiter.cpp +++ b/icu4c/source/common/usetiter.cpp @@ -61,7 +61,7 @@ UBool UnicodeSetIterator::next() { if (nextString >= stringCount) return false; codepoint = static_cast(IS_STRING); // signal that value is actually a string - string = static_cast(set->strings->elementAt(nextString++)); + string = static_cast(set->strings_->elementAt(nextString++)); return true; } @@ -94,7 +94,7 @@ UBool UnicodeSetIterator::nextRange() { if (nextString >= stringCount) return false; codepoint = static_cast(IS_STRING); // signal that value is actually a string - string = static_cast(set->strings->elementAt(nextString++)); + string = static_cast(set->strings_->elementAt(nextString++)); return true; } diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index cae7f2b1f097..18320f928fd7 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -12,9 +12,11 @@ */ #include - #include + +#include #include + #include "unicode/utypes.h" #include "usettest.h" #include "unicode/ucnv.h" @@ -104,6 +106,14 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestEmptyString); TESTCASE_AUTO(TestSkipToStrings); TESTCASE_AUTO(TestPatternCodePointComplement); + TESTCASE_AUTO(TestCodePointIterator); + TESTCASE_AUTO(TestUSetCodePointIterator); + TESTCASE_AUTO(TestRangeIterator); + TESTCASE_AUTO(TestUSetRangeIterator); + TESTCASE_AUTO(TestStringIterator); + TESTCASE_AUTO(TestUSetStringIterator); + TESTCASE_AUTO(TestElementIterator); + TESTCASE_AUTO(TestUSetElementIterator); TESTCASE_AUTO_END; } @@ -4259,3 +4269,223 @@ void UnicodeSetTest::TestPatternCodePointComplement() { assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains(U'🚲')); } } + +void UnicodeSetTest::TestCodePointIterator() { + IcuTestErrorCode errorCode(*this, "TestCodePointIterator"); + UnicodeSet set(u"[abcçカ🚴]", errorCode); + UnicodeString result; + for (UChar32 c : set.codePoints()) { + // printf("set.codePoint U+%04lx\n", (long)c); + result.append(u' ').append(c); + } + assertEquals(WHERE, u" a b c ç カ 🚴", result); + + // codePoints() returns USetCodePoints for which explicit APIs are tested via USet. +} + +void UnicodeSetTest::TestUSetCodePointIterator() { + IcuTestErrorCode errorCode(*this, "TestUSetCodePointIterator"); + using U_HEADER_NESTED_NAMESPACE::USetCodePoints; + LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, errorCode)); + UnicodeString result; + for (UChar32 c : USetCodePoints(uset.getAlias())) { + // printf("uset.codePoint U+%04lx\n", (long)c); + result.append(u' ').append(c); + } + assertEquals(WHERE, u" a b c ç カ 🚴", result); + + USetCodePoints range1(uset.getAlias()); + auto range2(range1); // copy constructor + auto iter = range1.begin(); + auto limit = range2.end(); + // operator* with pre- and post-increment + assertEquals(WHERE, u'a', *iter); + ++iter; + assertEquals(WHERE, u'b', *iter); + assertEquals(WHERE, u'c', *++iter); + auto iter2(iter); // copy constructor + assertEquals(WHERE, u'c', *iter2++); + assertEquals(WHERE, u'ç', *iter2++); + assertEquals(WHERE, u'カ', *iter2); + assertTrue(WHERE, ++iter2 != limit); + auto iter3(iter2++); + assertEquals(WHERE, U'🚴', *iter3); + assertTrue(WHERE, iter2 == limit); +} + +void UnicodeSetTest::TestRangeIterator() { + IcuTestErrorCode errorCode(*this, "TestRangeIterator"); + UnicodeSet set(u"[abcçカ🚴]", errorCode); + UnicodeString result; + for (auto [start, end] : set.ranges()) { + // printf("set.range U+%04lx..U+%04lx\n", (long)start, (long)end); + result.append(u' ').append(start).append(u'-').append(end); + } + assertEquals(WHERE, u" a-c ç-ç カ-カ 🚴-🚴", result); + result.remove(); + for (auto range : set.ranges()) { + for (UChar32 c : range) { + // printf("set.range.c U+%04lx\n", (long)c); + result.append(u' ').append(c); + } + result.append(u" |"); + } + assertEquals(WHERE, u" a b c | ç | カ | 🚴 |", result); + + // ranges() returns USetRanges for which explicit APIs are tested via USet. +} + +void UnicodeSetTest::TestUSetRangeIterator() { + IcuTestErrorCode errorCode(*this, "TestUSetRangeIterator"); + using U_HEADER_NESTED_NAMESPACE::USetRanges; + using U_HEADER_NESTED_NAMESPACE::CodePointRange; + LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, errorCode)); + UnicodeString result; + for (auto [start, end] : USetRanges(uset.getAlias())) { + // printf("uset.range U+%04lx..U+%04lx\n", (long)start, (long)end); + result.append(u' ').append(start).append(u'-').append(end); + } + assertEquals(WHERE, u" a-c ç-ç カ-カ 🚴-🚴", result); + result.remove(); + for (auto range : USetRanges(uset.getAlias())) { + for (UChar32 c : range) { + // printf("uset.range.c U+%04lx\n", (long)c); + result.append(u' ').append(c); + } + result.append(u" |"); + } + assertEquals(WHERE, u" a b c | ç | カ | 🚴 |", result); + + USetRanges range1(uset.getAlias()); + auto range2(range1); // copy constructor + auto iter = range1.begin(); + auto limit = range2.end(); + // operator* with pre- and post-increment + { + auto cpRange = *iter; + assertEquals(WHERE, u'a', cpRange.rangeStart); + assertEquals(WHERE, u'c', cpRange.rangeEnd); + assertEquals(WHERE, 3, cpRange.size()); + auto cpRange2(cpRange); + auto cpIter = cpRange.begin(); + auto cpLimit = cpRange2.end(); + assertEquals(WHERE, u'a', *cpIter++); + assertEquals(WHERE, u'b', *cpIter); + assertTrue(WHERE, cpIter != cpLimit); + CodePointRange::iterator cpIter2(u'b'); // public constructor + assertTrue(WHERE, cpIter == cpIter2); + assertEquals(WHERE, u'c', *++cpIter); + assertTrue(WHERE, cpIter != cpIter2); + assertTrue(WHERE, ++cpIter == cpLimit); + } + ++iter; + auto iter2(iter); // copy constructor + assertEquals(WHERE, u'ç', (*iter2).rangeStart); + assertEquals(WHERE, u'ç', (*iter2).rangeEnd); + assertEquals(WHERE, 1, (*iter2).size()); + assertEquals(WHERE, u'ç', (*iter2++).rangeStart); + assertEquals(WHERE, u'カ', (*iter2).rangeStart); + assertTrue(WHERE, ++iter2 != limit); + auto iter3(iter2++); + assertEquals(WHERE, U'🚴', (*iter3).rangeStart); + assertTrue(WHERE, iter2 == limit); + + { + CodePointRange cpRange(u'h', u'k'); // public constructor + // FYI: currently no operator== + assertEquals(WHERE, u'h', cpRange.rangeStart); + assertEquals(WHERE, u'k', cpRange.rangeEnd); + assertEquals(WHERE, 4, cpRange.size()); + assertEquals(WHERE, u'i', *++(cpRange.begin())); + } +} + +void UnicodeSetTest::TestStringIterator() { + IcuTestErrorCode errorCode(*this, "TestStringIterator"); + UnicodeSet set(u"[abcçカ🚴{}{abc}{de}]", errorCode); + UnicodeString result; + for (auto s : set.strings()) { + // UnicodeString us(s); + // std::string u8; + // printf("set.string length %ld \"%s\"\n", (long)s.length(), us.toUTF8String(u8).c_str()); + result.append(u" \"").append(s).append(u'"'); + } + assertEquals(WHERE, uR"( "" "abc" "de")", result); + + // strings() returns USetStrins for which explicit APIs are tested via USet. +} + +void UnicodeSetTest::TestUSetStringIterator() { + IcuTestErrorCode errorCode(*this, "TestUSetStringIterator"); + using U_HEADER_NESTED_NAMESPACE::USetStrings; + LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, errorCode)); + UnicodeString result; + for (auto s : USetStrings(uset.getAlias())) { + // UnicodeString us(s); + // std::string u8; + // printf("uset.string length %ld \"%s\"\n", (long)s.length(), us.toUTF8String(u8).c_str()); + result.append(u" \"").append(s).append(u'"'); + } + assertEquals(WHERE, uR"( "" "abc" "de")", result); + + USetStrings range1(uset.getAlias()); + auto range2(range1); // copy constructor + auto iter = range1.begin(); + auto limit = range2.end(); + // operator* with pre- and post-increment + assertEquals(WHERE, UnicodeString(), UnicodeString(*iter)); + assertEquals(WHERE, u"abc", UnicodeString(*++iter)); + auto iter2(iter); // copy constructor + assertEquals(WHERE, u"abc", UnicodeString(*iter2++)); + assertTrue(WHERE, iter2 != limit); + auto iter3(iter2++); + assertEquals(WHERE, u"de", UnicodeString(*iter3)); + assertTrue(WHERE, iter2 == limit); +} + +void UnicodeSetTest::TestElementIterator() { + IcuTestErrorCode errorCode(*this, "TestElementIterator"); + UnicodeSet set(u"[abcçカ🚴{}{abc}{de}]", errorCode); + UnicodeString result; + for (auto el : set) { + // std::string u8; + // printf("set.string length %ld \"%s\"\n", (long)el.length(), el.toUTF8String(u8).c_str()); + result.append(u" \"").append(el).append(u'"'); + } + assertEquals(WHERE, uR"( "a" "b" "c" "ç" "カ" "🚴" "" "abc" "de")", result); + + // begin() & end() return USetElementIterator for which explicit APIs are tested via USet. +} + +void UnicodeSetTest::TestUSetElementIterator() { + IcuTestErrorCode errorCode(*this, "TestUSetElementIterator"); + using U_HEADER_NESTED_NAMESPACE::USetElements; + LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, errorCode)); + UnicodeString result; + for (auto el : USetElements(uset.getAlias())) { + // std::string u8; + // printf("uset.string length %ld \"%s\"\n", (long)el.length(), el.toUTF8String(u8).c_str()); + result.append(u" \"").append(el).append(u'"'); + } + assertEquals(WHERE, uR"( "a" "b" "c" "ç" "カ" "🚴" "" "abc" "de")", result); + + USetElements range1(uset.getAlias()); + auto range2(range1); // copy constructor + auto iter = range1.begin(); + auto limit = range2.end(); + // operator* with pre- and post-increment + assertEquals(WHERE, u"a", *iter); + ++iter; + assertEquals(WHERE, u"b", *iter); + assertEquals(WHERE, u"c", *++iter); + auto iter2(iter); // copy constructor + assertEquals(WHERE, u"c", *iter2++); + // skip çカ🚴 + ++++++iter2; + assertEquals(WHERE, UnicodeString(), *iter2++); + assertEquals(WHERE, u"abc", *iter2); + assertTrue(WHERE, ++iter2 != limit); + auto iter3(iter2++); + assertEquals(WHERE, u"de", *iter3); + assertTrue(WHERE, iter2 == limit); +} diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index e0974e2d189e..0127042c7365 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -105,6 +105,15 @@ class UnicodeSetTest: public IntlTest { void TestSkipToStrings(); void TestPatternCodePointComplement(); + void TestCodePointIterator(); + void TestUSetCodePointIterator(); + void TestRangeIterator(); + void TestUSetRangeIterator(); + void TestStringIterator(); + void TestUSetStringIterator(); + void TestElementIterator(); + void TestUSetElementIterator(); + private: UBool toPatternAux(UChar32 start, UChar32 end);