Skip to content

Commit

Permalink
Moving holes conditionally; #576
Browse files Browse the repository at this point in the history
  • Loading branch information
the-moisrex committed Dec 30, 2024
1 parent 666ef99 commit c3950ce
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 44 deletions.
12 changes: 12 additions & 0 deletions tests/unicode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6974,4 +6974,16 @@ TEST(Unicode, FuzzFixes2) {
unicode_fuzz("\x0\xda\x0\x3"sv);
}

// Feeds two fixed byte sequences through unicode_fuzz.
// NOTE(review): judging by the test name these are inputs that used to break
// the Unicode routines under fuzzing — confirm against the fuzzer's corpus.
TEST(Unicode, FuzzFixes3) {
    using std::string_view_literals::operator""sv;
    using webpp::tests::unicode_fuzz;

    // Short input.
    constexpr auto short_input = "\xed\x96\x96\xd6\x96"sv;

    // Long input: mostly 0xAA bytes with a handful of other bytes in between.
    constexpr auto long_input =
      "\xaa\xaa\xaa\xaa\xaa\xc8\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
      "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xea\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
      "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\x67\xaa\xaa"
      "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xa"sv;

    unicode_fuzz(short_input);
    unicode_fuzz(long_input);
}

// NOLINTEND(*-magic-numbers, *-pro-bounds-pointer-arithmetic)
7 changes: 4 additions & 3 deletions webpp/unicode/normalization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -614,17 +614,18 @@ namespace webpp::unicode {
if (prev_ccc < ccc && replaced_cp != replacement_char<char32_t>) {
// found a composition of cp1 and cp2
cp1 = replaced_cp;
hole.mark(cp2_pin.iter());
hole.mark_code_point(cp2_pin.iter(), reducer.end());
continue;
}
if (ccc == 0) [[likely]] {
break;
}
prev_ccc = ccc;

utf_range_marker cp2_hole{cp2_pin.iter()};
utf_range_marker<Iter> cp2_hole;
cp2_hole.mark_code_point(cp2_pin.iter(), reducer.end());
++rep_pin;
rep_pin.set(cp2, cp2_hole);
rep_pin.set(cp2, cp2_hole, hole);
rep_pin.fallback_hole(hole, cp2_hole);
}

Expand Down
13 changes: 6 additions & 7 deletions webpp/unicode/unicode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,7 @@ namespace webpp::unicode {

// from glib/gutf8.c
// NOLINTBEGIN(*-avoid-c-arrays)
template <typename CharT = char8_t>
static constexpr CharT utf8_skip[256] = {
static constexpr stl::array<stl::uint8_t, 256> utf8_skip{
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
Expand Down Expand Up @@ -236,7 +235,7 @@ namespace webpp::unicode {

// impl 3:
using unsigned_type = stl::make_unsigned_t<value_type>; // to avoid using "char" warnings
return static_cast<SizeT>(details::utf8_skip<value_type>[static_cast<unsigned_type>(value)]);
return static_cast<SizeT>(details::utf8_skip[static_cast<unsigned_type>(value)]);
} else {
return 1U;
}
Expand Down Expand Up @@ -569,7 +568,7 @@ namespace webpp::unicode {
static_assert(sizeof(char_type) == sizeof(src_char_type),
"Character types need to have the same size.");
if constexpr (UTF8<char_type>) {
auto const size = static_cast<stl::size_t>(details::utf8_skip<src_char_type>[*from]);
auto const size = static_cast<stl::size_t>(details::utf8_skip[*from]);
webpp_assume(size <= 6);
for (stl::size_t index = 0U; index != size; ++index) {
istl::iter_append(ito, *from++);
Expand Down Expand Up @@ -602,7 +601,7 @@ namespace webpp::unicode {
// alternative implementation:
// for (++p; (*p & 0xc0) == 0x80; ++p) ;
using unsigned_type = stl::make_unsigned_t<char_type>;
pos += details::utf8_skip<char_type>[static_cast<unsigned_type>(*pos)];
pos += details::utf8_skip[static_cast<unsigned_type>(*pos)];
} else if constexpr (UTF16<char_type>) {
++pos;
if (!(*pos < trail_surrogate_min<char_type> || *pos > trail_surrogate_max<char_type>) ) {
Expand All @@ -625,8 +624,8 @@ namespace webpp::unicode {
// alternative implementation:
// for (++p; (*p & 0xc0) == 0x80; ++p) ;
using unsigned_type = stl::make_unsigned_t<char_type>;
auto const len = static_cast<difference_type>(
details::utf8_skip<char_type>[static_cast<unsigned_type>(*pos)]);
auto const len =
static_cast<difference_type>(details::utf8_skip[static_cast<unsigned_type>(*pos)]);
if (end - pos < len) {
++pos;
return false;
Expand Down
59 changes: 25 additions & 34 deletions webpp/unicode/utf_reducer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,6 @@ namespace webpp::unicode {
constexpr utf_range_marker& operator=(utf_range_marker&&) = delete;
constexpr ~utf_range_marker() noexcept = default;

/// Build a marker from a single starting position.
///
/// The range starts at `inp_beg`; its end is obtained by advancing the start by
/// whatever required_length_of reports for the unit at `*inp_beg`.
/// This overload receives no end-of-input iterator, so the computed end is not
/// checked against the end of the underlying string.
/// NOTE(review): presumably required_length_of derives the code point's unit
/// count from its leading unit — confirm.
explicit constexpr utf_range_marker(IterT inp_beg) noexcept
: beginp{inp_beg},
// reads beginp, so this initializer relies on beginp having been set first
endp{stl::next(beginp, required_length_of<unit_type, difference_type>(*inp_beg))} {
// the marked range must not be empty
assert(beginp < endp);
}

// NOLINTNEXTLINE(*-easily-swappable-parameters)
explicit constexpr utf_range_marker(IterT inp_beg, IterT inp_end) noexcept
: beginp{inp_beg},
Expand Down Expand Up @@ -71,9 +65,10 @@ namespace webpp::unicode {
}
}

constexpr void mark(IterT inp_beg) noexcept {
beginp = inp_beg;
endp = stl::next(beginp, required_length_of<unit_type, difference_type>(*inp_beg));
/// Mark the code point that starts at `cp_beg` as this marker's range.
/// `str_end` is the end of the whole string (it is only handed to the checked
/// length computation) and is unrelated to where the marked range ends.
constexpr void mark_code_point(IterT const& cp_beg, IterT const& str_end) noexcept {
    // Work out the length first, then move both ends of the range.
    auto const cp_len = checked::code_point_length<IterT, difference_type>(cp_beg, str_end);
    beginp            = cp_beg;
    endp              = stl::next(beginp, cp_len);
    assert(beginp < endp);
}

Expand Down Expand Up @@ -329,14 +324,6 @@ namespace webpp::unicode {
this->append(other, iters);
}

/// Append the range that begins at `start` by forwarding to the marker-based
/// append overload. The range's end is `start` advanced by the value
/// required_length_of reports for the unit at `*start`.
template <typename IterableT>
constexpr void append(IterT start, IterableT& iters) noexcept {
    utf_range_marker cp_range(
      start,
      stl::next(start, required_length_of<unit_type, stl::int_fast8_t>(*start)));
    this->append(cp_range, iters);
}

/// Reset both ends of the marker to a value-initialized iterator.
constexpr void clear() noexcept {
    endp   = IterT{};
    beginp = endp;
}
Expand All @@ -360,6 +347,10 @@ namespace webpp::unicode {
// do nothing
}

/// Accepts any arguments and ignores them; calling it has no effect.
/// NOTE(review): presumably this mirrors the real marker's mark_code_point so
/// callers can use either marker type interchangeably — confirm.
constexpr void mark_code_point([[maybe_unused]] auto&&... args) noexcept {
// do nothing
}

/// Always reports the marker as empty.
[[nodiscard]] constexpr bool empty() const noexcept {
return true;
}
Expand Down Expand Up @@ -452,9 +443,10 @@ namespace webpp::unicode {
auto iter_cpy = istl::deref(iter());
while (count <= cp_len) {
assert(iter_cpy <= reducer->endptr);
auto const cur_len = required_length_of<unit_type, stl::int_fast8_t>(*iter_cpy);
iter_cpy += cur_len;
count += cur_len;
auto const cur_len =
checked::code_point_length<iterator, stl::int_fast8_t>(iter_cpy, reducer->end());
iter_cpy += cur_len;
count += cur_len;
}
webpp_assume(count <= 6);
return count;
Expand Down Expand Up @@ -680,8 +672,11 @@ namespace webpp::unicode {
}

/// Pin Act: Set (and use the hole if needed as extra space)
constexpr void set(value_type inp_code_point, [[maybe_unused]] utf_range_marker<iterator>& hole)
noexcept(is_nothrow) {
template <typename... HoleT>
requires(stl::same_as<HoleT, utf_range_marker<iterator>> && ...)
constexpr void set(value_type inp_code_point,
[[maybe_unused]] utf_range_marker<iterator>& hole,
[[maybe_unused]] HoleT&... holes) noexcept(is_nothrow) {
if constexpr (UTF32<unit_type>) {
*iter() = inp_code_point;
} else {
Expand Down Expand Up @@ -727,7 +722,12 @@ namespace webpp::unicode {
test_state_correctness();
} else {
set_inplace(inp_code_point, cur_len, new_len);
hole.move_mark(static_cast<difference_type>(-old_diff));
if (hole.begin() > iter()) {
hole.move_mark(static_cast<difference_type>(-old_diff));
}
((holes.begin() > iter() &&
(holes.move_mark(static_cast<difference_type>(-old_diff)), true)),
...);
}
}
}
Expand All @@ -736,9 +736,8 @@ namespace webpp::unicode {
constexpr void fallback_hole(utf_range_marker<iterator>& lhs, utf_range_marker<iterator>& rhs)
noexcept(is_nothrow) {
if constexpr (!UTF32<unit_type>) {
if (auto const cur_len = required_length_of<unit_type, size_type>(*iter());
lhs.has_overlaps(iter(), cur_len))
{
auto const cur_len = checked::code_point_length<iterator, size_type>(iter(), reducer->end());
if (lhs.has_overlaps(iter(), cur_len)) {
lhs.mark(stl::move(rhs));
}
}
Expand Down Expand Up @@ -885,14 +884,6 @@ namespace webpp::unicode {
return iters[static_cast<size_type>(index)];
}

/// Position just past the item the pin at `index` refers to: the pin's iterator
/// advanced by the value required_length_of reports for the unit it points at.
/// Only available when the unit type is not UTF-32.
[[nodiscard]] constexpr iterator end_pin_iter(size_type index = 0) noexcept
    requires(!UTF32<unit_type>)
{
    assert(index < PinCount);
    auto const pin_begin = iters[index];
    auto const unit_cnt  = required_length_of<unit_type, difference_type>(*pin_begin);
    return pin_begin + unit_cnt;
}

/// Get all pins in a tuple construct
/// Usage:
/// auto [pin1, pin2, pin3] = reducer.pins();
Expand Down

0 comments on commit c3950ce

Please sign in to comment.