Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Utf8Decoder operator== handling of the last code point in the input #1316

Merged
merged 2 commits into from
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions include/vcpkg/base/message-data.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -866,10 +866,6 @@ DECLARE_MESSAGE(CommandFailed,
"{command_line}\n"
"failed with the following results:")
DECLARE_MESSAGE(CommunityTriplets, (), "", "Community Triplets:")
DECLARE_MESSAGE(ComparingUtf8Decoders,
(),
"",
"Comparing Utf8Decoders with different provenance; this is always an error")
DECLARE_MESSAGE(CompressFolderFailed, (msg::path), "", "Failed to compress folder \"{path}\":")
DECLARE_MESSAGE(ComputingInstallPlan, (), "", "Computing installation plan...")
DECLARE_MESSAGE(ConfigurationErrorRegistriesWithoutBaseline,
Expand Down Expand Up @@ -1702,7 +1698,6 @@ DECLARE_MESSAGE(IllegalPlatformSpec, (), "", "Platform qualifier is not allowed
DECLARE_MESSAGE(ImproperShaLength, (msg::value), "{value} is a sha.", "SHA512's must be 128 hex characters: {value}")
DECLARE_MESSAGE(IncorrectArchiveFileSignature, (), "", "Incorrect archive file signature")
DECLARE_MESSAGE(IncorrectPESignature, (), "", "Incorrect PE signature")
DECLARE_MESSAGE(IncrementedUtf8Decoder, (), "", "Incremented Utf8Decoder at the end of the string")
DECLARE_MESSAGE(InfoSetEnvVar,
(msg::env_var),
"In this context 'editor' means IDE",
Expand Down
57 changes: 42 additions & 15 deletions include/vcpkg/base/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ namespace vcpkg::Unicode

Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept;

constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast<int>(kind); }
int utf8_code_unit_count(char code_unit) noexcept;

int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept;
Expand Down Expand Up @@ -87,20 +86,20 @@ namespace vcpkg::Unicode

bool utf8_is_valid_string(const char* first, const char* last) noexcept;

constexpr bool utf16_is_leading_surrogate_code_point(char32_t code_point)
// Returns true when `code_point` is a UTF-16 leading surrogate, i.e. lies in [0xD800, 0xDBFF].
constexpr bool utf16_is_leading_surrogate_code_point(char32_t code_point) noexcept
{
    return 0xD800 <= code_point && code_point <= 0xDBFF;
}
constexpr bool utf16_is_trailing_surrogate_code_point(char32_t code_point)
// Returns true when `code_point` is a UTF-16 trailing surrogate, i.e. lies in [0xDC00, 0xDFFF].
constexpr bool utf16_is_trailing_surrogate_code_point(char32_t code_point) noexcept
{
    return 0xDC00 <= code_point && code_point <= 0xDFFF;
}
constexpr bool utf16_is_surrogate_code_point(char32_t code_point)
// Returns true when `code_point` is any UTF-16 surrogate (leading or trailing), i.e. lies in [0xD800, 0xDFFF].
constexpr bool utf16_is_surrogate_code_point(char32_t code_point) noexcept
{
    return 0xD800 <= code_point && code_point <= 0xDFFF;
}

char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing);
char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing) noexcept;

/*
There are two ways to parse utf-8: we could allow unpaired surrogates (as in [wtf-8]) -- this is important
Expand All @@ -119,7 +118,7 @@ namespace vcpkg::Unicode
struct Utf8Decoder
{
constexpr Utf8Decoder() noexcept : current_(end_of_file), next_(nullptr), last_(nullptr) { }
explicit constexpr Utf8Decoder(StringView sv) : Utf8Decoder(sv.begin(), sv.end()) { }
explicit constexpr Utf8Decoder(StringView sv) noexcept : Utf8Decoder(sv.begin(), sv.end()) { }
constexpr Utf8Decoder(const char* first, const char* last) noexcept : current_(0), next_(first), last_(last)
{
if (next_ != last_)
Expand All @@ -131,13 +130,31 @@ namespace vcpkg::Unicode
current_ = end_of_file;
}
}
// Constructs a decoder over the characters of `sv`, writing the outcome of the initial decode
// to `first_decode_error`. Forwards to the (first, last, first_decode_error) constructor.
constexpr Utf8Decoder(StringView sv, utf8_errc& first_decode_error) noexcept
: Utf8Decoder(sv.begin(), sv.end(), first_decode_error)
{
}
constexpr Utf8Decoder(const char* first, const char* last, utf8_errc& first_decode_error) noexcept
: current_(0), next_(first), last_(last)
{
if (next_ != last_)
{
first_decode_error = next();
}
else
{
current_ = end_of_file;
first_decode_error = utf8_errc::NoError;
}
}

// Empty tag type standing for "end of input". It is what end() returns, and the comparison
// operators that take a sentinel reduce to an is_eof() check on the decoder.
struct sentinel
{
};

// True once the decoder's current value is the end_of_file marker.
constexpr inline bool is_eof() const noexcept { return current_ == end_of_file; }

[[nodiscard]] utf8_errc next();
[[nodiscard]] utf8_errc next() noexcept;

Utf8Decoder& operator=(sentinel) noexcept;

Expand All @@ -161,7 +178,24 @@ namespace vcpkg::Unicode

// Returns the end-of-input tag; it carries no state.
constexpr sentinel end() const { return sentinel(); }

friend bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept;
// Equality of two decoders over the same input. Both operands must share the same `last_`;
// anything else is treated as a programming error. Decoders are equal when both their read
// position (`next_`) and their current code point (`current_`) match.
// NOTE(review): presumably `current_` is needed to tell "on the last code point" apart from
// "at end of input", since `next_` may equal `last_` in both states -- confirm against next().
friend constexpr bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
{
    const bool same_provenance = lhs.last_ == rhs.last_;
    if (!same_provenance)
    {
        // comparing decoders of different provenance is always an error
        Checks::unreachable(VCPKG_LINE_INFO);
    }

    return lhs.current_ == rhs.current_ && lhs.next_ == rhs.next_;
}
// Inequality of two decoders: the exact negation of the decoder/decoder operator==.
friend constexpr bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
{
return !(lhs == rhs);
}
// Comparisons against the end-of-input sentinel, in both argument orders. The sentinel carries no
// state, so each of these is decided solely by the decoder's is_eof().
friend constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel) noexcept { return d.is_eof(); }
friend constexpr bool operator==(Utf8Decoder::sentinel s, const Utf8Decoder& d) noexcept { return d == s; }
friend constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel) noexcept { return !d.is_eof(); }
friend constexpr bool operator!=(Utf8Decoder::sentinel s, const Utf8Decoder& d) noexcept { return d != s; }

using difference_type = std::ptrdiff_t;
using value_type = char32_t;
Expand All @@ -174,11 +208,4 @@ namespace vcpkg::Unicode
const char* next_;
const char* last_;
};

inline bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept { return !(lhs == rhs); }

constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel) { return d.is_eof(); }
constexpr bool operator==(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d == s; }
constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel) { return !d.is_eof(); }
constexpr bool operator!=(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d != s; }
}
2 changes: 0 additions & 2 deletions locales/messages.json
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,6 @@
"CommandFailed": "command:\n{command_line}\nfailed with the following results:",
"_CommandFailed.comment": "An example of {command_line} is vcpkg install zlib.",
"CommunityTriplets": "Community Triplets:",
"ComparingUtf8Decoders": "Comparing Utf8Decoders with different provenance; this is always an error",
"CompressFolderFailed": "Failed to compress folder \"{path}\":",
"_CompressFolderFailed.comment": "An example of {path} is /foo/bar.",
"ComputingInstallPlan": "Computing installation plan...",
Expand Down Expand Up @@ -972,7 +971,6 @@
"_ImproperShaLength.comment": "{value} is a sha.",
"IncorrectArchiveFileSignature": "Incorrect archive file signature",
"IncorrectPESignature": "Incorrect PE signature",
"IncrementedUtf8Decoder": "Incremented Utf8Decoder at the end of the string",
"InfoSetEnvVar": "You can also set {env_var} to your editor of choice.",
"_InfoSetEnvVar.comment": "In this context 'editor' means IDE An example of {env_var} is VCPKG_DEFAULT_TRIPLET.",
"InitRegistryFailedNoRepo": "Could not create a registry at {path} because this is not a git repository root.\nUse `git init {command_line}` to create a git repository in this folder.",
Expand Down
6 changes: 3 additions & 3 deletions src/vcpkg-test/ci-baseline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")
{
check_error("hello", R"(test:1:6: error: expected ':' here
on expression: hello
^)");
^)");

check_error("hello\n:", R"(test:1:6: error: expected ':' here
on expression: hello
Expand All @@ -271,7 +271,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")

check_error("x64-windows:", R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-')
on expression: x64-windows:
^)");
^)");

check_error("x64-windows:\nport:x64-windows=skip",
R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-')
Expand All @@ -285,7 +285,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")
// clang-format off
check_error(" \tx64-windows:", R"(test:1:21: error: expected a triplet name here (must be lowercase, digits, '-')
on expression: )" "\t" R"(x64-windows:
)" "\t" R"( ^)");
)" "\t" R"( ^)");
// clang-format on

check_error("port:x64-windows\n=fail", R"(test:1:17: error: expected '=' here
Expand Down
2 changes: 1 addition & 1 deletion src/vcpkg-test/manifests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1267,7 +1267,7 @@ TEST_CASE ("license error messages", "[manifests][license]")
CHECK(messages.error->to_string() ==
R"(<license string>:1:8: error: Expected a license name, found the end of the string.
on expression: MIT AND
^)");
^)");

parse_spdx_license_expression("MIT AND unknownlicense", messages);
CHECK(!messages.error);
Expand Down
106 changes: 106 additions & 0 deletions src/vcpkg-test/unicode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#include <vcpkg-test/util.h>

#include <vcpkg/base/unicode.h>

#include <algorithm>
#include <cstring>
#include <iterator>
#include <string>

using namespace vcpkg::Unicode;

// Walks a Utf8Decoder over well-formed input one code point at a time and checks, at every step,
// that the decoder, utf8_decode_code_point, and utf8_encode_code_point all agree with the expected
// UTF-32 text. Each SECTION supplies one (expected, input) pair; the shared tail below runs once
// per section.
TEST_CASE ("Utf8Decoder valid", "[unicode]")
{
    const char32_t* expected = U"";
    const char* input = "";
    SECTION ("hello")
    {
        expected = U"hello";
        input = "hello";
    }

    SECTION ("all types of code points")
    {
        // one code point for each encoded length: 1, 2, 3, and 4 code units
        expected = U"one: a two: \u00E9 three: \u672C four: \U0001F3C8";
        input = "one: a two: \xC3\xA9 three: \xE6\x9C\xAC four: \xF0\x9F\x8F\x88";
    }

    SECTION ("wtf-8 leading")
    {
        // the leading surrogate (0xD83C) of U+1F3C8, encoded on its own as WTF-8
        static constexpr char32_t storage[] = {0xD83C, 0};
        expected = storage;
        input = "\xED\xA0\xBC";
    }

    SECTION ("wtf-8 trailing")
    {
        // the trailing surrogate (0xDFC8) of U+1F3C8, encoded on its own as WTF-8
        static constexpr char32_t storage[] = {0xDFC8, 0};
        expected = storage;
        input = "\xED\xBF\x88";
    }

    const auto input_end = input + std::strlen(input);
    Utf8Decoder decode(input);
    // number of code points before the terminating 0 in `expected`
    const size_t expected_size = std::char_traits<char32_t>::length(expected);

    // a copy of the decoder advanced past every expected code point; `decode` must not compare
    // equal to it until the loop below has consumed the whole input
    auto decode_at_end = std::next(decode, expected_size);
    for (size_t idx = 0; idx < expected_size; ++idx)
    {
        REQUIRE(decode != decode.end());  // compare sentinel
        REQUIRE(decode != decode_at_end); // compare iterator
        REQUIRE(*decode == expected[idx]);
        REQUIRE(!decode.is_eof());

        // decoding directly from the current position must yield the same code point
        char32_t decoded;
        auto pointer_to_current = decode.pointer_to_current();
        REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded).second == utf8_errc::NoError);
        REQUIRE(decoded == expected[idx]);

        // and re-encoding that code point must reproduce the original input bytes
        char encoded[4];
        auto encoded_size = utf8_encode_code_point(encoded, decoded);
        REQUIRE(std::equal(encoded, encoded + encoded_size, pointer_to_current));
        ++decode;
    }

    REQUIRE(decode == decode.end());
    REQUIRE(decode == decode_at_end);
}

// Constructing a decoder over an empty string through the error-reporting constructor must report
// NoError and leave the decoder at the end, equal both to the sentinel and to itself.
TEST_CASE ("Utf8Decoder first decode empty", "[unicode]")
{
    utf8_errc first_error;
    Utf8Decoder decoder("", first_error);

    REQUIRE(first_error == utf8_errc::NoError);
    REQUIRE(decoder.is_eof());
    REQUIRE(decoder == decoder.end());
    REQUIRE(decoder == decoder);
}

// Runs once per generated input. Every input is expected to make the decoder report an error at
// some point; until then the decoder must not claim to be at the end, and once the error has been
// reported it must be at the end.
TEST_CASE ("Utf8Decoder invalid", "[unicode]")
{
    utf8_errc decode_result;
    // clang-format off
    Utf8Decoder decoder(GENERATE(
        "hello \xFF too big",
        "hello \xC3\xBF\xBF\xBF also too big",
        "hello \x9C continuation",
        "hello \xE0\x28 overlong",
        "hello \xED\xA0\xBC\xED\xBF\x88 paired WTF-8",
        "missing two: \xC3",
        "missing three one: \xE6\x9C",
        "missing three two: \xE6",
        "missing four one: \xF0\x9F\x8F",
        "missing four two: \xF0\x9F",
        "missing four three: \xF0"
    ), decode_result);
    // clang-format on

    // keep stepping for as long as decoding succeeds; the loop exits on the first reported error
    for (; decode_result == utf8_errc::NoError; decode_result = decoder.next())
    {
        REQUIRE(!decoder.is_eof());
    }

    REQUIRE(decoder.is_eof());
}
19 changes: 6 additions & 13 deletions src/vcpkg/base/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ namespace vcpkg::Unicode
}
}

// File-local helper: the code unit count for `kind` is simply the enumerator's numeric value.
static constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast<int>(kind); }

int utf8_code_unit_count(char code_unit) noexcept
{
return utf8_code_unit_count(utf8_code_unit_kind(static_cast<unsigned char>(code_unit)));
Expand Down Expand Up @@ -174,7 +176,7 @@ namespace vcpkg::Unicode
return err == utf8_errc::NoError;
}

char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing)
char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing) noexcept
{
vcpkg::Checks::check_exit(VCPKG_LINE_INFO, utf16_is_leading_surrogate_code_point(leading));
vcpkg::Checks::check_exit(VCPKG_LINE_INFO, utf16_is_trailing_surrogate_code_point(trailing));
Expand Down Expand Up @@ -212,11 +214,12 @@ namespace vcpkg::Unicode
return next_ - count;
}

utf8_errc Utf8Decoder::next()
utf8_errc Utf8Decoder::next() noexcept
{
if (is_eof())
{
vcpkg::Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgIncrementedUtf8Decoder);
// incremented Utf8Decoder at the end of the string
Checks::unreachable(VCPKG_LINE_INFO);
}

if (next_ == last_)
Expand Down Expand Up @@ -262,14 +265,4 @@ namespace vcpkg::Unicode
current_ = end_of_file;
return *this;
}

bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
{
if (lhs.last_ != rhs.last_)
{
Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgComparingUtf8Decoders);
}

return lhs.next_ == rhs.next_;
}
}