Skip to content

Commit

Permalink
Fix Utf8Decoder operator== handling of the last code point in the inp…
Browse files Browse the repository at this point in the history
…ut (#1316)

* Fix Utf8Decoder operator== handling of the last code point in the input

While working on diagnostics for #1210 I observed that we were printing the caret ^ in the wrong place when it goes after the input.

The way this works is we form the line of text to print, then decode the unicode encoding units, and when we hit the target, we stop and print ^:

https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/parse.cpp#L51-L68

however, if the intended location for the ^ is the "end" of the line, we hit this bug:

https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/unicode.cpp#L273

The iterator only compares the last_ pointers, but both the "points at the last code point in the input" and "points to the end of the input" state set `next_ == last_`. See:

https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/unicode.cpp#L222-L226

This means that the points to the end and points one past the end iterator compare equal, so the loop in parse.cpp stops one position too early.

Also adds a bunch of testing for this specific case, for other parts of Utf8Decoder, adds a way to parse the first code point without failing, makes all the operators 'hidden friends', and removes localized strings for bugs-in-vcpkg-itself.

* Add noexcepts as requested by @Thomas1664
  • Loading branch information
BillyONeal authored Jan 5, 2024
1 parent 5b8f9c4 commit 4e75e1c
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 39 deletions.
5 changes: 0 additions & 5 deletions include/vcpkg/base/message-data.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -866,10 +866,6 @@ DECLARE_MESSAGE(CommandFailed,
"{command_line}\n"
"failed with the following results:")
DECLARE_MESSAGE(CommunityTriplets, (), "", "Community Triplets:")
DECLARE_MESSAGE(ComparingUtf8Decoders,
(),
"",
"Comparing Utf8Decoders with different provenance; this is always an error")
DECLARE_MESSAGE(CompressFolderFailed, (msg::path), "", "Failed to compress folder \"{path}\":")
DECLARE_MESSAGE(ComputingInstallPlan, (), "", "Computing installation plan...")
DECLARE_MESSAGE(ConfigurationErrorRegistriesWithoutBaseline,
Expand Down Expand Up @@ -1702,7 +1698,6 @@ DECLARE_MESSAGE(IllegalPlatformSpec, (), "", "Platform qualifier is not allowed
DECLARE_MESSAGE(ImproperShaLength, (msg::value), "{value} is a sha.", "SHA512's must be 128 hex characters: {value}")
DECLARE_MESSAGE(IncorrectArchiveFileSignature, (), "", "Incorrect archive file signature")
DECLARE_MESSAGE(IncorrectPESignature, (), "", "Incorrect PE signature")
DECLARE_MESSAGE(IncrementedUtf8Decoder, (), "", "Incremented Utf8Decoder at the end of the string")
DECLARE_MESSAGE(InfoSetEnvVar,
(msg::env_var),
"In this context 'editor' means IDE",
Expand Down
57 changes: 42 additions & 15 deletions include/vcpkg/base/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ namespace vcpkg::Unicode

Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept;

constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast<int>(kind); }
int utf8_code_unit_count(char code_unit) noexcept;

int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept;
Expand Down Expand Up @@ -87,20 +86,20 @@ namespace vcpkg::Unicode

bool utf8_is_valid_string(const char* first, const char* last) noexcept;

constexpr bool utf16_is_leading_surrogate_code_point(char32_t code_point)
constexpr bool utf16_is_leading_surrogate_code_point(char32_t code_point) noexcept
{
return code_point >= 0xD800 && code_point < 0xDC00;
}
constexpr bool utf16_is_trailing_surrogate_code_point(char32_t code_point)
constexpr bool utf16_is_trailing_surrogate_code_point(char32_t code_point) noexcept
{
return code_point >= 0xDC00 && code_point < 0xE000;
}
constexpr bool utf16_is_surrogate_code_point(char32_t code_point)
constexpr bool utf16_is_surrogate_code_point(char32_t code_point) noexcept
{
return code_point >= 0xD800 && code_point < 0xE000;
}

char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing);
char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing) noexcept;

/*
There are two ways to parse utf-8: we could allow unpaired surrogates (as in [wtf-8]) -- this is important
Expand All @@ -119,7 +118,7 @@ namespace vcpkg::Unicode
struct Utf8Decoder
{
constexpr Utf8Decoder() noexcept : current_(end_of_file), next_(nullptr), last_(nullptr) { }
explicit constexpr Utf8Decoder(StringView sv) : Utf8Decoder(sv.begin(), sv.end()) { }
explicit constexpr Utf8Decoder(StringView sv) noexcept : Utf8Decoder(sv.begin(), sv.end()) { }
constexpr Utf8Decoder(const char* first, const char* last) noexcept : current_(0), next_(first), last_(last)
{
if (next_ != last_)
Expand All @@ -131,13 +130,31 @@ namespace vcpkg::Unicode
current_ = end_of_file;
}
}
constexpr Utf8Decoder(StringView sv, utf8_errc& first_decode_error) noexcept
: Utf8Decoder(sv.begin(), sv.end(), first_decode_error)
{
}
constexpr Utf8Decoder(const char* first, const char* last, utf8_errc& first_decode_error) noexcept
: current_(0), next_(first), last_(last)
{
if (next_ != last_)
{
first_decode_error = next();
}
else
{
current_ = end_of_file;
first_decode_error = utf8_errc::NoError;
}
}

struct sentinel
{
};

constexpr inline bool is_eof() const noexcept { return current_ == end_of_file; }

[[nodiscard]] utf8_errc next();
[[nodiscard]] utf8_errc next() noexcept;

Utf8Decoder& operator=(sentinel) noexcept;

Expand All @@ -161,7 +178,24 @@ namespace vcpkg::Unicode

constexpr sentinel end() const { return sentinel(); }

friend bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept;
friend constexpr bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
{
if (lhs.last_ != rhs.last_)
{
// comparing decoders of different provenance is always an error
Checks::unreachable(VCPKG_LINE_INFO);
}

return lhs.next_ == rhs.next_ && lhs.current_ == rhs.current_;
}
friend constexpr bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
{
return !(lhs == rhs);
}
friend constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel) noexcept { return d.is_eof(); }
friend constexpr bool operator==(Utf8Decoder::sentinel s, const Utf8Decoder& d) noexcept { return d == s; }
friend constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel) noexcept { return !d.is_eof(); }
friend constexpr bool operator!=(Utf8Decoder::sentinel s, const Utf8Decoder& d) noexcept { return d != s; }

using difference_type = std::ptrdiff_t;
using value_type = char32_t;
Expand All @@ -174,11 +208,4 @@ namespace vcpkg::Unicode
const char* next_;
const char* last_;
};

inline bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept { return !(lhs == rhs); }

constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel) { return d.is_eof(); }
constexpr bool operator==(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d == s; }
constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel) { return !d.is_eof(); }
constexpr bool operator!=(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d != s; }
}
2 changes: 0 additions & 2 deletions locales/messages.json
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,6 @@
"CommandFailed": "command:\n{command_line}\nfailed with the following results:",
"_CommandFailed.comment": "An example of {command_line} is vcpkg install zlib.",
"CommunityTriplets": "Community Triplets:",
"ComparingUtf8Decoders": "Comparing Utf8Decoders with different provenance; this is always an error",
"CompressFolderFailed": "Failed to compress folder \"{path}\":",
"_CompressFolderFailed.comment": "An example of {path} is /foo/bar.",
"ComputingInstallPlan": "Computing installation plan...",
Expand Down Expand Up @@ -972,7 +971,6 @@
"_ImproperShaLength.comment": "{value} is a sha.",
"IncorrectArchiveFileSignature": "Incorrect archive file signature",
"IncorrectPESignature": "Incorrect PE signature",
"IncrementedUtf8Decoder": "Incremented Utf8Decoder at the end of the string",
"InfoSetEnvVar": "You can also set {env_var} to your editor of choice.",
"_InfoSetEnvVar.comment": "In this context 'editor' means IDE An example of {env_var} is VCPKG_DEFAULT_TRIPLET.",
"InitRegistryFailedNoRepo": "Could not create a registry at {path} because this is not a git repository root.\nUse `git init {command_line}` to create a git repository in this folder.",
Expand Down
6 changes: 3 additions & 3 deletions src/vcpkg-test/ci-baseline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")
{
check_error("hello", R"(test:1:6: error: expected ':' here
on expression: hello
^)");
^)");

check_error("hello\n:", R"(test:1:6: error: expected ':' here
on expression: hello
Expand All @@ -271,7 +271,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")

check_error("x64-windows:", R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-')
on expression: x64-windows:
^)");
^)");

check_error("x64-windows:\nport:x64-windows=skip",
R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-')
Expand All @@ -285,7 +285,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")
// clang-format off
check_error(" \tx64-windows:", R"(test:1:21: error: expected a triplet name here (must be lowercase, digits, '-')
on expression: )" "\t" R"(x64-windows:
)" "\t" R"( ^)");
)" "\t" R"( ^)");
// clang-format on

check_error("port:x64-windows\n=fail", R"(test:1:17: error: expected '=' here
Expand Down
2 changes: 1 addition & 1 deletion src/vcpkg-test/manifests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1267,7 +1267,7 @@ TEST_CASE ("license error messages", "[manifests][license]")
CHECK(messages.error->to_string() ==
R"(<license string>:1:8: error: Expected a license name, found the end of the string.
on expression: MIT AND
^)");
^)");

parse_spdx_license_expression("MIT AND unknownlicense", messages);
CHECK(!messages.error);
Expand Down
106 changes: 106 additions & 0 deletions src/vcpkg-test/unicode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#include <vcpkg-test/util.h>

#include <vcpkg/base/unicode.h>

#include <iterator>

using namespace vcpkg::Unicode;

TEST_CASE ("Utf8Decoder valid", "[unicode]")
{
const char32_t* expected = U"";
const char* input = "";
SECTION ("hello")
{
expected = U"hello";
input = "hello";
}

SECTION ("all types of code points")
{
expected = U"one: a two: \u00E9 three: \u672C four: \U0001F3C8";
input = "one: a two: \xC3\xA9 three: \xE6\x9C\xAC four: \xF0\x9F\x8F\x88";
}

SECTION ("wtf-8 leading")
{
// U+1F3C8 as WTF-8
static constexpr char32_t storage[] = {0xD83C, 0};
expected = storage;
input = "\xED\xA0\xBC";
}

SECTION ("wtf-8 trailing")
{
// U+1F3C8 as WTF-8
static constexpr char32_t storage[] = {0xDFC8, 0};
expected = storage;
input = "\xED\xBF\x88";
}

auto input_end = input + strlen(input);
Utf8Decoder decode(input);
// strlen for char32_t:
size_t expected_size = 0;
for (auto* e = expected; *e; ++e)
{
++expected_size;
}

auto decode_at_end = std::next(decode, expected_size);
for (size_t idx = 0; idx < expected_size; ++idx)
{
REQUIRE(decode != decode.end()); // compare sentinel
REQUIRE(decode != decode_at_end); // compare iterator
REQUIRE(*decode == expected[idx]);
REQUIRE(!decode.is_eof());
char32_t decoded;
auto pointer_to_current = decode.pointer_to_current();
REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded).second == utf8_errc::NoError);
REQUIRE(decoded == expected[idx]);
char encoded[4];
auto encoded_size = utf8_encode_code_point(encoded, decoded);
REQUIRE(std::equal(encoded, encoded + encoded_size, pointer_to_current));
++decode;
}

REQUIRE(decode == decode.end());
REQUIRE(decode == decode_at_end);
}

TEST_CASE ("Utf8Decoder first decode empty", "[unicode]")
{
utf8_errc err;
Utf8Decoder uut("", err);
REQUIRE(err == utf8_errc::NoError);
REQUIRE(uut.is_eof());
REQUIRE(uut == uut.end());
REQUIRE(uut == uut);
}

TEST_CASE ("Utf8Decoder invalid", "[unicode]")
{
utf8_errc err;
// clang-format off
Utf8Decoder uut(GENERATE(
"hello \xFF too big",
"hello \xC3\xBF\xBF\xBF also too big",
"hello \x9C continuation",
"hello \xE0\x28 overlong",
"hello \xED\xA0\xBC\xED\xBF\x88 paired WTF-8",
"missing two: \xC3",
"missing three one: \xE6\x9C",
"missing three two: \xE6",
"missing four one: \xF0\x9F\x8F",
"missing four two: \xF0\x9F",
"missing four three: \xF0"
), err);
// clang-format on
while (err == utf8_errc::NoError)
{
REQUIRE(!uut.is_eof());
err = uut.next();
}

REQUIRE(uut.is_eof());
}
19 changes: 6 additions & 13 deletions src/vcpkg/base/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ namespace vcpkg::Unicode
}
}

static constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast<int>(kind); }

int utf8_code_unit_count(char code_unit) noexcept
{
return utf8_code_unit_count(utf8_code_unit_kind(static_cast<unsigned char>(code_unit)));
Expand Down Expand Up @@ -174,7 +176,7 @@ namespace vcpkg::Unicode
return err == utf8_errc::NoError;
}

char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing)
char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing) noexcept
{
vcpkg::Checks::check_exit(VCPKG_LINE_INFO, utf16_is_leading_surrogate_code_point(leading));
vcpkg::Checks::check_exit(VCPKG_LINE_INFO, utf16_is_trailing_surrogate_code_point(trailing));
Expand Down Expand Up @@ -212,11 +214,12 @@ namespace vcpkg::Unicode
return next_ - count;
}

utf8_errc Utf8Decoder::next()
utf8_errc Utf8Decoder::next() noexcept
{
if (is_eof())
{
vcpkg::Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgIncrementedUtf8Decoder);
// incremented Utf8Decoder at the end of the string
Checks::unreachable(VCPKG_LINE_INFO);
}

if (next_ == last_)
Expand Down Expand Up @@ -262,14 +265,4 @@ namespace vcpkg::Unicode
current_ = end_of_file;
return *this;
}

bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
{
if (lhs.last_ != rhs.last_)
{
Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgComparingUtf8Decoders);
}

return lhs.next_ == rhs.next_;
}
}

0 comments on commit 4e75e1c

Please sign in to comment.