Skip to content

Commit

Permalink
(stdlib) Support comparison between UTF8String and ASCIIString
Browse files Browse the repository at this point in the history
...as long as the content is ASCII-only in both strings.

https://gitlab.perlang.org/perlang/perlang/-/merge_requests/560
  • Loading branch information
perlun committed Nov 7, 2024
1 parent 6a8f278 commit 677d801
Show file tree
Hide file tree
Showing 10 changed files with 137 additions and 34 deletions.
2 changes: 2 additions & 0 deletions release-notes/v0.6.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
- Support dynamically resizing `StringBuilder` buffer [[!550][550]]
- Fix buffer overrun in dynamic resizing of `StringBuilder` [[!551][551]]
- Fix memory leak in tests [[!558][558]]
- Support comparison between `UTF8String` and `ASCIIString` [[!560][560]]

#### CI
- Add `dotnet test` CI job [[!499][499]]
Expand Down Expand Up @@ -149,3 +150,4 @@
[556]: https://gitlab.perlang.org/perlang/perlang/merge_requests/556
[558]: https://gitlab.perlang.org/perlang/perlang/merge_requests/558
[559]: https://gitlab.perlang.org/perlang/perlang/merge_requests/559
[560]: https://gitlab.perlang.org/perlang/perlang/merge_requests/560
12 changes: 6 additions & 6 deletions src/Perlang.Common/TypeReference.cs
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,15 @@ public void SetClrType(Type? value)

// These are wrapped in std::shared_ptr<>, as a simple way to deal with ownership for now. For the
// long-term solution, see https://gitlab.perlang.org/perlang/perlang/-/issues/378.
var t when t.FullName == "Perlang.Lang.AsciiString" => "std::shared_ptr<const perlang::ASCIIString>",
var t when t.FullName == "Perlang.Lang.String" => "std::shared_ptr<const perlang::String>",
var t when t.FullName == "Perlang.Lang.Utf8String" => "std::shared_ptr<const perlang::UTF8String>",
var t when t.FullName == "Perlang.Lang.AsciiString" => "std::shared_ptr<perlang::ASCIIString>",
var t when t.FullName == "Perlang.Lang.String" => "std::shared_ptr<perlang::String>",
var t when t.FullName == "Perlang.Lang.Utf8String" => "std::shared_ptr<perlang::UTF8String>",

// Arrays of reference types. Note how the array types are StringArray for all of these, because it is much
// more complex to use different types here.
var t when t.FullName == "Perlang.Lang.String[]" => "std::shared_ptr<const perlang::StringArray>",
var t when t.FullName == "Perlang.Lang.AsciiString[]" => "std::shared_ptr<const perlang::StringArray>",
var t when t.FullName == "Perlang.Lang.Utf8String[]" => "std::shared_ptr<const perlang::StringArray>",
var t when t.FullName == "Perlang.Lang.String[]" => "std::shared_ptr<perlang::StringArray>",
var t when t.FullName == "Perlang.Lang.AsciiString[]" => "std::shared_ptr<perlang::StringArray>",
var t when t.FullName == "Perlang.Lang.Utf8String[]" => "std::shared_ptr<perlang::StringArray>",

_ => throw new NotImplementedInCompiledModeException($"Internal error: C++ type for {clrType} not defined")
};
Expand Down
2 changes: 1 addition & 1 deletion src/Perlang.Interpreter/Compiler/PerlangCompiler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1636,7 +1636,7 @@ public object VisitFunctionStmt(Stmt.Function functionStmt)
methods[functionStmt.Name.Lexeme] = new Method(
functionStmt.Name.Lexeme,
functionStmt.Parameters,
$"{(functionStmt.ReturnTypeReference.CppWrapInSharedPtr ? "std::shared_ptr<const " : "")}" +
$"{(functionStmt.ReturnTypeReference.CppWrapInSharedPtr ? "std::shared_ptr<" : "")}" +
$"{functionStmt.ReturnTypeReference.CppType}" +
$"{(functionStmt.ReturnTypeReference.CppWrapInSharedPtr ? ">" : "")}",
functionContent
Expand Down
1 change: 1 addition & 0 deletions src/stdlib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
test/perlang_string.cc
test/print.cc
test/string_builder.cc
test/utf8_string.cc
)

add_executable(
Expand Down
67 changes: 53 additions & 14 deletions src/stdlib/src/perlang_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@ namespace perlang
{
// Would be nice to inline this, but it's hard because of forward declaration of UTF8String in perlang_string.h and
// circular dependencies. We'll have to live with this until we can reimplement it in Perlang... :)
bool String::operator==(UTF8String& rhs) const
bool String::operator==(UTF8String& rhs)
{
return *this == static_cast<String*>(&rhs);
}

bool String::operator==(String* rhs) const
bool String::operator==(String* rhs)
{
const UTF8String* utf8_lhs;
const UTF8String* utf8_rhs;
auto* utf8_lhs = dynamic_cast<UTF8String*>(this);
auto* utf8_rhs = dynamic_cast<UTF8String*>(rhs);

const ASCIIString* ascii_lhs;
const ASCIIString* ascii_rhs;
const auto* ascii_lhs = dynamic_cast<const ASCIIString*>(this);
const auto* ascii_rhs = dynamic_cast<ASCIIString*>(rhs);

if ((utf8_lhs = dynamic_cast<const UTF8String*>(this)) != nullptr && (utf8_rhs = dynamic_cast<UTF8String*>(rhs)) != nullptr) {
if (utf8_lhs != nullptr && utf8_rhs != nullptr) {
// If the length differs, there's no way the strings can match.
if (utf8_lhs->length() != utf8_rhs->length()) {
return false;
Expand All @@ -32,10 +32,10 @@ namespace perlang

// memcmp() is an unsafe operation, but since we have verified that both strings have the expected length,
// we should be on the safe side by now. The types have been validated to both be UTF8String, so comparing
// the backing byte buffers are guranteed to work.
// the backing byte buffers are guaranteed to work.
return memcmp(utf8_lhs->bytes(), utf8_rhs->bytes(), length()) == 0;
}
else if ((ascii_lhs = dynamic_cast<const ASCIIString*>(this)) != nullptr && (ascii_rhs = dynamic_cast<ASCIIString*>(rhs)) != nullptr)
else if (ascii_lhs != nullptr && ascii_rhs != nullptr)
{
// If the length differs, there's no way the strings can match.
if (ascii_lhs->length() != ascii_rhs->length()) {
Expand All @@ -49,9 +49,48 @@ namespace perlang

// memcmp() is an unsafe operation, but since we have verified that both strings have the expected length,
// we should be on the safe side by now. The types have been validated to both be UTF8String, so comparing
// the backing byte buffers are guranteed to work.
// the backing byte buffers are guaranteed to work.
return memcmp(ascii_lhs->bytes(), ascii_rhs->bytes(), length()) == 0;
}
else if (ascii_lhs != nullptr && utf8_rhs != nullptr) {
// If the UTF8 string contains one or more non-ASCII characters, it's logically impossible for the strings
// to match.
if (!utf8_rhs->is_ascii()) {
return false;
}

// Also, if the length differs, there's no way the strings can match.
if (ascii_lhs->length() != utf8_rhs->length()) {
return false;
}

// Pointing at the same backing byte array means the strings are equal.
if (ascii_lhs->bytes() == utf8_rhs->bytes()) {
return true;
}

// memcmp() is an unsafe operation, but since we have verified that both strings have the expected length,
// we should be on the safe side by now. Both of the types have been validated to contain ASCII-only
// content, so comparing the backing byte buffers are guaranteed to work.
return memcmp(ascii_lhs->bytes(), utf8_rhs->bytes(), length()) == 0;
}
else if (utf8_lhs != nullptr && ascii_rhs != nullptr) {
// Identical to the previous branch, but it's the left-hand side that is a UTF8String instead of the
// right-hand side.
if (!utf8_lhs->is_ascii()) {
return false;
}

if (utf8_lhs->length() != ascii_rhs->length()) {
return false;
}

if (utf8_lhs->bytes() == ascii_rhs->bytes()) {
return true;
}

return memcmp(utf8_lhs->bytes(), ascii_rhs->bytes(), length()) == 0;
}
else {
// The strings are of different types => consider them unequal. In the future, we want to support
// "ASCII-only" strings stored in UTF8String instances (i.e. UTF8Strings with only 7-bit content), and
Expand All @@ -60,22 +99,22 @@ namespace perlang
}
}

bool String::operator!=(const String& rhs) const
bool String::operator!=(const String& rhs)
{
return !(*this == rhs);
}

bool String::operator!=(String& rhs) const
bool String::operator!=(String& rhs)
{
return !(*this == rhs);
}

bool String::operator!=(UTF8String& rhs) const
bool String::operator!=(UTF8String& rhs)
{
return !(*this == rhs);
}

bool String::operator!=(String* rhs) const
bool String::operator!=(String* rhs)
{
return !(*this == rhs);
}
Expand Down
16 changes: 8 additions & 8 deletions src/stdlib/src/perlang_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,40 +71,40 @@ namespace perlang

// Compares this string to another string, returning true if they are equal.
[[nodiscard]]
inline bool operator==(const String& rhs) const
inline bool operator==(const String& rhs)
{
return *this == const_cast<String&>(rhs);
}

// Compares this string to another string, returning true if they are equal.
[[nodiscard]]
inline bool operator==(String& rhs) const
inline bool operator==(String& rhs)
{
return *this == &rhs;
}

// Compares this string to another string, returning true if they are equal.
[[nodiscard]]
bool operator==(UTF8String& rhs) const;
bool operator==(UTF8String& rhs);

// Compares this string to another string, returning true if they are equal.
[[nodiscard]]
bool operator==(String* rhs) const;
bool operator==(String* rhs);

// Compares this string to another string, returning true if they are not equal.
[[nodiscard]]
bool operator!=(const String& rhs) const;
bool operator!=(const String& rhs);

// Compares this string to another string, returning true if they are not equal.
[[nodiscard]]
bool operator!=(String& rhs) const;
bool operator!=(String& rhs);

// Compares this string to another string, returning true if they are not equal.
[[nodiscard]]
bool operator!=(UTF8String& rhs) const;
bool operator!=(UTF8String& rhs);

// Compares this string to another string, returning true if they are not equal.
[[nodiscard]]
bool operator!=(String* rhs) const;
bool operator!=(String* rhs);
};
}
23 changes: 22 additions & 1 deletion src/stdlib/src/utf8_string.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include <cstring>
#include <stdexcept>
#include <memory>
#include <stdexcept>

#include "bigint.h"
#include "internal/string_utils.h"
Expand Down Expand Up @@ -170,6 +170,27 @@ namespace perlang
return *this + str;
}

bool UTF8String::is_ascii()
{
// Note that this is susceptible to data races; two threads could enter this method simultaneously. However,
// this is considered tolerable. Either one of them will "win" and set the is_ascii_ value accordingly; the data
// is immutable, so they will inevitably end up with the same result anyway.

if (is_ascii_ != nullptr)
return *is_ascii_.get();

for (size_t i = 0; i < length_; i++) {
if ((uint8_t)bytes_[i] > 127) {
is_ascii_ = std::make_unique<bool>(false);
return *is_ascii_.get();
}
}

// No bytes with bit 7 (value 128) encountered => this is an ASCII string.
is_ascii_ = std::make_unique<bool>(true);
return *is_ascii_.get();
}

std::unique_ptr<UTF8String> operator+(const int64_t lhs, const UTF8String& rhs)
{
std::string str = std::to_string(lhs);
Expand Down
9 changes: 9 additions & 0 deletions src/stdlib/src/utf8_string.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include <cstddef>
#include <atomic>

#include "perlang_string.h"

Expand Down Expand Up @@ -42,6 +43,10 @@ namespace perlang
[[nodiscard]]
static std::unique_ptr<UTF8String> from_copied_string(const char* str, size_t length);

// Determines if the string is ASCII safe or not. Multiple subsequent calls to this method may return a cached
// result from a previous run. The first call may use a pre-calculated value, but this is not guaranteed by
// this method.
bool is_ascii();
private:
// Private constructor for creating a new UTF8String from a C-style (NUL-terminated) string. The `owned`
// parameter indicates whether the UTF8String should take ownership of the memory it points to, and thus be
Expand Down Expand Up @@ -120,6 +125,10 @@ namespace perlang
// responsible for deallocating the memory when it is no longer needed. If false, the string is borrowing the
// memory from somewhere else, and should not deallocate it.
bool owned_;

// A flag indicating whether this string contains only ASCII characters or not. Characters containing ASCII-only
// can trivially be compared to ASCIIString instances.
std::unique_ptr<bool> is_ascii_;
};

// Concatenate an int/long+UTF8String. The memory for the new string is allocated from the heap. This is a free
Expand Down
17 changes: 13 additions & 4 deletions src/stdlib/test/perlang_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,26 @@

TEST_CASE( "perlang::String, comparing with String" )
{
std::shared_ptr<const perlang::String> s1 = perlang::ASCIIString::from_static_string("this is a string");
std::shared_ptr<const perlang::String> s2 = perlang::ASCIIString::from_static_string("this is a string");
std::shared_ptr<perlang::String> s1 = perlang::ASCIIString::from_static_string("this is a string");
std::shared_ptr<perlang::String> s2 = perlang::ASCIIString::from_static_string("this is a string");

// Assert
REQUIRE(*s1 == *s2);
}

TEST_CASE( "perlang::String, comparing ASCIIString with equal UTF8String" )
{
std::shared_ptr<perlang::String> s1 = perlang::ASCIIString::from_static_string("this is an ASCII-clean string");
std::shared_ptr<perlang::String> s2 = perlang::UTF8String::from_static_string("this is an ASCII-clean string");

// Assert
REQUIRE(*s1 == *s2);
}

TEST_CASE( "perlang::String, comparing ASCIIString with unequal UTF8String" )
{
std::shared_ptr<const perlang::String> s1 = perlang::ASCIIString::from_static_string("this is a string");
std::shared_ptr<const perlang::String> s2 = perlang::UTF8String::from_static_string("this is a string with non-ASCII characters: åäöÅÄÖéèüÜÿŸïÏすし");
std::shared_ptr<perlang::String> s1 = perlang::ASCIIString::from_static_string("this is an ASCII-clean string");
std::shared_ptr<perlang::String> s2 = perlang::UTF8String::from_static_string("this is a string with non-ASCII characters: åäöÅÄÖéèüÜÿŸïÏすし");

// Assert
REQUIRE(*s1 != *s2);
Expand Down
22 changes: 22 additions & 0 deletions src/stdlib/test/utf8_string.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// utf8_string.cc - tests for the perlang::UTF8String

#include <catch2/catch_test_macros.hpp>
#include <memory>

#include "perlang_stdlib.h"

TEST_CASE( "perlang::UTF8String::is_ascii(), returns true for ASCII-only string" )
{
std::shared_ptr<perlang::UTF8String> s = perlang::UTF8String::from_static_string("this is a an ASCII string");

// Assert
REQUIRE(s->is_ascii());
}

TEST_CASE( "perlang::UTF8String::is_ascii(), returns true for non-ASCII string" )
{
std::shared_ptr<perlang::UTF8String> s = perlang::UTF8String::from_static_string("this is a string with non-ASCII characters: åäöÅÄÖéèüÜÿŸïÏすし");

// Assert
REQUIRE_FALSE(s->is_ascii());
}

0 comments on commit 677d801

Please sign in to comment.