From 677d801cbbc50382767561f5230955b655e29b2e Mon Sep 17 00:00:00 2001 From: Per Lundberg Date: Thu, 7 Nov 2024 21:49:50 +0000 Subject: [PATCH] (stdlib) Support comparison between `UTF8String` and `ASCIIString` ...as long as the content is ASCII-only in both strings. https://gitlab.perlang.org/perlang/perlang/-/merge_requests/560 --- release-notes/v0.6.0.md | 2 + src/Perlang.Common/TypeReference.cs | 12 ++-- .../Compiler/PerlangCompiler.cs | 2 +- src/stdlib/CMakeLists.txt | 1 + src/stdlib/src/perlang_string.cc | 67 +++++++++++++++---- src/stdlib/src/perlang_string.h | 16 ++--- src/stdlib/src/utf8_string.cc | 23 ++++++- src/stdlib/src/utf8_string.h | 9 +++ src/stdlib/test/perlang_string.cc | 17 +++-- src/stdlib/test/utf8_string.cc | 22 ++++++ 10 files changed, 137 insertions(+), 34 deletions(-) create mode 100644 src/stdlib/test/utf8_string.cc diff --git a/release-notes/v0.6.0.md b/release-notes/v0.6.0.md index ca2831c..96ed6ce 100644 --- a/release-notes/v0.6.0.md +++ b/release-notes/v0.6.0.md @@ -46,6 +46,7 @@ - Support dynamically resizing `StringBuilder` buffer [[!550][550]] - Fix buffer overrun in dynamic resizing of `StringBuilder` [[!551][551]] - Fix memory leak in tests [[!558][558]] +- Support comparison between `UTF8String` and `ASCIIString` [[!560][560]] #### CI - Add `dotnet test` CI job [[!499][499]] @@ -149,3 +150,4 @@ [556]: https://gitlab.perlang.org/perlang/perlang/merge_requests/556 [558]: https://gitlab.perlang.org/perlang/perlang/merge_requests/558 [559]: https://gitlab.perlang.org/perlang/perlang/merge_requests/559 +[560]: https://gitlab.perlang.org/perlang/perlang/merge_requests/560 diff --git a/src/Perlang.Common/TypeReference.cs b/src/Perlang.Common/TypeReference.cs index 4882c60..2466329 100644 --- a/src/Perlang.Common/TypeReference.cs +++ b/src/Perlang.Common/TypeReference.cs @@ -105,15 +105,15 @@ public void SetClrType(Type? value) // These are wrapped in std::shared_ptr<>, as a simple way to deal with ownership for now. For the // long-term solution, see https://gitlab.perlang.org/perlang/perlang/-/issues/378. - var t when t.FullName == "Perlang.Lang.AsciiString" => "std::shared_ptr", - var t when t.FullName == "Perlang.Lang.String" => "std::shared_ptr", - var t when t.FullName == "Perlang.Lang.Utf8String" => "std::shared_ptr", + var t when t.FullName == "Perlang.Lang.AsciiString" => "std::shared_ptr", + var t when t.FullName == "Perlang.Lang.String" => "std::shared_ptr", + var t when t.FullName == "Perlang.Lang.Utf8String" => "std::shared_ptr", // Arrays of reference types. Note how the array types are StringArray for all of these, because it is much // more complex to use different types here. - var t when t.FullName == "Perlang.Lang.String[]" => "std::shared_ptr", - var t when t.FullName == "Perlang.Lang.AsciiString[]" => "std::shared_ptr", - var t when t.FullName == "Perlang.Lang.Utf8String[]" => "std::shared_ptr", + var t when t.FullName == "Perlang.Lang.String[]" => "std::shared_ptr", + var t when t.FullName == "Perlang.Lang.AsciiString[]" => "std::shared_ptr", + var t when t.FullName == "Perlang.Lang.Utf8String[]" => "std::shared_ptr", _ => throw new NotImplementedInCompiledModeException($"Internal error: C++ type for {clrType} not defined") }; diff --git a/src/Perlang.Interpreter/Compiler/PerlangCompiler.cs b/src/Perlang.Interpreter/Compiler/PerlangCompiler.cs index 235d3cd..7a58623 100644 --- a/src/Perlang.Interpreter/Compiler/PerlangCompiler.cs +++ b/src/Perlang.Interpreter/Compiler/PerlangCompiler.cs @@ -1636,7 +1636,7 @@ public object VisitFunctionStmt(Stmt.Function functionStmt) methods[functionStmt.Name.Lexeme] = new Method( functionStmt.Name.Lexeme, functionStmt.Parameters, - $"{(functionStmt.ReturnTypeReference.CppWrapInSharedPtr ? "std::shared_ptr" : "")}", functionContent diff --git a/src/stdlib/CMakeLists.txt b/src/stdlib/CMakeLists.txt index 88741fa..82b2225 100644 --- a/src/stdlib/CMakeLists.txt +++ b/src/stdlib/CMakeLists.txt @@ -159,6 +159,7 @@ if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") test/perlang_string.cc test/print.cc test/string_builder.cc + test/utf8_string.cc ) add_executable( diff --git a/src/stdlib/src/perlang_string.cc b/src/stdlib/src/perlang_string.cc index 232cedc..e9ccaa6 100644 --- a/src/stdlib/src/perlang_string.cc +++ b/src/stdlib/src/perlang_string.cc @@ -6,20 +6,20 @@ namespace perlang { // Would be nice to inline this, but it's hard because of forward declaration of UTF8String in perlang_string.h and // circular dependencies. We'll have to live with this until we can reimplement it in Perlang... :) - bool String::operator==(UTF8String& rhs) const + bool String::operator==(UTF8String& rhs) { return *this == static_cast(&rhs); } - bool String::operator==(String* rhs) const + bool String::operator==(String* rhs) { - const UTF8String* utf8_lhs; - const UTF8String* utf8_rhs; + auto* utf8_lhs = dynamic_cast(this); + auto* utf8_rhs = dynamic_cast(rhs); - const ASCIIString* ascii_lhs; - const ASCIIString* ascii_rhs; + const auto* ascii_lhs = dynamic_cast(this); + const auto* ascii_rhs = dynamic_cast(rhs); - if ((utf8_lhs = dynamic_cast(this)) != nullptr && (utf8_rhs = dynamic_cast(rhs)) != nullptr) { + if (utf8_lhs != nullptr && utf8_rhs != nullptr) { // If the length differs, there's no way the strings can match. if (utf8_lhs->length() != utf8_rhs->length()) { return false; @@ -32,10 +32,10 @@ namespace perlang // memcmp() is an unsafe operation, but since we have verified that both strings have the expected length, // we should be on the safe side by now. The types have been validated to both be UTF8String, so comparing - // the backing byte buffers are guranteed to work. + // the backing byte buffers are guaranteed to work. return memcmp(utf8_lhs->bytes(), utf8_rhs->bytes(), length()) == 0; } - else if ((ascii_lhs = dynamic_cast(this)) != nullptr && (ascii_rhs = dynamic_cast(rhs)) != nullptr) + else if (ascii_lhs != nullptr && ascii_rhs != nullptr) { // If the length differs, there's no way the strings can match. if (ascii_lhs->length() != ascii_rhs->length()) { @@ -49,9 +49,48 @@ namespace perlang // memcmp() is an unsafe operation, but since we have verified that both strings have the expected length, // we should be on the safe side by now. The types have been validated to both be UTF8String, so comparing - // the backing byte buffers are guranteed to work. + // the backing byte buffers are guaranteed to work. return memcmp(ascii_lhs->bytes(), ascii_rhs->bytes(), length()) == 0; } + else if (ascii_lhs != nullptr && utf8_rhs != nullptr) { + // If the UTF8 string contains one or more non-ASCII characters, it's logically impossible for the strings + // to match. + if (!utf8_rhs->is_ascii()) { + return false; + } + + // Also, if the length differs, there's no way the strings can match. + if (ascii_lhs->length() != utf8_rhs->length()) { + return false; + } + + // Pointing at the same backing byte array means the strings are equal. + if (ascii_lhs->bytes() == utf8_rhs->bytes()) { + return true; + } + + // memcmp() is an unsafe operation, but since we have verified that both strings have the expected length, + // we should be on the safe side by now. Both of the types have been validated to contain ASCII-only + // content, so comparing the backing byte buffers are guaranteed to work. + return memcmp(ascii_lhs->bytes(), utf8_rhs->bytes(), length()) == 0; + } + else if (utf8_lhs != nullptr && ascii_rhs != nullptr) { + // Identical to the previous branch, but it's the left-hand side that is a UTF8String instead of the + // right-hand side. + if (!utf8_lhs->is_ascii()) { + return false; + } + + if (utf8_lhs->length() != ascii_rhs->length()) { + return false; + } + + if (utf8_lhs->bytes() == ascii_rhs->bytes()) { + return true; + } + + return memcmp(utf8_lhs->bytes(), ascii_rhs->bytes(), length()) == 0; + } else { // The strings are of different types => consider them unequal. In the future, we want to support // "ASCII-only" strings stored in UTF8String instances (i.e. UTF8Strings with only 7-bit content), and @@ -60,22 +99,22 @@ namespace perlang } } - bool String::operator!=(const String& rhs) const + bool String::operator!=(const String& rhs) { return !(*this == rhs); } - bool String::operator!=(String& rhs) const + bool String::operator!=(String& rhs) { return !(*this == rhs); } - bool String::operator!=(UTF8String& rhs) const + bool String::operator!=(UTF8String& rhs) { return !(*this == rhs); } - bool String::operator!=(String* rhs) const + bool String::operator!=(String* rhs) { return !(*this == rhs); } diff --git a/src/stdlib/src/perlang_string.h b/src/stdlib/src/perlang_string.h index 9f65a18..2bfe0e3 100644 --- a/src/stdlib/src/perlang_string.h +++ b/src/stdlib/src/perlang_string.h @@ -71,40 +71,40 @@ namespace perlang // Compares this string to another string, returning true if they are equal. [[nodiscard]] - inline bool operator==(const String& rhs) const + inline bool operator==(const String& rhs) { return *this == const_cast(rhs); } // Compares this string to another string, returning true if they are equal. [[nodiscard]] - inline bool operator==(String& rhs) const + inline bool operator==(String& rhs) { return *this == &rhs; } // Compares this string to another string, returning true if they are equal. [[nodiscard]] - bool operator==(UTF8String& rhs) const; + bool operator==(UTF8String& rhs); // Compares this string to another string, returning true if they are equal. [[nodiscard]] - bool operator==(String* rhs) const; + bool operator==(String* rhs); // Compares this string to another string, returning true if they are not equal. [[nodiscard]] - bool operator!=(const String& rhs) const; + bool operator!=(const String& rhs); // Compares this string to another string, returning true if they are not equal. [[nodiscard]] - bool operator!=(String& rhs) const; + bool operator!=(String& rhs); // Compares this string to another string, returning true if they are not equal. [[nodiscard]] - bool operator!=(UTF8String& rhs) const; + bool operator!=(UTF8String& rhs); // Compares this string to another string, returning true if they are not equal. [[nodiscard]] - bool operator!=(String* rhs) const; + bool operator!=(String* rhs); }; } diff --git a/src/stdlib/src/utf8_string.cc b/src/stdlib/src/utf8_string.cc index d6ce4be..e328f3c 100644 --- a/src/stdlib/src/utf8_string.cc +++ b/src/stdlib/src/utf8_string.cc @@ -1,6 +1,6 @@ #include -#include #include +#include #include "bigint.h" #include "internal/string_utils.h" @@ -170,6 +170,27 @@ namespace perlang return *this + str; } + bool UTF8String::is_ascii() + { + // Note that this is susceptible to data races; two threads could enter this method simultaneously. However, + // this is considered tolerable. Either one of them will "win" and set the is_ascii_ value accordingly; the data + // is immutable, so they will inevitably end up with the same result anyway. + + if (is_ascii_ != nullptr) + return *is_ascii_.get(); + + for (size_t i = 0; i < length_; i++) { + if ((uint8_t)bytes_[i] > 127) { + is_ascii_ = std::make_unique(false); + return *is_ascii_.get(); + } + } + + // No bytes with bit 7 (value 128) encountered => this is an ASCII string. + is_ascii_ = std::make_unique(true); + return *is_ascii_.get(); + } + std::unique_ptr operator+(const int64_t lhs, const UTF8String& rhs) { std::string str = std::to_string(lhs); diff --git a/src/stdlib/src/utf8_string.h b/src/stdlib/src/utf8_string.h index 6d8a0a3..0e848d7 100644 --- a/src/stdlib/src/utf8_string.h +++ b/src/stdlib/src/utf8_string.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "perlang_string.h" @@ -42,6 +43,10 @@ namespace perlang [[nodiscard]] static std::unique_ptr from_copied_string(const char* str, size_t length); + // Determines if the string is ASCII safe or not. Multiple subsequent calls to this method may return a cached + // result from a previous run. The first call may use a pre-calculated value, but this is not guaranteed by + // this method. + bool is_ascii(); private: // Private constructor for creating a new UTF8String from a C-style (NUL-terminated) string. The `owned` // parameter indicates whether the UTF8String should take ownership of the memory it points to, and thus be @@ -120,6 +125,10 @@ namespace perlang // responsible for deallocating the memory when it is no longer needed. If false, the string is borrowing the // memory from somewhere else, and should not deallocate it. bool owned_; + + // A flag indicating whether this string contains only ASCII characters or not. Characters containing ASCII-only + // can trivially be compared to ASCIIString instances. + std::unique_ptr is_ascii_; }; // Concatenate an int/long+UTF8String. The memory for the new string is allocated from the heap. This is a free diff --git a/src/stdlib/test/perlang_string.cc b/src/stdlib/test/perlang_string.cc index 6e6856c..e1f2f92 100644 --- a/src/stdlib/test/perlang_string.cc +++ b/src/stdlib/test/perlang_string.cc @@ -6,8 +6,17 @@ TEST_CASE( "perlang::String, comparing with String" ) { - std::shared_ptr s1 = perlang::ASCIIString::from_static_string("this is a string"); - std::shared_ptr s2 = perlang::ASCIIString::from_static_string("this is a string"); + std::shared_ptr s1 = perlang::ASCIIString::from_static_string("this is a string"); + std::shared_ptr s2 = perlang::ASCIIString::from_static_string("this is a string"); + + // Assert + REQUIRE(*s1 == *s2); +} + +TEST_CASE( "perlang::String, comparing ASCIIString with equal UTF8String" ) +{ + std::shared_ptr s1 = perlang::ASCIIString::from_static_string("this is an ASCII-clean string"); + std::shared_ptr s2 = perlang::UTF8String::from_static_string("this is an ASCII-clean string"); // Assert REQUIRE(*s1 == *s2); @@ -15,8 +24,8 @@ TEST_CASE( "perlang::String, comparing with String" ) TEST_CASE( "perlang::String, comparing ASCIIString with unequal UTF8String" ) { - std::shared_ptr s1 = perlang::ASCIIString::from_static_string("this is a string"); - std::shared_ptr s2 = perlang::UTF8String::from_static_string("this is a string with non-ASCII characters: åäöÅÄÖéèüÜÿŸïÏすし"); + std::shared_ptr s1 = perlang::ASCIIString::from_static_string("this is an ASCII-clean string"); + std::shared_ptr s2 = perlang::UTF8String::from_static_string("this is a string with non-ASCII characters: åäöÅÄÖéèüÜÿŸïÏすし"); // Assert REQUIRE(*s1 != *s2); diff --git a/src/stdlib/test/utf8_string.cc b/src/stdlib/test/utf8_string.cc new file mode 100644 index 0000000..95a68cc --- /dev/null +++ b/src/stdlib/test/utf8_string.cc @@ -0,0 +1,22 @@ +// utf8_string.cc - tests for the perlang::UTF8String + +#include +#include + +#include "perlang_stdlib.h" + +TEST_CASE( "perlang::UTF8String::is_ascii(), returns true for ASCII-only string" ) +{ + std::shared_ptr s = perlang::UTF8String::from_static_string("this is a an ASCII string"); + + // Assert + REQUIRE(s->is_ascii()); +} + +TEST_CASE( "perlang::UTF8String::is_ascii(), returns true for non-ASCII string" ) +{ + std::shared_ptr s = perlang::UTF8String::from_static_string("this is a string with non-ASCII characters: åäöÅÄÖéèüÜÿŸïÏすし"); + + // Assert + REQUIRE_FALSE(s->is_ascii()); +}