Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

src: improve utf8 string generation performance #54873

Merged
merged 4 commits into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 22 additions & 18 deletions src/string_bytes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -419,47 +419,47 @@ Maybe<size_t> StringBytes::StorageSize(Isolate* isolate,
Local<Value> val,
enum encoding encoding) {
HandleScope scope(isolate);
size_t data_size = 0;
bool is_buffer = Buffer::HasInstance(val);

if (is_buffer && (encoding == BUFFER || encoding == LATIN1)) {
if (Buffer::HasInstance(val) && (encoding == BUFFER || encoding == LATIN1)) {
return Just(Buffer::Length(val));
}

Local<String> str;
if (!val->ToString(isolate->GetCurrentContext()).ToLocal(&str))
return Nothing<size_t>();
String::ValueView view(isolate, str);
size_t data_size = 0;

switch (encoding) {
case ASCII:
case LATIN1:
data_size = str->Length();
data_size = view.length();
break;

case BUFFER:
case UTF8:
// A single UTF-16 code unit (UCS-2 code point) never takes up more than
// 3 UTF-8 bytes, so 3 * length is a safe upper bound on the encoded size.
// It is an exercise for the caller to decide when a string is
// long enough to justify calling Size() instead of StorageSize().
data_size = 3 * str->Length();
data_size = 3 * view.length();
break;

case UCS2:
data_size = str->Length() * sizeof(uint16_t);
data_size = view.length() * sizeof(uint16_t);
break;

case BASE64URL:
data_size = simdutf::base64_length_from_binary(str->Length(),
data_size = simdutf::base64_length_from_binary(view.length(),
simdutf::base64_url);
break;

case BASE64:
data_size = simdutf::base64_length_from_binary(str->Length());
data_size = simdutf::base64_length_from_binary(view.length());
break;

case HEX:
CHECK(str->Length() % 2 == 0 && "invalid hex string length");
data_size = str->Length() / 2;
CHECK(view.length() % 2 == 0 && "invalid hex string length");
data_size = view.length() / 2;
break;

default:
Expand All @@ -480,32 +480,36 @@ Maybe<size_t> StringBytes::Size(Isolate* isolate,
Local<String> str;
if (!val->ToString(isolate->GetCurrentContext()).ToLocal(&str))
return Nothing<size_t>();
String::ValueView view(isolate, str);

switch (encoding) {
case ASCII:
case LATIN1:
return Just<size_t>(str->Length());
return Just<size_t>(view.length());

case BUFFER:
case UTF8:
return Just<size_t>(str->Utf8Length(isolate));
if (view.is_one_byte()) {
return Just<size_t>(simdutf::utf8_length_from_latin1(
reinterpret_cast<const char*>(view.data8()), view.length()));
}
return Just<size_t>(simdutf::utf8_length_from_utf16(
reinterpret_cast<const char16_t*>(view.data16()), view.length()));

case UCS2:
return Just(str->Length() * sizeof(uint16_t));
return Just(view.length() * sizeof(uint16_t));

case BASE64URL: {
String::Value value(isolate, str);
return Just(simdutf::base64_length_from_binary(value.length(),
return Just(simdutf::base64_length_from_binary(view.length(),
simdutf::base64_url));
}

case BASE64: {
String::Value value(isolate, str);
return Just(simdutf::base64_length_from_binary(value.length()));
return Just(simdutf::base64_length_from_binary(view.length()));
}

case HEX:
return Just<size_t>(str->Length() / 2);
return Just<size_t>(view.length() / 2);
}

UNREACHABLE();
Expand Down
28 changes: 25 additions & 3 deletions src/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
#include <sys/types.h>
#endif

#include <simdutf.h>

#include <atomic>
#include <cstdio>
#include <cstring>
Expand Down Expand Up @@ -100,11 +102,31 @@ static void MakeUtf8String(Isolate* isolate,
MaybeStackBuffer<T>* target) {
Local<String> string;
if (!value->ToString(isolate->GetCurrentContext()).ToLocal(&string)) return;
String::ValueView value_view(isolate, string);

auto value_length = value_view.length();

if (value_view.is_one_byte()) {
anonrig marked this conversation as resolved.
Show resolved Hide resolved
auto const_char = reinterpret_cast<const char*>(value_view.data8());
auto expected_length =
target->capacity() < (static_cast<size_t>(value_length) * 2 + 1)
? simdutf::utf8_length_from_latin1(const_char, value_length)
: value_length * 2;

// Add +1 for null termination.
target->AllocateSufficientStorage(expected_length + 1);
const auto actual_length = simdutf::convert_latin1_to_utf8(
const_char, value_length, target->out());
target->SetLengthAndZeroTerminate(actual_length);
return;
}

size_t storage;
if (!StringBytes::StorageSize(isolate, string, UTF8).To(&storage)) return;
storage += 1;
// Add +1 for null termination.
size_t storage = (3 * value_length) + 1;
target->AllocateSufficientStorage(storage);

// TODO(@anonrig): Use simdutf to speed up non-one-byte strings once it's
// implemented
const int flags =
String::NO_NULL_TERMINATION | String::REPLACE_INVALID_UTF8;
const int length =
Expand Down
Loading