Skip to content

Commit

Permalink
buffer: use simdutf for atob implementation
Browse files Browse the repository at this point in the history
Co-authored-by: Daniel Lemire <[email protected]>
  • Loading branch information
anonrig and lemire committed Apr 5, 2024
1 parent b6619c2 commit 79973d9
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 73 deletions.
20 changes: 20 additions & 0 deletions benchmark/buffers/buffer-atob.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
'use strict';
const common = require('../common.js');
const assert = require('node:assert');

// Benchmark matrix: `size` is the length of the raw string that gets
// base64-encoded before decoding, `n` is the number of atob() calls timed
// per run.
const bench = common.createBenchmark(main, {
size: [16, 32, 64, 128],
n: [1e6],
});

// Times `n` consecutive atob() calls on a base64 payload that encodes `size`
// 'A' characters. The decoded lengths are accumulated and asserted on
// afterwards so that the result of every call is actually consumed.
function main({ n, size }) {
  const encoded = btoa('A'.repeat(size));
  let totalLength = 0;

  bench.start();
  let iteration = 0;
  while (iteration < n) {
    totalLength += atob(encoded).length;
    iteration++;
  }
  bench.end(n);

  assert(totalLength > 0);
}
86 changes: 13 additions & 73 deletions lib/buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,8 @@

const {
Array,
ArrayFrom,
ArrayIsArray,
ArrayPrototypeForEach,
ArrayPrototypeIndexOf,
MathFloor,
MathMin,
MathTrunc,
Expand Down Expand Up @@ -70,6 +68,7 @@ const {
swap64: _swap64,
kMaxLength,
kStringMaxLength,
atob: _atob,
} = internalBinding('buffer');
const {
constants: {
Expand Down Expand Up @@ -1259,85 +1258,26 @@ function btoa(input) {
return buf.toString('base64');
}

// Refs: https://infra.spec.whatwg.org/#forgiving-base64-decode
const kForgivingBase64AllowedChars = [
// ASCII whitespace
// Refs: https://infra.spec.whatwg.org/#ascii-whitespace
0x09, 0x0A, 0x0C, 0x0D, 0x20,

// Uppercase letters
...ArrayFrom({ length: 26 }, (_, i) => StringPrototypeCharCodeAt('A') + i),

// Lowercase letters
...ArrayFrom({ length: 26 }, (_, i) => StringPrototypeCharCodeAt('a') + i),

// Decimal digits
...ArrayFrom({ length: 10 }, (_, i) => StringPrototypeCharCodeAt('0') + i),

0x2B, // +
0x2F, // /
0x3D, // =
];
const kEqualSignIndex = ArrayPrototypeIndexOf(kForgivingBase64AllowedChars,
0x3D);

function atob(input) {
// The implementation here has not been performance optimized in any way and
// should not be.
// Refs: https://github.com/nodejs/node/pull/38433#issuecomment-828426932
if (arguments.length === 0) {
throw new ERR_MISSING_ARGS('input');
}

input = `${input}`;
let nonAsciiWhitespaceCharCount = 0;
let equalCharCount = 0;
const result = _atob(`${input}`);

for (let n = 0; n < input.length; n++) {
const index = ArrayPrototypeIndexOf(
kForgivingBase64AllowedChars,
StringPrototypeCharCodeAt(input, n));

if (index > 4) {
// The first 5 elements of `kForgivingBase64AllowedChars` are
// ASCII whitespace char codes.
nonAsciiWhitespaceCharCount++;

if (index === kEqualSignIndex) {
equalCharCount++;
} else if (equalCharCount) {
// The `=` char is only allowed at the end.
throw lazyDOMException('Invalid character', 'InvalidCharacterError');
}

if (equalCharCount > 2) {
// Only one more `=` is permitted after the first equal sign.
throw lazyDOMException('Invalid character', 'InvalidCharacterError');
}
} else if (index === -1) {
switch (result) {
case -2: // Invalid character
throw lazyDOMException('Invalid character', 'InvalidCharacterError');
}
}

let reminder = nonAsciiWhitespaceCharCount % 4;

// See #2, #3, #4 - https://infra.spec.whatwg.org/#forgiving-base64
if (!reminder) {
// Remove all trailing `=` characters and get the new reminder.
reminder = (nonAsciiWhitespaceCharCount - equalCharCount) % 4;
} else if (equalCharCount) {
// `=` should not in the input if there's a reminder.
throw lazyDOMException('Invalid character', 'InvalidCharacterError');
}

// See #3 - https://infra.spec.whatwg.org/#forgiving-base64
if (reminder === 1) {
throw lazyDOMException(
'The string to be decoded is not correctly encoded.',
'InvalidCharacterError');
case -1: // Single character remained
throw lazyDOMException(
'The string to be decoded is not correctly encoded.',
'InvalidCharacterError');
case -3: // Possible overflow
// TODO(@anonrig): Throw correct error in here.
throw lazyDOMException('The input causes overflow.', 'InvalidCharacterError');
default:
return result;
}

return Buffer.from(input, 'base64').toString('latin1');
}

function isUtf8(input) {
Expand Down
58 changes: 58 additions & 0 deletions src/node_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1205,6 +1205,60 @@ void DetachArrayBuffer(const FunctionCallbackInfo<Value>& args) {
}
}

// on success: returns a non-negative integer indicating the size of the
// binary produced, it most be no larger than 2147483647 bytes.
// In case of error, a negative value is returned:
// * -1 indicates a single character remained,
// * -2 indicates an invalid character,
// * -3 indicates a possible overflow (i.e., more than 2 GB output).
static void Atob(const FunctionCallbackInfo<Value>& args) {
CHECK_EQ(args.Length(), 1);
Environment* env = Environment::GetCurrent(args);
THROW_AND_RETURN_IF_NOT_STRING(env, args[0], "argument");

Local<String> input = args[0].As<String>();
MaybeStackBuffer<char> buffer;
simdutf::result result;

if (input->IsExternalOneByte()) { // 8-bit case
auto ext = input->GetExternalOneByteStringResource();
size_t expected_length =
simdutf::maximal_binary_length_from_base64(ext->data(), ext->length());
buffer.AllocateSufficientStorage(expected_length + 1);
buffer.SetLengthAndZeroTerminate(expected_length);
result = simdutf::base64_to_binary(
ext->data(), ext->length(), buffer.out(), simdutf::base64_default);
} else { // 16-bit case
String::Value value(env->isolate(), input);
auto data = reinterpret_cast<const char16_t*>(*value);
size_t expected_length =
simdutf::maximal_binary_length_from_base64(data, value.length());
buffer.AllocateSufficientStorage(expected_length + 1);
buffer.SetLengthAndZeroTerminate(expected_length);
result = simdutf::base64_to_binary(
data, value.length(), buffer.out(), simdutf::base64_default);
}

if (result.error == simdutf::error_code::SUCCESS) {
auto value =
String::NewFromOneByte(env->isolate(),
reinterpret_cast<const uint8_t*>(buffer.out()))
.ToLocalChecked();
return args.GetReturnValue().Set(value);
}

// Default value is: "possible overflow"
int32_t error_code{-3};

if (result.error == simdutf::error_code::INVALID_BASE64_CHARACTER) {
error_code = -2;
} else if (result.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
error_code = -1;
}

args.GetReturnValue().Set(error_code);
}

namespace {

std::pair<void*, size_t> DecomposeBufferToParts(Local<Value> buffer) {
Expand Down Expand Up @@ -1267,6 +1321,8 @@ void Initialize(Local<Object> target,
Environment* env = Environment::GetCurrent(context);
Isolate* isolate = env->isolate();

SetMethodNoSideEffect(context, target, "atob", Atob);

SetMethod(context, target, "setBufferPrototype", SetBufferPrototype);
SetMethodNoSideEffect(context, target, "createFromString", CreateFromString);

Expand Down Expand Up @@ -1368,6 +1424,8 @@ void RegisterExternalReferences(ExternalReferenceRegistry* registry) {

registry->Register(DetachArrayBuffer);
registry->Register(CopyArrayBuffer);

registry->Register(Atob);
}

} // namespace Buffer
Expand Down

0 comments on commit 79973d9

Please sign in to comment.