// Minor Unicode reference for readers.
//
// Unicode code points are integers in the range 0x0 - 0x10ffff (using at most
// 21 bits). These integers are what rendering engines use to decide what
// glyphs are displayed on the screen. Since most code points use fewer than
// 21 bits, there are encodings that can represent code points more
// efficiently.
//
// UTF-16 is one such encoding, and is used by JavaScript engines to store
// strings internally. UTF-16 uses 1 or 2 16-bit integers (2 or 4 bytes) to
// represent a single code point.
//
// UTF-8 is another encoding, and uses 1, 2, 3 or 4 bytes to represent a
// single code point.
//
// The goal of the function below is to transform UTF-16 into UTF-8 without
// allocating any memory (writing to the buffer passed as parameter). The
// conversion loop is roughly divided into 3 steps:
//
// - Decode UTF-16 into a Unicode code point.
// - Check if there's still enough space in the output buffer. If not, break
//   out of the loop.
// - Encode the code point as UTF-8 into the output buffer.
//
// Some references to learn more about the topic:
// - https://dmitripavlutin.com/what-every-javascript-developer-should-know-about-unicode
// - https://en.wikipedia.org/wiki/UTF-8
// - https://en.wikipedia.org/wiki/UTF-16

/**
 * Encodes `input` (a UTF-16 string) as UTF-8 into `output`, starting at the
 * offsets recorded in `state`, and stops as soon as the next code point would
 * not fit entirely in the remaining space of `output`.
 *
 * Lone surrogates are encoded as U+FFFD (the replacement character).
 *
 * @param {string} input - String to encode.
 * @param {Uint8Array} output - Destination buffer; indexed from 0 to
 *   `output.length - 1`.
 * @param {{ read: number, written: number }} state - Progress record, updated
 *   in place: `read` is the number of UTF-16 code units consumed from `input`
 *   and `written` is the number of bytes stored in `output`.
 * @returns {void}
 */
function encodeUtf8(input, output, state) {
  // Replacement character used for lone (unpaired) surrogates.
  const badCodePoint = 0xfffd;
  const inLen = input.length;
  const outLen = output.length;
  let { read, written } = state;

  while (read < inLen) {
    // Step 1: Decode the UTF-16 code unit(s) into a Unicode code point.
    //
    // There are three possibilities here:
    // - The code unit is outside the surrogate range and is treated as the
    //   code point.
    // - The code unit is in the high surrogate range and the next one is in
    //   the low surrogate range. The surrogate pair is combined into the
    //   final code point.
    // - The code unit is a lone surrogate (high or low), which is invalid in
    //   UTF-16. In this case it is replaced by 0xfffd.
    const codeUnit = input.charCodeAt(read++);
    const surrogateMask = codeUnit & 0xfc00;
    let codePoint = codeUnit;
    if (surrogateMask === 0xd800) {
      // High surrogate: it is only valid when immediately followed by a low
      // surrogate. At the end of the input there is nothing to pair it with.
      const nextCodeUnit = read < inLen ? input.charCodeAt(read) : 0;
      if ((nextCodeUnit & 0xfc00) === 0xdc00) {
        // Valid pair: consume the low surrogate and combine both halves.
        codePoint = 0x10000 +
          ((codeUnit & 0x3ff) << 10) + (nextCodeUnit & 0x3ff);
        read++;
      } else {
        // Lone high surrogate.
        codePoint = badCodePoint;
      }
    } else if (surrogateMask === 0xdc00) {
      // Lone low surrogate.
      codePoint = badCodePoint;
    }

    // Step 2: Check if there's available space to encode the code point as
    // UTF-8. It takes at most 4 bytes, so the precise check is only needed
    // when fewer than 4 bytes remain.
    const availableSpace = outLen - written;
    if (availableSpace < 4) {
      const doesNotFit = availableSpace < 1 ||
        (availableSpace < 2 && codePoint >= 0x80) ||
        (availableSpace < 3 && codePoint >= 0x800) ||
        codePoint >= 0x10000;
      if (doesNotFit) {
        // Not enough space. Un-read the code unit(s) consumed in step 1 and
        // bail out. Only a surrogate pair yields a code point >= 0x10000, and
        // only a pair consumed two code units.
        read -= codePoint >= 0x10000 ? 2 : 1;
        break;
      }
    }

    // Step 3: Encode the code point as UTF-8 into the output buffer.
    if (codePoint < 0x80) {
      // 7 bits, encoded in 1 byte directly (0xxxxxxx).
      output[written++] = codePoint;
    } else if (codePoint < 0x800) {
      // 11 bits, encoded in 2 bytes where:
      // byte 1: 110xxxxx (5 bits)
      // byte 2: 10xxxxxx (6 bits)
      output[written++] = 0xc0 | (0x1f & (codePoint >> 6));
      output[written++] = 0x80 | (0x3f & codePoint);
    } else if (codePoint < 0x10000) {
      // 16 bits, encoded in 3 bytes where:
      // byte 1: 1110xxxx (4 bits)
      // byte 2: 10xxxxxx (6 bits)
      // byte 3: 10xxxxxx (6 bits)
      output[written++] = 0xe0 | (0x0f & (codePoint >> 12));
      output[written++] = 0x80 | (0x3f & (codePoint >> 6));
      output[written++] = 0x80 | (0x3f & codePoint);
    } else {
      // 21 bits, encoded in 4 bytes where:
      // byte 1: 11110xxx (3 bits)
      // byte 2: 10xxxxxx (6 bits)
      // byte 3: 10xxxxxx (6 bits)
      // byte 4: 10xxxxxx (6 bits)
      output[written++] = 0xf0 | (0x07 & (codePoint >> 18));
      output[written++] = 0x80 | (0x3f & (codePoint >> 12));
      output[written++] = 0x80 | (0x3f & (codePoint >> 6));
      output[written++] = 0x80 | (0x3f & codePoint);
    }
  }

  // Publish progress back to the caller-owned state object.
  state.read = read;
  state.written = written;
}
diff --git a/tools/wpt/expectation.json b/tools/wpt/expectation.json index a7eca6f79b6da2..004412881671ee 100644 --- a/tools/wpt/expectation.json +++ b/tools/wpt/expectation.json @@ -61,29 +61,17 @@ "encodeInto() into SharedArrayBuffer with 𝌆A and destination length 3, offset 4, filler 128", "encodeInto() into SharedArrayBuffer with 𝌆A and destination length 3, offset 0, filler random", "encodeInto() into SharedArrayBuffer with 𝌆A and destination length 3, offset 4, filler random", - "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 0", "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 0", - "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 0", "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 0", - "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 128", "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 128", - "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 128", "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 128", - "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler random", "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler random", - "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler random", "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler random", - "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler 0", "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler 0", - "encodeInto() into 
ArrayBuffer with A\udf06 and destination length 4, offset 4, filler 0", "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler 0", - "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler 128", "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler 128", - "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler 128", "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler 128", - "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler random", "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler random", - "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler random", "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler random", "encodeInto() into SharedArrayBuffer with ¥¥ and destination length 4, offset 0, filler 0", "encodeInto() into SharedArrayBuffer with ¥¥ and destination length 4, offset 4, filler 0", @@ -838,4 +826,4 @@ } } } -} \ No newline at end of file +}