// Reconstructed from the whitespace-collapsed patch hunk for src/util.cc
// (diff f382b3d565a8cf..e17a6f286b9e6c, "@@ -1,8 +1,187 @@").
// In the patched file this code follows the existing
//   #include "util.h"
//   #include "string_bytes.h"
// lines; they are unchanged patch context and are only noted here.

#include <stddef.h>
#include <stdint.h>

// Unicode limits: the UTF-16 surrogate range, the replacement character
// and the largest legal code point.
#define UNI_SUR_HIGH_START 0xD800UL
#define UNI_SUR_LOW_END 0xDFFFUL
#define UNI_REPLACEMENT_CHAR 0x0000FFFDUL
#define UNI_MAX_LEGAL_UTF32 0x0010FFFFUL

// Returns the index of the highest set bit of `v`, i.e. floor(log2(v)),
// for v >= 1. For v == 0 the result is 0 (the same as for v == 1).
inline uint32_t log2(uint8_t v) {
  // Binary search over the byte: first the high nibble, then the high
  // pair of what is left, then the remaining top bit.
  const uint32_t r = (v > 15) << 2;
  v >>= r;
  const uint32_t s = (v > 3) << 1;
  v >>= s;
  v >>= 1;
  return r | s | v;
}

// Returns the number of leading zero bits in the byte `v` (0..8).
inline uint32_t clz(uint8_t v) {
  // log2() cannot tell 0 from 1, so a zero byte needs its own answer:
  // all eight bits are leading zeros.
  if (v == 0) {
    return 8;
  }
  return 7 - log2(v);
}

namespace node {

// Error callback: (remaining, input, glyph_size) -> number of bytes to
// advance. Returning 0 asks the caller to stop.
typedef size_t (*ErrorStrategy)(size_t, const uint8_t*, size_t);
// Glyph callback: (glyph bytes, glyph size, decoded value, position).
typedef void (*GlyphStrategy)(const uint8_t*, size_t, uint32_t, size_t);


// Error strategy that steps over a single byte as long as more than
// `glyph_size` bytes remain; otherwise it returns 0.
inline size_t Skip(
    const size_t remaining,
    const uint8_t* input,
    const size_t glyph_size) {
  if (remaining > glyph_size) {
    return 1;
  }
  return 0;
}


// Error strategy that always returns 0.
inline size_t Halt(
    const size_t remaining,
    const uint8_t* input,
    const size_t glyph_size) {
  return 0;
}


// Glyph callback that ignores the glyph it is handed.
inline void DiscardGlyph(
    const uint8_t* glyph,
    const size_t glyph_size,
    const uint32_t glyph_value,
    const size_t pos) {
}


// Returns true when the `length` bytes starting at `input` form exactly one
// well-formed UTF-8 sequence.
//
// Checks performed:
//   * `length` is 1..4 and is the length announced by the lead byte, so a
//     lone lead byte or an over-long byte count is rejected;
//   * lead bytes 0x80..0xC1 (continuation bytes, overlong 2-byte forms) and
//     0xF5..0xFF (beyond U+10FFFF) are rejected;
//   * the second byte is range-checked per lead byte: 0xE0 and 0xF0 raise
//     the lower bound (no overlong forms), 0xED lowers the upper bound (no
//     surrogates), 0xF4 lowers the upper bound (nothing above U+10FFFF);
//     both bounds are always applied;
//   * every later byte is a continuation byte, 0x80..0xBF.
inline bool IsLegalUtf8Glyph(const uint8_t* input, const size_t length) {
  if (input == nullptr || length < 1 || length > 4) {
    return false;
  }

  const uint8_t lead = input[0];
  size_t expected = 0;
  if (lead < 0x80) {
    expected = 1;
  } else if (lead < 0xC2) {
    return false;
  } else if (lead < 0xE0) {
    expected = 2;
  } else if (lead < 0xF0) {
    expected = 3;
  } else if (lead <= 0xF4) {
    expected = 4;
  } else {
    return false;
  }
  if (length != expected) {
    return false;
  }
  if (length == 1) {
    return true;
  }

  // Allowed range of the second byte; narrowed for the special lead bytes.
  uint8_t lowest = 0x80;
  uint8_t highest = 0xBF;
  switch (lead) {
    case 0xE0: lowest = 0xA0; break;
    case 0xED: highest = 0x9F; break;
    case 0xF0: lowest = 0x90; break;
    case 0xF4: highest = 0x8F; break;
    default: break;
  }
  if (input[1] < lowest || input[1] > highest) {
    return false;
  }

  for (size_t i = 2; i < length; ++i) {
    if (input[i] < 0x80 || input[i] > 0xBF) {
      return false;
    }
  }
  return true;
}

}  // namespace node
+ +static const uint32_t offsets_from_utf8[6] = { + 0x00000000, 0x00003080, 0x000E2080, + 0x03C82080, 0xFA082080, 0x82082080 +}; + + +template +inline size_t Utf8Consume( + const uint8_t* const input, + const size_t length, + const GlyphStrategy OnGlyph) { + size_t idx = 0; + while (idx < length) { + size_t advance = 0; + uint32_t glyph = 0; + uint8_t extrabytes = input[idx] ? clz(~input[idx]) : 0; + size_t i = idx; + + if (extrabytes + idx > length) { + advance = OnError(length - idx, input, extrabytes); + } else if (!IsLegalUtf8Glyph(input + idx, extrabytes + 1)) { + advance = OnError(length - idx, input, extrabytes); + } else { + ASSERT(extrabytes < 4); + switch (extrabytes) { + case 3: + glyph += input[i++]; + glyph <<= 6; + // fall-through + case 2: + glyph += input[i++]; + glyph <<= 6; + // fall-through + case 1: + glyph += input[i++]; + glyph <<= 6; + // fall-through + case 0: + glyph += input[i]; + } + + glyph -= offsets_from_utf8[extrabytes]; + + if (glyph > UNI_MAX_LEGAL_UTF32 || + (glyph >= UNI_SUR_HIGH_START && glyph <= UNI_SUR_LOW_END)) { + advance = OnError(length - idx, input, extrabytes); + } else { + advance = extrabytes + 1; + OnGlyph(input + idx, extrabytes + 1, glyph, idx); + } + } + + if (advance == 0) { + break; + } + idx += advance; + } + return idx; +} + + +size_t Utf8Value::StripInvalidUtf8Glyphs(uint8_t* const input, const size_t size) { + size_t idx = 0; + auto on_glyph = [input, &idx]( + const uint8_t* data, size_t size, uint32_t glyph, size_t pos) { + size_t old_idx = idx; + idx += size; + if (old_idx == pos) return; + memmove(input + old_idx, data, size); + }; + + return Utf8Consume(input, size, on_glyph); +} + + +bool Utf8Value::IsValidUtf8(const uint8_t * const input, const size_t size) { + return Utf8Consume(input, size, DiscardGlyph) == size; +} + + Utf8Value::Utf8Value(v8::Isolate* isolate, v8::Handle value) : length_(0), str_(str_st_) { if (value.IsEmpty()) diff --git a/src/util.h b/src/util.h index ea17a155745993..a2881adb6b6924 
100644 --- a/src/util.h +++ b/src/util.h @@ -189,6 +189,8 @@ class Utf8Value { return length_; }; + static bool IsValidUtf8(const uint8_t* const, const size_t); + static size_t StripInvalidUtf8Glyphs(uint8_t* const, const size_t); private: size_t length_; char* str_;