-
Notifications
You must be signed in to change notification settings - Fork 29.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
wip,src: add utf8 consumer/validator #1319
Changes from 1 commit
58c62a2
e7918fe
09cd52f
5cb6d3d
7cd5465
e4bc82f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,196 @@ | ||
#include "util.h" | ||
#include "string_bytes.h" | ||
|
||
#define UNI_SUR_HIGH_START (uint32_t) 0xD800 | ||
#define UNI_SUR_LOW_END (uint32_t) 0xDFFF | ||
#define UNI_REPLACEMENT_CHAR (uint32_t) 0x0000FFFD | ||
#define UNI_MAX_LEGAL_UTF32 (uint32_t) 0x0010FFFF | ||
|
||
#if __GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) | ||
#define HAS_GCC_BUILTIN_CLZ | ||
#endif | ||
|
||
#ifdef WIN32 | ||
#include <intrin.h> | ||
static uint32_t __inline clz(uint32_t xs) { | ||
unsigned long result = 0; | ||
_BitScanReverse(&result, xs); | ||
return (31 - result); | ||
} | ||
#elif defined(HAS_GCC_BUILTIN_CLZ) | ||
#define clz(xs) __builtin_clz(xs) | ||
#else | ||
static inline uint32_t clz(uint32_t xs) { | ||
uint32_t out = 0; | ||
while (xs >> (31 - out)) { | ||
++out; | ||
} | ||
return out; | ||
} | ||
#endif | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Compilers are pretty good in replacing idiomatic bit hacks with machine-specific instructions. Have you checked whether it's really necessary to have these #ifdefs? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. with movl %ebx, %eax
shll $24, %eax
notl %eax
bsrl %eax, %ecx # <-- __builtin_clz (bitscanreverse)
xorl $31, %ecx With ##DEBUG_VALUE: glyph <- R10D
movl %r10d, %ebx
shll $24, %ebx
notl %ebx
Ltmp17:
##DEBUG_VALUE: clz:out <- 0
##DEBUG_VALUE: clz:xs <- EBX
xorl %eax, %eax
Ltmp18:
LBB0_4: ## Parent Loop BB0_3 Depth=1
## => This Inner Loop Header: Depth=2
##DEBUG_VALUE: StripInvalidUtf8Glyphs:input <- R14
##DEBUG_VALUE: StripInvalidUtf8Glyphs:size <- R15
##DEBUG_VALUE: StripInvalidUtf8Glyphs:idx <- 0
##DEBUG_VALUE: Utf8Consume<&node::Skip, <lambda at ../src/util.cc:184:19> >:length <- R15
##DEBUG_VALUE: Utf8Consume<&node::Skip, <lambda at ../src/util.cc:184:19> >:idx <- 0
##DEBUG_VALUE: advance <- 0
##DEBUG_VALUE: glyph <- R10D
##DEBUG_VALUE: clz:xs <- EBX
##DEBUG_VALUE: clz:out <- 0
.loc 1 24 0 ## ../src/util.cc:24:0
leal 31(%rax), %ecx
movl %ebx, %edx
## kill: CL<def> CL<kill> ECX<kill>
shrl %cl, %edx
decl %eax
testl %edx, %edx
jne LBB0_4
Ltmp19:
## BB#5: ## %_ZL3clzj.exit.i
## in Loop: Header=BB0_3 Depth=1
##DEBUG_VALUE: StripInvalidUtf8Glyphs:input <- R14
##DEBUG_VALUE: StripInvalidUtf8Glyphs:size <- R15
##DEBUG_VALUE: StripInvalidUtf8Glyphs:idx <- 0
##DEBUG_VALUE: Utf8Consume<&node::Skip, <lambda at ../src/util.cc:184:19> >:length <- R15
##DEBUG_VALUE: Utf8Consume<&node::Skip, <lambda at ../src/util.cc:184:19> >:idx <- 0
##DEBUG_VALUE: advance <- 0
##DEBUG_VALUE: glyph <- R10D
.loc 1 131 0 ## ../src/util.cc:131:0
notl %eax
movzbl %al, %eax This is on OSX 10.10, clang++ Apple LLVM version 6.0 (clang-600.0.57) (based on LLVM 3.5svn), built with: $ c++ '-D_DARWIN_USE_64_BIT_INODE=1' '-DNODE_ARCH="x64"' '-DNODE_TAG=""' '-DNODE_V8_OPTIONS=""' '-DNODE_WANT_INTERNALS=1' '-DHAVE_OPENSSL=1' '-DHAVE_DTRACE=1' '-D__POSIX__' '-DNODE_PLATFORM="darwin"' '-DHTTP_PARSER_STRICT=0' '-D_LARGEFILE_SOURCE' '-D_FILE_OFFSET_BITS=64' -I../src -I../tools/msvs/genfiles -I../deps/uv/src/ares -I/Users/chris/projects/iojs/out/Release/obj/gen -I../deps/cares/include -I../deps/openssl/openssl/include -I../deps/v8/include -I../deps/zlib -I../deps/http_parser -I../deps/uv/include -Os -gdwarf-2 -mmacosx-version-min=10.5 -arch x86_64 -Wall -Wendif-labels -W -Wno-unused-parameter -std=gnu++0x -fno-rtti -fno-exceptions -fno-threadsafe-statics -fno-strict-aliasing -MMD -MF /Users/chris/projects/iojs/out/Release/.deps//Users/chris/projects/iojs/out/Release/obj.target/iojs/src/util.o.d.raw -S -c -o /Users/chris/projects/iojs/util.s ../src/util.cc |
||
|
||
namespace node { | ||
|
||
typedef size_t (*ErrorStrategy)( | ||
const size_t, const char*, const size_t); | ||
typedef void (*GlyphStrategy)( | ||
const char*, const size_t, const uint32_t, const size_t); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You don't have to const-qualify parameters in function pointer typedefs. It's not enforced by the compiler. (To be clear, that doesn't mean you should replace |
||
|
||
|
||
static size_t Skip( | ||
const size_t remaining, const char* input, const size_t glyph_size) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Style: parameters should line up. Also, prefer There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added. Switched |
||
if (remaining > glyph_size) { | ||
return 1; | ||
} | ||
return 0; | ||
} | ||
|
||
|
||
static size_t Halt( | ||
const size_t remaining, const char* input, const size_t glyph_size) { | ||
return 0; | ||
} | ||
|
||
|
||
static void DiscardGlyph( | ||
const char* glyph, | ||
size_t glyph_size, | ||
uint32_t glyph_value, | ||
const size_t pos) { | ||
} | ||
|
||
|
||
static bool IsLegalUTF8Glyph(const uint8_t* input, const size_t length) { | ||
uint8_t acc; | ||
const uint8_t* srcptr = input + length; | ||
switch (length) { | ||
default: return false; | ||
case 4: | ||
acc = (*--srcptr); | ||
if (acc < 0x80 || acc > 0xBF) | ||
return false; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added. |
||
case 3: | ||
acc = (*--srcptr); | ||
if (acc < 0x80 || acc > 0xBF) | ||
return false; | ||
case 2: | ||
acc = (*--srcptr); | ||
if (acc > 0xBF) | ||
return false; | ||
switch (*input) { | ||
case 0xE0: | ||
if (acc < 0xA0) | ||
return false; | ||
break; | ||
case 0xED: | ||
if (acc > 0x9F) | ||
return false; | ||
break; | ||
case 0xF0: | ||
if (acc < 0x90) | ||
return false; | ||
break; | ||
case 0xF4: | ||
if (acc > 0x8F) | ||
return false; | ||
break; | ||
default: | ||
if (acc < 0x80) | ||
return false; | ||
} | ||
case 1: | ||
if (*input >= 0x80 && *input < 0xC2) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 0xC2? Shouldn't that be 0xC0? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm. I'll look into that – the original version has it as 0xC2. |
||
return false; | ||
} | ||
} | ||
return *input <= 0xF4; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Apart from possible bugs in the implementation, I suspect that this function pessimizes the common case of input < 0x80. |
||
|
||
|
||
static const uint32_t offsets_from_utf8[6] = { | ||
0x00000000, 0x00003080, 0x000E2080, | ||
0x03C82080, 0xFA082080, 0x82082080 | ||
}; | ||
|
||
|
||
template <ErrorStrategy OnError, typename GlyphStrategy> | ||
static size_t Utf8Consume( | ||
char* const input, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But perhaps it's better to mandate |
||
const size_t length, | ||
const GlyphStrategy OnGlyph) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Out of curiosity, why don't you encode the callback in the template definition? Right now, it's always passed as the third argument. The type variable GlyphStrategy just shadows the typedef of the same name here. |
||
size_t idx = 0; | ||
while (idx < length) { | ||
size_t advance = 0; | ||
uint32_t glyph = 0; | ||
uint8_t extrabytes = (uint8_t)clz(~(static_cast<int>(input[idx])<<24)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think what you want here can be implemented portable and reasonably efficient using the following: inline uint32_t log2(uint8_t v) {
const uint32_t r = (v > 15) << 2;
v >>= r;
const uint32_t s = (v > 3) << 1;
v >>= s;
v >>= 1;
return r | s | v;
}
inline uint32_t clz(uint8_t v) {
// clz(0) == 7. Add a zero check if that's an issue.
return 7 - log2(v);
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Forgot to mention, the behavior of |
||
size_t i = idx; | ||
|
||
if (extrabytes + idx > length) { | ||
advance = OnError(length - idx, input, extrabytes); | ||
} else if (!IsLegalUTF8Glyph( | ||
reinterpret_cast<uint8_t*>(input + idx), extrabytes + 1)) { | ||
advance = OnError(length - idx, input, extrabytes); | ||
} else { | ||
switch (extrabytes) { | ||
case 5: | ||
glyph += (uint8_t) input[i++]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this mask off the high bits? Also, when is |
||
glyph <<= 6; | ||
case 4: | ||
glyph += (uint8_t) input[i++]; | ||
glyph <<= 6; | ||
case 3: | ||
glyph += (uint8_t) input[i++]; | ||
glyph <<= 6; | ||
case 2: | ||
glyph += (uint8_t) input[i++]; | ||
glyph <<= 6; | ||
case 1: | ||
glyph += (uint8_t) input[i++]; | ||
glyph <<= 6; | ||
case 0: | ||
glyph += (uint8_t) input[i]; | ||
} | ||
|
||
glyph -= offsets_from_utf8[extrabytes]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Out of bounds read when There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That should be caught by There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, right. That's not obvious from looking at the code so yes, a comment would help. Perhaps even better is a CHECK or an ASSERT. |
||
|
||
if (glyph > UNI_MAX_LEGAL_UTF32 || | ||
(glyph >= UNI_SUR_HIGH_START && glyph <= UNI_SUR_LOW_END)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Real Programmers(TM) write this as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Compilers are pretty good in replacing idiomatic range comparisons with bit hacks. Have you checked whether it's really necessary to have these ... ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Har, har. (But it's true that both clang and gcc manage to pull it off.) |
||
advance = OnError(length - idx, input, extrabytes); | ||
} else { | ||
advance = extrabytes + 1; | ||
OnGlyph(input + idx, extrabytes + 1, glyph, idx); | ||
} | ||
} | ||
|
||
if (advance == 0) { | ||
break; | ||
} | ||
idx += advance; | ||
} | ||
return idx; | ||
} | ||
|
||
|
||
size_t StripInvalidUtf8Glyphs(char * input, const size_t size) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
size_t idx = 0; | ||
auto on_glyph = [&input, &idx]( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would just use pass-by-value here. |
||
const char* data, size_t size, uint32_t glyph, size_t pos) { | ||
size_t old_idx = idx; | ||
idx += size; | ||
if (pos == old_idx) | ||
return; | ||
memcpy(input + old_idx, data, size); | ||
}; | ||
|
||
return Utf8Consume<Skip>(input, size, on_glyph); | ||
} | ||
|
||
|
||
bool Utf8Value::IsValidUTF8(char * const input, const size_t size) { | ||
return Utf8Consume<Halt>(input, size, DiscardGlyph) == size; | ||
} | ||
|
||
|
||
Utf8Value::Utf8Value(v8::Isolate* isolate, v8::Handle<v8::Value> value) | ||
: length_(0), str_(str_st_) { | ||
if (value.IsEmpty()) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Style: no C-style casts (here and elsewhere.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed. Sorry about that, missed while porting.