diff --git a/base/utf16.jl b/base/utf16.jl index 9f91c2c5def05..86ffc3e5eeeca 100644 --- a/base/utf16.jl +++ b/base/utf16.jl @@ -139,7 +139,7 @@ function encode16(s::AbstractString) push!(buf, UInt16(0xd7c0 + (c >>> 10))) push!(buf, UInt16(0xdc00 + (c & 0x3ff))) else - throw(UnicodeError("invalid Unicode character (0x$(hex(c)) > 0x10ffff)")) + throw(UnicodeError("invalid Unicode character (0x$(hex(c)) > 0x10ffff)")) end end push!(buf, 0) # NULL termination @@ -156,16 +156,16 @@ function ascii_to_utf16(s::Array{UInt8,1}) UTF16String(buf) end -const UTF_NO_LONG_NULL = 1 # don't accept 0xc0 0x80 for '\0' -const UTF_NO_SURROGATES = 2 # don't accept surrogate pairs in UTF-8/UTF-32 -const UTF_ACCEPT_LONG = 4 # accept long encodings (other than long null in UTF-8) +const UTF_NO_LONG_NULL = 1 # don't accept 0xc0 0x80 for '\0' +const UTF_NO_SURROGATES = 2 # don't accept surrogate pairs in UTF-8/UTF-32 +const UTF_ACCEPT_LONG = 4 # accept long encodings (other than long null in UTF-8) -const UTF_LONG = 1 # Long encodings are present -const UTF_LATIN1 = 2 # characters in range 0x80-0xFF present -const UTF_UNICODE2 = 4 # characters in range 0x100-0x7ff present -const UTF_UNICODE3 = 8 # characters in range 0x800-0xd7ff, 0xe000-0xffff -const UTF_UNICODE4 = 16 # non-BMP characters present -const UTF_SURROGATE = 32 # surrogate pairs present +const UTF_LONG = 1 # Long encodings are present +const UTF_LATIN1 = 2 # characters in range 0x80-0xFF present +const UTF_UNICODE2 = 4 # characters in range 0x100-0x7ff present +const UTF_UNICODE3 = 8 # characters in range 0x800-0xd7ff, 0xe000-0xffff +const UTF_UNICODE4 = 16 # non-BMP characters present +const UTF_SURROGATE = 32 # surrogate pairs present #= @doc """ @@ -200,11 +200,11 @@ function utf8_validate(iStr::Array{UInt8,1}, iOptions::Integer=0) !is_valid_continuation(c1) && utf_errfunc(UTF_ERR_CONT, pos, ch) ch = ((ch & 0x3f) << 6) | (c1 & 0x3f) if ch > 0x7f - cnt2 += 1 + cnt2 += 1 flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1 elseif (ch != 0 ? (iOptions & UTF_ACCEPT_LONG) : !(iOptions & UTF_NO_LONG_NULL)) != 0 flags |= UTF_LONG - else + else utf_errfunc(UTF_ERR_LONG, pos, ch) end elseif ch < 0xf0 @@ -214,8 +214,8 @@ function utf8_validate(iStr::Array{UInt8,1}, iOptions::Integer=0) c2 = iStr[pos += 1] !is_valid_continuation(c2) && utf_errfunc(UTF_ERR_CONT, pos, c2) ch = ((ch & 0x0f) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f) - # check for surrogate pairs, make sure correct - if is_surrogate_char(ch) + # check for surrogate pairs, make sure correct + if is_surrogate_char(ch) !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch) # next character *must* be a trailing surrogate character (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch) @@ -227,15 +227,15 @@ function utf8_validate(iStr::Array{UInt8,1}, iOptions::Integer=0) !is_valid_continuation(c3) && utf_errfunc(UTF_ERR_CONT, pos, c3) c3 = ((c1 & 0x0f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f) !is_surrogate_trail(c3) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, c3) - (iOptions & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, c3) + (iOptions & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, c3) flags |= UTF_SURROGATE cnt4 += 1 - elseif ch > 0x07ff - cnt3 += 1 - elseif (iOptions & UTF_ACCEPT_LONG) != 0 + elseif ch > 0x07ff + cnt3 += 1 + elseif (iOptions & UTF_ACCEPT_LONG) != 0 flags |= UTF_LONG - cnt2 += 1 - else + cnt2 += 1 + else utf_errfunc(UTF_ERR_LONG, pos-2, ch) end elseif ch < 0xf6 @@ -247,20 +247,20 @@ function utf8_validate(iStr::Array{UInt8,1}, iOptions::Integer=0) c3 = iStr[pos += 1] !is_valid_continuation(c3) && utf_errfunc(UTF_ERR_CONT, pos, c3) ch = ((ch & 0x07) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f) - if ch > 0x10ffff - utf_errfunc(UTF_ERR_INVALID, pos-3, ch) - elseif ch > 0xffff - cnt4 += 1 - elseif is_surrogate_char(ch) + if ch > 0x10ffff + utf_errfunc(UTF_ERR_INVALID, pos-3, ch) + elseif ch > 0xffff + cnt4 += 1 + elseif is_surrogate_char(ch) utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch) - elseif (iOptions & UTF_ACCEPT_LONG) != 0 + elseif (iOptions & UTF_ACCEPT_LONG) != 0 flags |= UTF_LONG - if ch > 0x7ff - cnt3 += 1 - elseif ch > 0x7f - cnt2 += 1 - end - else + if ch > 0x7ff + cnt3 += 1 + elseif ch > 0x7f + cnt2 += 1 + end + else utf_errfunc(UTF_ERR_LONG, pos-2, ch) end else @@ -295,13 +295,13 @@ function utf16_validate(iStr::Array{UInt16,1}, iOptions::Integer=0) while pos < len ch = iStr[pos += 1] cntT += 1 - if ch > 0x7f + if ch > 0x7f if ch < 0x100 cnt2 += 1 - flags |= UTF_LATIN1 + flags |= UTF_LATIN1 elseif ch < 0x800 cnt2 += 1 - flags |= UTF_UNICODE2 + flags |= UTF_UNICODE2 elseif !is_surrogate_char(ch) cnt3 += 1 elseif is_surrogate_lead(ch) @@ -341,13 +341,13 @@ function utf32_validate(iStr::Array{Char,1}, iOptions::Integer=0) while pos < len ch = reinterpret(UInt32, iStr[pos += 1]) cntT += 1 - if ch > 0x7f + if ch > 0x7f if ch < 0x100 cnt2 += 1 - flags |= UTF_LATIN1 + flags |= UTF_LATIN1 elseif ch < 0x800 cnt2 += 1 - flags |= UTF_UNICODE2 + flags |= UTF_UNICODE2 elseif ch > 0xffff (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch) cnt4 += 1 @@ -359,8 +359,8 @@ function utf32_validate(iStr::Array{Char,1}, iOptions::Integer=0) ch = reinterpret(UInt32, iStr[pos += 1]) !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch) cnt4 += 1 - (iOptions & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch) - flags |= UTF_SURROGATE + (iOptions & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch) + flags |= UTF_SURROGATE else utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch) end diff --git a/base/utf32.jl b/base/utf32.jl index 636973d17cb82..deaa3ddde2260 100644 --- a/base/utf32.jl +++ b/base/utf32.jl @@ -80,15 +80,15 @@ function utf8_to_utf32(iStr::Array{UInt8,1}) while out < len ch::UInt32 = iStr[pos += 1] if ch > 0x7f - if ch < 0xe0 - ch = ((ch & 0x1f) << 6) | (iStr[pos += 1] & 0x3f) - elseif ch < 0xf0 - ch = ((ch & 0xf) << 12) | (UInt32(iStr[pos+1] & 0x3f) << 6) | (iStr[pos+2] & 0x3f) - pos += 2 - else - ch = ((ch & 0x7) << 18) | (UInt32(iStr[pos+1] & 0x3f) << 12) | (UInt32(iStr[pos+2] & 0x3f) << 6) | (iStr[pos+3] & 0x3f) - pos += 3 - end + if ch < 0xe0 + ch = ((ch & 0x1f) << 6) | (iStr[pos += 1] & 0x3f) + elseif ch < 0xf0 + ch = ((ch & 0xf) << 12) | (UInt32(iStr[pos+1] & 0x3f) << 6) | (iStr[pos+2] & 0x3f) + pos += 2 + else + ch = ((ch & 0x7) << 18) | (UInt32(iStr[pos+1] & 0x3f) << 12) | (UInt32(iStr[pos+2] & 0x3f) << 6) | (iStr[pos+3] & 0x3f) + pos += 3 + end end buf[out += 1] = Char(ch) end @@ -162,7 +162,7 @@ function utf32_to_utf16(iStr::Array{Char,1}) buf[pos] = UInt16(iStr[pos]) end else - out::UInt = 0 + out::UInt = 0 pos::UInt = 0 while pos < len ch = UInt32(iStr[pos += 1]) @@ -171,7 +171,7 @@ function utf32_to_utf16(iStr::Array{Char,1}) ch = 0xdc00 + (ch & 0x3ff) end buf[out += 1] = ch - end + end end UTF16String(buf) end @@ -201,7 +201,7 @@ function utf32_to_utf8(iStr::Array{Char,1}) buf[pos] = reinterpret(UInt32, iStr[pos]) end else - out::UInt = 0 + out::UInt = 0 pos::UInt = 0 while out < len ch::UInt32 = reinterpret(UInt32, iStr[pos += 1])