From 93376fbec82c27ce60555266694a1e1a3f062cb1 Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Wed, 1 Jul 2015 01:05:17 -0400 Subject: [PATCH] Fix #10959 problems with UTF-8 conversions --- base/utf8.jl | 108 +++++++++-- base/utfconvert.jl | 444 +++++++++++++++++++++++++++++++++++++++++++++ test/strings.jl | 68 ++++--- 3 files changed, 563 insertions(+), 57 deletions(-) create mode 100644 base/utfconvert.jl diff --git a/base/utf8.jl b/base/utf8.jl index e94a988777521d..587f8e008bfdec 100644 --- a/base/utf8.jl +++ b/base/utf8.jl @@ -3,7 +3,7 @@ ## from base/boot.jl: # # immutable UTF8String <: AbstractString -# data::Array{UInt8,1} +# data::Vector{UInt8} # end # @@ -26,27 +26,23 @@ const utf8_trailing = [ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5, ] -is_utf8_start(byte::UInt8) = ((byte&0xc0)!=0x80) - ## required core functionality ## function endof(s::UTF8String) d = s.data i = length(d) i == 0 && return i - while !is_utf8_start(d[i]) + while is_valid_continuation(d[i]) i -= 1 end i end -is_utf8_continuation(byte::UInt8) = ((byte&0xc0) == 0x80) - function length(s::UTF8String) d = s.data cnum = 0 for i = 1:length(d) - @inbounds cnum += !is_utf8_continuation(d[i]) + @inbounds cnum += !is_valid_continuation(d[i]) end cnum end @@ -65,8 +61,17 @@ function next(s::UTF8String, i::Int) d = s.data b = d[i] - if !is_utf8_start(b) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i])) + if is_valid_continuation(b) + j = i-1 + while 0 < j && is_valid_continuation(d[j]) + j -= 1 + end + if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d) + # b is a continuation byte of a valid UTF-8 character + throw(UnicodeError(UTF_ERR_CONT, i, d[j])) + end + # move past 1 byte in case the data is actually Latin-1 + return '\ufffd', i+1 end trailing = utf8_trailing[b+1] if length(d) < i + trailing @@ -93,7 +98,7 @@ end function reverseind(s::UTF8String, i::Integer) j = lastidx(s) + 1 - i d = s.data - while !is_utf8_start(d[j]) + while is_valid_continuation(d[j]) j -= 1 end return j @@ -106,7 +111,7 @@ sizeof(s::UTF8String) = sizeof(s.data) lastidx(s::UTF8String) = length(s.data) isvalid(s::UTF8String, i::Integer) = - (1 <= i <= endof(s.data)) && is_utf8_start(s.data[i]) + (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i]) const empty_utf8 = UTF8String(UInt8[]) @@ -114,11 +119,8 @@ function getindex(s::UTF8String, r::UnitRange{Int}) isempty(r) && return empty_utf8 i, j = first(r), last(r) d = s.data - if i < 1 || i > length(s.data) - throw(BoundsError(s, i)) - end - if !is_utf8_start(d[i]) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i])) + if is_valid_continuation(d[i]) + i = nextind(s,i) end if j > length(d) throw(BoundsError()) @@ -214,20 +216,86 @@ write(io::IO, s::UTF8String) = write(io, s.data) utf8(x) = convert(UTF8String, x) convert(::Type{UTF8String}, s::UTF8String) = s convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data) -convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8)) -function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString) + +" +Converts a UTF-8 encoded vector of `UInt8` to a `UTF8String` + +### Input Arguments: +* `::Type{UTF8String}` +* `dat::Vector{UInt8}` + +### Returns: +* `::UTF8String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF8String}, dat::Vector{UInt8}) + # handle zero length string quickly + isempty(dat) && return empty_utf8 + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string(dat) + if (flags & (UTF_LONG | UTF_SURROGATE)) == 0 + len = sizeof(dat) + @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + end + # Copy, but eliminate over-long encodings and surrogate pairs + len += num2byte + num3byte*2 + num4byte*3 + buf = Vector{UInt8}(len) + out = 0 + pos = 0 + @inbounds while out < len + ch::UInt32 = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle overlong < 0x100 + elseif ch < 0xc2 + buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f) + # Handle 0x100-0x7ff + elseif ch < 0xe0 + buf[out += 1] = ch + buf[out += 1] = dat[pos += 1] + elseif ch != 0xed + buf[out += 1] = ch + buf[out += 1] = dat[pos += 1] + buf[out += 1] = dat[pos += 1] + # Copy 4-byte encoded value + ch >= 0xf0 && (buf[out += 1] = dat[pos += 1]) + # Handle surrogate pairs + else + ch = dat[pos += 1] + if ch < 0xa0 # not surrogate pairs + buf[out += 1] = 0xed + buf[out += 1] = ch + buf[out += 1] = dat[pos += 1] + else + # Pick up surrogate pairs (CESU-8 format) + ch = (((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10) + + (((dat[pos + 3] & 0x3f) << 6) | (dat[pos + 4] & 0x3f)) + - 0xc00) + pos += 4 + output_utf8_4byte!(buf, out, ch) + out += 4 + end + end + end + UTF8String(buf) +end + +function convert(::Type{UTF8String}, a::Vector{UInt8}, invalids_as::AbstractString) l = length(a) idx = 1 iscopy = false while idx <= l - if is_utf8_start(a[idx]) + if !is_valid_continuation(a[idx]) nextidx = idx+1+utf8_trailing[a[idx]+1] (nextidx <= (l+1)) && (idx = nextidx; continue) end !iscopy && (a = copy(a); iscopy = true) endn = idx while endn <= l - is_utf8_start(a[endn]) && break + !is_valid_continuation(a[endn]) && break endn += 1 end (endn > idx) && (endn -= 1) diff --git a/base/utfconvert.jl b/base/utfconvert.jl new file mode 100644 index 00000000000000..cd5b12cb8b068b --- /dev/null +++ b/base/utfconvert.jl @@ -0,0 +1,444 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +# Functions to convert to different UTF encodings + +# Quickly copy and set trailing \0 +@inline function fast_utf_copy(T::Type{UInt16}, len, dat) + @inbounds return UTF16String(setindex!(copy!(Vector{T}(len), dat), 0, len)) +end +@inline function fast_utf_copy(T::Type{Char}, len, dat) + @inbounds return UTF32String(setindex!(copy!(Vector{T}(len), dat), 0, len)) +end + +# Get rest of character ch from 3-byte UTF-8 sequence in dat +@inline function get_utf8_3(dat, pos, ch) + @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f) +end + +# Get rest of character ch from 4-byte UTF-8 sequence in dat +@inline function get_utf8_4(dat, pos, ch) + @inbounds return (((ch & 0x7) << 18) + | (UInt32(dat[pos-2] & 0x3f) << 12) + | (UInt32(dat[pos-1] & 0x3f) << 6) + | (dat[pos] & 0x3f)) +end + +# Output a character as a 4-byte UTF-8 sequence +@inline function output_utf8_4(buf, out, ch) + @inbounds begin + buf[out + 1] = 0xf0 | (ch >>> 18) + buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f) + buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f) + buf[out + 4] = 0x80 | (ch & 0x3f) + end +end + +#= +""" +@brief Converts an AbstractString to a UTF16String + +@param[in] ::Type{UTF16String} +@param[in] str::AbstractString + +@return ::UTF16String +@throws ArgumentError +""" +=# +function convert(::Type{UTF16String}, str::AbstractString) + len, flags, num4byte = check_string_abs(str) + buf = Vector{UInt16}(len+num4byte+1) + out = 0 + @inbounds for ch in str + c = UInt32(ch) + if c < 0x10000 + buf[out += 1] = UInt16(c) + else + # output surrogate pair + buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) + buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) + end + end + @inbounds buf[out + 1] = 0 # NULL termination + UTF16String(buf) +end + +#= +""" +@brief Converts an AbstractString to a UTF32String + +@param[in] ::Type{UTF32String} +@param[in] str::AbstractString + +@return ::UTF32String +@throws ArgumentError +""" +=# +function convert(::Type{UTF32String}, str::AbstractString) + len, flags = check_string_abs(str) + buf = Vector{Char}(len+1) + out = 0 + @inbounds for ch in str ; buf[out += 1] = ch ; end + @inbounds buf[out + 1] = 0 # NULL termination + UTF32String(buf) +end + +#= +@doc """ +@brief Converts a UTF8String to a UTF16String + +@param[in] ::Type{UTF16String} +@param[in] str::UTF8String + +@return ::UTF16String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF16String}, str::UTF8String) + dat = str.data + # handle zero length string quickly + sizeof(dat) == 0 && return empty_utf16 + # Check that is correct UTF-8 encoding and get number of words needed + len, flags, num4byte = check_string_utf8(dat) + len += num4byte + buf = Vector{UInt16}(len+1) + @inbounds buf[len+1] = 0 + # Optimize case where no characters > 0x7f + flags == 0 && @inbounds return UTF16String(copy!(buf, dat)) + out = 0 + pos = 0 + @inbounds while out < len + ch::UInt32 = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle range 0x80-0x7ff + elseif ch < 0xe0 + buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) + # Handle range 0x800-0xffff + elseif ch < 0xf0 + pos += 2 + buf[out += 1] = get_utf8_3(dat, pos, ch) + # Handle range 0x10000-0x10ffff + else + pos += 3 + ch = get_utf8_4(dat, pos, ch) + # output surrogate pair + buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) + buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) + end + end + UTF16String(buf) +end + +#= +@doc """ +@brief Converts a UTF-16 encoded vector of UInt16 to a UTF8String + +@param[in] ::Type{UTF8String} +@param[in] dat::Vector{UInt16} + +@return ::UTF8String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF8String}, dat::Vector{UInt16}) + len = sizeof(dat) + # handle zero length string quickly + len == 0 && return UTF8String("") + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len>>>1) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat)) + return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +#= +@doc """ +@brief Converts a UTF16String to a UTF8String + +@param[in] ::Type{UTF8String} +@param[in] str::UTF16String + +@return ::UTF8String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF8String}, str::UTF16String) + dat = str.data + len = sizeof(dat) >>> 1 + # handle zero length string quickly + len <= 1 && return UTF8String("") + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len-1) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +#= +@doc """ +@brief Encodes a UTF-32 encoded vector of UInt32 to a UTF8String + +@param[in] ::Type{UTF8String} +@param[in] dat::Vector{UInt32} + +@return ::UTF8String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF8String}, dat::Vector{UInt32}) + len = sizeof(dat) + # handle zero length string quickly + len == 0 && return UTF8String("") + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len>>>2) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +#= +@doc """ +@brief Converts a UTF32String to a UTF8String + +@param[in] ::Type{UTF8String} +@param[in] str::UTF32String + +@return ::UTF8String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF8String}, str::UTF32String) + dat = reinterpret(UInt32, str.data) + len = sizeof(dat) >>> 2 + # handle zero length string quickly + len <= 1 && return UTF8String("") + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len-1) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +#= +@doc """ +@brief Converts an already validated vector of UInt16 or UInt32 to a UTF8String + +@param[in] T type (UInt16 or UInt32) +@param[in] dat Vector{T} +@param[in] len length of output in bytes + +@return ::UTF8String +""" -> +=# +function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len) + buf = Vector{UInt8}(len) + out = 0 + pos = 0 + @inbounds while out < len + ch::UInt32 = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle 0x80-0x7ff + elseif ch < 0x800 + buf[out += 1] = 0xc0 | (ch >>> 6) + buf[out += 1] = 0x80 | (ch & 0x3f) + # Handle 0x10000-0x10ffff (if input is UInt32) + elseif T == UInt32 && ch > 0xffff + output_utf8_4(buf, out, ch) + out += 4 + # Handle surrogate pairs + elseif is_surrogate_codeunit(ch) + output_utf8_4(buf, out, get_supplementary(ch, dat[pos += 1])) + out += 4 + # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters + else + buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f) + buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f) + buf[out += 1] = 0x80 | (ch & 0x3f) + end + end + UTF8String(buf) +end + +#= +""" +@brief Converts a UTF8String to a UTF32String + +@param[in] ::Type{UTF32String} +@param[in] str::UTF8String + +@return ::UTF32String +@throws ArgumentError +""" +=# +function convert(::Type{UTF32String}, str::UTF8String) + dat = str.data + # handle zero length string quickly + sizeof(dat) == 0 && return empty_utf32 + # Validate UTF-8 encoding, and get number of words to create + len, flags = check_string_utf8(dat) + # Optimize case where no characters > 0x7f + totlen = len+1 + flags == 0 && return fast_utf_copy(Char, totlen, dat) + # has multi-byte UTF-8 sequences + buf = Vector{Char}(totlen) + @inbounds buf[totlen] = 0 # NULL termination + local ch::UInt32, surr::UInt32 + out = 0 + pos = 0 + @inbounds while out < len + ch = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle range 0x80-0x7ff + elseif ch < 0xe0 + buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) + # Handle range 0x800-0xffff + elseif ch < 0xf0 + pos += 2 + ch = get_utf8_3(dat, pos, ch) + # Handle surrogate pairs (should have been encoded in 4 bytes) + if is_surrogate_lead(ch) + # Build up 32-bit character from ch and trailing surrogate in next 3 bytes + pos += 3 + surr = ((UInt32(dat[pos-2] & 0xf) << 12) + | (UInt32(dat[pos-1] & 0x3f) << 6) + | (dat[pos] & 0x3f)) + ch = get_supplementary(ch, surr) + end + buf[out += 1] = ch + # Handle range 0x10000-0x10ffff + else + pos += 3 + buf[out += 1] = get_utf8_4(dat, pos, ch) + end + end + UTF32String(buf) +end + +#= +""" +@brief Converts a UTF16String to UTF32String + +@param[in] ::Type{UTF32String} +@param[in] str::UTF16String + +@return ::UTF32String +@throws ArgumentError +""" +=# +function convert(::Type{UTF32String}, str::UTF16String) + dat = str.data + len = sizeof(dat) + # handle zero length string quickly (account for trailing \0) + len <= 2 && return empty_utf32 + # get number of words to create + len, flags, num4byte = check_string_utf16(dat, len>>>1) + # No surrogate pairs, do optimized copy + (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat)) + local ch::UInt32 + buf = Vector{Char}(len) + out = 0 + pos = 0 + @inbounds while out < len + ch = dat[pos += 1] + # check for surrogate pair + if is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end + buf[out += 1] = ch + end + UTF32String(buf) +end + +#= +""" +@brief Converts a UTF-32 encoded vector of UInt32 to a UTF16String + +@param[in] ::Type{UTF16String} +@param[in] dat::Vector{UInt32} + +@return ::UTF16String +@throws ArgumentError +""" +=# +function convert(::Type{UTF16String}, dat::Vector{UInt32}) + len = sizeof(dat) + # handle zero length string quickly + len <= 4 && return empty_utf16 + # get number of words to allocate + len, flags, num4byte = check_string_utf32(dat, len>>>2) + len += num4byte + 1 + # optimized path, no surrogates + num4byte == 0 && return fast_utf_copy(UInt16, len, dat) + return encode_to_utf16(dat, len) +end + +#= +""" +@brief Converts a UTF32String to UTF16String + +@param[in] ::Type{UTF16String} +@param[in] str::UTF32String + +@return ::UTF16String +@throws ArgumentError +""" +=# +function convert(::Type{UTF16String}, str::UTF32String) + dat = reinterpret(UInt32, str.data) + len = sizeof(dat) + # handle zero length string quickly + len <= 4 && return empty_utf16 + # get number of words to allocate + len, flags, num4byte = check_string_utf32(dat, len>>>2) + # optimized path, no surrogates + num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat)) + return encode_to_utf16(dat, len + num4byte) +end + +#= +@doc """ +@brief Converts an already validated UTF-32 encoded vector of UInt32 to a UTF16String + +@param[in] dat::Vector{UInt32} UTF-32 encoded data +@param[in] len length of output in 16-bit words + +@return ::UTF16String +""" -> +=# +function encode_to_utf16(dat, len) + buf = Vector{UInt16}(len) + @inbounds buf[len] = 0 # NULL termination + out = 0 + pos = 0 + @inbounds while out < len + ch = UInt32(dat[pos += 1]) + if ch > 0xffff + # Output surrogate pair for 0x10000-0x10ffff + buf[out += 1] = 0xd7c0 + (ch >>> 10) + ch = 0xdc00 + (ch & 0x3ff) + end + buf[out += 1] = ch + end + UTF16String(buf) +end + +convert(::Type{UTF8String}, dat::Vector{Char}) = convert(UTF8String, reinterpret(UInt32, dat)) + +function convert(::Type{UTF16String}, str::ASCIIString) + dat = str.data + fast_utf_copy(UInt16, length(dat)+1, dat) +end + +function convert(::Type{UTF32String}, str::ASCIIString) + dat = str.data + fast_utf_copy(Char, length(dat)+1, dat) +end + +convert(::Type{UTF16String}, str::UTF16String) = str +convert(::Type{UTF16String}, dat::Vector{Char}) = convert(UTF16String, reinterpret(UInt32, dat)) + +convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data +convert(::Type{Array{UInt16}}, str::UTF16String) = str.data + +convert(::Type{UTF32String}, str::UTF32String) = str + +convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)]) diff --git a/test/strings.jl b/test/strings.jl index 200c5f780c6401..f7424c8a5b0607 100644 --- a/test/strings.jl +++ b/test/strings.jl @@ -1948,14 +1948,6 @@ foobaz(ch) = Char(0x200000) @test_throws UnicodeError map(foobar, utf16(str)) @test_throws UnicodeError map(foobaz, utf16(str)) -# issue #11551 (#11004,#10959) -function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String) - @test utf16(strUTF8) == strUTF16 - @test utf8(strUTF16) == strUTF8 -end - -# Create some ASCII, UTF8 and UTF16 - # issue #11551 (#11004,#10959) function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String) @test utf16(strUTF8) == strUTF16 @@ -1967,7 +1959,6 @@ function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32Strin end # Create some ASCII, UTF8, UTF16, and UTF32 strings - strAscii = "abcdefgh" strA_UTF8 = ("abcdefgh\uff")[1:8] strL_UTF8 = "abcdef\uff\uff" @@ -1976,8 +1967,8 @@ str3_UTF8 = "abcd\uff\uff\u7fff\u7fff" str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff" strS_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80") strC_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000") -strZ_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80") strz_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0") +strZ = b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80" strA_UTF16 = utf16(strA_UTF8) strL_UTF16 = utf16(strL_UTF8) @@ -2012,97 +2003,100 @@ tstcvt(str4_UTF8,str4_UTF16,str4_UTF32) @test utf16(strS_UTF32) == strC_UTF8 # Test converting overlong \0 -# @test utf8(strZ_UTF8) == strz_UTF8 # currently broken! (in utf8.jl) -@test utf16(strZ_UTF8) == strz_UTF8 -@test utf32(strZ_UTF8) == strz_UTF8 +@test utf8(strZ) == strz_UTF8 +@test utf16(UTF8String(strZ)) == strz_UTF8 +@test utf32(UTF8String(strZ)) == strz_UTF8 # Test invalid sequences +@inline strval(::Type{UTF8String}, dat) = dat +@inline strval(::Union(Type{UTF16String},Type{UTF32String}), dat) = UTF8String(dat) + byt = 0x0 -for T in (UTF16String, UTF32String) +for T in (UTF8String, UTF16String, UTF32String) try # Continuation byte not after lead for byt in 0x80:0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt])) + @test_throws UnicodeError convert(T, strval(T,UInt8[byt])) end # Test lead bytes for byt in 0xc0:0xff # Single lead byte at end of string - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt])) + @test_throws UnicodeError convert(T, strval(T,UInt8[byt])) # Lead followed by non-continuation character < 0x80 - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0])) + @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0])) # Lead followed by non-continuation character > 0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0xc0])) + @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0xc0])) end # Test overlong 2-byte for byt in 0x81:0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc0,byt])) + @test_throws UnicodeError convert(T, strval(T,UInt8[0xc0,byt])) end for byt in 0x80:0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc1,byt])) + @test_throws UnicodeError convert(T, strval(T,UInt8[0xc1,byt])) end # Test overlong 3-byte for byt in 0x80:0x9f - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xe0,byt,0x80])) + @test_throws UnicodeError convert(T, strval(T,UInt8[0xe0,byt,0x80])) end # Test overlong 4-byte for byt in 0x80:0x8f - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xef,byt,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T,UInt8[0xef,byt,0x80,0x80])) end # Test 4-byte > 0x10ffff for byt in 0x90:0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xf4,byt,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T,UInt8[0xf4,byt,0x80,0x80])) end for byt in 0xf5:0xf7 - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0x80])) end # Test 5-byte for byt in 0xf8:0xfb - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0x80,0x80])) end # Test 6-byte for byt in 0xfc:0xfd - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0x80,0x80,0x80])) end # Test 7-byte - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T,UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])) # Three and above byte sequences for byt in 0xe0:0xef # Lead followed by only 1 continuation byte - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80])) + @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80])) # Lead ended by non-continuation character < 0x80 - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0])) + @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0])) # Lead ended by non-continuation character > 0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0xc0])) + @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0xc0])) end # 3-byte encoded surrogate character(s) # Single surrogate - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80])) + @test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xa0,0x80])) # Not followed by surrogate - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])) # Trailing surrogate first - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])) + @test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])) # Followed by lead surrogate - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])) + @test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])) # Four byte sequences for byt in 0xf0:0xf4 # Lead followed by only 2 continuation bytes - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80])) # Lead followed by non-continuation character < 0x80 - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0])) + @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0])) # Lead followed by non-continuation character > 0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0xc0])) + @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0xc0])) end catch exp ; println("Error checking $T: $byt")