From 91305f70255449b217ae2e9e49e5971fa5be94ff Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Thu, 23 Jul 2015 19:02:14 -0400 Subject: [PATCH] Fix #10959, fix #11463 bugs with UTF-8 conversions Use generic is_valid_continuation from unicode/checkstring instead of is_utf8_continuation/is_utf8_start --- base/unicode/utf8.jl | 92 ++++++++++++++++++++++++++++++++++++------- test/unicode/utf32.jl | 59 ++++++++++++++------------- 2 files changed, 108 insertions(+), 43 deletions(-) diff --git a/base/unicode/utf8.jl b/base/unicode/utf8.jl index 50423094139d8..ec59ec5083ac0 100644 --- a/base/unicode/utf8.jl +++ b/base/unicode/utf8.jl @@ -3,7 +3,7 @@ ## from base/boot.jl: # # immutable UTF8String <: AbstractString -# data::Array{UInt8,1} +# data::Vector{UInt8} # end # @@ -26,6 +26,8 @@ const utf8_trailing = [ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5, ] +# Retained because although undocumented and unexported, used in a package (MutableStrings) +# should be deprecated is_utf8_start(byte::UInt8) = ((byte&0xc0)!=0x80) ## required core functionality ## @@ -34,19 +36,17 @@ function endof(s::UTF8String) d = s.data i = length(d) i == 0 && return i - while !is_utf8_start(d[i]) + while is_valid_continuation(d[i]) i -= 1 end i end -is_utf8_continuation(byte::UInt8) = ((byte&0xc0) == 0x80) - function length(s::UTF8String) d = s.data cnum = 0 for i = 1:length(d) - @inbounds cnum += !is_utf8_continuation(d[i]) + @inbounds cnum += !is_valid_continuation(d[i]) end cnum end @@ -65,7 +65,7 @@ function next(s::UTF8String, i::Int) d = s.data b = d[i] - if !is_utf8_start(b) + if is_valid_continuation(b) throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i])) end trailing = utf8_trailing[b+1] @@ -93,7 +93,7 @@ end function reverseind(s::UTF8String, i::Integer) j = lastidx(s) + 1 - i d = s.data - while !is_utf8_start(d[j]) + while is_valid_continuation(d[j]) j -= 1 end return j @@ -106,7 +106,7 @@ sizeof(s::UTF8String) = sizeof(s.data) lastidx(s::UTF8String) = length(s.data) 
isvalid(s::UTF8String, i::Integer) =
-    (1 <= i <= endof(s.data)) && is_utf8_start(s.data[i])
+    (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])
 
 const empty_utf8 = UTF8String(UInt8[])
 
@@ -133,7 +133,7 @@ function search(s::UTF8String, c::Char, i::Integer)
         throw(BoundsError(s, i))
     end
     d = s.data
-    if !is_utf8_start(d[i])
+    if is_valid_continuation(d[i])
         throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
     end
     c < Char(0x80) && return search(d, c%UInt8, i)
@@ -216,20 +216,82 @@ convert(::Type{UTF8String}, s::UTF8String) = s
 convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
 convert(::Type{SubString{UTF8String}}, s::SubString{ASCIIString}) =
     SubString(utf8(s.string), s.offset+1, s.endof+s.offset)
-convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8))
-function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)
+
+"""
+Converts a UTF-8 encoded vector of `UInt8` to a `UTF8String`
+
+### Returns:
+* `UTF8String`
+
+### Throws:
+* `UnicodeError`
+"""
+function convert(::Type{UTF8String}, dat::Vector{UInt8})
+    # handle zero length string quickly
+    isempty(dat) && return empty_utf8
+    # get number of bytes to allocate
+    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat)
+    if (flags & (UTF_LONG | UTF_SURROGATE)) == 0
+        len = sizeof(dat)
+        @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
+    end
+    # Copy, but eliminate over-long encodings and surrogate pairs
+    len += num2byte + num3byte*2 + num4byte*3
+    buf = Vector{UInt8}(len)
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle overlong 2-byte encodings of values < 0x80 (leads 0xc0, 0xc1)
+        elseif ch < 0xc2
+            buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f)
+        # Handle 0x80-0x7ff (2-byte sequences, copied unchanged)
+        elseif ch < 0xe0
+            buf[out += 1] = ch
+            buf[out += 1] = dat[pos += 1]
+        elseif ch != 0xed # 3/4-byte leads other than 0xed are copied unchanged
+            buf[out += 1] = ch
+            buf[out += 1] = dat[pos += 1]
+            buf[out += 1] = dat[pos += 1]
+            # Copy 4-byte encoded value
+            ch >= 0xf0 && (buf[out += 1] = dat[pos += 1])
+        # Handle surrogate pairs
+        else
+            ch = dat[pos += 1]
+            if ch < 0xa0 # not surrogate pairs
+                buf[out += 1] = 0xed
+                buf[out += 1] = ch
+                buf[out += 1] = dat[pos += 1]
+            else
+                # Pick up surrogate pairs (CESU-8 format); widen before shifting, rebias to 0x10000
+                ch = (((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10)
+                      + ((UInt32(dat[pos + 3] & 0x3f) << 6) | (dat[pos + 4] & 0x3f))
+                      - 0x01f0c00)
+                pos += 4
+                output_utf8_4byte!(buf, out, ch)
+                out += 4
+            end
+        end
+    end
+    UTF8String(buf)
+end
+
+function convert(::Type{UTF8String}, a::Vector{UInt8}, invalids_as::AbstractString)
     l = length(a)
     idx = 1
     iscopy = false
     while idx <= l
-        if is_utf8_start(a[idx])
+        if !is_valid_continuation(a[idx])
             nextidx = idx+1+utf8_trailing[a[idx]+1]
             (nextidx <= (l+1)) && (idx = nextidx; continue)
         end
         !iscopy && (a = copy(a); iscopy = true)
         endn = idx
         while endn <= l
-            is_utf8_start(a[endn]) && break
+            !is_valid_continuation(a[endn]) && break
             endn += 1
         end
         (endn > idx) && (endn -= 1)
@@ -240,7 +302,7 @@ function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractStr
 end
 convert(::Type{UTF8String}, s::AbstractString) = utf8(bytestring(s))
 
-"
+"""
 Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
 
 ### Input Arguments:
@@ -249,7 +311,7 @@ Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
 
 ### Returns:
 * `UTF8String`
-"
+"""
 function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
     buf = Vector{UInt8}(len)
     out = 0
diff --git a/test/unicode/utf32.jl b/test/unicode/utf32.jl
index 0d906bc37c110..5a348dc78da71 100644
--- a/test/unicode/utf32.jl
+++ b/test/unicode/utf32.jl
@@ -32,8 +32,8 @@ str3_UTF8 = "abcd\uff\uff\u7fff\u7fff"
 str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff"
 strS_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80")
 strC_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000")
-strZ_UTF8 =
UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80") strz_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0") +strZ = b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80" strA_UTF16 = utf16(strA_UTF8) strL_UTF16 = utf16(strL_UTF8) @@ -68,97 +68,100 @@ tstcvt(str4_UTF8,str4_UTF16,str4_UTF32) @test utf16(strS_UTF32) == strC_UTF8 # Test converting overlong \0 -# @test utf8(strZ_UTF8) == strz_UTF8 # currently broken! (in utf8.jl) -@test utf16(strZ_UTF8) == strz_UTF8 -@test utf32(strZ_UTF8) == strz_UTF8 +@test utf8(strZ) == strz_UTF8 +@test utf16(UTF8String(strZ)) == strz_UTF8 +@test utf32(UTF8String(strZ)) == strz_UTF8 # Test invalid sequences +strval(::Type{UTF8String}, dat) = dat +strval(::Union(Type{UTF16String},Type{UTF32String}), dat) = UTF8String(dat) + byt = 0x0 -for T in (UTF16String, UTF32String) +for T in (UTF8String, UTF16String, UTF32String) try # Continuation byte not after lead for byt in 0x80:0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt])) + @test_throws UnicodeError convert(T, strval(T, UInt8[byt])) end # Test lead bytes for byt in 0xc0:0xff # Single lead byte at end of string - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt])) + @test_throws UnicodeError convert(T, strval(T, UInt8[byt])) # Lead followed by non-continuation character < 0x80 - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0])) + @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0])) # Lead followed by non-continuation character > 0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0xc0])) + @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0xc0])) end # Test overlong 2-byte for byt in 0x81:0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc0,byt])) + @test_throws UnicodeError convert(T, strval(T, UInt8[0xc0,byt])) end for byt in 0x80:0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc1,byt])) + @test_throws UnicodeError convert(T, strval(T, UInt8[0xc1,byt])) end # Test overlong 3-byte for byt 
in 0x80:0x9f - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xe0,byt,0x80])) + @test_throws UnicodeError convert(T, strval(T, UInt8[0xe0,byt,0x80])) end # Test overlong 4-byte for byt in 0x80:0x8f - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xef,byt,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T, UInt8[0xef,byt,0x80,0x80])) end # Test 4-byte > 0x10ffff for byt in 0x90:0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xf4,byt,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T, UInt8[0xf4,byt,0x80,0x80])) end for byt in 0xf5:0xf7 - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80])) end # Test 5-byte for byt in 0xf8:0xfb - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80,0x80])) end # Test 6-byte for byt in 0xfc:0xfd - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80,0x80,0x80])) end # Test 7-byte - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T, UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])) # Three and above byte sequences for byt in 0xe0:0xef # Lead followed by only 1 continuation byte - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80])) + @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80])) # Lead ended by non-continuation character < 0x80 - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0])) + @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0])) # Lead ended by non-continuation character > 0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0xc0])) + @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0xc0])) end # 3-byte 
encoded surrogate character(s) # Single surrogate - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80])) + @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80])) # Not followed by surrogate - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])) # Trailing surrogate first - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])) + @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])) # Followed by lead surrogate - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])) + @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])) # Four byte sequences for byt in 0xf0:0xf4 # Lead followed by only 2 continuation bytes - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80])) + @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80])) # Lead followed by non-continuation character < 0x80 - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0])) + @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0])) # Lead followed by non-continuation character > 0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0xc0])) + @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0xc0])) end catch exp ; println("Error checking $T: $byt")